diff --git "a/checkpoint/trainer_state (4).json" "b/checkpoint/trainer_state (4).json" new file mode 100644--- /dev/null +++ "b/checkpoint/trainer_state (4).json" @@ -0,0 +1,91176 @@ +{ + "best_metric": 0.63648960197002, + "best_model_checkpoint": "./results/checkpoint-130000", + "epoch": 0.9609414269241005, + "eval_steps": 10000, + "global_step": 130000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.391857130185388e-05, + "grad_norm": 0.10461781173944473, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0494, + "step": 10 + }, + { + "epoch": 0.00014783714260370776, + "grad_norm": 0.08169854432344437, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0441, + "step": 20 + }, + { + "epoch": 0.00022175571390556164, + "grad_norm": 0.10416487604379654, + "learning_rate": 3e-06, + "loss": 0.0489, + "step": 30 + }, + { + "epoch": 0.0002956742852074155, + "grad_norm": 0.07958583533763885, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0493, + "step": 40 + }, + { + "epoch": 0.0003695928565092694, + "grad_norm": 0.10156191140413284, + "learning_rate": 5e-06, + "loss": 0.0476, + "step": 50 + }, + { + "epoch": 0.0004435114278111233, + "grad_norm": 0.08352667093276978, + "learning_rate": 6e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.0005174299991129772, + "grad_norm": 0.08660561591386795, + "learning_rate": 7.000000000000001e-06, + "loss": 0.0471, + "step": 70 + }, + { + "epoch": 0.000591348570414831, + "grad_norm": 0.09334266930818558, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0428, + "step": 80 + }, + { + "epoch": 0.0006652671417166849, + "grad_norm": 0.08674201369285583, + "learning_rate": 9e-06, + "loss": 0.0439, + "step": 90 + }, + { + "epoch": 0.0007391857130185388, + "grad_norm": 0.09628602117300034, + "learning_rate": 1e-05, + "loss": 0.0433, + "step": 100 + }, + { + "epoch": 0.0008131042843203927, + "grad_norm": 0.09610876441001892, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.0464, + "step": 110 + }, + { + "epoch": 0.0008870228556222466, + "grad_norm": 0.10648789256811142, + "learning_rate": 1.2e-05, + "loss": 0.0446, + "step": 120 + }, + { + "epoch": 0.0009609414269241004, + "grad_norm": 0.0938977524638176, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.0425, + "step": 130 + }, + { + "epoch": 0.0010348599982259543, + "grad_norm": 0.10076262056827545, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.044, + "step": 140 + }, + { + "epoch": 0.001108778569527808, + "grad_norm": 0.10113703459501266, + "learning_rate": 1.5e-05, + "loss": 0.0426, + "step": 150 + }, + { + "epoch": 0.001182697140829662, + "grad_norm": 0.08176673948764801, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0411, + "step": 160 + }, + { + "epoch": 0.0012566157121315159, + "grad_norm": 0.1192707046866417, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.0429, + "step": 170 + }, + { + "epoch": 0.0013305342834333698, + "grad_norm": 0.0927235409617424, + "learning_rate": 1.8e-05, + "loss": 0.0451, + "step": 180 + }, + { + "epoch": 0.0014044528547352236, + "grad_norm": 0.09320451319217682, + "learning_rate": 1.9e-05, + "loss": 0.0441, + "step": 190 + }, + { + "epoch": 0.0014783714260370776, + "grad_norm": 0.10145901888608932, + "learning_rate": 2e-05, + "loss": 0.0413, + "step": 200 + }, + { + "epoch": 0.0015522899973389314, + "grad_norm": 0.12276735156774521, + "learning_rate": 2.1e-05, + "loss": 0.0389, + "step": 210 + }, + { + "epoch": 0.0016262085686407854, 
+ "grad_norm": 0.1172725036740303, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0393, + "step": 220 + }, + { + "epoch": 0.0017001271399426391, + "grad_norm": 0.10647889226675034, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0437, + "step": 230 + }, + { + "epoch": 0.0017740457112444931, + "grad_norm": 0.10727279633283615, + "learning_rate": 2.4e-05, + "loss": 0.04, + "step": 240 + }, + { + "epoch": 0.001847964282546347, + "grad_norm": 0.08067339658737183, + "learning_rate": 2.5e-05, + "loss": 0.0385, + "step": 250 + }, + { + "epoch": 0.0019218828538482009, + "grad_norm": 0.09736546128988266, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0385, + "step": 260 + }, + { + "epoch": 0.0019958014251500547, + "grad_norm": 0.08011851459741592, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.039, + "step": 270 + }, + { + "epoch": 0.0020697199964519086, + "grad_norm": 0.08815836906433105, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0368, + "step": 280 + }, + { + "epoch": 0.0021436385677537626, + "grad_norm": 0.08860961347818375, + "learning_rate": 2.9e-05, + "loss": 0.0397, + "step": 290 + }, + { + "epoch": 0.002217557139055616, + "grad_norm": 0.0761508047580719, + "learning_rate": 3e-05, + "loss": 0.0357, + "step": 300 + }, + { + "epoch": 0.00229147571035747, + "grad_norm": 0.08089523762464523, + "learning_rate": 3.1e-05, + "loss": 0.0384, + "step": 310 + }, + { + "epoch": 0.002365394281659324, + "grad_norm": 0.09379731118679047, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0413, + "step": 320 + }, + { + "epoch": 0.002439312852961178, + "grad_norm": 0.09349235147237778, + "learning_rate": 3.3e-05, + "loss": 0.039, + "step": 330 + }, + { + "epoch": 0.0025132314242630317, + "grad_norm": 0.0876515731215477, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.037, + "step": 340 + }, + { + "epoch": 0.0025871499955648857, + "grad_norm": 0.10627899318933487, + "learning_rate": 3.5e-05, + "loss": 0.0371, + "step": 350 + }, + { + "epoch": 0.0026610685668667397, + "grad_norm": 0.0915268212556839, + "learning_rate": 3.6e-05, + "loss": 0.0371, + "step": 360 + }, + { + "epoch": 0.0027349871381685937, + "grad_norm": 0.1196686178445816, + "learning_rate": 3.7e-05, + "loss": 0.0379, + "step": 370 + }, + { + "epoch": 0.0028089057094704472, + "grad_norm": 0.1142694279551506, + "learning_rate": 3.8e-05, + "loss": 0.0373, + "step": 380 + }, + { + "epoch": 0.0028828242807723012, + "grad_norm": 0.07585809379816055, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0369, + "step": 390 + }, + { + "epoch": 0.002956742852074155, + "grad_norm": 0.10797297954559326, + "learning_rate": 4e-05, + "loss": 0.0373, + "step": 400 + }, + { + "epoch": 0.003030661423376009, + "grad_norm": 0.1142873466014862, + "learning_rate": 4.1e-05, + "loss": 0.0406, + "step": 410 + }, + { + "epoch": 0.0031045799946778628, + "grad_norm": 0.09952966868877411, + "learning_rate": 4.2e-05, + "loss": 0.0361, + "step": 420 + }, + { + "epoch": 0.0031784985659797167, + "grad_norm": 0.09601733088493347, + "learning_rate": 4.3e-05, + "loss": 0.033, + "step": 430 + }, + { + "epoch": 0.0032524171372815707, + "grad_norm": 0.0806892067193985, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.035, + "step": 440 + }, + { + "epoch": 0.0033263357085834243, + "grad_norm": 0.09059257805347443, + "learning_rate": 4.5e-05, + "loss": 0.0334, + "step": 450 + }, + { + "epoch": 0.0034002542798852783, + "grad_norm": 0.12588344514369965, + "learning_rate": 4.600000000000001e-05, + "loss": 0.0349, + "step": 460 
+ }, + { + "epoch": 0.0034741728511871323, + "grad_norm": 0.10129739344120026, + "learning_rate": 4.7e-05, + "loss": 0.032, + "step": 470 + }, + { + "epoch": 0.0035480914224889862, + "grad_norm": 0.08103670924901962, + "learning_rate": 4.8e-05, + "loss": 0.0322, + "step": 480 + }, + { + "epoch": 0.00362200999379084, + "grad_norm": 0.0991249829530716, + "learning_rate": 4.9e-05, + "loss": 0.034, + "step": 490 + }, + { + "epoch": 0.003695928565092694, + "grad_norm": 0.09622903913259506, + "learning_rate": 5e-05, + "loss": 0.0319, + "step": 500 + }, + { + "epoch": 0.0037698471363945478, + "grad_norm": 0.09769944846630096, + "learning_rate": 4.99962903608737e-05, + "loss": 0.0293, + "step": 510 + }, + { + "epoch": 0.0038437657076964018, + "grad_norm": 0.10930614918470383, + "learning_rate": 4.999258072174739e-05, + "loss": 0.0353, + "step": 520 + }, + { + "epoch": 0.003917684278998255, + "grad_norm": 0.0826733186841011, + "learning_rate": 4.9988871082621084e-05, + "loss": 0.0336, + "step": 530 + }, + { + "epoch": 0.003991602850300109, + "grad_norm": 0.0965137779712677, + "learning_rate": 4.998516144349478e-05, + "loss": 0.0347, + "step": 540 + }, + { + "epoch": 0.004065521421601963, + "grad_norm": 0.09630349278450012, + "learning_rate": 4.9981451804368475e-05, + "loss": 0.0313, + "step": 550 + }, + { + "epoch": 0.004139439992903817, + "grad_norm": 0.1024831011891365, + "learning_rate": 4.997774216524217e-05, + "loss": 0.0332, + "step": 560 + }, + { + "epoch": 0.004213358564205671, + "grad_norm": 0.14341188967227936, + "learning_rate": 4.997403252611586e-05, + "loss": 0.0316, + "step": 570 + }, + { + "epoch": 0.004287277135507525, + "grad_norm": 0.0977560356259346, + "learning_rate": 4.9970322886989557e-05, + "loss": 0.0293, + "step": 580 + }, + { + "epoch": 0.004361195706809378, + "grad_norm": 0.10209672152996063, + "learning_rate": 4.9966613247863246e-05, + "loss": 0.0331, + "step": 590 + }, + { + "epoch": 0.004435114278111232, + "grad_norm": 0.0934465304017067, + "learning_rate": 4.996290360873694e-05, + "loss": 0.0326, + "step": 600 + }, + { + "epoch": 0.004509032849413086, + "grad_norm": 0.09404677152633667, + "learning_rate": 4.995919396961064e-05, + "loss": 0.0348, + "step": 610 + }, + { + "epoch": 0.00458295142071494, + "grad_norm": 0.07064525038003922, + "learning_rate": 4.9955484330484334e-05, + "loss": 0.0321, + "step": 620 + }, + { + "epoch": 0.004656869992016794, + "grad_norm": 0.1300738900899887, + "learning_rate": 4.995177469135803e-05, + "loss": 0.0334, + "step": 630 + }, + { + "epoch": 0.004730788563318648, + "grad_norm": 0.11687029898166656, + "learning_rate": 4.9948065052231726e-05, + "loss": 0.0313, + "step": 640 + }, + { + "epoch": 0.004804707134620502, + "grad_norm": 0.10037320107221603, + "learning_rate": 4.9944355413105415e-05, + "loss": 0.03, + "step": 650 + }, + { + "epoch": 0.004878625705922356, + "grad_norm": 0.12054847180843353, + "learning_rate": 4.994064577397911e-05, + "loss": 0.0291, + "step": 660 + }, + { + "epoch": 0.004952544277224209, + "grad_norm": 0.09925072640180588, + "learning_rate": 4.99369361348528e-05, + "loss": 0.0297, + "step": 670 + }, + { + "epoch": 0.005026462848526063, + "grad_norm": 0.11456023156642914, + "learning_rate": 4.9933226495726496e-05, + "loss": 0.033, + "step": 680 + }, + { + "epoch": 0.005100381419827917, + "grad_norm": 0.1687926948070526, + "learning_rate": 4.992951685660019e-05, + "loss": 0.0326, + "step": 690 + }, + { + "epoch": 0.005174299991129771, + "grad_norm": 0.09963828325271606, + "learning_rate": 4.992580721747389e-05, + 
"loss": 0.0317, + "step": 700 + }, + { + "epoch": 0.005248218562431625, + "grad_norm": 0.09856045991182327, + "learning_rate": 4.9922097578347584e-05, + "loss": 0.0325, + "step": 710 + }, + { + "epoch": 0.005322137133733479, + "grad_norm": 0.08094804733991623, + "learning_rate": 4.991838793922127e-05, + "loss": 0.0286, + "step": 720 + }, + { + "epoch": 0.005396055705035333, + "grad_norm": 0.09114375710487366, + "learning_rate": 4.991467830009497e-05, + "loss": 0.0289, + "step": 730 + }, + { + "epoch": 0.005469974276337187, + "grad_norm": 0.1050422415137291, + "learning_rate": 4.9910968660968665e-05, + "loss": 0.0294, + "step": 740 + }, + { + "epoch": 0.0055438928476390405, + "grad_norm": 0.0893392488360405, + "learning_rate": 4.9907259021842354e-05, + "loss": 0.0309, + "step": 750 + }, + { + "epoch": 0.0056178114189408945, + "grad_norm": 0.1049913540482521, + "learning_rate": 4.990354938271605e-05, + "loss": 0.0262, + "step": 760 + }, + { + "epoch": 0.0056917299902427484, + "grad_norm": 0.11163979023694992, + "learning_rate": 4.9899839743589746e-05, + "loss": 0.03, + "step": 770 + }, + { + "epoch": 0.0057656485615446024, + "grad_norm": 0.09120380878448486, + "learning_rate": 4.989613010446344e-05, + "loss": 0.0311, + "step": 780 + }, + { + "epoch": 0.005839567132846456, + "grad_norm": 0.10143250972032547, + "learning_rate": 4.989242046533714e-05, + "loss": 0.0312, + "step": 790 + }, + { + "epoch": 0.00591348570414831, + "grad_norm": 0.0912647396326065, + "learning_rate": 4.988871082621083e-05, + "loss": 0.0291, + "step": 800 + }, + { + "epoch": 0.005987404275450164, + "grad_norm": 0.09807098656892776, + "learning_rate": 4.988500118708452e-05, + "loss": 0.0306, + "step": 810 + }, + { + "epoch": 0.006061322846752018, + "grad_norm": 0.09862897545099258, + "learning_rate": 4.988129154795821e-05, + "loss": 0.0296, + "step": 820 + }, + { + "epoch": 0.0061352414180538715, + "grad_norm": 0.09236502647399902, + "learning_rate": 4.987758190883191e-05, + "loss": 0.0312, + "step": 830 + }, + { + "epoch": 0.0062091599893557255, + "grad_norm": 0.10084735602140427, + "learning_rate": 4.9873872269705604e-05, + "loss": 0.0295, + "step": 840 + }, + { + "epoch": 0.0062830785606575795, + "grad_norm": 0.11696548759937286, + "learning_rate": 4.98701626305793e-05, + "loss": 0.0308, + "step": 850 + }, + { + "epoch": 0.0063569971319594335, + "grad_norm": 0.12154483795166016, + "learning_rate": 4.9866452991452996e-05, + "loss": 0.0302, + "step": 860 + }, + { + "epoch": 0.0064309157032612875, + "grad_norm": 0.148088276386261, + "learning_rate": 4.986274335232669e-05, + "loss": 0.0269, + "step": 870 + }, + { + "epoch": 0.0065048342745631415, + "grad_norm": 0.13604265451431274, + "learning_rate": 4.985903371320038e-05, + "loss": 0.0279, + "step": 880 + }, + { + "epoch": 0.0065787528458649954, + "grad_norm": 0.14540578424930573, + "learning_rate": 4.985532407407408e-05, + "loss": 0.0287, + "step": 890 + }, + { + "epoch": 0.006652671417166849, + "grad_norm": 0.08655355125665665, + "learning_rate": 4.9851614434947767e-05, + "loss": 0.0276, + "step": 900 + }, + { + "epoch": 0.0067265899884687026, + "grad_norm": 0.12625974416732788, + "learning_rate": 4.984790479582146e-05, + "loss": 0.0303, + "step": 910 + }, + { + "epoch": 0.0068005085597705565, + "grad_norm": 0.10419458895921707, + "learning_rate": 4.984419515669516e-05, + "loss": 0.0297, + "step": 920 + }, + { + "epoch": 0.0068744271310724105, + "grad_norm": 0.10234599560499191, + "learning_rate": 4.9840485517568854e-05, + "loss": 0.0278, + "step": 930 + }, + { + 
"epoch": 0.0069483457023742645, + "grad_norm": 0.12286118417978287, + "learning_rate": 4.983677587844255e-05, + "loss": 0.0303, + "step": 940 + }, + { + "epoch": 0.0070222642736761185, + "grad_norm": 0.10289271920919418, + "learning_rate": 4.983306623931624e-05, + "loss": 0.0284, + "step": 950 + }, + { + "epoch": 0.0070961828449779725, + "grad_norm": 0.12728984653949738, + "learning_rate": 4.9829356600189936e-05, + "loss": 0.0321, + "step": 960 + }, + { + "epoch": 0.0071701014162798265, + "grad_norm": 0.11914363503456116, + "learning_rate": 4.982564696106363e-05, + "loss": 0.0281, + "step": 970 + }, + { + "epoch": 0.00724401998758168, + "grad_norm": 0.09279730916023254, + "learning_rate": 4.982193732193732e-05, + "loss": 0.0319, + "step": 980 + }, + { + "epoch": 0.007317938558883534, + "grad_norm": 0.12616656720638275, + "learning_rate": 4.981822768281102e-05, + "loss": 0.0289, + "step": 990 + }, + { + "epoch": 0.007391857130185388, + "grad_norm": 0.09209688007831573, + "learning_rate": 4.981451804368471e-05, + "loss": 0.0299, + "step": 1000 + }, + { + "epoch": 0.007465775701487242, + "grad_norm": 0.08926288038492203, + "learning_rate": 4.981080840455841e-05, + "loss": 0.0286, + "step": 1010 + }, + { + "epoch": 0.0075396942727890956, + "grad_norm": 0.10940416902303696, + "learning_rate": 4.9807098765432105e-05, + "loss": 0.0284, + "step": 1020 + }, + { + "epoch": 0.0076136128440909496, + "grad_norm": 0.08967240899801254, + "learning_rate": 4.9803389126305794e-05, + "loss": 0.0277, + "step": 1030 + }, + { + "epoch": 0.0076875314153928035, + "grad_norm": 0.09682679921388626, + "learning_rate": 4.979967948717949e-05, + "loss": 0.028, + "step": 1040 + }, + { + "epoch": 0.0077614499866946575, + "grad_norm": 0.11966560035943985, + "learning_rate": 4.979596984805318e-05, + "loss": 0.0286, + "step": 1050 + }, + { + "epoch": 0.00783536855799651, + "grad_norm": 0.08031348884105682, + "learning_rate": 4.9792260208926875e-05, + "loss": 0.029, + "step": 1060 + }, + { + "epoch": 0.007909287129298365, + "grad_norm": 0.08792899549007416, + "learning_rate": 4.978855056980057e-05, + "loss": 0.0298, + "step": 1070 + }, + { + "epoch": 0.007983205700600219, + "grad_norm": 0.11339164525270462, + "learning_rate": 4.978484093067427e-05, + "loss": 0.0274, + "step": 1080 + }, + { + "epoch": 0.008057124271902073, + "grad_norm": 0.13752642273902893, + "learning_rate": 4.978113129154796e-05, + "loss": 0.0239, + "step": 1090 + }, + { + "epoch": 0.008131042843203927, + "grad_norm": 0.1251632571220398, + "learning_rate": 4.977742165242166e-05, + "loss": 0.0302, + "step": 1100 + }, + { + "epoch": 0.00820496141450578, + "grad_norm": 0.12013223767280579, + "learning_rate": 4.977371201329535e-05, + "loss": 0.029, + "step": 1110 + }, + { + "epoch": 0.008278879985807635, + "grad_norm": 0.11286813020706177, + "learning_rate": 4.9770002374169044e-05, + "loss": 0.0281, + "step": 1120 + }, + { + "epoch": 0.008352798557109489, + "grad_norm": 0.12342401593923569, + "learning_rate": 4.976629273504273e-05, + "loss": 0.0269, + "step": 1130 + }, + { + "epoch": 0.008426717128411343, + "grad_norm": 0.09709502756595612, + "learning_rate": 4.976258309591643e-05, + "loss": 0.0318, + "step": 1140 + }, + { + "epoch": 0.008500635699713197, + "grad_norm": 0.1205952987074852, + "learning_rate": 4.9758873456790125e-05, + "loss": 0.0306, + "step": 1150 + }, + { + "epoch": 0.00857455427101505, + "grad_norm": 0.08721502870321274, + "learning_rate": 4.975516381766382e-05, + "loss": 0.0268, + "step": 1160 + }, + { + "epoch": 0.008648472842316905, + 
"grad_norm": 0.09323421120643616, + "learning_rate": 4.975145417853752e-05, + "loss": 0.0276, + "step": 1170 + }, + { + "epoch": 0.008722391413618757, + "grad_norm": 0.09394596517086029, + "learning_rate": 4.9747744539411206e-05, + "loss": 0.0271, + "step": 1180 + }, + { + "epoch": 0.00879630998492061, + "grad_norm": 0.09926934540271759, + "learning_rate": 4.97440349002849e-05, + "loss": 0.0304, + "step": 1190 + }, + { + "epoch": 0.008870228556222465, + "grad_norm": 0.09732608497142792, + "learning_rate": 4.97403252611586e-05, + "loss": 0.0248, + "step": 1200 + }, + { + "epoch": 0.008944147127524319, + "grad_norm": 0.10517530143260956, + "learning_rate": 4.973661562203229e-05, + "loss": 0.0307, + "step": 1210 + }, + { + "epoch": 0.009018065698826173, + "grad_norm": 0.09877362102270126, + "learning_rate": 4.973290598290598e-05, + "loss": 0.0263, + "step": 1220 + }, + { + "epoch": 0.009091984270128027, + "grad_norm": 0.0978366956114769, + "learning_rate": 4.972919634377968e-05, + "loss": 0.0298, + "step": 1230 + }, + { + "epoch": 0.00916590284142988, + "grad_norm": 0.14290444552898407, + "learning_rate": 4.9725486704653375e-05, + "loss": 0.0283, + "step": 1240 + }, + { + "epoch": 0.009239821412731735, + "grad_norm": 0.10938958823680878, + "learning_rate": 4.972177706552707e-05, + "loss": 0.0294, + "step": 1250 + }, + { + "epoch": 0.009313739984033589, + "grad_norm": 0.12752975523471832, + "learning_rate": 4.971806742640076e-05, + "loss": 0.027, + "step": 1260 + }, + { + "epoch": 0.009387658555335443, + "grad_norm": 0.08596224337816238, + "learning_rate": 4.9714357787274456e-05, + "loss": 0.0273, + "step": 1270 + }, + { + "epoch": 0.009461577126637297, + "grad_norm": 0.1066613495349884, + "learning_rate": 4.9710648148148146e-05, + "loss": 0.028, + "step": 1280 + }, + { + "epoch": 0.00953549569793915, + "grad_norm": 0.12049562484025955, + "learning_rate": 4.970693850902184e-05, + "loss": 0.028, + "step": 1290 + }, + { + "epoch": 0.009609414269241005, + "grad_norm": 0.12397699803113937, + "learning_rate": 4.970322886989554e-05, + "loss": 0.0272, + "step": 1300 + }, + { + "epoch": 0.009683332840542859, + "grad_norm": 0.1257927119731903, + "learning_rate": 4.9699519230769233e-05, + "loss": 0.0285, + "step": 1310 + }, + { + "epoch": 0.009757251411844713, + "grad_norm": 0.11058560013771057, + "learning_rate": 4.969580959164293e-05, + "loss": 0.0287, + "step": 1320 + }, + { + "epoch": 0.009831169983146567, + "grad_norm": 0.10039151459932327, + "learning_rate": 4.9692099952516625e-05, + "loss": 0.0292, + "step": 1330 + }, + { + "epoch": 0.009905088554448419, + "grad_norm": 0.1464836746454239, + "learning_rate": 4.9688390313390315e-05, + "loss": 0.0284, + "step": 1340 + }, + { + "epoch": 0.009979007125750273, + "grad_norm": 0.1260063499212265, + "learning_rate": 4.968468067426401e-05, + "loss": 0.0288, + "step": 1350 + }, + { + "epoch": 0.010052925697052127, + "grad_norm": 0.13692636787891388, + "learning_rate": 4.96809710351377e-05, + "loss": 0.0305, + "step": 1360 + }, + { + "epoch": 0.01012684426835398, + "grad_norm": 0.09072256833314896, + "learning_rate": 4.9677261396011396e-05, + "loss": 0.0272, + "step": 1370 + }, + { + "epoch": 0.010200762839655835, + "grad_norm": 0.1056734025478363, + "learning_rate": 4.967355175688509e-05, + "loss": 0.027, + "step": 1380 + }, + { + "epoch": 0.010274681410957689, + "grad_norm": 0.1367037445306778, + "learning_rate": 4.966984211775879e-05, + "loss": 0.0274, + "step": 1390 + }, + { + "epoch": 0.010348599982259543, + "grad_norm": 0.15280531346797943, + 
"learning_rate": 4.9666132478632484e-05, + "loss": 0.0292, + "step": 1400 + }, + { + "epoch": 0.010422518553561397, + "grad_norm": 0.10445661842823029, + "learning_rate": 4.966242283950617e-05, + "loss": 0.0265, + "step": 1410 + }, + { + "epoch": 0.01049643712486325, + "grad_norm": 0.093659907579422, + "learning_rate": 4.965871320037987e-05, + "loss": 0.0277, + "step": 1420 + }, + { + "epoch": 0.010570355696165105, + "grad_norm": 0.13440953195095062, + "learning_rate": 4.9655003561253565e-05, + "loss": 0.0287, + "step": 1430 + }, + { + "epoch": 0.010644274267466959, + "grad_norm": 0.09561889618635178, + "learning_rate": 4.9651293922127254e-05, + "loss": 0.0269, + "step": 1440 + }, + { + "epoch": 0.010718192838768813, + "grad_norm": 0.12212470918893814, + "learning_rate": 4.964758428300095e-05, + "loss": 0.0245, + "step": 1450 + }, + { + "epoch": 0.010792111410070667, + "grad_norm": 0.12034928053617477, + "learning_rate": 4.9643874643874646e-05, + "loss": 0.0273, + "step": 1460 + }, + { + "epoch": 0.01086602998137252, + "grad_norm": 0.10437193512916565, + "learning_rate": 4.964016500474834e-05, + "loss": 0.0298, + "step": 1470 + }, + { + "epoch": 0.010939948552674375, + "grad_norm": 0.10191825777292252, + "learning_rate": 4.963645536562204e-05, + "loss": 0.0282, + "step": 1480 + }, + { + "epoch": 0.011013867123976227, + "grad_norm": 0.15169893205165863, + "learning_rate": 4.963274572649573e-05, + "loss": 0.0281, + "step": 1490 + }, + { + "epoch": 0.011087785695278081, + "grad_norm": 0.13512222468852997, + "learning_rate": 4.962903608736942e-05, + "loss": 0.026, + "step": 1500 + }, + { + "epoch": 0.011161704266579935, + "grad_norm": 0.09260152280330658, + "learning_rate": 4.962532644824311e-05, + "loss": 0.0272, + "step": 1510 + }, + { + "epoch": 0.011235622837881789, + "grad_norm": 0.15228793025016785, + "learning_rate": 4.962161680911681e-05, + "loss": 0.028, + "step": 1520 + }, + { + "epoch": 0.011309541409183643, + "grad_norm": 0.11596718430519104, + "learning_rate": 4.9617907169990504e-05, + "loss": 0.0272, + "step": 1530 + }, + { + "epoch": 0.011383459980485497, + "grad_norm": 0.12411506474018097, + "learning_rate": 4.96141975308642e-05, + "loss": 0.0303, + "step": 1540 + }, + { + "epoch": 0.011457378551787351, + "grad_norm": 0.12651769816875458, + "learning_rate": 4.9610487891737896e-05, + "loss": 0.0265, + "step": 1550 + }, + { + "epoch": 0.011531297123089205, + "grad_norm": 0.10328517854213715, + "learning_rate": 4.960677825261159e-05, + "loss": 0.0276, + "step": 1560 + }, + { + "epoch": 0.011605215694391059, + "grad_norm": 0.11359741538763046, + "learning_rate": 4.960306861348528e-05, + "loss": 0.0271, + "step": 1570 + }, + { + "epoch": 0.011679134265692913, + "grad_norm": 0.11071664839982986, + "learning_rate": 4.959935897435898e-05, + "loss": 0.0251, + "step": 1580 + }, + { + "epoch": 0.011753052836994767, + "grad_norm": 0.10678543895483017, + "learning_rate": 4.9595649335232666e-05, + "loss": 0.0244, + "step": 1590 + }, + { + "epoch": 0.01182697140829662, + "grad_norm": 0.11132028698921204, + "learning_rate": 4.959193969610636e-05, + "loss": 0.0297, + "step": 1600 + }, + { + "epoch": 0.011900889979598475, + "grad_norm": 0.08840740472078323, + "learning_rate": 4.9588230056980065e-05, + "loss": 0.0246, + "step": 1610 + }, + { + "epoch": 0.011974808550900329, + "grad_norm": 0.11825734376907349, + "learning_rate": 4.9584520417853754e-05, + "loss": 0.0267, + "step": 1620 + }, + { + "epoch": 0.012048727122202183, + "grad_norm": 0.0901472344994545, + "learning_rate": 
4.958081077872745e-05, + "loss": 0.0273, + "step": 1630 + }, + { + "epoch": 0.012122645693504037, + "grad_norm": 0.09711014479398727, + "learning_rate": 4.957710113960114e-05, + "loss": 0.0255, + "step": 1640 + }, + { + "epoch": 0.012196564264805889, + "grad_norm": 0.12570925056934357, + "learning_rate": 4.9573391500474835e-05, + "loss": 0.0247, + "step": 1650 + }, + { + "epoch": 0.012270482836107743, + "grad_norm": 0.09919515997171402, + "learning_rate": 4.956968186134853e-05, + "loss": 0.0241, + "step": 1660 + }, + { + "epoch": 0.012344401407409597, + "grad_norm": 0.10764940083026886, + "learning_rate": 4.956597222222222e-05, + "loss": 0.0281, + "step": 1670 + }, + { + "epoch": 0.012418319978711451, + "grad_norm": 0.1037641242146492, + "learning_rate": 4.9562262583095916e-05, + "loss": 0.0289, + "step": 1680 + }, + { + "epoch": 0.012492238550013305, + "grad_norm": 0.09550962597131729, + "learning_rate": 4.955855294396961e-05, + "loss": 0.0255, + "step": 1690 + }, + { + "epoch": 0.012566157121315159, + "grad_norm": 0.1115112155675888, + "learning_rate": 4.955484330484331e-05, + "loss": 0.0281, + "step": 1700 + }, + { + "epoch": 0.012640075692617013, + "grad_norm": 0.12160573899745941, + "learning_rate": 4.9551133665717004e-05, + "loss": 0.0246, + "step": 1710 + }, + { + "epoch": 0.012713994263918867, + "grad_norm": 0.10318835824728012, + "learning_rate": 4.9547424026590694e-05, + "loss": 0.0247, + "step": 1720 + }, + { + "epoch": 0.012787912835220721, + "grad_norm": 0.09020746499300003, + "learning_rate": 4.954371438746439e-05, + "loss": 0.0255, + "step": 1730 + }, + { + "epoch": 0.012861831406522575, + "grad_norm": 0.09125480055809021, + "learning_rate": 4.954000474833808e-05, + "loss": 0.026, + "step": 1740 + }, + { + "epoch": 0.012935749977824429, + "grad_norm": 0.12356351315975189, + "learning_rate": 4.9536295109211775e-05, + "loss": 0.0262, + "step": 1750 + }, + { + "epoch": 0.013009668549126283, + "grad_norm": 0.09936685860157013, + "learning_rate": 4.953258547008548e-05, + "loss": 0.0274, + "step": 1760 + }, + { + "epoch": 0.013083587120428137, + "grad_norm": 0.11199319362640381, + "learning_rate": 4.9528875830959167e-05, + "loss": 0.0256, + "step": 1770 + }, + { + "epoch": 0.013157505691729991, + "grad_norm": 0.13244837522506714, + "learning_rate": 4.952516619183286e-05, + "loss": 0.0292, + "step": 1780 + }, + { + "epoch": 0.013231424263031845, + "grad_norm": 0.09166007488965988, + "learning_rate": 4.952145655270656e-05, + "loss": 0.0247, + "step": 1790 + }, + { + "epoch": 0.013305342834333697, + "grad_norm": 0.12759141623973846, + "learning_rate": 4.951774691358025e-05, + "loss": 0.0266, + "step": 1800 + }, + { + "epoch": 0.013379261405635551, + "grad_norm": 0.10648078471422195, + "learning_rate": 4.9514037274453944e-05, + "loss": 0.0264, + "step": 1810 + }, + { + "epoch": 0.013453179976937405, + "grad_norm": 0.10549057275056839, + "learning_rate": 4.951032763532763e-05, + "loss": 0.0267, + "step": 1820 + }, + { + "epoch": 0.013527098548239259, + "grad_norm": 0.104535773396492, + "learning_rate": 4.950661799620133e-05, + "loss": 0.0293, + "step": 1830 + }, + { + "epoch": 0.013601017119541113, + "grad_norm": 0.1186116635799408, + "learning_rate": 4.950290835707503e-05, + "loss": 0.0266, + "step": 1840 + }, + { + "epoch": 0.013674935690842967, + "grad_norm": 0.10436084866523743, + "learning_rate": 4.949919871794872e-05, + "loss": 0.0265, + "step": 1850 + }, + { + "epoch": 0.013748854262144821, + "grad_norm": 0.09677781164646149, + "learning_rate": 4.949548907882242e-05, + "loss": 
0.026, + "step": 1860 + }, + { + "epoch": 0.013822772833446675, + "grad_norm": 0.08779451996088028, + "learning_rate": 4.9491779439696106e-05, + "loss": 0.0242, + "step": 1870 + }, + { + "epoch": 0.013896691404748529, + "grad_norm": 0.1076350212097168, + "learning_rate": 4.94880698005698e-05, + "loss": 0.0244, + "step": 1880 + }, + { + "epoch": 0.013970609976050383, + "grad_norm": 0.09812363982200623, + "learning_rate": 4.94843601614435e-05, + "loss": 0.0277, + "step": 1890 + }, + { + "epoch": 0.014044528547352237, + "grad_norm": 0.09892376512289047, + "learning_rate": 4.948065052231719e-05, + "loss": 0.0269, + "step": 1900 + }, + { + "epoch": 0.014118447118654091, + "grad_norm": 0.10767515003681183, + "learning_rate": 4.947694088319089e-05, + "loss": 0.0229, + "step": 1910 + }, + { + "epoch": 0.014192365689955945, + "grad_norm": 0.10170572251081467, + "learning_rate": 4.947323124406458e-05, + "loss": 0.0279, + "step": 1920 + }, + { + "epoch": 0.014266284261257799, + "grad_norm": 0.17412474751472473, + "learning_rate": 4.9469521604938275e-05, + "loss": 0.0278, + "step": 1930 + }, + { + "epoch": 0.014340202832559653, + "grad_norm": 0.10065030306577682, + "learning_rate": 4.946581196581197e-05, + "loss": 0.0221, + "step": 1940 + }, + { + "epoch": 0.014414121403861507, + "grad_norm": 0.1184917613863945, + "learning_rate": 4.946210232668566e-05, + "loss": 0.0239, + "step": 1950 + }, + { + "epoch": 0.01448803997516336, + "grad_norm": 0.12135294079780579, + "learning_rate": 4.9458392687559356e-05, + "loss": 0.0254, + "step": 1960 + }, + { + "epoch": 0.014561958546465213, + "grad_norm": 0.09968862682580948, + "learning_rate": 4.9454683048433045e-05, + "loss": 0.0265, + "step": 1970 + }, + { + "epoch": 0.014635877117767067, + "grad_norm": 0.1089881956577301, + "learning_rate": 4.945097340930674e-05, + "loss": 0.0233, + "step": 1980 + }, + { + "epoch": 0.014709795689068921, + "grad_norm": 0.09672956168651581, + "learning_rate": 4.9447263770180444e-05, + "loss": 0.0229, + "step": 1990 + }, + { + "epoch": 0.014783714260370775, + "grad_norm": 0.10003507882356644, + "learning_rate": 4.944355413105413e-05, + "loss": 0.0218, + "step": 2000 + }, + { + "epoch": 0.01485763283167263, + "grad_norm": 0.108626589179039, + "learning_rate": 4.943984449192783e-05, + "loss": 0.0241, + "step": 2010 + }, + { + "epoch": 0.014931551402974483, + "grad_norm": 0.13139499723911285, + "learning_rate": 4.9436134852801525e-05, + "loss": 0.0295, + "step": 2020 + }, + { + "epoch": 0.015005469974276337, + "grad_norm": 0.09990837424993515, + "learning_rate": 4.9432425213675214e-05, + "loss": 0.0232, + "step": 2030 + }, + { + "epoch": 0.015079388545578191, + "grad_norm": 0.10354658961296082, + "learning_rate": 4.942871557454891e-05, + "loss": 0.0262, + "step": 2040 + }, + { + "epoch": 0.015153307116880045, + "grad_norm": 0.08615817874670029, + "learning_rate": 4.94250059354226e-05, + "loss": 0.0274, + "step": 2050 + }, + { + "epoch": 0.015227225688181899, + "grad_norm": 0.1270134299993515, + "learning_rate": 4.94212962962963e-05, + "loss": 0.0224, + "step": 2060 + }, + { + "epoch": 0.015301144259483753, + "grad_norm": 0.14910046756267548, + "learning_rate": 4.941758665717e-05, + "loss": 0.0244, + "step": 2070 + }, + { + "epoch": 0.015375062830785607, + "grad_norm": 0.10157673060894012, + "learning_rate": 4.941387701804369e-05, + "loss": 0.0273, + "step": 2080 + }, + { + "epoch": 0.015448981402087461, + "grad_norm": 0.11614657193422318, + "learning_rate": 4.941016737891738e-05, + "loss": 0.024, + "step": 2090 + }, + { + "epoch": 
0.015522899973389315, + "grad_norm": 0.10795578360557556, + "learning_rate": 4.940645773979107e-05, + "loss": 0.025, + "step": 2100 + }, + { + "epoch": 0.015596818544691169, + "grad_norm": 0.095462366938591, + "learning_rate": 4.940274810066477e-05, + "loss": 0.0258, + "step": 2110 + }, + { + "epoch": 0.01567073711599302, + "grad_norm": 0.11237543076276779, + "learning_rate": 4.9399038461538464e-05, + "loss": 0.0264, + "step": 2120 + }, + { + "epoch": 0.015744655687294877, + "grad_norm": 0.10964473336935043, + "learning_rate": 4.9395328822412154e-05, + "loss": 0.0241, + "step": 2130 + }, + { + "epoch": 0.01581857425859673, + "grad_norm": 0.11441141366958618, + "learning_rate": 4.9391619183285856e-05, + "loss": 0.0281, + "step": 2140 + }, + { + "epoch": 0.015892492829898585, + "grad_norm": 0.07639171928167343, + "learning_rate": 4.9387909544159546e-05, + "loss": 0.0232, + "step": 2150 + }, + { + "epoch": 0.015966411401200437, + "grad_norm": 0.10645004361867905, + "learning_rate": 4.938419990503324e-05, + "loss": 0.0275, + "step": 2160 + }, + { + "epoch": 0.016040329972502293, + "grad_norm": 0.10596774518489838, + "learning_rate": 4.938049026590694e-05, + "loss": 0.0229, + "step": 2170 + }, + { + "epoch": 0.016114248543804145, + "grad_norm": 0.15091291069984436, + "learning_rate": 4.937678062678063e-05, + "loss": 0.0218, + "step": 2180 + }, + { + "epoch": 0.016188167115106, + "grad_norm": 0.08416672796010971, + "learning_rate": 4.937307098765432e-05, + "loss": 0.0245, + "step": 2190 + }, + { + "epoch": 0.016262085686407853, + "grad_norm": 0.11193812638521194, + "learning_rate": 4.936936134852801e-05, + "loss": 0.0244, + "step": 2200 + }, + { + "epoch": 0.016336004257709705, + "grad_norm": 0.1309780776500702, + "learning_rate": 4.9365651709401715e-05, + "loss": 0.0234, + "step": 2210 + }, + { + "epoch": 0.01640992282901156, + "grad_norm": 0.09695091843605042, + "learning_rate": 4.936194207027541e-05, + "loss": 0.024, + "step": 2220 + }, + { + "epoch": 0.016483841400313413, + "grad_norm": 0.12367673218250275, + "learning_rate": 4.93582324311491e-05, + "loss": 0.0265, + "step": 2230 + }, + { + "epoch": 0.01655775997161527, + "grad_norm": 0.08766240626573563, + "learning_rate": 4.9354522792022796e-05, + "loss": 0.0258, + "step": 2240 + }, + { + "epoch": 0.01663167854291712, + "grad_norm": 0.2597678303718567, + "learning_rate": 4.935081315289649e-05, + "loss": 0.0265, + "step": 2250 + }, + { + "epoch": 0.016705597114218977, + "grad_norm": 0.12836165726184845, + "learning_rate": 4.934710351377018e-05, + "loss": 0.0253, + "step": 2260 + }, + { + "epoch": 0.01677951568552083, + "grad_norm": 0.09131550043821335, + "learning_rate": 4.934339387464388e-05, + "loss": 0.0263, + "step": 2270 + }, + { + "epoch": 0.016853434256822685, + "grad_norm": 0.0957304835319519, + "learning_rate": 4.9339684235517566e-05, + "loss": 0.0248, + "step": 2280 + }, + { + "epoch": 0.016927352828124537, + "grad_norm": 0.08887193351984024, + "learning_rate": 4.933597459639127e-05, + "loss": 0.0269, + "step": 2290 + }, + { + "epoch": 0.017001271399426393, + "grad_norm": 0.1088196337223053, + "learning_rate": 4.9332264957264965e-05, + "loss": 0.0229, + "step": 2300 + }, + { + "epoch": 0.017075189970728245, + "grad_norm": 0.10806940495967865, + "learning_rate": 4.9328555318138654e-05, + "loss": 0.024, + "step": 2310 + }, + { + "epoch": 0.0171491085420301, + "grad_norm": 0.11325336247682571, + "learning_rate": 4.932484567901235e-05, + "loss": 0.0256, + "step": 2320 + }, + { + "epoch": 0.017223027113331953, + "grad_norm": 
0.08867617696523666, + "learning_rate": 4.932113603988604e-05, + "loss": 0.0239, + "step": 2330 + }, + { + "epoch": 0.01729694568463381, + "grad_norm": 0.08839539438486099, + "learning_rate": 4.9317426400759735e-05, + "loss": 0.0232, + "step": 2340 + }, + { + "epoch": 0.01737086425593566, + "grad_norm": 0.09393464028835297, + "learning_rate": 4.931371676163343e-05, + "loss": 0.0272, + "step": 2350 + }, + { + "epoch": 0.017444782827237514, + "grad_norm": 0.14300134778022766, + "learning_rate": 4.931000712250713e-05, + "loss": 0.0287, + "step": 2360 + }, + { + "epoch": 0.01751870139853937, + "grad_norm": 0.10064567625522614, + "learning_rate": 4.930629748338082e-05, + "loss": 0.0239, + "step": 2370 + }, + { + "epoch": 0.01759261996984122, + "grad_norm": 0.08626890927553177, + "learning_rate": 4.930258784425451e-05, + "loss": 0.0239, + "step": 2380 + }, + { + "epoch": 0.017666538541143077, + "grad_norm": 0.08982729911804199, + "learning_rate": 4.929887820512821e-05, + "loss": 0.0232, + "step": 2390 + }, + { + "epoch": 0.01774045711244493, + "grad_norm": 0.11209940910339355, + "learning_rate": 4.9295168566001904e-05, + "loss": 0.027, + "step": 2400 + }, + { + "epoch": 0.017814375683746785, + "grad_norm": 0.09284085780382156, + "learning_rate": 4.929145892687559e-05, + "loss": 0.0265, + "step": 2410 + }, + { + "epoch": 0.017888294255048637, + "grad_norm": 0.11055232584476471, + "learning_rate": 4.928774928774929e-05, + "loss": 0.0229, + "step": 2420 + }, + { + "epoch": 0.017962212826350493, + "grad_norm": 0.08943367004394531, + "learning_rate": 4.928403964862298e-05, + "loss": 0.0243, + "step": 2430 + }, + { + "epoch": 0.018036131397652345, + "grad_norm": 0.08455714583396912, + "learning_rate": 4.928033000949668e-05, + "loss": 0.0227, + "step": 2440 + }, + { + "epoch": 0.0181100499689542, + "grad_norm": 0.08124112337827682, + "learning_rate": 4.927662037037038e-05, + "loss": 0.0241, + "step": 2450 + }, + { + "epoch": 0.018183968540256053, + "grad_norm": 0.0854414626955986, + "learning_rate": 4.9272910731244066e-05, + "loss": 0.0234, + "step": 2460 + }, + { + "epoch": 0.01825788711155791, + "grad_norm": 0.10405626893043518, + "learning_rate": 4.926920109211776e-05, + "loss": 0.023, + "step": 2470 + }, + { + "epoch": 0.01833180568285976, + "grad_norm": 0.10854543745517731, + "learning_rate": 4.926549145299146e-05, + "loss": 0.025, + "step": 2480 + }, + { + "epoch": 0.018405724254161617, + "grad_norm": 0.14602318406105042, + "learning_rate": 4.926178181386515e-05, + "loss": 0.0238, + "step": 2490 + }, + { + "epoch": 0.01847964282546347, + "grad_norm": 0.11568371206521988, + "learning_rate": 4.9258072174738843e-05, + "loss": 0.0239, + "step": 2500 + }, + { + "epoch": 0.01855356139676532, + "grad_norm": 0.10612810403108597, + "learning_rate": 4.925436253561253e-05, + "loss": 0.0264, + "step": 2510 + }, + { + "epoch": 0.018627479968067177, + "grad_norm": 0.12310179322957993, + "learning_rate": 4.9250652896486235e-05, + "loss": 0.0229, + "step": 2520 + }, + { + "epoch": 0.01870139853936903, + "grad_norm": 0.12349710613489151, + "learning_rate": 4.924694325735993e-05, + "loss": 0.0252, + "step": 2530 + }, + { + "epoch": 0.018775317110670885, + "grad_norm": 0.09072163701057434, + "learning_rate": 4.924323361823362e-05, + "loss": 0.0234, + "step": 2540 + }, + { + "epoch": 0.018849235681972738, + "grad_norm": 0.11435705423355103, + "learning_rate": 4.9239523979107316e-05, + "loss": 0.0253, + "step": 2550 + }, + { + "epoch": 0.018923154253274593, + "grad_norm": 0.12449908256530762, + "learning_rate": 
4.9235814339981006e-05, + "loss": 0.0257, + "step": 2560 + }, + { + "epoch": 0.018997072824576446, + "grad_norm": 0.103098563849926, + "learning_rate": 4.92321047008547e-05, + "loss": 0.0257, + "step": 2570 + }, + { + "epoch": 0.0190709913958783, + "grad_norm": 0.13323761522769928, + "learning_rate": 4.92283950617284e-05, + "loss": 0.0265, + "step": 2580 + }, + { + "epoch": 0.019144909967180154, + "grad_norm": 0.08810979127883911, + "learning_rate": 4.9224685422602094e-05, + "loss": 0.0256, + "step": 2590 + }, + { + "epoch": 0.01921882853848201, + "grad_norm": 0.0921526849269867, + "learning_rate": 4.922097578347579e-05, + "loss": 0.0251, + "step": 2600 + }, + { + "epoch": 0.01929274710978386, + "grad_norm": 0.11148613691329956, + "learning_rate": 4.921726614434948e-05, + "loss": 0.0264, + "step": 2610 + }, + { + "epoch": 0.019366665681085717, + "grad_norm": 0.10836231708526611, + "learning_rate": 4.9213556505223175e-05, + "loss": 0.0206, + "step": 2620 + }, + { + "epoch": 0.01944058425238757, + "grad_norm": 0.07907849550247192, + "learning_rate": 4.920984686609687e-05, + "loss": 0.0239, + "step": 2630 + }, + { + "epoch": 0.019514502823689425, + "grad_norm": 0.10812706500291824, + "learning_rate": 4.920613722697056e-05, + "loss": 0.0237, + "step": 2640 + }, + { + "epoch": 0.019588421394991277, + "grad_norm": 0.12056887894868851, + "learning_rate": 4.9202427587844256e-05, + "loss": 0.0274, + "step": 2650 + }, + { + "epoch": 0.019662339966293133, + "grad_norm": 0.10785806179046631, + "learning_rate": 4.9198717948717945e-05, + "loss": 0.0258, + "step": 2660 + }, + { + "epoch": 0.019736258537594985, + "grad_norm": 0.12284066528081894, + "learning_rate": 4.919500830959165e-05, + "loss": 0.0254, + "step": 2670 + }, + { + "epoch": 0.019810177108896838, + "grad_norm": 0.08642525970935822, + "learning_rate": 4.9191298670465344e-05, + "loss": 0.0256, + "step": 2680 + }, + { + "epoch": 0.019884095680198693, + "grad_norm": 0.10404020547866821, + "learning_rate": 4.918758903133903e-05, + "loss": 0.0234, + "step": 2690 + }, + { + "epoch": 0.019958014251500546, + "grad_norm": 0.11959104984998703, + "learning_rate": 4.918387939221273e-05, + "loss": 0.0254, + "step": 2700 + }, + { + "epoch": 0.0200319328228024, + "grad_norm": 0.12476497888565063, + "learning_rate": 4.9180169753086425e-05, + "loss": 0.0248, + "step": 2710 + }, + { + "epoch": 0.020105851394104254, + "grad_norm": 0.13198316097259521, + "learning_rate": 4.9176460113960114e-05, + "loss": 0.0241, + "step": 2720 + }, + { + "epoch": 0.02017976996540611, + "grad_norm": 0.08732737600803375, + "learning_rate": 4.917275047483381e-05, + "loss": 0.0259, + "step": 2730 + }, + { + "epoch": 0.02025368853670796, + "grad_norm": 0.11259639263153076, + "learning_rate": 4.9169040835707506e-05, + "loss": 0.0238, + "step": 2740 + }, + { + "epoch": 0.020327607108009817, + "grad_norm": 0.09043506532907486, + "learning_rate": 4.91653311965812e-05, + "loss": 0.0213, + "step": 2750 + }, + { + "epoch": 0.02040152567931167, + "grad_norm": 0.10611968487501144, + "learning_rate": 4.91616215574549e-05, + "loss": 0.0249, + "step": 2760 + }, + { + "epoch": 0.020475444250613525, + "grad_norm": 0.10952197015285492, + "learning_rate": 4.915791191832859e-05, + "loss": 0.0281, + "step": 2770 + }, + { + "epoch": 0.020549362821915378, + "grad_norm": 0.1053660586476326, + "learning_rate": 4.915420227920228e-05, + "loss": 0.027, + "step": 2780 + }, + { + "epoch": 0.020623281393217233, + "grad_norm": 0.14103762805461884, + "learning_rate": 4.915049264007597e-05, + "loss": 0.0242, + 
"step": 2790 + }, + { + "epoch": 0.020697199964519086, + "grad_norm": 0.1869761049747467, + "learning_rate": 4.914678300094967e-05, + "loss": 0.0244, + "step": 2800 + }, + { + "epoch": 0.02077111853582094, + "grad_norm": 0.1109655573964119, + "learning_rate": 4.9143073361823364e-05, + "loss": 0.0252, + "step": 2810 + }, + { + "epoch": 0.020845037107122794, + "grad_norm": 0.08841854333877563, + "learning_rate": 4.913936372269706e-05, + "loss": 0.0249, + "step": 2820 + }, + { + "epoch": 0.020918955678424646, + "grad_norm": 0.11417660862207413, + "learning_rate": 4.9135654083570756e-05, + "loss": 0.0263, + "step": 2830 + }, + { + "epoch": 0.0209928742497265, + "grad_norm": 0.13581736385822296, + "learning_rate": 4.9131944444444445e-05, + "loss": 0.0245, + "step": 2840 + }, + { + "epoch": 0.021066792821028354, + "grad_norm": 0.10360286384820938, + "learning_rate": 4.912823480531814e-05, + "loss": 0.0241, + "step": 2850 + }, + { + "epoch": 0.02114071139233021, + "grad_norm": 0.11725900322198868, + "learning_rate": 4.912452516619184e-05, + "loss": 0.0229, + "step": 2860 + }, + { + "epoch": 0.021214629963632062, + "grad_norm": 0.08907769620418549, + "learning_rate": 4.9120815527065526e-05, + "loss": 0.0214, + "step": 2870 + }, + { + "epoch": 0.021288548534933917, + "grad_norm": 0.0941435769200325, + "learning_rate": 4.911710588793922e-05, + "loss": 0.0261, + "step": 2880 + }, + { + "epoch": 0.02136246710623577, + "grad_norm": 0.14143934845924377, + "learning_rate": 4.911339624881292e-05, + "loss": 0.0232, + "step": 2890 + }, + { + "epoch": 0.021436385677537625, + "grad_norm": 0.1258591115474701, + "learning_rate": 4.9109686609686614e-05, + "loss": 0.0223, + "step": 2900 + }, + { + "epoch": 0.021510304248839478, + "grad_norm": 0.12183864414691925, + "learning_rate": 4.910597697056031e-05, + "loss": 0.0257, + "step": 2910 + }, + { + "epoch": 0.021584222820141333, + "grad_norm": 0.1272670328617096, + "learning_rate": 4.9102267331434e-05, + "loss": 0.0212, + "step": 2920 + }, + { + "epoch": 0.021658141391443186, + "grad_norm": 0.11030561476945877, + "learning_rate": 4.9098557692307695e-05, + "loss": 0.023, + "step": 2930 + }, + { + "epoch": 0.02173205996274504, + "grad_norm": 0.09155961871147156, + "learning_rate": 4.909484805318139e-05, + "loss": 0.0231, + "step": 2940 + }, + { + "epoch": 0.021805978534046894, + "grad_norm": 0.09755866974592209, + "learning_rate": 4.909113841405508e-05, + "loss": 0.0237, + "step": 2950 + }, + { + "epoch": 0.02187989710534875, + "grad_norm": 0.1016668975353241, + "learning_rate": 4.9087428774928777e-05, + "loss": 0.0262, + "step": 2960 + }, + { + "epoch": 0.0219538156766506, + "grad_norm": 0.08545517176389694, + "learning_rate": 4.908371913580247e-05, + "loss": 0.0234, + "step": 2970 + }, + { + "epoch": 0.022027734247952454, + "grad_norm": 0.11458541452884674, + "learning_rate": 4.908000949667617e-05, + "loss": 0.0265, + "step": 2980 + }, + { + "epoch": 0.02210165281925431, + "grad_norm": 0.09427709132432938, + "learning_rate": 4.9076299857549864e-05, + "loss": 0.0257, + "step": 2990 + }, + { + "epoch": 0.022175571390556162, + "grad_norm": 0.10649760067462921, + "learning_rate": 4.9072590218423554e-05, + "loss": 0.0264, + "step": 3000 + }, + { + "epoch": 0.022249489961858018, + "grad_norm": 0.09446646273136139, + "learning_rate": 4.906888057929725e-05, + "loss": 0.0227, + "step": 3010 + }, + { + "epoch": 0.02232340853315987, + "grad_norm": 0.10109565407037735, + "learning_rate": 4.906517094017094e-05, + "loss": 0.0261, + "step": 3020 + }, + { + "epoch": 
0.022397327104461726, + "grad_norm": 0.1348312348127365, + "learning_rate": 4.9061461301044635e-05, + "loss": 0.0239, + "step": 3030 + }, + { + "epoch": 0.022471245675763578, + "grad_norm": 0.08773966878652573, + "learning_rate": 4.905775166191833e-05, + "loss": 0.0236, + "step": 3040 + }, + { + "epoch": 0.022545164247065434, + "grad_norm": 0.11004015058279037, + "learning_rate": 4.905404202279203e-05, + "loss": 0.023, + "step": 3050 + }, + { + "epoch": 0.022619082818367286, + "grad_norm": 0.10679835081100464, + "learning_rate": 4.905033238366572e-05, + "loss": 0.0239, + "step": 3060 + }, + { + "epoch": 0.02269300138966914, + "grad_norm": 0.11770069599151611, + "learning_rate": 4.904662274453941e-05, + "loss": 0.0259, + "step": 3070 + }, + { + "epoch": 0.022766919960970994, + "grad_norm": 0.11873900145292282, + "learning_rate": 4.904291310541311e-05, + "loss": 0.0249, + "step": 3080 + }, + { + "epoch": 0.02284083853227285, + "grad_norm": 0.11943815648555756, + "learning_rate": 4.9039203466286804e-05, + "loss": 0.0242, + "step": 3090 + }, + { + "epoch": 0.022914757103574702, + "grad_norm": 0.1334373950958252, + "learning_rate": 4.903549382716049e-05, + "loss": 0.0258, + "step": 3100 + }, + { + "epoch": 0.022988675674876557, + "grad_norm": 0.13952457904815674, + "learning_rate": 4.903178418803419e-05, + "loss": 0.0257, + "step": 3110 + }, + { + "epoch": 0.02306259424617841, + "grad_norm": 0.07053710520267487, + "learning_rate": 4.9028074548907885e-05, + "loss": 0.0226, + "step": 3120 + }, + { + "epoch": 0.023136512817480265, + "grad_norm": 0.11949121952056885, + "learning_rate": 4.902436490978158e-05, + "loss": 0.0276, + "step": 3130 + }, + { + "epoch": 0.023210431388782118, + "grad_norm": 0.11576800048351288, + "learning_rate": 4.902065527065528e-05, + "loss": 0.0247, + "step": 3140 + }, + { + "epoch": 0.02328434996008397, + "grad_norm": 0.09908673912286758, + "learning_rate": 4.9016945631528966e-05, + "loss": 0.0266, + "step": 3150 + }, + { + "epoch": 0.023358268531385826, + "grad_norm": 0.1006418839097023, + "learning_rate": 4.901323599240266e-05, + "loss": 0.0245, + "step": 3160 + }, + { + "epoch": 0.023432187102687678, + "grad_norm": 0.09430541843175888, + "learning_rate": 4.900952635327636e-05, + "loss": 0.0241, + "step": 3170 + }, + { + "epoch": 0.023506105673989534, + "grad_norm": 0.08641228079795837, + "learning_rate": 4.900581671415005e-05, + "loss": 0.0244, + "step": 3180 + }, + { + "epoch": 0.023580024245291386, + "grad_norm": 0.11231532692909241, + "learning_rate": 4.900210707502374e-05, + "loss": 0.0263, + "step": 3190 + }, + { + "epoch": 0.02365394281659324, + "grad_norm": 0.08847200870513916, + "learning_rate": 4.899839743589744e-05, + "loss": 0.023, + "step": 3200 + }, + { + "epoch": 0.023727861387895094, + "grad_norm": 0.08235383033752441, + "learning_rate": 4.8994687796771135e-05, + "loss": 0.0207, + "step": 3210 + }, + { + "epoch": 0.02380177995919695, + "grad_norm": 0.07484035193920135, + "learning_rate": 4.899097815764483e-05, + "loss": 0.0202, + "step": 3220 + }, + { + "epoch": 0.023875698530498802, + "grad_norm": 0.08688723295927048, + "learning_rate": 4.898726851851852e-05, + "loss": 0.0226, + "step": 3230 + }, + { + "epoch": 0.023949617101800658, + "grad_norm": 0.11297879368066788, + "learning_rate": 4.8983558879392216e-05, + "loss": 0.0257, + "step": 3240 + }, + { + "epoch": 0.02402353567310251, + "grad_norm": 0.08175697922706604, + "learning_rate": 4.8979849240265905e-05, + "loss": 0.0226, + "step": 3250 + }, + { + "epoch": 0.024097454244404366, + "grad_norm": 
0.10165733844041824, + "learning_rate": 4.89761396011396e-05, + "loss": 0.023, + "step": 3260 + }, + { + "epoch": 0.024171372815706218, + "grad_norm": 0.10624788701534271, + "learning_rate": 4.89724299620133e-05, + "loss": 0.0243, + "step": 3270 + }, + { + "epoch": 0.024245291387008074, + "grad_norm": 0.11952788382768631, + "learning_rate": 4.896872032288699e-05, + "loss": 0.0229, + "step": 3280 + }, + { + "epoch": 0.024319209958309926, + "grad_norm": 0.0901573970913887, + "learning_rate": 4.896501068376069e-05, + "loss": 0.023, + "step": 3290 + }, + { + "epoch": 0.024393128529611778, + "grad_norm": 0.13301552832126617, + "learning_rate": 4.896130104463438e-05, + "loss": 0.0259, + "step": 3300 + }, + { + "epoch": 0.024467047100913634, + "grad_norm": 0.11203417927026749, + "learning_rate": 4.8957591405508074e-05, + "loss": 0.0234, + "step": 3310 + }, + { + "epoch": 0.024540965672215486, + "grad_norm": 0.09428996592760086, + "learning_rate": 4.895388176638177e-05, + "loss": 0.0236, + "step": 3320 + }, + { + "epoch": 0.024614884243517342, + "grad_norm": 0.11264640092849731, + "learning_rate": 4.895017212725546e-05, + "loss": 0.0217, + "step": 3330 + }, + { + "epoch": 0.024688802814819194, + "grad_norm": 0.14519962668418884, + "learning_rate": 4.8946462488129156e-05, + "loss": 0.024, + "step": 3340 + }, + { + "epoch": 0.02476272138612105, + "grad_norm": 0.0921749472618103, + "learning_rate": 4.894275284900285e-05, + "loss": 0.0253, + "step": 3350 + }, + { + "epoch": 0.024836639957422902, + "grad_norm": 0.0954316258430481, + "learning_rate": 4.893904320987655e-05, + "loss": 0.0219, + "step": 3360 + }, + { + "epoch": 0.024910558528724758, + "grad_norm": 0.10995155572891235, + "learning_rate": 4.8935333570750243e-05, + "loss": 0.0215, + "step": 3370 + }, + { + "epoch": 0.02498447710002661, + "grad_norm": 0.10263795405626297, + "learning_rate": 4.893162393162393e-05, + "loss": 0.021, + "step": 3380 + }, + { + "epoch": 0.025058395671328466, + "grad_norm": 0.15626676380634308, + "learning_rate": 4.892791429249763e-05, + "loss": 0.0228, + "step": 3390 + }, + { + "epoch": 0.025132314242630318, + "grad_norm": 0.10327065736055374, + "learning_rate": 4.8924204653371325e-05, + "loss": 0.0216, + "step": 3400 + }, + { + "epoch": 0.025206232813932174, + "grad_norm": 0.10524246096611023, + "learning_rate": 4.8920495014245014e-05, + "loss": 0.0248, + "step": 3410 + }, + { + "epoch": 0.025280151385234026, + "grad_norm": 0.0885235071182251, + "learning_rate": 4.891678537511871e-05, + "loss": 0.0227, + "step": 3420 + }, + { + "epoch": 0.02535406995653588, + "grad_norm": 0.11894160509109497, + "learning_rate": 4.8913075735992406e-05, + "loss": 0.0244, + "step": 3430 + }, + { + "epoch": 0.025427988527837734, + "grad_norm": 0.08932094275951385, + "learning_rate": 4.89093660968661e-05, + "loss": 0.0224, + "step": 3440 + }, + { + "epoch": 0.025501907099139586, + "grad_norm": 0.12240859866142273, + "learning_rate": 4.89056564577398e-05, + "loss": 0.0218, + "step": 3450 + }, + { + "epoch": 0.025575825670441442, + "grad_norm": 0.10221285372972488, + "learning_rate": 4.890194681861349e-05, + "loss": 0.021, + "step": 3460 + }, + { + "epoch": 0.025649744241743294, + "grad_norm": 0.13738150894641876, + "learning_rate": 4.889823717948718e-05, + "loss": 0.0253, + "step": 3470 + }, + { + "epoch": 0.02572366281304515, + "grad_norm": 0.10425925999879837, + "learning_rate": 4.889452754036087e-05, + "loss": 0.0236, + "step": 3480 + }, + { + "epoch": 0.025797581384347002, + "grad_norm": 0.16000455617904663, + "learning_rate": 
4.889081790123457e-05, + "loss": 0.0259, + "step": 3490 + }, + { + "epoch": 0.025871499955648858, + "grad_norm": 0.10048896074295044, + "learning_rate": 4.8887108262108264e-05, + "loss": 0.0227, + "step": 3500 + }, + { + "epoch": 0.02594541852695071, + "grad_norm": 0.10501318424940109, + "learning_rate": 4.888339862298196e-05, + "loss": 0.025, + "step": 3510 + }, + { + "epoch": 0.026019337098252566, + "grad_norm": 0.11101071536540985, + "learning_rate": 4.8879688983855656e-05, + "loss": 0.0231, + "step": 3520 + }, + { + "epoch": 0.026093255669554418, + "grad_norm": 0.09944774210453033, + "learning_rate": 4.8875979344729345e-05, + "loss": 0.0218, + "step": 3530 + }, + { + "epoch": 0.026167174240856274, + "grad_norm": 0.10541743040084839, + "learning_rate": 4.887226970560304e-05, + "loss": 0.0235, + "step": 3540 + }, + { + "epoch": 0.026241092812158126, + "grad_norm": 0.11680968105792999, + "learning_rate": 4.886856006647674e-05, + "loss": 0.0247, + "step": 3550 + }, + { + "epoch": 0.026315011383459982, + "grad_norm": 0.08649037778377533, + "learning_rate": 4.8864850427350426e-05, + "loss": 0.0219, + "step": 3560 + }, + { + "epoch": 0.026388929954761834, + "grad_norm": 0.09611788392066956, + "learning_rate": 4.886114078822412e-05, + "loss": 0.0249, + "step": 3570 + }, + { + "epoch": 0.02646284852606369, + "grad_norm": 0.09479816257953644, + "learning_rate": 4.885743114909782e-05, + "loss": 0.0223, + "step": 3580 + }, + { + "epoch": 0.026536767097365542, + "grad_norm": 0.1077667772769928, + "learning_rate": 4.8853721509971514e-05, + "loss": 0.0229, + "step": 3590 + }, + { + "epoch": 0.026610685668667394, + "grad_norm": 0.12021303921937943, + "learning_rate": 4.885001187084521e-05, + "loss": 0.023, + "step": 3600 + }, + { + "epoch": 0.02668460423996925, + "grad_norm": 0.10672514885663986, + "learning_rate": 4.88463022317189e-05, + "loss": 0.0204, + "step": 3610 + }, + { + "epoch": 0.026758522811271102, + "grad_norm": 0.11359500885009766, + "learning_rate": 4.8842592592592595e-05, + "loss": 0.0256, + "step": 3620 + }, + { + "epoch": 0.026832441382572958, + "grad_norm": 0.1413438767194748, + "learning_rate": 4.883888295346629e-05, + "loss": 0.0275, + "step": 3630 + }, + { + "epoch": 0.02690635995387481, + "grad_norm": 0.10484588891267776, + "learning_rate": 4.883517331433998e-05, + "loss": 0.0279, + "step": 3640 + }, + { + "epoch": 0.026980278525176666, + "grad_norm": 0.10619416832923889, + "learning_rate": 4.8831463675213676e-05, + "loss": 0.0245, + "step": 3650 + }, + { + "epoch": 0.027054197096478518, + "grad_norm": 0.11082316190004349, + "learning_rate": 4.882775403608737e-05, + "loss": 0.0226, + "step": 3660 + }, + { + "epoch": 0.027128115667780374, + "grad_norm": 0.1237013041973114, + "learning_rate": 4.882404439696107e-05, + "loss": 0.0222, + "step": 3670 + }, + { + "epoch": 0.027202034239082226, + "grad_norm": 0.16743157804012299, + "learning_rate": 4.8820334757834764e-05, + "loss": 0.0243, + "step": 3680 + }, + { + "epoch": 0.027275952810384082, + "grad_norm": 0.10074224323034286, + "learning_rate": 4.8816625118708453e-05, + "loss": 0.0252, + "step": 3690 + }, + { + "epoch": 0.027349871381685934, + "grad_norm": 0.10610511898994446, + "learning_rate": 4.881291547958215e-05, + "loss": 0.0234, + "step": 3700 + }, + { + "epoch": 0.02742378995298779, + "grad_norm": 0.09411498159170151, + "learning_rate": 4.880920584045584e-05, + "loss": 0.0253, + "step": 3710 + }, + { + "epoch": 0.027497708524289642, + "grad_norm": 0.12062369287014008, + "learning_rate": 4.8805496201329535e-05, + "loss": 
0.0268, + "step": 3720 + }, + { + "epoch": 0.027571627095591498, + "grad_norm": 0.12509259581565857, + "learning_rate": 4.880178656220323e-05, + "loss": 0.0228, + "step": 3730 + }, + { + "epoch": 0.02764554566689335, + "grad_norm": 0.11746224761009216, + "learning_rate": 4.8798076923076926e-05, + "loss": 0.0253, + "step": 3740 + }, + { + "epoch": 0.027719464238195206, + "grad_norm": 0.08255576342344284, + "learning_rate": 4.879436728395062e-05, + "loss": 0.0211, + "step": 3750 + }, + { + "epoch": 0.027793382809497058, + "grad_norm": 0.13874807953834534, + "learning_rate": 4.879065764482431e-05, + "loss": 0.0258, + "step": 3760 + }, + { + "epoch": 0.02786730138079891, + "grad_norm": 0.1162390410900116, + "learning_rate": 4.878694800569801e-05, + "loss": 0.0219, + "step": 3770 + }, + { + "epoch": 0.027941219952100766, + "grad_norm": 0.08504689484834671, + "learning_rate": 4.8783238366571704e-05, + "loss": 0.0251, + "step": 3780 + }, + { + "epoch": 0.02801513852340262, + "grad_norm": 0.12767374515533447, + "learning_rate": 4.877952872744539e-05, + "loss": 0.0226, + "step": 3790 + }, + { + "epoch": 0.028089057094704474, + "grad_norm": 0.09197654575109482, + "learning_rate": 4.877581908831909e-05, + "loss": 0.0213, + "step": 3800 + }, + { + "epoch": 0.028162975666006326, + "grad_norm": 0.08843453228473663, + "learning_rate": 4.8772109449192785e-05, + "loss": 0.0225, + "step": 3810 + }, + { + "epoch": 0.028236894237308182, + "grad_norm": 0.13571232557296753, + "learning_rate": 4.876839981006648e-05, + "loss": 0.0226, + "step": 3820 + }, + { + "epoch": 0.028310812808610034, + "grad_norm": 0.09555158764123917, + "learning_rate": 4.876469017094018e-05, + "loss": 0.0234, + "step": 3830 + }, + { + "epoch": 0.02838473137991189, + "grad_norm": 0.07628829032182693, + "learning_rate": 4.8760980531813866e-05, + "loss": 0.0219, + "step": 3840 + }, + { + "epoch": 0.028458649951213742, + "grad_norm": 0.11912820488214493, + "learning_rate": 4.875727089268756e-05, + "loss": 0.0214, + "step": 3850 + }, + { + "epoch": 0.028532568522515598, + "grad_norm": 0.09438012540340424, + "learning_rate": 4.875356125356126e-05, + "loss": 0.0217, + "step": 3860 + }, + { + "epoch": 0.02860648709381745, + "grad_norm": 0.09157334268093109, + "learning_rate": 4.874985161443495e-05, + "loss": 0.026, + "step": 3870 + }, + { + "epoch": 0.028680405665119306, + "grad_norm": 0.08945705741643906, + "learning_rate": 4.874614197530864e-05, + "loss": 0.026, + "step": 3880 + }, + { + "epoch": 0.028754324236421158, + "grad_norm": 0.10933069884777069, + "learning_rate": 4.874243233618234e-05, + "loss": 0.0242, + "step": 3890 + }, + { + "epoch": 0.028828242807723014, + "grad_norm": 0.12087175250053406, + "learning_rate": 4.8738722697056035e-05, + "loss": 0.0259, + "step": 3900 + }, + { + "epoch": 0.028902161379024866, + "grad_norm": 0.11087092012166977, + "learning_rate": 4.873501305792973e-05, + "loss": 0.0242, + "step": 3910 + }, + { + "epoch": 0.02897607995032672, + "grad_norm": 0.08969036489725113, + "learning_rate": 4.873130341880342e-05, + "loss": 0.024, + "step": 3920 + }, + { + "epoch": 0.029049998521628574, + "grad_norm": 0.08746183663606644, + "learning_rate": 4.8727593779677116e-05, + "loss": 0.0238, + "step": 3930 + }, + { + "epoch": 0.029123917092930426, + "grad_norm": 0.09499067813158035, + "learning_rate": 4.8723884140550805e-05, + "loss": 0.0256, + "step": 3940 + }, + { + "epoch": 0.029197835664232282, + "grad_norm": 0.09479714184999466, + "learning_rate": 4.87201745014245e-05, + "loss": 0.0254, + "step": 3950 + }, + { + 
"epoch": 0.029271754235534134, + "grad_norm": 0.07698513567447662, + "learning_rate": 4.87164648622982e-05, + "loss": 0.0246, + "step": 3960 + }, + { + "epoch": 0.02934567280683599, + "grad_norm": 0.08836125582456589, + "learning_rate": 4.871275522317189e-05, + "loss": 0.0229, + "step": 3970 + }, + { + "epoch": 0.029419591378137842, + "grad_norm": 0.10476455837488174, + "learning_rate": 4.870904558404559e-05, + "loss": 0.0226, + "step": 3980 + }, + { + "epoch": 0.029493509949439698, + "grad_norm": 0.12344580888748169, + "learning_rate": 4.870533594491928e-05, + "loss": 0.0249, + "step": 3990 + }, + { + "epoch": 0.02956742852074155, + "grad_norm": 0.09737499803304672, + "learning_rate": 4.8701626305792974e-05, + "loss": 0.0227, + "step": 4000 + }, + { + "epoch": 0.029641347092043406, + "grad_norm": 0.11230745911598206, + "learning_rate": 4.869791666666667e-05, + "loss": 0.0248, + "step": 4010 + }, + { + "epoch": 0.02971526566334526, + "grad_norm": 0.11428452283143997, + "learning_rate": 4.869420702754036e-05, + "loss": 0.0245, + "step": 4020 + }, + { + "epoch": 0.029789184234647114, + "grad_norm": 0.09732426702976227, + "learning_rate": 4.8690497388414055e-05, + "loss": 0.0267, + "step": 4030 + }, + { + "epoch": 0.029863102805948966, + "grad_norm": 0.08766438066959381, + "learning_rate": 4.868678774928775e-05, + "loss": 0.0236, + "step": 4040 + }, + { + "epoch": 0.029937021377250822, + "grad_norm": 0.09755643457174301, + "learning_rate": 4.868307811016145e-05, + "loss": 0.025, + "step": 4050 + }, + { + "epoch": 0.030010939948552674, + "grad_norm": 0.11232589185237885, + "learning_rate": 4.867936847103514e-05, + "loss": 0.0254, + "step": 4060 + }, + { + "epoch": 0.030084858519854527, + "grad_norm": 0.08921632915735245, + "learning_rate": 4.867565883190883e-05, + "loss": 0.0228, + "step": 4070 + }, + { + "epoch": 0.030158777091156382, + "grad_norm": 0.19874423742294312, + "learning_rate": 4.867194919278253e-05, + "loss": 0.0239, + "step": 4080 + }, + { + "epoch": 0.030232695662458235, + "grad_norm": 0.09998581558465958, + "learning_rate": 4.8668239553656224e-05, + "loss": 0.0241, + "step": 4090 + }, + { + "epoch": 0.03030661423376009, + "grad_norm": 0.09771718829870224, + "learning_rate": 4.8664529914529914e-05, + "loss": 0.0224, + "step": 4100 + }, + { + "epoch": 0.030380532805061942, + "grad_norm": 0.11452648788690567, + "learning_rate": 4.866082027540361e-05, + "loss": 0.0219, + "step": 4110 + }, + { + "epoch": 0.030454451376363798, + "grad_norm": 0.14325961470603943, + "learning_rate": 4.8657110636277305e-05, + "loss": 0.0257, + "step": 4120 + }, + { + "epoch": 0.03052836994766565, + "grad_norm": 0.11013491451740265, + "learning_rate": 4.8653400997151e-05, + "loss": 0.0249, + "step": 4130 + }, + { + "epoch": 0.030602288518967506, + "grad_norm": 0.10259222984313965, + "learning_rate": 4.86496913580247e-05, + "loss": 0.0238, + "step": 4140 + }, + { + "epoch": 0.03067620709026936, + "grad_norm": 0.08032498508691788, + "learning_rate": 4.8645981718898387e-05, + "loss": 0.0244, + "step": 4150 + }, + { + "epoch": 0.030750125661571214, + "grad_norm": 0.09167686849832535, + "learning_rate": 4.864227207977208e-05, + "loss": 0.0236, + "step": 4160 + }, + { + "epoch": 0.030824044232873066, + "grad_norm": 0.11132217943668365, + "learning_rate": 4.863856244064577e-05, + "loss": 0.024, + "step": 4170 + }, + { + "epoch": 0.030897962804174922, + "grad_norm": 0.09461288899183273, + "learning_rate": 4.863485280151947e-05, + "loss": 0.0221, + "step": 4180 + }, + { + "epoch": 0.030971881375476774, + 
"grad_norm": 0.1043187603354454, + "learning_rate": 4.8631143162393164e-05, + "loss": 0.0212, + "step": 4190 + }, + { + "epoch": 0.03104579994677863, + "grad_norm": 0.08882828056812286, + "learning_rate": 4.862743352326686e-05, + "loss": 0.0239, + "step": 4200 + }, + { + "epoch": 0.031119718518080482, + "grad_norm": 0.11656889319419861, + "learning_rate": 4.8623723884140556e-05, + "loss": 0.0212, + "step": 4210 + }, + { + "epoch": 0.031193637089382338, + "grad_norm": 0.09930509328842163, + "learning_rate": 4.8620014245014245e-05, + "loss": 0.0243, + "step": 4220 + }, + { + "epoch": 0.03126755566068419, + "grad_norm": 0.11788803339004517, + "learning_rate": 4.861630460588794e-05, + "loss": 0.0232, + "step": 4230 + }, + { + "epoch": 0.03134147423198604, + "grad_norm": 0.13846427202224731, + "learning_rate": 4.861259496676164e-05, + "loss": 0.0229, + "step": 4240 + }, + { + "epoch": 0.031415392803287895, + "grad_norm": 0.10129490494728088, + "learning_rate": 4.8608885327635326e-05, + "loss": 0.023, + "step": 4250 + }, + { + "epoch": 0.031489311374589754, + "grad_norm": 0.0792374238371849, + "learning_rate": 4.860517568850902e-05, + "loss": 0.0242, + "step": 4260 + }, + { + "epoch": 0.031563229945891606, + "grad_norm": 0.1227840855717659, + "learning_rate": 4.860146604938272e-05, + "loss": 0.0256, + "step": 4270 + }, + { + "epoch": 0.03163714851719346, + "grad_norm": 0.1305595189332962, + "learning_rate": 4.8597756410256414e-05, + "loss": 0.0229, + "step": 4280 + }, + { + "epoch": 0.03171106708849531, + "grad_norm": 0.10320558398962021, + "learning_rate": 4.859404677113011e-05, + "loss": 0.0239, + "step": 4290 + }, + { + "epoch": 0.03178498565979717, + "grad_norm": 0.11386742442846298, + "learning_rate": 4.85903371320038e-05, + "loss": 0.0239, + "step": 4300 + }, + { + "epoch": 0.03185890423109902, + "grad_norm": 0.09916136413812637, + "learning_rate": 4.8586627492877495e-05, + "loss": 0.026, + "step": 4310 + }, + { + "epoch": 0.031932822802400875, + "grad_norm": 0.13368867337703705, + "learning_rate": 4.858291785375119e-05, + "loss": 0.0265, + "step": 4320 + }, + { + "epoch": 0.03200674137370273, + "grad_norm": 0.11623481661081314, + "learning_rate": 4.857920821462488e-05, + "loss": 0.026, + "step": 4330 + }, + { + "epoch": 0.032080659945004586, + "grad_norm": 0.1140202060341835, + "learning_rate": 4.8575498575498576e-05, + "loss": 0.0236, + "step": 4340 + }, + { + "epoch": 0.03215457851630644, + "grad_norm": 0.08688046783208847, + "learning_rate": 4.857178893637227e-05, + "loss": 0.0234, + "step": 4350 + }, + { + "epoch": 0.03222849708760829, + "grad_norm": 0.11289218068122864, + "learning_rate": 4.856807929724597e-05, + "loss": 0.0225, + "step": 4360 + }, + { + "epoch": 0.03230241565891014, + "grad_norm": 0.1553889960050583, + "learning_rate": 4.8564369658119664e-05, + "loss": 0.0252, + "step": 4370 + }, + { + "epoch": 0.032376334230212, + "grad_norm": 0.09674090892076492, + "learning_rate": 4.856066001899335e-05, + "loss": 0.025, + "step": 4380 + }, + { + "epoch": 0.032450252801513854, + "grad_norm": 0.10170965641736984, + "learning_rate": 4.855695037986705e-05, + "loss": 0.0228, + "step": 4390 + }, + { + "epoch": 0.032524171372815706, + "grad_norm": 0.09621470421552658, + "learning_rate": 4.855324074074074e-05, + "loss": 0.0209, + "step": 4400 + }, + { + "epoch": 0.03259808994411756, + "grad_norm": 0.1560935229063034, + "learning_rate": 4.8549531101614434e-05, + "loss": 0.0233, + "step": 4410 + }, + { + "epoch": 0.03267200851541941, + "grad_norm": 0.1017451360821724, + "learning_rate": 
4.854582146248813e-05, + "loss": 0.0243, + "step": 4420 + }, + { + "epoch": 0.03274592708672127, + "grad_norm": 0.10564211755990982, + "learning_rate": 4.8542111823361826e-05, + "loss": 0.0235, + "step": 4430 + }, + { + "epoch": 0.03281984565802312, + "grad_norm": 0.08408430963754654, + "learning_rate": 4.853840218423552e-05, + "loss": 0.0212, + "step": 4440 + }, + { + "epoch": 0.032893764229324975, + "grad_norm": 0.10621950775384903, + "learning_rate": 4.853469254510921e-05, + "loss": 0.0203, + "step": 4450 + }, + { + "epoch": 0.03296768280062683, + "grad_norm": 0.19483087956905365, + "learning_rate": 4.853098290598291e-05, + "loss": 0.023, + "step": 4460 + }, + { + "epoch": 0.033041601371928686, + "grad_norm": 0.1312873363494873, + "learning_rate": 4.85272732668566e-05, + "loss": 0.0232, + "step": 4470 + }, + { + "epoch": 0.03311551994323054, + "grad_norm": 0.10183980315923691, + "learning_rate": 4.852356362773029e-05, + "loss": 0.0216, + "step": 4480 + }, + { + "epoch": 0.03318943851453239, + "grad_norm": 0.10369880497455597, + "learning_rate": 4.851985398860399e-05, + "loss": 0.0194, + "step": 4490 + }, + { + "epoch": 0.03326335708583424, + "grad_norm": 0.10564015805721283, + "learning_rate": 4.8516144349477684e-05, + "loss": 0.0247, + "step": 4500 + }, + { + "epoch": 0.0333372756571361, + "grad_norm": 0.09465089440345764, + "learning_rate": 4.851243471035138e-05, + "loss": 0.0253, + "step": 4510 + }, + { + "epoch": 0.033411194228437954, + "grad_norm": 0.11145325005054474, + "learning_rate": 4.8508725071225076e-05, + "loss": 0.0231, + "step": 4520 + }, + { + "epoch": 0.03348511279973981, + "grad_norm": 0.09158811718225479, + "learning_rate": 4.8505015432098766e-05, + "loss": 0.0232, + "step": 4530 + }, + { + "epoch": 0.03355903137104166, + "grad_norm": 0.10896454751491547, + "learning_rate": 4.850130579297246e-05, + "loss": 0.0224, + "step": 4540 + }, + { + "epoch": 0.03363294994234351, + "grad_norm": 0.09764958918094635, + "learning_rate": 4.849759615384616e-05, + "loss": 0.0251, + "step": 4550 + }, + { + "epoch": 0.03370686851364537, + "grad_norm": 0.16335123777389526, + "learning_rate": 4.849388651471985e-05, + "loss": 0.0241, + "step": 4560 + }, + { + "epoch": 0.03378078708494722, + "grad_norm": 0.09329704940319061, + "learning_rate": 4.849017687559354e-05, + "loss": 0.0234, + "step": 4570 + }, + { + "epoch": 0.033854705656249075, + "grad_norm": 0.11753322929143906, + "learning_rate": 4.848646723646724e-05, + "loss": 0.0202, + "step": 4580 + }, + { + "epoch": 0.03392862422755093, + "grad_norm": 0.12258260697126389, + "learning_rate": 4.8482757597340935e-05, + "loss": 0.0258, + "step": 4590 + }, + { + "epoch": 0.034002542798852786, + "grad_norm": 0.10052376240491867, + "learning_rate": 4.847904795821463e-05, + "loss": 0.0219, + "step": 4600 + }, + { + "epoch": 0.03407646137015464, + "grad_norm": 0.07553847879171371, + "learning_rate": 4.847533831908832e-05, + "loss": 0.0243, + "step": 4610 + }, + { + "epoch": 0.03415037994145649, + "grad_norm": 0.09385402500629425, + "learning_rate": 4.8471628679962016e-05, + "loss": 0.023, + "step": 4620 + }, + { + "epoch": 0.03422429851275834, + "grad_norm": 0.15772198140621185, + "learning_rate": 4.8467919040835705e-05, + "loss": 0.0214, + "step": 4630 + }, + { + "epoch": 0.0342982170840602, + "grad_norm": 0.10387273132801056, + "learning_rate": 4.84642094017094e-05, + "loss": 0.0256, + "step": 4640 + }, + { + "epoch": 0.034372135655362054, + "grad_norm": 0.13165810704231262, + "learning_rate": 4.8460499762583104e-05, + "loss": 0.028, + "step": 
4650 + }, + { + "epoch": 0.03444605422666391, + "grad_norm": 0.09483859688043594, + "learning_rate": 4.845679012345679e-05, + "loss": 0.0256, + "step": 4660 + }, + { + "epoch": 0.03451997279796576, + "grad_norm": 0.11623439192771912, + "learning_rate": 4.845308048433049e-05, + "loss": 0.0261, + "step": 4670 + }, + { + "epoch": 0.03459389136926762, + "grad_norm": 0.139932781457901, + "learning_rate": 4.844937084520418e-05, + "loss": 0.0229, + "step": 4680 + }, + { + "epoch": 0.03466780994056947, + "grad_norm": 0.12717574834823608, + "learning_rate": 4.8445661206077874e-05, + "loss": 0.0239, + "step": 4690 + }, + { + "epoch": 0.03474172851187132, + "grad_norm": 0.1345638930797577, + "learning_rate": 4.844195156695157e-05, + "loss": 0.0224, + "step": 4700 + }, + { + "epoch": 0.034815647083173175, + "grad_norm": 0.10481065511703491, + "learning_rate": 4.843824192782526e-05, + "loss": 0.0262, + "step": 4710 + }, + { + "epoch": 0.03488956565447503, + "grad_norm": 0.09500519186258316, + "learning_rate": 4.8434532288698955e-05, + "loss": 0.0211, + "step": 4720 + }, + { + "epoch": 0.034963484225776886, + "grad_norm": 0.13622979819774628, + "learning_rate": 4.843082264957265e-05, + "loss": 0.025, + "step": 4730 + }, + { + "epoch": 0.03503740279707874, + "grad_norm": 0.10376254469156265, + "learning_rate": 4.842711301044635e-05, + "loss": 0.021, + "step": 4740 + }, + { + "epoch": 0.03511132136838059, + "grad_norm": 0.095113605260849, + "learning_rate": 4.842340337132004e-05, + "loss": 0.0229, + "step": 4750 + }, + { + "epoch": 0.03518523993968244, + "grad_norm": 0.10841836780309677, + "learning_rate": 4.841969373219373e-05, + "loss": 0.0235, + "step": 4760 + }, + { + "epoch": 0.0352591585109843, + "grad_norm": 0.10691962391138077, + "learning_rate": 4.841598409306743e-05, + "loss": 0.0234, + "step": 4770 + }, + { + "epoch": 0.035333077082286155, + "grad_norm": 0.10558052361011505, + "learning_rate": 4.8412274453941124e-05, + "loss": 0.0201, + "step": 4780 + }, + { + "epoch": 0.03540699565358801, + "grad_norm": 0.10873495787382126, + "learning_rate": 4.840856481481481e-05, + "loss": 0.0231, + "step": 4790 + }, + { + "epoch": 0.03548091422488986, + "grad_norm": 0.11643857508897781, + "learning_rate": 4.8404855175688516e-05, + "loss": 0.0222, + "step": 4800 + }, + { + "epoch": 0.03555483279619172, + "grad_norm": 0.10683627426624298, + "learning_rate": 4.8401145536562205e-05, + "loss": 0.0224, + "step": 4810 + }, + { + "epoch": 0.03562875136749357, + "grad_norm": 0.11844533681869507, + "learning_rate": 4.83974358974359e-05, + "loss": 0.0197, + "step": 4820 + }, + { + "epoch": 0.03570266993879542, + "grad_norm": 0.09665479511022568, + "learning_rate": 4.83937262583096e-05, + "loss": 0.0222, + "step": 4830 + }, + { + "epoch": 0.035776588510097275, + "grad_norm": 0.1258080154657364, + "learning_rate": 4.8390016619183286e-05, + "loss": 0.022, + "step": 4840 + }, + { + "epoch": 0.035850507081399134, + "grad_norm": 0.06919804215431213, + "learning_rate": 4.838630698005698e-05, + "loss": 0.0218, + "step": 4850 + }, + { + "epoch": 0.035924425652700986, + "grad_norm": 0.09917058795690536, + "learning_rate": 4.838259734093067e-05, + "loss": 0.0227, + "step": 4860 + }, + { + "epoch": 0.03599834422400284, + "grad_norm": 0.1694340854883194, + "learning_rate": 4.837888770180437e-05, + "loss": 0.0235, + "step": 4870 + }, + { + "epoch": 0.03607226279530469, + "grad_norm": 0.10197817534208298, + "learning_rate": 4.837517806267807e-05, + "loss": 0.0262, + "step": 4880 + }, + { + "epoch": 0.03614618136660654, + 
"grad_norm": 0.10047579556703568, + "learning_rate": 4.837146842355176e-05, + "loss": 0.0255, + "step": 4890 + }, + { + "epoch": 0.0362200999379084, + "grad_norm": 0.07849813252687454, + "learning_rate": 4.8367758784425455e-05, + "loss": 0.0209, + "step": 4900 + }, + { + "epoch": 0.036294018509210255, + "grad_norm": 0.10035062581300735, + "learning_rate": 4.8364049145299145e-05, + "loss": 0.0248, + "step": 4910 + }, + { + "epoch": 0.03636793708051211, + "grad_norm": 0.12051332741975784, + "learning_rate": 4.836033950617284e-05, + "loss": 0.0236, + "step": 4920 + }, + { + "epoch": 0.03644185565181396, + "grad_norm": 0.09612835943698883, + "learning_rate": 4.8356629867046536e-05, + "loss": 0.0218, + "step": 4930 + }, + { + "epoch": 0.03651577422311582, + "grad_norm": 0.11352284252643585, + "learning_rate": 4.8352920227920226e-05, + "loss": 0.0251, + "step": 4940 + }, + { + "epoch": 0.03658969279441767, + "grad_norm": 0.11685504764318466, + "learning_rate": 4.834921058879393e-05, + "loss": 0.0246, + "step": 4950 + }, + { + "epoch": 0.03666361136571952, + "grad_norm": 0.09964120388031006, + "learning_rate": 4.834550094966762e-05, + "loss": 0.023, + "step": 4960 + }, + { + "epoch": 0.036737529937021375, + "grad_norm": 0.09999851137399673, + "learning_rate": 4.8341791310541314e-05, + "loss": 0.0224, + "step": 4970 + }, + { + "epoch": 0.036811448508323234, + "grad_norm": 0.10812164843082428, + "learning_rate": 4.833808167141501e-05, + "loss": 0.0246, + "step": 4980 + }, + { + "epoch": 0.03688536707962509, + "grad_norm": 0.10093782842159271, + "learning_rate": 4.83343720322887e-05, + "loss": 0.0238, + "step": 4990 + }, + { + "epoch": 0.03695928565092694, + "grad_norm": 0.08897837996482849, + "learning_rate": 4.8330662393162395e-05, + "loss": 0.0212, + "step": 5000 + }, + { + "epoch": 0.03703320422222879, + "grad_norm": 0.08802120387554169, + "learning_rate": 4.832695275403609e-05, + "loss": 0.0261, + "step": 5010 + }, + { + "epoch": 0.03710712279353064, + "grad_norm": 0.1296195685863495, + "learning_rate": 4.832324311490978e-05, + "loss": 0.0243, + "step": 5020 + }, + { + "epoch": 0.0371810413648325, + "grad_norm": 0.09052955359220505, + "learning_rate": 4.831953347578348e-05, + "loss": 0.0217, + "step": 5030 + }, + { + "epoch": 0.037254959936134355, + "grad_norm": 0.11013671010732651, + "learning_rate": 4.831582383665717e-05, + "loss": 0.0262, + "step": 5040 + }, + { + "epoch": 0.03732887850743621, + "grad_norm": 0.08840049803256989, + "learning_rate": 4.831211419753087e-05, + "loss": 0.0245, + "step": 5050 + }, + { + "epoch": 0.03740279707873806, + "grad_norm": 0.14922712743282318, + "learning_rate": 4.8308404558404564e-05, + "loss": 0.023, + "step": 5060 + }, + { + "epoch": 0.03747671565003992, + "grad_norm": 0.13218800723552704, + "learning_rate": 4.830469491927825e-05, + "loss": 0.0223, + "step": 5070 + }, + { + "epoch": 0.03755063422134177, + "grad_norm": 0.15943408012390137, + "learning_rate": 4.830098528015195e-05, + "loss": 0.0229, + "step": 5080 + }, + { + "epoch": 0.03762455279264362, + "grad_norm": 0.10490970313549042, + "learning_rate": 4.829727564102564e-05, + "loss": 0.0244, + "step": 5090 + }, + { + "epoch": 0.037698471363945475, + "grad_norm": 0.11059322208166122, + "learning_rate": 4.8293566001899334e-05, + "loss": 0.0231, + "step": 5100 + }, + { + "epoch": 0.037772389935247334, + "grad_norm": 0.13888812065124512, + "learning_rate": 4.828985636277304e-05, + "loss": 0.0216, + "step": 5110 + }, + { + "epoch": 0.03784630850654919, + "grad_norm": 0.10815978795289993, + 
"learning_rate": 4.8286146723646726e-05, + "loss": 0.0236, + "step": 5120 + }, + { + "epoch": 0.03792022707785104, + "grad_norm": 0.11108426004648209, + "learning_rate": 4.828243708452042e-05, + "loss": 0.0229, + "step": 5130 + }, + { + "epoch": 0.03799414564915289, + "grad_norm": 0.11670207977294922, + "learning_rate": 4.827872744539411e-05, + "loss": 0.0227, + "step": 5140 + }, + { + "epoch": 0.03806806422045475, + "grad_norm": 0.08577834069728851, + "learning_rate": 4.827501780626781e-05, + "loss": 0.0228, + "step": 5150 + }, + { + "epoch": 0.0381419827917566, + "grad_norm": 0.11961571127176285, + "learning_rate": 4.82713081671415e-05, + "loss": 0.0244, + "step": 5160 + }, + { + "epoch": 0.038215901363058455, + "grad_norm": 0.11697886884212494, + "learning_rate": 4.826759852801519e-05, + "loss": 0.021, + "step": 5170 + }, + { + "epoch": 0.03828981993436031, + "grad_norm": 0.08601241558790207, + "learning_rate": 4.8263888888888895e-05, + "loss": 0.0263, + "step": 5180 + }, + { + "epoch": 0.03836373850566216, + "grad_norm": 0.1259474903345108, + "learning_rate": 4.8260179249762584e-05, + "loss": 0.0215, + "step": 5190 + }, + { + "epoch": 0.03843765707696402, + "grad_norm": 0.11750676482915878, + "learning_rate": 4.825646961063628e-05, + "loss": 0.0219, + "step": 5200 + }, + { + "epoch": 0.03851157564826587, + "grad_norm": 0.10238835960626602, + "learning_rate": 4.8252759971509976e-05, + "loss": 0.0214, + "step": 5210 + }, + { + "epoch": 0.03858549421956772, + "grad_norm": 0.2656376361846924, + "learning_rate": 4.8249050332383665e-05, + "loss": 0.0244, + "step": 5220 + }, + { + "epoch": 0.038659412790869575, + "grad_norm": 0.10988765954971313, + "learning_rate": 4.824534069325736e-05, + "loss": 0.023, + "step": 5230 + }, + { + "epoch": 0.038733331362171435, + "grad_norm": 0.13231900334358215, + "learning_rate": 4.824163105413106e-05, + "loss": 0.0249, + "step": 5240 + }, + { + "epoch": 0.03880724993347329, + "grad_norm": 0.10795532912015915, + "learning_rate": 4.8237921415004746e-05, + "loss": 0.022, + "step": 5250 + }, + { + "epoch": 0.03888116850477514, + "grad_norm": 0.12678179144859314, + "learning_rate": 4.823421177587845e-05, + "loss": 0.0275, + "step": 5260 + }, + { + "epoch": 0.03895508707607699, + "grad_norm": 0.10373057425022125, + "learning_rate": 4.823050213675214e-05, + "loss": 0.0235, + "step": 5270 + }, + { + "epoch": 0.03902900564737885, + "grad_norm": 0.13203772902488708, + "learning_rate": 4.8226792497625834e-05, + "loss": 0.0254, + "step": 5280 + }, + { + "epoch": 0.0391029242186807, + "grad_norm": 0.09419368207454681, + "learning_rate": 4.822308285849953e-05, + "loss": 0.0209, + "step": 5290 + }, + { + "epoch": 0.039176842789982555, + "grad_norm": 0.12208737432956696, + "learning_rate": 4.821937321937322e-05, + "loss": 0.0225, + "step": 5300 + }, + { + "epoch": 0.03925076136128441, + "grad_norm": 0.11247088760137558, + "learning_rate": 4.8215663580246915e-05, + "loss": 0.0255, + "step": 5310 + }, + { + "epoch": 0.039324679932586266, + "grad_norm": 0.09036470949649811, + "learning_rate": 4.8211953941120605e-05, + "loss": 0.0246, + "step": 5320 + }, + { + "epoch": 0.03939859850388812, + "grad_norm": 0.08673543483018875, + "learning_rate": 4.820824430199431e-05, + "loss": 0.02, + "step": 5330 + }, + { + "epoch": 0.03947251707518997, + "grad_norm": 0.10785869508981705, + "learning_rate": 4.8204534662868e-05, + "loss": 0.0232, + "step": 5340 + }, + { + "epoch": 0.03954643564649182, + "grad_norm": 0.15774936974048615, + "learning_rate": 4.820082502374169e-05, + "loss": 0.02, 
+ "step": 5350 + }, + { + "epoch": 0.039620354217793675, + "grad_norm": 0.09657442569732666, + "learning_rate": 4.819711538461539e-05, + "loss": 0.0222, + "step": 5360 + }, + { + "epoch": 0.039694272789095535, + "grad_norm": 0.08638004213571548, + "learning_rate": 4.819340574548908e-05, + "loss": 0.0202, + "step": 5370 + }, + { + "epoch": 0.03976819136039739, + "grad_norm": 0.07568100094795227, + "learning_rate": 4.8189696106362774e-05, + "loss": 0.0221, + "step": 5380 + }, + { + "epoch": 0.03984210993169924, + "grad_norm": 0.10558179020881653, + "learning_rate": 4.818598646723647e-05, + "loss": 0.0239, + "step": 5390 + }, + { + "epoch": 0.03991602850300109, + "grad_norm": 0.15386056900024414, + "learning_rate": 4.818227682811016e-05, + "loss": 0.0214, + "step": 5400 + }, + { + "epoch": 0.03998994707430295, + "grad_norm": 0.1022673174738884, + "learning_rate": 4.817856718898386e-05, + "loss": 0.0231, + "step": 5410 + }, + { + "epoch": 0.0400638656456048, + "grad_norm": 0.10176906734704971, + "learning_rate": 4.817485754985755e-05, + "loss": 0.023, + "step": 5420 + }, + { + "epoch": 0.040137784216906655, + "grad_norm": 0.11575393378734589, + "learning_rate": 4.817114791073125e-05, + "loss": 0.023, + "step": 5430 + }, + { + "epoch": 0.04021170278820851, + "grad_norm": 0.11693524569272995, + "learning_rate": 4.816743827160494e-05, + "loss": 0.024, + "step": 5440 + }, + { + "epoch": 0.04028562135951037, + "grad_norm": 0.10131990909576416, + "learning_rate": 4.816372863247863e-05, + "loss": 0.0211, + "step": 5450 + }, + { + "epoch": 0.04035953993081222, + "grad_norm": 0.1304934024810791, + "learning_rate": 4.816001899335233e-05, + "loss": 0.0225, + "step": 5460 + }, + { + "epoch": 0.04043345850211407, + "grad_norm": 0.12911392748355865, + "learning_rate": 4.8156309354226024e-05, + "loss": 0.0198, + "step": 5470 + }, + { + "epoch": 0.04050737707341592, + "grad_norm": 0.07756907492876053, + "learning_rate": 4.815259971509972e-05, + "loss": 0.0186, + "step": 5480 + }, + { + "epoch": 0.040581295644717776, + "grad_norm": 0.09241756796836853, + "learning_rate": 4.8148890075973416e-05, + "loss": 0.0208, + "step": 5490 + }, + { + "epoch": 0.040655214216019635, + "grad_norm": 0.09917902946472168, + "learning_rate": 4.8145180436847105e-05, + "loss": 0.024, + "step": 5500 + }, + { + "epoch": 0.04072913278732149, + "grad_norm": 0.11154290288686752, + "learning_rate": 4.81414707977208e-05, + "loss": 0.0222, + "step": 5510 + }, + { + "epoch": 0.04080305135862334, + "grad_norm": 0.08803219348192215, + "learning_rate": 4.81377611585945e-05, + "loss": 0.0219, + "step": 5520 + }, + { + "epoch": 0.04087696992992519, + "grad_norm": 0.112600177526474, + "learning_rate": 4.8134051519468186e-05, + "loss": 0.0228, + "step": 5530 + }, + { + "epoch": 0.04095088850122705, + "grad_norm": 0.08610142767429352, + "learning_rate": 4.813034188034188e-05, + "loss": 0.0211, + "step": 5540 + }, + { + "epoch": 0.0410248070725289, + "grad_norm": 0.10519230365753174, + "learning_rate": 4.812663224121557e-05, + "loss": 0.0226, + "step": 5550 + }, + { + "epoch": 0.041098725643830755, + "grad_norm": 0.10147269815206528, + "learning_rate": 4.8122922602089274e-05, + "loss": 0.0201, + "step": 5560 + }, + { + "epoch": 0.04117264421513261, + "grad_norm": 0.115921251475811, + "learning_rate": 4.811921296296297e-05, + "loss": 0.0251, + "step": 5570 + }, + { + "epoch": 0.04124656278643447, + "grad_norm": 0.09626685827970505, + "learning_rate": 4.811550332383666e-05, + "loss": 0.0206, + "step": 5580 + }, + { + "epoch": 0.04132048135773632, + 
"grad_norm": 0.08677060902118683, + "learning_rate": 4.8111793684710355e-05, + "loss": 0.0212, + "step": 5590 + }, + { + "epoch": 0.04139439992903817, + "grad_norm": 0.0958794429898262, + "learning_rate": 4.8108084045584044e-05, + "loss": 0.0224, + "step": 5600 + }, + { + "epoch": 0.04146831850034002, + "grad_norm": 0.11729035526514053, + "learning_rate": 4.810437440645774e-05, + "loss": 0.0211, + "step": 5610 + }, + { + "epoch": 0.04154223707164188, + "grad_norm": 0.10277558118104935, + "learning_rate": 4.8100664767331436e-05, + "loss": 0.0232, + "step": 5620 + }, + { + "epoch": 0.041616155642943735, + "grad_norm": 0.13820619881153107, + "learning_rate": 4.809695512820513e-05, + "loss": 0.0231, + "step": 5630 + }, + { + "epoch": 0.04169007421424559, + "grad_norm": 0.11183051764965057, + "learning_rate": 4.809324548907883e-05, + "loss": 0.0243, + "step": 5640 + }, + { + "epoch": 0.04176399278554744, + "grad_norm": 0.10406447947025299, + "learning_rate": 4.808953584995252e-05, + "loss": 0.0211, + "step": 5650 + }, + { + "epoch": 0.04183791135684929, + "grad_norm": 0.08350703120231628, + "learning_rate": 4.808582621082621e-05, + "loss": 0.0203, + "step": 5660 + }, + { + "epoch": 0.04191182992815115, + "grad_norm": 0.11309721320867538, + "learning_rate": 4.808211657169991e-05, + "loss": 0.0222, + "step": 5670 + }, + { + "epoch": 0.041985748499453, + "grad_norm": 0.08373697847127914, + "learning_rate": 4.80784069325736e-05, + "loss": 0.0199, + "step": 5680 + }, + { + "epoch": 0.042059667070754855, + "grad_norm": 0.09311481565237045, + "learning_rate": 4.8074697293447294e-05, + "loss": 0.0247, + "step": 5690 + }, + { + "epoch": 0.04213358564205671, + "grad_norm": 0.08883152902126312, + "learning_rate": 4.807098765432099e-05, + "loss": 0.025, + "step": 5700 + }, + { + "epoch": 0.04220750421335857, + "grad_norm": 0.14571034908294678, + "learning_rate": 4.8067278015194686e-05, + "loss": 0.0229, + "step": 5710 + }, + { + "epoch": 0.04228142278466042, + "grad_norm": 0.09702489525079727, + "learning_rate": 4.806356837606838e-05, + "loss": 0.0221, + "step": 5720 + }, + { + "epoch": 0.04235534135596227, + "grad_norm": 0.13664387166500092, + "learning_rate": 4.805985873694207e-05, + "loss": 0.0237, + "step": 5730 + }, + { + "epoch": 0.042429259927264124, + "grad_norm": 0.09496665000915527, + "learning_rate": 4.805614909781577e-05, + "loss": 0.0206, + "step": 5740 + }, + { + "epoch": 0.04250317849856598, + "grad_norm": 0.1166987493634224, + "learning_rate": 4.8052439458689463e-05, + "loss": 0.021, + "step": 5750 + }, + { + "epoch": 0.042577097069867835, + "grad_norm": 0.11197572201490402, + "learning_rate": 4.804872981956315e-05, + "loss": 0.0223, + "step": 5760 + }, + { + "epoch": 0.04265101564116969, + "grad_norm": 0.10108604282140732, + "learning_rate": 4.804502018043685e-05, + "loss": 0.0228, + "step": 5770 + }, + { + "epoch": 0.04272493421247154, + "grad_norm": 0.09897824376821518, + "learning_rate": 4.8041310541310545e-05, + "loss": 0.0219, + "step": 5780 + }, + { + "epoch": 0.0427988527837734, + "grad_norm": 0.10928849875926971, + "learning_rate": 4.803760090218424e-05, + "loss": 0.0227, + "step": 5790 + }, + { + "epoch": 0.04287277135507525, + "grad_norm": 0.09280960261821747, + "learning_rate": 4.8033891263057937e-05, + "loss": 0.0214, + "step": 5800 + }, + { + "epoch": 0.0429466899263771, + "grad_norm": 0.11366759985685349, + "learning_rate": 4.8030181623931626e-05, + "loss": 0.0215, + "step": 5810 + }, + { + "epoch": 0.043020608497678955, + "grad_norm": 0.10362562537193298, + "learning_rate": 
4.802647198480532e-05, + "loss": 0.0227, + "step": 5820 + }, + { + "epoch": 0.04309452706898081, + "grad_norm": 0.08938121795654297, + "learning_rate": 4.802276234567901e-05, + "loss": 0.0222, + "step": 5830 + }, + { + "epoch": 0.04316844564028267, + "grad_norm": 0.08877433091402054, + "learning_rate": 4.801905270655271e-05, + "loss": 0.0228, + "step": 5840 + }, + { + "epoch": 0.04324236421158452, + "grad_norm": 0.12970751523971558, + "learning_rate": 4.80153430674264e-05, + "loss": 0.0204, + "step": 5850 + }, + { + "epoch": 0.04331628278288637, + "grad_norm": 0.0647781565785408, + "learning_rate": 4.80116334283001e-05, + "loss": 0.0205, + "step": 5860 + }, + { + "epoch": 0.043390201354188224, + "grad_norm": 0.09601494669914246, + "learning_rate": 4.8007923789173795e-05, + "loss": 0.0234, + "step": 5870 + }, + { + "epoch": 0.04346411992549008, + "grad_norm": 0.11197741329669952, + "learning_rate": 4.800421415004749e-05, + "loss": 0.0228, + "step": 5880 + }, + { + "epoch": 0.043538038496791935, + "grad_norm": 0.12032473832368851, + "learning_rate": 4.800050451092118e-05, + "loss": 0.0208, + "step": 5890 + }, + { + "epoch": 0.04361195706809379, + "grad_norm": 0.09236224740743637, + "learning_rate": 4.7996794871794876e-05, + "loss": 0.0232, + "step": 5900 + }, + { + "epoch": 0.04368587563939564, + "grad_norm": 0.12311521917581558, + "learning_rate": 4.7993085232668565e-05, + "loss": 0.0208, + "step": 5910 + }, + { + "epoch": 0.0437597942106975, + "grad_norm": 0.09777011722326279, + "learning_rate": 4.798937559354226e-05, + "loss": 0.0233, + "step": 5920 + }, + { + "epoch": 0.04383371278199935, + "grad_norm": 0.09315415471792221, + "learning_rate": 4.798566595441596e-05, + "loss": 0.0215, + "step": 5930 + }, + { + "epoch": 0.0439076313533012, + "grad_norm": 0.11994203925132751, + "learning_rate": 4.798195631528965e-05, + "loss": 0.0238, + "step": 5940 + }, + { + "epoch": 0.043981549924603056, + "grad_norm": 0.10405700653791428, + "learning_rate": 4.797824667616335e-05, + "loss": 0.0251, + "step": 5950 + }, + { + "epoch": 0.04405546849590491, + "grad_norm": 0.10718826204538345, + "learning_rate": 4.797453703703704e-05, + "loss": 0.0228, + "step": 5960 + }, + { + "epoch": 0.04412938706720677, + "grad_norm": 0.15280954539775848, + "learning_rate": 4.7970827397910734e-05, + "loss": 0.0245, + "step": 5970 + }, + { + "epoch": 0.04420330563850862, + "grad_norm": 0.10383470356464386, + "learning_rate": 4.796711775878443e-05, + "loss": 0.026, + "step": 5980 + }, + { + "epoch": 0.04427722420981047, + "grad_norm": 0.0948052927851677, + "learning_rate": 4.796340811965812e-05, + "loss": 0.0219, + "step": 5990 + }, + { + "epoch": 0.044351142781112324, + "grad_norm": 0.10084927082061768, + "learning_rate": 4.7959698480531815e-05, + "loss": 0.0241, + "step": 6000 + }, + { + "epoch": 0.04442506135241418, + "grad_norm": 0.10056540369987488, + "learning_rate": 4.795598884140551e-05, + "loss": 0.0234, + "step": 6010 + }, + { + "epoch": 0.044498979923716035, + "grad_norm": 0.0987926572561264, + "learning_rate": 4.795227920227921e-05, + "loss": 0.0214, + "step": 6020 + }, + { + "epoch": 0.04457289849501789, + "grad_norm": 0.10318058729171753, + "learning_rate": 4.79485695631529e-05, + "loss": 0.024, + "step": 6030 + }, + { + "epoch": 0.04464681706631974, + "grad_norm": 0.08674127608537674, + "learning_rate": 4.794485992402659e-05, + "loss": 0.0257, + "step": 6040 + }, + { + "epoch": 0.0447207356376216, + "grad_norm": 0.09985113888978958, + "learning_rate": 4.794115028490029e-05, + "loss": 0.0252, + "step": 6050 + 
}, + { + "epoch": 0.04479465420892345, + "grad_norm": 0.08422184735536575, + "learning_rate": 4.793744064577398e-05, + "loss": 0.0224, + "step": 6060 + }, + { + "epoch": 0.0448685727802253, + "grad_norm": 0.10255678743124008, + "learning_rate": 4.7933731006647673e-05, + "loss": 0.0211, + "step": 6070 + }, + { + "epoch": 0.044942491351527156, + "grad_norm": 0.0996970385313034, + "learning_rate": 4.793002136752137e-05, + "loss": 0.0222, + "step": 6080 + }, + { + "epoch": 0.045016409922829015, + "grad_norm": 0.1050659790635109, + "learning_rate": 4.7926311728395065e-05, + "loss": 0.0232, + "step": 6090 + }, + { + "epoch": 0.04509032849413087, + "grad_norm": 0.11204075068235397, + "learning_rate": 4.792260208926876e-05, + "loss": 0.0254, + "step": 6100 + }, + { + "epoch": 0.04516424706543272, + "grad_norm": 0.09773686528205872, + "learning_rate": 4.791889245014246e-05, + "loss": 0.0235, + "step": 6110 + }, + { + "epoch": 0.04523816563673457, + "grad_norm": 0.09989985823631287, + "learning_rate": 4.7915182811016147e-05, + "loss": 0.0216, + "step": 6120 + }, + { + "epoch": 0.045312084208036424, + "grad_norm": 0.10974659770727158, + "learning_rate": 4.791147317188984e-05, + "loss": 0.0216, + "step": 6130 + }, + { + "epoch": 0.04538600277933828, + "grad_norm": 0.11773477494716644, + "learning_rate": 4.790776353276353e-05, + "loss": 0.0232, + "step": 6140 + }, + { + "epoch": 0.045459921350640135, + "grad_norm": 0.09761492162942886, + "learning_rate": 4.790405389363723e-05, + "loss": 0.0232, + "step": 6150 + }, + { + "epoch": 0.04553383992194199, + "grad_norm": 0.10829565674066544, + "learning_rate": 4.7900344254510924e-05, + "loss": 0.0234, + "step": 6160 + }, + { + "epoch": 0.04560775849324384, + "grad_norm": 0.12356780469417572, + "learning_rate": 4.789663461538462e-05, + "loss": 0.0199, + "step": 6170 + }, + { + "epoch": 0.0456816770645457, + "grad_norm": 0.09067923575639725, + "learning_rate": 4.7892924976258316e-05, + "loss": 0.0237, + "step": 6180 + }, + { + "epoch": 0.04575559563584755, + "grad_norm": 0.09042474627494812, + "learning_rate": 4.7889215337132005e-05, + "loss": 0.0228, + "step": 6190 + }, + { + "epoch": 0.045829514207149404, + "grad_norm": 0.08567062765359879, + "learning_rate": 4.78855056980057e-05, + "loss": 0.0229, + "step": 6200 + }, + { + "epoch": 0.045903432778451256, + "grad_norm": 0.10306154936552048, + "learning_rate": 4.78817960588794e-05, + "loss": 0.0201, + "step": 6210 + }, + { + "epoch": 0.045977351349753115, + "grad_norm": 0.1060139611363411, + "learning_rate": 4.7878086419753086e-05, + "loss": 0.0216, + "step": 6220 + }, + { + "epoch": 0.04605126992105497, + "grad_norm": 0.09518091380596161, + "learning_rate": 4.787437678062678e-05, + "loss": 0.0243, + "step": 6230 + }, + { + "epoch": 0.04612518849235682, + "grad_norm": 0.09898778051137924, + "learning_rate": 4.787066714150048e-05, + "loss": 0.0225, + "step": 6240 + }, + { + "epoch": 0.04619910706365867, + "grad_norm": 0.08496725559234619, + "learning_rate": 4.7866957502374174e-05, + "loss": 0.0225, + "step": 6250 + }, + { + "epoch": 0.04627302563496053, + "grad_norm": 0.0848846286535263, + "learning_rate": 4.786324786324787e-05, + "loss": 0.0241, + "step": 6260 + }, + { + "epoch": 0.04634694420626238, + "grad_norm": 0.1118076741695404, + "learning_rate": 4.785953822412156e-05, + "loss": 0.0214, + "step": 6270 + }, + { + "epoch": 0.046420862777564235, + "grad_norm": 0.08831203728914261, + "learning_rate": 4.7855828584995255e-05, + "loss": 0.0201, + "step": 6280 + }, + { + "epoch": 0.04649478134886609, + 
"grad_norm": 0.1217513382434845, + "learning_rate": 4.7852118945868944e-05, + "loss": 0.025, + "step": 6290 + }, + { + "epoch": 0.04656869992016794, + "grad_norm": 0.13399529457092285, + "learning_rate": 4.784840930674264e-05, + "loss": 0.0249, + "step": 6300 + }, + { + "epoch": 0.0466426184914698, + "grad_norm": 0.10110674798488617, + "learning_rate": 4.7844699667616336e-05, + "loss": 0.0214, + "step": 6310 + }, + { + "epoch": 0.04671653706277165, + "grad_norm": 0.12537550926208496, + "learning_rate": 4.784099002849003e-05, + "loss": 0.0233, + "step": 6320 + }, + { + "epoch": 0.046790455634073504, + "grad_norm": 0.08597607165575027, + "learning_rate": 4.783728038936373e-05, + "loss": 0.0225, + "step": 6330 + }, + { + "epoch": 0.046864374205375356, + "grad_norm": 0.12496454268693924, + "learning_rate": 4.7833570750237424e-05, + "loss": 0.0233, + "step": 6340 + }, + { + "epoch": 0.046938292776677215, + "grad_norm": 0.11686607450246811, + "learning_rate": 4.782986111111111e-05, + "loss": 0.0212, + "step": 6350 + }, + { + "epoch": 0.04701221134797907, + "grad_norm": 0.09545844793319702, + "learning_rate": 4.782615147198481e-05, + "loss": 0.0223, + "step": 6360 + }, + { + "epoch": 0.04708612991928092, + "grad_norm": 0.12743836641311646, + "learning_rate": 4.78224418328585e-05, + "loss": 0.0212, + "step": 6370 + }, + { + "epoch": 0.04716004849058277, + "grad_norm": 0.10326215624809265, + "learning_rate": 4.7818732193732194e-05, + "loss": 0.0216, + "step": 6380 + }, + { + "epoch": 0.04723396706188463, + "grad_norm": 0.1223498284816742, + "learning_rate": 4.781502255460589e-05, + "loss": 0.0279, + "step": 6390 + }, + { + "epoch": 0.04730788563318648, + "grad_norm": 0.08332131803035736, + "learning_rate": 4.7811312915479586e-05, + "loss": 0.0213, + "step": 6400 + }, + { + "epoch": 0.047381804204488336, + "grad_norm": 0.10657671838998795, + "learning_rate": 4.780760327635328e-05, + "loss": 0.0222, + "step": 6410 + }, + { + "epoch": 0.04745572277579019, + "grad_norm": 0.0815543606877327, + "learning_rate": 4.780389363722697e-05, + "loss": 0.025, + "step": 6420 + }, + { + "epoch": 0.04752964134709204, + "grad_norm": 0.1204519271850586, + "learning_rate": 4.780018399810067e-05, + "loss": 0.0204, + "step": 6430 + }, + { + "epoch": 0.0476035599183939, + "grad_norm": 0.12509311735630035, + "learning_rate": 4.779647435897436e-05, + "loss": 0.02, + "step": 6440 + }, + { + "epoch": 0.04767747848969575, + "grad_norm": 0.1401158720254898, + "learning_rate": 4.779276471984805e-05, + "loss": 0.0234, + "step": 6450 + }, + { + "epoch": 0.047751397060997604, + "grad_norm": 0.08719848096370697, + "learning_rate": 4.778905508072175e-05, + "loss": 0.0211, + "step": 6460 + }, + { + "epoch": 0.047825315632299456, + "grad_norm": 0.08922212570905685, + "learning_rate": 4.7785345441595444e-05, + "loss": 0.0246, + "step": 6470 + }, + { + "epoch": 0.047899234203601315, + "grad_norm": 0.14943371713161469, + "learning_rate": 4.778163580246914e-05, + "loss": 0.0233, + "step": 6480 + }, + { + "epoch": 0.04797315277490317, + "grad_norm": 0.09004784375429153, + "learning_rate": 4.7777926163342836e-05, + "loss": 0.0209, + "step": 6490 + }, + { + "epoch": 0.04804707134620502, + "grad_norm": 0.10191723704338074, + "learning_rate": 4.7774216524216525e-05, + "loss": 0.0241, + "step": 6500 + }, + { + "epoch": 0.04812098991750687, + "grad_norm": 0.08505424857139587, + "learning_rate": 4.777050688509022e-05, + "loss": 0.0216, + "step": 6510 + }, + { + "epoch": 0.04819490848880873, + "grad_norm": 0.09687618911266327, + "learning_rate": 
4.776679724596391e-05, + "loss": 0.024, + "step": 6520 + }, + { + "epoch": 0.04826882706011058, + "grad_norm": 0.067629374563694, + "learning_rate": 4.7763087606837607e-05, + "loss": 0.0227, + "step": 6530 + }, + { + "epoch": 0.048342745631412436, + "grad_norm": 0.09508171677589417, + "learning_rate": 4.77593779677113e-05, + "loss": 0.0223, + "step": 6540 + }, + { + "epoch": 0.04841666420271429, + "grad_norm": 0.09708387404680252, + "learning_rate": 4.7755668328585e-05, + "loss": 0.0205, + "step": 6550 + }, + { + "epoch": 0.04849058277401615, + "grad_norm": 0.11333795636892319, + "learning_rate": 4.7751958689458695e-05, + "loss": 0.0216, + "step": 6560 + }, + { + "epoch": 0.048564501345318, + "grad_norm": 0.0891813114285469, + "learning_rate": 4.774824905033239e-05, + "loss": 0.0213, + "step": 6570 + }, + { + "epoch": 0.04863841991661985, + "grad_norm": 0.09201674908399582, + "learning_rate": 4.774453941120608e-05, + "loss": 0.0238, + "step": 6580 + }, + { + "epoch": 0.048712338487921704, + "grad_norm": 0.10166573524475098, + "learning_rate": 4.7740829772079776e-05, + "loss": 0.0241, + "step": 6590 + }, + { + "epoch": 0.048786257059223556, + "grad_norm": 0.08590281754732132, + "learning_rate": 4.7737120132953465e-05, + "loss": 0.0232, + "step": 6600 + }, + { + "epoch": 0.048860175630525415, + "grad_norm": 0.0673174038529396, + "learning_rate": 4.773341049382716e-05, + "loss": 0.021, + "step": 6610 + }, + { + "epoch": 0.04893409420182727, + "grad_norm": 0.09542142599821091, + "learning_rate": 4.772970085470086e-05, + "loss": 0.0212, + "step": 6620 + }, + { + "epoch": 0.04900801277312912, + "grad_norm": 0.12368637323379517, + "learning_rate": 4.772599121557455e-05, + "loss": 0.0235, + "step": 6630 + }, + { + "epoch": 0.04908193134443097, + "grad_norm": 0.10154004395008087, + "learning_rate": 4.772228157644825e-05, + "loss": 0.0215, + "step": 6640 + }, + { + "epoch": 0.04915584991573283, + "grad_norm": 0.09259312599897385, + "learning_rate": 4.771857193732194e-05, + "loss": 0.0206, + "step": 6650 + }, + { + "epoch": 0.049229768487034684, + "grad_norm": 0.07898467779159546, + "learning_rate": 4.7714862298195634e-05, + "loss": 0.0188, + "step": 6660 + }, + { + "epoch": 0.049303687058336536, + "grad_norm": 0.12356526404619217, + "learning_rate": 4.771115265906933e-05, + "loss": 0.0217, + "step": 6670 + }, + { + "epoch": 0.04937760562963839, + "grad_norm": 0.14405713975429535, + "learning_rate": 4.770744301994302e-05, + "loss": 0.0208, + "step": 6680 + }, + { + "epoch": 0.04945152420094025, + "grad_norm": 0.0964358001947403, + "learning_rate": 4.7703733380816715e-05, + "loss": 0.0222, + "step": 6690 + }, + { + "epoch": 0.0495254427722421, + "grad_norm": 0.07914058864116669, + "learning_rate": 4.770002374169041e-05, + "loss": 0.0222, + "step": 6700 + }, + { + "epoch": 0.04959936134354395, + "grad_norm": 0.09293976426124573, + "learning_rate": 4.769631410256411e-05, + "loss": 0.0231, + "step": 6710 + }, + { + "epoch": 0.049673279914845804, + "grad_norm": 0.09488365799188614, + "learning_rate": 4.76926044634378e-05, + "loss": 0.0205, + "step": 6720 + }, + { + "epoch": 0.049747198486147656, + "grad_norm": 0.10206745564937592, + "learning_rate": 4.768889482431149e-05, + "loss": 0.0231, + "step": 6730 + }, + { + "epoch": 0.049821117057449515, + "grad_norm": 0.08070364594459534, + "learning_rate": 4.768518518518519e-05, + "loss": 0.0233, + "step": 6740 + }, + { + "epoch": 0.04989503562875137, + "grad_norm": 0.10973179340362549, + "learning_rate": 4.768147554605888e-05, + "loss": 0.0223, + "step": 6750 + 
}, + { + "epoch": 0.04996895420005322, + "grad_norm": 0.12484482675790787, + "learning_rate": 4.767776590693257e-05, + "loss": 0.0233, + "step": 6760 + }, + { + "epoch": 0.05004287277135507, + "grad_norm": 0.11287377029657364, + "learning_rate": 4.767405626780627e-05, + "loss": 0.0241, + "step": 6770 + }, + { + "epoch": 0.05011679134265693, + "grad_norm": 0.12103903293609619, + "learning_rate": 4.7670346628679965e-05, + "loss": 0.0215, + "step": 6780 + }, + { + "epoch": 0.050190709913958784, + "grad_norm": 0.1062537357211113, + "learning_rate": 4.766663698955366e-05, + "loss": 0.0205, + "step": 6790 + }, + { + "epoch": 0.050264628485260636, + "grad_norm": 0.09399276226758957, + "learning_rate": 4.766292735042736e-05, + "loss": 0.0242, + "step": 6800 + }, + { + "epoch": 0.05033854705656249, + "grad_norm": 0.09174839407205582, + "learning_rate": 4.7659217711301046e-05, + "loss": 0.0209, + "step": 6810 + }, + { + "epoch": 0.05041246562786435, + "grad_norm": 0.11377735435962677, + "learning_rate": 4.765550807217474e-05, + "loss": 0.0205, + "step": 6820 + }, + { + "epoch": 0.0504863841991662, + "grad_norm": 0.11271318048238754, + "learning_rate": 4.765179843304843e-05, + "loss": 0.0235, + "step": 6830 + }, + { + "epoch": 0.05056030277046805, + "grad_norm": 0.10871057957410812, + "learning_rate": 4.764808879392213e-05, + "loss": 0.0214, + "step": 6840 + }, + { + "epoch": 0.050634221341769904, + "grad_norm": 0.13600623607635498, + "learning_rate": 4.764437915479582e-05, + "loss": 0.0239, + "step": 6850 + }, + { + "epoch": 0.05070813991307176, + "grad_norm": 0.11014101654291153, + "learning_rate": 4.764066951566952e-05, + "loss": 0.0239, + "step": 6860 + }, + { + "epoch": 0.050782058484373616, + "grad_norm": 0.10503526031970978, + "learning_rate": 4.7636959876543215e-05, + "loss": 0.0217, + "step": 6870 + }, + { + "epoch": 0.05085597705567547, + "grad_norm": 0.12979675829410553, + "learning_rate": 4.7633250237416904e-05, + "loss": 0.0212, + "step": 6880 + }, + { + "epoch": 0.05092989562697732, + "grad_norm": 0.1068769097328186, + "learning_rate": 4.76295405982906e-05, + "loss": 0.0209, + "step": 6890 + }, + { + "epoch": 0.05100381419827917, + "grad_norm": 0.11308766901493073, + "learning_rate": 4.7625830959164296e-05, + "loss": 0.0216, + "step": 6900 + }, + { + "epoch": 0.05107773276958103, + "grad_norm": 0.1139088124036789, + "learning_rate": 4.7622121320037986e-05, + "loss": 0.0228, + "step": 6910 + }, + { + "epoch": 0.051151651340882884, + "grad_norm": 0.10564363747835159, + "learning_rate": 4.761841168091168e-05, + "loss": 0.0218, + "step": 6920 + }, + { + "epoch": 0.051225569912184736, + "grad_norm": 0.11074735224246979, + "learning_rate": 4.761470204178538e-05, + "loss": 0.023, + "step": 6930 + }, + { + "epoch": 0.05129948848348659, + "grad_norm": 0.0938468873500824, + "learning_rate": 4.7610992402659073e-05, + "loss": 0.023, + "step": 6940 + }, + { + "epoch": 0.05137340705478845, + "grad_norm": 0.08696165680885315, + "learning_rate": 4.760728276353277e-05, + "loss": 0.0242, + "step": 6950 + }, + { + "epoch": 0.0514473256260903, + "grad_norm": 0.06806029379367828, + "learning_rate": 4.760357312440646e-05, + "loss": 0.0238, + "step": 6960 + }, + { + "epoch": 0.05152124419739215, + "grad_norm": 0.165776327252388, + "learning_rate": 4.7599863485280155e-05, + "loss": 0.0169, + "step": 6970 + }, + { + "epoch": 0.051595162768694004, + "grad_norm": 0.09899604320526123, + "learning_rate": 4.7596153846153844e-05, + "loss": 0.0217, + "step": 6980 + }, + { + "epoch": 0.05166908133999586, + "grad_norm": 
0.09343422204256058, + "learning_rate": 4.759244420702754e-05, + "loss": 0.0226, + "step": 6990 + }, + { + "epoch": 0.051742999911297716, + "grad_norm": 0.09994067251682281, + "learning_rate": 4.7588734567901236e-05, + "loss": 0.0236, + "step": 7000 + }, + { + "epoch": 0.05181691848259957, + "grad_norm": 0.08840420097112656, + "learning_rate": 4.758502492877493e-05, + "loss": 0.0205, + "step": 7010 + }, + { + "epoch": 0.05189083705390142, + "grad_norm": 0.11010074615478516, + "learning_rate": 4.758131528964863e-05, + "loss": 0.0248, + "step": 7020 + }, + { + "epoch": 0.05196475562520328, + "grad_norm": 0.1090909019112587, + "learning_rate": 4.7577605650522324e-05, + "loss": 0.0219, + "step": 7030 + }, + { + "epoch": 0.05203867419650513, + "grad_norm": 0.09599191695451736, + "learning_rate": 4.757389601139601e-05, + "loss": 0.0203, + "step": 7040 + }, + { + "epoch": 0.052112592767806984, + "grad_norm": 0.10519011318683624, + "learning_rate": 4.757018637226971e-05, + "loss": 0.0225, + "step": 7050 + }, + { + "epoch": 0.052186511339108836, + "grad_norm": 0.1019931212067604, + "learning_rate": 4.75664767331434e-05, + "loss": 0.0219, + "step": 7060 + }, + { + "epoch": 0.05226042991041069, + "grad_norm": 0.07580450177192688, + "learning_rate": 4.7562767094017094e-05, + "loss": 0.0244, + "step": 7070 + }, + { + "epoch": 0.05233434848171255, + "grad_norm": 0.10171625763177872, + "learning_rate": 4.755905745489079e-05, + "loss": 0.0236, + "step": 7080 + }, + { + "epoch": 0.0524082670530144, + "grad_norm": 0.10591152310371399, + "learning_rate": 4.7555347815764486e-05, + "loss": 0.0229, + "step": 7090 + }, + { + "epoch": 0.05248218562431625, + "grad_norm": 0.07710936665534973, + "learning_rate": 4.755163817663818e-05, + "loss": 0.0209, + "step": 7100 + }, + { + "epoch": 0.052556104195618104, + "grad_norm": 0.08563518524169922, + "learning_rate": 4.754792853751187e-05, + "loss": 0.0212, + "step": 7110 + }, + { + "epoch": 0.052630022766919964, + "grad_norm": 0.09113353490829468, + "learning_rate": 4.754421889838557e-05, + "loss": 0.0207, + "step": 7120 + }, + { + "epoch": 0.052703941338221816, + "grad_norm": 0.12325076758861542, + "learning_rate": 4.754050925925926e-05, + "loss": 0.0201, + "step": 7130 + }, + { + "epoch": 0.05277785990952367, + "grad_norm": 0.11564420163631439, + "learning_rate": 4.753679962013295e-05, + "loss": 0.0195, + "step": 7140 + }, + { + "epoch": 0.05285177848082552, + "grad_norm": 0.11357983201742172, + "learning_rate": 4.753308998100665e-05, + "loss": 0.0201, + "step": 7150 + }, + { + "epoch": 0.05292569705212738, + "grad_norm": 0.10943233221769333, + "learning_rate": 4.7529380341880344e-05, + "loss": 0.0238, + "step": 7160 + }, + { + "epoch": 0.05299961562342923, + "grad_norm": 0.12073173373937607, + "learning_rate": 4.752567070275404e-05, + "loss": 0.0259, + "step": 7170 + }, + { + "epoch": 0.053073534194731084, + "grad_norm": 0.08202937990427017, + "learning_rate": 4.7521961063627736e-05, + "loss": 0.0198, + "step": 7180 + }, + { + "epoch": 0.053147452766032936, + "grad_norm": 0.10718532651662827, + "learning_rate": 4.7518251424501425e-05, + "loss": 0.0206, + "step": 7190 + }, + { + "epoch": 0.05322137133733479, + "grad_norm": 0.10505930334329605, + "learning_rate": 4.751454178537512e-05, + "loss": 0.0238, + "step": 7200 + }, + { + "epoch": 0.05329528990863665, + "grad_norm": 0.11115860939025879, + "learning_rate": 4.751083214624881e-05, + "loss": 0.0204, + "step": 7210 + }, + { + "epoch": 0.0533692084799385, + "grad_norm": 0.10364003479480743, + "learning_rate": 
4.7507122507122506e-05, + "loss": 0.0226, + "step": 7220 + }, + { + "epoch": 0.05344312705124035, + "grad_norm": 0.10836455971002579, + "learning_rate": 4.75034128679962e-05, + "loss": 0.0228, + "step": 7230 + }, + { + "epoch": 0.053517045622542204, + "grad_norm": 0.11580884456634521, + "learning_rate": 4.74997032288699e-05, + "loss": 0.0223, + "step": 7240 + }, + { + "epoch": 0.053590964193844064, + "grad_norm": 0.10361438989639282, + "learning_rate": 4.7495993589743594e-05, + "loss": 0.0216, + "step": 7250 + }, + { + "epoch": 0.053664882765145916, + "grad_norm": 0.10192573815584183, + "learning_rate": 4.749228395061729e-05, + "loss": 0.0246, + "step": 7260 + }, + { + "epoch": 0.05373880133644777, + "grad_norm": 0.11494297534227371, + "learning_rate": 4.748857431149098e-05, + "loss": 0.0222, + "step": 7270 + }, + { + "epoch": 0.05381271990774962, + "grad_norm": 0.1062757596373558, + "learning_rate": 4.7484864672364675e-05, + "loss": 0.018, + "step": 7280 + }, + { + "epoch": 0.05388663847905148, + "grad_norm": 0.1719273179769516, + "learning_rate": 4.7481155033238365e-05, + "loss": 0.0239, + "step": 7290 + }, + { + "epoch": 0.05396055705035333, + "grad_norm": 0.0962737575173378, + "learning_rate": 4.747744539411206e-05, + "loss": 0.0218, + "step": 7300 + }, + { + "epoch": 0.054034475621655184, + "grad_norm": 0.12584984302520752, + "learning_rate": 4.7473735754985757e-05, + "loss": 0.0238, + "step": 7310 + }, + { + "epoch": 0.054108394192957036, + "grad_norm": 0.11260943859815598, + "learning_rate": 4.747002611585945e-05, + "loss": 0.0199, + "step": 7320 + }, + { + "epoch": 0.054182312764258896, + "grad_norm": 0.08724574744701385, + "learning_rate": 4.746631647673315e-05, + "loss": 0.0227, + "step": 7330 + }, + { + "epoch": 0.05425623133556075, + "grad_norm": 0.09556634724140167, + "learning_rate": 4.746260683760684e-05, + "loss": 0.0207, + "step": 7340 + }, + { + "epoch": 0.0543301499068626, + "grad_norm": 0.09617023169994354, + "learning_rate": 4.7458897198480534e-05, + "loss": 0.02, + "step": 7350 + }, + { + "epoch": 0.05440406847816445, + "grad_norm": 0.11631660163402557, + "learning_rate": 4.745518755935423e-05, + "loss": 0.0209, + "step": 7360 + }, + { + "epoch": 0.054477987049466305, + "grad_norm": 0.08489320427179337, + "learning_rate": 4.745147792022792e-05, + "loss": 0.0201, + "step": 7370 + }, + { + "epoch": 0.054551905620768164, + "grad_norm": 0.09915885329246521, + "learning_rate": 4.7447768281101615e-05, + "loss": 0.0211, + "step": 7380 + }, + { + "epoch": 0.054625824192070016, + "grad_norm": 0.09194263815879822, + "learning_rate": 4.744405864197531e-05, + "loss": 0.0213, + "step": 7390 + }, + { + "epoch": 0.05469974276337187, + "grad_norm": 0.09207558631896973, + "learning_rate": 4.744034900284901e-05, + "loss": 0.0204, + "step": 7400 + }, + { + "epoch": 0.05477366133467372, + "grad_norm": 0.09770120680332184, + "learning_rate": 4.74366393637227e-05, + "loss": 0.0233, + "step": 7410 + }, + { + "epoch": 0.05484757990597558, + "grad_norm": 0.09625663608312607, + "learning_rate": 4.743292972459639e-05, + "loss": 0.0217, + "step": 7420 + }, + { + "epoch": 0.05492149847727743, + "grad_norm": 0.07443071156740189, + "learning_rate": 4.742922008547009e-05, + "loss": 0.0234, + "step": 7430 + }, + { + "epoch": 0.054995417048579284, + "grad_norm": 0.10070905834436417, + "learning_rate": 4.742551044634378e-05, + "loss": 0.0209, + "step": 7440 + }, + { + "epoch": 0.055069335619881137, + "grad_norm": 0.12188205868005753, + "learning_rate": 4.742180080721747e-05, + "loss": 0.0233, + "step": 
7450 + }, + { + "epoch": 0.055143254191182996, + "grad_norm": 0.09652313590049744, + "learning_rate": 4.741809116809117e-05, + "loss": 0.0199, + "step": 7460 + }, + { + "epoch": 0.05521717276248485, + "grad_norm": 0.10234474390745163, + "learning_rate": 4.7414381528964865e-05, + "loss": 0.0221, + "step": 7470 + }, + { + "epoch": 0.0552910913337867, + "grad_norm": 0.10418083518743515, + "learning_rate": 4.741067188983856e-05, + "loss": 0.0204, + "step": 7480 + }, + { + "epoch": 0.05536500990508855, + "grad_norm": 0.0978635922074318, + "learning_rate": 4.740696225071226e-05, + "loss": 0.0199, + "step": 7490 + }, + { + "epoch": 0.05543892847639041, + "grad_norm": 0.11908965557813644, + "learning_rate": 4.7403252611585946e-05, + "loss": 0.0239, + "step": 7500 + }, + { + "epoch": 0.055512847047692264, + "grad_norm": 0.09557194262742996, + "learning_rate": 4.739954297245964e-05, + "loss": 0.0216, + "step": 7510 + }, + { + "epoch": 0.055586765618994116, + "grad_norm": 0.08496109396219254, + "learning_rate": 4.739583333333333e-05, + "loss": 0.0217, + "step": 7520 + }, + { + "epoch": 0.05566068419029597, + "grad_norm": 0.08821641653776169, + "learning_rate": 4.739212369420703e-05, + "loss": 0.0193, + "step": 7530 + }, + { + "epoch": 0.05573460276159782, + "grad_norm": 0.11779448390007019, + "learning_rate": 4.738841405508073e-05, + "loss": 0.0218, + "step": 7540 + }, + { + "epoch": 0.05580852133289968, + "grad_norm": 0.12221374362707138, + "learning_rate": 4.738470441595442e-05, + "loss": 0.0238, + "step": 7550 + }, + { + "epoch": 0.05588243990420153, + "grad_norm": 0.15340213477611542, + "learning_rate": 4.7380994776828115e-05, + "loss": 0.0249, + "step": 7560 + }, + { + "epoch": 0.055956358475503384, + "grad_norm": 0.15344905853271484, + "learning_rate": 4.7377285137701804e-05, + "loss": 0.0253, + "step": 7570 + }, + { + "epoch": 0.05603027704680524, + "grad_norm": 0.10215380042791367, + "learning_rate": 4.73735754985755e-05, + "loss": 0.0195, + "step": 7580 + }, + { + "epoch": 0.056104195618107096, + "grad_norm": 0.07139965891838074, + "learning_rate": 4.7369865859449196e-05, + "loss": 0.018, + "step": 7590 + }, + { + "epoch": 0.05617811418940895, + "grad_norm": 0.121430903673172, + "learning_rate": 4.7366156220322885e-05, + "loss": 0.0224, + "step": 7600 + }, + { + "epoch": 0.0562520327607108, + "grad_norm": 0.11266914755105972, + "learning_rate": 4.736244658119658e-05, + "loss": 0.0233, + "step": 7610 + }, + { + "epoch": 0.05632595133201265, + "grad_norm": 0.09315747767686844, + "learning_rate": 4.735873694207028e-05, + "loss": 0.0214, + "step": 7620 + }, + { + "epoch": 0.05639986990331451, + "grad_norm": 0.07198236882686615, + "learning_rate": 4.735502730294397e-05, + "loss": 0.0216, + "step": 7630 + }, + { + "epoch": 0.056473788474616364, + "grad_norm": 0.12135116010904312, + "learning_rate": 4.735131766381767e-05, + "loss": 0.0209, + "step": 7640 + }, + { + "epoch": 0.056547707045918216, + "grad_norm": 0.08780569583177567, + "learning_rate": 4.734760802469136e-05, + "loss": 0.0218, + "step": 7650 + }, + { + "epoch": 0.05662162561722007, + "grad_norm": 0.10140490531921387, + "learning_rate": 4.7343898385565054e-05, + "loss": 0.0235, + "step": 7660 + }, + { + "epoch": 0.05669554418852192, + "grad_norm": 0.08562912791967392, + "learning_rate": 4.7340188746438744e-05, + "loss": 0.0181, + "step": 7670 + }, + { + "epoch": 0.05676946275982378, + "grad_norm": 0.13408763706684113, + "learning_rate": 4.733647910731244e-05, + "loss": 0.0256, + "step": 7680 + }, + { + "epoch": 0.05684338133112563, + 
"grad_norm": 0.11752607673406601, + "learning_rate": 4.733276946818614e-05, + "loss": 0.022, + "step": 7690 + }, + { + "epoch": 0.056917299902427484, + "grad_norm": 0.08813085407018661, + "learning_rate": 4.732905982905983e-05, + "loss": 0.0202, + "step": 7700 + }, + { + "epoch": 0.05699121847372934, + "grad_norm": 0.09921573847532272, + "learning_rate": 4.732535018993353e-05, + "loss": 0.0201, + "step": 7710 + }, + { + "epoch": 0.057065137045031196, + "grad_norm": 0.09355275332927704, + "learning_rate": 4.7321640550807223e-05, + "loss": 0.0194, + "step": 7720 + }, + { + "epoch": 0.05713905561633305, + "grad_norm": 0.08472117781639099, + "learning_rate": 4.731793091168091e-05, + "loss": 0.0228, + "step": 7730 + }, + { + "epoch": 0.0572129741876349, + "grad_norm": 0.11851277202367783, + "learning_rate": 4.731422127255461e-05, + "loss": 0.0235, + "step": 7740 + }, + { + "epoch": 0.05728689275893675, + "grad_norm": 0.0786430612206459, + "learning_rate": 4.73105116334283e-05, + "loss": 0.0197, + "step": 7750 + }, + { + "epoch": 0.05736081133023861, + "grad_norm": 0.12320347875356674, + "learning_rate": 4.7306801994301994e-05, + "loss": 0.0283, + "step": 7760 + }, + { + "epoch": 0.057434729901540464, + "grad_norm": 0.0892769992351532, + "learning_rate": 4.7303092355175696e-05, + "loss": 0.0203, + "step": 7770 + }, + { + "epoch": 0.057508648472842316, + "grad_norm": 0.09888837486505508, + "learning_rate": 4.7299382716049386e-05, + "loss": 0.0221, + "step": 7780 + }, + { + "epoch": 0.05758256704414417, + "grad_norm": 0.11733058840036392, + "learning_rate": 4.729567307692308e-05, + "loss": 0.0223, + "step": 7790 + }, + { + "epoch": 0.05765648561544603, + "grad_norm": 0.16852419078350067, + "learning_rate": 4.729196343779677e-05, + "loss": 0.0223, + "step": 7800 + }, + { + "epoch": 0.05773040418674788, + "grad_norm": 0.11733348667621613, + "learning_rate": 4.728825379867047e-05, + "loss": 0.0226, + "step": 7810 + }, + { + "epoch": 0.05780432275804973, + "grad_norm": 0.13076969981193542, + "learning_rate": 4.728454415954416e-05, + "loss": 0.0223, + "step": 7820 + }, + { + "epoch": 0.057878241329351585, + "grad_norm": 0.07921652495861053, + "learning_rate": 4.728083452041785e-05, + "loss": 0.0219, + "step": 7830 + }, + { + "epoch": 0.05795215990065344, + "grad_norm": 0.10504285991191864, + "learning_rate": 4.727712488129155e-05, + "loss": 0.0224, + "step": 7840 + }, + { + "epoch": 0.058026078471955296, + "grad_norm": 0.11202266067266464, + "learning_rate": 4.7273415242165244e-05, + "loss": 0.0245, + "step": 7850 + }, + { + "epoch": 0.05809999704325715, + "grad_norm": 0.12498188763856888, + "learning_rate": 4.726970560303894e-05, + "loss": 0.0231, + "step": 7860 + }, + { + "epoch": 0.058173915614559, + "grad_norm": 0.0961272194981575, + "learning_rate": 4.7265995963912636e-05, + "loss": 0.0238, + "step": 7870 + }, + { + "epoch": 0.05824783418586085, + "grad_norm": 0.09642474353313446, + "learning_rate": 4.7262286324786325e-05, + "loss": 0.0208, + "step": 7880 + }, + { + "epoch": 0.05832175275716271, + "grad_norm": 0.07648549973964691, + "learning_rate": 4.725857668566002e-05, + "loss": 0.0213, + "step": 7890 + }, + { + "epoch": 0.058395671328464564, + "grad_norm": 0.12183746695518494, + "learning_rate": 4.725486704653371e-05, + "loss": 0.0225, + "step": 7900 + }, + { + "epoch": 0.058469589899766417, + "grad_norm": 0.10755477100610733, + "learning_rate": 4.7251157407407406e-05, + "loss": 0.0191, + "step": 7910 + }, + { + "epoch": 0.05854350847106827, + "grad_norm": 0.12599851191043854, + 
"learning_rate": 4.724744776828111e-05, + "loss": 0.0236, + "step": 7920 + }, + { + "epoch": 0.05861742704237013, + "grad_norm": 0.11307878792285919, + "learning_rate": 4.72437381291548e-05, + "loss": 0.0208, + "step": 7930 + }, + { + "epoch": 0.05869134561367198, + "grad_norm": 0.107178695499897, + "learning_rate": 4.7240028490028494e-05, + "loss": 0.0211, + "step": 7940 + }, + { + "epoch": 0.05876526418497383, + "grad_norm": 0.16315461695194244, + "learning_rate": 4.723631885090219e-05, + "loss": 0.0217, + "step": 7950 + }, + { + "epoch": 0.058839182756275685, + "grad_norm": 0.09971021860837936, + "learning_rate": 4.723260921177588e-05, + "loss": 0.0187, + "step": 7960 + }, + { + "epoch": 0.058913101327577544, + "grad_norm": 0.1025579646229744, + "learning_rate": 4.7228899572649575e-05, + "loss": 0.0224, + "step": 7970 + }, + { + "epoch": 0.058987019898879396, + "grad_norm": 0.08908526599407196, + "learning_rate": 4.7225189933523264e-05, + "loss": 0.0235, + "step": 7980 + }, + { + "epoch": 0.05906093847018125, + "grad_norm": 0.18570634722709656, + "learning_rate": 4.722148029439696e-05, + "loss": 0.0232, + "step": 7990 + }, + { + "epoch": 0.0591348570414831, + "grad_norm": 0.08980167657136917, + "learning_rate": 4.721777065527066e-05, + "loss": 0.0235, + "step": 8000 + }, + { + "epoch": 0.05920877561278495, + "grad_norm": 0.1067327931523323, + "learning_rate": 4.721406101614435e-05, + "loss": 0.024, + "step": 8010 + }, + { + "epoch": 0.05928269418408681, + "grad_norm": 0.10643284767866135, + "learning_rate": 4.721035137701805e-05, + "loss": 0.0235, + "step": 8020 + }, + { + "epoch": 0.059356612755388664, + "grad_norm": 0.11560133099555969, + "learning_rate": 4.720664173789174e-05, + "loss": 0.0243, + "step": 8030 + }, + { + "epoch": 0.05943053132669052, + "grad_norm": 0.1009771004319191, + "learning_rate": 4.720293209876543e-05, + "loss": 0.0243, + "step": 8040 + }, + { + "epoch": 0.05950444989799237, + "grad_norm": 0.1109670102596283, + "learning_rate": 4.719922245963913e-05, + "loss": 0.0213, + "step": 8050 + }, + { + "epoch": 0.05957836846929423, + "grad_norm": 0.09228888154029846, + "learning_rate": 4.719551282051282e-05, + "loss": 0.0211, + "step": 8060 + }, + { + "epoch": 0.05965228704059608, + "grad_norm": 0.10006465017795563, + "learning_rate": 4.719180318138652e-05, + "loss": 0.0209, + "step": 8070 + }, + { + "epoch": 0.05972620561189793, + "grad_norm": 0.10764101892709732, + "learning_rate": 4.718809354226021e-05, + "loss": 0.0213, + "step": 8080 + }, + { + "epoch": 0.059800124183199785, + "grad_norm": 0.08787568658590317, + "learning_rate": 4.7184383903133906e-05, + "loss": 0.0199, + "step": 8090 + }, + { + "epoch": 0.059874042754501644, + "grad_norm": 0.1373230218887329, + "learning_rate": 4.71806742640076e-05, + "loss": 0.0252, + "step": 8100 + }, + { + "epoch": 0.059947961325803496, + "grad_norm": 0.08644258230924606, + "learning_rate": 4.717696462488129e-05, + "loss": 0.0198, + "step": 8110 + }, + { + "epoch": 0.06002187989710535, + "grad_norm": 0.08341766148805618, + "learning_rate": 4.717325498575499e-05, + "loss": 0.0241, + "step": 8120 + }, + { + "epoch": 0.0600957984684072, + "grad_norm": 0.09608685970306396, + "learning_rate": 4.716954534662868e-05, + "loss": 0.0215, + "step": 8130 + }, + { + "epoch": 0.06016971703970905, + "grad_norm": 0.1065250039100647, + "learning_rate": 4.716583570750237e-05, + "loss": 0.0225, + "step": 8140 + }, + { + "epoch": 0.06024363561101091, + "grad_norm": 0.12095583975315094, + "learning_rate": 4.7162126068376075e-05, + "loss": 0.0212, + 
"step": 8150 + }, + { + "epoch": 0.060317554182312764, + "grad_norm": 0.09507690370082855, + "learning_rate": 4.7158416429249765e-05, + "loss": 0.0219, + "step": 8160 + }, + { + "epoch": 0.06039147275361462, + "grad_norm": 0.09556754678487778, + "learning_rate": 4.715470679012346e-05, + "loss": 0.023, + "step": 8170 + }, + { + "epoch": 0.06046539132491647, + "grad_norm": 0.08235856890678406, + "learning_rate": 4.7150997150997157e-05, + "loss": 0.0218, + "step": 8180 + }, + { + "epoch": 0.06053930989621833, + "grad_norm": 0.1018480509519577, + "learning_rate": 4.7147287511870846e-05, + "loss": 0.022, + "step": 8190 + }, + { + "epoch": 0.06061322846752018, + "grad_norm": 0.08940430730581284, + "learning_rate": 4.714357787274454e-05, + "loss": 0.0218, + "step": 8200 + }, + { + "epoch": 0.06068714703882203, + "grad_norm": 0.11216285824775696, + "learning_rate": 4.713986823361823e-05, + "loss": 0.025, + "step": 8210 + }, + { + "epoch": 0.060761065610123885, + "grad_norm": 0.09381680935621262, + "learning_rate": 4.7136158594491934e-05, + "loss": 0.0229, + "step": 8220 + }, + { + "epoch": 0.060834984181425744, + "grad_norm": 0.1107122078537941, + "learning_rate": 4.713244895536563e-05, + "loss": 0.02, + "step": 8230 + }, + { + "epoch": 0.060908902752727596, + "grad_norm": 0.10714753717184067, + "learning_rate": 4.712873931623932e-05, + "loss": 0.0224, + "step": 8240 + }, + { + "epoch": 0.06098282132402945, + "grad_norm": 0.09350134432315826, + "learning_rate": 4.7125029677113015e-05, + "loss": 0.0223, + "step": 8250 + }, + { + "epoch": 0.0610567398953313, + "grad_norm": 0.10498078912496567, + "learning_rate": 4.7121320037986704e-05, + "loss": 0.0218, + "step": 8260 + }, + { + "epoch": 0.06113065846663316, + "grad_norm": 0.09392526000738144, + "learning_rate": 4.71176103988604e-05, + "loss": 0.022, + "step": 8270 + }, + { + "epoch": 0.06120457703793501, + "grad_norm": 0.08764494955539703, + "learning_rate": 4.7113900759734096e-05, + "loss": 0.0202, + "step": 8280 + }, + { + "epoch": 0.061278495609236865, + "grad_norm": 0.14300397038459778, + "learning_rate": 4.7110191120607785e-05, + "loss": 0.0213, + "step": 8290 + }, + { + "epoch": 0.06135241418053872, + "grad_norm": 0.09841963648796082, + "learning_rate": 4.710648148148149e-05, + "loss": 0.0219, + "step": 8300 + }, + { + "epoch": 0.06142633275184057, + "grad_norm": 0.10955634713172913, + "learning_rate": 4.710277184235518e-05, + "loss": 0.0194, + "step": 8310 + }, + { + "epoch": 0.06150025132314243, + "grad_norm": 0.0871150940656662, + "learning_rate": 4.709906220322887e-05, + "loss": 0.0239, + "step": 8320 + }, + { + "epoch": 0.06157416989444428, + "grad_norm": 0.08169959485530853, + "learning_rate": 4.709535256410257e-05, + "loss": 0.0213, + "step": 8330 + }, + { + "epoch": 0.06164808846574613, + "grad_norm": 0.08927330374717712, + "learning_rate": 4.709164292497626e-05, + "loss": 0.0194, + "step": 8340 + }, + { + "epoch": 0.061722007037047985, + "grad_norm": 0.09410972148180008, + "learning_rate": 4.7087933285849954e-05, + "loss": 0.0223, + "step": 8350 + }, + { + "epoch": 0.061795925608349844, + "grad_norm": 0.09375404566526413, + "learning_rate": 4.708422364672364e-05, + "loss": 0.0211, + "step": 8360 + }, + { + "epoch": 0.061869844179651697, + "grad_norm": 0.08001571893692017, + "learning_rate": 4.7080514007597346e-05, + "loss": 0.0206, + "step": 8370 + }, + { + "epoch": 0.06194376275095355, + "grad_norm": 0.10927123576402664, + "learning_rate": 4.707680436847104e-05, + "loss": 0.024, + "step": 8380 + }, + { + "epoch": 0.0620176813222554, 
+ "grad_norm": 0.13424262404441833, + "learning_rate": 4.707309472934473e-05, + "loss": 0.0227, + "step": 8390 + }, + { + "epoch": 0.06209159989355726, + "grad_norm": 0.13544967770576477, + "learning_rate": 4.706938509021843e-05, + "loss": 0.0217, + "step": 8400 + }, + { + "epoch": 0.06216551846485911, + "grad_norm": 0.09935387969017029, + "learning_rate": 4.706567545109212e-05, + "loss": 0.0231, + "step": 8410 + }, + { + "epoch": 0.062239437036160965, + "grad_norm": 0.08632265031337738, + "learning_rate": 4.706196581196581e-05, + "loss": 0.0239, + "step": 8420 + }, + { + "epoch": 0.06231335560746282, + "grad_norm": 0.09724076092243195, + "learning_rate": 4.705825617283951e-05, + "loss": 0.0197, + "step": 8430 + }, + { + "epoch": 0.062387274178764676, + "grad_norm": 0.11309103667736053, + "learning_rate": 4.70545465337132e-05, + "loss": 0.0227, + "step": 8440 + }, + { + "epoch": 0.06246119275006653, + "grad_norm": 0.12005878984928131, + "learning_rate": 4.70508368945869e-05, + "loss": 0.0228, + "step": 8450 + }, + { + "epoch": 0.06253511132136838, + "grad_norm": 0.09755506366491318, + "learning_rate": 4.7047127255460596e-05, + "loss": 0.0218, + "step": 8460 + }, + { + "epoch": 0.06260902989267024, + "grad_norm": 0.10736546665430069, + "learning_rate": 4.7043417616334285e-05, + "loss": 0.0236, + "step": 8470 + }, + { + "epoch": 0.06268294846397209, + "grad_norm": 0.13473279774188995, + "learning_rate": 4.703970797720798e-05, + "loss": 0.0206, + "step": 8480 + }, + { + "epoch": 0.06275686703527394, + "grad_norm": 0.08754564076662064, + "learning_rate": 4.703599833808167e-05, + "loss": 0.0215, + "step": 8490 + }, + { + "epoch": 0.06283078560657579, + "grad_norm": 0.11559465527534485, + "learning_rate": 4.7032288698955367e-05, + "loss": 0.023, + "step": 8500 + }, + { + "epoch": 0.06290470417787765, + "grad_norm": 0.11303414404392242, + "learning_rate": 4.702857905982906e-05, + "loss": 0.0204, + "step": 8510 + }, + { + "epoch": 0.06297862274917951, + "grad_norm": 0.1281813234090805, + "learning_rate": 4.702486942070276e-05, + "loss": 0.0227, + "step": 8520 + }, + { + "epoch": 0.06305254132048135, + "grad_norm": 0.1113453060388565, + "learning_rate": 4.7021159781576454e-05, + "loss": 0.0195, + "step": 8530 + }, + { + "epoch": 0.06312645989178321, + "grad_norm": 0.09026116132736206, + "learning_rate": 4.7017450142450144e-05, + "loss": 0.023, + "step": 8540 + }, + { + "epoch": 0.06320037846308507, + "grad_norm": 0.10665830224752426, + "learning_rate": 4.701374050332384e-05, + "loss": 0.0237, + "step": 8550 + }, + { + "epoch": 0.06327429703438692, + "grad_norm": 0.08242546766996384, + "learning_rate": 4.7010030864197536e-05, + "loss": 0.0274, + "step": 8560 + }, + { + "epoch": 0.06334821560568878, + "grad_norm": 0.09004712104797363, + "learning_rate": 4.7006321225071225e-05, + "loss": 0.0198, + "step": 8570 + }, + { + "epoch": 0.06342213417699062, + "grad_norm": 0.11416617780923843, + "learning_rate": 4.700261158594492e-05, + "loss": 0.0252, + "step": 8580 + }, + { + "epoch": 0.06349605274829248, + "grad_norm": 0.1045583114027977, + "learning_rate": 4.699890194681861e-05, + "loss": 0.0228, + "step": 8590 + }, + { + "epoch": 0.06356997131959434, + "grad_norm": 0.09816955775022507, + "learning_rate": 4.699519230769231e-05, + "loss": 0.0198, + "step": 8600 + }, + { + "epoch": 0.06364388989089619, + "grad_norm": 0.10833249986171722, + "learning_rate": 4.699148266856601e-05, + "loss": 0.021, + "step": 8610 + }, + { + "epoch": 0.06371780846219804, + "grad_norm": 0.11117073148488998, + "learning_rate": 
4.69877730294397e-05, + "loss": 0.023, + "step": 8620 + }, + { + "epoch": 0.06379172703349989, + "grad_norm": 0.10439736396074295, + "learning_rate": 4.6984063390313394e-05, + "loss": 0.0235, + "step": 8630 + }, + { + "epoch": 0.06386564560480175, + "grad_norm": 0.1064261794090271, + "learning_rate": 4.698035375118709e-05, + "loss": 0.0235, + "step": 8640 + }, + { + "epoch": 0.06393956417610361, + "grad_norm": 0.08481346070766449, + "learning_rate": 4.697664411206078e-05, + "loss": 0.0199, + "step": 8650 + }, + { + "epoch": 0.06401348274740545, + "grad_norm": 0.07966621220111847, + "learning_rate": 4.6972934472934475e-05, + "loss": 0.0214, + "step": 8660 + }, + { + "epoch": 0.06408740131870731, + "grad_norm": 0.09658713638782501, + "learning_rate": 4.696922483380817e-05, + "loss": 0.0205, + "step": 8670 + }, + { + "epoch": 0.06416131989000917, + "grad_norm": 0.09488363564014435, + "learning_rate": 4.696551519468187e-05, + "loss": 0.024, + "step": 8680 + }, + { + "epoch": 0.06423523846131102, + "grad_norm": 0.10669881850481033, + "learning_rate": 4.696180555555556e-05, + "loss": 0.0207, + "step": 8690 + }, + { + "epoch": 0.06430915703261288, + "grad_norm": 0.12813617289066315, + "learning_rate": 4.695809591642925e-05, + "loss": 0.0218, + "step": 8700 + }, + { + "epoch": 0.06438307560391472, + "grad_norm": 0.12795375287532806, + "learning_rate": 4.695438627730295e-05, + "loss": 0.0223, + "step": 8710 + }, + { + "epoch": 0.06445699417521658, + "grad_norm": 0.08577212691307068, + "learning_rate": 4.695067663817664e-05, + "loss": 0.0201, + "step": 8720 + }, + { + "epoch": 0.06453091274651844, + "grad_norm": 0.14490552246570587, + "learning_rate": 4.694696699905033e-05, + "loss": 0.0211, + "step": 8730 + }, + { + "epoch": 0.06460483131782029, + "grad_norm": 0.07944640517234802, + "learning_rate": 4.694325735992403e-05, + "loss": 0.0211, + "step": 8740 + }, + { + "epoch": 0.06467874988912214, + "grad_norm": 0.08489855378866196, + "learning_rate": 4.6939547720797725e-05, + "loss": 0.0215, + "step": 8750 + }, + { + "epoch": 0.064752668460424, + "grad_norm": 0.11695750057697296, + "learning_rate": 4.693583808167142e-05, + "loss": 0.0238, + "step": 8760 + }, + { + "epoch": 0.06482658703172585, + "grad_norm": 0.09435489028692245, + "learning_rate": 4.693212844254511e-05, + "loss": 0.0193, + "step": 8770 + }, + { + "epoch": 0.06490050560302771, + "grad_norm": 0.0975150316953659, + "learning_rate": 4.6928418803418806e-05, + "loss": 0.0213, + "step": 8780 + }, + { + "epoch": 0.06497442417432955, + "grad_norm": 0.07952667772769928, + "learning_rate": 4.69247091642925e-05, + "loss": 0.0216, + "step": 8790 + }, + { + "epoch": 0.06504834274563141, + "grad_norm": 0.09845630079507828, + "learning_rate": 4.692099952516619e-05, + "loss": 0.022, + "step": 8800 + }, + { + "epoch": 0.06512226131693327, + "grad_norm": 0.09602982550859451, + "learning_rate": 4.691728988603989e-05, + "loss": 0.0198, + "step": 8810 + }, + { + "epoch": 0.06519617988823512, + "grad_norm": 0.09934545308351517, + "learning_rate": 4.691358024691358e-05, + "loss": 0.0205, + "step": 8820 + }, + { + "epoch": 0.06527009845953698, + "grad_norm": 0.10673316568136215, + "learning_rate": 4.690987060778728e-05, + "loss": 0.0227, + "step": 8830 + }, + { + "epoch": 0.06534401703083882, + "grad_norm": 0.0985034927725792, + "learning_rate": 4.6906160968660975e-05, + "loss": 0.0226, + "step": 8840 + }, + { + "epoch": 0.06541793560214068, + "grad_norm": 0.09951265901327133, + "learning_rate": 4.6902451329534664e-05, + "loss": 0.0235, + "step": 8850 + }, + 
{ + "epoch": 0.06549185417344254, + "grad_norm": 0.11149190366268158, + "learning_rate": 4.689874169040836e-05, + "loss": 0.0248, + "step": 8860 + }, + { + "epoch": 0.06556577274474439, + "grad_norm": 0.11519861966371536, + "learning_rate": 4.6895032051282056e-05, + "loss": 0.0228, + "step": 8870 + }, + { + "epoch": 0.06563969131604624, + "grad_norm": 0.12151552736759186, + "learning_rate": 4.6891322412155745e-05, + "loss": 0.0183, + "step": 8880 + }, + { + "epoch": 0.0657136098873481, + "grad_norm": 0.09308876097202301, + "learning_rate": 4.688761277302944e-05, + "loss": 0.0225, + "step": 8890 + }, + { + "epoch": 0.06578752845864995, + "grad_norm": 0.0984383374452591, + "learning_rate": 4.688390313390314e-05, + "loss": 0.02, + "step": 8900 + }, + { + "epoch": 0.06586144702995181, + "grad_norm": 0.10223396122455597, + "learning_rate": 4.6880193494776833e-05, + "loss": 0.0201, + "step": 8910 + }, + { + "epoch": 0.06593536560125365, + "grad_norm": 0.12238939851522446, + "learning_rate": 4.687648385565053e-05, + "loss": 0.0211, + "step": 8920 + }, + { + "epoch": 0.06600928417255551, + "grad_norm": 0.0670313909649849, + "learning_rate": 4.687277421652422e-05, + "loss": 0.0225, + "step": 8930 + }, + { + "epoch": 0.06608320274385737, + "grad_norm": 0.09160995483398438, + "learning_rate": 4.6869064577397915e-05, + "loss": 0.0235, + "step": 8940 + }, + { + "epoch": 0.06615712131515922, + "grad_norm": 0.09256432950496674, + "learning_rate": 4.6865354938271604e-05, + "loss": 0.0236, + "step": 8950 + }, + { + "epoch": 0.06623103988646108, + "grad_norm": 0.1058010682463646, + "learning_rate": 4.68616452991453e-05, + "loss": 0.0224, + "step": 8960 + }, + { + "epoch": 0.06630495845776292, + "grad_norm": 0.12149609625339508, + "learning_rate": 4.6857935660018996e-05, + "loss": 0.0223, + "step": 8970 + }, + { + "epoch": 0.06637887702906478, + "grad_norm": 0.09200827777385712, + "learning_rate": 4.685422602089269e-05, + "loss": 0.0238, + "step": 8980 + }, + { + "epoch": 0.06645279560036664, + "grad_norm": 0.0811193585395813, + "learning_rate": 4.685051638176639e-05, + "loss": 0.0211, + "step": 8990 + }, + { + "epoch": 0.06652671417166849, + "grad_norm": 0.10229609906673431, + "learning_rate": 4.684680674264008e-05, + "loss": 0.0235, + "step": 9000 + }, + { + "epoch": 0.06660063274297034, + "grad_norm": 0.08031179010868073, + "learning_rate": 4.684309710351377e-05, + "loss": 0.0217, + "step": 9010 + }, + { + "epoch": 0.0666745513142722, + "grad_norm": 0.08803895115852356, + "learning_rate": 4.683938746438747e-05, + "loss": 0.0213, + "step": 9020 + }, + { + "epoch": 0.06674846988557405, + "grad_norm": 0.07700366526842117, + "learning_rate": 4.683567782526116e-05, + "loss": 0.0209, + "step": 9030 + }, + { + "epoch": 0.06682238845687591, + "grad_norm": 0.11948195844888687, + "learning_rate": 4.6831968186134854e-05, + "loss": 0.0234, + "step": 9040 + }, + { + "epoch": 0.06689630702817775, + "grad_norm": 0.10591305792331696, + "learning_rate": 4.682825854700855e-05, + "loss": 0.022, + "step": 9050 + }, + { + "epoch": 0.06697022559947961, + "grad_norm": 0.09493808448314667, + "learning_rate": 4.6824548907882246e-05, + "loss": 0.0215, + "step": 9060 + }, + { + "epoch": 0.06704414417078147, + "grad_norm": 0.09348826110363007, + "learning_rate": 4.682083926875594e-05, + "loss": 0.0242, + "step": 9070 + }, + { + "epoch": 0.06711806274208332, + "grad_norm": 0.10891830921173096, + "learning_rate": 4.681712962962963e-05, + "loss": 0.0202, + "step": 9080 + }, + { + "epoch": 0.06719198131338518, + "grad_norm": 
0.08920338749885559, + "learning_rate": 4.681341999050333e-05, + "loss": 0.0207, + "step": 9090 + }, + { + "epoch": 0.06726589988468702, + "grad_norm": 0.08524997532367706, + "learning_rate": 4.680971035137702e-05, + "loss": 0.0213, + "step": 9100 + }, + { + "epoch": 0.06733981845598888, + "grad_norm": 0.10065700858831406, + "learning_rate": 4.680600071225071e-05, + "loss": 0.0206, + "step": 9110 + }, + { + "epoch": 0.06741373702729074, + "grad_norm": 0.09767220169305801, + "learning_rate": 4.680229107312441e-05, + "loss": 0.0212, + "step": 9120 + }, + { + "epoch": 0.06748765559859259, + "grad_norm": 0.08452901989221573, + "learning_rate": 4.6798581433998104e-05, + "loss": 0.0207, + "step": 9130 + }, + { + "epoch": 0.06756157416989444, + "grad_norm": 0.12105948477983475, + "learning_rate": 4.67948717948718e-05, + "loss": 0.0194, + "step": 9140 + }, + { + "epoch": 0.0676354927411963, + "grad_norm": 0.08149902522563934, + "learning_rate": 4.6791162155745496e-05, + "loss": 0.0207, + "step": 9150 + }, + { + "epoch": 0.06770941131249815, + "grad_norm": 0.11312330514192581, + "learning_rate": 4.6787452516619185e-05, + "loss": 0.0229, + "step": 9160 + }, + { + "epoch": 0.06778332988380001, + "grad_norm": 0.0945291742682457, + "learning_rate": 4.678374287749288e-05, + "loss": 0.0209, + "step": 9170 + }, + { + "epoch": 0.06785724845510185, + "grad_norm": 0.10265430808067322, + "learning_rate": 4.678003323836657e-05, + "loss": 0.0203, + "step": 9180 + }, + { + "epoch": 0.06793116702640371, + "grad_norm": 0.11135527491569519, + "learning_rate": 4.6776323599240266e-05, + "loss": 0.0218, + "step": 9190 + }, + { + "epoch": 0.06800508559770557, + "grad_norm": 0.14490050077438354, + "learning_rate": 4.677261396011396e-05, + "loss": 0.0238, + "step": 9200 + }, + { + "epoch": 0.06807900416900742, + "grad_norm": 0.08578338474035263, + "learning_rate": 4.676890432098766e-05, + "loss": 0.023, + "step": 9210 + }, + { + "epoch": 0.06815292274030928, + "grad_norm": 0.10262192040681839, + "learning_rate": 4.6765194681861354e-05, + "loss": 0.0224, + "step": 9220 + }, + { + "epoch": 0.06822684131161114, + "grad_norm": 0.09532645344734192, + "learning_rate": 4.676148504273504e-05, + "loss": 0.0227, + "step": 9230 + }, + { + "epoch": 0.06830075988291298, + "grad_norm": 0.09172376990318298, + "learning_rate": 4.675777540360874e-05, + "loss": 0.0219, + "step": 9240 + }, + { + "epoch": 0.06837467845421484, + "grad_norm": 0.08387801051139832, + "learning_rate": 4.6754065764482435e-05, + "loss": 0.0205, + "step": 9250 + }, + { + "epoch": 0.06844859702551669, + "grad_norm": 0.09637405723333359, + "learning_rate": 4.6750356125356124e-05, + "loss": 0.0243, + "step": 9260 + }, + { + "epoch": 0.06852251559681855, + "grad_norm": 0.08822491765022278, + "learning_rate": 4.674664648622982e-05, + "loss": 0.0211, + "step": 9270 + }, + { + "epoch": 0.0685964341681204, + "grad_norm": 0.19279421865940094, + "learning_rate": 4.6742936847103516e-05, + "loss": 0.0235, + "step": 9280 + }, + { + "epoch": 0.06867035273942225, + "grad_norm": 0.09226642549037933, + "learning_rate": 4.673922720797721e-05, + "loss": 0.0212, + "step": 9290 + }, + { + "epoch": 0.06874427131072411, + "grad_norm": 0.1656857132911682, + "learning_rate": 4.673551756885091e-05, + "loss": 0.0228, + "step": 9300 + }, + { + "epoch": 0.06881818988202595, + "grad_norm": 0.0956842228770256, + "learning_rate": 4.67318079297246e-05, + "loss": 0.0219, + "step": 9310 + }, + { + "epoch": 0.06889210845332781, + "grad_norm": 0.09942670166492462, + "learning_rate": 
4.6728098290598294e-05, + "loss": 0.0212, + "step": 9320 + }, + { + "epoch": 0.06896602702462967, + "grad_norm": 0.09194504469633102, + "learning_rate": 4.672438865147199e-05, + "loss": 0.0231, + "step": 9330 + }, + { + "epoch": 0.06903994559593152, + "grad_norm": 0.09336016327142715, + "learning_rate": 4.672067901234568e-05, + "loss": 0.0218, + "step": 9340 + }, + { + "epoch": 0.06911386416723338, + "grad_norm": 0.10817690193653107, + "learning_rate": 4.6716969373219375e-05, + "loss": 0.0238, + "step": 9350 + }, + { + "epoch": 0.06918778273853524, + "grad_norm": 0.10755597054958344, + "learning_rate": 4.671325973409307e-05, + "loss": 0.0211, + "step": 9360 + }, + { + "epoch": 0.06926170130983708, + "grad_norm": 0.11105576902627945, + "learning_rate": 4.6709550094966767e-05, + "loss": 0.0198, + "step": 9370 + }, + { + "epoch": 0.06933561988113894, + "grad_norm": 0.09294114261865616, + "learning_rate": 4.670584045584046e-05, + "loss": 0.0195, + "step": 9380 + }, + { + "epoch": 0.06940953845244079, + "grad_norm": 0.10066384822130203, + "learning_rate": 4.670213081671415e-05, + "loss": 0.0223, + "step": 9390 + }, + { + "epoch": 0.06948345702374265, + "grad_norm": 0.11441809684038162, + "learning_rate": 4.669842117758785e-05, + "loss": 0.0206, + "step": 9400 + }, + { + "epoch": 0.0695573755950445, + "grad_norm": 0.11113165318965912, + "learning_rate": 4.669471153846154e-05, + "loss": 0.0236, + "step": 9410 + }, + { + "epoch": 0.06963129416634635, + "grad_norm": 0.081535205245018, + "learning_rate": 4.669100189933523e-05, + "loss": 0.0206, + "step": 9420 + }, + { + "epoch": 0.06970521273764821, + "grad_norm": 0.08822637796401978, + "learning_rate": 4.668729226020893e-05, + "loss": 0.0222, + "step": 9430 + }, + { + "epoch": 0.06977913130895005, + "grad_norm": 0.10207725316286087, + "learning_rate": 4.6683582621082625e-05, + "loss": 0.0188, + "step": 9440 + }, + { + "epoch": 0.06985304988025191, + "grad_norm": 0.10252611339092255, + "learning_rate": 4.667987298195632e-05, + "loss": 0.0216, + "step": 9450 + }, + { + "epoch": 0.06992696845155377, + "grad_norm": 0.07835535705089569, + "learning_rate": 4.667616334283001e-05, + "loss": 0.0207, + "step": 9460 + }, + { + "epoch": 0.07000088702285562, + "grad_norm": 0.08460769802331924, + "learning_rate": 4.6672453703703706e-05, + "loss": 0.0211, + "step": 9470 + }, + { + "epoch": 0.07007480559415748, + "grad_norm": 0.15236924588680267, + "learning_rate": 4.66687440645774e-05, + "loss": 0.0247, + "step": 9480 + }, + { + "epoch": 0.07014872416545934, + "grad_norm": 0.1699046641588211, + "learning_rate": 4.666503442545109e-05, + "loss": 0.0223, + "step": 9490 + }, + { + "epoch": 0.07022264273676118, + "grad_norm": 0.10630764812231064, + "learning_rate": 4.666132478632479e-05, + "loss": 0.0248, + "step": 9500 + }, + { + "epoch": 0.07029656130806304, + "grad_norm": 0.08956080675125122, + "learning_rate": 4.665761514719848e-05, + "loss": 0.0213, + "step": 9510 + }, + { + "epoch": 0.07037047987936489, + "grad_norm": 0.1831154227256775, + "learning_rate": 4.665390550807218e-05, + "loss": 0.0259, + "step": 9520 + }, + { + "epoch": 0.07044439845066675, + "grad_norm": 0.08845095336437225, + "learning_rate": 4.6650195868945875e-05, + "loss": 0.0215, + "step": 9530 + }, + { + "epoch": 0.0705183170219686, + "grad_norm": 0.0894920602440834, + "learning_rate": 4.6646486229819564e-05, + "loss": 0.0228, + "step": 9540 + }, + { + "epoch": 0.07059223559327045, + "grad_norm": 0.10273686796426773, + "learning_rate": 4.664277659069326e-05, + "loss": 0.0224, + "step": 9550 + 
}, + { + "epoch": 0.07066615416457231, + "grad_norm": 0.09976007044315338, + "learning_rate": 4.6639066951566956e-05, + "loss": 0.0225, + "step": 9560 + }, + { + "epoch": 0.07074007273587415, + "grad_norm": 0.11248171329498291, + "learning_rate": 4.6635357312440645e-05, + "loss": 0.0223, + "step": 9570 + }, + { + "epoch": 0.07081399130717601, + "grad_norm": 0.11681903153657913, + "learning_rate": 4.663164767331434e-05, + "loss": 0.0234, + "step": 9580 + }, + { + "epoch": 0.07088790987847787, + "grad_norm": 0.10257956385612488, + "learning_rate": 4.662793803418804e-05, + "loss": 0.0218, + "step": 9590 + }, + { + "epoch": 0.07096182844977972, + "grad_norm": 0.0785127729177475, + "learning_rate": 4.662422839506173e-05, + "loss": 0.0184, + "step": 9600 + }, + { + "epoch": 0.07103574702108158, + "grad_norm": 0.10620249807834625, + "learning_rate": 4.662051875593543e-05, + "loss": 0.0229, + "step": 9610 + }, + { + "epoch": 0.07110966559238344, + "grad_norm": 0.04874080419540405, + "learning_rate": 4.661680911680912e-05, + "loss": 0.0207, + "step": 9620 + }, + { + "epoch": 0.07118358416368528, + "grad_norm": 0.1125664934515953, + "learning_rate": 4.6613099477682814e-05, + "loss": 0.0236, + "step": 9630 + }, + { + "epoch": 0.07125750273498714, + "grad_norm": 0.08896657824516296, + "learning_rate": 4.6609389838556503e-05, + "loss": 0.0207, + "step": 9640 + }, + { + "epoch": 0.07133142130628899, + "grad_norm": 0.10772913694381714, + "learning_rate": 4.66056801994302e-05, + "loss": 0.0206, + "step": 9650 + }, + { + "epoch": 0.07140533987759085, + "grad_norm": 0.10582844167947769, + "learning_rate": 4.6601970560303895e-05, + "loss": 0.0219, + "step": 9660 + }, + { + "epoch": 0.0714792584488927, + "grad_norm": 0.0744025856256485, + "learning_rate": 4.659826092117759e-05, + "loss": 0.0216, + "step": 9670 + }, + { + "epoch": 0.07155317702019455, + "grad_norm": 0.08924957364797592, + "learning_rate": 4.659455128205129e-05, + "loss": 0.024, + "step": 9680 + }, + { + "epoch": 0.07162709559149641, + "grad_norm": 0.08392763137817383, + "learning_rate": 4.6590841642924977e-05, + "loss": 0.0185, + "step": 9690 + }, + { + "epoch": 0.07170101416279827, + "grad_norm": 0.06921983510255814, + "learning_rate": 4.658713200379867e-05, + "loss": 0.0204, + "step": 9700 + }, + { + "epoch": 0.07177493273410011, + "grad_norm": 0.07526102662086487, + "learning_rate": 4.658342236467237e-05, + "loss": 0.0199, + "step": 9710 + }, + { + "epoch": 0.07184885130540197, + "grad_norm": 0.07144385576248169, + "learning_rate": 4.657971272554606e-05, + "loss": 0.0219, + "step": 9720 + }, + { + "epoch": 0.07192276987670382, + "grad_norm": 0.11594574898481369, + "learning_rate": 4.6576003086419754e-05, + "loss": 0.0207, + "step": 9730 + }, + { + "epoch": 0.07199668844800568, + "grad_norm": 0.10086581110954285, + "learning_rate": 4.657229344729345e-05, + "loss": 0.0214, + "step": 9740 + }, + { + "epoch": 0.07207060701930754, + "grad_norm": 0.11353831738233566, + "learning_rate": 4.6568583808167146e-05, + "loss": 0.0219, + "step": 9750 + }, + { + "epoch": 0.07214452559060938, + "grad_norm": 0.07933341711759567, + "learning_rate": 4.656487416904084e-05, + "loss": 0.0205, + "step": 9760 + }, + { + "epoch": 0.07221844416191124, + "grad_norm": 0.11647021025419235, + "learning_rate": 4.656116452991453e-05, + "loss": 0.019, + "step": 9770 + }, + { + "epoch": 0.07229236273321309, + "grad_norm": 0.13896368443965912, + "learning_rate": 4.655745489078823e-05, + "loss": 0.0212, + "step": 9780 + }, + { + "epoch": 0.07236628130451495, + "grad_norm": 
0.09607965499162674, + "learning_rate": 4.655374525166192e-05, + "loss": 0.0203, + "step": 9790 + }, + { + "epoch": 0.0724401998758168, + "grad_norm": 0.11384850740432739, + "learning_rate": 4.655003561253561e-05, + "loss": 0.0212, + "step": 9800 + }, + { + "epoch": 0.07251411844711865, + "grad_norm": 0.10243972390890121, + "learning_rate": 4.654632597340931e-05, + "loss": 0.0208, + "step": 9810 + }, + { + "epoch": 0.07258803701842051, + "grad_norm": 0.1119912713766098, + "learning_rate": 4.6542616334283004e-05, + "loss": 0.0235, + "step": 9820 + }, + { + "epoch": 0.07266195558972237, + "grad_norm": 0.09179473668336868, + "learning_rate": 4.65389066951567e-05, + "loss": 0.0226, + "step": 9830 + }, + { + "epoch": 0.07273587416102421, + "grad_norm": 0.13562758266925812, + "learning_rate": 4.6535197056030396e-05, + "loss": 0.0226, + "step": 9840 + }, + { + "epoch": 0.07280979273232607, + "grad_norm": 0.12494170665740967, + "learning_rate": 4.6531487416904085e-05, + "loss": 0.0196, + "step": 9850 + }, + { + "epoch": 0.07288371130362792, + "grad_norm": 0.07629244774580002, + "learning_rate": 4.652777777777778e-05, + "loss": 0.0198, + "step": 9860 + }, + { + "epoch": 0.07295762987492978, + "grad_norm": 0.0884569063782692, + "learning_rate": 4.652406813865147e-05, + "loss": 0.0204, + "step": 9870 + }, + { + "epoch": 0.07303154844623164, + "grad_norm": 0.11918651312589645, + "learning_rate": 4.6520358499525166e-05, + "loss": 0.0233, + "step": 9880 + }, + { + "epoch": 0.07310546701753348, + "grad_norm": 0.10792674869298935, + "learning_rate": 4.651664886039886e-05, + "loss": 0.0227, + "step": 9890 + }, + { + "epoch": 0.07317938558883534, + "grad_norm": 0.08517907559871674, + "learning_rate": 4.651293922127256e-05, + "loss": 0.0213, + "step": 9900 + }, + { + "epoch": 0.07325330416013719, + "grad_norm": 0.11669266223907471, + "learning_rate": 4.6509229582146254e-05, + "loss": 0.0212, + "step": 9910 + }, + { + "epoch": 0.07332722273143905, + "grad_norm": 0.08712171763181686, + "learning_rate": 4.650551994301994e-05, + "loss": 0.0226, + "step": 9920 + }, + { + "epoch": 0.0734011413027409, + "grad_norm": 0.11379371583461761, + "learning_rate": 4.650181030389364e-05, + "loss": 0.0223, + "step": 9930 + }, + { + "epoch": 0.07347505987404275, + "grad_norm": 0.08450641483068466, + "learning_rate": 4.6498100664767335e-05, + "loss": 0.0214, + "step": 9940 + }, + { + "epoch": 0.07354897844534461, + "grad_norm": 0.09043429046869278, + "learning_rate": 4.6494391025641024e-05, + "loss": 0.0206, + "step": 9950 + }, + { + "epoch": 0.07362289701664647, + "grad_norm": 0.08543136715888977, + "learning_rate": 4.649068138651472e-05, + "loss": 0.0211, + "step": 9960 + }, + { + "epoch": 0.07369681558794831, + "grad_norm": 0.11070428788661957, + "learning_rate": 4.6486971747388416e-05, + "loss": 0.0199, + "step": 9970 + }, + { + "epoch": 0.07377073415925017, + "grad_norm": 0.08564955741167068, + "learning_rate": 4.648326210826211e-05, + "loss": 0.02, + "step": 9980 + }, + { + "epoch": 0.07384465273055202, + "grad_norm": 0.09740497916936874, + "learning_rate": 4.647955246913581e-05, + "loss": 0.0219, + "step": 9990 + }, + { + "epoch": 0.07391857130185388, + "grad_norm": 0.13537032902240753, + "learning_rate": 4.64758428300095e-05, + "loss": 0.0231, + "step": 10000 + }, + { + "epoch": 0.07391857130185388, + "eval_f1": 0.5774625204378211, + "eval_loss": 0.0209824051707983, + "eval_precision": 0.4580666850684049, + "eval_recall": 0.7810425453601965, + "eval_runtime": 2765.0265, + "eval_samples_per_second": 195.707, + 
"eval_steps_per_second": 3.058, + "step": 10000 + }, + { + "epoch": 0.07399248987315574, + "grad_norm": 0.08094476908445358, + "learning_rate": 4.647213319088319e-05, + "loss": 0.0204, + "step": 10010 + }, + { + "epoch": 0.07406640844445758, + "grad_norm": 0.11555436998605728, + "learning_rate": 4.646842355175689e-05, + "loss": 0.0186, + "step": 10020 + }, + { + "epoch": 0.07414032701575944, + "grad_norm": 0.09903301298618317, + "learning_rate": 4.646471391263058e-05, + "loss": 0.0238, + "step": 10030 + }, + { + "epoch": 0.07421424558706129, + "grad_norm": 0.09888233989477158, + "learning_rate": 4.6461004273504274e-05, + "loss": 0.0206, + "step": 10040 + }, + { + "epoch": 0.07428816415836315, + "grad_norm": 0.08759963512420654, + "learning_rate": 4.645729463437797e-05, + "loss": 0.0233, + "step": 10050 + }, + { + "epoch": 0.074362082729665, + "grad_norm": 0.11545658856630325, + "learning_rate": 4.6453584995251666e-05, + "loss": 0.0216, + "step": 10060 + }, + { + "epoch": 0.07443600130096685, + "grad_norm": 0.08216286450624466, + "learning_rate": 4.644987535612536e-05, + "loss": 0.0226, + "step": 10070 + }, + { + "epoch": 0.07450991987226871, + "grad_norm": 0.0777827575802803, + "learning_rate": 4.644616571699905e-05, + "loss": 0.0213, + "step": 10080 + }, + { + "epoch": 0.07458383844357057, + "grad_norm": 0.11039154976606369, + "learning_rate": 4.644245607787275e-05, + "loss": 0.0241, + "step": 10090 + }, + { + "epoch": 0.07465775701487241, + "grad_norm": 0.08568236231803894, + "learning_rate": 4.643874643874644e-05, + "loss": 0.0226, + "step": 10100 + }, + { + "epoch": 0.07473167558617427, + "grad_norm": 0.11165298521518707, + "learning_rate": 4.643503679962013e-05, + "loss": 0.0207, + "step": 10110 + }, + { + "epoch": 0.07480559415747612, + "grad_norm": 0.0949447751045227, + "learning_rate": 4.643132716049383e-05, + "loss": 0.0213, + "step": 10120 + }, + { + "epoch": 0.07487951272877798, + "grad_norm": 0.10933278501033783, + "learning_rate": 4.6427617521367525e-05, + "loss": 0.0212, + "step": 10130 + }, + { + "epoch": 0.07495343130007984, + "grad_norm": 0.07423562556505203, + "learning_rate": 4.642390788224122e-05, + "loss": 0.0197, + "step": 10140 + }, + { + "epoch": 0.07502734987138168, + "grad_norm": 0.12832492589950562, + "learning_rate": 4.642019824311491e-05, + "loss": 0.021, + "step": 10150 + }, + { + "epoch": 0.07510126844268354, + "grad_norm": 0.10669612884521484, + "learning_rate": 4.6416488603988606e-05, + "loss": 0.0228, + "step": 10160 + }, + { + "epoch": 0.0751751870139854, + "grad_norm": 0.11487097293138504, + "learning_rate": 4.64127789648623e-05, + "loss": 0.0219, + "step": 10170 + }, + { + "epoch": 0.07524910558528725, + "grad_norm": 0.11099672317504883, + "learning_rate": 4.640906932573599e-05, + "loss": 0.0202, + "step": 10180 + }, + { + "epoch": 0.0753230241565891, + "grad_norm": 0.11408338695764542, + "learning_rate": 4.640535968660969e-05, + "loss": 0.021, + "step": 10190 + }, + { + "epoch": 0.07539694272789095, + "grad_norm": 0.0799785628914833, + "learning_rate": 4.640165004748338e-05, + "loss": 0.0196, + "step": 10200 + }, + { + "epoch": 0.07547086129919281, + "grad_norm": 0.11926797777414322, + "learning_rate": 4.639794040835708e-05, + "loss": 0.0232, + "step": 10210 + }, + { + "epoch": 0.07554477987049467, + "grad_norm": 0.1310286521911621, + "learning_rate": 4.6394230769230775e-05, + "loss": 0.0202, + "step": 10220 + }, + { + "epoch": 0.07561869844179651, + "grad_norm": 0.08759672194719315, + "learning_rate": 4.6390521130104464e-05, + "loss": 0.0221, + 
"step": 10230 + }, + { + "epoch": 0.07569261701309837, + "grad_norm": 0.10249481350183487, + "learning_rate": 4.638681149097816e-05, + "loss": 0.0203, + "step": 10240 + }, + { + "epoch": 0.07576653558440022, + "grad_norm": 0.07235664129257202, + "learning_rate": 4.6383101851851856e-05, + "loss": 0.0218, + "step": 10250 + }, + { + "epoch": 0.07584045415570208, + "grad_norm": 0.08320758491754532, + "learning_rate": 4.6379392212725545e-05, + "loss": 0.0209, + "step": 10260 + }, + { + "epoch": 0.07591437272700394, + "grad_norm": 0.09801856428384781, + "learning_rate": 4.637568257359924e-05, + "loss": 0.0226, + "step": 10270 + }, + { + "epoch": 0.07598829129830578, + "grad_norm": 0.09113458544015884, + "learning_rate": 4.637197293447294e-05, + "loss": 0.0251, + "step": 10280 + }, + { + "epoch": 0.07606220986960764, + "grad_norm": 0.09591145813465118, + "learning_rate": 4.636826329534663e-05, + "loss": 0.0211, + "step": 10290 + }, + { + "epoch": 0.0761361284409095, + "grad_norm": 0.10051578283309937, + "learning_rate": 4.636455365622033e-05, + "loss": 0.0219, + "step": 10300 + }, + { + "epoch": 0.07621004701221135, + "grad_norm": 0.09922511130571365, + "learning_rate": 4.636084401709402e-05, + "loss": 0.0191, + "step": 10310 + }, + { + "epoch": 0.0762839655835132, + "grad_norm": 0.17670218646526337, + "learning_rate": 4.6357134377967714e-05, + "loss": 0.0214, + "step": 10320 + }, + { + "epoch": 0.07635788415481505, + "grad_norm": 0.07537711411714554, + "learning_rate": 4.63534247388414e-05, + "loss": 0.0205, + "step": 10330 + }, + { + "epoch": 0.07643180272611691, + "grad_norm": 0.10695944726467133, + "learning_rate": 4.63497150997151e-05, + "loss": 0.022, + "step": 10340 + }, + { + "epoch": 0.07650572129741877, + "grad_norm": 0.10096986591815948, + "learning_rate": 4.6346005460588795e-05, + "loss": 0.0218, + "step": 10350 + }, + { + "epoch": 0.07657963986872061, + "grad_norm": 0.1410195380449295, + "learning_rate": 4.634229582146249e-05, + "loss": 0.0214, + "step": 10360 + }, + { + "epoch": 0.07665355844002247, + "grad_norm": 0.07780885696411133, + "learning_rate": 4.633858618233619e-05, + "loss": 0.0208, + "step": 10370 + }, + { + "epoch": 0.07672747701132432, + "grad_norm": 0.08565017580986023, + "learning_rate": 4.6334876543209876e-05, + "loss": 0.0203, + "step": 10380 + }, + { + "epoch": 0.07680139558262618, + "grad_norm": 0.10375560820102692, + "learning_rate": 4.633116690408357e-05, + "loss": 0.0224, + "step": 10390 + }, + { + "epoch": 0.07687531415392804, + "grad_norm": 0.10677853971719742, + "learning_rate": 4.632745726495727e-05, + "loss": 0.0224, + "step": 10400 + }, + { + "epoch": 0.07694923272522988, + "grad_norm": 0.08839945495128632, + "learning_rate": 4.632374762583096e-05, + "loss": 0.0245, + "step": 10410 + }, + { + "epoch": 0.07702315129653174, + "grad_norm": 0.11980149149894714, + "learning_rate": 4.632003798670465e-05, + "loss": 0.0212, + "step": 10420 + }, + { + "epoch": 0.0770970698678336, + "grad_norm": 0.08483819663524628, + "learning_rate": 4.631632834757835e-05, + "loss": 0.0219, + "step": 10430 + }, + { + "epoch": 0.07717098843913545, + "grad_norm": 0.15062575042247772, + "learning_rate": 4.6312618708452045e-05, + "loss": 0.0217, + "step": 10440 + }, + { + "epoch": 0.0772449070104373, + "grad_norm": 0.08806386590003967, + "learning_rate": 4.630890906932574e-05, + "loss": 0.0225, + "step": 10450 + }, + { + "epoch": 0.07731882558173915, + "grad_norm": 0.11287733167409897, + "learning_rate": 4.630519943019943e-05, + "loss": 0.0263, + "step": 10460 + }, + { + "epoch": 
0.07739274415304101, + "grad_norm": 0.12708717584609985, + "learning_rate": 4.6301489791073126e-05, + "loss": 0.0217, + "step": 10470 + }, + { + "epoch": 0.07746666272434287, + "grad_norm": 0.1146998181939125, + "learning_rate": 4.629778015194682e-05, + "loss": 0.0203, + "step": 10480 + }, + { + "epoch": 0.07754058129564471, + "grad_norm": 0.11010906845331192, + "learning_rate": 4.629407051282051e-05, + "loss": 0.021, + "step": 10490 + }, + { + "epoch": 0.07761449986694657, + "grad_norm": 0.07615244388580322, + "learning_rate": 4.629036087369421e-05, + "loss": 0.0221, + "step": 10500 + }, + { + "epoch": 0.07768841843824842, + "grad_norm": 0.09600045531988144, + "learning_rate": 4.6286651234567904e-05, + "loss": 0.0231, + "step": 10510 + }, + { + "epoch": 0.07776233700955028, + "grad_norm": 0.11465899646282196, + "learning_rate": 4.62829415954416e-05, + "loss": 0.0218, + "step": 10520 + }, + { + "epoch": 0.07783625558085214, + "grad_norm": 0.08350540697574615, + "learning_rate": 4.6279231956315295e-05, + "loss": 0.0199, + "step": 10530 + }, + { + "epoch": 0.07791017415215398, + "grad_norm": 0.1006699725985527, + "learning_rate": 4.6275522317188985e-05, + "loss": 0.022, + "step": 10540 + }, + { + "epoch": 0.07798409272345584, + "grad_norm": 0.08721012622117996, + "learning_rate": 4.627181267806268e-05, + "loss": 0.0268, + "step": 10550 + }, + { + "epoch": 0.0780580112947577, + "grad_norm": 0.07063861191272736, + "learning_rate": 4.626810303893637e-05, + "loss": 0.0176, + "step": 10560 + }, + { + "epoch": 0.07813192986605955, + "grad_norm": 0.12120065093040466, + "learning_rate": 4.6264393399810066e-05, + "loss": 0.0205, + "step": 10570 + }, + { + "epoch": 0.0782058484373614, + "grad_norm": 0.1365332305431366, + "learning_rate": 4.626068376068376e-05, + "loss": 0.0235, + "step": 10580 + }, + { + "epoch": 0.07827976700866325, + "grad_norm": 0.09110606461763382, + "learning_rate": 4.625697412155746e-05, + "loss": 0.0181, + "step": 10590 + }, + { + "epoch": 0.07835368557996511, + "grad_norm": 0.09668658673763275, + "learning_rate": 4.6253264482431154e-05, + "loss": 0.0226, + "step": 10600 + }, + { + "epoch": 0.07842760415126697, + "grad_norm": 0.13093529641628265, + "learning_rate": 4.624955484330484e-05, + "loss": 0.0232, + "step": 10610 + }, + { + "epoch": 0.07850152272256881, + "grad_norm": 0.11461758613586426, + "learning_rate": 4.624584520417854e-05, + "loss": 0.0215, + "step": 10620 + }, + { + "epoch": 0.07857544129387067, + "grad_norm": 0.08175163716077805, + "learning_rate": 4.6242135565052235e-05, + "loss": 0.021, + "step": 10630 + }, + { + "epoch": 0.07864935986517253, + "grad_norm": 0.09790777415037155, + "learning_rate": 4.6238425925925924e-05, + "loss": 0.0215, + "step": 10640 + }, + { + "epoch": 0.07872327843647438, + "grad_norm": 0.09221351891756058, + "learning_rate": 4.623471628679962e-05, + "loss": 0.0246, + "step": 10650 + }, + { + "epoch": 0.07879719700777624, + "grad_norm": 0.09441790729761124, + "learning_rate": 4.6231006647673316e-05, + "loss": 0.0197, + "step": 10660 + }, + { + "epoch": 0.07887111557907808, + "grad_norm": 0.07809023559093475, + "learning_rate": 4.622729700854701e-05, + "loss": 0.0175, + "step": 10670 + }, + { + "epoch": 0.07894503415037994, + "grad_norm": 0.12023591250181198, + "learning_rate": 4.622358736942071e-05, + "loss": 0.0206, + "step": 10680 + }, + { + "epoch": 0.0790189527216818, + "grad_norm": 0.10869825631380081, + "learning_rate": 4.62198777302944e-05, + "loss": 0.0229, + "step": 10690 + }, + { + "epoch": 0.07909287129298365, + "grad_norm": 
0.09325161576271057, + "learning_rate": 4.621616809116809e-05, + "loss": 0.0259, + "step": 10700 + }, + { + "epoch": 0.0791667898642855, + "grad_norm": 0.10033397376537323, + "learning_rate": 4.621245845204179e-05, + "loss": 0.0227, + "step": 10710 + }, + { + "epoch": 0.07924070843558735, + "grad_norm": 0.11315719038248062, + "learning_rate": 4.620874881291548e-05, + "loss": 0.0225, + "step": 10720 + }, + { + "epoch": 0.07931462700688921, + "grad_norm": 0.08948411047458649, + "learning_rate": 4.6205039173789174e-05, + "loss": 0.023, + "step": 10730 + }, + { + "epoch": 0.07938854557819107, + "grad_norm": 0.11530829966068268, + "learning_rate": 4.620132953466287e-05, + "loss": 0.0232, + "step": 10740 + }, + { + "epoch": 0.07946246414949291, + "grad_norm": 0.08206850290298462, + "learning_rate": 4.6197619895536566e-05, + "loss": 0.0198, + "step": 10750 + }, + { + "epoch": 0.07953638272079477, + "grad_norm": 0.07246618717908859, + "learning_rate": 4.619391025641026e-05, + "loss": 0.0208, + "step": 10760 + }, + { + "epoch": 0.07961030129209663, + "grad_norm": 0.10258765518665314, + "learning_rate": 4.619020061728395e-05, + "loss": 0.0219, + "step": 10770 + }, + { + "epoch": 0.07968421986339848, + "grad_norm": 0.09250310063362122, + "learning_rate": 4.618649097815765e-05, + "loss": 0.0221, + "step": 10780 + }, + { + "epoch": 0.07975813843470034, + "grad_norm": 0.09063389152288437, + "learning_rate": 4.6182781339031336e-05, + "loss": 0.0223, + "step": 10790 + }, + { + "epoch": 0.07983205700600218, + "grad_norm": 0.08710388094186783, + "learning_rate": 4.617907169990503e-05, + "loss": 0.02, + "step": 10800 + }, + { + "epoch": 0.07990597557730404, + "grad_norm": 0.11745099723339081, + "learning_rate": 4.6175362060778735e-05, + "loss": 0.0221, + "step": 10810 + }, + { + "epoch": 0.0799798941486059, + "grad_norm": 0.0828913003206253, + "learning_rate": 4.6171652421652424e-05, + "loss": 0.0215, + "step": 10820 + }, + { + "epoch": 0.08005381271990775, + "grad_norm": 0.10530825704336166, + "learning_rate": 4.616794278252612e-05, + "loss": 0.0226, + "step": 10830 + }, + { + "epoch": 0.0801277312912096, + "grad_norm": 0.08071672916412354, + "learning_rate": 4.616423314339981e-05, + "loss": 0.0189, + "step": 10840 + }, + { + "epoch": 0.08020164986251145, + "grad_norm": 0.085964135825634, + "learning_rate": 4.6160523504273505e-05, + "loss": 0.0226, + "step": 10850 + }, + { + "epoch": 0.08027556843381331, + "grad_norm": 0.08544149994850159, + "learning_rate": 4.61568138651472e-05, + "loss": 0.0238, + "step": 10860 + }, + { + "epoch": 0.08034948700511517, + "grad_norm": 0.09496266394853592, + "learning_rate": 4.615310422602089e-05, + "loss": 0.0212, + "step": 10870 + }, + { + "epoch": 0.08042340557641701, + "grad_norm": 0.08944917470216751, + "learning_rate": 4.6149394586894587e-05, + "loss": 0.0194, + "step": 10880 + }, + { + "epoch": 0.08049732414771887, + "grad_norm": 0.1165291890501976, + "learning_rate": 4.614568494776828e-05, + "loss": 0.0226, + "step": 10890 + }, + { + "epoch": 0.08057124271902073, + "grad_norm": 0.1558302789926529, + "learning_rate": 4.614197530864198e-05, + "loss": 0.0221, + "step": 10900 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 0.1054091528058052, + "learning_rate": 4.6138265669515674e-05, + "loss": 0.0173, + "step": 10910 + }, + { + "epoch": 0.08071907986162444, + "grad_norm": 0.0848877876996994, + "learning_rate": 4.6134556030389364e-05, + "loss": 0.0188, + "step": 10920 + }, + { + "epoch": 0.08079299843292628, + "grad_norm": 0.10108659416437149, + "learning_rate": 
4.613084639126306e-05, + "loss": 0.0232, + "step": 10930 + }, + { + "epoch": 0.08086691700422814, + "grad_norm": 0.11190935969352722, + "learning_rate": 4.6127136752136756e-05, + "loss": 0.0226, + "step": 10940 + }, + { + "epoch": 0.08094083557553, + "grad_norm": 0.09286840260028839, + "learning_rate": 4.6123427113010445e-05, + "loss": 0.0237, + "step": 10950 + }, + { + "epoch": 0.08101475414683185, + "grad_norm": 0.08794547617435455, + "learning_rate": 4.611971747388415e-05, + "loss": 0.0219, + "step": 10960 + }, + { + "epoch": 0.0810886727181337, + "grad_norm": 0.08183534443378448, + "learning_rate": 4.611600783475784e-05, + "loss": 0.0212, + "step": 10970 + }, + { + "epoch": 0.08116259128943555, + "grad_norm": 0.0907917469739914, + "learning_rate": 4.611229819563153e-05, + "loss": 0.0209, + "step": 10980 + }, + { + "epoch": 0.08123650986073741, + "grad_norm": 0.09019339084625244, + "learning_rate": 4.610858855650523e-05, + "loss": 0.0222, + "step": 10990 + }, + { + "epoch": 0.08131042843203927, + "grad_norm": 0.09627287089824677, + "learning_rate": 4.610487891737892e-05, + "loss": 0.0217, + "step": 11000 + }, + { + "epoch": 0.08138434700334111, + "grad_norm": 0.09994477778673172, + "learning_rate": 4.6101169278252614e-05, + "loss": 0.0219, + "step": 11010 + }, + { + "epoch": 0.08145826557464297, + "grad_norm": 0.07893578708171844, + "learning_rate": 4.60974596391263e-05, + "loss": 0.0243, + "step": 11020 + }, + { + "epoch": 0.08153218414594483, + "grad_norm": 0.10810483247041702, + "learning_rate": 4.609375e-05, + "loss": 0.0217, + "step": 11030 + }, + { + "epoch": 0.08160610271724668, + "grad_norm": 0.09902021288871765, + "learning_rate": 4.60900403608737e-05, + "loss": 0.0216, + "step": 11040 + }, + { + "epoch": 0.08168002128854854, + "grad_norm": 0.12042216211557388, + "learning_rate": 4.608633072174739e-05, + "loss": 0.0214, + "step": 11050 + }, + { + "epoch": 0.08175393985985038, + "grad_norm": 0.10445085912942886, + "learning_rate": 4.608262108262109e-05, + "loss": 0.0229, + "step": 11060 + }, + { + "epoch": 0.08182785843115224, + "grad_norm": 0.11058395355939865, + "learning_rate": 4.6078911443494776e-05, + "loss": 0.0224, + "step": 11070 + }, + { + "epoch": 0.0819017770024541, + "grad_norm": 0.08662155270576477, + "learning_rate": 4.607520180436847e-05, + "loss": 0.0219, + "step": 11080 + }, + { + "epoch": 0.08197569557375595, + "grad_norm": 0.12212938070297241, + "learning_rate": 4.607149216524217e-05, + "loss": 0.0243, + "step": 11090 + }, + { + "epoch": 0.0820496141450578, + "grad_norm": 0.09374962747097015, + "learning_rate": 4.606778252611586e-05, + "loss": 0.0215, + "step": 11100 + }, + { + "epoch": 0.08212353271635967, + "grad_norm": 0.07948266714811325, + "learning_rate": 4.606407288698956e-05, + "loss": 0.0221, + "step": 11110 + }, + { + "epoch": 0.08219745128766151, + "grad_norm": 0.09368613362312317, + "learning_rate": 4.606036324786325e-05, + "loss": 0.0187, + "step": 11120 + }, + { + "epoch": 0.08227136985896337, + "grad_norm": 0.10957033187150955, + "learning_rate": 4.6056653608736945e-05, + "loss": 0.0229, + "step": 11130 + }, + { + "epoch": 0.08234528843026521, + "grad_norm": 0.11067720502614975, + "learning_rate": 4.605294396961064e-05, + "loss": 0.0219, + "step": 11140 + }, + { + "epoch": 0.08241920700156707, + "grad_norm": 0.10309172421693802, + "learning_rate": 4.604923433048433e-05, + "loss": 0.0221, + "step": 11150 + }, + { + "epoch": 0.08249312557286893, + "grad_norm": 0.10994286090135574, + "learning_rate": 4.6045524691358026e-05, + "loss": 0.0198, + 
"step": 11160 + }, + { + "epoch": 0.08256704414417078, + "grad_norm": 0.11497566848993301, + "learning_rate": 4.604181505223172e-05, + "loss": 0.0214, + "step": 11170 + }, + { + "epoch": 0.08264096271547264, + "grad_norm": 0.10038833320140839, + "learning_rate": 4.603810541310541e-05, + "loss": 0.0188, + "step": 11180 + }, + { + "epoch": 0.08271488128677448, + "grad_norm": 0.08967998623847961, + "learning_rate": 4.6034395773979114e-05, + "loss": 0.021, + "step": 11190 + }, + { + "epoch": 0.08278879985807634, + "grad_norm": 0.06262663006782532, + "learning_rate": 4.60306861348528e-05, + "loss": 0.0182, + "step": 11200 + }, + { + "epoch": 0.0828627184293782, + "grad_norm": 0.15144652128219604, + "learning_rate": 4.60269764957265e-05, + "loss": 0.024, + "step": 11210 + }, + { + "epoch": 0.08293663700068005, + "grad_norm": 0.07688304036855698, + "learning_rate": 4.6023266856600195e-05, + "loss": 0.0206, + "step": 11220 + }, + { + "epoch": 0.0830105555719819, + "grad_norm": 0.07474672049283981, + "learning_rate": 4.6019557217473884e-05, + "loss": 0.0197, + "step": 11230 + }, + { + "epoch": 0.08308447414328377, + "grad_norm": 0.07008994370698929, + "learning_rate": 4.601584757834758e-05, + "loss": 0.0191, + "step": 11240 + }, + { + "epoch": 0.08315839271458561, + "grad_norm": 0.10242119431495667, + "learning_rate": 4.601213793922127e-05, + "loss": 0.0239, + "step": 11250 + }, + { + "epoch": 0.08323231128588747, + "grad_norm": 0.10365156084299088, + "learning_rate": 4.600842830009497e-05, + "loss": 0.0237, + "step": 11260 + }, + { + "epoch": 0.08330622985718932, + "grad_norm": 0.09733454138040543, + "learning_rate": 4.600471866096867e-05, + "loss": 0.0216, + "step": 11270 + }, + { + "epoch": 0.08338014842849117, + "grad_norm": 0.11051540821790695, + "learning_rate": 4.600100902184236e-05, + "loss": 0.0217, + "step": 11280 + }, + { + "epoch": 0.08345406699979303, + "grad_norm": 0.07793629914522171, + "learning_rate": 4.5997299382716053e-05, + "loss": 0.0206, + "step": 11290 + }, + { + "epoch": 0.08352798557109488, + "grad_norm": 0.08378177881240845, + "learning_rate": 4.599358974358974e-05, + "loss": 0.0229, + "step": 11300 + }, + { + "epoch": 0.08360190414239674, + "grad_norm": 0.12937644124031067, + "learning_rate": 4.598988010446344e-05, + "loss": 0.0212, + "step": 11310 + }, + { + "epoch": 0.08367582271369858, + "grad_norm": 0.07999670505523682, + "learning_rate": 4.5986170465337135e-05, + "loss": 0.0211, + "step": 11320 + }, + { + "epoch": 0.08374974128500044, + "grad_norm": 0.09235605597496033, + "learning_rate": 4.5982460826210824e-05, + "loss": 0.0186, + "step": 11330 + }, + { + "epoch": 0.0838236598563023, + "grad_norm": 0.09930342435836792, + "learning_rate": 4.5978751187084526e-05, + "loss": 0.0206, + "step": 11340 + }, + { + "epoch": 0.08389757842760415, + "grad_norm": 0.10215435177087784, + "learning_rate": 4.5975041547958216e-05, + "loss": 0.023, + "step": 11350 + }, + { + "epoch": 0.083971496998906, + "grad_norm": 0.09137150645256042, + "learning_rate": 4.597133190883191e-05, + "loss": 0.0223, + "step": 11360 + }, + { + "epoch": 0.08404541557020787, + "grad_norm": 0.10764443874359131, + "learning_rate": 4.596762226970561e-05, + "loss": 0.0199, + "step": 11370 + }, + { + "epoch": 0.08411933414150971, + "grad_norm": 0.09711901098489761, + "learning_rate": 4.59639126305793e-05, + "loss": 0.0218, + "step": 11380 + }, + { + "epoch": 0.08419325271281157, + "grad_norm": 0.10127461701631546, + "learning_rate": 4.596020299145299e-05, + "loss": 0.0196, + "step": 11390 + }, + { + "epoch": 
0.08426717128411342, + "grad_norm": 0.08950147032737732, + "learning_rate": 4.595649335232669e-05, + "loss": 0.0189, + "step": 11400 + }, + { + "epoch": 0.08434108985541527, + "grad_norm": 0.11337869614362717, + "learning_rate": 4.5952783713200385e-05, + "loss": 0.0214, + "step": 11410 + }, + { + "epoch": 0.08441500842671713, + "grad_norm": 0.10098463296890259, + "learning_rate": 4.594907407407408e-05, + "loss": 0.0179, + "step": 11420 + }, + { + "epoch": 0.08448892699801898, + "grad_norm": 0.12542690336704254, + "learning_rate": 4.594536443494777e-05, + "loss": 0.0216, + "step": 11430 + }, + { + "epoch": 0.08456284556932084, + "grad_norm": 0.08482198417186737, + "learning_rate": 4.5941654795821466e-05, + "loss": 0.0199, + "step": 11440 + }, + { + "epoch": 0.08463676414062268, + "grad_norm": 0.09225975722074509, + "learning_rate": 4.593794515669516e-05, + "loss": 0.0212, + "step": 11450 + }, + { + "epoch": 0.08471068271192454, + "grad_norm": 0.09084470570087433, + "learning_rate": 4.593423551756885e-05, + "loss": 0.0185, + "step": 11460 + }, + { + "epoch": 0.0847846012832264, + "grad_norm": 0.087517149746418, + "learning_rate": 4.593052587844255e-05, + "loss": 0.0192, + "step": 11470 + }, + { + "epoch": 0.08485851985452825, + "grad_norm": 0.10069217532873154, + "learning_rate": 4.5926816239316236e-05, + "loss": 0.0216, + "step": 11480 + }, + { + "epoch": 0.0849324384258301, + "grad_norm": 0.1067439615726471, + "learning_rate": 4.592310660018994e-05, + "loss": 0.0213, + "step": 11490 + }, + { + "epoch": 0.08500635699713197, + "grad_norm": 0.11482270807027817, + "learning_rate": 4.5919396961063635e-05, + "loss": 0.0234, + "step": 11500 + }, + { + "epoch": 0.08508027556843381, + "grad_norm": 0.13125424087047577, + "learning_rate": 4.5915687321937324e-05, + "loss": 0.0221, + "step": 11510 + }, + { + "epoch": 0.08515419413973567, + "grad_norm": 0.139346182346344, + "learning_rate": 4.591197768281102e-05, + "loss": 0.0228, + "step": 11520 + }, + { + "epoch": 0.08522811271103752, + "grad_norm": 0.09836461395025253, + "learning_rate": 4.590826804368471e-05, + "loss": 0.0222, + "step": 11530 + }, + { + "epoch": 0.08530203128233937, + "grad_norm": 0.09203539043664932, + "learning_rate": 4.5904558404558405e-05, + "loss": 0.0214, + "step": 11540 + }, + { + "epoch": 0.08537594985364123, + "grad_norm": 0.07745225727558136, + "learning_rate": 4.59008487654321e-05, + "loss": 0.0188, + "step": 11550 + }, + { + "epoch": 0.08544986842494308, + "grad_norm": 0.08701054751873016, + "learning_rate": 4.58971391263058e-05, + "loss": 0.0223, + "step": 11560 + }, + { + "epoch": 0.08552378699624494, + "grad_norm": 0.1272624433040619, + "learning_rate": 4.589342948717949e-05, + "loss": 0.0235, + "step": 11570 + }, + { + "epoch": 0.0855977055675468, + "grad_norm": 0.11356864869594574, + "learning_rate": 4.588971984805318e-05, + "loss": 0.0203, + "step": 11580 + }, + { + "epoch": 0.08567162413884864, + "grad_norm": 0.12194553017616272, + "learning_rate": 4.588601020892688e-05, + "loss": 0.0225, + "step": 11590 + }, + { + "epoch": 0.0857455427101505, + "grad_norm": 0.10308793932199478, + "learning_rate": 4.5882300569800574e-05, + "loss": 0.0212, + "step": 11600 + }, + { + "epoch": 0.08581946128145235, + "grad_norm": 0.11726551502943039, + "learning_rate": 4.587859093067426e-05, + "loss": 0.0211, + "step": 11610 + }, + { + "epoch": 0.0858933798527542, + "grad_norm": 0.0865316092967987, + "learning_rate": 4.587488129154796e-05, + "loss": 0.0211, + "step": 11620 + }, + { + "epoch": 0.08596729842405607, + "grad_norm": 
0.08077540993690491, + "learning_rate": 4.5871171652421655e-05, + "loss": 0.0207, + "step": 11630 + }, + { + "epoch": 0.08604121699535791, + "grad_norm": 0.08920275419950485, + "learning_rate": 4.586746201329535e-05, + "loss": 0.024, + "step": 11640 + }, + { + "epoch": 0.08611513556665977, + "grad_norm": 0.1083950474858284, + "learning_rate": 4.586375237416905e-05, + "loss": 0.0232, + "step": 11650 + }, + { + "epoch": 0.08618905413796162, + "grad_norm": 0.070146843791008, + "learning_rate": 4.5860042735042736e-05, + "loss": 0.0212, + "step": 11660 + }, + { + "epoch": 0.08626297270926347, + "grad_norm": 0.07127376645803452, + "learning_rate": 4.585633309591643e-05, + "loss": 0.0201, + "step": 11670 + }, + { + "epoch": 0.08633689128056533, + "grad_norm": 0.11417879164218903, + "learning_rate": 4.585262345679013e-05, + "loss": 0.0221, + "step": 11680 + }, + { + "epoch": 0.08641080985186718, + "grad_norm": 0.11518987268209457, + "learning_rate": 4.584891381766382e-05, + "loss": 0.0202, + "step": 11690 + }, + { + "epoch": 0.08648472842316904, + "grad_norm": 0.0677439421415329, + "learning_rate": 4.5845204178537514e-05, + "loss": 0.0202, + "step": 11700 + }, + { + "epoch": 0.0865586469944709, + "grad_norm": 0.09899143874645233, + "learning_rate": 4.58414945394112e-05, + "loss": 0.0197, + "step": 11710 + }, + { + "epoch": 0.08663256556577274, + "grad_norm": 0.1325441151857376, + "learning_rate": 4.5837784900284905e-05, + "loss": 0.0222, + "step": 11720 + }, + { + "epoch": 0.0867064841370746, + "grad_norm": 0.1297304332256317, + "learning_rate": 4.58340752611586e-05, + "loss": 0.0238, + "step": 11730 + }, + { + "epoch": 0.08678040270837645, + "grad_norm": 0.08309206366539001, + "learning_rate": 4.583036562203229e-05, + "loss": 0.0206, + "step": 11740 + }, + { + "epoch": 0.0868543212796783, + "grad_norm": 0.08319962024688721, + "learning_rate": 4.5826655982905987e-05, + "loss": 0.0201, + "step": 11750 + }, + { + "epoch": 0.08692823985098017, + "grad_norm": 0.08149457722902298, + "learning_rate": 4.5822946343779676e-05, + "loss": 0.0208, + "step": 11760 + }, + { + "epoch": 0.08700215842228201, + "grad_norm": 0.12498198449611664, + "learning_rate": 4.581923670465337e-05, + "loss": 0.0205, + "step": 11770 + }, + { + "epoch": 0.08707607699358387, + "grad_norm": 0.06954797357320786, + "learning_rate": 4.581552706552707e-05, + "loss": 0.0204, + "step": 11780 + }, + { + "epoch": 0.08714999556488572, + "grad_norm": 0.08185707032680511, + "learning_rate": 4.5811817426400764e-05, + "loss": 0.0211, + "step": 11790 + }, + { + "epoch": 0.08722391413618757, + "grad_norm": 0.09459664672613144, + "learning_rate": 4.580810778727446e-05, + "loss": 0.0203, + "step": 11800 + }, + { + "epoch": 0.08729783270748943, + "grad_norm": 0.08659505099058151, + "learning_rate": 4.580439814814815e-05, + "loss": 0.0218, + "step": 11810 + }, + { + "epoch": 0.08737175127879128, + "grad_norm": 0.07980205118656158, + "learning_rate": 4.5800688509021845e-05, + "loss": 0.0245, + "step": 11820 + }, + { + "epoch": 0.08744566985009314, + "grad_norm": 0.09071630239486694, + "learning_rate": 4.579697886989554e-05, + "loss": 0.0209, + "step": 11830 + }, + { + "epoch": 0.087519588421395, + "grad_norm": 0.09811149537563324, + "learning_rate": 4.579326923076923e-05, + "loss": 0.0205, + "step": 11840 + }, + { + "epoch": 0.08759350699269684, + "grad_norm": 0.09655947238206863, + "learning_rate": 4.5789559591642926e-05, + "loss": 0.0194, + "step": 11850 + }, + { + "epoch": 0.0876674255639987, + "grad_norm": 0.10314806550741196, + "learning_rate": 
4.578584995251662e-05, + "loss": 0.0188, + "step": 11860 + }, + { + "epoch": 0.08774134413530055, + "grad_norm": 0.07170028239488602, + "learning_rate": 4.578214031339032e-05, + "loss": 0.0236, + "step": 11870 + }, + { + "epoch": 0.0878152627066024, + "grad_norm": 0.07748035341501236, + "learning_rate": 4.5778430674264014e-05, + "loss": 0.0221, + "step": 11880 + }, + { + "epoch": 0.08788918127790427, + "grad_norm": 0.12346388399600983, + "learning_rate": 4.57747210351377e-05, + "loss": 0.0234, + "step": 11890 + }, + { + "epoch": 0.08796309984920611, + "grad_norm": 0.09907881170511246, + "learning_rate": 4.57710113960114e-05, + "loss": 0.0209, + "step": 11900 + }, + { + "epoch": 0.08803701842050797, + "grad_norm": 0.12885276973247528, + "learning_rate": 4.5767301756885095e-05, + "loss": 0.0229, + "step": 11910 + }, + { + "epoch": 0.08811093699180982, + "grad_norm": 0.12708497047424316, + "learning_rate": 4.5763592117758784e-05, + "loss": 0.0189, + "step": 11920 + }, + { + "epoch": 0.08818485556311167, + "grad_norm": 0.10651792585849762, + "learning_rate": 4.575988247863248e-05, + "loss": 0.0206, + "step": 11930 + }, + { + "epoch": 0.08825877413441353, + "grad_norm": 0.0823550820350647, + "learning_rate": 4.5756172839506176e-05, + "loss": 0.0188, + "step": 11940 + }, + { + "epoch": 0.08833269270571538, + "grad_norm": 0.10734937340021133, + "learning_rate": 4.575246320037987e-05, + "loss": 0.0221, + "step": 11950 + }, + { + "epoch": 0.08840661127701724, + "grad_norm": 0.10963788628578186, + "learning_rate": 4.574875356125357e-05, + "loss": 0.0204, + "step": 11960 + }, + { + "epoch": 0.0884805298483191, + "grad_norm": 0.0906628742814064, + "learning_rate": 4.574504392212726e-05, + "loss": 0.0182, + "step": 11970 + }, + { + "epoch": 0.08855444841962094, + "grad_norm": 0.08999045193195343, + "learning_rate": 4.574133428300095e-05, + "loss": 0.0203, + "step": 11980 + }, + { + "epoch": 0.0886283669909228, + "grad_norm": 0.09220533818006516, + "learning_rate": 4.573762464387464e-05, + "loss": 0.0211, + "step": 11990 + }, + { + "epoch": 0.08870228556222465, + "grad_norm": 0.09482621401548386, + "learning_rate": 4.573391500474834e-05, + "loss": 0.0248, + "step": 12000 + }, + { + "epoch": 0.0887762041335265, + "grad_norm": 0.07867447286844254, + "learning_rate": 4.5730205365622034e-05, + "loss": 0.0213, + "step": 12010 + }, + { + "epoch": 0.08885012270482837, + "grad_norm": 0.09121580421924591, + "learning_rate": 4.572649572649573e-05, + "loss": 0.023, + "step": 12020 + }, + { + "epoch": 0.08892404127613021, + "grad_norm": 0.07908417284488678, + "learning_rate": 4.5722786087369426e-05, + "loss": 0.0208, + "step": 12030 + }, + { + "epoch": 0.08899795984743207, + "grad_norm": 0.12759286165237427, + "learning_rate": 4.5719076448243115e-05, + "loss": 0.0237, + "step": 12040 + }, + { + "epoch": 0.08907187841873393, + "grad_norm": 0.08981367945671082, + "learning_rate": 4.571536680911681e-05, + "loss": 0.0188, + "step": 12050 + }, + { + "epoch": 0.08914579699003577, + "grad_norm": 0.16583828628063202, + "learning_rate": 4.571165716999051e-05, + "loss": 0.0234, + "step": 12060 + }, + { + "epoch": 0.08921971556133763, + "grad_norm": 0.09226063638925552, + "learning_rate": 4.5707947530864197e-05, + "loss": 0.0205, + "step": 12070 + }, + { + "epoch": 0.08929363413263948, + "grad_norm": 0.11874354630708694, + "learning_rate": 4.570423789173789e-05, + "loss": 0.0211, + "step": 12080 + }, + { + "epoch": 0.08936755270394134, + "grad_norm": 0.1163206398487091, + "learning_rate": 4.570052825261159e-05, + "loss": 
0.0202, + "step": 12090 + }, + { + "epoch": 0.0894414712752432, + "grad_norm": 0.0943320095539093, + "learning_rate": 4.5696818613485284e-05, + "loss": 0.0198, + "step": 12100 + }, + { + "epoch": 0.08951538984654504, + "grad_norm": 0.09926102310419083, + "learning_rate": 4.569310897435898e-05, + "loss": 0.0192, + "step": 12110 + }, + { + "epoch": 0.0895893084178469, + "grad_norm": 0.08138690888881683, + "learning_rate": 4.568939933523267e-05, + "loss": 0.0208, + "step": 12120 + }, + { + "epoch": 0.08966322698914875, + "grad_norm": 0.10757561773061752, + "learning_rate": 4.5685689696106366e-05, + "loss": 0.0224, + "step": 12130 + }, + { + "epoch": 0.0897371455604506, + "grad_norm": 0.09196264296770096, + "learning_rate": 4.568198005698006e-05, + "loss": 0.0196, + "step": 12140 + }, + { + "epoch": 0.08981106413175247, + "grad_norm": 0.10394033789634705, + "learning_rate": 4.567827041785375e-05, + "loss": 0.0191, + "step": 12150 + }, + { + "epoch": 0.08988498270305431, + "grad_norm": 0.1103530302643776, + "learning_rate": 4.567456077872745e-05, + "loss": 0.0243, + "step": 12160 + }, + { + "epoch": 0.08995890127435617, + "grad_norm": 0.08777318894863129, + "learning_rate": 4.567085113960114e-05, + "loss": 0.0216, + "step": 12170 + }, + { + "epoch": 0.09003281984565803, + "grad_norm": 0.09444407373666763, + "learning_rate": 4.566714150047484e-05, + "loss": 0.0243, + "step": 12180 + }, + { + "epoch": 0.09010673841695988, + "grad_norm": 0.13325783610343933, + "learning_rate": 4.5663431861348535e-05, + "loss": 0.0223, + "step": 12190 + }, + { + "epoch": 0.09018065698826173, + "grad_norm": 0.11760847270488739, + "learning_rate": 4.5659722222222224e-05, + "loss": 0.0232, + "step": 12200 + }, + { + "epoch": 0.09025457555956358, + "grad_norm": 0.09399545192718506, + "learning_rate": 4.565601258309592e-05, + "loss": 0.0221, + "step": 12210 + }, + { + "epoch": 0.09032849413086544, + "grad_norm": 0.09595145285129547, + "learning_rate": 4.565230294396961e-05, + "loss": 0.0184, + "step": 12220 + }, + { + "epoch": 0.0904024127021673, + "grad_norm": 0.09397619217634201, + "learning_rate": 4.5648593304843305e-05, + "loss": 0.0213, + "step": 12230 + }, + { + "epoch": 0.09047633127346914, + "grad_norm": 0.13015051186084747, + "learning_rate": 4.5644883665717e-05, + "loss": 0.0213, + "step": 12240 + }, + { + "epoch": 0.090550249844771, + "grad_norm": 0.09179303795099258, + "learning_rate": 4.56411740265907e-05, + "loss": 0.0224, + "step": 12250 + }, + { + "epoch": 0.09062416841607285, + "grad_norm": 0.12288478761911392, + "learning_rate": 4.563746438746439e-05, + "loss": 0.0231, + "step": 12260 + }, + { + "epoch": 0.09069808698737471, + "grad_norm": 0.08742814511060715, + "learning_rate": 4.563375474833808e-05, + "loss": 0.0206, + "step": 12270 + }, + { + "epoch": 0.09077200555867657, + "grad_norm": 0.17362412810325623, + "learning_rate": 4.563004510921178e-05, + "loss": 0.0202, + "step": 12280 + }, + { + "epoch": 0.09084592412997841, + "grad_norm": 0.06865504384040833, + "learning_rate": 4.5626335470085474e-05, + "loss": 0.0202, + "step": 12290 + }, + { + "epoch": 0.09091984270128027, + "grad_norm": 0.0832584798336029, + "learning_rate": 4.562262583095916e-05, + "loss": 0.0235, + "step": 12300 + }, + { + "epoch": 0.09099376127258213, + "grad_norm": 0.10800887644290924, + "learning_rate": 4.561891619183286e-05, + "loss": 0.0199, + "step": 12310 + }, + { + "epoch": 0.09106767984388398, + "grad_norm": 0.07915148884057999, + "learning_rate": 4.5615206552706555e-05, + "loss": 0.0203, + "step": 12320 + }, + { + 
"epoch": 0.09114159841518583, + "grad_norm": 0.09770840406417847, + "learning_rate": 4.561149691358025e-05, + "loss": 0.0217, + "step": 12330 + }, + { + "epoch": 0.09121551698648768, + "grad_norm": 0.13962043821811676, + "learning_rate": 4.560778727445395e-05, + "loss": 0.0235, + "step": 12340 + }, + { + "epoch": 0.09128943555778954, + "grad_norm": 0.08580047637224197, + "learning_rate": 4.5604077635327636e-05, + "loss": 0.0204, + "step": 12350 + }, + { + "epoch": 0.0913633541290914, + "grad_norm": 0.07657952606678009, + "learning_rate": 4.560036799620133e-05, + "loss": 0.0214, + "step": 12360 + }, + { + "epoch": 0.09143727270039324, + "grad_norm": 0.09985584765672684, + "learning_rate": 4.559665835707503e-05, + "loss": 0.0203, + "step": 12370 + }, + { + "epoch": 0.0915111912716951, + "grad_norm": 0.14025211334228516, + "learning_rate": 4.559294871794872e-05, + "loss": 0.0224, + "step": 12380 + }, + { + "epoch": 0.09158510984299695, + "grad_norm": 0.10704502463340759, + "learning_rate": 4.558923907882241e-05, + "loss": 0.0181, + "step": 12390 + }, + { + "epoch": 0.09165902841429881, + "grad_norm": 0.12064116448163986, + "learning_rate": 4.558552943969611e-05, + "loss": 0.0217, + "step": 12400 + }, + { + "epoch": 0.09173294698560067, + "grad_norm": 0.09326235204935074, + "learning_rate": 4.5581819800569805e-05, + "loss": 0.0192, + "step": 12410 + }, + { + "epoch": 0.09180686555690251, + "grad_norm": 0.1293174922466278, + "learning_rate": 4.55781101614435e-05, + "loss": 0.0208, + "step": 12420 + }, + { + "epoch": 0.09188078412820437, + "grad_norm": 0.09565316885709763, + "learning_rate": 4.557440052231719e-05, + "loss": 0.0204, + "step": 12430 + }, + { + "epoch": 0.09195470269950623, + "grad_norm": 0.09142905473709106, + "learning_rate": 4.5570690883190886e-05, + "loss": 0.0224, + "step": 12440 + }, + { + "epoch": 0.09202862127080808, + "grad_norm": 0.08464720845222473, + "learning_rate": 4.5566981244064576e-05, + "loss": 0.0208, + "step": 12450 + }, + { + "epoch": 0.09210253984210993, + "grad_norm": 0.10270611196756363, + "learning_rate": 4.556327160493827e-05, + "loss": 0.0245, + "step": 12460 + }, + { + "epoch": 0.09217645841341178, + "grad_norm": 0.0861668661236763, + "learning_rate": 4.555956196581197e-05, + "loss": 0.0204, + "step": 12470 + }, + { + "epoch": 0.09225037698471364, + "grad_norm": 0.1521192193031311, + "learning_rate": 4.5555852326685663e-05, + "loss": 0.0211, + "step": 12480 + }, + { + "epoch": 0.0923242955560155, + "grad_norm": 0.08717560023069382, + "learning_rate": 4.555214268755936e-05, + "loss": 0.0187, + "step": 12490 + }, + { + "epoch": 0.09239821412731734, + "grad_norm": 0.0908551886677742, + "learning_rate": 4.554843304843305e-05, + "loss": 0.0195, + "step": 12500 + }, + { + "epoch": 0.0924721326986192, + "grad_norm": 0.09777882695198059, + "learning_rate": 4.5544723409306745e-05, + "loss": 0.0211, + "step": 12510 + }, + { + "epoch": 0.09254605126992106, + "grad_norm": 0.06726015359163284, + "learning_rate": 4.554101377018044e-05, + "loss": 0.0187, + "step": 12520 + }, + { + "epoch": 0.09261996984122291, + "grad_norm": 0.08230742812156677, + "learning_rate": 4.553730413105413e-05, + "loss": 0.0213, + "step": 12530 + }, + { + "epoch": 0.09269388841252477, + "grad_norm": 0.0912739634513855, + "learning_rate": 4.5533594491927826e-05, + "loss": 0.0224, + "step": 12540 + }, + { + "epoch": 0.09276780698382661, + "grad_norm": 0.10400039702653885, + "learning_rate": 4.552988485280152e-05, + "loss": 0.0198, + "step": 12550 + }, + { + "epoch": 0.09284172555512847, + 
"grad_norm": 0.07109177112579346, + "learning_rate": 4.552617521367522e-05, + "loss": 0.0198, + "step": 12560 + }, + { + "epoch": 0.09291564412643033, + "grad_norm": 0.11673447489738464, + "learning_rate": 4.5522465574548914e-05, + "loss": 0.0216, + "step": 12570 + }, + { + "epoch": 0.09298956269773218, + "grad_norm": 0.10010415315628052, + "learning_rate": 4.55187559354226e-05, + "loss": 0.0192, + "step": 12580 + }, + { + "epoch": 0.09306348126903403, + "grad_norm": 0.12168619781732559, + "learning_rate": 4.55150462962963e-05, + "loss": 0.0249, + "step": 12590 + }, + { + "epoch": 0.09313739984033588, + "grad_norm": 0.10904297232627869, + "learning_rate": 4.5511336657169995e-05, + "loss": 0.0221, + "step": 12600 + }, + { + "epoch": 0.09321131841163774, + "grad_norm": 0.07244222611188889, + "learning_rate": 4.5507627018043684e-05, + "loss": 0.0208, + "step": 12610 + }, + { + "epoch": 0.0932852369829396, + "grad_norm": 0.12043386697769165, + "learning_rate": 4.550391737891738e-05, + "loss": 0.0208, + "step": 12620 + }, + { + "epoch": 0.09335915555424144, + "grad_norm": 0.08687592297792435, + "learning_rate": 4.5500207739791076e-05, + "loss": 0.0215, + "step": 12630 + }, + { + "epoch": 0.0934330741255433, + "grad_norm": 0.11865407228469849, + "learning_rate": 4.549649810066477e-05, + "loss": 0.0208, + "step": 12640 + }, + { + "epoch": 0.09350699269684516, + "grad_norm": 0.10706440359354019, + "learning_rate": 4.549278846153847e-05, + "loss": 0.0201, + "step": 12650 + }, + { + "epoch": 0.09358091126814701, + "grad_norm": 0.08515679836273193, + "learning_rate": 4.548907882241216e-05, + "loss": 0.02, + "step": 12660 + }, + { + "epoch": 0.09365482983944887, + "grad_norm": 0.1031784862279892, + "learning_rate": 4.548536918328585e-05, + "loss": 0.0208, + "step": 12670 + }, + { + "epoch": 0.09372874841075071, + "grad_norm": 0.09280925989151001, + "learning_rate": 4.548165954415954e-05, + "loss": 0.0187, + "step": 12680 + }, + { + "epoch": 0.09380266698205257, + "grad_norm": 0.09347113966941833, + "learning_rate": 4.547794990503324e-05, + "loss": 0.0197, + "step": 12690 + }, + { + "epoch": 0.09387658555335443, + "grad_norm": 0.0832105502486229, + "learning_rate": 4.5474240265906934e-05, + "loss": 0.021, + "step": 12700 + }, + { + "epoch": 0.09395050412465628, + "grad_norm": 0.11361734569072723, + "learning_rate": 4.547053062678063e-05, + "loss": 0.0206, + "step": 12710 + }, + { + "epoch": 0.09402442269595813, + "grad_norm": 0.06806913763284683, + "learning_rate": 4.5466820987654326e-05, + "loss": 0.0199, + "step": 12720 + }, + { + "epoch": 0.09409834126725998, + "grad_norm": 0.07145105302333832, + "learning_rate": 4.5463111348528015e-05, + "loss": 0.0216, + "step": 12730 + }, + { + "epoch": 0.09417225983856184, + "grad_norm": 0.10840686410665512, + "learning_rate": 4.545940170940171e-05, + "loss": 0.0216, + "step": 12740 + }, + { + "epoch": 0.0942461784098637, + "grad_norm": 0.06309173256158829, + "learning_rate": 4.545569207027541e-05, + "loss": 0.0222, + "step": 12750 + }, + { + "epoch": 0.09432009698116554, + "grad_norm": 0.08596502989530563, + "learning_rate": 4.5451982431149096e-05, + "loss": 0.0194, + "step": 12760 + }, + { + "epoch": 0.0943940155524674, + "grad_norm": 0.10800420492887497, + "learning_rate": 4.544827279202279e-05, + "loss": 0.0205, + "step": 12770 + }, + { + "epoch": 0.09446793412376926, + "grad_norm": 0.11876120418310165, + "learning_rate": 4.544456315289649e-05, + "loss": 0.0199, + "step": 12780 + }, + { + "epoch": 0.09454185269507111, + "grad_norm": 0.10091854631900787, + 
"learning_rate": 4.5440853513770184e-05, + "loss": 0.0213, + "step": 12790 + }, + { + "epoch": 0.09461577126637297, + "grad_norm": 0.09491269290447235, + "learning_rate": 4.543714387464388e-05, + "loss": 0.0181, + "step": 12800 + }, + { + "epoch": 0.09468968983767481, + "grad_norm": 0.09774453192949295, + "learning_rate": 4.543343423551757e-05, + "loss": 0.0212, + "step": 12810 + }, + { + "epoch": 0.09476360840897667, + "grad_norm": 0.0799957811832428, + "learning_rate": 4.5429724596391265e-05, + "loss": 0.021, + "step": 12820 + }, + { + "epoch": 0.09483752698027853, + "grad_norm": 0.10638774931430817, + "learning_rate": 4.542601495726496e-05, + "loss": 0.0224, + "step": 12830 + }, + { + "epoch": 0.09491144555158038, + "grad_norm": 0.06579312682151794, + "learning_rate": 4.542230531813865e-05, + "loss": 0.0212, + "step": 12840 + }, + { + "epoch": 0.09498536412288223, + "grad_norm": 0.0943514034152031, + "learning_rate": 4.5418595679012346e-05, + "loss": 0.0222, + "step": 12850 + }, + { + "epoch": 0.09505928269418408, + "grad_norm": 0.11735722422599792, + "learning_rate": 4.541488603988604e-05, + "loss": 0.0194, + "step": 12860 + }, + { + "epoch": 0.09513320126548594, + "grad_norm": 0.09771939367055893, + "learning_rate": 4.541117640075974e-05, + "loss": 0.0213, + "step": 12870 + }, + { + "epoch": 0.0952071198367878, + "grad_norm": 0.09291230142116547, + "learning_rate": 4.5407466761633434e-05, + "loss": 0.0211, + "step": 12880 + }, + { + "epoch": 0.09528103840808964, + "grad_norm": 0.123878613114357, + "learning_rate": 4.5403757122507124e-05, + "loss": 0.0222, + "step": 12890 + }, + { + "epoch": 0.0953549569793915, + "grad_norm": 0.11175885796546936, + "learning_rate": 4.540004748338082e-05, + "loss": 0.0214, + "step": 12900 + }, + { + "epoch": 0.09542887555069336, + "grad_norm": 0.09758037328720093, + "learning_rate": 4.539633784425451e-05, + "loss": 0.0203, + "step": 12910 + }, + { + "epoch": 0.09550279412199521, + "grad_norm": 0.1057739108800888, + "learning_rate": 4.5392628205128205e-05, + "loss": 0.0226, + "step": 12920 + }, + { + "epoch": 0.09557671269329707, + "grad_norm": 0.08232392370700836, + "learning_rate": 4.53889185660019e-05, + "loss": 0.0229, + "step": 12930 + }, + { + "epoch": 0.09565063126459891, + "grad_norm": 0.11738672107458115, + "learning_rate": 4.5385208926875597e-05, + "loss": 0.0218, + "step": 12940 + }, + { + "epoch": 0.09572454983590077, + "grad_norm": 0.10136391222476959, + "learning_rate": 4.538149928774929e-05, + "loss": 0.0223, + "step": 12950 + }, + { + "epoch": 0.09579846840720263, + "grad_norm": 0.07713291794061661, + "learning_rate": 4.537778964862298e-05, + "loss": 0.022, + "step": 12960 + }, + { + "epoch": 0.09587238697850448, + "grad_norm": 0.1363089680671692, + "learning_rate": 4.537408000949668e-05, + "loss": 0.0238, + "step": 12970 + }, + { + "epoch": 0.09594630554980633, + "grad_norm": 0.08520155400037766, + "learning_rate": 4.5370370370370374e-05, + "loss": 0.0217, + "step": 12980 + }, + { + "epoch": 0.0960202241211082, + "grad_norm": 0.09474644809961319, + "learning_rate": 4.536666073124406e-05, + "loss": 0.0175, + "step": 12990 + }, + { + "epoch": 0.09609414269241004, + "grad_norm": 0.10518652945756912, + "learning_rate": 4.536295109211776e-05, + "loss": 0.0225, + "step": 13000 + }, + { + "epoch": 0.0961680612637119, + "grad_norm": 0.1090826466679573, + "learning_rate": 4.5359241452991455e-05, + "loss": 0.0208, + "step": 13010 + }, + { + "epoch": 0.09624197983501374, + "grad_norm": 0.09654388576745987, + "learning_rate": 4.535553181386515e-05, 
+ "loss": 0.0193, + "step": 13020 + }, + { + "epoch": 0.0963158984063156, + "grad_norm": 0.06633268296718597, + "learning_rate": 4.535182217473885e-05, + "loss": 0.0177, + "step": 13030 + }, + { + "epoch": 0.09638981697761746, + "grad_norm": 0.1015574038028717, + "learning_rate": 4.5348112535612536e-05, + "loss": 0.0204, + "step": 13040 + }, + { + "epoch": 0.09646373554891931, + "grad_norm": 0.11916191875934601, + "learning_rate": 4.534440289648623e-05, + "loss": 0.0221, + "step": 13050 + }, + { + "epoch": 0.09653765412022117, + "grad_norm": 0.09542952477931976, + "learning_rate": 4.534069325735993e-05, + "loss": 0.0208, + "step": 13060 + }, + { + "epoch": 0.09661157269152301, + "grad_norm": 0.12832722067832947, + "learning_rate": 4.533698361823362e-05, + "loss": 0.0213, + "step": 13070 + }, + { + "epoch": 0.09668549126282487, + "grad_norm": 0.10564541071653366, + "learning_rate": 4.533327397910731e-05, + "loss": 0.0217, + "step": 13080 + }, + { + "epoch": 0.09675940983412673, + "grad_norm": 0.08311822265386581, + "learning_rate": 4.532956433998101e-05, + "loss": 0.0206, + "step": 13090 + }, + { + "epoch": 0.09683332840542858, + "grad_norm": 0.10254728049039841, + "learning_rate": 4.5325854700854705e-05, + "loss": 0.0237, + "step": 13100 + }, + { + "epoch": 0.09690724697673044, + "grad_norm": 0.07333148270845413, + "learning_rate": 4.53221450617284e-05, + "loss": 0.0186, + "step": 13110 + }, + { + "epoch": 0.0969811655480323, + "grad_norm": 0.1466401070356369, + "learning_rate": 4.531843542260209e-05, + "loss": 0.0189, + "step": 13120 + }, + { + "epoch": 0.09705508411933414, + "grad_norm": 0.09987608343362808, + "learning_rate": 4.5314725783475786e-05, + "loss": 0.0216, + "step": 13130 + }, + { + "epoch": 0.097129002690636, + "grad_norm": 0.09517742693424225, + "learning_rate": 4.5311016144349475e-05, + "loss": 0.0212, + "step": 13140 + }, + { + "epoch": 0.09720292126193784, + "grad_norm": 0.09622327238321304, + "learning_rate": 4.530730650522317e-05, + "loss": 0.0237, + "step": 13150 + }, + { + "epoch": 0.0972768398332397, + "grad_norm": 0.125885471701622, + "learning_rate": 4.530359686609687e-05, + "loss": 0.0219, + "step": 13160 + }, + { + "epoch": 0.09735075840454156, + "grad_norm": 0.11400435119867325, + "learning_rate": 4.529988722697056e-05, + "loss": 0.0187, + "step": 13170 + }, + { + "epoch": 0.09742467697584341, + "grad_norm": 0.0999847948551178, + "learning_rate": 4.529617758784426e-05, + "loss": 0.0204, + "step": 13180 + }, + { + "epoch": 0.09749859554714527, + "grad_norm": 0.10056626051664352, + "learning_rate": 4.529246794871795e-05, + "loss": 0.0196, + "step": 13190 + }, + { + "epoch": 0.09757251411844711, + "grad_norm": 0.09070125222206116, + "learning_rate": 4.5288758309591644e-05, + "loss": 0.021, + "step": 13200 + }, + { + "epoch": 0.09764643268974897, + "grad_norm": 0.1022719293832779, + "learning_rate": 4.528504867046534e-05, + "loss": 0.0203, + "step": 13210 + }, + { + "epoch": 0.09772035126105083, + "grad_norm": 0.11356259137392044, + "learning_rate": 4.528133903133903e-05, + "loss": 0.0201, + "step": 13220 + }, + { + "epoch": 0.09779426983235268, + "grad_norm": 0.10243353992700577, + "learning_rate": 4.5277629392212725e-05, + "loss": 0.019, + "step": 13230 + }, + { + "epoch": 0.09786818840365454, + "grad_norm": 0.08349420875310898, + "learning_rate": 4.527391975308642e-05, + "loss": 0.0228, + "step": 13240 + }, + { + "epoch": 0.0979421069749564, + "grad_norm": 0.07490584254264832, + "learning_rate": 4.527021011396012e-05, + "loss": 0.0188, + "step": 13250 + }, + { + 
"epoch": 0.09801602554625824, + "grad_norm": 0.11197661608457565, + "learning_rate": 4.526650047483381e-05, + "loss": 0.019, + "step": 13260 + }, + { + "epoch": 0.0980899441175601, + "grad_norm": 0.10814948379993439, + "learning_rate": 4.52627908357075e-05, + "loss": 0.0214, + "step": 13270 + }, + { + "epoch": 0.09816386268886194, + "grad_norm": 0.13824844360351562, + "learning_rate": 4.52590811965812e-05, + "loss": 0.0226, + "step": 13280 + }, + { + "epoch": 0.0982377812601638, + "grad_norm": 0.09959527105093002, + "learning_rate": 4.5255371557454894e-05, + "loss": 0.0223, + "step": 13290 + }, + { + "epoch": 0.09831169983146566, + "grad_norm": 0.0893290713429451, + "learning_rate": 4.5251661918328584e-05, + "loss": 0.0219, + "step": 13300 + }, + { + "epoch": 0.09838561840276751, + "grad_norm": 0.15800920128822327, + "learning_rate": 4.524795227920228e-05, + "loss": 0.0236, + "step": 13310 + }, + { + "epoch": 0.09845953697406937, + "grad_norm": 0.08085887879133224, + "learning_rate": 4.5244242640075976e-05, + "loss": 0.0224, + "step": 13320 + }, + { + "epoch": 0.09853345554537121, + "grad_norm": 0.07695917040109634, + "learning_rate": 4.524053300094967e-05, + "loss": 0.0214, + "step": 13330 + }, + { + "epoch": 0.09860737411667307, + "grad_norm": 0.0860675647854805, + "learning_rate": 4.523682336182337e-05, + "loss": 0.0167, + "step": 13340 + }, + { + "epoch": 0.09868129268797493, + "grad_norm": 0.10916229337453842, + "learning_rate": 4.523311372269706e-05, + "loss": 0.0214, + "step": 13350 + }, + { + "epoch": 0.09875521125927678, + "grad_norm": 0.09651117026805878, + "learning_rate": 4.522940408357075e-05, + "loss": 0.0205, + "step": 13360 + }, + { + "epoch": 0.09882912983057864, + "grad_norm": 0.10305950790643692, + "learning_rate": 4.522569444444444e-05, + "loss": 0.023, + "step": 13370 + }, + { + "epoch": 0.0989030484018805, + "grad_norm": 0.12070255726575851, + "learning_rate": 4.522198480531814e-05, + "loss": 0.0209, + "step": 13380 + }, + { + "epoch": 0.09897696697318234, + "grad_norm": 0.07445371896028519, + "learning_rate": 4.5218275166191834e-05, + "loss": 0.0202, + "step": 13390 + }, + { + "epoch": 0.0990508855444842, + "grad_norm": 0.09945213794708252, + "learning_rate": 4.521456552706553e-05, + "loss": 0.0206, + "step": 13400 + }, + { + "epoch": 0.09912480411578604, + "grad_norm": 0.0881875604391098, + "learning_rate": 4.5210855887939226e-05, + "loss": 0.0204, + "step": 13410 + }, + { + "epoch": 0.0991987226870879, + "grad_norm": 0.07720271497964859, + "learning_rate": 4.5207146248812915e-05, + "loss": 0.0195, + "step": 13420 + }, + { + "epoch": 0.09927264125838976, + "grad_norm": 0.09193691611289978, + "learning_rate": 4.520343660968661e-05, + "loss": 0.0205, + "step": 13430 + }, + { + "epoch": 0.09934655982969161, + "grad_norm": 0.0846005380153656, + "learning_rate": 4.519972697056031e-05, + "loss": 0.022, + "step": 13440 + }, + { + "epoch": 0.09942047840099347, + "grad_norm": 0.10466866195201874, + "learning_rate": 4.5196017331433996e-05, + "loss": 0.0182, + "step": 13450 + }, + { + "epoch": 0.09949439697229531, + "grad_norm": 0.07748100161552429, + "learning_rate": 4.519230769230769e-05, + "loss": 0.0205, + "step": 13460 + }, + { + "epoch": 0.09956831554359717, + "grad_norm": 0.10007157176733017, + "learning_rate": 4.518859805318139e-05, + "loss": 0.0224, + "step": 13470 + }, + { + "epoch": 0.09964223411489903, + "grad_norm": 0.08469849824905396, + "learning_rate": 4.5184888414055084e-05, + "loss": 0.0232, + "step": 13480 + }, + { + "epoch": 0.09971615268620088, + 
"grad_norm": 0.09315604716539383, + "learning_rate": 4.518117877492878e-05, + "loss": 0.0177, + "step": 13490 + }, + { + "epoch": 0.09979007125750274, + "grad_norm": 0.08141258358955383, + "learning_rate": 4.517746913580247e-05, + "loss": 0.0201, + "step": 13500 + }, + { + "epoch": 0.0998639898288046, + "grad_norm": 0.09147261083126068, + "learning_rate": 4.5173759496676165e-05, + "loss": 0.0216, + "step": 13510 + }, + { + "epoch": 0.09993790840010644, + "grad_norm": 0.07396159321069717, + "learning_rate": 4.517004985754986e-05, + "loss": 0.0181, + "step": 13520 + }, + { + "epoch": 0.1000118269714083, + "grad_norm": 0.104294054210186, + "learning_rate": 4.516634021842355e-05, + "loss": 0.0202, + "step": 13530 + }, + { + "epoch": 0.10008574554271014, + "grad_norm": 0.08349345624446869, + "learning_rate": 4.5162630579297246e-05, + "loss": 0.0205, + "step": 13540 + }, + { + "epoch": 0.100159664114012, + "grad_norm": 0.08465448766946793, + "learning_rate": 4.515892094017094e-05, + "loss": 0.0205, + "step": 13550 + }, + { + "epoch": 0.10023358268531386, + "grad_norm": 0.10030587017536163, + "learning_rate": 4.515521130104464e-05, + "loss": 0.0207, + "step": 13560 + }, + { + "epoch": 0.10030750125661571, + "grad_norm": 0.07631184905767441, + "learning_rate": 4.5151501661918334e-05, + "loss": 0.0206, + "step": 13570 + }, + { + "epoch": 0.10038141982791757, + "grad_norm": 0.11903389543294907, + "learning_rate": 4.514779202279202e-05, + "loss": 0.0224, + "step": 13580 + }, + { + "epoch": 0.10045533839921943, + "grad_norm": 0.08355337381362915, + "learning_rate": 4.514408238366572e-05, + "loss": 0.022, + "step": 13590 + }, + { + "epoch": 0.10052925697052127, + "grad_norm": 0.07425417751073837, + "learning_rate": 4.514037274453941e-05, + "loss": 0.0182, + "step": 13600 + }, + { + "epoch": 0.10060317554182313, + "grad_norm": 0.09235212951898575, + "learning_rate": 4.5136663105413104e-05, + "loss": 0.0203, + "step": 13610 + }, + { + "epoch": 0.10067709411312498, + "grad_norm": 0.09667506068944931, + "learning_rate": 4.51329534662868e-05, + "loss": 0.0218, + "step": 13620 + }, + { + "epoch": 0.10075101268442684, + "grad_norm": 0.09671285003423691, + "learning_rate": 4.5129243827160496e-05, + "loss": 0.0255, + "step": 13630 + }, + { + "epoch": 0.1008249312557287, + "grad_norm": 0.10232903808355331, + "learning_rate": 4.512553418803419e-05, + "loss": 0.0226, + "step": 13640 + }, + { + "epoch": 0.10089884982703054, + "grad_norm": 0.08583911508321762, + "learning_rate": 4.512182454890788e-05, + "loss": 0.0212, + "step": 13650 + }, + { + "epoch": 0.1009727683983324, + "grad_norm": 0.10278382152318954, + "learning_rate": 4.511811490978158e-05, + "loss": 0.0203, + "step": 13660 + }, + { + "epoch": 0.10104668696963424, + "grad_norm": 0.09178370237350464, + "learning_rate": 4.5114405270655273e-05, + "loss": 0.0186, + "step": 13670 + }, + { + "epoch": 0.1011206055409361, + "grad_norm": 0.08496574312448502, + "learning_rate": 4.511069563152896e-05, + "loss": 0.0196, + "step": 13680 + }, + { + "epoch": 0.10119452411223796, + "grad_norm": 0.09893114119768143, + "learning_rate": 4.510698599240266e-05, + "loss": 0.0186, + "step": 13690 + }, + { + "epoch": 0.10126844268353981, + "grad_norm": 0.1062416285276413, + "learning_rate": 4.510327635327636e-05, + "loss": 0.0203, + "step": 13700 + }, + { + "epoch": 0.10134236125484167, + "grad_norm": 0.11008942872285843, + "learning_rate": 4.509956671415005e-05, + "loss": 0.0224, + "step": 13710 + }, + { + "epoch": 0.10141627982614353, + "grad_norm": 0.09531824290752411, + 
"learning_rate": 4.5095857075023746e-05, + "loss": 0.0205, + "step": 13720 + }, + { + "epoch": 0.10149019839744537, + "grad_norm": 0.09373880177736282, + "learning_rate": 4.5092147435897436e-05, + "loss": 0.0225, + "step": 13730 + }, + { + "epoch": 0.10156411696874723, + "grad_norm": 0.09772521257400513, + "learning_rate": 4.508843779677113e-05, + "loss": 0.0209, + "step": 13740 + }, + { + "epoch": 0.10163803554004908, + "grad_norm": 0.11807812750339508, + "learning_rate": 4.508472815764483e-05, + "loss": 0.0218, + "step": 13750 + }, + { + "epoch": 0.10171195411135094, + "grad_norm": 0.09919760376214981, + "learning_rate": 4.508101851851852e-05, + "loss": 0.0213, + "step": 13760 + }, + { + "epoch": 0.1017858726826528, + "grad_norm": 0.13111914694309235, + "learning_rate": 4.507730887939221e-05, + "loss": 0.02, + "step": 13770 + }, + { + "epoch": 0.10185979125395464, + "grad_norm": 0.11977981775999069, + "learning_rate": 4.507359924026591e-05, + "loss": 0.0218, + "step": 13780 + }, + { + "epoch": 0.1019337098252565, + "grad_norm": 0.10085966438055038, + "learning_rate": 4.5069889601139605e-05, + "loss": 0.0223, + "step": 13790 + }, + { + "epoch": 0.10200762839655834, + "grad_norm": 0.09333731234073639, + "learning_rate": 4.50661799620133e-05, + "loss": 0.0221, + "step": 13800 + }, + { + "epoch": 0.1020815469678602, + "grad_norm": 0.11282014846801758, + "learning_rate": 4.506247032288699e-05, + "loss": 0.0223, + "step": 13810 + }, + { + "epoch": 0.10215546553916206, + "grad_norm": 0.09185537695884705, + "learning_rate": 4.5058760683760686e-05, + "loss": 0.0199, + "step": 13820 + }, + { + "epoch": 0.10222938411046391, + "grad_norm": 0.1047159805893898, + "learning_rate": 4.5055051044634375e-05, + "loss": 0.0214, + "step": 13830 + }, + { + "epoch": 0.10230330268176577, + "grad_norm": 0.09216520935297012, + "learning_rate": 4.505134140550807e-05, + "loss": 0.0222, + "step": 13840 + }, + { + "epoch": 0.10237722125306763, + "grad_norm": 0.10986544191837311, + "learning_rate": 4.5047631766381774e-05, + "loss": 0.0231, + "step": 13850 + }, + { + "epoch": 0.10245113982436947, + "grad_norm": 0.1287481188774109, + "learning_rate": 4.504392212725546e-05, + "loss": 0.0206, + "step": 13860 + }, + { + "epoch": 0.10252505839567133, + "grad_norm": 0.11915592104196548, + "learning_rate": 4.504021248812916e-05, + "loss": 0.0213, + "step": 13870 + }, + { + "epoch": 0.10259897696697318, + "grad_norm": 0.10342303663492203, + "learning_rate": 4.503650284900285e-05, + "loss": 0.0211, + "step": 13880 + }, + { + "epoch": 0.10267289553827504, + "grad_norm": 0.10142095386981964, + "learning_rate": 4.5032793209876544e-05, + "loss": 0.0206, + "step": 13890 + }, + { + "epoch": 0.1027468141095769, + "grad_norm": 0.11021839082241058, + "learning_rate": 4.502908357075024e-05, + "loss": 0.0227, + "step": 13900 + }, + { + "epoch": 0.10282073268087874, + "grad_norm": 0.09386792778968811, + "learning_rate": 4.502537393162393e-05, + "loss": 0.0216, + "step": 13910 + }, + { + "epoch": 0.1028946512521806, + "grad_norm": 0.09008444100618362, + "learning_rate": 4.5021664292497625e-05, + "loss": 0.0241, + "step": 13920 + }, + { + "epoch": 0.10296856982348244, + "grad_norm": 0.1056261882185936, + "learning_rate": 4.501795465337133e-05, + "loss": 0.0205, + "step": 13930 + }, + { + "epoch": 0.1030424883947843, + "grad_norm": 0.09542742371559143, + "learning_rate": 4.501424501424502e-05, + "loss": 0.0189, + "step": 13940 + }, + { + "epoch": 0.10311640696608616, + "grad_norm": 0.08561396598815918, + "learning_rate": 4.501053537511871e-05, 
+ "loss": 0.0186, + "step": 13950 + }, + { + "epoch": 0.10319032553738801, + "grad_norm": 0.06652536988258362, + "learning_rate": 4.50068257359924e-05, + "loss": 0.0179, + "step": 13960 + }, + { + "epoch": 0.10326424410868987, + "grad_norm": 0.08740688860416412, + "learning_rate": 4.50031160968661e-05, + "loss": 0.0204, + "step": 13970 + }, + { + "epoch": 0.10333816267999173, + "grad_norm": 0.1050075963139534, + "learning_rate": 4.4999406457739794e-05, + "loss": 0.021, + "step": 13980 + }, + { + "epoch": 0.10341208125129357, + "grad_norm": 0.128191277384758, + "learning_rate": 4.499569681861348e-05, + "loss": 0.022, + "step": 13990 + }, + { + "epoch": 0.10348599982259543, + "grad_norm": 0.0734374076128006, + "learning_rate": 4.4991987179487186e-05, + "loss": 0.02, + "step": 14000 + }, + { + "epoch": 0.10355991839389728, + "grad_norm": 0.07442174106836319, + "learning_rate": 4.4988277540360875e-05, + "loss": 0.0174, + "step": 14010 + }, + { + "epoch": 0.10363383696519914, + "grad_norm": 0.10080066323280334, + "learning_rate": 4.498456790123457e-05, + "loss": 0.0212, + "step": 14020 + }, + { + "epoch": 0.103707755536501, + "grad_norm": 0.12612009048461914, + "learning_rate": 4.498085826210827e-05, + "loss": 0.0205, + "step": 14030 + }, + { + "epoch": 0.10378167410780284, + "grad_norm": 0.08199171721935272, + "learning_rate": 4.4977148622981956e-05, + "loss": 0.023, + "step": 14040 + }, + { + "epoch": 0.1038555926791047, + "grad_norm": 0.08755198121070862, + "learning_rate": 4.497343898385565e-05, + "loss": 0.0219, + "step": 14050 + }, + { + "epoch": 0.10392951125040656, + "grad_norm": 0.10621949285268784, + "learning_rate": 4.496972934472934e-05, + "loss": 0.0212, + "step": 14060 + }, + { + "epoch": 0.1040034298217084, + "grad_norm": 0.08471550047397614, + "learning_rate": 4.496601970560304e-05, + "loss": 0.0219, + "step": 14070 + }, + { + "epoch": 0.10407734839301026, + "grad_norm": 0.07074908912181854, + "learning_rate": 4.496231006647674e-05, + "loss": 0.0182, + "step": 14080 + }, + { + "epoch": 0.10415126696431211, + "grad_norm": 0.12259144335985184, + "learning_rate": 4.495860042735043e-05, + "loss": 0.0203, + "step": 14090 + }, + { + "epoch": 0.10422518553561397, + "grad_norm": 0.06853964924812317, + "learning_rate": 4.4954890788224125e-05, + "loss": 0.0192, + "step": 14100 + }, + { + "epoch": 0.10429910410691583, + "grad_norm": 0.10010594129562378, + "learning_rate": 4.4951181149097815e-05, + "loss": 0.0211, + "step": 14110 + }, + { + "epoch": 0.10437302267821767, + "grad_norm": 0.09315397590398788, + "learning_rate": 4.494747150997151e-05, + "loss": 0.0246, + "step": 14120 + }, + { + "epoch": 0.10444694124951953, + "grad_norm": 0.07536359876394272, + "learning_rate": 4.4943761870845207e-05, + "loss": 0.0201, + "step": 14130 + }, + { + "epoch": 0.10452085982082138, + "grad_norm": 0.08098624646663666, + "learning_rate": 4.4940052231718896e-05, + "loss": 0.0211, + "step": 14140 + }, + { + "epoch": 0.10459477839212324, + "grad_norm": 0.08475301414728165, + "learning_rate": 4.49363425925926e-05, + "loss": 0.0204, + "step": 14150 + }, + { + "epoch": 0.1046686969634251, + "grad_norm": 0.10505864769220352, + "learning_rate": 4.4932632953466294e-05, + "loss": 0.0206, + "step": 14160 + }, + { + "epoch": 0.10474261553472694, + "grad_norm": 0.08343085646629333, + "learning_rate": 4.4928923314339984e-05, + "loss": 0.0234, + "step": 14170 + }, + { + "epoch": 0.1048165341060288, + "grad_norm": 0.10162326693534851, + "learning_rate": 4.492521367521368e-05, + "loss": 0.0232, + "step": 14180 + }, + { 
+ "epoch": 0.10489045267733066, + "grad_norm": 0.09022769331932068, + "learning_rate": 4.492150403608737e-05, + "loss": 0.0202, + "step": 14190 + }, + { + "epoch": 0.1049643712486325, + "grad_norm": 0.12614984810352325, + "learning_rate": 4.4917794396961065e-05, + "loss": 0.0198, + "step": 14200 + }, + { + "epoch": 0.10503828981993436, + "grad_norm": 0.07757915556430817, + "learning_rate": 4.491408475783476e-05, + "loss": 0.0197, + "step": 14210 + }, + { + "epoch": 0.10511220839123621, + "grad_norm": 0.10445569455623627, + "learning_rate": 4.491037511870845e-05, + "loss": 0.0216, + "step": 14220 + }, + { + "epoch": 0.10518612696253807, + "grad_norm": 0.08331640064716339, + "learning_rate": 4.490666547958215e-05, + "loss": 0.0204, + "step": 14230 + }, + { + "epoch": 0.10526004553383993, + "grad_norm": 0.0920739397406578, + "learning_rate": 4.490295584045584e-05, + "loss": 0.02, + "step": 14240 + }, + { + "epoch": 0.10533396410514177, + "grad_norm": 0.07688970863819122, + "learning_rate": 4.489924620132954e-05, + "loss": 0.0193, + "step": 14250 + }, + { + "epoch": 0.10540788267644363, + "grad_norm": 0.09069108217954636, + "learning_rate": 4.4895536562203234e-05, + "loss": 0.0187, + "step": 14260 + }, + { + "epoch": 0.10548180124774548, + "grad_norm": 0.10242221504449844, + "learning_rate": 4.489182692307692e-05, + "loss": 0.0232, + "step": 14270 + }, + { + "epoch": 0.10555571981904734, + "grad_norm": 0.08866092562675476, + "learning_rate": 4.488811728395062e-05, + "loss": 0.021, + "step": 14280 + }, + { + "epoch": 0.1056296383903492, + "grad_norm": 0.12382587045431137, + "learning_rate": 4.488440764482431e-05, + "loss": 0.0243, + "step": 14290 + }, + { + "epoch": 0.10570355696165104, + "grad_norm": 0.09086389094591141, + "learning_rate": 4.4880698005698004e-05, + "loss": 0.0208, + "step": 14300 + }, + { + "epoch": 0.1057774755329529, + "grad_norm": 0.0720980316400528, + "learning_rate": 4.487698836657171e-05, + "loss": 0.0216, + "step": 14310 + }, + { + "epoch": 0.10585139410425476, + "grad_norm": 0.10160847753286362, + "learning_rate": 4.4873278727445396e-05, + "loss": 0.0198, + "step": 14320 + }, + { + "epoch": 0.1059253126755566, + "grad_norm": 0.084689661860466, + "learning_rate": 4.486956908831909e-05, + "loss": 0.0197, + "step": 14330 + }, + { + "epoch": 0.10599923124685846, + "grad_norm": 0.09439302235841751, + "learning_rate": 4.486585944919278e-05, + "loss": 0.0226, + "step": 14340 + }, + { + "epoch": 0.10607314981816031, + "grad_norm": 0.11122968792915344, + "learning_rate": 4.486214981006648e-05, + "loss": 0.0246, + "step": 14350 + }, + { + "epoch": 0.10614706838946217, + "grad_norm": 0.11770051717758179, + "learning_rate": 4.485844017094017e-05, + "loss": 0.0226, + "step": 14360 + }, + { + "epoch": 0.10622098696076403, + "grad_norm": 0.0702885165810585, + "learning_rate": 4.485473053181386e-05, + "loss": 0.0204, + "step": 14370 + }, + { + "epoch": 0.10629490553206587, + "grad_norm": 0.11021970957517624, + "learning_rate": 4.4851020892687565e-05, + "loss": 0.0212, + "step": 14380 + }, + { + "epoch": 0.10636882410336773, + "grad_norm": 0.089713916182518, + "learning_rate": 4.484731125356126e-05, + "loss": 0.021, + "step": 14390 + }, + { + "epoch": 0.10644274267466958, + "grad_norm": 0.08593543618917465, + "learning_rate": 4.484360161443495e-05, + "loss": 0.0238, + "step": 14400 + }, + { + "epoch": 0.10651666124597144, + "grad_norm": 0.07170099765062332, + "learning_rate": 4.4839891975308646e-05, + "loss": 0.0214, + "step": 14410 + }, + { + "epoch": 0.1065905798172733, + 
"grad_norm": 0.09407330304384232, + "learning_rate": 4.4836182336182335e-05, + "loss": 0.02, + "step": 14420 + }, + { + "epoch": 0.10666449838857514, + "grad_norm": 0.09657812863588333, + "learning_rate": 4.483247269705603e-05, + "loss": 0.0197, + "step": 14430 + }, + { + "epoch": 0.106738416959877, + "grad_norm": 0.05521457642316818, + "learning_rate": 4.482876305792973e-05, + "loss": 0.0181, + "step": 14440 + }, + { + "epoch": 0.10681233553117886, + "grad_norm": 0.10616671293973923, + "learning_rate": 4.4825053418803417e-05, + "loss": 0.0219, + "step": 14450 + }, + { + "epoch": 0.1068862541024807, + "grad_norm": 0.08795152604579926, + "learning_rate": 4.482134377967712e-05, + "loss": 0.0229, + "step": 14460 + }, + { + "epoch": 0.10696017267378256, + "grad_norm": 0.1144707202911377, + "learning_rate": 4.481763414055081e-05, + "loss": 0.0192, + "step": 14470 + }, + { + "epoch": 0.10703409124508441, + "grad_norm": 0.1001119464635849, + "learning_rate": 4.4813924501424504e-05, + "loss": 0.0198, + "step": 14480 + }, + { + "epoch": 0.10710800981638627, + "grad_norm": 0.08099383115768433, + "learning_rate": 4.48102148622982e-05, + "loss": 0.021, + "step": 14490 + }, + { + "epoch": 0.10718192838768813, + "grad_norm": 0.12439311295747757, + "learning_rate": 4.480650522317189e-05, + "loss": 0.0229, + "step": 14500 + }, + { + "epoch": 0.10725584695898997, + "grad_norm": 0.076289102435112, + "learning_rate": 4.4802795584045586e-05, + "loss": 0.0188, + "step": 14510 + }, + { + "epoch": 0.10732976553029183, + "grad_norm": 0.08145805448293686, + "learning_rate": 4.4799085944919275e-05, + "loss": 0.0197, + "step": 14520 + }, + { + "epoch": 0.10740368410159369, + "grad_norm": 0.09102274477481842, + "learning_rate": 4.479537630579298e-05, + "loss": 0.0205, + "step": 14530 + }, + { + "epoch": 0.10747760267289554, + "grad_norm": 0.09698771685361862, + "learning_rate": 4.4791666666666673e-05, + "loss": 0.0194, + "step": 14540 + }, + { + "epoch": 0.1075515212441974, + "grad_norm": 0.11331616342067719, + "learning_rate": 4.478795702754036e-05, + "loss": 0.02, + "step": 14550 + }, + { + "epoch": 0.10762543981549924, + "grad_norm": 0.09942898899316788, + "learning_rate": 4.478424738841406e-05, + "loss": 0.0189, + "step": 14560 + }, + { + "epoch": 0.1076993583868011, + "grad_norm": 0.11539186537265778, + "learning_rate": 4.478053774928775e-05, + "loss": 0.0233, + "step": 14570 + }, + { + "epoch": 0.10777327695810296, + "grad_norm": 0.08130382001399994, + "learning_rate": 4.4776828110161444e-05, + "loss": 0.0205, + "step": 14580 + }, + { + "epoch": 0.1078471955294048, + "grad_norm": 0.09737993031740189, + "learning_rate": 4.477311847103514e-05, + "loss": 0.0228, + "step": 14590 + }, + { + "epoch": 0.10792111410070666, + "grad_norm": 0.10642804205417633, + "learning_rate": 4.476940883190883e-05, + "loss": 0.0233, + "step": 14600 + }, + { + "epoch": 0.10799503267200851, + "grad_norm": 0.08564222604036331, + "learning_rate": 4.476569919278253e-05, + "loss": 0.0194, + "step": 14610 + }, + { + "epoch": 0.10806895124331037, + "grad_norm": 0.07180308550596237, + "learning_rate": 4.476198955365623e-05, + "loss": 0.0213, + "step": 14620 + }, + { + "epoch": 0.10814286981461223, + "grad_norm": 0.08087430894374847, + "learning_rate": 4.475827991452992e-05, + "loss": 0.0185, + "step": 14630 + }, + { + "epoch": 0.10821678838591407, + "grad_norm": 0.08423203974962234, + "learning_rate": 4.475457027540361e-05, + "loss": 0.018, + "step": 14640 + }, + { + "epoch": 0.10829070695721593, + "grad_norm": 0.08719424903392792, + 
"learning_rate": 4.47508606362773e-05, + "loss": 0.0218, + "step": 14650 + }, + { + "epoch": 0.10836462552851779, + "grad_norm": 0.08597151190042496, + "learning_rate": 4.4747150997151e-05, + "loss": 0.0205, + "step": 14660 + }, + { + "epoch": 0.10843854409981964, + "grad_norm": 0.07289619743824005, + "learning_rate": 4.4743441358024694e-05, + "loss": 0.0217, + "step": 14670 + }, + { + "epoch": 0.1085124626711215, + "grad_norm": 0.10662366449832916, + "learning_rate": 4.473973171889839e-05, + "loss": 0.0201, + "step": 14680 + }, + { + "epoch": 0.10858638124242334, + "grad_norm": 0.0755082443356514, + "learning_rate": 4.4736022079772086e-05, + "loss": 0.0202, + "step": 14690 + }, + { + "epoch": 0.1086602998137252, + "grad_norm": 0.07999914884567261, + "learning_rate": 4.4732312440645775e-05, + "loss": 0.0188, + "step": 14700 + }, + { + "epoch": 0.10873421838502706, + "grad_norm": 0.09803368896245956, + "learning_rate": 4.472860280151947e-05, + "loss": 0.0206, + "step": 14710 + }, + { + "epoch": 0.1088081369563289, + "grad_norm": 0.09981626272201538, + "learning_rate": 4.472489316239317e-05, + "loss": 0.021, + "step": 14720 + }, + { + "epoch": 0.10888205552763076, + "grad_norm": 0.11182073503732681, + "learning_rate": 4.4721183523266856e-05, + "loss": 0.0218, + "step": 14730 + }, + { + "epoch": 0.10895597409893261, + "grad_norm": 0.0778021365404129, + "learning_rate": 4.471747388414055e-05, + "loss": 0.0174, + "step": 14740 + }, + { + "epoch": 0.10902989267023447, + "grad_norm": 0.10926984250545502, + "learning_rate": 4.471376424501424e-05, + "loss": 0.0232, + "step": 14750 + }, + { + "epoch": 0.10910381124153633, + "grad_norm": 0.09038849174976349, + "learning_rate": 4.4710054605887944e-05, + "loss": 0.0225, + "step": 14760 + }, + { + "epoch": 0.10917772981283817, + "grad_norm": 0.11312000453472137, + "learning_rate": 4.470634496676164e-05, + "loss": 0.0204, + "step": 14770 + }, + { + "epoch": 0.10925164838414003, + "grad_norm": 0.11730439215898514, + "learning_rate": 4.470263532763533e-05, + "loss": 0.0227, + "step": 14780 + }, + { + "epoch": 0.10932556695544189, + "grad_norm": 0.10705330222845078, + "learning_rate": 4.4698925688509025e-05, + "loss": 0.023, + "step": 14790 + }, + { + "epoch": 0.10939948552674374, + "grad_norm": 0.0921492725610733, + "learning_rate": 4.4695216049382714e-05, + "loss": 0.0225, + "step": 14800 + }, + { + "epoch": 0.1094734040980456, + "grad_norm": 0.14325913786888123, + "learning_rate": 4.469150641025641e-05, + "loss": 0.0217, + "step": 14810 + }, + { + "epoch": 0.10954732266934744, + "grad_norm": 0.08691754192113876, + "learning_rate": 4.4687796771130106e-05, + "loss": 0.0202, + "step": 14820 + }, + { + "epoch": 0.1096212412406493, + "grad_norm": 0.07764049619436264, + "learning_rate": 4.46840871320038e-05, + "loss": 0.0209, + "step": 14830 + }, + { + "epoch": 0.10969515981195116, + "grad_norm": 0.12285088002681732, + "learning_rate": 4.46803774928775e-05, + "loss": 0.02, + "step": 14840 + }, + { + "epoch": 0.109769078383253, + "grad_norm": 0.09305769950151443, + "learning_rate": 4.4676667853751194e-05, + "loss": 0.0211, + "step": 14850 + }, + { + "epoch": 0.10984299695455486, + "grad_norm": 0.09885692596435547, + "learning_rate": 4.4672958214624883e-05, + "loss": 0.0194, + "step": 14860 + }, + { + "epoch": 0.10991691552585671, + "grad_norm": 0.10101620852947235, + "learning_rate": 4.466924857549858e-05, + "loss": 0.0202, + "step": 14870 + }, + { + "epoch": 0.10999083409715857, + "grad_norm": 0.10346149653196335, + "learning_rate": 4.466553893637227e-05, + 
"loss": 0.0205, + "step": 14880 + }, + { + "epoch": 0.11006475266846043, + "grad_norm": 0.0820925235748291, + "learning_rate": 4.4661829297245965e-05, + "loss": 0.0208, + "step": 14890 + }, + { + "epoch": 0.11013867123976227, + "grad_norm": 0.08407224714756012, + "learning_rate": 4.465811965811966e-05, + "loss": 0.0181, + "step": 14900 + }, + { + "epoch": 0.11021258981106413, + "grad_norm": 0.12362422794103622, + "learning_rate": 4.4654410018993356e-05, + "loss": 0.022, + "step": 14910 + }, + { + "epoch": 0.11028650838236599, + "grad_norm": 0.13008932769298553, + "learning_rate": 4.465070037986705e-05, + "loss": 0.0212, + "step": 14920 + }, + { + "epoch": 0.11036042695366784, + "grad_norm": 0.08195054531097412, + "learning_rate": 4.464699074074074e-05, + "loss": 0.0198, + "step": 14930 + }, + { + "epoch": 0.1104343455249697, + "grad_norm": 0.12018315494060516, + "learning_rate": 4.464328110161444e-05, + "loss": 0.0234, + "step": 14940 + }, + { + "epoch": 0.11050826409627154, + "grad_norm": 0.16510340571403503, + "learning_rate": 4.4639571462488134e-05, + "loss": 0.0203, + "step": 14950 + }, + { + "epoch": 0.1105821826675734, + "grad_norm": 0.09181898087263107, + "learning_rate": 4.463586182336182e-05, + "loss": 0.0207, + "step": 14960 + }, + { + "epoch": 0.11065610123887526, + "grad_norm": 0.052205272018909454, + "learning_rate": 4.463215218423552e-05, + "loss": 0.0187, + "step": 14970 + }, + { + "epoch": 0.1107300198101771, + "grad_norm": 0.09707576036453247, + "learning_rate": 4.4628442545109215e-05, + "loss": 0.0205, + "step": 14980 + }, + { + "epoch": 0.11080393838147896, + "grad_norm": 0.0807408019900322, + "learning_rate": 4.462473290598291e-05, + "loss": 0.0196, + "step": 14990 + }, + { + "epoch": 0.11087785695278082, + "grad_norm": 0.11290616542100906, + "learning_rate": 4.462102326685661e-05, + "loss": 0.0199, + "step": 15000 + }, + { + "epoch": 0.11095177552408267, + "grad_norm": 0.09845750033855438, + "learning_rate": 4.4617313627730296e-05, + "loss": 0.0197, + "step": 15010 + }, + { + "epoch": 0.11102569409538453, + "grad_norm": 0.07282011210918427, + "learning_rate": 4.461360398860399e-05, + "loss": 0.0225, + "step": 15020 + }, + { + "epoch": 0.11109961266668637, + "grad_norm": 0.09254945814609528, + "learning_rate": 4.460989434947768e-05, + "loss": 0.0185, + "step": 15030 + }, + { + "epoch": 0.11117353123798823, + "grad_norm": 0.08924317359924316, + "learning_rate": 4.460618471035138e-05, + "loss": 0.021, + "step": 15040 + }, + { + "epoch": 0.11124744980929009, + "grad_norm": 0.0864916741847992, + "learning_rate": 4.460247507122507e-05, + "loss": 0.0192, + "step": 15050 + }, + { + "epoch": 0.11132136838059194, + "grad_norm": 0.10492896288633347, + "learning_rate": 4.459876543209877e-05, + "loss": 0.0197, + "step": 15060 + }, + { + "epoch": 0.1113952869518938, + "grad_norm": 0.0884445384144783, + "learning_rate": 4.4595055792972465e-05, + "loss": 0.02, + "step": 15070 + }, + { + "epoch": 0.11146920552319564, + "grad_norm": 0.09605001658201218, + "learning_rate": 4.459134615384616e-05, + "loss": 0.0238, + "step": 15080 + }, + { + "epoch": 0.1115431240944975, + "grad_norm": 0.1008792594075203, + "learning_rate": 4.458763651471985e-05, + "loss": 0.0208, + "step": 15090 + }, + { + "epoch": 0.11161704266579936, + "grad_norm": 0.0722879096865654, + "learning_rate": 4.4583926875593546e-05, + "loss": 0.0198, + "step": 15100 + }, + { + "epoch": 0.1116909612371012, + "grad_norm": 0.09575898945331573, + "learning_rate": 4.4580217236467235e-05, + "loss": 0.0212, + "step": 15110 + }, + { + 
"epoch": 0.11176487980840306, + "grad_norm": 0.08117416501045227, + "learning_rate": 4.457650759734093e-05, + "loss": 0.0218, + "step": 15120 + }, + { + "epoch": 0.11183879837970492, + "grad_norm": 0.14386782050132751, + "learning_rate": 4.457279795821463e-05, + "loss": 0.0218, + "step": 15130 + }, + { + "epoch": 0.11191271695100677, + "grad_norm": 0.09009039402008057, + "learning_rate": 4.456908831908832e-05, + "loss": 0.0197, + "step": 15140 + }, + { + "epoch": 0.11198663552230863, + "grad_norm": 0.08412953466176987, + "learning_rate": 4.456537867996202e-05, + "loss": 0.0188, + "step": 15150 + }, + { + "epoch": 0.11206055409361047, + "grad_norm": 0.13613708317279816, + "learning_rate": 4.456166904083571e-05, + "loss": 0.0205, + "step": 15160 + }, + { + "epoch": 0.11213447266491233, + "grad_norm": 0.08158623427152634, + "learning_rate": 4.4557959401709404e-05, + "loss": 0.0218, + "step": 15170 + }, + { + "epoch": 0.11220839123621419, + "grad_norm": 0.11550930142402649, + "learning_rate": 4.45542497625831e-05, + "loss": 0.0231, + "step": 15180 + }, + { + "epoch": 0.11228230980751604, + "grad_norm": 0.09520114958286285, + "learning_rate": 4.455054012345679e-05, + "loss": 0.0191, + "step": 15190 + }, + { + "epoch": 0.1123562283788179, + "grad_norm": 0.10363199561834335, + "learning_rate": 4.4546830484330485e-05, + "loss": 0.023, + "step": 15200 + }, + { + "epoch": 0.11243014695011974, + "grad_norm": 0.08404412120580673, + "learning_rate": 4.454312084520418e-05, + "loss": 0.0224, + "step": 15210 + }, + { + "epoch": 0.1125040655214216, + "grad_norm": 0.08799764513969421, + "learning_rate": 4.453941120607788e-05, + "loss": 0.021, + "step": 15220 + }, + { + "epoch": 0.11257798409272346, + "grad_norm": 0.09482520073652267, + "learning_rate": 4.453570156695157e-05, + "loss": 0.0209, + "step": 15230 + }, + { + "epoch": 0.1126519026640253, + "grad_norm": 0.10100306570529938, + "learning_rate": 4.453199192782526e-05, + "loss": 0.0223, + "step": 15240 + }, + { + "epoch": 0.11272582123532716, + "grad_norm": 0.10380079597234726, + "learning_rate": 4.452828228869896e-05, + "loss": 0.021, + "step": 15250 + }, + { + "epoch": 0.11279973980662902, + "grad_norm": 0.10141590982675552, + "learning_rate": 4.452457264957265e-05, + "loss": 0.0188, + "step": 15260 + }, + { + "epoch": 0.11287365837793087, + "grad_norm": 0.14011965692043304, + "learning_rate": 4.4520863010446344e-05, + "loss": 0.0218, + "step": 15270 + }, + { + "epoch": 0.11294757694923273, + "grad_norm": 0.10623647272586823, + "learning_rate": 4.451715337132004e-05, + "loss": 0.0218, + "step": 15280 + }, + { + "epoch": 0.11302149552053457, + "grad_norm": 0.06410671770572662, + "learning_rate": 4.4513443732193735e-05, + "loss": 0.0189, + "step": 15290 + }, + { + "epoch": 0.11309541409183643, + "grad_norm": 0.1001250296831131, + "learning_rate": 4.450973409306743e-05, + "loss": 0.0209, + "step": 15300 + }, + { + "epoch": 0.11316933266313829, + "grad_norm": 0.09662005305290222, + "learning_rate": 4.450602445394113e-05, + "loss": 0.02, + "step": 15310 + }, + { + "epoch": 0.11324325123444014, + "grad_norm": 0.09809233248233795, + "learning_rate": 4.4502314814814817e-05, + "loss": 0.0212, + "step": 15320 + }, + { + "epoch": 0.113317169805742, + "grad_norm": 0.0838894173502922, + "learning_rate": 4.449860517568851e-05, + "loss": 0.0191, + "step": 15330 + }, + { + "epoch": 0.11339108837704384, + "grad_norm": 0.08231469243764877, + "learning_rate": 4.44948955365622e-05, + "loss": 0.02, + "step": 15340 + }, + { + "epoch": 0.1134650069483457, + "grad_norm": 
0.09870616346597672, + "learning_rate": 4.44911858974359e-05, + "loss": 0.0192, + "step": 15350 + }, + { + "epoch": 0.11353892551964756, + "grad_norm": 0.09969879686832428, + "learning_rate": 4.4487476258309594e-05, + "loss": 0.0212, + "step": 15360 + }, + { + "epoch": 0.1136128440909494, + "grad_norm": 0.11603955179452896, + "learning_rate": 4.448376661918329e-05, + "loss": 0.0204, + "step": 15370 + }, + { + "epoch": 0.11368676266225126, + "grad_norm": 0.09245004504919052, + "learning_rate": 4.4480056980056986e-05, + "loss": 0.0223, + "step": 15380 + }, + { + "epoch": 0.11376068123355312, + "grad_norm": 0.11372753977775574, + "learning_rate": 4.4476347340930675e-05, + "loss": 0.0203, + "step": 15390 + }, + { + "epoch": 0.11383459980485497, + "grad_norm": 0.08970644325017929, + "learning_rate": 4.447263770180437e-05, + "loss": 0.019, + "step": 15400 + }, + { + "epoch": 0.11390851837615683, + "grad_norm": 0.13420316576957703, + "learning_rate": 4.446892806267807e-05, + "loss": 0.0208, + "step": 15410 + }, + { + "epoch": 0.11398243694745867, + "grad_norm": 0.09992843866348267, + "learning_rate": 4.4465218423551756e-05, + "loss": 0.0192, + "step": 15420 + }, + { + "epoch": 0.11405635551876053, + "grad_norm": 0.11263995617628098, + "learning_rate": 4.446150878442545e-05, + "loss": 0.0201, + "step": 15430 + }, + { + "epoch": 0.11413027409006239, + "grad_norm": 0.10258904844522476, + "learning_rate": 4.445779914529915e-05, + "loss": 0.0193, + "step": 15440 + }, + { + "epoch": 0.11420419266136424, + "grad_norm": 0.09087394177913666, + "learning_rate": 4.4454089506172844e-05, + "loss": 0.0192, + "step": 15450 + }, + { + "epoch": 0.1142781112326661, + "grad_norm": 0.07284771651029587, + "learning_rate": 4.445037986704654e-05, + "loss": 0.0189, + "step": 15460 + }, + { + "epoch": 0.11435202980396796, + "grad_norm": 0.14658352732658386, + "learning_rate": 4.444667022792023e-05, + "loss": 0.0202, + "step": 15470 + }, + { + "epoch": 0.1144259483752698, + "grad_norm": 0.08134383708238602, + "learning_rate": 4.4442960588793925e-05, + "loss": 0.0191, + "step": 15480 + }, + { + "epoch": 0.11449986694657166, + "grad_norm": 0.09578991681337357, + "learning_rate": 4.4439250949667614e-05, + "loss": 0.0197, + "step": 15490 + }, + { + "epoch": 0.1145737855178735, + "grad_norm": 0.12194101512432098, + "learning_rate": 4.443554131054131e-05, + "loss": 0.0214, + "step": 15500 + }, + { + "epoch": 0.11464770408917536, + "grad_norm": 0.12530599534511566, + "learning_rate": 4.4431831671415006e-05, + "loss": 0.0192, + "step": 15510 + }, + { + "epoch": 0.11472162266047722, + "grad_norm": 0.10177845507860184, + "learning_rate": 4.44281220322887e-05, + "loss": 0.0183, + "step": 15520 + }, + { + "epoch": 0.11479554123177907, + "grad_norm": 0.14865587651729584, + "learning_rate": 4.44244123931624e-05, + "loss": 0.0193, + "step": 15530 + }, + { + "epoch": 0.11486945980308093, + "grad_norm": 0.12489147484302521, + "learning_rate": 4.4420702754036094e-05, + "loss": 0.0221, + "step": 15540 + }, + { + "epoch": 0.11494337837438277, + "grad_norm": 0.09141378849744797, + "learning_rate": 4.441699311490978e-05, + "loss": 0.019, + "step": 15550 + }, + { + "epoch": 0.11501729694568463, + "grad_norm": 0.10528656095266342, + "learning_rate": 4.441328347578348e-05, + "loss": 0.0196, + "step": 15560 + }, + { + "epoch": 0.11509121551698649, + "grad_norm": 0.10016972571611404, + "learning_rate": 4.440957383665717e-05, + "loss": 0.0217, + "step": 15570 + }, + { + "epoch": 0.11516513408828834, + "grad_norm": 0.11202121526002884, + 
"learning_rate": 4.4405864197530864e-05, + "loss": 0.0205, + "step": 15580 + }, + { + "epoch": 0.1152390526595902, + "grad_norm": 0.1344679892063141, + "learning_rate": 4.440215455840456e-05, + "loss": 0.0217, + "step": 15590 + }, + { + "epoch": 0.11531297123089206, + "grad_norm": 0.09601288288831711, + "learning_rate": 4.4398444919278256e-05, + "loss": 0.0203, + "step": 15600 + }, + { + "epoch": 0.1153868898021939, + "grad_norm": 0.15555888414382935, + "learning_rate": 4.439473528015195e-05, + "loss": 0.0194, + "step": 15610 + }, + { + "epoch": 0.11546080837349576, + "grad_norm": 0.11112314462661743, + "learning_rate": 4.439102564102564e-05, + "loss": 0.0211, + "step": 15620 + }, + { + "epoch": 0.1155347269447976, + "grad_norm": 0.07886022329330444, + "learning_rate": 4.438731600189934e-05, + "loss": 0.0212, + "step": 15630 + }, + { + "epoch": 0.11560864551609946, + "grad_norm": 0.1283668577671051, + "learning_rate": 4.438360636277303e-05, + "loss": 0.0213, + "step": 15640 + }, + { + "epoch": 0.11568256408740132, + "grad_norm": 0.07390929013490677, + "learning_rate": 4.437989672364672e-05, + "loss": 0.0195, + "step": 15650 + }, + { + "epoch": 0.11575648265870317, + "grad_norm": 0.1706046760082245, + "learning_rate": 4.437618708452042e-05, + "loss": 0.0223, + "step": 15660 + }, + { + "epoch": 0.11583040123000503, + "grad_norm": 0.11129933595657349, + "learning_rate": 4.4372477445394114e-05, + "loss": 0.0206, + "step": 15670 + }, + { + "epoch": 0.11590431980130687, + "grad_norm": 0.14023716747760773, + "learning_rate": 4.436876780626781e-05, + "loss": 0.0219, + "step": 15680 + }, + { + "epoch": 0.11597823837260873, + "grad_norm": 0.10003252327442169, + "learning_rate": 4.4365058167141506e-05, + "loss": 0.0203, + "step": 15690 + }, + { + "epoch": 0.11605215694391059, + "grad_norm": 0.11612115055322647, + "learning_rate": 4.4361348528015196e-05, + "loss": 0.0199, + "step": 15700 + }, + { + "epoch": 0.11612607551521244, + "grad_norm": 0.11864006519317627, + "learning_rate": 4.435763888888889e-05, + "loss": 0.0232, + "step": 15710 + }, + { + "epoch": 0.1161999940865143, + "grad_norm": 0.09690623730421066, + "learning_rate": 4.435392924976258e-05, + "loss": 0.0181, + "step": 15720 + }, + { + "epoch": 0.11627391265781616, + "grad_norm": 0.13921846449375153, + "learning_rate": 4.435021961063628e-05, + "loss": 0.022, + "step": 15730 + }, + { + "epoch": 0.116347831229118, + "grad_norm": 0.12367802113294601, + "learning_rate": 4.434650997150997e-05, + "loss": 0.0194, + "step": 15740 + }, + { + "epoch": 0.11642174980041986, + "grad_norm": 0.1278046816587448, + "learning_rate": 4.434280033238367e-05, + "loss": 0.0196, + "step": 15750 + }, + { + "epoch": 0.1164956683717217, + "grad_norm": 0.1132681667804718, + "learning_rate": 4.4339090693257365e-05, + "loss": 0.0196, + "step": 15760 + }, + { + "epoch": 0.11656958694302356, + "grad_norm": 0.1054956465959549, + "learning_rate": 4.433538105413106e-05, + "loss": 0.0194, + "step": 15770 + }, + { + "epoch": 0.11664350551432542, + "grad_norm": 0.11253593862056732, + "learning_rate": 4.433167141500475e-05, + "loss": 0.0204, + "step": 15780 + }, + { + "epoch": 0.11671742408562727, + "grad_norm": 0.11990190297365189, + "learning_rate": 4.4327961775878446e-05, + "loss": 0.0205, + "step": 15790 + }, + { + "epoch": 0.11679134265692913, + "grad_norm": 0.11037104576826096, + "learning_rate": 4.4324252136752135e-05, + "loss": 0.0214, + "step": 15800 + }, + { + "epoch": 0.11686526122823097, + "grad_norm": 0.1701211929321289, + "learning_rate": 4.432054249762583e-05, + 
"loss": 0.0229, + "step": 15810 + }, + { + "epoch": 0.11693917979953283, + "grad_norm": 0.10061797499656677, + "learning_rate": 4.431683285849953e-05, + "loss": 0.0204, + "step": 15820 + }, + { + "epoch": 0.11701309837083469, + "grad_norm": 0.09710580110549927, + "learning_rate": 4.431312321937322e-05, + "loss": 0.0228, + "step": 15830 + }, + { + "epoch": 0.11708701694213654, + "grad_norm": 0.07980790734291077, + "learning_rate": 4.430941358024692e-05, + "loss": 0.0186, + "step": 15840 + }, + { + "epoch": 0.1171609355134384, + "grad_norm": 0.09913813322782516, + "learning_rate": 4.430570394112061e-05, + "loss": 0.019, + "step": 15850 + }, + { + "epoch": 0.11723485408474026, + "grad_norm": 0.09745988994836807, + "learning_rate": 4.4301994301994304e-05, + "loss": 0.0202, + "step": 15860 + }, + { + "epoch": 0.1173087726560421, + "grad_norm": 0.0765075534582138, + "learning_rate": 4.4298284662868e-05, + "loss": 0.0199, + "step": 15870 + }, + { + "epoch": 0.11738269122734396, + "grad_norm": 0.12214318662881851, + "learning_rate": 4.429457502374169e-05, + "loss": 0.0219, + "step": 15880 + }, + { + "epoch": 0.1174566097986458, + "grad_norm": 0.12174449115991592, + "learning_rate": 4.4290865384615385e-05, + "loss": 0.0227, + "step": 15890 + }, + { + "epoch": 0.11753052836994766, + "grad_norm": 0.0814589262008667, + "learning_rate": 4.428715574548908e-05, + "loss": 0.0225, + "step": 15900 + }, + { + "epoch": 0.11760444694124952, + "grad_norm": 0.08090159296989441, + "learning_rate": 4.428344610636278e-05, + "loss": 0.0208, + "step": 15910 + }, + { + "epoch": 0.11767836551255137, + "grad_norm": 0.08562711626291275, + "learning_rate": 4.427973646723647e-05, + "loss": 0.0222, + "step": 15920 + }, + { + "epoch": 0.11775228408385323, + "grad_norm": 0.09362155199050903, + "learning_rate": 4.427602682811016e-05, + "loss": 0.0206, + "step": 15930 + }, + { + "epoch": 0.11782620265515509, + "grad_norm": 0.14854660630226135, + "learning_rate": 4.427231718898386e-05, + "loss": 0.0218, + "step": 15940 + }, + { + "epoch": 0.11790012122645693, + "grad_norm": 0.11786948144435883, + "learning_rate": 4.426860754985755e-05, + "loss": 0.0192, + "step": 15950 + }, + { + "epoch": 0.11797403979775879, + "grad_norm": 0.08195752650499344, + "learning_rate": 4.426489791073124e-05, + "loss": 0.019, + "step": 15960 + }, + { + "epoch": 0.11804795836906064, + "grad_norm": 0.1212175115942955, + "learning_rate": 4.426118827160494e-05, + "loss": 0.0231, + "step": 15970 + }, + { + "epoch": 0.1181218769403625, + "grad_norm": 0.08144454658031464, + "learning_rate": 4.4257478632478635e-05, + "loss": 0.0202, + "step": 15980 + }, + { + "epoch": 0.11819579551166436, + "grad_norm": 0.12607130408287048, + "learning_rate": 4.425376899335233e-05, + "loss": 0.0239, + "step": 15990 + }, + { + "epoch": 0.1182697140829662, + "grad_norm": 0.0952596589922905, + "learning_rate": 4.425005935422603e-05, + "loss": 0.0195, + "step": 16000 + }, + { + "epoch": 0.11834363265426806, + "grad_norm": 0.07940717041492462, + "learning_rate": 4.4246349715099716e-05, + "loss": 0.0186, + "step": 16010 + }, + { + "epoch": 0.1184175512255699, + "grad_norm": 0.08581467717885971, + "learning_rate": 4.424264007597341e-05, + "loss": 0.0186, + "step": 16020 + }, + { + "epoch": 0.11849146979687177, + "grad_norm": 0.06981861591339111, + "learning_rate": 4.42389304368471e-05, + "loss": 0.0201, + "step": 16030 + }, + { + "epoch": 0.11856538836817362, + "grad_norm": 0.06816834211349487, + "learning_rate": 4.42352207977208e-05, + "loss": 0.0217, + "step": 16040 + }, + { + 
"epoch": 0.11863930693947547, + "grad_norm": 0.09746452420949936, + "learning_rate": 4.4231511158594493e-05, + "loss": 0.023, + "step": 16050 + }, + { + "epoch": 0.11871322551077733, + "grad_norm": 0.08126521855592728, + "learning_rate": 4.422780151946819e-05, + "loss": 0.0174, + "step": 16060 + }, + { + "epoch": 0.11878714408207919, + "grad_norm": 0.13056722283363342, + "learning_rate": 4.4224091880341885e-05, + "loss": 0.0203, + "step": 16070 + }, + { + "epoch": 0.11886106265338103, + "grad_norm": 0.13683772087097168, + "learning_rate": 4.4220382241215575e-05, + "loss": 0.0233, + "step": 16080 + }, + { + "epoch": 0.11893498122468289, + "grad_norm": 0.09745480865240097, + "learning_rate": 4.421667260208927e-05, + "loss": 0.0193, + "step": 16090 + }, + { + "epoch": 0.11900889979598474, + "grad_norm": 0.237451434135437, + "learning_rate": 4.4212962962962966e-05, + "loss": 0.0201, + "step": 16100 + }, + { + "epoch": 0.1190828183672866, + "grad_norm": 0.11330193281173706, + "learning_rate": 4.4209253323836656e-05, + "loss": 0.023, + "step": 16110 + }, + { + "epoch": 0.11915673693858846, + "grad_norm": 0.09872958064079285, + "learning_rate": 4.420554368471035e-05, + "loss": 0.0223, + "step": 16120 + }, + { + "epoch": 0.1192306555098903, + "grad_norm": 0.11739817261695862, + "learning_rate": 4.420183404558405e-05, + "loss": 0.0211, + "step": 16130 + }, + { + "epoch": 0.11930457408119216, + "grad_norm": 0.09575287997722626, + "learning_rate": 4.4198124406457744e-05, + "loss": 0.0205, + "step": 16140 + }, + { + "epoch": 0.119378492652494, + "grad_norm": 0.08751419186592102, + "learning_rate": 4.419441476733144e-05, + "loss": 0.0201, + "step": 16150 + }, + { + "epoch": 0.11945241122379587, + "grad_norm": 0.11821751296520233, + "learning_rate": 4.419070512820513e-05, + "loss": 0.0194, + "step": 16160 + }, + { + "epoch": 0.11952632979509772, + "grad_norm": 0.09098798781633377, + "learning_rate": 4.4186995489078825e-05, + "loss": 0.0177, + "step": 16170 + }, + { + "epoch": 0.11960024836639957, + "grad_norm": 0.10671043395996094, + "learning_rate": 4.4183285849952514e-05, + "loss": 0.0216, + "step": 16180 + }, + { + "epoch": 0.11967416693770143, + "grad_norm": 0.12490768730640411, + "learning_rate": 4.417957621082621e-05, + "loss": 0.0193, + "step": 16190 + }, + { + "epoch": 0.11974808550900329, + "grad_norm": 0.128347709774971, + "learning_rate": 4.4175866571699906e-05, + "loss": 0.0225, + "step": 16200 + }, + { + "epoch": 0.11982200408030513, + "grad_norm": 0.11123131215572357, + "learning_rate": 4.41721569325736e-05, + "loss": 0.019, + "step": 16210 + }, + { + "epoch": 0.11989592265160699, + "grad_norm": 0.0979531928896904, + "learning_rate": 4.41684472934473e-05, + "loss": 0.0222, + "step": 16220 + }, + { + "epoch": 0.11996984122290884, + "grad_norm": 0.09113921225070953, + "learning_rate": 4.4164737654320994e-05, + "loss": 0.0217, + "step": 16230 + }, + { + "epoch": 0.1200437597942107, + "grad_norm": 0.1220354214310646, + "learning_rate": 4.416102801519468e-05, + "loss": 0.0217, + "step": 16240 + }, + { + "epoch": 0.12011767836551256, + "grad_norm": 0.0948338508605957, + "learning_rate": 4.415731837606838e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.1201915969368144, + "grad_norm": 0.07255394011735916, + "learning_rate": 4.415360873694207e-05, + "loss": 0.0197, + "step": 16260 + }, + { + "epoch": 0.12026551550811626, + "grad_norm": 0.11732756346464157, + "learning_rate": 4.4149899097815764e-05, + "loss": 0.0199, + "step": 16270 + }, + { + "epoch": 0.1203394340794181, + 
"grad_norm": 0.0827333927154541, + "learning_rate": 4.414618945868946e-05, + "loss": 0.0209, + "step": 16280 + }, + { + "epoch": 0.12041335265071997, + "grad_norm": 0.09649761021137238, + "learning_rate": 4.4142479819563156e-05, + "loss": 0.0192, + "step": 16290 + }, + { + "epoch": 0.12048727122202182, + "grad_norm": 0.08975580334663391, + "learning_rate": 4.413877018043685e-05, + "loss": 0.0227, + "step": 16300 + }, + { + "epoch": 0.12056118979332367, + "grad_norm": 0.08119421452283859, + "learning_rate": 4.413506054131054e-05, + "loss": 0.0211, + "step": 16310 + }, + { + "epoch": 0.12063510836462553, + "grad_norm": 0.1294875144958496, + "learning_rate": 4.413135090218424e-05, + "loss": 0.0212, + "step": 16320 + }, + { + "epoch": 0.12070902693592739, + "grad_norm": 0.1081729456782341, + "learning_rate": 4.412764126305793e-05, + "loss": 0.0224, + "step": 16330 + }, + { + "epoch": 0.12078294550722923, + "grad_norm": 0.07694138586521149, + "learning_rate": 4.412393162393162e-05, + "loss": 0.0206, + "step": 16340 + }, + { + "epoch": 0.12085686407853109, + "grad_norm": 0.08305425196886063, + "learning_rate": 4.412022198480532e-05, + "loss": 0.0191, + "step": 16350 + }, + { + "epoch": 0.12093078264983294, + "grad_norm": 0.07094188034534454, + "learning_rate": 4.4116512345679014e-05, + "loss": 0.0201, + "step": 16360 + }, + { + "epoch": 0.1210047012211348, + "grad_norm": 0.11019996553659439, + "learning_rate": 4.411280270655271e-05, + "loss": 0.019, + "step": 16370 + }, + { + "epoch": 0.12107861979243666, + "grad_norm": 0.11648508161306381, + "learning_rate": 4.4109093067426406e-05, + "loss": 0.0206, + "step": 16380 + }, + { + "epoch": 0.1211525383637385, + "grad_norm": 0.08518712222576141, + "learning_rate": 4.4105383428300095e-05, + "loss": 0.0232, + "step": 16390 + }, + { + "epoch": 0.12122645693504036, + "grad_norm": 0.08771713823080063, + "learning_rate": 4.410167378917379e-05, + "loss": 0.0195, + "step": 16400 + }, + { + "epoch": 0.12130037550634222, + "grad_norm": 0.0767797976732254, + "learning_rate": 4.409796415004749e-05, + "loss": 0.0225, + "step": 16410 + }, + { + "epoch": 0.12137429407764407, + "grad_norm": 0.1259419023990631, + "learning_rate": 4.4094254510921176e-05, + "loss": 0.0212, + "step": 16420 + }, + { + "epoch": 0.12144821264894592, + "grad_norm": 0.08342539519071579, + "learning_rate": 4.409054487179487e-05, + "loss": 0.0194, + "step": 16430 + }, + { + "epoch": 0.12152213122024777, + "grad_norm": 0.13784949481487274, + "learning_rate": 4.408683523266857e-05, + "loss": 0.0218, + "step": 16440 + }, + { + "epoch": 0.12159604979154963, + "grad_norm": 0.09963231533765793, + "learning_rate": 4.4083125593542264e-05, + "loss": 0.0204, + "step": 16450 + }, + { + "epoch": 0.12166996836285149, + "grad_norm": 0.09076182544231415, + "learning_rate": 4.407941595441596e-05, + "loss": 0.0221, + "step": 16460 + }, + { + "epoch": 0.12174388693415333, + "grad_norm": 0.09481775015592575, + "learning_rate": 4.407570631528965e-05, + "loss": 0.0197, + "step": 16470 + }, + { + "epoch": 0.12181780550545519, + "grad_norm": 0.11794617772102356, + "learning_rate": 4.4071996676163345e-05, + "loss": 0.0197, + "step": 16480 + }, + { + "epoch": 0.12189172407675704, + "grad_norm": 0.08506674319505692, + "learning_rate": 4.4068287037037035e-05, + "loss": 0.0219, + "step": 16490 + }, + { + "epoch": 0.1219656426480589, + "grad_norm": 0.10553494095802307, + "learning_rate": 4.406457739791073e-05, + "loss": 0.0215, + "step": 16500 + }, + { + "epoch": 0.12203956121936076, + "grad_norm": 0.10346570611000061, + 
"learning_rate": 4.4060867758784427e-05, + "loss": 0.0224, + "step": 16510 + }, + { + "epoch": 0.1221134797906626, + "grad_norm": 0.10566481202840805, + "learning_rate": 4.405715811965812e-05, + "loss": 0.0199, + "step": 16520 + }, + { + "epoch": 0.12218739836196446, + "grad_norm": 0.0927921012043953, + "learning_rate": 4.405344848053182e-05, + "loss": 0.0211, + "step": 16530 + }, + { + "epoch": 0.12226131693326632, + "grad_norm": 0.08882363885641098, + "learning_rate": 4.404973884140551e-05, + "loss": 0.0171, + "step": 16540 + }, + { + "epoch": 0.12233523550456817, + "grad_norm": 0.09634187072515488, + "learning_rate": 4.4046029202279204e-05, + "loss": 0.0199, + "step": 16550 + }, + { + "epoch": 0.12240915407587002, + "grad_norm": 0.1210630014538765, + "learning_rate": 4.40423195631529e-05, + "loss": 0.0196, + "step": 16560 + }, + { + "epoch": 0.12248307264717187, + "grad_norm": 0.14356377720832825, + "learning_rate": 4.403860992402659e-05, + "loss": 0.0205, + "step": 16570 + }, + { + "epoch": 0.12255699121847373, + "grad_norm": 0.0979805588722229, + "learning_rate": 4.4034900284900285e-05, + "loss": 0.0178, + "step": 16580 + }, + { + "epoch": 0.12263090978977559, + "grad_norm": 0.1003301665186882, + "learning_rate": 4.403119064577398e-05, + "loss": 0.0215, + "step": 16590 + }, + { + "epoch": 0.12270482836107743, + "grad_norm": 0.08601482957601547, + "learning_rate": 4.402748100664768e-05, + "loss": 0.0194, + "step": 16600 + }, + { + "epoch": 0.12277874693237929, + "grad_norm": 0.1280229091644287, + "learning_rate": 4.402377136752137e-05, + "loss": 0.0201, + "step": 16610 + }, + { + "epoch": 0.12285266550368114, + "grad_norm": 0.08791964501142502, + "learning_rate": 4.402006172839506e-05, + "loss": 0.0205, + "step": 16620 + }, + { + "epoch": 0.122926584074983, + "grad_norm": 0.06108580902218819, + "learning_rate": 4.401635208926876e-05, + "loss": 0.0167, + "step": 16630 + }, + { + "epoch": 0.12300050264628486, + "grad_norm": 0.08036208152770996, + "learning_rate": 4.4012642450142454e-05, + "loss": 0.0193, + "step": 16640 + }, + { + "epoch": 0.1230744212175867, + "grad_norm": 0.1433434784412384, + "learning_rate": 4.400893281101614e-05, + "loss": 0.0211, + "step": 16650 + }, + { + "epoch": 0.12314833978888856, + "grad_norm": 0.09189280867576599, + "learning_rate": 4.400522317188984e-05, + "loss": 0.0214, + "step": 16660 + }, + { + "epoch": 0.12322225836019042, + "grad_norm": 0.08462630957365036, + "learning_rate": 4.4001513532763535e-05, + "loss": 0.02, + "step": 16670 + }, + { + "epoch": 0.12329617693149227, + "grad_norm": 0.09673482924699783, + "learning_rate": 4.399780389363723e-05, + "loss": 0.0188, + "step": 16680 + }, + { + "epoch": 0.12337009550279412, + "grad_norm": 0.10847456008195877, + "learning_rate": 4.399409425451093e-05, + "loss": 0.0201, + "step": 16690 + }, + { + "epoch": 0.12344401407409597, + "grad_norm": 0.08582521229982376, + "learning_rate": 4.3990384615384616e-05, + "loss": 0.0215, + "step": 16700 + }, + { + "epoch": 0.12351793264539783, + "grad_norm": 0.08930855244398117, + "learning_rate": 4.398667497625831e-05, + "loss": 0.0228, + "step": 16710 + }, + { + "epoch": 0.12359185121669969, + "grad_norm": 0.08313935995101929, + "learning_rate": 4.3982965337132e-05, + "loss": 0.02, + "step": 16720 + }, + { + "epoch": 0.12366576978800153, + "grad_norm": 0.09864447265863419, + "learning_rate": 4.39792556980057e-05, + "loss": 0.0222, + "step": 16730 + }, + { + "epoch": 0.12373968835930339, + "grad_norm": 0.07049126923084259, + "learning_rate": 4.39755460588794e-05, + 
"loss": 0.0178, + "step": 16740 + }, + { + "epoch": 0.12381360693060524, + "grad_norm": 0.09831076860427856, + "learning_rate": 4.397183641975309e-05, + "loss": 0.0199, + "step": 16750 + }, + { + "epoch": 0.1238875255019071, + "grad_norm": 0.08048636466264725, + "learning_rate": 4.3968126780626785e-05, + "loss": 0.0202, + "step": 16760 + }, + { + "epoch": 0.12396144407320896, + "grad_norm": 0.14722725749015808, + "learning_rate": 4.3964417141500474e-05, + "loss": 0.0191, + "step": 16770 + }, + { + "epoch": 0.1240353626445108, + "grad_norm": 0.09972026944160461, + "learning_rate": 4.396070750237417e-05, + "loss": 0.0192, + "step": 16780 + }, + { + "epoch": 0.12410928121581266, + "grad_norm": 0.08267451077699661, + "learning_rate": 4.3956997863247866e-05, + "loss": 0.0205, + "step": 16790 + }, + { + "epoch": 0.12418319978711452, + "grad_norm": 0.08474751561880112, + "learning_rate": 4.3953288224121555e-05, + "loss": 0.0203, + "step": 16800 + }, + { + "epoch": 0.12425711835841637, + "grad_norm": 0.09754516929388046, + "learning_rate": 4.394957858499525e-05, + "loss": 0.0189, + "step": 16810 + }, + { + "epoch": 0.12433103692971822, + "grad_norm": 0.07999278604984283, + "learning_rate": 4.394586894586895e-05, + "loss": 0.0182, + "step": 16820 + }, + { + "epoch": 0.12440495550102007, + "grad_norm": 0.08049635589122772, + "learning_rate": 4.394215930674264e-05, + "loss": 0.0195, + "step": 16830 + }, + { + "epoch": 0.12447887407232193, + "grad_norm": 0.090194471180439, + "learning_rate": 4.393844966761634e-05, + "loss": 0.0194, + "step": 16840 + }, + { + "epoch": 0.12455279264362379, + "grad_norm": 0.09693975001573563, + "learning_rate": 4.393474002849003e-05, + "loss": 0.0201, + "step": 16850 + }, + { + "epoch": 0.12462671121492563, + "grad_norm": 0.1546517014503479, + "learning_rate": 4.3931030389363724e-05, + "loss": 0.0199, + "step": 16860 + }, + { + "epoch": 0.1247006297862275, + "grad_norm": 0.11674577742815018, + "learning_rate": 4.392732075023742e-05, + "loss": 0.0207, + "step": 16870 + }, + { + "epoch": 0.12477454835752935, + "grad_norm": 0.09541032463312149, + "learning_rate": 4.392361111111111e-05, + "loss": 0.0228, + "step": 16880 + }, + { + "epoch": 0.1248484669288312, + "grad_norm": 0.09020016342401505, + "learning_rate": 4.391990147198481e-05, + "loss": 0.0193, + "step": 16890 + }, + { + "epoch": 0.12492238550013306, + "grad_norm": 0.08206814527511597, + "learning_rate": 4.39161918328585e-05, + "loss": 0.0228, + "step": 16900 + }, + { + "epoch": 0.1249963040714349, + "grad_norm": 0.07679471373558044, + "learning_rate": 4.39124821937322e-05, + "loss": 0.02, + "step": 16910 + }, + { + "epoch": 0.12507022264273676, + "grad_norm": 0.08841928094625473, + "learning_rate": 4.3908772554605893e-05, + "loss": 0.0212, + "step": 16920 + }, + { + "epoch": 0.1251441412140386, + "grad_norm": 0.08420246094465256, + "learning_rate": 4.390506291547958e-05, + "loss": 0.0219, + "step": 16930 + }, + { + "epoch": 0.12521805978534048, + "grad_norm": 0.11030604690313339, + "learning_rate": 4.390135327635328e-05, + "loss": 0.0221, + "step": 16940 + }, + { + "epoch": 0.12529197835664233, + "grad_norm": 0.08040262013673782, + "learning_rate": 4.389764363722697e-05, + "loss": 0.0184, + "step": 16950 + }, + { + "epoch": 0.12536589692794417, + "grad_norm": 0.07484613358974457, + "learning_rate": 4.3893933998100664e-05, + "loss": 0.0226, + "step": 16960 + }, + { + "epoch": 0.12543981549924604, + "grad_norm": 0.06491480022668839, + "learning_rate": 4.3890224358974367e-05, + "loss": 0.0207, + "step": 16970 + }, + { 
+ "epoch": 0.1255137340705479, + "grad_norm": 0.07864172756671906, + "learning_rate": 4.3886514719848056e-05, + "loss": 0.0184, + "step": 16980 + }, + { + "epoch": 0.12558765264184973, + "grad_norm": 0.11037025600671768, + "learning_rate": 4.388280508072175e-05, + "loss": 0.0184, + "step": 16990 + }, + { + "epoch": 0.12566157121315158, + "grad_norm": 0.0728209838271141, + "learning_rate": 4.387909544159544e-05, + "loss": 0.0201, + "step": 17000 + }, + { + "epoch": 0.12573548978445345, + "grad_norm": 0.10479336977005005, + "learning_rate": 4.387538580246914e-05, + "loss": 0.0212, + "step": 17010 + }, + { + "epoch": 0.1258094083557553, + "grad_norm": 0.09833086282014847, + "learning_rate": 4.387167616334283e-05, + "loss": 0.0204, + "step": 17020 + }, + { + "epoch": 0.12588332692705714, + "grad_norm": 0.07104408740997314, + "learning_rate": 4.386796652421652e-05, + "loss": 0.0212, + "step": 17030 + }, + { + "epoch": 0.12595724549835902, + "grad_norm": 0.0565701425075531, + "learning_rate": 4.386425688509022e-05, + "loss": 0.0209, + "step": 17040 + }, + { + "epoch": 0.12603116406966086, + "grad_norm": 0.08204209804534912, + "learning_rate": 4.3860547245963914e-05, + "loss": 0.0183, + "step": 17050 + }, + { + "epoch": 0.1261050826409627, + "grad_norm": 0.10821973532438278, + "learning_rate": 4.385683760683761e-05, + "loss": 0.0234, + "step": 17060 + }, + { + "epoch": 0.12617900121226458, + "grad_norm": 0.0890052542090416, + "learning_rate": 4.3853127967711306e-05, + "loss": 0.0186, + "step": 17070 + }, + { + "epoch": 0.12625291978356643, + "grad_norm": 0.10424130409955978, + "learning_rate": 4.3849418328584995e-05, + "loss": 0.0216, + "step": 17080 + }, + { + "epoch": 0.12632683835486827, + "grad_norm": 0.0998992845416069, + "learning_rate": 4.384570868945869e-05, + "loss": 0.0228, + "step": 17090 + }, + { + "epoch": 0.12640075692617014, + "grad_norm": 0.11746528744697571, + "learning_rate": 4.384199905033239e-05, + "loss": 0.0189, + "step": 17100 + }, + { + "epoch": 0.126474675497472, + "grad_norm": 0.10497764497995377, + "learning_rate": 4.3838289411206076e-05, + "loss": 0.0188, + "step": 17110 + }, + { + "epoch": 0.12654859406877383, + "grad_norm": 0.07906600832939148, + "learning_rate": 4.383457977207978e-05, + "loss": 0.0191, + "step": 17120 + }, + { + "epoch": 0.12662251264007568, + "grad_norm": 0.09107011556625366, + "learning_rate": 4.383087013295347e-05, + "loss": 0.0178, + "step": 17130 + }, + { + "epoch": 0.12669643121137755, + "grad_norm": 0.10516030341386795, + "learning_rate": 4.3827160493827164e-05, + "loss": 0.0195, + "step": 17140 + }, + { + "epoch": 0.1267703497826794, + "grad_norm": 0.05971836671233177, + "learning_rate": 4.382345085470086e-05, + "loss": 0.0182, + "step": 17150 + }, + { + "epoch": 0.12684426835398124, + "grad_norm": 0.07957801967859268, + "learning_rate": 4.381974121557455e-05, + "loss": 0.021, + "step": 17160 + }, + { + "epoch": 0.12691818692528312, + "grad_norm": 0.0885632112622261, + "learning_rate": 4.3816031576448245e-05, + "loss": 0.0205, + "step": 17170 + }, + { + "epoch": 0.12699210549658496, + "grad_norm": 0.11395470798015594, + "learning_rate": 4.3812321937321934e-05, + "loss": 0.0208, + "step": 17180 + }, + { + "epoch": 0.1270660240678868, + "grad_norm": 0.06711099296808243, + "learning_rate": 4.380861229819563e-05, + "loss": 0.02, + "step": 17190 + }, + { + "epoch": 0.12713994263918868, + "grad_norm": 0.07606938481330872, + "learning_rate": 4.380490265906933e-05, + "loss": 0.0246, + "step": 17200 + }, + { + "epoch": 0.12721386121049053, + 
"grad_norm": 0.08802618086338043, + "learning_rate": 4.380119301994302e-05, + "loss": 0.0192, + "step": 17210 + }, + { + "epoch": 0.12728777978179237, + "grad_norm": 0.10931966453790665, + "learning_rate": 4.379748338081672e-05, + "loss": 0.0219, + "step": 17220 + }, + { + "epoch": 0.12736169835309424, + "grad_norm": 0.0964532196521759, + "learning_rate": 4.379377374169041e-05, + "loss": 0.0203, + "step": 17230 + }, + { + "epoch": 0.1274356169243961, + "grad_norm": 0.07935398072004318, + "learning_rate": 4.3790064102564103e-05, + "loss": 0.0168, + "step": 17240 + }, + { + "epoch": 0.12750953549569793, + "grad_norm": 0.0767403393983841, + "learning_rate": 4.37863544634378e-05, + "loss": 0.0209, + "step": 17250 + }, + { + "epoch": 0.12758345406699978, + "grad_norm": 0.10330316424369812, + "learning_rate": 4.378264482431149e-05, + "loss": 0.0215, + "step": 17260 + }, + { + "epoch": 0.12765737263830165, + "grad_norm": 0.09282982349395752, + "learning_rate": 4.377893518518519e-05, + "loss": 0.0228, + "step": 17270 + }, + { + "epoch": 0.1277312912096035, + "grad_norm": 0.07249481230974197, + "learning_rate": 4.377522554605888e-05, + "loss": 0.019, + "step": 17280 + }, + { + "epoch": 0.12780520978090534, + "grad_norm": 0.06935325264930725, + "learning_rate": 4.3771515906932576e-05, + "loss": 0.0208, + "step": 17290 + }, + { + "epoch": 0.12787912835220722, + "grad_norm": 0.0685378834605217, + "learning_rate": 4.376780626780627e-05, + "loss": 0.0202, + "step": 17300 + }, + { + "epoch": 0.12795304692350906, + "grad_norm": 0.11913516372442245, + "learning_rate": 4.376409662867996e-05, + "loss": 0.0209, + "step": 17310 + }, + { + "epoch": 0.1280269654948109, + "grad_norm": 0.10438575595617294, + "learning_rate": 4.376038698955366e-05, + "loss": 0.0213, + "step": 17320 + }, + { + "epoch": 0.12810088406611278, + "grad_norm": 0.12262512743473053, + "learning_rate": 4.3756677350427354e-05, + "loss": 0.0217, + "step": 17330 + }, + { + "epoch": 0.12817480263741463, + "grad_norm": 0.16375628113746643, + "learning_rate": 4.375296771130104e-05, + "loss": 0.0196, + "step": 17340 + }, + { + "epoch": 0.12824872120871647, + "grad_norm": 0.08159057796001434, + "learning_rate": 4.3749258072174746e-05, + "loss": 0.0228, + "step": 17350 + }, + { + "epoch": 0.12832263978001834, + "grad_norm": 0.08955547213554382, + "learning_rate": 4.3745548433048435e-05, + "loss": 0.02, + "step": 17360 + }, + { + "epoch": 0.1283965583513202, + "grad_norm": 0.09596460312604904, + "learning_rate": 4.374183879392213e-05, + "loss": 0.0193, + "step": 17370 + }, + { + "epoch": 0.12847047692262203, + "grad_norm": 0.1025465801358223, + "learning_rate": 4.373812915479583e-05, + "loss": 0.0218, + "step": 17380 + }, + { + "epoch": 0.12854439549392388, + "grad_norm": 0.06650226563215256, + "learning_rate": 4.3734419515669516e-05, + "loss": 0.0191, + "step": 17390 + }, + { + "epoch": 0.12861831406522575, + "grad_norm": 0.11879001557826996, + "learning_rate": 4.373070987654321e-05, + "loss": 0.0214, + "step": 17400 + }, + { + "epoch": 0.1286922326365276, + "grad_norm": 0.10580756515264511, + "learning_rate": 4.37270002374169e-05, + "loss": 0.0224, + "step": 17410 + }, + { + "epoch": 0.12876615120782944, + "grad_norm": 0.1196075975894928, + "learning_rate": 4.3723290598290604e-05, + "loss": 0.0201, + "step": 17420 + }, + { + "epoch": 0.12884006977913132, + "grad_norm": 0.11283430457115173, + "learning_rate": 4.37195809591643e-05, + "loss": 0.0214, + "step": 17430 + }, + { + "epoch": 0.12891398835043316, + "grad_norm": 0.0617198720574379, + 
"learning_rate": 4.371587132003799e-05, + "loss": 0.0199, + "step": 17440 + }, + { + "epoch": 0.128987906921735, + "grad_norm": 0.0883866548538208, + "learning_rate": 4.3712161680911685e-05, + "loss": 0.0195, + "step": 17450 + }, + { + "epoch": 0.12906182549303688, + "grad_norm": 0.10958422720432281, + "learning_rate": 4.3708452041785374e-05, + "loss": 0.0211, + "step": 17460 + }, + { + "epoch": 0.12913574406433873, + "grad_norm": 0.08933533728122711, + "learning_rate": 4.370474240265907e-05, + "loss": 0.0221, + "step": 17470 + }, + { + "epoch": 0.12920966263564057, + "grad_norm": 0.12817147374153137, + "learning_rate": 4.3701032763532766e-05, + "loss": 0.0227, + "step": 17480 + }, + { + "epoch": 0.12928358120694244, + "grad_norm": 0.1111520379781723, + "learning_rate": 4.3697323124406455e-05, + "loss": 0.0197, + "step": 17490 + }, + { + "epoch": 0.1293574997782443, + "grad_norm": 0.07977577298879623, + "learning_rate": 4.369361348528016e-05, + "loss": 0.0199, + "step": 17500 + }, + { + "epoch": 0.12943141834954613, + "grad_norm": 0.09112214297056198, + "learning_rate": 4.368990384615385e-05, + "loss": 0.0203, + "step": 17510 + }, + { + "epoch": 0.129505336920848, + "grad_norm": 0.11383547633886337, + "learning_rate": 4.368619420702754e-05, + "loss": 0.0197, + "step": 17520 + }, + { + "epoch": 0.12957925549214985, + "grad_norm": 0.11252755671739578, + "learning_rate": 4.368248456790124e-05, + "loss": 0.0224, + "step": 17530 + }, + { + "epoch": 0.1296531740634517, + "grad_norm": 0.10879248380661011, + "learning_rate": 4.367877492877493e-05, + "loss": 0.0235, + "step": 17540 + }, + { + "epoch": 0.12972709263475354, + "grad_norm": 0.10132326930761337, + "learning_rate": 4.3675065289648624e-05, + "loss": 0.0186, + "step": 17550 + }, + { + "epoch": 0.12980101120605542, + "grad_norm": 0.10024122148752213, + "learning_rate": 4.367135565052232e-05, + "loss": 0.0201, + "step": 17560 + }, + { + "epoch": 0.12987492977735726, + "grad_norm": 0.11927720159292221, + "learning_rate": 4.3667646011396016e-05, + "loss": 0.0206, + "step": 17570 + }, + { + "epoch": 0.1299488483486591, + "grad_norm": 0.10507655143737793, + "learning_rate": 4.366393637226971e-05, + "loss": 0.0187, + "step": 17580 + }, + { + "epoch": 0.13002276691996098, + "grad_norm": 0.13051213324069977, + "learning_rate": 4.36602267331434e-05, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 0.13009668549126283, + "grad_norm": 0.09864769130945206, + "learning_rate": 4.36565170940171e-05, + "loss": 0.0231, + "step": 17600 + }, + { + "epoch": 0.13017060406256467, + "grad_norm": 0.12273035198450089, + "learning_rate": 4.365280745489079e-05, + "loss": 0.0194, + "step": 17610 + }, + { + "epoch": 0.13024452263386654, + "grad_norm": 0.08343586325645447, + "learning_rate": 4.364909781576448e-05, + "loss": 0.02, + "step": 17620 + }, + { + "epoch": 0.1303184412051684, + "grad_norm": 0.09308631718158722, + "learning_rate": 4.364538817663818e-05, + "loss": 0.0161, + "step": 17630 + }, + { + "epoch": 0.13039235977647023, + "grad_norm": 0.1017531156539917, + "learning_rate": 4.364167853751187e-05, + "loss": 0.0191, + "step": 17640 + }, + { + "epoch": 0.1304662783477721, + "grad_norm": 0.11994847655296326, + "learning_rate": 4.363796889838557e-05, + "loss": 0.0206, + "step": 17650 + }, + { + "epoch": 0.13054019691907395, + "grad_norm": 0.0923483669757843, + "learning_rate": 4.3634259259259266e-05, + "loss": 0.0199, + "step": 17660 + }, + { + "epoch": 0.1306141154903758, + "grad_norm": 0.08812946081161499, + "learning_rate": 4.3630549620132955e-05, + 
"loss": 0.0198, + "step": 17670 + }, + { + "epoch": 0.13068803406167764, + "grad_norm": 0.09632091224193573, + "learning_rate": 4.362683998100665e-05, + "loss": 0.0199, + "step": 17680 + }, + { + "epoch": 0.13076195263297952, + "grad_norm": 0.07714105397462845, + "learning_rate": 4.362313034188034e-05, + "loss": 0.0183, + "step": 17690 + }, + { + "epoch": 0.13083587120428136, + "grad_norm": 0.10829984396696091, + "learning_rate": 4.3619420702754037e-05, + "loss": 0.02, + "step": 17700 + }, + { + "epoch": 0.1309097897755832, + "grad_norm": 0.10526656359434128, + "learning_rate": 4.361571106362773e-05, + "loss": 0.0195, + "step": 17710 + }, + { + "epoch": 0.13098370834688508, + "grad_norm": 0.11419089138507843, + "learning_rate": 4.361200142450143e-05, + "loss": 0.0182, + "step": 17720 + }, + { + "epoch": 0.13105762691818693, + "grad_norm": 0.07258699089288712, + "learning_rate": 4.3608291785375125e-05, + "loss": 0.0193, + "step": 17730 + }, + { + "epoch": 0.13113154548948877, + "grad_norm": 0.1357620507478714, + "learning_rate": 4.3604582146248814e-05, + "loss": 0.0187, + "step": 17740 + }, + { + "epoch": 0.13120546406079064, + "grad_norm": 0.0952298641204834, + "learning_rate": 4.360087250712251e-05, + "loss": 0.0223, + "step": 17750 + }, + { + "epoch": 0.1312793826320925, + "grad_norm": 0.08755937218666077, + "learning_rate": 4.3597162867996206e-05, + "loss": 0.0214, + "step": 17760 + }, + { + "epoch": 0.13135330120339433, + "grad_norm": 0.10102654248476028, + "learning_rate": 4.3593453228869895e-05, + "loss": 0.0181, + "step": 17770 + }, + { + "epoch": 0.1314272197746962, + "grad_norm": 0.0867420956492424, + "learning_rate": 4.358974358974359e-05, + "loss": 0.0215, + "step": 17780 + }, + { + "epoch": 0.13150113834599805, + "grad_norm": 0.1367752104997635, + "learning_rate": 4.358603395061729e-05, + "loss": 0.0219, + "step": 17790 + }, + { + "epoch": 0.1315750569172999, + "grad_norm": 0.09189877659082413, + "learning_rate": 4.358232431149098e-05, + "loss": 0.0206, + "step": 17800 + }, + { + "epoch": 0.13164897548860174, + "grad_norm": 0.07793890684843063, + "learning_rate": 4.357861467236468e-05, + "loss": 0.0196, + "step": 17810 + }, + { + "epoch": 0.13172289405990362, + "grad_norm": 0.10152376443147659, + "learning_rate": 4.357490503323837e-05, + "loss": 0.0193, + "step": 17820 + }, + { + "epoch": 0.13179681263120546, + "grad_norm": 0.10715579241514206, + "learning_rate": 4.3571195394112064e-05, + "loss": 0.0221, + "step": 17830 + }, + { + "epoch": 0.1318707312025073, + "grad_norm": 0.07988528907299042, + "learning_rate": 4.356748575498576e-05, + "loss": 0.0172, + "step": 17840 + }, + { + "epoch": 0.13194464977380918, + "grad_norm": 0.1156516894698143, + "learning_rate": 4.356377611585945e-05, + "loss": 0.0201, + "step": 17850 + }, + { + "epoch": 0.13201856834511103, + "grad_norm": 0.07987456023693085, + "learning_rate": 4.3560066476733145e-05, + "loss": 0.0212, + "step": 17860 + }, + { + "epoch": 0.13209248691641287, + "grad_norm": 0.1036411002278328, + "learning_rate": 4.355635683760684e-05, + "loss": 0.0184, + "step": 17870 + }, + { + "epoch": 0.13216640548771474, + "grad_norm": 0.06169649586081505, + "learning_rate": 4.355264719848054e-05, + "loss": 0.0213, + "step": 17880 + }, + { + "epoch": 0.1322403240590166, + "grad_norm": 0.0923336073756218, + "learning_rate": 4.354893755935423e-05, + "loss": 0.0193, + "step": 17890 + }, + { + "epoch": 0.13231424263031843, + "grad_norm": 0.09562145173549652, + "learning_rate": 4.354522792022792e-05, + "loss": 0.0225, + "step": 17900 + }, + { + 
"epoch": 0.1323881612016203, + "grad_norm": 0.09590861946344376, + "learning_rate": 4.354151828110162e-05, + "loss": 0.0205, + "step": 17910 + }, + { + "epoch": 0.13246207977292215, + "grad_norm": 0.1031101867556572, + "learning_rate": 4.353780864197531e-05, + "loss": 0.0195, + "step": 17920 + }, + { + "epoch": 0.132535998344224, + "grad_norm": 0.08585697412490845, + "learning_rate": 4.3534099002849e-05, + "loss": 0.0206, + "step": 17930 + }, + { + "epoch": 0.13260991691552584, + "grad_norm": 0.08860399574041367, + "learning_rate": 4.35303893637227e-05, + "loss": 0.0194, + "step": 17940 + }, + { + "epoch": 0.13268383548682772, + "grad_norm": 0.1192474216222763, + "learning_rate": 4.3526679724596395e-05, + "loss": 0.0201, + "step": 17950 + }, + { + "epoch": 0.13275775405812956, + "grad_norm": 0.08405131101608276, + "learning_rate": 4.352297008547009e-05, + "loss": 0.0195, + "step": 17960 + }, + { + "epoch": 0.1328316726294314, + "grad_norm": 0.07257207483053207, + "learning_rate": 4.351926044634378e-05, + "loss": 0.0164, + "step": 17970 + }, + { + "epoch": 0.13290559120073328, + "grad_norm": 0.08803092688322067, + "learning_rate": 4.3515550807217476e-05, + "loss": 0.0187, + "step": 17980 + }, + { + "epoch": 0.13297950977203513, + "grad_norm": 0.092405766248703, + "learning_rate": 4.351184116809117e-05, + "loss": 0.0187, + "step": 17990 + }, + { + "epoch": 0.13305342834333697, + "grad_norm": 0.08581379801034927, + "learning_rate": 4.350813152896486e-05, + "loss": 0.0199, + "step": 18000 + }, + { + "epoch": 0.13312734691463884, + "grad_norm": 0.10120215266942978, + "learning_rate": 4.350442188983856e-05, + "loss": 0.0219, + "step": 18010 + }, + { + "epoch": 0.1332012654859407, + "grad_norm": 0.0820140391588211, + "learning_rate": 4.350071225071225e-05, + "loss": 0.0212, + "step": 18020 + }, + { + "epoch": 0.13327518405724254, + "grad_norm": 0.10618920624256134, + "learning_rate": 4.349700261158595e-05, + "loss": 0.021, + "step": 18030 + }, + { + "epoch": 0.1333491026285444, + "grad_norm": 0.0975532978773117, + "learning_rate": 4.3493292972459645e-05, + "loss": 0.0206, + "step": 18040 + }, + { + "epoch": 0.13342302119984625, + "grad_norm": 0.09257347136735916, + "learning_rate": 4.3489583333333334e-05, + "loss": 0.0208, + "step": 18050 + }, + { + "epoch": 0.1334969397711481, + "grad_norm": 0.09179476648569107, + "learning_rate": 4.348587369420703e-05, + "loss": 0.0208, + "step": 18060 + }, + { + "epoch": 0.13357085834244994, + "grad_norm": 0.09116066247224808, + "learning_rate": 4.3482164055080726e-05, + "loss": 0.0197, + "step": 18070 + }, + { + "epoch": 0.13364477691375182, + "grad_norm": 0.07741699367761612, + "learning_rate": 4.3478454415954416e-05, + "loss": 0.0204, + "step": 18080 + }, + { + "epoch": 0.13371869548505366, + "grad_norm": 0.08600520342588425, + "learning_rate": 4.347474477682811e-05, + "loss": 0.0197, + "step": 18090 + }, + { + "epoch": 0.1337926140563555, + "grad_norm": 0.07641606777906418, + "learning_rate": 4.347103513770181e-05, + "loss": 0.021, + "step": 18100 + }, + { + "epoch": 0.13386653262765738, + "grad_norm": 0.12952347099781036, + "learning_rate": 4.3467325498575503e-05, + "loss": 0.0187, + "step": 18110 + }, + { + "epoch": 0.13394045119895923, + "grad_norm": 0.08709315955638885, + "learning_rate": 4.34636158594492e-05, + "loss": 0.0187, + "step": 18120 + }, + { + "epoch": 0.13401436977026107, + "grad_norm": 0.0985182449221611, + "learning_rate": 4.345990622032289e-05, + "loss": 0.0218, + "step": 18130 + }, + { + "epoch": 0.13408828834156294, + "grad_norm": 
0.0989292562007904, + "learning_rate": 4.3456196581196585e-05, + "loss": 0.0198, + "step": 18140 + }, + { + "epoch": 0.1341622069128648, + "grad_norm": 0.10038740187883377, + "learning_rate": 4.3452486942070274e-05, + "loss": 0.0213, + "step": 18150 + }, + { + "epoch": 0.13423612548416664, + "grad_norm": 0.08420637995004654, + "learning_rate": 4.344877730294397e-05, + "loss": 0.0227, + "step": 18160 + }, + { + "epoch": 0.1343100440554685, + "grad_norm": 0.09484747052192688, + "learning_rate": 4.3445067663817666e-05, + "loss": 0.0206, + "step": 18170 + }, + { + "epoch": 0.13438396262677035, + "grad_norm": 0.09223677217960358, + "learning_rate": 4.344135802469136e-05, + "loss": 0.0216, + "step": 18180 + }, + { + "epoch": 0.1344578811980722, + "grad_norm": 0.06965033710002899, + "learning_rate": 4.343764838556506e-05, + "loss": 0.0207, + "step": 18190 + }, + { + "epoch": 0.13453179976937404, + "grad_norm": 0.06636201590299606, + "learning_rate": 4.343393874643875e-05, + "loss": 0.0195, + "step": 18200 + }, + { + "epoch": 0.13460571834067592, + "grad_norm": 0.07372688502073288, + "learning_rate": 4.343022910731244e-05, + "loss": 0.0224, + "step": 18210 + }, + { + "epoch": 0.13467963691197776, + "grad_norm": 0.08220159262418747, + "learning_rate": 4.342651946818614e-05, + "loss": 0.0187, + "step": 18220 + }, + { + "epoch": 0.1347535554832796, + "grad_norm": 0.09175535291433334, + "learning_rate": 4.342280982905983e-05, + "loss": 0.0198, + "step": 18230 + }, + { + "epoch": 0.13482747405458148, + "grad_norm": 0.09849115461111069, + "learning_rate": 4.3419100189933524e-05, + "loss": 0.0188, + "step": 18240 + }, + { + "epoch": 0.13490139262588333, + "grad_norm": 0.11270340532064438, + "learning_rate": 4.341539055080722e-05, + "loss": 0.0188, + "step": 18250 + }, + { + "epoch": 0.13497531119718517, + "grad_norm": 0.0937158614397049, + "learning_rate": 4.3411680911680916e-05, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 0.13504922976848704, + "grad_norm": 0.09184368699789047, + "learning_rate": 4.340797127255461e-05, + "loss": 0.021, + "step": 18270 + }, + { + "epoch": 0.1351231483397889, + "grad_norm": 0.1325238049030304, + "learning_rate": 4.34042616334283e-05, + "loss": 0.0221, + "step": 18280 + }, + { + "epoch": 0.13519706691109074, + "grad_norm": 0.10193447023630142, + "learning_rate": 4.3400551994302e-05, + "loss": 0.0203, + "step": 18290 + }, + { + "epoch": 0.1352709854823926, + "grad_norm": 0.06822310388088226, + "learning_rate": 4.339684235517569e-05, + "loss": 0.0199, + "step": 18300 + }, + { + "epoch": 0.13534490405369445, + "grad_norm": 0.08294135332107544, + "learning_rate": 4.339313271604938e-05, + "loss": 0.0167, + "step": 18310 + }, + { + "epoch": 0.1354188226249963, + "grad_norm": 0.08948735147714615, + "learning_rate": 4.338942307692308e-05, + "loss": 0.019, + "step": 18320 + }, + { + "epoch": 0.13549274119629814, + "grad_norm": 0.09147325903177261, + "learning_rate": 4.3385713437796774e-05, + "loss": 0.0197, + "step": 18330 + }, + { + "epoch": 0.13556665976760002, + "grad_norm": 0.08658456802368164, + "learning_rate": 4.338200379867047e-05, + "loss": 0.0205, + "step": 18340 + }, + { + "epoch": 0.13564057833890186, + "grad_norm": 0.10264665633440018, + "learning_rate": 4.3378294159544166e-05, + "loss": 0.0192, + "step": 18350 + }, + { + "epoch": 0.1357144969102037, + "grad_norm": 0.07289276272058487, + "learning_rate": 4.3374584520417855e-05, + "loss": 0.0202, + "step": 18360 + }, + { + "epoch": 0.13578841548150558, + "grad_norm": 0.08398177474737167, + "learning_rate": 
4.337087488129155e-05, + "loss": 0.0212, + "step": 18370 + }, + { + "epoch": 0.13586233405280743, + "grad_norm": 0.08534097671508789, + "learning_rate": 4.336716524216524e-05, + "loss": 0.0203, + "step": 18380 + }, + { + "epoch": 0.13593625262410927, + "grad_norm": 0.08274282515048981, + "learning_rate": 4.3363455603038936e-05, + "loss": 0.0203, + "step": 18390 + }, + { + "epoch": 0.13601017119541114, + "grad_norm": 0.10768471658229828, + "learning_rate": 4.335974596391263e-05, + "loss": 0.0192, + "step": 18400 + }, + { + "epoch": 0.136084089766713, + "grad_norm": 0.10010208189487457, + "learning_rate": 4.335603632478633e-05, + "loss": 0.0218, + "step": 18410 + }, + { + "epoch": 0.13615800833801484, + "grad_norm": 0.10978821665048599, + "learning_rate": 4.3352326685660024e-05, + "loss": 0.02, + "step": 18420 + }, + { + "epoch": 0.1362319269093167, + "grad_norm": 0.09286827594041824, + "learning_rate": 4.3348617046533713e-05, + "loss": 0.0222, + "step": 18430 + }, + { + "epoch": 0.13630584548061855, + "grad_norm": 0.10667553544044495, + "learning_rate": 4.334490740740741e-05, + "loss": 0.0192, + "step": 18440 + }, + { + "epoch": 0.1363797640519204, + "grad_norm": 0.0845090001821518, + "learning_rate": 4.3341197768281105e-05, + "loss": 0.0198, + "step": 18450 + }, + { + "epoch": 0.13645368262322227, + "grad_norm": 0.09107456356287003, + "learning_rate": 4.3337488129154795e-05, + "loss": 0.0229, + "step": 18460 + }, + { + "epoch": 0.13652760119452412, + "grad_norm": 0.11485371738672256, + "learning_rate": 4.333377849002849e-05, + "loss": 0.0209, + "step": 18470 + }, + { + "epoch": 0.13660151976582596, + "grad_norm": 0.09362673759460449, + "learning_rate": 4.3330068850902187e-05, + "loss": 0.0209, + "step": 18480 + }, + { + "epoch": 0.1366754383371278, + "grad_norm": 0.08930040150880814, + "learning_rate": 4.332635921177588e-05, + "loss": 0.0216, + "step": 18490 + }, + { + "epoch": 0.13674935690842968, + "grad_norm": 0.08274761587381363, + "learning_rate": 4.332264957264958e-05, + "loss": 0.0184, + "step": 18500 + }, + { + "epoch": 0.13682327547973153, + "grad_norm": 0.10678842663764954, + "learning_rate": 4.331893993352327e-05, + "loss": 0.0188, + "step": 18510 + }, + { + "epoch": 0.13689719405103337, + "grad_norm": 0.08956782519817352, + "learning_rate": 4.3315230294396964e-05, + "loss": 0.0199, + "step": 18520 + }, + { + "epoch": 0.13697111262233524, + "grad_norm": 0.10181953758001328, + "learning_rate": 4.331152065527066e-05, + "loss": 0.0196, + "step": 18530 + }, + { + "epoch": 0.1370450311936371, + "grad_norm": 0.09772741794586182, + "learning_rate": 4.330781101614435e-05, + "loss": 0.0195, + "step": 18540 + }, + { + "epoch": 0.13711894976493894, + "grad_norm": 0.08765088766813278, + "learning_rate": 4.3304101377018045e-05, + "loss": 0.0191, + "step": 18550 + }, + { + "epoch": 0.1371928683362408, + "grad_norm": 0.10872490704059601, + "learning_rate": 4.330039173789174e-05, + "loss": 0.0187, + "step": 18560 + }, + { + "epoch": 0.13726678690754265, + "grad_norm": 0.07029812783002853, + "learning_rate": 4.329668209876544e-05, + "loss": 0.0189, + "step": 18570 + }, + { + "epoch": 0.1373407054788445, + "grad_norm": 0.09874571114778519, + "learning_rate": 4.329297245963913e-05, + "loss": 0.0193, + "step": 18580 + }, + { + "epoch": 0.13741462405014637, + "grad_norm": 0.10533016175031662, + "learning_rate": 4.328926282051282e-05, + "loss": 0.0181, + "step": 18590 + }, + { + "epoch": 0.13748854262144822, + "grad_norm": 0.08398744463920593, + "learning_rate": 4.328555318138652e-05, + "loss": 
0.0188, + "step": 18600 + }, + { + "epoch": 0.13756246119275006, + "grad_norm": 0.08623006194829941, + "learning_rate": 4.328184354226021e-05, + "loss": 0.0188, + "step": 18610 + }, + { + "epoch": 0.1376363797640519, + "grad_norm": 0.1085953414440155, + "learning_rate": 4.32781339031339e-05, + "loss": 0.0207, + "step": 18620 + }, + { + "epoch": 0.13771029833535378, + "grad_norm": 0.10261372476816177, + "learning_rate": 4.32744242640076e-05, + "loss": 0.0183, + "step": 18630 + }, + { + "epoch": 0.13778421690665563, + "grad_norm": 0.07687580585479736, + "learning_rate": 4.3270714624881295e-05, + "loss": 0.02, + "step": 18640 + }, + { + "epoch": 0.13785813547795747, + "grad_norm": 0.08356168121099472, + "learning_rate": 4.326700498575499e-05, + "loss": 0.0196, + "step": 18650 + }, + { + "epoch": 0.13793205404925934, + "grad_norm": 0.07929971069097519, + "learning_rate": 4.326329534662868e-05, + "loss": 0.0215, + "step": 18660 + }, + { + "epoch": 0.1380059726205612, + "grad_norm": 0.12214069813489914, + "learning_rate": 4.3259585707502376e-05, + "loss": 0.0206, + "step": 18670 + }, + { + "epoch": 0.13807989119186304, + "grad_norm": 0.0971401110291481, + "learning_rate": 4.325587606837607e-05, + "loss": 0.0173, + "step": 18680 + }, + { + "epoch": 0.1381538097631649, + "grad_norm": 0.1067282035946846, + "learning_rate": 4.325216642924976e-05, + "loss": 0.0218, + "step": 18690 + }, + { + "epoch": 0.13822772833446675, + "grad_norm": 0.08311453461647034, + "learning_rate": 4.324845679012346e-05, + "loss": 0.0189, + "step": 18700 + }, + { + "epoch": 0.1383016469057686, + "grad_norm": 0.07841376960277557, + "learning_rate": 4.324474715099715e-05, + "loss": 0.0196, + "step": 18710 + }, + { + "epoch": 0.13837556547707047, + "grad_norm": 0.11007623374462128, + "learning_rate": 4.324103751187085e-05, + "loss": 0.0233, + "step": 18720 + }, + { + "epoch": 0.13844948404837232, + "grad_norm": 0.06940075010061264, + "learning_rate": 4.3237327872744545e-05, + "loss": 0.0199, + "step": 18730 + }, + { + "epoch": 0.13852340261967416, + "grad_norm": 0.0829441025853157, + "learning_rate": 4.3233618233618234e-05, + "loss": 0.0202, + "step": 18740 + }, + { + "epoch": 0.138597321190976, + "grad_norm": 0.11176992207765579, + "learning_rate": 4.322990859449193e-05, + "loss": 0.0201, + "step": 18750 + }, + { + "epoch": 0.13867123976227788, + "grad_norm": 0.08433730155229568, + "learning_rate": 4.3226198955365626e-05, + "loss": 0.0193, + "step": 18760 + }, + { + "epoch": 0.13874515833357973, + "grad_norm": 0.10842785239219666, + "learning_rate": 4.3222489316239315e-05, + "loss": 0.0216, + "step": 18770 + }, + { + "epoch": 0.13881907690488157, + "grad_norm": 0.12139491736888885, + "learning_rate": 4.321877967711301e-05, + "loss": 0.0216, + "step": 18780 + }, + { + "epoch": 0.13889299547618345, + "grad_norm": 0.07391542941331863, + "learning_rate": 4.321507003798671e-05, + "loss": 0.0217, + "step": 18790 + }, + { + "epoch": 0.1389669140474853, + "grad_norm": 0.10303296893835068, + "learning_rate": 4.32113603988604e-05, + "loss": 0.0209, + "step": 18800 + }, + { + "epoch": 0.13904083261878714, + "grad_norm": 0.09348388016223907, + "learning_rate": 4.32076507597341e-05, + "loss": 0.0225, + "step": 18810 + }, + { + "epoch": 0.139114751190089, + "grad_norm": 0.10517199337482452, + "learning_rate": 4.320394112060779e-05, + "loss": 0.0204, + "step": 18820 + }, + { + "epoch": 0.13918866976139085, + "grad_norm": 0.0827675312757492, + "learning_rate": 4.3200231481481484e-05, + "loss": 0.0202, + "step": 18830 + }, + { + "epoch": 
0.1392625883326927, + "grad_norm": 0.14683835208415985, + "learning_rate": 4.3196521842355174e-05, + "loss": 0.0215, + "step": 18840 + }, + { + "epoch": 0.13933650690399457, + "grad_norm": 0.08116668462753296, + "learning_rate": 4.319281220322887e-05, + "loss": 0.0224, + "step": 18850 + }, + { + "epoch": 0.13941042547529642, + "grad_norm": 0.08344514667987823, + "learning_rate": 4.3189102564102565e-05, + "loss": 0.02, + "step": 18860 + }, + { + "epoch": 0.13948434404659826, + "grad_norm": 0.0966254398226738, + "learning_rate": 4.318539292497626e-05, + "loss": 0.0234, + "step": 18870 + }, + { + "epoch": 0.1395582626179001, + "grad_norm": 0.0772177055478096, + "learning_rate": 4.318168328584996e-05, + "loss": 0.0206, + "step": 18880 + }, + { + "epoch": 0.13963218118920198, + "grad_norm": 0.07024878263473511, + "learning_rate": 4.317797364672365e-05, + "loss": 0.0211, + "step": 18890 + }, + { + "epoch": 0.13970609976050383, + "grad_norm": 0.06805914640426636, + "learning_rate": 4.317426400759734e-05, + "loss": 0.0185, + "step": 18900 + }, + { + "epoch": 0.13978001833180567, + "grad_norm": 0.08049175888299942, + "learning_rate": 4.317055436847104e-05, + "loss": 0.0188, + "step": 18910 + }, + { + "epoch": 0.13985393690310755, + "grad_norm": 0.053536444902420044, + "learning_rate": 4.316684472934473e-05, + "loss": 0.0217, + "step": 18920 + }, + { + "epoch": 0.1399278554744094, + "grad_norm": 0.08745238184928894, + "learning_rate": 4.3163135090218424e-05, + "loss": 0.0217, + "step": 18930 + }, + { + "epoch": 0.14000177404571124, + "grad_norm": 0.11072596907615662, + "learning_rate": 4.315942545109212e-05, + "loss": 0.0201, + "step": 18940 + }, + { + "epoch": 0.1400756926170131, + "grad_norm": 0.09789971262216568, + "learning_rate": 4.3155715811965816e-05, + "loss": 0.0218, + "step": 18950 + }, + { + "epoch": 0.14014961118831495, + "grad_norm": 0.10173800587654114, + "learning_rate": 4.315200617283951e-05, + "loss": 0.0204, + "step": 18960 + }, + { + "epoch": 0.1402235297596168, + "grad_norm": 0.10012296587228775, + "learning_rate": 4.31482965337132e-05, + "loss": 0.0211, + "step": 18970 + }, + { + "epoch": 0.14029744833091867, + "grad_norm": 0.10211526602506638, + "learning_rate": 4.31445868945869e-05, + "loss": 0.0187, + "step": 18980 + }, + { + "epoch": 0.14037136690222052, + "grad_norm": 0.09464067220687866, + "learning_rate": 4.314087725546059e-05, + "loss": 0.0203, + "step": 18990 + }, + { + "epoch": 0.14044528547352236, + "grad_norm": 0.07152500003576279, + "learning_rate": 4.313716761633428e-05, + "loss": 0.0211, + "step": 19000 + }, + { + "epoch": 0.1405192040448242, + "grad_norm": 0.09147216379642487, + "learning_rate": 4.313345797720798e-05, + "loss": 0.0221, + "step": 19010 + }, + { + "epoch": 0.14059312261612608, + "grad_norm": 0.08784119039773941, + "learning_rate": 4.3129748338081674e-05, + "loss": 0.0181, + "step": 19020 + }, + { + "epoch": 0.14066704118742793, + "grad_norm": 0.08497309684753418, + "learning_rate": 4.312603869895537e-05, + "loss": 0.0179, + "step": 19030 + }, + { + "epoch": 0.14074095975872977, + "grad_norm": 0.11955258995294571, + "learning_rate": 4.3122329059829066e-05, + "loss": 0.021, + "step": 19040 + }, + { + "epoch": 0.14081487833003165, + "grad_norm": 0.0956282839179039, + "learning_rate": 4.3118619420702755e-05, + "loss": 0.0197, + "step": 19050 + }, + { + "epoch": 0.1408887969013335, + "grad_norm": 0.10729701071977615, + "learning_rate": 4.311490978157645e-05, + "loss": 0.0188, + "step": 19060 + }, + { + "epoch": 0.14096271547263534, + "grad_norm": 
0.11408047378063202, + "learning_rate": 4.311120014245014e-05, + "loss": 0.0211, + "step": 19070 + }, + { + "epoch": 0.1410366340439372, + "grad_norm": 0.0880783423781395, + "learning_rate": 4.3107490503323836e-05, + "loss": 0.0213, + "step": 19080 + }, + { + "epoch": 0.14111055261523905, + "grad_norm": 0.10292744636535645, + "learning_rate": 4.310378086419753e-05, + "loss": 0.0226, + "step": 19090 + }, + { + "epoch": 0.1411844711865409, + "grad_norm": 0.0908803939819336, + "learning_rate": 4.310007122507123e-05, + "loss": 0.0207, + "step": 19100 + }, + { + "epoch": 0.14125838975784277, + "grad_norm": 0.11729007959365845, + "learning_rate": 4.3096361585944924e-05, + "loss": 0.0225, + "step": 19110 + }, + { + "epoch": 0.14133230832914462, + "grad_norm": 0.07894833385944366, + "learning_rate": 4.309265194681861e-05, + "loss": 0.0179, + "step": 19120 + }, + { + "epoch": 0.14140622690044646, + "grad_norm": 0.10907064378261566, + "learning_rate": 4.308894230769231e-05, + "loss": 0.0185, + "step": 19130 + }, + { + "epoch": 0.1414801454717483, + "grad_norm": 0.09175430983304977, + "learning_rate": 4.3085232668566005e-05, + "loss": 0.0206, + "step": 19140 + }, + { + "epoch": 0.14155406404305018, + "grad_norm": 0.08137310296297073, + "learning_rate": 4.3081523029439694e-05, + "loss": 0.0202, + "step": 19150 + }, + { + "epoch": 0.14162798261435203, + "grad_norm": 0.0989503413438797, + "learning_rate": 4.307781339031339e-05, + "loss": 0.0186, + "step": 19160 + }, + { + "epoch": 0.14170190118565387, + "grad_norm": 0.09327124804258347, + "learning_rate": 4.3074103751187086e-05, + "loss": 0.0192, + "step": 19170 + }, + { + "epoch": 0.14177581975695575, + "grad_norm": 0.08980970829725266, + "learning_rate": 4.307039411206078e-05, + "loss": 0.0194, + "step": 19180 + }, + { + "epoch": 0.1418497383282576, + "grad_norm": 0.10795049369335175, + "learning_rate": 4.306668447293448e-05, + "loss": 0.0208, + "step": 19190 + }, + { + "epoch": 0.14192365689955944, + "grad_norm": 0.06910710781812668, + "learning_rate": 4.306297483380817e-05, + "loss": 0.0211, + "step": 19200 + }, + { + "epoch": 0.1419975754708613, + "grad_norm": 0.08890750259160995, + "learning_rate": 4.305926519468186e-05, + "loss": 0.0196, + "step": 19210 + }, + { + "epoch": 0.14207149404216315, + "grad_norm": 0.09042514115571976, + "learning_rate": 4.305555555555556e-05, + "loss": 0.0179, + "step": 19220 + }, + { + "epoch": 0.142145412613465, + "grad_norm": 0.0995037779211998, + "learning_rate": 4.305184591642925e-05, + "loss": 0.0191, + "step": 19230 + }, + { + "epoch": 0.14221933118476687, + "grad_norm": 0.07466725260019302, + "learning_rate": 4.3048136277302944e-05, + "loss": 0.0184, + "step": 19240 + }, + { + "epoch": 0.14229324975606872, + "grad_norm": 0.11802132427692413, + "learning_rate": 4.304442663817664e-05, + "loss": 0.0227, + "step": 19250 + }, + { + "epoch": 0.14236716832737056, + "grad_norm": 0.09003034234046936, + "learning_rate": 4.3040716999050336e-05, + "loss": 0.0188, + "step": 19260 + }, + { + "epoch": 0.1424410868986724, + "grad_norm": 0.06642922013998032, + "learning_rate": 4.303700735992403e-05, + "loss": 0.0186, + "step": 19270 + }, + { + "epoch": 0.14251500546997428, + "grad_norm": 0.10610310733318329, + "learning_rate": 4.303329772079772e-05, + "loss": 0.0199, + "step": 19280 + }, + { + "epoch": 0.14258892404127613, + "grad_norm": 0.08904029428958893, + "learning_rate": 4.302958808167142e-05, + "loss": 0.0218, + "step": 19290 + }, + { + "epoch": 0.14266284261257797, + "grad_norm": 0.11130847781896591, + "learning_rate": 
4.302587844254511e-05, + "loss": 0.0163, + "step": 19300 + }, + { + "epoch": 0.14273676118387985, + "grad_norm": 0.07528279721736908, + "learning_rate": 4.30221688034188e-05, + "loss": 0.0185, + "step": 19310 + }, + { + "epoch": 0.1428106797551817, + "grad_norm": 0.07024827599525452, + "learning_rate": 4.30184591642925e-05, + "loss": 0.0173, + "step": 19320 + }, + { + "epoch": 0.14288459832648354, + "grad_norm": 0.11899475008249283, + "learning_rate": 4.3014749525166195e-05, + "loss": 0.0241, + "step": 19330 + }, + { + "epoch": 0.1429585168977854, + "grad_norm": 0.09621811658143997, + "learning_rate": 4.301103988603989e-05, + "loss": 0.0202, + "step": 19340 + }, + { + "epoch": 0.14303243546908725, + "grad_norm": 0.09382009506225586, + "learning_rate": 4.300733024691358e-05, + "loss": 0.0197, + "step": 19350 + }, + { + "epoch": 0.1431063540403891, + "grad_norm": 0.0980682298541069, + "learning_rate": 4.3003620607787276e-05, + "loss": 0.0208, + "step": 19360 + }, + { + "epoch": 0.14318027261169097, + "grad_norm": 0.09665199369192123, + "learning_rate": 4.299991096866097e-05, + "loss": 0.0204, + "step": 19370 + }, + { + "epoch": 0.14325419118299282, + "grad_norm": 0.09231219440698624, + "learning_rate": 4.299620132953466e-05, + "loss": 0.0219, + "step": 19380 + }, + { + "epoch": 0.14332810975429466, + "grad_norm": 0.10446962714195251, + "learning_rate": 4.299249169040836e-05, + "loss": 0.0202, + "step": 19390 + }, + { + "epoch": 0.14340202832559654, + "grad_norm": 0.08622289448976517, + "learning_rate": 4.298878205128205e-05, + "loss": 0.0192, + "step": 19400 + }, + { + "epoch": 0.14347594689689838, + "grad_norm": 0.10497544705867767, + "learning_rate": 4.298507241215575e-05, + "loss": 0.0212, + "step": 19410 + }, + { + "epoch": 0.14354986546820023, + "grad_norm": 0.1282612383365631, + "learning_rate": 4.2981362773029445e-05, + "loss": 0.0203, + "step": 19420 + }, + { + "epoch": 0.14362378403950207, + "grad_norm": 0.10771500319242477, + "learning_rate": 4.2977653133903134e-05, + "loss": 0.0218, + "step": 19430 + }, + { + "epoch": 0.14369770261080395, + "grad_norm": 0.08456694334745407, + "learning_rate": 4.297394349477683e-05, + "loss": 0.0199, + "step": 19440 + }, + { + "epoch": 0.1437716211821058, + "grad_norm": 0.08716116845607758, + "learning_rate": 4.2970233855650526e-05, + "loss": 0.0211, + "step": 19450 + }, + { + "epoch": 0.14384553975340764, + "grad_norm": 0.06831954419612885, + "learning_rate": 4.2966524216524215e-05, + "loss": 0.0201, + "step": 19460 + }, + { + "epoch": 0.1439194583247095, + "grad_norm": 0.09248486906290054, + "learning_rate": 4.296281457739791e-05, + "loss": 0.0216, + "step": 19470 + }, + { + "epoch": 0.14399337689601135, + "grad_norm": 0.08724641799926758, + "learning_rate": 4.295910493827161e-05, + "loss": 0.0213, + "step": 19480 + }, + { + "epoch": 0.1440672954673132, + "grad_norm": 0.10801932960748672, + "learning_rate": 4.29553952991453e-05, + "loss": 0.0236, + "step": 19490 + }, + { + "epoch": 0.14414121403861507, + "grad_norm": 0.07270118594169617, + "learning_rate": 4.2951685660019e-05, + "loss": 0.0208, + "step": 19500 + }, + { + "epoch": 0.14421513260991692, + "grad_norm": 0.07651976495981216, + "learning_rate": 4.294797602089269e-05, + "loss": 0.0196, + "step": 19510 + }, + { + "epoch": 0.14428905118121876, + "grad_norm": 0.10494709014892578, + "learning_rate": 4.2944266381766384e-05, + "loss": 0.0194, + "step": 19520 + }, + { + "epoch": 0.14436296975252064, + "grad_norm": 0.08860231935977936, + "learning_rate": 4.294055674264007e-05, + "loss": 0.0181, 
+ "step": 19530 + }, + { + "epoch": 0.14443688832382248, + "grad_norm": 0.12752337753772736, + "learning_rate": 4.293684710351377e-05, + "loss": 0.02, + "step": 19540 + }, + { + "epoch": 0.14451080689512433, + "grad_norm": 0.08055710792541504, + "learning_rate": 4.2933137464387465e-05, + "loss": 0.0189, + "step": 19550 + }, + { + "epoch": 0.14458472546642617, + "grad_norm": 0.12380864471197128, + "learning_rate": 4.292942782526116e-05, + "loss": 0.019, + "step": 19560 + }, + { + "epoch": 0.14465864403772805, + "grad_norm": 0.11049962788820267, + "learning_rate": 4.292571818613486e-05, + "loss": 0.0206, + "step": 19570 + }, + { + "epoch": 0.1447325626090299, + "grad_norm": 0.09611523896455765, + "learning_rate": 4.2922008547008546e-05, + "loss": 0.0214, + "step": 19580 + }, + { + "epoch": 0.14480648118033174, + "grad_norm": 0.0665908008813858, + "learning_rate": 4.291829890788224e-05, + "loss": 0.0213, + "step": 19590 + }, + { + "epoch": 0.1448803997516336, + "grad_norm": 0.07490904629230499, + "learning_rate": 4.291458926875594e-05, + "loss": 0.0187, + "step": 19600 + }, + { + "epoch": 0.14495431832293545, + "grad_norm": 0.08843923360109329, + "learning_rate": 4.291087962962963e-05, + "loss": 0.0187, + "step": 19610 + }, + { + "epoch": 0.1450282368942373, + "grad_norm": 0.09567807614803314, + "learning_rate": 4.2907169990503323e-05, + "loss": 0.0183, + "step": 19620 + }, + { + "epoch": 0.14510215546553917, + "grad_norm": 0.08598290383815765, + "learning_rate": 4.290346035137702e-05, + "loss": 0.0217, + "step": 19630 + }, + { + "epoch": 0.14517607403684102, + "grad_norm": 0.09431812167167664, + "learning_rate": 4.2899750712250715e-05, + "loss": 0.019, + "step": 19640 + }, + { + "epoch": 0.14524999260814286, + "grad_norm": 0.06617843359708786, + "learning_rate": 4.289604107312441e-05, + "loss": 0.0174, + "step": 19650 + }, + { + "epoch": 0.14532391117944474, + "grad_norm": 0.1028028130531311, + "learning_rate": 4.28923314339981e-05, + "loss": 0.0188, + "step": 19660 + }, + { + "epoch": 0.14539782975074658, + "grad_norm": 0.10529684275388718, + "learning_rate": 4.2888621794871797e-05, + "loss": 0.019, + "step": 19670 + }, + { + "epoch": 0.14547174832204843, + "grad_norm": 0.06480713933706284, + "learning_rate": 4.288491215574549e-05, + "loss": 0.0202, + "step": 19680 + }, + { + "epoch": 0.14554566689335027, + "grad_norm": 0.13642440736293793, + "learning_rate": 4.288120251661918e-05, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 0.14561958546465215, + "grad_norm": 0.11017101258039474, + "learning_rate": 4.287749287749288e-05, + "loss": 0.0208, + "step": 19700 + }, + { + "epoch": 0.145693504035954, + "grad_norm": 0.0895652249455452, + "learning_rate": 4.2873783238366574e-05, + "loss": 0.0231, + "step": 19710 + }, + { + "epoch": 0.14576742260725584, + "grad_norm": 0.10605411976575851, + "learning_rate": 4.287007359924027e-05, + "loss": 0.0215, + "step": 19720 + }, + { + "epoch": 0.1458413411785577, + "grad_norm": 0.10418490320444107, + "learning_rate": 4.2866363960113966e-05, + "loss": 0.0205, + "step": 19730 + }, + { + "epoch": 0.14591525974985955, + "grad_norm": 0.11002498865127563, + "learning_rate": 4.2862654320987655e-05, + "loss": 0.0212, + "step": 19740 + }, + { + "epoch": 0.1459891783211614, + "grad_norm": 0.10646151751279831, + "learning_rate": 4.285894468186135e-05, + "loss": 0.0203, + "step": 19750 + }, + { + "epoch": 0.14606309689246327, + "grad_norm": 0.09739893674850464, + "learning_rate": 4.285523504273504e-05, + "loss": 0.0205, + "step": 19760 + }, + { + "epoch": 
0.14613701546376512, + "grad_norm": 0.08105024695396423, + "learning_rate": 4.2851525403608736e-05, + "loss": 0.022, + "step": 19770 + }, + { + "epoch": 0.14621093403506696, + "grad_norm": 0.06371292471885681, + "learning_rate": 4.284781576448243e-05, + "loss": 0.0186, + "step": 19780 + }, + { + "epoch": 0.14628485260636884, + "grad_norm": 0.12250304222106934, + "learning_rate": 4.284410612535613e-05, + "loss": 0.0218, + "step": 19790 + }, + { + "epoch": 0.14635877117767068, + "grad_norm": 0.08064945042133331, + "learning_rate": 4.2840396486229824e-05, + "loss": 0.0197, + "step": 19800 + }, + { + "epoch": 0.14643268974897253, + "grad_norm": 0.048164352774620056, + "learning_rate": 4.283668684710351e-05, + "loss": 0.0193, + "step": 19810 + }, + { + "epoch": 0.14650660832027437, + "grad_norm": 0.144981250166893, + "learning_rate": 4.283297720797721e-05, + "loss": 0.0179, + "step": 19820 + }, + { + "epoch": 0.14658052689157625, + "grad_norm": 0.07517068833112717, + "learning_rate": 4.2829267568850905e-05, + "loss": 0.0207, + "step": 19830 + }, + { + "epoch": 0.1466544454628781, + "grad_norm": 0.13142111897468567, + "learning_rate": 4.2825557929724594e-05, + "loss": 0.02, + "step": 19840 + }, + { + "epoch": 0.14672836403417994, + "grad_norm": 0.09935706853866577, + "learning_rate": 4.282184829059829e-05, + "loss": 0.0188, + "step": 19850 + }, + { + "epoch": 0.1468022826054818, + "grad_norm": 0.08331234008073807, + "learning_rate": 4.281813865147199e-05, + "loss": 0.0167, + "step": 19860 + }, + { + "epoch": 0.14687620117678366, + "grad_norm": 0.09987511485815048, + "learning_rate": 4.281442901234568e-05, + "loss": 0.0207, + "step": 19870 + }, + { + "epoch": 0.1469501197480855, + "grad_norm": 0.05960327759385109, + "learning_rate": 4.281071937321938e-05, + "loss": 0.0205, + "step": 19880 + }, + { + "epoch": 0.14702403831938737, + "grad_norm": 0.10007424652576447, + "learning_rate": 4.280700973409307e-05, + "loss": 0.0192, + "step": 19890 + }, + { + "epoch": 0.14709795689068922, + "grad_norm": 0.10943324863910675, + "learning_rate": 4.280330009496676e-05, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 0.14717187546199106, + "grad_norm": 0.0814700797200203, + "learning_rate": 4.279959045584046e-05, + "loss": 0.0194, + "step": 19910 + }, + { + "epoch": 0.14724579403329294, + "grad_norm": 0.07240025699138641, + "learning_rate": 4.279588081671415e-05, + "loss": 0.0174, + "step": 19920 + }, + { + "epoch": 0.14731971260459478, + "grad_norm": 0.09242795407772064, + "learning_rate": 4.2792171177587844e-05, + "loss": 0.0191, + "step": 19930 + }, + { + "epoch": 0.14739363117589663, + "grad_norm": 0.10732390731573105, + "learning_rate": 4.278846153846154e-05, + "loss": 0.0205, + "step": 19940 + }, + { + "epoch": 0.14746754974719847, + "grad_norm": 0.10419422388076782, + "learning_rate": 4.2784751899335236e-05, + "loss": 0.0181, + "step": 19950 + }, + { + "epoch": 0.14754146831850035, + "grad_norm": 0.11255902796983719, + "learning_rate": 4.278104226020893e-05, + "loss": 0.0195, + "step": 19960 + }, + { + "epoch": 0.1476153868898022, + "grad_norm": 0.0901622548699379, + "learning_rate": 4.277733262108262e-05, + "loss": 0.0197, + "step": 19970 + }, + { + "epoch": 0.14768930546110404, + "grad_norm": 0.071842260658741, + "learning_rate": 4.277362298195632e-05, + "loss": 0.0206, + "step": 19980 + }, + { + "epoch": 0.1477632240324059, + "grad_norm": 0.08971525728702545, + "learning_rate": 4.2769913342830006e-05, + "loss": 0.0196, + "step": 19990 + }, + { + "epoch": 0.14783714260370776, + "grad_norm": 
0.07036489993333817, + "learning_rate": 4.27662037037037e-05, + "loss": 0.022, + "step": 20000 + }, + { + "epoch": 0.14783714260370776, + "eval_f1": 0.5889918098753654, + "eval_loss": 0.019852200523018837, + "eval_precision": 0.4652586220167745, + "eval_recall": 0.8023809954805737, + "eval_runtime": 2652.3788, + "eval_samples_per_second": 204.018, + "eval_steps_per_second": 3.188, + "step": 20000 + }, + { + "epoch": 0.1479110611750096, + "grad_norm": 0.07533308118581772, + "learning_rate": 4.2762494064577405e-05, + "loss": 0.0191, + "step": 20010 + }, + { + "epoch": 0.14798497974631147, + "grad_norm": 0.07710433751344681, + "learning_rate": 4.2758784425451094e-05, + "loss": 0.0213, + "step": 20020 + }, + { + "epoch": 0.14805889831761332, + "grad_norm": 0.09403133392333984, + "learning_rate": 4.275507478632479e-05, + "loss": 0.023, + "step": 20030 + }, + { + "epoch": 0.14813281688891516, + "grad_norm": 0.0773426964879036, + "learning_rate": 4.275136514719848e-05, + "loss": 0.0185, + "step": 20040 + }, + { + "epoch": 0.14820673546021704, + "grad_norm": 0.0869455635547638, + "learning_rate": 4.2747655508072175e-05, + "loss": 0.0198, + "step": 20050 + }, + { + "epoch": 0.14828065403151888, + "grad_norm": 0.08084216713905334, + "learning_rate": 4.274394586894587e-05, + "loss": 0.0188, + "step": 20060 + }, + { + "epoch": 0.14835457260282073, + "grad_norm": 0.0831877812743187, + "learning_rate": 4.274023622981956e-05, + "loss": 0.0174, + "step": 20070 + }, + { + "epoch": 0.14842849117412257, + "grad_norm": 0.08009405434131622, + "learning_rate": 4.273652659069326e-05, + "loss": 0.0183, + "step": 20080 + }, + { + "epoch": 0.14850240974542445, + "grad_norm": 0.09190283715724945, + "learning_rate": 4.273281695156696e-05, + "loss": 0.0216, + "step": 20090 + }, + { + "epoch": 0.1485763283167263, + "grad_norm": 0.08109553903341293, + "learning_rate": 4.272910731244065e-05, + "loss": 0.0208, + "step": 20100 + }, + { + "epoch": 0.14865024688802814, + "grad_norm": 0.08654297143220901, + "learning_rate": 4.2725397673314345e-05, + "loss": 0.021, + "step": 20110 + }, + { + "epoch": 0.14872416545933, + "grad_norm": 0.0885516032576561, + "learning_rate": 4.2721688034188034e-05, + "loss": 0.0203, + "step": 20120 + }, + { + "epoch": 0.14879808403063186, + "grad_norm": 0.05938749387860298, + "learning_rate": 4.271797839506173e-05, + "loss": 0.0185, + "step": 20130 + }, + { + "epoch": 0.1488720026019337, + "grad_norm": 0.10479523986577988, + "learning_rate": 4.2714268755935426e-05, + "loss": 0.0201, + "step": 20140 + }, + { + "epoch": 0.14894592117323557, + "grad_norm": 0.07493485510349274, + "learning_rate": 4.2710559116809115e-05, + "loss": 0.0178, + "step": 20150 + }, + { + "epoch": 0.14901983974453742, + "grad_norm": 0.09768588840961456, + "learning_rate": 4.270684947768282e-05, + "loss": 0.0199, + "step": 20160 + }, + { + "epoch": 0.14909375831583926, + "grad_norm": 0.08659044653177261, + "learning_rate": 4.270313983855651e-05, + "loss": 0.0183, + "step": 20170 + }, + { + "epoch": 0.14916767688714114, + "grad_norm": 0.09599784016609192, + "learning_rate": 4.26994301994302e-05, + "loss": 0.021, + "step": 20180 + }, + { + "epoch": 0.14924159545844298, + "grad_norm": 0.0763728991150856, + "learning_rate": 4.26957205603039e-05, + "loss": 0.0197, + "step": 20190 + }, + { + "epoch": 0.14931551402974483, + "grad_norm": 0.09369050711393356, + "learning_rate": 4.269201092117759e-05, + "loss": 0.0204, + "step": 20200 + }, + { + "epoch": 0.14938943260104667, + "grad_norm": 0.11977594345808029, + "learning_rate": 
4.2688301282051284e-05, + "loss": 0.0208, + "step": 20210 + }, + { + "epoch": 0.14946335117234855, + "grad_norm": 0.07575426250696182, + "learning_rate": 4.268459164292497e-05, + "loss": 0.0189, + "step": 20220 + }, + { + "epoch": 0.1495372697436504, + "grad_norm": 0.08216328918933868, + "learning_rate": 4.268088200379867e-05, + "loss": 0.0202, + "step": 20230 + }, + { + "epoch": 0.14961118831495224, + "grad_norm": 0.08688732981681824, + "learning_rate": 4.267717236467237e-05, + "loss": 0.0185, + "step": 20240 + }, + { + "epoch": 0.1496851068862541, + "grad_norm": 0.0954701155424118, + "learning_rate": 4.267346272554606e-05, + "loss": 0.0207, + "step": 20250 + }, + { + "epoch": 0.14975902545755596, + "grad_norm": 0.1011623814702034, + "learning_rate": 4.266975308641976e-05, + "loss": 0.0184, + "step": 20260 + }, + { + "epoch": 0.1498329440288578, + "grad_norm": 0.10412371158599854, + "learning_rate": 4.2666043447293446e-05, + "loss": 0.0187, + "step": 20270 + }, + { + "epoch": 0.14990686260015967, + "grad_norm": 0.08750808238983154, + "learning_rate": 4.266233380816714e-05, + "loss": 0.0203, + "step": 20280 + }, + { + "epoch": 0.14998078117146152, + "grad_norm": 0.09496186673641205, + "learning_rate": 4.265862416904084e-05, + "loss": 0.0191, + "step": 20290 + }, + { + "epoch": 0.15005469974276336, + "grad_norm": 0.10010498762130737, + "learning_rate": 4.265491452991453e-05, + "loss": 0.0228, + "step": 20300 + }, + { + "epoch": 0.15012861831406524, + "grad_norm": 0.08491216599941254, + "learning_rate": 4.265120489078823e-05, + "loss": 0.017, + "step": 20310 + }, + { + "epoch": 0.15020253688536708, + "grad_norm": 0.08513123542070389, + "learning_rate": 4.2647495251661926e-05, + "loss": 0.0191, + "step": 20320 + }, + { + "epoch": 0.15027645545666893, + "grad_norm": 0.0948261246085167, + "learning_rate": 4.2643785612535615e-05, + "loss": 0.0194, + "step": 20330 + }, + { + "epoch": 0.1503503740279708, + "grad_norm": 0.08445029705762863, + "learning_rate": 4.264007597340931e-05, + "loss": 0.02, + "step": 20340 + }, + { + "epoch": 0.15042429259927265, + "grad_norm": 0.08538064360618591, + "learning_rate": 4.2636366334283e-05, + "loss": 0.0191, + "step": 20350 + }, + { + "epoch": 0.1504982111705745, + "grad_norm": 0.10733731836080551, + "learning_rate": 4.2632656695156696e-05, + "loss": 0.0215, + "step": 20360 + }, + { + "epoch": 0.15057212974187634, + "grad_norm": 0.10469716787338257, + "learning_rate": 4.262894705603039e-05, + "loss": 0.0215, + "step": 20370 + }, + { + "epoch": 0.1506460483131782, + "grad_norm": 0.10288316756486893, + "learning_rate": 4.262523741690408e-05, + "loss": 0.0189, + "step": 20380 + }, + { + "epoch": 0.15071996688448006, + "grad_norm": 0.08677443116903305, + "learning_rate": 4.2621527777777784e-05, + "loss": 0.0188, + "step": 20390 + }, + { + "epoch": 0.1507938854557819, + "grad_norm": 0.10645607113838196, + "learning_rate": 4.261781813865147e-05, + "loss": 0.0209, + "step": 20400 + }, + { + "epoch": 0.15086780402708377, + "grad_norm": 0.07837022095918655, + "learning_rate": 4.261410849952517e-05, + "loss": 0.0173, + "step": 20410 + }, + { + "epoch": 0.15094172259838562, + "grad_norm": 0.1119476780295372, + "learning_rate": 4.2610398860398865e-05, + "loss": 0.0201, + "step": 20420 + }, + { + "epoch": 0.15101564116968746, + "grad_norm": 0.08979900926351547, + "learning_rate": 4.2606689221272554e-05, + "loss": 0.0183, + "step": 20430 + }, + { + "epoch": 0.15108955974098934, + "grad_norm": 0.07695356011390686, + "learning_rate": 4.260297958214625e-05, + "loss": 0.0227, + 
"step": 20440 + }, + { + "epoch": 0.15116347831229118, + "grad_norm": 0.07926007360219955, + "learning_rate": 4.259926994301994e-05, + "loss": 0.0207, + "step": 20450 + }, + { + "epoch": 0.15123739688359303, + "grad_norm": 0.07667418569326401, + "learning_rate": 4.259556030389364e-05, + "loss": 0.0206, + "step": 20460 + }, + { + "epoch": 0.1513113154548949, + "grad_norm": 0.09290957450866699, + "learning_rate": 4.259185066476734e-05, + "loss": 0.0197, + "step": 20470 + }, + { + "epoch": 0.15138523402619675, + "grad_norm": 0.10091643780469894, + "learning_rate": 4.258814102564103e-05, + "loss": 0.0222, + "step": 20480 + }, + { + "epoch": 0.1514591525974986, + "grad_norm": 0.07847228646278381, + "learning_rate": 4.2584431386514724e-05, + "loss": 0.0197, + "step": 20490 + }, + { + "epoch": 0.15153307116880044, + "grad_norm": 0.07392504066228867, + "learning_rate": 4.258072174738841e-05, + "loss": 0.0173, + "step": 20500 + }, + { + "epoch": 0.1516069897401023, + "grad_norm": 0.09294796735048294, + "learning_rate": 4.257701210826211e-05, + "loss": 0.018, + "step": 20510 + }, + { + "epoch": 0.15168090831140416, + "grad_norm": 0.09442655742168427, + "learning_rate": 4.2573302469135805e-05, + "loss": 0.019, + "step": 20520 + }, + { + "epoch": 0.151754826882706, + "grad_norm": 0.10183858871459961, + "learning_rate": 4.2569592830009494e-05, + "loss": 0.0201, + "step": 20530 + }, + { + "epoch": 0.15182874545400787, + "grad_norm": 0.09779280424118042, + "learning_rate": 4.2565883190883197e-05, + "loss": 0.0193, + "step": 20540 + }, + { + "epoch": 0.15190266402530972, + "grad_norm": 0.08100099116563797, + "learning_rate": 4.256217355175689e-05, + "loss": 0.0187, + "step": 20550 + }, + { + "epoch": 0.15197658259661156, + "grad_norm": 0.0783228799700737, + "learning_rate": 4.255846391263058e-05, + "loss": 0.0186, + "step": 20560 + }, + { + "epoch": 0.15205050116791344, + "grad_norm": 0.08579540997743607, + "learning_rate": 4.255475427350428e-05, + "loss": 0.0218, + "step": 20570 + }, + { + "epoch": 0.15212441973921528, + "grad_norm": 0.09593216329813004, + "learning_rate": 4.255104463437797e-05, + "loss": 0.0192, + "step": 20580 + }, + { + "epoch": 0.15219833831051713, + "grad_norm": 0.11332932114601135, + "learning_rate": 4.254733499525166e-05, + "loss": 0.0196, + "step": 20590 + }, + { + "epoch": 0.152272256881819, + "grad_norm": 0.08314672857522964, + "learning_rate": 4.254362535612536e-05, + "loss": 0.0228, + "step": 20600 + }, + { + "epoch": 0.15234617545312085, + "grad_norm": 0.08538781851530075, + "learning_rate": 4.2539915716999055e-05, + "loss": 0.0205, + "step": 20610 + }, + { + "epoch": 0.1524200940244227, + "grad_norm": 0.12145442515611649, + "learning_rate": 4.253620607787275e-05, + "loss": 0.0228, + "step": 20620 + }, + { + "epoch": 0.15249401259572454, + "grad_norm": 0.08186414837837219, + "learning_rate": 4.253249643874644e-05, + "loss": 0.0184, + "step": 20630 + }, + { + "epoch": 0.1525679311670264, + "grad_norm": 0.101750448346138, + "learning_rate": 4.2528786799620136e-05, + "loss": 0.0198, + "step": 20640 + }, + { + "epoch": 0.15264184973832826, + "grad_norm": 0.08295189589262009, + "learning_rate": 4.252507716049383e-05, + "loss": 0.0182, + "step": 20650 + }, + { + "epoch": 0.1527157683096301, + "grad_norm": 0.1188788115978241, + "learning_rate": 4.252136752136752e-05, + "loss": 0.0182, + "step": 20660 + }, + { + "epoch": 0.15278968688093197, + "grad_norm": 0.06974223256111145, + "learning_rate": 4.251765788224122e-05, + "loss": 0.0215, + "step": 20670 + }, + { + "epoch": 
0.15286360545223382, + "grad_norm": 0.08505337685346603, + "learning_rate": 4.2513948243114906e-05, + "loss": 0.0206, + "step": 20680 + }, + { + "epoch": 0.15293752402353566, + "grad_norm": 0.0816953107714653, + "learning_rate": 4.251023860398861e-05, + "loss": 0.0186, + "step": 20690 + }, + { + "epoch": 0.15301144259483754, + "grad_norm": 0.08393016457557678, + "learning_rate": 4.2506528964862305e-05, + "loss": 0.0188, + "step": 20700 + }, + { + "epoch": 0.15308536116613938, + "grad_norm": 0.09978709369897842, + "learning_rate": 4.2502819325735994e-05, + "loss": 0.0194, + "step": 20710 + }, + { + "epoch": 0.15315927973744123, + "grad_norm": 0.08010877668857574, + "learning_rate": 4.249910968660969e-05, + "loss": 0.0204, + "step": 20720 + }, + { + "epoch": 0.1532331983087431, + "grad_norm": 0.08997316658496857, + "learning_rate": 4.249540004748338e-05, + "loss": 0.0199, + "step": 20730 + }, + { + "epoch": 0.15330711688004495, + "grad_norm": 0.1026889905333519, + "learning_rate": 4.2491690408357075e-05, + "loss": 0.0194, + "step": 20740 + }, + { + "epoch": 0.1533810354513468, + "grad_norm": 0.10672967880964279, + "learning_rate": 4.248798076923077e-05, + "loss": 0.024, + "step": 20750 + }, + { + "epoch": 0.15345495402264864, + "grad_norm": 0.09047634899616241, + "learning_rate": 4.248427113010447e-05, + "loss": 0.0199, + "step": 20760 + }, + { + "epoch": 0.1535288725939505, + "grad_norm": 0.07985531538724899, + "learning_rate": 4.248056149097816e-05, + "loss": 0.0207, + "step": 20770 + }, + { + "epoch": 0.15360279116525236, + "grad_norm": 0.10958772897720337, + "learning_rate": 4.247685185185186e-05, + "loss": 0.0197, + "step": 20780 + }, + { + "epoch": 0.1536767097365542, + "grad_norm": 0.10120689123868942, + "learning_rate": 4.247314221272555e-05, + "loss": 0.021, + "step": 20790 + }, + { + "epoch": 0.15375062830785607, + "grad_norm": 0.0742512047290802, + "learning_rate": 4.2469432573599244e-05, + "loss": 0.0196, + "step": 20800 + }, + { + "epoch": 0.15382454687915792, + "grad_norm": 0.08500395715236664, + "learning_rate": 4.2465722934472933e-05, + "loss": 0.0203, + "step": 20810 + }, + { + "epoch": 0.15389846545045976, + "grad_norm": 0.08425117284059525, + "learning_rate": 4.246201329534663e-05, + "loss": 0.0206, + "step": 20820 + }, + { + "epoch": 0.15397238402176164, + "grad_norm": 0.09172620624303818, + "learning_rate": 4.2458303656220325e-05, + "loss": 0.0215, + "step": 20830 + }, + { + "epoch": 0.15404630259306348, + "grad_norm": 0.08757595717906952, + "learning_rate": 4.245459401709402e-05, + "loss": 0.0174, + "step": 20840 + }, + { + "epoch": 0.15412022116436533, + "grad_norm": 0.060954876244068146, + "learning_rate": 4.245088437796772e-05, + "loss": 0.0202, + "step": 20850 + }, + { + "epoch": 0.1541941397356672, + "grad_norm": 0.10628701001405716, + "learning_rate": 4.2447174738841407e-05, + "loss": 0.0176, + "step": 20860 + }, + { + "epoch": 0.15426805830696905, + "grad_norm": 0.08656299114227295, + "learning_rate": 4.24434650997151e-05, + "loss": 0.0207, + "step": 20870 + }, + { + "epoch": 0.1543419768782709, + "grad_norm": 0.1467808336019516, + "learning_rate": 4.24397554605888e-05, + "loss": 0.0179, + "step": 20880 + }, + { + "epoch": 0.15441589544957274, + "grad_norm": 0.0891091376543045, + "learning_rate": 4.243604582146249e-05, + "loss": 0.02, + "step": 20890 + }, + { + "epoch": 0.1544898140208746, + "grad_norm": 0.08970539271831512, + "learning_rate": 4.2432336182336184e-05, + "loss": 0.02, + "step": 20900 + }, + { + "epoch": 0.15456373259217646, + "grad_norm": 
0.07861539721488953, + "learning_rate": 4.242862654320987e-05, + "loss": 0.0222, + "step": 20910 + }, + { + "epoch": 0.1546376511634783, + "grad_norm": 0.09449228644371033, + "learning_rate": 4.2424916904083576e-05, + "loss": 0.0195, + "step": 20920 + }, + { + "epoch": 0.15471156973478017, + "grad_norm": 0.10891766846179962, + "learning_rate": 4.242120726495727e-05, + "loss": 0.02, + "step": 20930 + }, + { + "epoch": 0.15478548830608202, + "grad_norm": 0.08089979737997055, + "learning_rate": 4.241749762583096e-05, + "loss": 0.0209, + "step": 20940 + }, + { + "epoch": 0.15485940687738387, + "grad_norm": 0.08476491272449493, + "learning_rate": 4.241378798670466e-05, + "loss": 0.021, + "step": 20950 + }, + { + "epoch": 0.15493332544868574, + "grad_norm": 0.0795905813574791, + "learning_rate": 4.2410078347578346e-05, + "loss": 0.0222, + "step": 20960 + }, + { + "epoch": 0.15500724401998758, + "grad_norm": 0.11419697105884552, + "learning_rate": 4.240636870845204e-05, + "loss": 0.0208, + "step": 20970 + }, + { + "epoch": 0.15508116259128943, + "grad_norm": 0.08529966324567795, + "learning_rate": 4.240265906932574e-05, + "loss": 0.0201, + "step": 20980 + }, + { + "epoch": 0.1551550811625913, + "grad_norm": 0.07745010405778885, + "learning_rate": 4.2398949430199434e-05, + "loss": 0.019, + "step": 20990 + }, + { + "epoch": 0.15522899973389315, + "grad_norm": 0.10719358175992966, + "learning_rate": 4.239523979107313e-05, + "loss": 0.0205, + "step": 21000 + }, + { + "epoch": 0.155302918305195, + "grad_norm": 0.09493697434663773, + "learning_rate": 4.2391530151946826e-05, + "loss": 0.0203, + "step": 21010 + }, + { + "epoch": 0.15537683687649684, + "grad_norm": 0.09876014292240143, + "learning_rate": 4.2387820512820515e-05, + "loss": 0.0203, + "step": 21020 + }, + { + "epoch": 0.1554507554477987, + "grad_norm": 0.0771779790520668, + "learning_rate": 4.238411087369421e-05, + "loss": 0.0177, + "step": 21030 + }, + { + "epoch": 0.15552467401910056, + "grad_norm": 0.07993631809949875, + "learning_rate": 4.23804012345679e-05, + "loss": 0.0184, + "step": 21040 + }, + { + "epoch": 0.1555985925904024, + "grad_norm": 0.07111372798681259, + "learning_rate": 4.2376691595441596e-05, + "loss": 0.0203, + "step": 21050 + }, + { + "epoch": 0.15567251116170427, + "grad_norm": 0.07143562287092209, + "learning_rate": 4.237298195631529e-05, + "loss": 0.0205, + "step": 21060 + }, + { + "epoch": 0.15574642973300612, + "grad_norm": 0.11168545484542847, + "learning_rate": 4.236927231718899e-05, + "loss": 0.0222, + "step": 21070 + }, + { + "epoch": 0.15582034830430797, + "grad_norm": 0.08068099617958069, + "learning_rate": 4.2365562678062684e-05, + "loss": 0.0204, + "step": 21080 + }, + { + "epoch": 0.15589426687560984, + "grad_norm": 0.10373964160680771, + "learning_rate": 4.236185303893637e-05, + "loss": 0.0172, + "step": 21090 + }, + { + "epoch": 0.15596818544691168, + "grad_norm": 0.11680703610181808, + "learning_rate": 4.235814339981007e-05, + "loss": 0.0236, + "step": 21100 + }, + { + "epoch": 0.15604210401821353, + "grad_norm": 0.09924782812595367, + "learning_rate": 4.2354433760683765e-05, + "loss": 0.0208, + "step": 21110 + }, + { + "epoch": 0.1561160225895154, + "grad_norm": 0.07073837518692017, + "learning_rate": 4.2350724121557454e-05, + "loss": 0.0179, + "step": 21120 + }, + { + "epoch": 0.15618994116081725, + "grad_norm": 0.09141246229410172, + "learning_rate": 4.234701448243115e-05, + "loss": 0.019, + "step": 21130 + }, + { + "epoch": 0.1562638597321191, + "grad_norm": 0.1136874258518219, + "learning_rate": 
4.2343304843304846e-05, + "loss": 0.0198, + "step": 21140 + }, + { + "epoch": 0.15633777830342094, + "grad_norm": 0.07442887872457504, + "learning_rate": 4.233959520417854e-05, + "loss": 0.0186, + "step": 21150 + }, + { + "epoch": 0.1564116968747228, + "grad_norm": 0.09873552620410919, + "learning_rate": 4.233588556505224e-05, + "loss": 0.0184, + "step": 21160 + }, + { + "epoch": 0.15648561544602466, + "grad_norm": 0.09348463267087936, + "learning_rate": 4.233217592592593e-05, + "loss": 0.0192, + "step": 21170 + }, + { + "epoch": 0.1565595340173265, + "grad_norm": 0.09827948361635208, + "learning_rate": 4.232846628679962e-05, + "loss": 0.0206, + "step": 21180 + }, + { + "epoch": 0.15663345258862837, + "grad_norm": 0.09134156256914139, + "learning_rate": 4.232475664767331e-05, + "loss": 0.021, + "step": 21190 + }, + { + "epoch": 0.15670737115993022, + "grad_norm": 0.09025607258081436, + "learning_rate": 4.232104700854701e-05, + "loss": 0.0205, + "step": 21200 + }, + { + "epoch": 0.15678128973123207, + "grad_norm": 0.09077123552560806, + "learning_rate": 4.2317337369420704e-05, + "loss": 0.0223, + "step": 21210 + }, + { + "epoch": 0.15685520830253394, + "grad_norm": 0.07134833931922913, + "learning_rate": 4.23136277302944e-05, + "loss": 0.0167, + "step": 21220 + }, + { + "epoch": 0.15692912687383578, + "grad_norm": 0.10388438403606415, + "learning_rate": 4.2309918091168096e-05, + "loss": 0.0239, + "step": 21230 + }, + { + "epoch": 0.15700304544513763, + "grad_norm": 0.11611325293779373, + "learning_rate": 4.230620845204179e-05, + "loss": 0.0229, + "step": 21240 + }, + { + "epoch": 0.1570769640164395, + "grad_norm": 0.0772848129272461, + "learning_rate": 4.230249881291548e-05, + "loss": 0.019, + "step": 21250 + }, + { + "epoch": 0.15715088258774135, + "grad_norm": 0.10972341895103455, + "learning_rate": 4.229878917378918e-05, + "loss": 0.0206, + "step": 21260 + }, + { + "epoch": 0.1572248011590432, + "grad_norm": 0.08381499350070953, + "learning_rate": 4.229507953466287e-05, + "loss": 0.0209, + "step": 21270 + }, + { + "epoch": 0.15729871973034507, + "grad_norm": 0.10344956070184708, + "learning_rate": 4.229136989553656e-05, + "loss": 0.0227, + "step": 21280 + }, + { + "epoch": 0.1573726383016469, + "grad_norm": 0.08652588725090027, + "learning_rate": 4.228766025641026e-05, + "loss": 0.0205, + "step": 21290 + }, + { + "epoch": 0.15744655687294876, + "grad_norm": 0.09220878779888153, + "learning_rate": 4.2283950617283955e-05, + "loss": 0.0222, + "step": 21300 + }, + { + "epoch": 0.1575204754442506, + "grad_norm": 0.111487478017807, + "learning_rate": 4.228024097815765e-05, + "loss": 0.0196, + "step": 21310 + }, + { + "epoch": 0.15759439401555247, + "grad_norm": 0.09026884287595749, + "learning_rate": 4.227653133903134e-05, + "loss": 0.0211, + "step": 21320 + }, + { + "epoch": 0.15766831258685432, + "grad_norm": 0.10143148899078369, + "learning_rate": 4.2272821699905036e-05, + "loss": 0.0196, + "step": 21330 + }, + { + "epoch": 0.15774223115815617, + "grad_norm": 0.07933638989925385, + "learning_rate": 4.226911206077873e-05, + "loss": 0.0177, + "step": 21340 + }, + { + "epoch": 0.15781614972945804, + "grad_norm": 0.0977751761674881, + "learning_rate": 4.226540242165242e-05, + "loss": 0.0199, + "step": 21350 + }, + { + "epoch": 0.15789006830075988, + "grad_norm": 0.09351316839456558, + "learning_rate": 4.226169278252612e-05, + "loss": 0.0209, + "step": 21360 + }, + { + "epoch": 0.15796398687206173, + "grad_norm": 0.0827011987566948, + "learning_rate": 4.225798314339981e-05, + "loss": 0.0178, + 
"step": 21370 + }, + { + "epoch": 0.1580379054433636, + "grad_norm": 0.0982867032289505, + "learning_rate": 4.225427350427351e-05, + "loss": 0.0216, + "step": 21380 + }, + { + "epoch": 0.15811182401466545, + "grad_norm": 0.0931718572974205, + "learning_rate": 4.2250563865147205e-05, + "loss": 0.0185, + "step": 21390 + }, + { + "epoch": 0.1581857425859673, + "grad_norm": 0.08066631108522415, + "learning_rate": 4.2246854226020894e-05, + "loss": 0.0217, + "step": 21400 + }, + { + "epoch": 0.15825966115726917, + "grad_norm": 0.09623356908559799, + "learning_rate": 4.224314458689459e-05, + "loss": 0.017, + "step": 21410 + }, + { + "epoch": 0.158333579728571, + "grad_norm": 0.0713859349489212, + "learning_rate": 4.223943494776828e-05, + "loss": 0.0191, + "step": 21420 + }, + { + "epoch": 0.15840749829987286, + "grad_norm": 0.08927904069423676, + "learning_rate": 4.2235725308641975e-05, + "loss": 0.0205, + "step": 21430 + }, + { + "epoch": 0.1584814168711747, + "grad_norm": 0.0775395855307579, + "learning_rate": 4.223201566951567e-05, + "loss": 0.0208, + "step": 21440 + }, + { + "epoch": 0.15855533544247657, + "grad_norm": 0.08013869822025299, + "learning_rate": 4.222830603038937e-05, + "loss": 0.0172, + "step": 21450 + }, + { + "epoch": 0.15862925401377842, + "grad_norm": 0.0839141458272934, + "learning_rate": 4.222459639126306e-05, + "loss": 0.0229, + "step": 21460 + }, + { + "epoch": 0.15870317258508027, + "grad_norm": 0.09782402217388153, + "learning_rate": 4.222088675213676e-05, + "loss": 0.0194, + "step": 21470 + }, + { + "epoch": 0.15877709115638214, + "grad_norm": 0.10169842839241028, + "learning_rate": 4.221717711301045e-05, + "loss": 0.0204, + "step": 21480 + }, + { + "epoch": 0.15885100972768398, + "grad_norm": 0.08428753167390823, + "learning_rate": 4.2213467473884144e-05, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 0.15892492829898583, + "grad_norm": 0.08111728727817535, + "learning_rate": 4.220975783475783e-05, + "loss": 0.019, + "step": 21500 + }, + { + "epoch": 0.1589988468702877, + "grad_norm": 0.08612488210201263, + "learning_rate": 4.220604819563153e-05, + "loss": 0.0186, + "step": 21510 + }, + { + "epoch": 0.15907276544158955, + "grad_norm": 0.08112023025751114, + "learning_rate": 4.2202338556505225e-05, + "loss": 0.0183, + "step": 21520 + }, + { + "epoch": 0.1591466840128914, + "grad_norm": 0.08562341332435608, + "learning_rate": 4.219862891737892e-05, + "loss": 0.0213, + "step": 21530 + }, + { + "epoch": 0.15922060258419327, + "grad_norm": 0.09395560622215271, + "learning_rate": 4.219491927825262e-05, + "loss": 0.0209, + "step": 21540 + }, + { + "epoch": 0.1592945211554951, + "grad_norm": 0.09960179030895233, + "learning_rate": 4.2191209639126306e-05, + "loss": 0.0202, + "step": 21550 + }, + { + "epoch": 0.15936843972679696, + "grad_norm": 0.10689588636159897, + "learning_rate": 4.21875e-05, + "loss": 0.0214, + "step": 21560 + }, + { + "epoch": 0.1594423582980988, + "grad_norm": 0.11698466539382935, + "learning_rate": 4.21837903608737e-05, + "loss": 0.0194, + "step": 21570 + }, + { + "epoch": 0.15951627686940067, + "grad_norm": 0.059294432401657104, + "learning_rate": 4.218008072174739e-05, + "loss": 0.0219, + "step": 21580 + }, + { + "epoch": 0.15959019544070252, + "grad_norm": 0.07105332612991333, + "learning_rate": 4.217637108262108e-05, + "loss": 0.0184, + "step": 21590 + }, + { + "epoch": 0.15966411401200437, + "grad_norm": 0.09126646816730499, + "learning_rate": 4.217266144349478e-05, + "loss": 0.0208, + "step": 21600 + }, + { + "epoch": 0.15973803258330624, 
+ "grad_norm": 0.08969204127788544, + "learning_rate": 4.2168951804368475e-05, + "loss": 0.0236, + "step": 21610 + }, + { + "epoch": 0.15981195115460808, + "grad_norm": 0.0936492457985878, + "learning_rate": 4.216524216524217e-05, + "loss": 0.0194, + "step": 21620 + }, + { + "epoch": 0.15988586972590993, + "grad_norm": 0.10618958622217178, + "learning_rate": 4.216153252611586e-05, + "loss": 0.0201, + "step": 21630 + }, + { + "epoch": 0.1599597882972118, + "grad_norm": 0.09264793246984482, + "learning_rate": 4.2157822886989556e-05, + "loss": 0.0188, + "step": 21640 + }, + { + "epoch": 0.16003370686851365, + "grad_norm": 0.0832374095916748, + "learning_rate": 4.2154113247863246e-05, + "loss": 0.0185, + "step": 21650 + }, + { + "epoch": 0.1601076254398155, + "grad_norm": 0.06740816682577133, + "learning_rate": 4.215040360873694e-05, + "loss": 0.0189, + "step": 21660 + }, + { + "epoch": 0.16018154401111737, + "grad_norm": 0.094356968998909, + "learning_rate": 4.214669396961064e-05, + "loss": 0.0202, + "step": 21670 + }, + { + "epoch": 0.1602554625824192, + "grad_norm": 0.08573713898658752, + "learning_rate": 4.2142984330484334e-05, + "loss": 0.0187, + "step": 21680 + }, + { + "epoch": 0.16032938115372106, + "grad_norm": 0.10325209051370621, + "learning_rate": 4.213927469135803e-05, + "loss": 0.0224, + "step": 21690 + }, + { + "epoch": 0.1604032997250229, + "grad_norm": 0.08298230916261673, + "learning_rate": 4.2135565052231725e-05, + "loss": 0.0197, + "step": 21700 + }, + { + "epoch": 0.16047721829632478, + "grad_norm": 0.06460912525653839, + "learning_rate": 4.2131855413105415e-05, + "loss": 0.02, + "step": 21710 + }, + { + "epoch": 0.16055113686762662, + "grad_norm": 0.08635788410902023, + "learning_rate": 4.212814577397911e-05, + "loss": 0.0202, + "step": 21720 + }, + { + "epoch": 0.16062505543892847, + "grad_norm": 0.07899269461631775, + "learning_rate": 4.21244361348528e-05, + "loss": 0.0214, + "step": 21730 + }, + { + "epoch": 0.16069897401023034, + "grad_norm": 0.10716480016708374, + "learning_rate": 4.2120726495726496e-05, + "loss": 0.0219, + "step": 21740 + }, + { + "epoch": 0.16077289258153218, + "grad_norm": 0.07382979989051819, + "learning_rate": 4.211701685660019e-05, + "loss": 0.0176, + "step": 21750 + }, + { + "epoch": 0.16084681115283403, + "grad_norm": 0.08749647438526154, + "learning_rate": 4.211330721747389e-05, + "loss": 0.0221, + "step": 21760 + }, + { + "epoch": 0.1609207297241359, + "grad_norm": 0.08402451872825623, + "learning_rate": 4.2109597578347584e-05, + "loss": 0.0182, + "step": 21770 + }, + { + "epoch": 0.16099464829543775, + "grad_norm": 0.09932088106870651, + "learning_rate": 4.210588793922127e-05, + "loss": 0.0216, + "step": 21780 + }, + { + "epoch": 0.1610685668667396, + "grad_norm": 0.1020146980881691, + "learning_rate": 4.210217830009497e-05, + "loss": 0.0191, + "step": 21790 + }, + { + "epoch": 0.16114248543804147, + "grad_norm": 0.07757379114627838, + "learning_rate": 4.2098468660968665e-05, + "loss": 0.0177, + "step": 21800 + }, + { + "epoch": 0.1612164040093433, + "grad_norm": 0.09609977900981903, + "learning_rate": 4.2094759021842354e-05, + "loss": 0.0182, + "step": 21810 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.11304876953363419, + "learning_rate": 4.209104938271605e-05, + "loss": 0.017, + "step": 21820 + }, + { + "epoch": 0.161364241151947, + "grad_norm": 0.11363095790147781, + "learning_rate": 4.2087339743589746e-05, + "loss": 0.0219, + "step": 21830 + }, + { + "epoch": 0.16143815972324888, + "grad_norm": 0.09433922916650772, + 
"learning_rate": 4.208363010446344e-05, + "loss": 0.0179, + "step": 21840 + }, + { + "epoch": 0.16151207829455072, + "grad_norm": 0.0955984964966774, + "learning_rate": 4.207992046533714e-05, + "loss": 0.0195, + "step": 21850 + }, + { + "epoch": 0.16158599686585257, + "grad_norm": 0.09357141703367233, + "learning_rate": 4.207621082621083e-05, + "loss": 0.0198, + "step": 21860 + }, + { + "epoch": 0.16165991543715444, + "grad_norm": 0.08176647871732712, + "learning_rate": 4.207250118708452e-05, + "loss": 0.0175, + "step": 21870 + }, + { + "epoch": 0.16173383400845628, + "grad_norm": 0.10562469065189362, + "learning_rate": 4.206879154795821e-05, + "loss": 0.0197, + "step": 21880 + }, + { + "epoch": 0.16180775257975813, + "grad_norm": 0.10479024797677994, + "learning_rate": 4.206508190883191e-05, + "loss": 0.0205, + "step": 21890 + }, + { + "epoch": 0.16188167115106, + "grad_norm": 0.11500013619661331, + "learning_rate": 4.2061372269705604e-05, + "loss": 0.0208, + "step": 21900 + }, + { + "epoch": 0.16195558972236185, + "grad_norm": 0.0811997801065445, + "learning_rate": 4.20576626305793e-05, + "loss": 0.0186, + "step": 21910 + }, + { + "epoch": 0.1620295082936637, + "grad_norm": 0.08326699584722519, + "learning_rate": 4.2053952991452996e-05, + "loss": 0.0182, + "step": 21920 + }, + { + "epoch": 0.16210342686496557, + "grad_norm": 0.08639881014823914, + "learning_rate": 4.205024335232669e-05, + "loss": 0.0179, + "step": 21930 + }, + { + "epoch": 0.1621773454362674, + "grad_norm": 0.08141583949327469, + "learning_rate": 4.204653371320038e-05, + "loss": 0.0178, + "step": 21940 + }, + { + "epoch": 0.16225126400756926, + "grad_norm": 0.12498077005147934, + "learning_rate": 4.204282407407408e-05, + "loss": 0.0223, + "step": 21950 + }, + { + "epoch": 0.1623251825788711, + "grad_norm": 0.09967196732759476, + "learning_rate": 4.2039114434947766e-05, + "loss": 0.0201, + "step": 21960 + }, + { + "epoch": 0.16239910115017298, + "grad_norm": 0.1027727872133255, + "learning_rate": 4.203540479582146e-05, + "loss": 0.0209, + "step": 21970 + }, + { + "epoch": 0.16247301972147482, + "grad_norm": 0.08557023853063583, + "learning_rate": 4.203169515669516e-05, + "loss": 0.0185, + "step": 21980 + }, + { + "epoch": 0.16254693829277667, + "grad_norm": 0.09070567041635513, + "learning_rate": 4.2027985517568854e-05, + "loss": 0.0184, + "step": 21990 + }, + { + "epoch": 0.16262085686407854, + "grad_norm": 0.11034265905618668, + "learning_rate": 4.202427587844255e-05, + "loss": 0.0205, + "step": 22000 + }, + { + "epoch": 0.16269477543538038, + "grad_norm": 0.07222605496644974, + "learning_rate": 4.202056623931624e-05, + "loss": 0.0207, + "step": 22010 + }, + { + "epoch": 0.16276869400668223, + "grad_norm": 0.09490154683589935, + "learning_rate": 4.2016856600189935e-05, + "loss": 0.0174, + "step": 22020 + }, + { + "epoch": 0.1628426125779841, + "grad_norm": 0.1052534207701683, + "learning_rate": 4.201314696106363e-05, + "loss": 0.0205, + "step": 22030 + }, + { + "epoch": 0.16291653114928595, + "grad_norm": 0.06723055988550186, + "learning_rate": 4.200943732193732e-05, + "loss": 0.0203, + "step": 22040 + }, + { + "epoch": 0.1629904497205878, + "grad_norm": 0.06851140409708023, + "learning_rate": 4.2005727682811017e-05, + "loss": 0.0207, + "step": 22050 + }, + { + "epoch": 0.16306436829188967, + "grad_norm": 0.0777968242764473, + "learning_rate": 4.200201804368471e-05, + "loss": 0.0218, + "step": 22060 + }, + { + "epoch": 0.1631382868631915, + "grad_norm": 0.07410554587841034, + "learning_rate": 4.199830840455841e-05, + 
"loss": 0.0212, + "step": 22070 + }, + { + "epoch": 0.16321220543449336, + "grad_norm": 0.10092426836490631, + "learning_rate": 4.1994598765432104e-05, + "loss": 0.0203, + "step": 22080 + }, + { + "epoch": 0.1632861240057952, + "grad_norm": 0.10312744975090027, + "learning_rate": 4.1990889126305794e-05, + "loss": 0.0192, + "step": 22090 + }, + { + "epoch": 0.16336004257709708, + "grad_norm": 0.09843014925718307, + "learning_rate": 4.198717948717949e-05, + "loss": 0.0214, + "step": 22100 + }, + { + "epoch": 0.16343396114839892, + "grad_norm": 0.0896533951163292, + "learning_rate": 4.198346984805318e-05, + "loss": 0.0198, + "step": 22110 + }, + { + "epoch": 0.16350787971970077, + "grad_norm": 0.09775464981794357, + "learning_rate": 4.1979760208926875e-05, + "loss": 0.0197, + "step": 22120 + }, + { + "epoch": 0.16358179829100264, + "grad_norm": 0.11483138799667358, + "learning_rate": 4.197605056980057e-05, + "loss": 0.017, + "step": 22130 + }, + { + "epoch": 0.16365571686230448, + "grad_norm": 0.08392385393381119, + "learning_rate": 4.197234093067427e-05, + "loss": 0.0196, + "step": 22140 + }, + { + "epoch": 0.16372963543360633, + "grad_norm": 0.07911352068185806, + "learning_rate": 4.196863129154796e-05, + "loss": 0.0201, + "step": 22150 + }, + { + "epoch": 0.1638035540049082, + "grad_norm": 0.08723487704992294, + "learning_rate": 4.196492165242166e-05, + "loss": 0.0217, + "step": 22160 + }, + { + "epoch": 0.16387747257621005, + "grad_norm": 0.08294124156236649, + "learning_rate": 4.196121201329535e-05, + "loss": 0.017, + "step": 22170 + }, + { + "epoch": 0.1639513911475119, + "grad_norm": 0.08533074706792831, + "learning_rate": 4.1957502374169044e-05, + "loss": 0.0195, + "step": 22180 + }, + { + "epoch": 0.16402530971881377, + "grad_norm": 0.11008051037788391, + "learning_rate": 4.195379273504273e-05, + "loss": 0.0189, + "step": 22190 + }, + { + "epoch": 0.1640992282901156, + "grad_norm": 0.08344793319702148, + "learning_rate": 4.195008309591643e-05, + "loss": 0.0194, + "step": 22200 + }, + { + "epoch": 0.16417314686141746, + "grad_norm": 0.0651671513915062, + "learning_rate": 4.1946373456790125e-05, + "loss": 0.0202, + "step": 22210 + }, + { + "epoch": 0.16424706543271933, + "grad_norm": 0.06644406169652939, + "learning_rate": 4.194266381766382e-05, + "loss": 0.0201, + "step": 22220 + }, + { + "epoch": 0.16432098400402118, + "grad_norm": 0.08951295912265778, + "learning_rate": 4.193895417853752e-05, + "loss": 0.0175, + "step": 22230 + }, + { + "epoch": 0.16439490257532302, + "grad_norm": 0.09983833134174347, + "learning_rate": 4.1935244539411206e-05, + "loss": 0.0187, + "step": 22240 + }, + { + "epoch": 0.16446882114662487, + "grad_norm": 0.08202794194221497, + "learning_rate": 4.19315349002849e-05, + "loss": 0.019, + "step": 22250 + }, + { + "epoch": 0.16454273971792674, + "grad_norm": 0.10202916711568832, + "learning_rate": 4.19278252611586e-05, + "loss": 0.0193, + "step": 22260 + }, + { + "epoch": 0.16461665828922858, + "grad_norm": 0.09498098492622375, + "learning_rate": 4.192411562203229e-05, + "loss": 0.0206, + "step": 22270 + }, + { + "epoch": 0.16469057686053043, + "grad_norm": 0.07276278734207153, + "learning_rate": 4.192040598290598e-05, + "loss": 0.0216, + "step": 22280 + }, + { + "epoch": 0.1647644954318323, + "grad_norm": 0.09117782860994339, + "learning_rate": 4.191669634377968e-05, + "loss": 0.0194, + "step": 22290 + }, + { + "epoch": 0.16483841400313415, + "grad_norm": 0.08927126228809357, + "learning_rate": 4.1912986704653375e-05, + "loss": 0.0178, + "step": 22300 + }, + { 
+ "epoch": 0.164912332574436, + "grad_norm": 0.09630057960748672, + "learning_rate": 4.190927706552707e-05, + "loss": 0.0189, + "step": 22310 + }, + { + "epoch": 0.16498625114573787, + "grad_norm": 0.0837571918964386, + "learning_rate": 4.190556742640076e-05, + "loss": 0.0204, + "step": 22320 + }, + { + "epoch": 0.1650601697170397, + "grad_norm": 0.1012878343462944, + "learning_rate": 4.1901857787274456e-05, + "loss": 0.0198, + "step": 22330 + }, + { + "epoch": 0.16513408828834156, + "grad_norm": 0.11893408745527267, + "learning_rate": 4.1898148148148145e-05, + "loss": 0.019, + "step": 22340 + }, + { + "epoch": 0.16520800685964343, + "grad_norm": 0.07912056893110275, + "learning_rate": 4.189443850902184e-05, + "loss": 0.0191, + "step": 22350 + }, + { + "epoch": 0.16528192543094528, + "grad_norm": 0.11213517934083939, + "learning_rate": 4.189072886989554e-05, + "loss": 0.0196, + "step": 22360 + }, + { + "epoch": 0.16535584400224712, + "grad_norm": 0.08836356550455093, + "learning_rate": 4.188701923076923e-05, + "loss": 0.0166, + "step": 22370 + }, + { + "epoch": 0.16542976257354897, + "grad_norm": 0.07143910974264145, + "learning_rate": 4.188330959164293e-05, + "loss": 0.0226, + "step": 22380 + }, + { + "epoch": 0.16550368114485084, + "grad_norm": 0.08796315640211105, + "learning_rate": 4.1879599952516625e-05, + "loss": 0.0223, + "step": 22390 + }, + { + "epoch": 0.16557759971615268, + "grad_norm": 0.10248327255249023, + "learning_rate": 4.1875890313390314e-05, + "loss": 0.0219, + "step": 22400 + }, + { + "epoch": 0.16565151828745453, + "grad_norm": 0.10602504760026932, + "learning_rate": 4.187218067426401e-05, + "loss": 0.0195, + "step": 22410 + }, + { + "epoch": 0.1657254368587564, + "grad_norm": 0.087444968521595, + "learning_rate": 4.18684710351377e-05, + "loss": 0.0204, + "step": 22420 + }, + { + "epoch": 0.16579935543005825, + "grad_norm": 0.08767131716012955, + "learning_rate": 4.1864761396011396e-05, + "loss": 0.0208, + "step": 22430 + }, + { + "epoch": 0.1658732740013601, + "grad_norm": 0.10391967743635178, + "learning_rate": 4.186105175688509e-05, + "loss": 0.0211, + "step": 22440 + }, + { + "epoch": 0.16594719257266197, + "grad_norm": 0.06989863514900208, + "learning_rate": 4.185734211775879e-05, + "loss": 0.0213, + "step": 22450 + }, + { + "epoch": 0.1660211111439638, + "grad_norm": 0.10912055522203445, + "learning_rate": 4.1853632478632483e-05, + "loss": 0.0217, + "step": 22460 + }, + { + "epoch": 0.16609502971526566, + "grad_norm": 0.08249878883361816, + "learning_rate": 4.184992283950617e-05, + "loss": 0.0189, + "step": 22470 + }, + { + "epoch": 0.16616894828656753, + "grad_norm": 0.10511092096567154, + "learning_rate": 4.184621320037987e-05, + "loss": 0.0191, + "step": 22480 + }, + { + "epoch": 0.16624286685786938, + "grad_norm": 0.11219155788421631, + "learning_rate": 4.1842503561253565e-05, + "loss": 0.0207, + "step": 22490 + }, + { + "epoch": 0.16631678542917122, + "grad_norm": 0.09815070778131485, + "learning_rate": 4.1838793922127254e-05, + "loss": 0.0223, + "step": 22500 + }, + { + "epoch": 0.16639070400047307, + "grad_norm": 0.12557922303676605, + "learning_rate": 4.183508428300095e-05, + "loss": 0.0216, + "step": 22510 + }, + { + "epoch": 0.16646462257177494, + "grad_norm": 0.08924151957035065, + "learning_rate": 4.1831374643874646e-05, + "loss": 0.0211, + "step": 22520 + }, + { + "epoch": 0.16653854114307678, + "grad_norm": 0.0938444435596466, + "learning_rate": 4.182766500474834e-05, + "loss": 0.0178, + "step": 22530 + }, + { + "epoch": 0.16661245971437863, + 
"grad_norm": 0.06955837458372116, + "learning_rate": 4.182395536562204e-05, + "loss": 0.0189, + "step": 22540 + }, + { + "epoch": 0.1666863782856805, + "grad_norm": 0.09439922124147415, + "learning_rate": 4.182024572649573e-05, + "loss": 0.0193, + "step": 22550 + }, + { + "epoch": 0.16676029685698235, + "grad_norm": 0.07448028773069382, + "learning_rate": 4.181653608736942e-05, + "loss": 0.0218, + "step": 22560 + }, + { + "epoch": 0.1668342154282842, + "grad_norm": 0.08249770104885101, + "learning_rate": 4.181282644824311e-05, + "loss": 0.017, + "step": 22570 + }, + { + "epoch": 0.16690813399958607, + "grad_norm": 0.10910207033157349, + "learning_rate": 4.180911680911681e-05, + "loss": 0.0185, + "step": 22580 + }, + { + "epoch": 0.1669820525708879, + "grad_norm": 0.10977750271558762, + "learning_rate": 4.1805407169990504e-05, + "loss": 0.0187, + "step": 22590 + }, + { + "epoch": 0.16705597114218976, + "grad_norm": 0.06731917709112167, + "learning_rate": 4.18016975308642e-05, + "loss": 0.0191, + "step": 22600 + }, + { + "epoch": 0.16712988971349163, + "grad_norm": 0.09132971614599228, + "learning_rate": 4.1797987891737896e-05, + "loss": 0.0195, + "step": 22610 + }, + { + "epoch": 0.16720380828479348, + "grad_norm": 0.09934677183628082, + "learning_rate": 4.179427825261159e-05, + "loss": 0.019, + "step": 22620 + }, + { + "epoch": 0.16727772685609532, + "grad_norm": 0.09889721870422363, + "learning_rate": 4.179056861348528e-05, + "loss": 0.0222, + "step": 22630 + }, + { + "epoch": 0.16735164542739717, + "grad_norm": 0.09013309329748154, + "learning_rate": 4.178685897435898e-05, + "loss": 0.0201, + "step": 22640 + }, + { + "epoch": 0.16742556399869904, + "grad_norm": 0.09726738929748535, + "learning_rate": 4.1783149335232666e-05, + "loss": 0.0192, + "step": 22650 + }, + { + "epoch": 0.16749948257000088, + "grad_norm": 0.1414596438407898, + "learning_rate": 4.177943969610636e-05, + "loss": 0.0186, + "step": 22660 + }, + { + "epoch": 0.16757340114130273, + "grad_norm": 0.09074228256940842, + "learning_rate": 4.177573005698006e-05, + "loss": 0.0217, + "step": 22670 + }, + { + "epoch": 0.1676473197126046, + "grad_norm": 0.0756804347038269, + "learning_rate": 4.1772020417853754e-05, + "loss": 0.0202, + "step": 22680 + }, + { + "epoch": 0.16772123828390645, + "grad_norm": 0.08531906455755234, + "learning_rate": 4.176831077872745e-05, + "loss": 0.0182, + "step": 22690 + }, + { + "epoch": 0.1677951568552083, + "grad_norm": 0.12203525751829147, + "learning_rate": 4.176460113960114e-05, + "loss": 0.0202, + "step": 22700 + }, + { + "epoch": 0.16786907542651017, + "grad_norm": 0.06335960328578949, + "learning_rate": 4.1760891500474835e-05, + "loss": 0.0228, + "step": 22710 + }, + { + "epoch": 0.167942993997812, + "grad_norm": 0.060673534870147705, + "learning_rate": 4.175718186134853e-05, + "loss": 0.0162, + "step": 22720 + }, + { + "epoch": 0.16801691256911386, + "grad_norm": 0.08812720328569412, + "learning_rate": 4.175347222222222e-05, + "loss": 0.0203, + "step": 22730 + }, + { + "epoch": 0.16809083114041573, + "grad_norm": 0.07789555191993713, + "learning_rate": 4.1749762583095916e-05, + "loss": 0.0182, + "step": 22740 + }, + { + "epoch": 0.16816474971171758, + "grad_norm": 0.07631140947341919, + "learning_rate": 4.174605294396961e-05, + "loss": 0.0219, + "step": 22750 + }, + { + "epoch": 0.16823866828301942, + "grad_norm": 0.08663522452116013, + "learning_rate": 4.174234330484331e-05, + "loss": 0.0185, + "step": 22760 + }, + { + "epoch": 0.16831258685432127, + "grad_norm": 0.12132871896028519, + 
"learning_rate": 4.1738633665717004e-05, + "loss": 0.0195, + "step": 22770 + }, + { + "epoch": 0.16838650542562314, + "grad_norm": 0.07200045883655548, + "learning_rate": 4.173492402659069e-05, + "loss": 0.0196, + "step": 22780 + }, + { + "epoch": 0.16846042399692499, + "grad_norm": 0.08001714944839478, + "learning_rate": 4.173121438746439e-05, + "loss": 0.0219, + "step": 22790 + }, + { + "epoch": 0.16853434256822683, + "grad_norm": 0.07372362911701202, + "learning_rate": 4.172750474833808e-05, + "loss": 0.0185, + "step": 22800 + }, + { + "epoch": 0.1686082611395287, + "grad_norm": 0.08221118152141571, + "learning_rate": 4.1723795109211774e-05, + "loss": 0.0192, + "step": 22810 + }, + { + "epoch": 0.16868217971083055, + "grad_norm": 0.0778815820813179, + "learning_rate": 4.172008547008547e-05, + "loss": 0.0196, + "step": 22820 + }, + { + "epoch": 0.1687560982821324, + "grad_norm": 0.10039696842432022, + "learning_rate": 4.1716375830959166e-05, + "loss": 0.0243, + "step": 22830 + }, + { + "epoch": 0.16883001685343427, + "grad_norm": 0.1025981679558754, + "learning_rate": 4.171266619183286e-05, + "loss": 0.0186, + "step": 22840 + }, + { + "epoch": 0.1689039354247361, + "grad_norm": 0.07049137353897095, + "learning_rate": 4.170895655270656e-05, + "loss": 0.0186, + "step": 22850 + }, + { + "epoch": 0.16897785399603796, + "grad_norm": 0.1277700960636139, + "learning_rate": 4.170524691358025e-05, + "loss": 0.0171, + "step": 22860 + }, + { + "epoch": 0.16905177256733983, + "grad_norm": 0.07361399382352829, + "learning_rate": 4.1701537274453944e-05, + "loss": 0.021, + "step": 22870 + }, + { + "epoch": 0.16912569113864168, + "grad_norm": 0.07534274458885193, + "learning_rate": 4.169782763532763e-05, + "loss": 0.021, + "step": 22880 + }, + { + "epoch": 0.16919960970994352, + "grad_norm": 0.09960392117500305, + "learning_rate": 4.169411799620133e-05, + "loss": 0.0179, + "step": 22890 + }, + { + "epoch": 0.16927352828124537, + "grad_norm": 0.08838877826929092, + "learning_rate": 4.169040835707503e-05, + "loss": 0.0211, + "step": 22900 + }, + { + "epoch": 0.16934744685254724, + "grad_norm": 0.12791630625724792, + "learning_rate": 4.168669871794872e-05, + "loss": 0.021, + "step": 22910 + }, + { + "epoch": 0.16942136542384909, + "grad_norm": 0.1230262815952301, + "learning_rate": 4.1682989078822417e-05, + "loss": 0.0173, + "step": 22920 + }, + { + "epoch": 0.16949528399515093, + "grad_norm": 0.14303380250930786, + "learning_rate": 4.1679279439696106e-05, + "loss": 0.0198, + "step": 22930 + }, + { + "epoch": 0.1695692025664528, + "grad_norm": 0.1258019655942917, + "learning_rate": 4.16755698005698e-05, + "loss": 0.0214, + "step": 22940 + }, + { + "epoch": 0.16964312113775465, + "grad_norm": 0.09205642342567444, + "learning_rate": 4.16718601614435e-05, + "loss": 0.0194, + "step": 22950 + }, + { + "epoch": 0.1697170397090565, + "grad_norm": 0.08836735039949417, + "learning_rate": 4.166815052231719e-05, + "loss": 0.0197, + "step": 22960 + }, + { + "epoch": 0.16979095828035837, + "grad_norm": 0.09797254949808121, + "learning_rate": 4.166444088319088e-05, + "loss": 0.0196, + "step": 22970 + }, + { + "epoch": 0.1698648768516602, + "grad_norm": 0.0916217640042305, + "learning_rate": 4.166073124406458e-05, + "loss": 0.0193, + "step": 22980 + }, + { + "epoch": 0.16993879542296206, + "grad_norm": 0.07919829338788986, + "learning_rate": 4.1657021604938275e-05, + "loss": 0.0196, + "step": 22990 + }, + { + "epoch": 0.17001271399426393, + "grad_norm": 0.09741434454917908, + "learning_rate": 4.165331196581197e-05, + 
"loss": 0.0211, + "step": 23000 + }, + { + "epoch": 0.17008663256556578, + "grad_norm": 0.12706300616264343, + "learning_rate": 4.164960232668566e-05, + "loss": 0.0207, + "step": 23010 + }, + { + "epoch": 0.17016055113686762, + "grad_norm": 0.11344017088413239, + "learning_rate": 4.1645892687559356e-05, + "loss": 0.0191, + "step": 23020 + }, + { + "epoch": 0.17023446970816947, + "grad_norm": 0.09292125701904297, + "learning_rate": 4.1642183048433045e-05, + "loss": 0.0205, + "step": 23030 + }, + { + "epoch": 0.17030838827947134, + "grad_norm": 0.0878138393163681, + "learning_rate": 4.163847340930674e-05, + "loss": 0.0227, + "step": 23040 + }, + { + "epoch": 0.17038230685077319, + "grad_norm": 0.09296700358390808, + "learning_rate": 4.1634763770180444e-05, + "loss": 0.0218, + "step": 23050 + }, + { + "epoch": 0.17045622542207503, + "grad_norm": 0.09765233844518661, + "learning_rate": 4.163105413105413e-05, + "loss": 0.0187, + "step": 23060 + }, + { + "epoch": 0.1705301439933769, + "grad_norm": 0.07625902444124222, + "learning_rate": 4.162734449192783e-05, + "loss": 0.0184, + "step": 23070 + }, + { + "epoch": 0.17060406256467875, + "grad_norm": 0.13808022439479828, + "learning_rate": 4.1623634852801525e-05, + "loss": 0.0199, + "step": 23080 + }, + { + "epoch": 0.1706779811359806, + "grad_norm": 0.1144128069281578, + "learning_rate": 4.1619925213675214e-05, + "loss": 0.0211, + "step": 23090 + }, + { + "epoch": 0.17075189970728247, + "grad_norm": 0.075762078166008, + "learning_rate": 4.161621557454891e-05, + "loss": 0.0187, + "step": 23100 + }, + { + "epoch": 0.1708258182785843, + "grad_norm": 0.08701431006193161, + "learning_rate": 4.16125059354226e-05, + "loss": 0.0223, + "step": 23110 + }, + { + "epoch": 0.17089973684988616, + "grad_norm": 0.10601526498794556, + "learning_rate": 4.1608796296296295e-05, + "loss": 0.0208, + "step": 23120 + }, + { + "epoch": 0.17097365542118803, + "grad_norm": 0.10111930966377258, + "learning_rate": 4.160508665717e-05, + "loss": 0.0189, + "step": 23130 + }, + { + "epoch": 0.17104757399248988, + "grad_norm": 0.09323742240667343, + "learning_rate": 4.160137701804369e-05, + "loss": 0.0184, + "step": 23140 + }, + { + "epoch": 0.17112149256379172, + "grad_norm": 0.10205500572919846, + "learning_rate": 4.159766737891738e-05, + "loss": 0.0194, + "step": 23150 + }, + { + "epoch": 0.1711954111350936, + "grad_norm": 0.07837098836898804, + "learning_rate": 4.159395773979107e-05, + "loss": 0.0225, + "step": 23160 + }, + { + "epoch": 0.17126932970639544, + "grad_norm": 0.08919687569141388, + "learning_rate": 4.159024810066477e-05, + "loss": 0.0194, + "step": 23170 + }, + { + "epoch": 0.17134324827769729, + "grad_norm": 0.09402740001678467, + "learning_rate": 4.1586538461538464e-05, + "loss": 0.0198, + "step": 23180 + }, + { + "epoch": 0.17141716684899913, + "grad_norm": 0.12361681461334229, + "learning_rate": 4.1582828822412153e-05, + "loss": 0.0179, + "step": 23190 + }, + { + "epoch": 0.171491085420301, + "grad_norm": 0.08092927187681198, + "learning_rate": 4.1579119183285856e-05, + "loss": 0.0206, + "step": 23200 + }, + { + "epoch": 0.17156500399160285, + "grad_norm": 0.1065574660897255, + "learning_rate": 4.1575409544159545e-05, + "loss": 0.0212, + "step": 23210 + }, + { + "epoch": 0.1716389225629047, + "grad_norm": 0.10360395163297653, + "learning_rate": 4.157169990503324e-05, + "loss": 0.02, + "step": 23220 + }, + { + "epoch": 0.17171284113420657, + "grad_norm": 0.11819800734519958, + "learning_rate": 4.156799026590694e-05, + "loss": 0.0194, + "step": 23230 + }, + { + 
"epoch": 0.1717867597055084, + "grad_norm": 0.1112608015537262, + "learning_rate": 4.1564280626780627e-05, + "loss": 0.0188, + "step": 23240 + }, + { + "epoch": 0.17186067827681026, + "grad_norm": 0.12329550087451935, + "learning_rate": 4.156057098765432e-05, + "loss": 0.0201, + "step": 23250 + }, + { + "epoch": 0.17193459684811213, + "grad_norm": 0.11046748608350754, + "learning_rate": 4.155686134852801e-05, + "loss": 0.0194, + "step": 23260 + }, + { + "epoch": 0.17200851541941398, + "grad_norm": 0.09428173303604126, + "learning_rate": 4.155315170940171e-05, + "loss": 0.0241, + "step": 23270 + }, + { + "epoch": 0.17208243399071582, + "grad_norm": 0.09326600283384323, + "learning_rate": 4.154944207027541e-05, + "loss": 0.0193, + "step": 23280 + }, + { + "epoch": 0.1721563525620177, + "grad_norm": 0.06932845711708069, + "learning_rate": 4.15457324311491e-05, + "loss": 0.0196, + "step": 23290 + }, + { + "epoch": 0.17223027113331954, + "grad_norm": 0.08526023477315903, + "learning_rate": 4.1542022792022796e-05, + "loss": 0.0191, + "step": 23300 + }, + { + "epoch": 0.17230418970462139, + "grad_norm": 0.07734812796115875, + "learning_rate": 4.153831315289649e-05, + "loss": 0.0191, + "step": 23310 + }, + { + "epoch": 0.17237810827592323, + "grad_norm": 0.08788762986660004, + "learning_rate": 4.153460351377018e-05, + "loss": 0.0205, + "step": 23320 + }, + { + "epoch": 0.1724520268472251, + "grad_norm": 0.12777380645275116, + "learning_rate": 4.153089387464388e-05, + "loss": 0.0171, + "step": 23330 + }, + { + "epoch": 0.17252594541852695, + "grad_norm": 0.08061201870441437, + "learning_rate": 4.1527184235517566e-05, + "loss": 0.021, + "step": 23340 + }, + { + "epoch": 0.1725998639898288, + "grad_norm": 0.08452122658491135, + "learning_rate": 4.152347459639127e-05, + "loss": 0.0183, + "step": 23350 + }, + { + "epoch": 0.17267378256113067, + "grad_norm": 0.08944263309240341, + "learning_rate": 4.1519764957264965e-05, + "loss": 0.0191, + "step": 23360 + }, + { + "epoch": 0.1727477011324325, + "grad_norm": 0.13580366969108582, + "learning_rate": 4.1516055318138654e-05, + "loss": 0.0216, + "step": 23370 + }, + { + "epoch": 0.17282161970373436, + "grad_norm": 0.10675542056560516, + "learning_rate": 4.151234567901235e-05, + "loss": 0.0192, + "step": 23380 + }, + { + "epoch": 0.17289553827503623, + "grad_norm": 0.09346351772546768, + "learning_rate": 4.150863603988604e-05, + "loss": 0.0201, + "step": 23390 + }, + { + "epoch": 0.17296945684633808, + "grad_norm": 0.09439420700073242, + "learning_rate": 4.1504926400759735e-05, + "loss": 0.022, + "step": 23400 + }, + { + "epoch": 0.17304337541763992, + "grad_norm": 0.06360670179128647, + "learning_rate": 4.150121676163343e-05, + "loss": 0.0207, + "step": 23410 + }, + { + "epoch": 0.1731172939889418, + "grad_norm": 0.07455422729253769, + "learning_rate": 4.149750712250712e-05, + "loss": 0.019, + "step": 23420 + }, + { + "epoch": 0.17319121256024364, + "grad_norm": 0.0845806896686554, + "learning_rate": 4.149379748338082e-05, + "loss": 0.0187, + "step": 23430 + }, + { + "epoch": 0.17326513113154549, + "grad_norm": 0.10896727442741394, + "learning_rate": 4.149008784425451e-05, + "loss": 0.0171, + "step": 23440 + }, + { + "epoch": 0.17333904970284733, + "grad_norm": 0.08428969979286194, + "learning_rate": 4.148637820512821e-05, + "loss": 0.0186, + "step": 23450 + }, + { + "epoch": 0.1734129682741492, + "grad_norm": 0.08725108951330185, + "learning_rate": 4.1482668566001904e-05, + "loss": 0.0203, + "step": 23460 + }, + { + "epoch": 0.17348688684545105, + 
"grad_norm": 0.07068879902362823, + "learning_rate": 4.147895892687559e-05, + "loss": 0.0203, + "step": 23470 + }, + { + "epoch": 0.1735608054167529, + "grad_norm": 0.09164883196353912, + "learning_rate": 4.147524928774929e-05, + "loss": 0.0172, + "step": 23480 + }, + { + "epoch": 0.17363472398805477, + "grad_norm": 0.12593930959701538, + "learning_rate": 4.147153964862298e-05, + "loss": 0.0213, + "step": 23490 + }, + { + "epoch": 0.1737086425593566, + "grad_norm": 0.08440453559160233, + "learning_rate": 4.1467830009496674e-05, + "loss": 0.0194, + "step": 23500 + }, + { + "epoch": 0.17378256113065846, + "grad_norm": 0.1044711023569107, + "learning_rate": 4.146412037037038e-05, + "loss": 0.02, + "step": 23510 + }, + { + "epoch": 0.17385647970196033, + "grad_norm": 0.08570936322212219, + "learning_rate": 4.1460410731244066e-05, + "loss": 0.0207, + "step": 23520 + }, + { + "epoch": 0.17393039827326218, + "grad_norm": 0.07355409860610962, + "learning_rate": 4.145670109211776e-05, + "loss": 0.0179, + "step": 23530 + }, + { + "epoch": 0.17400431684456402, + "grad_norm": 0.11744547635316849, + "learning_rate": 4.145299145299146e-05, + "loss": 0.023, + "step": 23540 + }, + { + "epoch": 0.1740782354158659, + "grad_norm": 0.10709847509860992, + "learning_rate": 4.144928181386515e-05, + "loss": 0.0206, + "step": 23550 + }, + { + "epoch": 0.17415215398716774, + "grad_norm": 0.10861247777938843, + "learning_rate": 4.144557217473884e-05, + "loss": 0.0199, + "step": 23560 + }, + { + "epoch": 0.17422607255846959, + "grad_norm": 0.09161026030778885, + "learning_rate": 4.144186253561253e-05, + "loss": 0.0193, + "step": 23570 + }, + { + "epoch": 0.17429999112977143, + "grad_norm": 0.09316155314445496, + "learning_rate": 4.1438152896486235e-05, + "loss": 0.023, + "step": 23580 + }, + { + "epoch": 0.1743739097010733, + "grad_norm": 0.07369499653577805, + "learning_rate": 4.143444325735993e-05, + "loss": 0.0194, + "step": 23590 + }, + { + "epoch": 0.17444782827237515, + "grad_norm": 0.10297918319702148, + "learning_rate": 4.143073361823362e-05, + "loss": 0.021, + "step": 23600 + }, + { + "epoch": 0.174521746843677, + "grad_norm": 0.07225877791643143, + "learning_rate": 4.1427023979107316e-05, + "loss": 0.0178, + "step": 23610 + }, + { + "epoch": 0.17459566541497887, + "grad_norm": 0.08436037600040436, + "learning_rate": 4.1423314339981006e-05, + "loss": 0.0198, + "step": 23620 + }, + { + "epoch": 0.1746695839862807, + "grad_norm": 0.08097023516893387, + "learning_rate": 4.14196047008547e-05, + "loss": 0.0172, + "step": 23630 + }, + { + "epoch": 0.17474350255758256, + "grad_norm": 0.09056053310632706, + "learning_rate": 4.14158950617284e-05, + "loss": 0.0195, + "step": 23640 + }, + { + "epoch": 0.17481742112888443, + "grad_norm": 0.11529424786567688, + "learning_rate": 4.141218542260209e-05, + "loss": 0.0207, + "step": 23650 + }, + { + "epoch": 0.17489133970018628, + "grad_norm": 0.08660317212343216, + "learning_rate": 4.140847578347579e-05, + "loss": 0.02, + "step": 23660 + }, + { + "epoch": 0.17496525827148812, + "grad_norm": 0.09512817114591599, + "learning_rate": 4.140476614434948e-05, + "loss": 0.0184, + "step": 23670 + }, + { + "epoch": 0.17503917684279, + "grad_norm": 0.07830807566642761, + "learning_rate": 4.1401056505223175e-05, + "loss": 0.0202, + "step": 23680 + }, + { + "epoch": 0.17511309541409184, + "grad_norm": 0.11256569623947144, + "learning_rate": 4.139734686609687e-05, + "loss": 0.0195, + "step": 23690 + }, + { + "epoch": 0.17518701398539369, + "grad_norm": 0.09436741471290588, + 
"learning_rate": 4.139363722697056e-05, + "loss": 0.0195, + "step": 23700 + }, + { + "epoch": 0.17526093255669553, + "grad_norm": 0.05403030291199684, + "learning_rate": 4.1389927587844256e-05, + "loss": 0.0202, + "step": 23710 + }, + { + "epoch": 0.1753348511279974, + "grad_norm": 0.08679580688476562, + "learning_rate": 4.1386217948717945e-05, + "loss": 0.0204, + "step": 23720 + }, + { + "epoch": 0.17540876969929925, + "grad_norm": 0.07748806476593018, + "learning_rate": 4.138250830959165e-05, + "loss": 0.0183, + "step": 23730 + }, + { + "epoch": 0.1754826882706011, + "grad_norm": 0.10431193560361862, + "learning_rate": 4.1378798670465344e-05, + "loss": 0.0203, + "step": 23740 + }, + { + "epoch": 0.17555660684190297, + "grad_norm": 0.09392105042934418, + "learning_rate": 4.137508903133903e-05, + "loss": 0.0193, + "step": 23750 + }, + { + "epoch": 0.1756305254132048, + "grad_norm": 0.08645907044410706, + "learning_rate": 4.137137939221273e-05, + "loss": 0.0186, + "step": 23760 + }, + { + "epoch": 0.17570444398450666, + "grad_norm": 0.0924372673034668, + "learning_rate": 4.1367669753086425e-05, + "loss": 0.0164, + "step": 23770 + }, + { + "epoch": 0.17577836255580853, + "grad_norm": 0.057999420911073685, + "learning_rate": 4.1363960113960114e-05, + "loss": 0.0186, + "step": 23780 + }, + { + "epoch": 0.17585228112711038, + "grad_norm": 0.08546840399503708, + "learning_rate": 4.136025047483381e-05, + "loss": 0.0181, + "step": 23790 + }, + { + "epoch": 0.17592619969841222, + "grad_norm": 0.09696268290281296, + "learning_rate": 4.13565408357075e-05, + "loss": 0.0212, + "step": 23800 + }, + { + "epoch": 0.1760001182697141, + "grad_norm": 0.09261220693588257, + "learning_rate": 4.13528311965812e-05, + "loss": 0.021, + "step": 23810 + }, + { + "epoch": 0.17607403684101594, + "grad_norm": 0.08623462170362473, + "learning_rate": 4.13491215574549e-05, + "loss": 0.0177, + "step": 23820 + }, + { + "epoch": 0.17614795541231779, + "grad_norm": 0.08633705228567123, + "learning_rate": 4.134541191832859e-05, + "loss": 0.0198, + "step": 23830 + }, + { + "epoch": 0.17622187398361963, + "grad_norm": 0.10043275356292725, + "learning_rate": 4.134170227920228e-05, + "loss": 0.0208, + "step": 23840 + }, + { + "epoch": 0.1762957925549215, + "grad_norm": 0.10127022117376328, + "learning_rate": 4.133799264007597e-05, + "loss": 0.0177, + "step": 23850 + }, + { + "epoch": 0.17636971112622335, + "grad_norm": 0.07167801260948181, + "learning_rate": 4.133428300094967e-05, + "loss": 0.0202, + "step": 23860 + }, + { + "epoch": 0.1764436296975252, + "grad_norm": 0.10874171555042267, + "learning_rate": 4.1330573361823364e-05, + "loss": 0.0179, + "step": 23870 + }, + { + "epoch": 0.17651754826882707, + "grad_norm": 0.08898110687732697, + "learning_rate": 4.132686372269706e-05, + "loss": 0.019, + "step": 23880 + }, + { + "epoch": 0.1765914668401289, + "grad_norm": 0.1021110787987709, + "learning_rate": 4.1323154083570756e-05, + "loss": 0.0211, + "step": 23890 + }, + { + "epoch": 0.17666538541143076, + "grad_norm": 0.10505396127700806, + "learning_rate": 4.1319444444444445e-05, + "loss": 0.0193, + "step": 23900 + }, + { + "epoch": 0.17673930398273263, + "grad_norm": 0.08702701330184937, + "learning_rate": 4.131573480531814e-05, + "loss": 0.0222, + "step": 23910 + }, + { + "epoch": 0.17681322255403448, + "grad_norm": 0.09350302815437317, + "learning_rate": 4.131202516619184e-05, + "loss": 0.0213, + "step": 23920 + }, + { + "epoch": 0.17688714112533632, + "grad_norm": 0.0829959511756897, + "learning_rate": 4.1308315527065526e-05, 
+ "loss": 0.0214, + "step": 23930 + }, + { + "epoch": 0.1769610596966382, + "grad_norm": 0.08366640657186508, + "learning_rate": 4.130460588793922e-05, + "loss": 0.0185, + "step": 23940 + }, + { + "epoch": 0.17703497826794004, + "grad_norm": 0.071568563580513, + "learning_rate": 4.130089624881291e-05, + "loss": 0.017, + "step": 23950 + }, + { + "epoch": 0.17710889683924189, + "grad_norm": 0.08128269761800766, + "learning_rate": 4.1297186609686614e-05, + "loss": 0.0169, + "step": 23960 + }, + { + "epoch": 0.17718281541054373, + "grad_norm": 0.08183693885803223, + "learning_rate": 4.129347697056031e-05, + "loss": 0.021, + "step": 23970 + }, + { + "epoch": 0.1772567339818456, + "grad_norm": 0.08033385127782822, + "learning_rate": 4.1289767331434e-05, + "loss": 0.019, + "step": 23980 + }, + { + "epoch": 0.17733065255314745, + "grad_norm": 0.11473685503005981, + "learning_rate": 4.1286057692307695e-05, + "loss": 0.0192, + "step": 23990 + }, + { + "epoch": 0.1774045711244493, + "grad_norm": 0.09031099826097488, + "learning_rate": 4.128234805318139e-05, + "loss": 0.0192, + "step": 24000 + }, + { + "epoch": 0.17747848969575117, + "grad_norm": 0.07809562981128693, + "learning_rate": 4.127863841405508e-05, + "loss": 0.0198, + "step": 24010 + }, + { + "epoch": 0.177552408267053, + "grad_norm": 0.08787760883569717, + "learning_rate": 4.1274928774928776e-05, + "loss": 0.0207, + "step": 24020 + }, + { + "epoch": 0.17762632683835486, + "grad_norm": 0.08965402096509933, + "learning_rate": 4.127121913580247e-05, + "loss": 0.0235, + "step": 24030 + }, + { + "epoch": 0.17770024540965673, + "grad_norm": 0.18066206574440002, + "learning_rate": 4.126750949667617e-05, + "loss": 0.0217, + "step": 24040 + }, + { + "epoch": 0.17777416398095858, + "grad_norm": 0.12065722048282623, + "learning_rate": 4.1263799857549864e-05, + "loss": 0.019, + "step": 24050 + }, + { + "epoch": 0.17784808255226042, + "grad_norm": 0.06909769028425217, + "learning_rate": 4.1260090218423554e-05, + "loss": 0.0209, + "step": 24060 + }, + { + "epoch": 0.1779220011235623, + "grad_norm": 0.09605135023593903, + "learning_rate": 4.125638057929725e-05, + "loss": 0.02, + "step": 24070 + }, + { + "epoch": 0.17799591969486414, + "grad_norm": 0.11883515864610672, + "learning_rate": 4.125267094017094e-05, + "loss": 0.0204, + "step": 24080 + }, + { + "epoch": 0.178069838266166, + "grad_norm": 0.06468243896961212, + "learning_rate": 4.1248961301044635e-05, + "loss": 0.0199, + "step": 24090 + }, + { + "epoch": 0.17814375683746786, + "grad_norm": 0.08295508474111557, + "learning_rate": 4.124525166191833e-05, + "loss": 0.0199, + "step": 24100 + }, + { + "epoch": 0.1782176754087697, + "grad_norm": 0.06782054901123047, + "learning_rate": 4.1241542022792027e-05, + "loss": 0.0189, + "step": 24110 + }, + { + "epoch": 0.17829159398007155, + "grad_norm": 0.09335844218730927, + "learning_rate": 4.123783238366572e-05, + "loss": 0.0213, + "step": 24120 + }, + { + "epoch": 0.1783655125513734, + "grad_norm": 0.08745142072439194, + "learning_rate": 4.123412274453941e-05, + "loss": 0.0179, + "step": 24130 + }, + { + "epoch": 0.17843943112267527, + "grad_norm": 0.09213876724243164, + "learning_rate": 4.123041310541311e-05, + "loss": 0.0185, + "step": 24140 + }, + { + "epoch": 0.1785133496939771, + "grad_norm": 0.12334982305765152, + "learning_rate": 4.1226703466286804e-05, + "loss": 0.0236, + "step": 24150 + }, + { + "epoch": 0.17858726826527896, + "grad_norm": 0.07247071713209152, + "learning_rate": 4.122299382716049e-05, + "loss": 0.0181, + "step": 24160 + }, + { + 
"epoch": 0.17866118683658083, + "grad_norm": 0.07788656651973724, + "learning_rate": 4.121928418803419e-05, + "loss": 0.0203, + "step": 24170 + }, + { + "epoch": 0.17873510540788268, + "grad_norm": 0.08412611484527588, + "learning_rate": 4.1215574548907885e-05, + "loss": 0.0218, + "step": 24180 + }, + { + "epoch": 0.17880902397918452, + "grad_norm": 0.10411692410707474, + "learning_rate": 4.121186490978158e-05, + "loss": 0.0186, + "step": 24190 + }, + { + "epoch": 0.1788829425504864, + "grad_norm": 0.09697825461626053, + "learning_rate": 4.120815527065528e-05, + "loss": 0.0191, + "step": 24200 + }, + { + "epoch": 0.17895686112178824, + "grad_norm": 0.11236854642629623, + "learning_rate": 4.1204445631528966e-05, + "loss": 0.0213, + "step": 24210 + }, + { + "epoch": 0.1790307796930901, + "grad_norm": 0.06983228772878647, + "learning_rate": 4.120073599240266e-05, + "loss": 0.0172, + "step": 24220 + }, + { + "epoch": 0.17910469826439196, + "grad_norm": 0.10572890192270279, + "learning_rate": 4.119702635327636e-05, + "loss": 0.0206, + "step": 24230 + }, + { + "epoch": 0.1791786168356938, + "grad_norm": 0.08477522432804108, + "learning_rate": 4.119331671415005e-05, + "loss": 0.0185, + "step": 24240 + }, + { + "epoch": 0.17925253540699565, + "grad_norm": 0.08904995024204254, + "learning_rate": 4.118960707502374e-05, + "loss": 0.0215, + "step": 24250 + }, + { + "epoch": 0.1793264539782975, + "grad_norm": 0.11577491462230682, + "learning_rate": 4.118589743589744e-05, + "loss": 0.0193, + "step": 24260 + }, + { + "epoch": 0.17940037254959937, + "grad_norm": 0.1336565613746643, + "learning_rate": 4.1182187796771135e-05, + "loss": 0.0194, + "step": 24270 + }, + { + "epoch": 0.1794742911209012, + "grad_norm": 0.09863122552633286, + "learning_rate": 4.117847815764483e-05, + "loss": 0.0192, + "step": 24280 + }, + { + "epoch": 0.17954820969220306, + "grad_norm": 0.071408212184906, + "learning_rate": 4.117476851851852e-05, + "loss": 0.0188, + "step": 24290 + }, + { + "epoch": 0.17962212826350493, + "grad_norm": 0.078492172062397, + "learning_rate": 4.1171058879392216e-05, + "loss": 0.0186, + "step": 24300 + }, + { + "epoch": 0.17969604683480678, + "grad_norm": 0.08483748137950897, + "learning_rate": 4.1167349240265905e-05, + "loss": 0.0187, + "step": 24310 + }, + { + "epoch": 0.17976996540610862, + "grad_norm": 0.08622529357671738, + "learning_rate": 4.11636396011396e-05, + "loss": 0.0201, + "step": 24320 + }, + { + "epoch": 0.1798438839774105, + "grad_norm": 0.09336697310209274, + "learning_rate": 4.11599299620133e-05, + "loss": 0.0192, + "step": 24330 + }, + { + "epoch": 0.17991780254871234, + "grad_norm": 0.11593308299779892, + "learning_rate": 4.115622032288699e-05, + "loss": 0.0198, + "step": 24340 + }, + { + "epoch": 0.1799917211200142, + "grad_norm": 0.09057379513978958, + "learning_rate": 4.115251068376069e-05, + "loss": 0.0174, + "step": 24350 + }, + { + "epoch": 0.18006563969131606, + "grad_norm": 0.0750114917755127, + "learning_rate": 4.114880104463438e-05, + "loss": 0.0195, + "step": 24360 + }, + { + "epoch": 0.1801395582626179, + "grad_norm": 0.0803409218788147, + "learning_rate": 4.1145091405508074e-05, + "loss": 0.0195, + "step": 24370 + }, + { + "epoch": 0.18021347683391975, + "grad_norm": 0.07491756230592728, + "learning_rate": 4.114138176638177e-05, + "loss": 0.0194, + "step": 24380 + }, + { + "epoch": 0.1802873954052216, + "grad_norm": 0.07920035719871521, + "learning_rate": 4.113767212725546e-05, + "loss": 0.0183, + "step": 24390 + }, + { + "epoch": 0.18036131397652347, + "grad_norm": 
0.075667604804039, + "learning_rate": 4.1133962488129155e-05, + "loss": 0.0212, + "step": 24400 + }, + { + "epoch": 0.1804352325478253, + "grad_norm": 0.0879991203546524, + "learning_rate": 4.113025284900285e-05, + "loss": 0.0217, + "step": 24410 + }, + { + "epoch": 0.18050915111912716, + "grad_norm": 0.08139169216156006, + "learning_rate": 4.112654320987655e-05, + "loss": 0.0189, + "step": 24420 + }, + { + "epoch": 0.18058306969042903, + "grad_norm": 0.08189604431390762, + "learning_rate": 4.112283357075024e-05, + "loss": 0.0223, + "step": 24430 + }, + { + "epoch": 0.18065698826173088, + "grad_norm": 0.09923578053712845, + "learning_rate": 4.111912393162393e-05, + "loss": 0.0206, + "step": 24440 + }, + { + "epoch": 0.18073090683303272, + "grad_norm": 0.10648280382156372, + "learning_rate": 4.111541429249763e-05, + "loss": 0.0196, + "step": 24450 + }, + { + "epoch": 0.1808048254043346, + "grad_norm": 0.11174852401018143, + "learning_rate": 4.1111704653371324e-05, + "loss": 0.0212, + "step": 24460 + }, + { + "epoch": 0.18087874397563644, + "grad_norm": 0.08722157776355743, + "learning_rate": 4.1107995014245014e-05, + "loss": 0.0194, + "step": 24470 + }, + { + "epoch": 0.1809526625469383, + "grad_norm": 0.08150345087051392, + "learning_rate": 4.110428537511871e-05, + "loss": 0.0192, + "step": 24480 + }, + { + "epoch": 0.18102658111824016, + "grad_norm": 0.09848983585834503, + "learning_rate": 4.1100575735992406e-05, + "loss": 0.0196, + "step": 24490 + }, + { + "epoch": 0.181100499689542, + "grad_norm": 0.07330475002527237, + "learning_rate": 4.10968660968661e-05, + "loss": 0.0189, + "step": 24500 + }, + { + "epoch": 0.18117441826084385, + "grad_norm": 0.08347219973802567, + "learning_rate": 4.10931564577398e-05, + "loss": 0.0175, + "step": 24510 + }, + { + "epoch": 0.1812483368321457, + "grad_norm": 0.11764758080244064, + "learning_rate": 4.108944681861349e-05, + "loss": 0.024, + "step": 24520 + }, + { + "epoch": 0.18132225540344757, + "grad_norm": 0.07574162632226944, + "learning_rate": 4.108573717948718e-05, + "loss": 0.0205, + "step": 24530 + }, + { + "epoch": 0.18139617397474941, + "grad_norm": 0.09680452197790146, + "learning_rate": 4.108202754036087e-05, + "loss": 0.0243, + "step": 24540 + }, + { + "epoch": 0.18147009254605126, + "grad_norm": 0.08307170122861862, + "learning_rate": 4.107831790123457e-05, + "loss": 0.0205, + "step": 24550 + }, + { + "epoch": 0.18154401111735313, + "grad_norm": 0.09651833027601242, + "learning_rate": 4.1074608262108264e-05, + "loss": 0.0194, + "step": 24560 + }, + { + "epoch": 0.18161792968865498, + "grad_norm": 0.09410324692726135, + "learning_rate": 4.107089862298196e-05, + "loss": 0.0205, + "step": 24570 + }, + { + "epoch": 0.18169184825995682, + "grad_norm": 0.08339502662420273, + "learning_rate": 4.1067188983855656e-05, + "loss": 0.0201, + "step": 24580 + }, + { + "epoch": 0.1817657668312587, + "grad_norm": 0.08488886058330536, + "learning_rate": 4.1063479344729345e-05, + "loss": 0.0205, + "step": 24590 + }, + { + "epoch": 0.18183968540256054, + "grad_norm": 0.0752316489815712, + "learning_rate": 4.105976970560304e-05, + "loss": 0.0223, + "step": 24600 + }, + { + "epoch": 0.1819136039738624, + "grad_norm": 0.08579552173614502, + "learning_rate": 4.105606006647674e-05, + "loss": 0.0198, + "step": 24610 + }, + { + "epoch": 0.18198752254516426, + "grad_norm": 0.06785819679498672, + "learning_rate": 4.1052350427350426e-05, + "loss": 0.019, + "step": 24620 + }, + { + "epoch": 0.1820614411164661, + "grad_norm": 0.09498974680900574, + "learning_rate": 
4.104864078822412e-05, + "loss": 0.0202, + "step": 24630 + }, + { + "epoch": 0.18213535968776795, + "grad_norm": 0.08310835808515549, + "learning_rate": 4.104493114909782e-05, + "loss": 0.0188, + "step": 24640 + }, + { + "epoch": 0.1822092782590698, + "grad_norm": 0.07385044544935226, + "learning_rate": 4.1041221509971514e-05, + "loss": 0.018, + "step": 24650 + }, + { + "epoch": 0.18228319683037167, + "grad_norm": 0.09191499650478363, + "learning_rate": 4.103751187084521e-05, + "loss": 0.0183, + "step": 24660 + }, + { + "epoch": 0.18235711540167351, + "grad_norm": 0.09280231595039368, + "learning_rate": 4.10338022317189e-05, + "loss": 0.0178, + "step": 24670 + }, + { + "epoch": 0.18243103397297536, + "grad_norm": 0.1001996323466301, + "learning_rate": 4.1030092592592595e-05, + "loss": 0.0179, + "step": 24680 + }, + { + "epoch": 0.18250495254427723, + "grad_norm": 0.08631417155265808, + "learning_rate": 4.102638295346629e-05, + "loss": 0.0205, + "step": 24690 + }, + { + "epoch": 0.18257887111557908, + "grad_norm": 0.07611658424139023, + "learning_rate": 4.102267331433998e-05, + "loss": 0.0189, + "step": 24700 + }, + { + "epoch": 0.18265278968688092, + "grad_norm": 0.09822043776512146, + "learning_rate": 4.1018963675213676e-05, + "loss": 0.022, + "step": 24710 + }, + { + "epoch": 0.1827267082581828, + "grad_norm": 0.08817804604768753, + "learning_rate": 4.101525403608737e-05, + "loss": 0.0192, + "step": 24720 + }, + { + "epoch": 0.18280062682948464, + "grad_norm": 0.08359253406524658, + "learning_rate": 4.101154439696107e-05, + "loss": 0.0247, + "step": 24730 + }, + { + "epoch": 0.1828745454007865, + "grad_norm": 0.11328428238630295, + "learning_rate": 4.1007834757834764e-05, + "loss": 0.0209, + "step": 24740 + }, + { + "epoch": 0.18294846397208836, + "grad_norm": 0.09812841564416885, + "learning_rate": 4.100412511870845e-05, + "loss": 0.0181, + "step": 24750 + }, + { + "epoch": 0.1830223825433902, + "grad_norm": 0.09421461820602417, + "learning_rate": 4.100041547958215e-05, + "loss": 0.0188, + "step": 24760 + }, + { + "epoch": 0.18309630111469205, + "grad_norm": 0.10235374420881271, + "learning_rate": 4.099670584045584e-05, + "loss": 0.0176, + "step": 24770 + }, + { + "epoch": 0.1831702196859939, + "grad_norm": 0.10455603897571564, + "learning_rate": 4.0992996201329534e-05, + "loss": 0.0211, + "step": 24780 + }, + { + "epoch": 0.18324413825729577, + "grad_norm": 0.07605478167533875, + "learning_rate": 4.098928656220323e-05, + "loss": 0.0207, + "step": 24790 + }, + { + "epoch": 0.18331805682859761, + "grad_norm": 0.11007492244243622, + "learning_rate": 4.0985576923076926e-05, + "loss": 0.0204, + "step": 24800 + }, + { + "epoch": 0.18339197539989946, + "grad_norm": 0.08086639642715454, + "learning_rate": 4.098186728395062e-05, + "loss": 0.0194, + "step": 24810 + }, + { + "epoch": 0.18346589397120133, + "grad_norm": 0.11971864104270935, + "learning_rate": 4.097815764482431e-05, + "loss": 0.0206, + "step": 24820 + }, + { + "epoch": 0.18353981254250318, + "grad_norm": 0.09048326313495636, + "learning_rate": 4.097444800569801e-05, + "loss": 0.02, + "step": 24830 + }, + { + "epoch": 0.18361373111380502, + "grad_norm": 0.08733789622783661, + "learning_rate": 4.0970738366571703e-05, + "loss": 0.0185, + "step": 24840 + }, + { + "epoch": 0.1836876496851069, + "grad_norm": 0.07789760828018188, + "learning_rate": 4.096702872744539e-05, + "loss": 0.0186, + "step": 24850 + }, + { + "epoch": 0.18376156825640874, + "grad_norm": 0.06854735314846039, + "learning_rate": 4.096331908831909e-05, + "loss": 0.0203, 
+ "step": 24860 + }, + { + "epoch": 0.1838354868277106, + "grad_norm": 0.09135521203279495, + "learning_rate": 4.0959609449192785e-05, + "loss": 0.0199, + "step": 24870 + }, + { + "epoch": 0.18390940539901246, + "grad_norm": 0.08071056753396988, + "learning_rate": 4.095589981006648e-05, + "loss": 0.0191, + "step": 24880 + }, + { + "epoch": 0.1839833239703143, + "grad_norm": 0.08886650949716568, + "learning_rate": 4.0952190170940176e-05, + "loss": 0.0198, + "step": 24890 + }, + { + "epoch": 0.18405724254161615, + "grad_norm": 0.13103017210960388, + "learning_rate": 4.0948480531813866e-05, + "loss": 0.0199, + "step": 24900 + }, + { + "epoch": 0.184131161112918, + "grad_norm": 0.0853632241487503, + "learning_rate": 4.094477089268756e-05, + "loss": 0.0177, + "step": 24910 + }, + { + "epoch": 0.18420507968421987, + "grad_norm": 0.09077557921409607, + "learning_rate": 4.094106125356126e-05, + "loss": 0.0203, + "step": 24920 + }, + { + "epoch": 0.18427899825552171, + "grad_norm": 0.0909472331404686, + "learning_rate": 4.093735161443495e-05, + "loss": 0.0205, + "step": 24930 + }, + { + "epoch": 0.18435291682682356, + "grad_norm": 0.10778666287660599, + "learning_rate": 4.093364197530864e-05, + "loss": 0.0185, + "step": 24940 + }, + { + "epoch": 0.18442683539812543, + "grad_norm": 0.08606761693954468, + "learning_rate": 4.092993233618234e-05, + "loss": 0.0193, + "step": 24950 + }, + { + "epoch": 0.18450075396942728, + "grad_norm": 0.11441691964864731, + "learning_rate": 4.0926222697056035e-05, + "loss": 0.0209, + "step": 24960 + }, + { + "epoch": 0.18457467254072912, + "grad_norm": 0.08625328540802002, + "learning_rate": 4.092251305792973e-05, + "loss": 0.0198, + "step": 24970 + }, + { + "epoch": 0.184648591112031, + "grad_norm": 0.10141268372535706, + "learning_rate": 4.091880341880342e-05, + "loss": 0.0184, + "step": 24980 + }, + { + "epoch": 0.18472250968333284, + "grad_norm": 0.09997362643480301, + "learning_rate": 4.0915093779677116e-05, + "loss": 0.0222, + "step": 24990 + }, + { + "epoch": 0.1847964282546347, + "grad_norm": 0.1047859638929367, + "learning_rate": 4.0911384140550805e-05, + "loss": 0.0222, + "step": 25000 + }, + { + "epoch": 0.18487034682593656, + "grad_norm": 0.09549182653427124, + "learning_rate": 4.09076745014245e-05, + "loss": 0.0193, + "step": 25010 + }, + { + "epoch": 0.1849442653972384, + "grad_norm": 0.08501344919204712, + "learning_rate": 4.09039648622982e-05, + "loss": 0.0219, + "step": 25020 + }, + { + "epoch": 0.18501818396854025, + "grad_norm": 0.0857548862695694, + "learning_rate": 4.090025522317189e-05, + "loss": 0.017, + "step": 25030 + }, + { + "epoch": 0.18509210253984212, + "grad_norm": 0.07982197403907776, + "learning_rate": 4.089654558404559e-05, + "loss": 0.0202, + "step": 25040 + }, + { + "epoch": 0.18516602111114397, + "grad_norm": 0.0914795845746994, + "learning_rate": 4.089283594491928e-05, + "loss": 0.0193, + "step": 25050 + }, + { + "epoch": 0.18523993968244581, + "grad_norm": 0.09756156802177429, + "learning_rate": 4.0889126305792974e-05, + "loss": 0.0203, + "step": 25060 + }, + { + "epoch": 0.18531385825374766, + "grad_norm": 0.0878814235329628, + "learning_rate": 4.088541666666667e-05, + "loss": 0.0188, + "step": 25070 + }, + { + "epoch": 0.18538777682504953, + "grad_norm": 0.09018289297819138, + "learning_rate": 4.088170702754036e-05, + "loss": 0.0214, + "step": 25080 + }, + { + "epoch": 0.18546169539635138, + "grad_norm": 0.09078127145767212, + "learning_rate": 4.0877997388414055e-05, + "loss": 0.0199, + "step": 25090 + }, + { + "epoch": 
0.18553561396765322, + "grad_norm": 0.10447991639375687, + "learning_rate": 4.087428774928775e-05, + "loss": 0.0199, + "step": 25100 + }, + { + "epoch": 0.1856095325389551, + "grad_norm": 0.08599971979856491, + "learning_rate": 4.087057811016145e-05, + "loss": 0.0219, + "step": 25110 + }, + { + "epoch": 0.18568345111025694, + "grad_norm": 0.08772403746843338, + "learning_rate": 4.086686847103514e-05, + "loss": 0.0189, + "step": 25120 + }, + { + "epoch": 0.1857573696815588, + "grad_norm": 0.1319970190525055, + "learning_rate": 4.086315883190883e-05, + "loss": 0.0187, + "step": 25130 + }, + { + "epoch": 0.18583128825286066, + "grad_norm": 0.06123780831694603, + "learning_rate": 4.085944919278253e-05, + "loss": 0.021, + "step": 25140 + }, + { + "epoch": 0.1859052068241625, + "grad_norm": 0.07819724828004837, + "learning_rate": 4.0855739553656224e-05, + "loss": 0.0207, + "step": 25150 + }, + { + "epoch": 0.18597912539546435, + "grad_norm": 0.12462358176708221, + "learning_rate": 4.085202991452991e-05, + "loss": 0.0214, + "step": 25160 + }, + { + "epoch": 0.18605304396676622, + "grad_norm": 0.09657600522041321, + "learning_rate": 4.084832027540361e-05, + "loss": 0.0204, + "step": 25170 + }, + { + "epoch": 0.18612696253806807, + "grad_norm": 0.10215092450380325, + "learning_rate": 4.0844610636277305e-05, + "loss": 0.0206, + "step": 25180 + }, + { + "epoch": 0.18620088110936991, + "grad_norm": 0.07794997841119766, + "learning_rate": 4.0840900997151e-05, + "loss": 0.0187, + "step": 25190 + }, + { + "epoch": 0.18627479968067176, + "grad_norm": 0.10502193123102188, + "learning_rate": 4.08371913580247e-05, + "loss": 0.0198, + "step": 25200 + }, + { + "epoch": 0.18634871825197363, + "grad_norm": 0.08818890154361725, + "learning_rate": 4.0833481718898386e-05, + "loss": 0.0211, + "step": 25210 + }, + { + "epoch": 0.18642263682327548, + "grad_norm": 0.08618687093257904, + "learning_rate": 4.082977207977208e-05, + "loss": 0.0184, + "step": 25220 + }, + { + "epoch": 0.18649655539457732, + "grad_norm": 0.09551467001438141, + "learning_rate": 4.082606244064577e-05, + "loss": 0.0197, + "step": 25230 + }, + { + "epoch": 0.1865704739658792, + "grad_norm": 0.09489475190639496, + "learning_rate": 4.082235280151947e-05, + "loss": 0.0209, + "step": 25240 + }, + { + "epoch": 0.18664439253718104, + "grad_norm": 0.09283564239740372, + "learning_rate": 4.0818643162393164e-05, + "loss": 0.0182, + "step": 25250 + }, + { + "epoch": 0.1867183111084829, + "grad_norm": 0.1071094423532486, + "learning_rate": 4.081493352326686e-05, + "loss": 0.0197, + "step": 25260 + }, + { + "epoch": 0.18679222967978476, + "grad_norm": 0.08947714418172836, + "learning_rate": 4.0811223884140555e-05, + "loss": 0.0193, + "step": 25270 + }, + { + "epoch": 0.1868661482510866, + "grad_norm": 0.14174768328666687, + "learning_rate": 4.0807514245014245e-05, + "loss": 0.0184, + "step": 25280 + }, + { + "epoch": 0.18694006682238845, + "grad_norm": 0.07919485867023468, + "learning_rate": 4.080380460588794e-05, + "loss": 0.0203, + "step": 25290 + }, + { + "epoch": 0.18701398539369032, + "grad_norm": 0.08494485914707184, + "learning_rate": 4.0800094966761637e-05, + "loss": 0.0222, + "step": 25300 + }, + { + "epoch": 0.18708790396499217, + "grad_norm": 0.0889492928981781, + "learning_rate": 4.0796385327635326e-05, + "loss": 0.0212, + "step": 25310 + }, + { + "epoch": 0.18716182253629401, + "grad_norm": 0.0972001850605011, + "learning_rate": 4.079267568850902e-05, + "loss": 0.0201, + "step": 25320 + }, + { + "epoch": 0.18723574110759586, + "grad_norm": 
0.1136302798986435, + "learning_rate": 4.078896604938272e-05, + "loss": 0.0206, + "step": 25330 + }, + { + "epoch": 0.18730965967889773, + "grad_norm": 0.10994186252355576, + "learning_rate": 4.0785256410256414e-05, + "loss": 0.0222, + "step": 25340 + }, + { + "epoch": 0.18738357825019958, + "grad_norm": 0.0715063214302063, + "learning_rate": 4.078154677113011e-05, + "loss": 0.0182, + "step": 25350 + }, + { + "epoch": 0.18745749682150142, + "grad_norm": 0.07392342388629913, + "learning_rate": 4.07778371320038e-05, + "loss": 0.0202, + "step": 25360 + }, + { + "epoch": 0.1875314153928033, + "grad_norm": 0.07772233337163925, + "learning_rate": 4.0774127492877495e-05, + "loss": 0.0194, + "step": 25370 + }, + { + "epoch": 0.18760533396410514, + "grad_norm": 0.10917063802480698, + "learning_rate": 4.077041785375119e-05, + "loss": 0.0194, + "step": 25380 + }, + { + "epoch": 0.187679252535407, + "grad_norm": 0.0789274200797081, + "learning_rate": 4.076670821462488e-05, + "loss": 0.022, + "step": 25390 + }, + { + "epoch": 0.18775317110670886, + "grad_norm": 0.06959021091461182, + "learning_rate": 4.0762998575498576e-05, + "loss": 0.0192, + "step": 25400 + }, + { + "epoch": 0.1878270896780107, + "grad_norm": 0.05860432609915733, + "learning_rate": 4.075928893637227e-05, + "loss": 0.0186, + "step": 25410 + }, + { + "epoch": 0.18790100824931255, + "grad_norm": 0.08827078342437744, + "learning_rate": 4.075557929724597e-05, + "loss": 0.0192, + "step": 25420 + }, + { + "epoch": 0.18797492682061442, + "grad_norm": 0.10155238956212997, + "learning_rate": 4.0751869658119664e-05, + "loss": 0.0208, + "step": 25430 + }, + { + "epoch": 0.18804884539191627, + "grad_norm": 0.07168347388505936, + "learning_rate": 4.074816001899335e-05, + "loss": 0.0183, + "step": 25440 + }, + { + "epoch": 0.18812276396321811, + "grad_norm": 0.08718912303447723, + "learning_rate": 4.074445037986705e-05, + "loss": 0.0223, + "step": 25450 + }, + { + "epoch": 0.18819668253451996, + "grad_norm": 0.08500777930021286, + "learning_rate": 4.074074074074074e-05, + "loss": 0.0181, + "step": 25460 + }, + { + "epoch": 0.18827060110582183, + "grad_norm": 0.08464238792657852, + "learning_rate": 4.0737031101614434e-05, + "loss": 0.0213, + "step": 25470 + }, + { + "epoch": 0.18834451967712368, + "grad_norm": 0.1232479065656662, + "learning_rate": 4.073332146248813e-05, + "loss": 0.0229, + "step": 25480 + }, + { + "epoch": 0.18841843824842552, + "grad_norm": 0.10290573537349701, + "learning_rate": 4.0729611823361826e-05, + "loss": 0.0228, + "step": 25490 + }, + { + "epoch": 0.1884923568197274, + "grad_norm": 0.12121661007404327, + "learning_rate": 4.072590218423552e-05, + "loss": 0.0221, + "step": 25500 + }, + { + "epoch": 0.18856627539102924, + "grad_norm": 0.08239974826574326, + "learning_rate": 4.072219254510921e-05, + "loss": 0.0184, + "step": 25510 + }, + { + "epoch": 0.1886401939623311, + "grad_norm": 0.09272027015686035, + "learning_rate": 4.071848290598291e-05, + "loss": 0.0201, + "step": 25520 + }, + { + "epoch": 0.18871411253363296, + "grad_norm": 0.13893020153045654, + "learning_rate": 4.07147732668566e-05, + "loss": 0.02, + "step": 25530 + }, + { + "epoch": 0.1887880311049348, + "grad_norm": 0.08534568548202515, + "learning_rate": 4.071106362773029e-05, + "loss": 0.0176, + "step": 25540 + }, + { + "epoch": 0.18886194967623665, + "grad_norm": 0.0794425904750824, + "learning_rate": 4.070735398860399e-05, + "loss": 0.0187, + "step": 25550 + }, + { + "epoch": 0.18893586824753852, + "grad_norm": 0.05672596022486687, + "learning_rate": 
4.0703644349477684e-05, + "loss": 0.0186, + "step": 25560 + }, + { + "epoch": 0.18900978681884037, + "grad_norm": 0.058385878801345825, + "learning_rate": 4.069993471035138e-05, + "loss": 0.0173, + "step": 25570 + }, + { + "epoch": 0.18908370539014221, + "grad_norm": 0.11499615013599396, + "learning_rate": 4.0696225071225076e-05, + "loss": 0.0227, + "step": 25580 + }, + { + "epoch": 0.18915762396144406, + "grad_norm": 0.11212538182735443, + "learning_rate": 4.0692515432098765e-05, + "loss": 0.0192, + "step": 25590 + }, + { + "epoch": 0.18923154253274593, + "grad_norm": 0.06475656479597092, + "learning_rate": 4.068880579297246e-05, + "loss": 0.0184, + "step": 25600 + }, + { + "epoch": 0.18930546110404778, + "grad_norm": 0.07098093628883362, + "learning_rate": 4.068509615384616e-05, + "loss": 0.0189, + "step": 25610 + }, + { + "epoch": 0.18937937967534962, + "grad_norm": 0.09972020238637924, + "learning_rate": 4.0681386514719847e-05, + "loss": 0.0198, + "step": 25620 + }, + { + "epoch": 0.1894532982466515, + "grad_norm": 0.08523591607809067, + "learning_rate": 4.067767687559354e-05, + "loss": 0.0208, + "step": 25630 + }, + { + "epoch": 0.18952721681795334, + "grad_norm": 0.08826702833175659, + "learning_rate": 4.067396723646724e-05, + "loss": 0.0172, + "step": 25640 + }, + { + "epoch": 0.1896011353892552, + "grad_norm": 0.10502415150403976, + "learning_rate": 4.0670257597340934e-05, + "loss": 0.0235, + "step": 25650 + }, + { + "epoch": 0.18967505396055706, + "grad_norm": 0.07767823338508606, + "learning_rate": 4.066654795821463e-05, + "loss": 0.0203, + "step": 25660 + }, + { + "epoch": 0.1897489725318589, + "grad_norm": 0.06232968717813492, + "learning_rate": 4.066283831908832e-05, + "loss": 0.0184, + "step": 25670 + }, + { + "epoch": 0.18982289110316075, + "grad_norm": 0.0691458061337471, + "learning_rate": 4.0659128679962016e-05, + "loss": 0.02, + "step": 25680 + }, + { + "epoch": 0.18989680967446262, + "grad_norm": 0.0777791365981102, + "learning_rate": 4.0655419040835705e-05, + "loss": 0.0186, + "step": 25690 + }, + { + "epoch": 0.18997072824576447, + "grad_norm": 0.09985006600618362, + "learning_rate": 4.06517094017094e-05, + "loss": 0.0213, + "step": 25700 + }, + { + "epoch": 0.19004464681706632, + "grad_norm": 0.06811359524726868, + "learning_rate": 4.06479997625831e-05, + "loss": 0.0188, + "step": 25710 + }, + { + "epoch": 0.19011856538836816, + "grad_norm": 0.07796566933393478, + "learning_rate": 4.064429012345679e-05, + "loss": 0.0207, + "step": 25720 + }, + { + "epoch": 0.19019248395967003, + "grad_norm": 0.08382702618837357, + "learning_rate": 4.064058048433049e-05, + "loss": 0.0163, + "step": 25730 + }, + { + "epoch": 0.19026640253097188, + "grad_norm": 0.07249416410923004, + "learning_rate": 4.063687084520418e-05, + "loss": 0.0184, + "step": 25740 + }, + { + "epoch": 0.19034032110227372, + "grad_norm": 0.08133135735988617, + "learning_rate": 4.0633161206077874e-05, + "loss": 0.0189, + "step": 25750 + }, + { + "epoch": 0.1904142396735756, + "grad_norm": 0.07951100915670395, + "learning_rate": 4.062945156695157e-05, + "loss": 0.02, + "step": 25760 + }, + { + "epoch": 0.19048815824487744, + "grad_norm": 0.08528363704681396, + "learning_rate": 4.062574192782526e-05, + "loss": 0.0177, + "step": 25770 + }, + { + "epoch": 0.1905620768161793, + "grad_norm": 0.08923252671957016, + "learning_rate": 4.0622032288698955e-05, + "loss": 0.0186, + "step": 25780 + }, + { + "epoch": 0.19063599538748116, + "grad_norm": 0.1294834166765213, + "learning_rate": 4.061832264957265e-05, + "loss": 
0.0193, + "step": 25790 + }, + { + "epoch": 0.190709913958783, + "grad_norm": 0.08585592359304428, + "learning_rate": 4.061461301044635e-05, + "loss": 0.0201, + "step": 25800 + }, + { + "epoch": 0.19078383253008485, + "grad_norm": 0.07966049760580063, + "learning_rate": 4.061090337132004e-05, + "loss": 0.0192, + "step": 25810 + }, + { + "epoch": 0.19085775110138672, + "grad_norm": 0.11067353934049606, + "learning_rate": 4.060719373219373e-05, + "loss": 0.0198, + "step": 25820 + }, + { + "epoch": 0.19093166967268857, + "grad_norm": 0.08094379305839539, + "learning_rate": 4.060348409306743e-05, + "loss": 0.0189, + "step": 25830 + }, + { + "epoch": 0.19100558824399042, + "grad_norm": 0.08187496662139893, + "learning_rate": 4.0599774453941124e-05, + "loss": 0.0181, + "step": 25840 + }, + { + "epoch": 0.19107950681529226, + "grad_norm": 0.10788208246231079, + "learning_rate": 4.059606481481481e-05, + "loss": 0.0211, + "step": 25850 + }, + { + "epoch": 0.19115342538659413, + "grad_norm": 0.05920582264661789, + "learning_rate": 4.059235517568851e-05, + "loss": 0.0198, + "step": 25860 + }, + { + "epoch": 0.19122734395789598, + "grad_norm": 0.08051064610481262, + "learning_rate": 4.0588645536562205e-05, + "loss": 0.0197, + "step": 25870 + }, + { + "epoch": 0.19130126252919782, + "grad_norm": 0.07535728067159653, + "learning_rate": 4.05849358974359e-05, + "loss": 0.0184, + "step": 25880 + }, + { + "epoch": 0.1913751811004997, + "grad_norm": 0.09129388630390167, + "learning_rate": 4.05812262583096e-05, + "loss": 0.0171, + "step": 25890 + }, + { + "epoch": 0.19144909967180154, + "grad_norm": 0.0707845389842987, + "learning_rate": 4.0577516619183286e-05, + "loss": 0.0212, + "step": 25900 + }, + { + "epoch": 0.1915230182431034, + "grad_norm": 0.10849178582429886, + "learning_rate": 4.057380698005698e-05, + "loss": 0.0205, + "step": 25910 + }, + { + "epoch": 0.19159693681440526, + "grad_norm": 0.07658477127552032, + "learning_rate": 4.057009734093067e-05, + "loss": 0.0188, + "step": 25920 + }, + { + "epoch": 0.1916708553857071, + "grad_norm": 0.08434335142374039, + "learning_rate": 4.056638770180437e-05, + "loss": 0.0201, + "step": 25930 + }, + { + "epoch": 0.19174477395700895, + "grad_norm": 0.07062356173992157, + "learning_rate": 4.056267806267807e-05, + "loss": 0.0196, + "step": 25940 + }, + { + "epoch": 0.19181869252831082, + "grad_norm": 0.12351708859205246, + "learning_rate": 4.055896842355176e-05, + "loss": 0.0204, + "step": 25950 + }, + { + "epoch": 0.19189261109961267, + "grad_norm": 0.08979147672653198, + "learning_rate": 4.0555258784425455e-05, + "loss": 0.0175, + "step": 25960 + }, + { + "epoch": 0.19196652967091452, + "grad_norm": 0.10261064767837524, + "learning_rate": 4.0551549145299144e-05, + "loss": 0.0209, + "step": 25970 + }, + { + "epoch": 0.1920404482422164, + "grad_norm": 0.09259028732776642, + "learning_rate": 4.054783950617284e-05, + "loss": 0.0203, + "step": 25980 + }, + { + "epoch": 0.19211436681351823, + "grad_norm": 0.08469455689191818, + "learning_rate": 4.0544129867046536e-05, + "loss": 0.0197, + "step": 25990 + }, + { + "epoch": 0.19218828538482008, + "grad_norm": 0.07711025327444077, + "learning_rate": 4.0540420227920226e-05, + "loss": 0.0178, + "step": 26000 + }, + { + "epoch": 0.19226220395612192, + "grad_norm": 0.07707731425762177, + "learning_rate": 4.053671058879392e-05, + "loss": 0.0204, + "step": 26010 + }, + { + "epoch": 0.1923361225274238, + "grad_norm": 0.08284084498882294, + "learning_rate": 4.053300094966762e-05, + "loss": 0.0196, + "step": 26020 + }, + { + 
"epoch": 0.19241004109872564, + "grad_norm": 0.10776533931493759, + "learning_rate": 4.0529291310541313e-05, + "loss": 0.0195, + "step": 26030 + }, + { + "epoch": 0.1924839596700275, + "grad_norm": 0.08582749962806702, + "learning_rate": 4.052558167141501e-05, + "loss": 0.019, + "step": 26040 + }, + { + "epoch": 0.19255787824132936, + "grad_norm": 0.0954875648021698, + "learning_rate": 4.05218720322887e-05, + "loss": 0.0198, + "step": 26050 + }, + { + "epoch": 0.1926317968126312, + "grad_norm": 0.10585756599903107, + "learning_rate": 4.0518162393162395e-05, + "loss": 0.0183, + "step": 26060 + }, + { + "epoch": 0.19270571538393305, + "grad_norm": 0.15620584785938263, + "learning_rate": 4.051445275403609e-05, + "loss": 0.0203, + "step": 26070 + }, + { + "epoch": 0.19277963395523492, + "grad_norm": 0.07597756385803223, + "learning_rate": 4.051074311490978e-05, + "loss": 0.0202, + "step": 26080 + }, + { + "epoch": 0.19285355252653677, + "grad_norm": 0.10610008984804153, + "learning_rate": 4.050703347578348e-05, + "loss": 0.0208, + "step": 26090 + }, + { + "epoch": 0.19292747109783862, + "grad_norm": 0.08419201523065567, + "learning_rate": 4.050332383665717e-05, + "loss": 0.0197, + "step": 26100 + }, + { + "epoch": 0.1930013896691405, + "grad_norm": 0.10455375909805298, + "learning_rate": 4.049961419753087e-05, + "loss": 0.0208, + "step": 26110 + }, + { + "epoch": 0.19307530824044233, + "grad_norm": 0.09646882116794586, + "learning_rate": 4.0495904558404564e-05, + "loss": 0.0216, + "step": 26120 + }, + { + "epoch": 0.19314922681174418, + "grad_norm": 0.09530425816774368, + "learning_rate": 4.049219491927825e-05, + "loss": 0.0195, + "step": 26130 + }, + { + "epoch": 0.19322314538304602, + "grad_norm": 0.08805854618549347, + "learning_rate": 4.048848528015195e-05, + "loss": 0.0192, + "step": 26140 + }, + { + "epoch": 0.1932970639543479, + "grad_norm": 0.06926032900810242, + "learning_rate": 4.048477564102564e-05, + "loss": 0.02, + "step": 26150 + }, + { + "epoch": 0.19337098252564974, + "grad_norm": 0.09659983217716217, + "learning_rate": 4.0481066001899334e-05, + "loss": 0.0202, + "step": 26160 + }, + { + "epoch": 0.1934449010969516, + "grad_norm": 0.07807913422584534, + "learning_rate": 4.047735636277304e-05, + "loss": 0.0224, + "step": 26170 + }, + { + "epoch": 0.19351881966825346, + "grad_norm": 0.07716178148984909, + "learning_rate": 4.0473646723646726e-05, + "loss": 0.0192, + "step": 26180 + }, + { + "epoch": 0.1935927382395553, + "grad_norm": 0.12032181769609451, + "learning_rate": 4.046993708452042e-05, + "loss": 0.0217, + "step": 26190 + }, + { + "epoch": 0.19366665681085715, + "grad_norm": 0.09849361330270767, + "learning_rate": 4.046622744539411e-05, + "loss": 0.0202, + "step": 26200 + }, + { + "epoch": 0.19374057538215902, + "grad_norm": 0.07785578072071075, + "learning_rate": 4.046251780626781e-05, + "loss": 0.017, + "step": 26210 + }, + { + "epoch": 0.19381449395346087, + "grad_norm": 0.09973660856485367, + "learning_rate": 4.04588081671415e-05, + "loss": 0.0199, + "step": 26220 + }, + { + "epoch": 0.19388841252476272, + "grad_norm": 0.10502509772777557, + "learning_rate": 4.045509852801519e-05, + "loss": 0.0194, + "step": 26230 + }, + { + "epoch": 0.1939623310960646, + "grad_norm": 0.08962590247392654, + "learning_rate": 4.045138888888889e-05, + "loss": 0.0204, + "step": 26240 + }, + { + "epoch": 0.19403624966736643, + "grad_norm": 0.0918722152709961, + "learning_rate": 4.0447679249762584e-05, + "loss": 0.0215, + "step": 26250 + }, + { + "epoch": 0.19411016823866828, + "grad_norm": 
0.07433968037366867, + "learning_rate": 4.044396961063628e-05, + "loss": 0.0182, + "step": 26260 + }, + { + "epoch": 0.19418408680997012, + "grad_norm": 0.12824876606464386, + "learning_rate": 4.0440259971509976e-05, + "loss": 0.0191, + "step": 26270 + }, + { + "epoch": 0.194258005381272, + "grad_norm": 0.0706171989440918, + "learning_rate": 4.0436550332383665e-05, + "loss": 0.0177, + "step": 26280 + }, + { + "epoch": 0.19433192395257384, + "grad_norm": 0.08861715346574783, + "learning_rate": 4.043284069325736e-05, + "loss": 0.018, + "step": 26290 + }, + { + "epoch": 0.1944058425238757, + "grad_norm": 0.09288990497589111, + "learning_rate": 4.042913105413106e-05, + "loss": 0.0201, + "step": 26300 + }, + { + "epoch": 0.19447976109517756, + "grad_norm": 0.09657161682844162, + "learning_rate": 4.0425421415004746e-05, + "loss": 0.0184, + "step": 26310 + }, + { + "epoch": 0.1945536796664794, + "grad_norm": 0.12205624580383301, + "learning_rate": 4.042171177587845e-05, + "loss": 0.0209, + "step": 26320 + }, + { + "epoch": 0.19462759823778125, + "grad_norm": 0.07949160784482956, + "learning_rate": 4.041800213675214e-05, + "loss": 0.0172, + "step": 26330 + }, + { + "epoch": 0.19470151680908312, + "grad_norm": 0.0787765309214592, + "learning_rate": 4.0414292497625834e-05, + "loss": 0.0204, + "step": 26340 + }, + { + "epoch": 0.19477543538038497, + "grad_norm": 0.12772636115550995, + "learning_rate": 4.041058285849953e-05, + "loss": 0.0214, + "step": 26350 + }, + { + "epoch": 0.19484935395168682, + "grad_norm": 0.09125122427940369, + "learning_rate": 4.040687321937322e-05, + "loss": 0.019, + "step": 26360 + }, + { + "epoch": 0.1949232725229887, + "grad_norm": 0.10375119000673294, + "learning_rate": 4.0403163580246915e-05, + "loss": 0.02, + "step": 26370 + }, + { + "epoch": 0.19499719109429053, + "grad_norm": 0.07330422848463058, + "learning_rate": 4.0399453941120605e-05, + "loss": 0.0192, + "step": 26380 + }, + { + "epoch": 0.19507110966559238, + "grad_norm": 0.10417810082435608, + "learning_rate": 4.03957443019943e-05, + "loss": 0.0201, + "step": 26390 + }, + { + "epoch": 0.19514502823689422, + "grad_norm": 0.0886123776435852, + "learning_rate": 4.0392034662868e-05, + "loss": 0.0184, + "step": 26400 + }, + { + "epoch": 0.1952189468081961, + "grad_norm": 0.07089180499315262, + "learning_rate": 4.038832502374169e-05, + "loss": 0.0165, + "step": 26410 + }, + { + "epoch": 0.19529286537949794, + "grad_norm": 0.09441141784191132, + "learning_rate": 4.038461538461539e-05, + "loss": 0.0198, + "step": 26420 + }, + { + "epoch": 0.1953667839507998, + "grad_norm": 0.08422481268644333, + "learning_rate": 4.038090574548908e-05, + "loss": 0.0177, + "step": 26430 + }, + { + "epoch": 0.19544070252210166, + "grad_norm": 0.10501968115568161, + "learning_rate": 4.0377196106362774e-05, + "loss": 0.0195, + "step": 26440 + }, + { + "epoch": 0.1955146210934035, + "grad_norm": 0.0918261930346489, + "learning_rate": 4.037348646723647e-05, + "loss": 0.0205, + "step": 26450 + }, + { + "epoch": 0.19558853966470535, + "grad_norm": 0.09516014903783798, + "learning_rate": 4.036977682811016e-05, + "loss": 0.0204, + "step": 26460 + }, + { + "epoch": 0.19566245823600723, + "grad_norm": 0.10102162510156631, + "learning_rate": 4.036606718898386e-05, + "loss": 0.0219, + "step": 26470 + }, + { + "epoch": 0.19573637680730907, + "grad_norm": 0.09033115208148956, + "learning_rate": 4.036235754985755e-05, + "loss": 0.0217, + "step": 26480 + }, + { + "epoch": 0.19581029537861092, + "grad_norm": 0.10341212898492813, + "learning_rate": 
4.0358647910731247e-05, + "loss": 0.0176, + "step": 26490 + }, + { + "epoch": 0.1958842139499128, + "grad_norm": 0.09452036023139954, + "learning_rate": 4.035493827160494e-05, + "loss": 0.0206, + "step": 26500 + }, + { + "epoch": 0.19595813252121463, + "grad_norm": 0.06660100072622299, + "learning_rate": 4.035122863247863e-05, + "loss": 0.0165, + "step": 26510 + }, + { + "epoch": 0.19603205109251648, + "grad_norm": 0.0918986052274704, + "learning_rate": 4.034751899335233e-05, + "loss": 0.0213, + "step": 26520 + }, + { + "epoch": 0.19610596966381832, + "grad_norm": 0.09241180866956711, + "learning_rate": 4.0343809354226024e-05, + "loss": 0.0167, + "step": 26530 + }, + { + "epoch": 0.1961798882351202, + "grad_norm": 0.10144799202680588, + "learning_rate": 4.034009971509971e-05, + "loss": 0.0204, + "step": 26540 + }, + { + "epoch": 0.19625380680642204, + "grad_norm": 0.10204997658729553, + "learning_rate": 4.0336390075973416e-05, + "loss": 0.0201, + "step": 26550 + }, + { + "epoch": 0.1963277253777239, + "grad_norm": 0.08747974038124084, + "learning_rate": 4.0332680436847105e-05, + "loss": 0.0202, + "step": 26560 + }, + { + "epoch": 0.19640164394902576, + "grad_norm": 0.0816131979227066, + "learning_rate": 4.03289707977208e-05, + "loss": 0.0188, + "step": 26570 + }, + { + "epoch": 0.1964755625203276, + "grad_norm": 0.07453221082687378, + "learning_rate": 4.03252611585945e-05, + "loss": 0.0185, + "step": 26580 + }, + { + "epoch": 0.19654948109162945, + "grad_norm": 0.1012062057852745, + "learning_rate": 4.0321551519468186e-05, + "loss": 0.0197, + "step": 26590 + }, + { + "epoch": 0.19662339966293133, + "grad_norm": 0.07416244596242905, + "learning_rate": 4.031784188034188e-05, + "loss": 0.0168, + "step": 26600 + }, + { + "epoch": 0.19669731823423317, + "grad_norm": 0.06143144145607948, + "learning_rate": 4.031413224121557e-05, + "loss": 0.0188, + "step": 26610 + }, + { + "epoch": 0.19677123680553502, + "grad_norm": 0.13770316541194916, + "learning_rate": 4.0310422602089274e-05, + "loss": 0.0215, + "step": 26620 + }, + { + "epoch": 0.1968451553768369, + "grad_norm": 0.11620379984378815, + "learning_rate": 4.030671296296297e-05, + "loss": 0.0192, + "step": 26630 + }, + { + "epoch": 0.19691907394813873, + "grad_norm": 0.15548306703567505, + "learning_rate": 4.030300332383666e-05, + "loss": 0.0207, + "step": 26640 + }, + { + "epoch": 0.19699299251944058, + "grad_norm": 0.08808482438325882, + "learning_rate": 4.0299293684710355e-05, + "loss": 0.0182, + "step": 26650 + }, + { + "epoch": 0.19706691109074242, + "grad_norm": 0.08999871462583542, + "learning_rate": 4.0295584045584044e-05, + "loss": 0.0179, + "step": 26660 + }, + { + "epoch": 0.1971408296620443, + "grad_norm": 0.08707962930202484, + "learning_rate": 4.029187440645774e-05, + "loss": 0.0179, + "step": 26670 + }, + { + "epoch": 0.19721474823334614, + "grad_norm": 0.08106222748756409, + "learning_rate": 4.0288164767331436e-05, + "loss": 0.0187, + "step": 26680 + }, + { + "epoch": 0.197288666804648, + "grad_norm": 0.09482986479997635, + "learning_rate": 4.0284455128205125e-05, + "loss": 0.0209, + "step": 26690 + }, + { + "epoch": 0.19736258537594986, + "grad_norm": 0.10451532155275345, + "learning_rate": 4.028074548907883e-05, + "loss": 0.0213, + "step": 26700 + }, + { + "epoch": 0.1974365039472517, + "grad_norm": 0.0810663029551506, + "learning_rate": 4.027703584995252e-05, + "loss": 0.0186, + "step": 26710 + }, + { + "epoch": 0.19751042251855355, + "grad_norm": 0.046732064336538315, + "learning_rate": 4.027332621082621e-05, + "loss": 
0.0181, + "step": 26720 + }, + { + "epoch": 0.19758434108985543, + "grad_norm": 0.11813398450613022, + "learning_rate": 4.026961657169991e-05, + "loss": 0.0219, + "step": 26730 + }, + { + "epoch": 0.19765825966115727, + "grad_norm": 0.12210382521152496, + "learning_rate": 4.02659069325736e-05, + "loss": 0.0215, + "step": 26740 + }, + { + "epoch": 0.19773217823245912, + "grad_norm": 0.0800352618098259, + "learning_rate": 4.0262197293447294e-05, + "loss": 0.0187, + "step": 26750 + }, + { + "epoch": 0.197806096803761, + "grad_norm": 0.09820833802223206, + "learning_rate": 4.025848765432099e-05, + "loss": 0.0177, + "step": 26760 + }, + { + "epoch": 0.19788001537506283, + "grad_norm": 0.09732136875391006, + "learning_rate": 4.0254778015194686e-05, + "loss": 0.0202, + "step": 26770 + }, + { + "epoch": 0.19795393394636468, + "grad_norm": 0.09171831607818604, + "learning_rate": 4.025106837606838e-05, + "loss": 0.019, + "step": 26780 + }, + { + "epoch": 0.19802785251766652, + "grad_norm": 0.1001991331577301, + "learning_rate": 4.024735873694207e-05, + "loss": 0.0202, + "step": 26790 + }, + { + "epoch": 0.1981017710889684, + "grad_norm": 0.08683771640062332, + "learning_rate": 4.024364909781577e-05, + "loss": 0.0211, + "step": 26800 + }, + { + "epoch": 0.19817568966027024, + "grad_norm": 0.11620105803012848, + "learning_rate": 4.023993945868946e-05, + "loss": 0.0212, + "step": 26810 + }, + { + "epoch": 0.1982496082315721, + "grad_norm": 0.0925314873456955, + "learning_rate": 4.023622981956315e-05, + "loss": 0.0177, + "step": 26820 + }, + { + "epoch": 0.19832352680287396, + "grad_norm": 0.08605986833572388, + "learning_rate": 4.023252018043685e-05, + "loss": 0.0205, + "step": 26830 + }, + { + "epoch": 0.1983974453741758, + "grad_norm": 0.10294154286384583, + "learning_rate": 4.022881054131054e-05, + "loss": 0.0202, + "step": 26840 + }, + { + "epoch": 0.19847136394547765, + "grad_norm": 0.08033913373947144, + "learning_rate": 4.022510090218424e-05, + "loss": 0.0187, + "step": 26850 + }, + { + "epoch": 0.19854528251677953, + "grad_norm": 0.07643717527389526, + "learning_rate": 4.0221391263057936e-05, + "loss": 0.0181, + "step": 26860 + }, + { + "epoch": 0.19861920108808137, + "grad_norm": 0.1059470847249031, + "learning_rate": 4.0217681623931626e-05, + "loss": 0.0196, + "step": 26870 + }, + { + "epoch": 0.19869311965938322, + "grad_norm": 0.08550570160150528, + "learning_rate": 4.021397198480532e-05, + "loss": 0.0182, + "step": 26880 + }, + { + "epoch": 0.1987670382306851, + "grad_norm": 0.07760634273290634, + "learning_rate": 4.021026234567901e-05, + "loss": 0.0194, + "step": 26890 + }, + { + "epoch": 0.19884095680198693, + "grad_norm": 0.07543564587831497, + "learning_rate": 4.020655270655271e-05, + "loss": 0.0207, + "step": 26900 + }, + { + "epoch": 0.19891487537328878, + "grad_norm": 0.08121582120656967, + "learning_rate": 4.02028430674264e-05, + "loss": 0.0188, + "step": 26910 + }, + { + "epoch": 0.19898879394459063, + "grad_norm": 0.08908379077911377, + "learning_rate": 4.01991334283001e-05, + "loss": 0.0197, + "step": 26920 + }, + { + "epoch": 0.1990627125158925, + "grad_norm": 0.11445832997560501, + "learning_rate": 4.0195423789173795e-05, + "loss": 0.0222, + "step": 26930 + }, + { + "epoch": 0.19913663108719434, + "grad_norm": 0.061530083417892456, + "learning_rate": 4.019171415004749e-05, + "loss": 0.02, + "step": 26940 + }, + { + "epoch": 0.1992105496584962, + "grad_norm": 0.09355339407920837, + "learning_rate": 4.018800451092118e-05, + "loss": 0.0193, + "step": 26950 + }, + { + "epoch": 
0.19928446822979806, + "grad_norm": 0.11692371964454651, + "learning_rate": 4.0184294871794876e-05, + "loss": 0.0222, + "step": 26960 + }, + { + "epoch": 0.1993583868010999, + "grad_norm": 0.0950143113732338, + "learning_rate": 4.0180585232668565e-05, + "loss": 0.0209, + "step": 26970 + }, + { + "epoch": 0.19943230537240175, + "grad_norm": 0.08873429894447327, + "learning_rate": 4.017687559354226e-05, + "loss": 0.0187, + "step": 26980 + }, + { + "epoch": 0.19950622394370363, + "grad_norm": 0.08066698908805847, + "learning_rate": 4.017316595441596e-05, + "loss": 0.0196, + "step": 26990 + }, + { + "epoch": 0.19958014251500547, + "grad_norm": 0.10435789823532104, + "learning_rate": 4.016945631528965e-05, + "loss": 0.021, + "step": 27000 + }, + { + "epoch": 0.19965406108630732, + "grad_norm": 0.11480347812175751, + "learning_rate": 4.016574667616335e-05, + "loss": 0.0173, + "step": 27010 + }, + { + "epoch": 0.1997279796576092, + "grad_norm": 0.11956532299518585, + "learning_rate": 4.016203703703704e-05, + "loss": 0.0202, + "step": 27020 + }, + { + "epoch": 0.19980189822891103, + "grad_norm": 0.10095667093992233, + "learning_rate": 4.0158327397910734e-05, + "loss": 0.0185, + "step": 27030 + }, + { + "epoch": 0.19987581680021288, + "grad_norm": 0.08467351645231247, + "learning_rate": 4.015461775878443e-05, + "loss": 0.0209, + "step": 27040 + }, + { + "epoch": 0.19994973537151475, + "grad_norm": 0.07851988077163696, + "learning_rate": 4.015090811965812e-05, + "loss": 0.0223, + "step": 27050 + }, + { + "epoch": 0.2000236539428166, + "grad_norm": 0.08040213584899902, + "learning_rate": 4.0147198480531815e-05, + "loss": 0.0214, + "step": 27060 + }, + { + "epoch": 0.20009757251411844, + "grad_norm": 0.09008269757032394, + "learning_rate": 4.014348884140551e-05, + "loss": 0.0198, + "step": 27070 + }, + { + "epoch": 0.2001714910854203, + "grad_norm": 0.07157569378614426, + "learning_rate": 4.013977920227921e-05, + "loss": 0.0194, + "step": 27080 + }, + { + "epoch": 0.20024540965672216, + "grad_norm": 0.06881707161664963, + "learning_rate": 4.01360695631529e-05, + "loss": 0.0188, + "step": 27090 + }, + { + "epoch": 0.200319328228024, + "grad_norm": 0.06796582788228989, + "learning_rate": 4.013235992402659e-05, + "loss": 0.0188, + "step": 27100 + }, + { + "epoch": 0.20039324679932585, + "grad_norm": 0.09799590706825256, + "learning_rate": 4.012865028490029e-05, + "loss": 0.0185, + "step": 27110 + }, + { + "epoch": 0.20046716537062773, + "grad_norm": 0.08818831294775009, + "learning_rate": 4.012494064577398e-05, + "loss": 0.0174, + "step": 27120 + }, + { + "epoch": 0.20054108394192957, + "grad_norm": 0.08110783249139786, + "learning_rate": 4.012123100664767e-05, + "loss": 0.0192, + "step": 27130 + }, + { + "epoch": 0.20061500251323142, + "grad_norm": 0.09459230303764343, + "learning_rate": 4.011752136752137e-05, + "loss": 0.0214, + "step": 27140 + }, + { + "epoch": 0.2006889210845333, + "grad_norm": 0.09958945959806442, + "learning_rate": 4.0113811728395065e-05, + "loss": 0.02, + "step": 27150 + }, + { + "epoch": 0.20076283965583513, + "grad_norm": 0.11538317054510117, + "learning_rate": 4.011010208926876e-05, + "loss": 0.0205, + "step": 27160 + }, + { + "epoch": 0.20083675822713698, + "grad_norm": 0.0951877310872078, + "learning_rate": 4.010639245014246e-05, + "loss": 0.0204, + "step": 27170 + }, + { + "epoch": 0.20091067679843885, + "grad_norm": 0.09305461496114731, + "learning_rate": 4.0102682811016146e-05, + "loss": 0.0202, + "step": 27180 + }, + { + "epoch": 0.2009845953697407, + "grad_norm": 
0.06460881233215332, + "learning_rate": 4.009897317188984e-05, + "loss": 0.0192, + "step": 27190 + }, + { + "epoch": 0.20105851394104254, + "grad_norm": 0.0849134549498558, + "learning_rate": 4.009526353276353e-05, + "loss": 0.0191, + "step": 27200 + }, + { + "epoch": 0.2011324325123444, + "grad_norm": 0.07277417182922363, + "learning_rate": 4.009155389363723e-05, + "loss": 0.0192, + "step": 27210 + }, + { + "epoch": 0.20120635108364626, + "grad_norm": 0.09616146236658096, + "learning_rate": 4.0087844254510923e-05, + "loss": 0.0194, + "step": 27220 + }, + { + "epoch": 0.2012802696549481, + "grad_norm": 0.10256221145391464, + "learning_rate": 4.008413461538462e-05, + "loss": 0.0196, + "step": 27230 + }, + { + "epoch": 0.20135418822624995, + "grad_norm": 0.05856601148843765, + "learning_rate": 4.0080424976258315e-05, + "loss": 0.0186, + "step": 27240 + }, + { + "epoch": 0.20142810679755183, + "grad_norm": 0.10383433103561401, + "learning_rate": 4.0076715337132005e-05, + "loss": 0.0215, + "step": 27250 + }, + { + "epoch": 0.20150202536885367, + "grad_norm": 0.09276574105024338, + "learning_rate": 4.00730056980057e-05, + "loss": 0.0191, + "step": 27260 + }, + { + "epoch": 0.20157594394015552, + "grad_norm": 0.07574693858623505, + "learning_rate": 4.0069296058879396e-05, + "loss": 0.0186, + "step": 27270 + }, + { + "epoch": 0.2016498625114574, + "grad_norm": 0.07975997775793076, + "learning_rate": 4.0065586419753086e-05, + "loss": 0.0236, + "step": 27280 + }, + { + "epoch": 0.20172378108275923, + "grad_norm": 0.07284721732139587, + "learning_rate": 4.006187678062678e-05, + "loss": 0.023, + "step": 27290 + }, + { + "epoch": 0.20179769965406108, + "grad_norm": 0.07264596223831177, + "learning_rate": 4.005816714150048e-05, + "loss": 0.0156, + "step": 27300 + }, + { + "epoch": 0.20187161822536295, + "grad_norm": 0.10306177288293839, + "learning_rate": 4.0054457502374174e-05, + "loss": 0.0218, + "step": 27310 + }, + { + "epoch": 0.2019455367966648, + "grad_norm": 0.10815753042697906, + "learning_rate": 4.005074786324787e-05, + "loss": 0.0196, + "step": 27320 + }, + { + "epoch": 0.20201945536796664, + "grad_norm": 0.10363900661468506, + "learning_rate": 4.004703822412156e-05, + "loss": 0.0163, + "step": 27330 + }, + { + "epoch": 0.2020933739392685, + "grad_norm": 0.09635012596845627, + "learning_rate": 4.0043328584995255e-05, + "loss": 0.0209, + "step": 27340 + }, + { + "epoch": 0.20216729251057036, + "grad_norm": 0.08603912591934204, + "learning_rate": 4.0039618945868944e-05, + "loss": 0.0186, + "step": 27350 + }, + { + "epoch": 0.2022412110818722, + "grad_norm": 0.10809623450040817, + "learning_rate": 4.003590930674264e-05, + "loss": 0.0203, + "step": 27360 + }, + { + "epoch": 0.20231512965317405, + "grad_norm": 0.08217758685350418, + "learning_rate": 4.0032199667616336e-05, + "loss": 0.0231, + "step": 27370 + }, + { + "epoch": 0.20238904822447593, + "grad_norm": 0.07781322300434113, + "learning_rate": 4.002849002849003e-05, + "loss": 0.0184, + "step": 27380 + }, + { + "epoch": 0.20246296679577777, + "grad_norm": 0.09918926656246185, + "learning_rate": 4.002478038936373e-05, + "loss": 0.0227, + "step": 27390 + }, + { + "epoch": 0.20253688536707962, + "grad_norm": 0.07839609682559967, + "learning_rate": 4.0021070750237424e-05, + "loss": 0.0187, + "step": 27400 + }, + { + "epoch": 0.2026108039383815, + "grad_norm": 0.0917617529630661, + "learning_rate": 4.001736111111111e-05, + "loss": 0.0175, + "step": 27410 + }, + { + "epoch": 0.20268472250968333, + "grad_norm": 0.09276415407657623, + 
"learning_rate": 4.001365147198481e-05, + "loss": 0.0226, + "step": 27420 + }, + { + "epoch": 0.20275864108098518, + "grad_norm": 0.09192842245101929, + "learning_rate": 4.00099418328585e-05, + "loss": 0.0175, + "step": 27430 + }, + { + "epoch": 0.20283255965228705, + "grad_norm": 0.09252292662858963, + "learning_rate": 4.0006232193732194e-05, + "loss": 0.0201, + "step": 27440 + }, + { + "epoch": 0.2029064782235889, + "grad_norm": 0.11140212416648865, + "learning_rate": 4.000252255460589e-05, + "loss": 0.0202, + "step": 27450 + }, + { + "epoch": 0.20298039679489074, + "grad_norm": 0.10308962315320969, + "learning_rate": 3.9998812915479586e-05, + "loss": 0.0199, + "step": 27460 + }, + { + "epoch": 0.2030543153661926, + "grad_norm": 0.09824221581220627, + "learning_rate": 3.999510327635328e-05, + "loss": 0.0216, + "step": 27470 + }, + { + "epoch": 0.20312823393749446, + "grad_norm": 0.09012199193239212, + "learning_rate": 3.999139363722697e-05, + "loss": 0.0176, + "step": 27480 + }, + { + "epoch": 0.2032021525087963, + "grad_norm": 0.10927720367908478, + "learning_rate": 3.998768399810067e-05, + "loss": 0.0205, + "step": 27490 + }, + { + "epoch": 0.20327607108009815, + "grad_norm": 0.07126221060752869, + "learning_rate": 3.998397435897436e-05, + "loss": 0.0221, + "step": 27500 + }, + { + "epoch": 0.20334998965140003, + "grad_norm": 0.07685278356075287, + "learning_rate": 3.998026471984805e-05, + "loss": 0.019, + "step": 27510 + }, + { + "epoch": 0.20342390822270187, + "grad_norm": 0.080895334482193, + "learning_rate": 3.997655508072175e-05, + "loss": 0.0188, + "step": 27520 + }, + { + "epoch": 0.20349782679400372, + "grad_norm": 0.09567815065383911, + "learning_rate": 3.9972845441595444e-05, + "loss": 0.019, + "step": 27530 + }, + { + "epoch": 0.2035717453653056, + "grad_norm": 0.056576136499643326, + "learning_rate": 3.996913580246914e-05, + "loss": 0.0175, + "step": 27540 + }, + { + "epoch": 0.20364566393660744, + "grad_norm": 0.12664413452148438, + "learning_rate": 3.9965426163342836e-05, + "loss": 0.0183, + "step": 27550 + }, + { + "epoch": 0.20371958250790928, + "grad_norm": 0.1033640056848526, + "learning_rate": 3.9961716524216525e-05, + "loss": 0.0184, + "step": 27560 + }, + { + "epoch": 0.20379350107921115, + "grad_norm": 0.071520134806633, + "learning_rate": 3.995800688509022e-05, + "loss": 0.0198, + "step": 27570 + }, + { + "epoch": 0.203867419650513, + "grad_norm": 0.08319620788097382, + "learning_rate": 3.995429724596391e-05, + "loss": 0.0198, + "step": 27580 + }, + { + "epoch": 0.20394133822181484, + "grad_norm": 0.07659216970205307, + "learning_rate": 3.9950587606837606e-05, + "loss": 0.018, + "step": 27590 + }, + { + "epoch": 0.2040152567931167, + "grad_norm": 0.08090242743492126, + "learning_rate": 3.99468779677113e-05, + "loss": 0.0211, + "step": 27600 + }, + { + "epoch": 0.20408917536441856, + "grad_norm": 0.09213200211524963, + "learning_rate": 3.9943168328585e-05, + "loss": 0.0186, + "step": 27610 + }, + { + "epoch": 0.2041630939357204, + "grad_norm": 0.08669887483119965, + "learning_rate": 3.9939458689458694e-05, + "loss": 0.0202, + "step": 27620 + }, + { + "epoch": 0.20423701250702225, + "grad_norm": 0.07079674303531647, + "learning_rate": 3.993574905033239e-05, + "loss": 0.0168, + "step": 27630 + }, + { + "epoch": 0.20431093107832413, + "grad_norm": 0.07631437480449677, + "learning_rate": 3.993203941120608e-05, + "loss": 0.0207, + "step": 27640 + }, + { + "epoch": 0.20438484964962597, + "grad_norm": 0.0871163159608841, + "learning_rate": 3.9928329772079775e-05, + 
"loss": 0.0204, + "step": 27650 + }, + { + "epoch": 0.20445876822092782, + "grad_norm": 0.08351099491119385, + "learning_rate": 3.9924620132953465e-05, + "loss": 0.0201, + "step": 27660 + }, + { + "epoch": 0.2045326867922297, + "grad_norm": 0.09270329028367996, + "learning_rate": 3.992091049382716e-05, + "loss": 0.0189, + "step": 27670 + }, + { + "epoch": 0.20460660536353154, + "grad_norm": 0.10540712624788284, + "learning_rate": 3.9917200854700857e-05, + "loss": 0.018, + "step": 27680 + }, + { + "epoch": 0.20468052393483338, + "grad_norm": 0.07462663948535919, + "learning_rate": 3.991349121557455e-05, + "loss": 0.0186, + "step": 27690 + }, + { + "epoch": 0.20475444250613525, + "grad_norm": 0.07764612138271332, + "learning_rate": 3.990978157644825e-05, + "loss": 0.0175, + "step": 27700 + }, + { + "epoch": 0.2048283610774371, + "grad_norm": 0.10635413974523544, + "learning_rate": 3.990607193732194e-05, + "loss": 0.0218, + "step": 27710 + }, + { + "epoch": 0.20490227964873894, + "grad_norm": 0.11886221915483475, + "learning_rate": 3.9902362298195634e-05, + "loss": 0.0173, + "step": 27720 + }, + { + "epoch": 0.2049761982200408, + "grad_norm": 0.07569146901369095, + "learning_rate": 3.989865265906933e-05, + "loss": 0.02, + "step": 27730 + }, + { + "epoch": 0.20505011679134266, + "grad_norm": 0.08334322273731232, + "learning_rate": 3.989494301994302e-05, + "loss": 0.0202, + "step": 27740 + }, + { + "epoch": 0.2051240353626445, + "grad_norm": 0.11985990405082703, + "learning_rate": 3.9891233380816715e-05, + "loss": 0.0216, + "step": 27750 + }, + { + "epoch": 0.20519795393394635, + "grad_norm": 0.096512570977211, + "learning_rate": 3.988752374169041e-05, + "loss": 0.0204, + "step": 27760 + }, + { + "epoch": 0.20527187250524823, + "grad_norm": 0.07972533255815506, + "learning_rate": 3.988381410256411e-05, + "loss": 0.0192, + "step": 27770 + }, + { + "epoch": 0.20534579107655007, + "grad_norm": 0.09564422070980072, + "learning_rate": 3.98801044634378e-05, + "loss": 0.02, + "step": 27780 + }, + { + "epoch": 0.20541970964785192, + "grad_norm": 0.08355620503425598, + "learning_rate": 3.987639482431149e-05, + "loss": 0.0207, + "step": 27790 + }, + { + "epoch": 0.2054936282191538, + "grad_norm": 0.10648877918720245, + "learning_rate": 3.987268518518519e-05, + "loss": 0.0181, + "step": 27800 + }, + { + "epoch": 0.20556754679045564, + "grad_norm": 0.06632612645626068, + "learning_rate": 3.986897554605888e-05, + "loss": 0.0209, + "step": 27810 + }, + { + "epoch": 0.20564146536175748, + "grad_norm": 0.09305301308631897, + "learning_rate": 3.986526590693257e-05, + "loss": 0.0164, + "step": 27820 + }, + { + "epoch": 0.20571538393305935, + "grad_norm": 0.07734738290309906, + "learning_rate": 3.986155626780627e-05, + "loss": 0.0185, + "step": 27830 + }, + { + "epoch": 0.2057893025043612, + "grad_norm": 0.07458245009183884, + "learning_rate": 3.9857846628679965e-05, + "loss": 0.0175, + "step": 27840 + }, + { + "epoch": 0.20586322107566304, + "grad_norm": 0.08158103376626968, + "learning_rate": 3.985413698955366e-05, + "loss": 0.0191, + "step": 27850 + }, + { + "epoch": 0.2059371396469649, + "grad_norm": 0.07438033074140549, + "learning_rate": 3.985042735042736e-05, + "loss": 0.02, + "step": 27860 + }, + { + "epoch": 0.20601105821826676, + "grad_norm": 0.06623519212007523, + "learning_rate": 3.9846717711301046e-05, + "loss": 0.0224, + "step": 27870 + }, + { + "epoch": 0.2060849767895686, + "grad_norm": 0.08878178894519806, + "learning_rate": 3.984300807217474e-05, + "loss": 0.018, + "step": 27880 + }, + { + 
"epoch": 0.20615889536087045, + "grad_norm": 0.09008637815713882, + "learning_rate": 3.983929843304843e-05, + "loss": 0.0203, + "step": 27890 + }, + { + "epoch": 0.20623281393217233, + "grad_norm": 0.105464406311512, + "learning_rate": 3.983558879392213e-05, + "loss": 0.0198, + "step": 27900 + }, + { + "epoch": 0.20630673250347417, + "grad_norm": 0.10132227092981339, + "learning_rate": 3.983187915479582e-05, + "loss": 0.0205, + "step": 27910 + }, + { + "epoch": 0.20638065107477602, + "grad_norm": 0.09023267775774002, + "learning_rate": 3.982816951566952e-05, + "loss": 0.0213, + "step": 27920 + }, + { + "epoch": 0.2064545696460779, + "grad_norm": 0.107700414955616, + "learning_rate": 3.9824459876543215e-05, + "loss": 0.0192, + "step": 27930 + }, + { + "epoch": 0.20652848821737974, + "grad_norm": 0.0751541405916214, + "learning_rate": 3.9820750237416904e-05, + "loss": 0.0199, + "step": 27940 + }, + { + "epoch": 0.20660240678868158, + "grad_norm": 0.0796554908156395, + "learning_rate": 3.98170405982906e-05, + "loss": 0.0185, + "step": 27950 + }, + { + "epoch": 0.20667632535998345, + "grad_norm": 0.15034401416778564, + "learning_rate": 3.9813330959164296e-05, + "loss": 0.0197, + "step": 27960 + }, + { + "epoch": 0.2067502439312853, + "grad_norm": 0.09039101749658585, + "learning_rate": 3.9809621320037985e-05, + "loss": 0.0175, + "step": 27970 + }, + { + "epoch": 0.20682416250258714, + "grad_norm": 0.08342040330171585, + "learning_rate": 3.980591168091168e-05, + "loss": 0.0206, + "step": 27980 + }, + { + "epoch": 0.20689808107388902, + "grad_norm": 0.08229884505271912, + "learning_rate": 3.980220204178538e-05, + "loss": 0.0204, + "step": 27990 + }, + { + "epoch": 0.20697199964519086, + "grad_norm": 0.07746469974517822, + "learning_rate": 3.979849240265907e-05, + "loss": 0.0214, + "step": 28000 + }, + { + "epoch": 0.2070459182164927, + "grad_norm": 0.11914195865392685, + "learning_rate": 3.979478276353277e-05, + "loss": 0.0223, + "step": 28010 + }, + { + "epoch": 0.20711983678779455, + "grad_norm": 0.10523311793804169, + "learning_rate": 3.979107312440646e-05, + "loss": 0.0189, + "step": 28020 + }, + { + "epoch": 0.20719375535909643, + "grad_norm": 0.061627957969903946, + "learning_rate": 3.9787363485280154e-05, + "loss": 0.0194, + "step": 28030 + }, + { + "epoch": 0.20726767393039827, + "grad_norm": 0.08928674459457397, + "learning_rate": 3.9783653846153844e-05, + "loss": 0.0194, + "step": 28040 + }, + { + "epoch": 0.20734159250170012, + "grad_norm": 0.11436183750629425, + "learning_rate": 3.977994420702754e-05, + "loss": 0.022, + "step": 28050 + }, + { + "epoch": 0.207415511073002, + "grad_norm": 0.07641822844743729, + "learning_rate": 3.9776234567901236e-05, + "loss": 0.0201, + "step": 28060 + }, + { + "epoch": 0.20748942964430384, + "grad_norm": 0.07226108014583588, + "learning_rate": 3.977252492877493e-05, + "loss": 0.0188, + "step": 28070 + }, + { + "epoch": 0.20756334821560568, + "grad_norm": 0.0811077281832695, + "learning_rate": 3.976881528964863e-05, + "loss": 0.02, + "step": 28080 + }, + { + "epoch": 0.20763726678690755, + "grad_norm": 0.09483274072408676, + "learning_rate": 3.9765105650522323e-05, + "loss": 0.0184, + "step": 28090 + }, + { + "epoch": 0.2077111853582094, + "grad_norm": 0.07932420074939728, + "learning_rate": 3.976139601139601e-05, + "loss": 0.02, + "step": 28100 + }, + { + "epoch": 0.20778510392951124, + "grad_norm": 0.06476841866970062, + "learning_rate": 3.975768637226971e-05, + "loss": 0.0197, + "step": 28110 + }, + { + "epoch": 0.20785902250081312, + "grad_norm": 
0.09106004238128662, + "learning_rate": 3.97539767331434e-05, + "loss": 0.0182, + "step": 28120 + }, + { + "epoch": 0.20793294107211496, + "grad_norm": 0.08542423695325851, + "learning_rate": 3.9750267094017094e-05, + "loss": 0.0204, + "step": 28130 + }, + { + "epoch": 0.2080068596434168, + "grad_norm": 0.060376714915037155, + "learning_rate": 3.974655745489079e-05, + "loss": 0.0171, + "step": 28140 + }, + { + "epoch": 0.20808077821471865, + "grad_norm": 0.0787116289138794, + "learning_rate": 3.9742847815764486e-05, + "loss": 0.0188, + "step": 28150 + }, + { + "epoch": 0.20815469678602053, + "grad_norm": 0.0847581997513771, + "learning_rate": 3.973913817663818e-05, + "loss": 0.0194, + "step": 28160 + }, + { + "epoch": 0.20822861535732237, + "grad_norm": 0.08633068203926086, + "learning_rate": 3.973542853751187e-05, + "loss": 0.0217, + "step": 28170 + }, + { + "epoch": 0.20830253392862422, + "grad_norm": 0.10672096163034439, + "learning_rate": 3.973171889838557e-05, + "loss": 0.0206, + "step": 28180 + }, + { + "epoch": 0.2083764524999261, + "grad_norm": 0.06093067675828934, + "learning_rate": 3.972800925925926e-05, + "loss": 0.0184, + "step": 28190 + }, + { + "epoch": 0.20845037107122794, + "grad_norm": 0.09024010598659515, + "learning_rate": 3.972429962013295e-05, + "loss": 0.0213, + "step": 28200 + }, + { + "epoch": 0.20852428964252978, + "grad_norm": 0.11794493347406387, + "learning_rate": 3.972058998100665e-05, + "loss": 0.0198, + "step": 28210 + }, + { + "epoch": 0.20859820821383165, + "grad_norm": 0.0925167128443718, + "learning_rate": 3.9716880341880344e-05, + "loss": 0.0214, + "step": 28220 + }, + { + "epoch": 0.2086721267851335, + "grad_norm": 0.06808876246213913, + "learning_rate": 3.971317070275404e-05, + "loss": 0.0205, + "step": 28230 + }, + { + "epoch": 0.20874604535643534, + "grad_norm": 0.09446553885936737, + "learning_rate": 3.9709461063627736e-05, + "loss": 0.0232, + "step": 28240 + }, + { + "epoch": 0.20881996392773722, + "grad_norm": 0.07646559178829193, + "learning_rate": 3.9705751424501425e-05, + "loss": 0.018, + "step": 28250 + }, + { + "epoch": 0.20889388249903906, + "grad_norm": 0.10056505352258682, + "learning_rate": 3.970204178537512e-05, + "loss": 0.0196, + "step": 28260 + }, + { + "epoch": 0.2089678010703409, + "grad_norm": 0.08074390143156052, + "learning_rate": 3.969833214624881e-05, + "loss": 0.0179, + "step": 28270 + }, + { + "epoch": 0.20904171964164275, + "grad_norm": 0.08853159844875336, + "learning_rate": 3.9694622507122506e-05, + "loss": 0.0177, + "step": 28280 + }, + { + "epoch": 0.20911563821294463, + "grad_norm": 0.09671463072299957, + "learning_rate": 3.96909128679962e-05, + "loss": 0.0217, + "step": 28290 + }, + { + "epoch": 0.20918955678424647, + "grad_norm": 0.10711846500635147, + "learning_rate": 3.96872032288699e-05, + "loss": 0.0193, + "step": 28300 + }, + { + "epoch": 0.20926347535554832, + "grad_norm": 0.07058045268058777, + "learning_rate": 3.9683493589743594e-05, + "loss": 0.0186, + "step": 28310 + }, + { + "epoch": 0.2093373939268502, + "grad_norm": 0.10393711179494858, + "learning_rate": 3.967978395061729e-05, + "loss": 0.0184, + "step": 28320 + }, + { + "epoch": 0.20941131249815204, + "grad_norm": 0.10901501029729843, + "learning_rate": 3.967607431149098e-05, + "loss": 0.0212, + "step": 28330 + }, + { + "epoch": 0.20948523106945388, + "grad_norm": 0.10953504592180252, + "learning_rate": 3.9672364672364675e-05, + "loss": 0.0187, + "step": 28340 + }, + { + "epoch": 0.20955914964075575, + "grad_norm": 0.09685878455638885, + 
"learning_rate": 3.9668655033238364e-05, + "loss": 0.018, + "step": 28350 + }, + { + "epoch": 0.2096330682120576, + "grad_norm": 0.10276530683040619, + "learning_rate": 3.966494539411206e-05, + "loss": 0.0197, + "step": 28360 + }, + { + "epoch": 0.20970698678335944, + "grad_norm": 0.07335204631090164, + "learning_rate": 3.9661235754985756e-05, + "loss": 0.0205, + "step": 28370 + }, + { + "epoch": 0.20978090535466132, + "grad_norm": 0.10454121232032776, + "learning_rate": 3.965752611585945e-05, + "loss": 0.0206, + "step": 28380 + }, + { + "epoch": 0.20985482392596316, + "grad_norm": 0.0707346722483635, + "learning_rate": 3.965381647673315e-05, + "loss": 0.0203, + "step": 28390 + }, + { + "epoch": 0.209928742497265, + "grad_norm": 0.08866816014051437, + "learning_rate": 3.965010683760684e-05, + "loss": 0.0185, + "step": 28400 + }, + { + "epoch": 0.21000266106856685, + "grad_norm": 0.12376395612955093, + "learning_rate": 3.9646397198480533e-05, + "loss": 0.0185, + "step": 28410 + }, + { + "epoch": 0.21007657963986873, + "grad_norm": 0.06304518133401871, + "learning_rate": 3.964268755935423e-05, + "loss": 0.0225, + "step": 28420 + }, + { + "epoch": 0.21015049821117057, + "grad_norm": 0.08424234390258789, + "learning_rate": 3.963897792022792e-05, + "loss": 0.0202, + "step": 28430 + }, + { + "epoch": 0.21022441678247242, + "grad_norm": 0.09381997585296631, + "learning_rate": 3.9635268281101615e-05, + "loss": 0.0227, + "step": 28440 + }, + { + "epoch": 0.2102983353537743, + "grad_norm": 0.06805253028869629, + "learning_rate": 3.963155864197531e-05, + "loss": 0.0206, + "step": 28450 + }, + { + "epoch": 0.21037225392507614, + "grad_norm": 0.09173522889614105, + "learning_rate": 3.9627849002849006e-05, + "loss": 0.0187, + "step": 28460 + }, + { + "epoch": 0.21044617249637798, + "grad_norm": 0.07035304605960846, + "learning_rate": 3.96241393637227e-05, + "loss": 0.0202, + "step": 28470 + }, + { + "epoch": 0.21052009106767985, + "grad_norm": 0.07232367247343063, + "learning_rate": 3.962042972459639e-05, + "loss": 0.0193, + "step": 28480 + }, + { + "epoch": 0.2105940096389817, + "grad_norm": 0.10332144796848297, + "learning_rate": 3.961672008547009e-05, + "loss": 0.0202, + "step": 28490 + }, + { + "epoch": 0.21066792821028354, + "grad_norm": 0.08793472498655319, + "learning_rate": 3.961301044634378e-05, + "loss": 0.0174, + "step": 28500 + }, + { + "epoch": 0.21074184678158542, + "grad_norm": 0.11272072046995163, + "learning_rate": 3.960930080721747e-05, + "loss": 0.021, + "step": 28510 + }, + { + "epoch": 0.21081576535288726, + "grad_norm": 0.07357959449291229, + "learning_rate": 3.960559116809117e-05, + "loss": 0.0181, + "step": 28520 + }, + { + "epoch": 0.2108896839241891, + "grad_norm": 0.07872324436903, + "learning_rate": 3.9601881528964865e-05, + "loss": 0.0203, + "step": 28530 + }, + { + "epoch": 0.21096360249549095, + "grad_norm": 0.08163028210401535, + "learning_rate": 3.959817188983856e-05, + "loss": 0.0193, + "step": 28540 + }, + { + "epoch": 0.21103752106679283, + "grad_norm": 0.09265872836112976, + "learning_rate": 3.959446225071226e-05, + "loss": 0.0209, + "step": 28550 + }, + { + "epoch": 0.21111143963809467, + "grad_norm": 0.09339044243097305, + "learning_rate": 3.9590752611585946e-05, + "loss": 0.0212, + "step": 28560 + }, + { + "epoch": 0.21118535820939652, + "grad_norm": 0.06638886034488678, + "learning_rate": 3.958704297245964e-05, + "loss": 0.021, + "step": 28570 + }, + { + "epoch": 0.2112592767806984, + "grad_norm": 0.09415410459041595, + "learning_rate": 3.958333333333333e-05, + 
"loss": 0.0198, + "step": 28580 + }, + { + "epoch": 0.21133319535200024, + "grad_norm": 0.09834780544042587, + "learning_rate": 3.957962369420703e-05, + "loss": 0.02, + "step": 28590 + }, + { + "epoch": 0.21140711392330208, + "grad_norm": 0.09509751945734024, + "learning_rate": 3.957591405508072e-05, + "loss": 0.0186, + "step": 28600 + }, + { + "epoch": 0.21148103249460395, + "grad_norm": 0.08566658943891525, + "learning_rate": 3.957220441595442e-05, + "loss": 0.0187, + "step": 28610 + }, + { + "epoch": 0.2115549510659058, + "grad_norm": 0.09062514454126358, + "learning_rate": 3.9568494776828115e-05, + "loss": 0.0205, + "step": 28620 + }, + { + "epoch": 0.21162886963720764, + "grad_norm": 0.0751434788107872, + "learning_rate": 3.9564785137701804e-05, + "loss": 0.0171, + "step": 28630 + }, + { + "epoch": 0.21170278820850952, + "grad_norm": 0.08853239566087723, + "learning_rate": 3.95610754985755e-05, + "loss": 0.0206, + "step": 28640 + }, + { + "epoch": 0.21177670677981136, + "grad_norm": 0.10610842704772949, + "learning_rate": 3.9557365859449196e-05, + "loss": 0.0182, + "step": 28650 + }, + { + "epoch": 0.2118506253511132, + "grad_norm": 0.11282014846801758, + "learning_rate": 3.9553656220322885e-05, + "loss": 0.0177, + "step": 28660 + }, + { + "epoch": 0.21192454392241505, + "grad_norm": 0.08027879148721695, + "learning_rate": 3.954994658119658e-05, + "loss": 0.0213, + "step": 28670 + }, + { + "epoch": 0.21199846249371693, + "grad_norm": 0.09355953335762024, + "learning_rate": 3.954623694207028e-05, + "loss": 0.0202, + "step": 28680 + }, + { + "epoch": 0.21207238106501877, + "grad_norm": 0.09194199740886688, + "learning_rate": 3.954252730294397e-05, + "loss": 0.0204, + "step": 28690 + }, + { + "epoch": 0.21214629963632062, + "grad_norm": 0.0887255147099495, + "learning_rate": 3.953881766381767e-05, + "loss": 0.0214, + "step": 28700 + }, + { + "epoch": 0.2122202182076225, + "grad_norm": 0.08310788124799728, + "learning_rate": 3.953510802469136e-05, + "loss": 0.0211, + "step": 28710 + }, + { + "epoch": 0.21229413677892434, + "grad_norm": 0.0715780034661293, + "learning_rate": 3.9531398385565054e-05, + "loss": 0.0182, + "step": 28720 + }, + { + "epoch": 0.21236805535022618, + "grad_norm": 0.08348576724529266, + "learning_rate": 3.9527688746438743e-05, + "loss": 0.0197, + "step": 28730 + }, + { + "epoch": 0.21244197392152805, + "grad_norm": 0.11897409707307816, + "learning_rate": 3.952397910731244e-05, + "loss": 0.0193, + "step": 28740 + }, + { + "epoch": 0.2125158924928299, + "grad_norm": 0.08021600544452667, + "learning_rate": 3.9520269468186135e-05, + "loss": 0.019, + "step": 28750 + }, + { + "epoch": 0.21258981106413175, + "grad_norm": 0.07227059453725815, + "learning_rate": 3.951655982905983e-05, + "loss": 0.02, + "step": 28760 + }, + { + "epoch": 0.21266372963543362, + "grad_norm": 0.09160932153463364, + "learning_rate": 3.951285018993353e-05, + "loss": 0.0213, + "step": 28770 + }, + { + "epoch": 0.21273764820673546, + "grad_norm": 0.0931738093495369, + "learning_rate": 3.950914055080722e-05, + "loss": 0.0198, + "step": 28780 + }, + { + "epoch": 0.2128115667780373, + "grad_norm": 0.08352254331111908, + "learning_rate": 3.950543091168091e-05, + "loss": 0.021, + "step": 28790 + }, + { + "epoch": 0.21288548534933915, + "grad_norm": 0.08564615249633789, + "learning_rate": 3.950172127255461e-05, + "loss": 0.0187, + "step": 28800 + }, + { + "epoch": 0.21295940392064103, + "grad_norm": 0.1044422835111618, + "learning_rate": 3.94980116334283e-05, + "loss": 0.0198, + "step": 28810 + }, + { + 
"epoch": 0.21303332249194287, + "grad_norm": 0.07070822268724442, + "learning_rate": 3.9494301994301994e-05, + "loss": 0.0189, + "step": 28820 + }, + { + "epoch": 0.21310724106324472, + "grad_norm": 0.07984407991170883, + "learning_rate": 3.949059235517569e-05, + "loss": 0.0189, + "step": 28830 + }, + { + "epoch": 0.2131811596345466, + "grad_norm": 0.08178012818098068, + "learning_rate": 3.9486882716049385e-05, + "loss": 0.0196, + "step": 28840 + }, + { + "epoch": 0.21325507820584844, + "grad_norm": 0.06512026488780975, + "learning_rate": 3.948317307692308e-05, + "loss": 0.0207, + "step": 28850 + }, + { + "epoch": 0.21332899677715028, + "grad_norm": 0.08254005759954453, + "learning_rate": 3.947946343779677e-05, + "loss": 0.0182, + "step": 28860 + }, + { + "epoch": 0.21340291534845215, + "grad_norm": 0.10215901583433151, + "learning_rate": 3.9475753798670467e-05, + "loss": 0.0184, + "step": 28870 + }, + { + "epoch": 0.213476833919754, + "grad_norm": 0.08271851390600204, + "learning_rate": 3.947204415954416e-05, + "loss": 0.0174, + "step": 28880 + }, + { + "epoch": 0.21355075249105585, + "grad_norm": 0.08024625480175018, + "learning_rate": 3.946833452041785e-05, + "loss": 0.0183, + "step": 28890 + }, + { + "epoch": 0.21362467106235772, + "grad_norm": 0.06575581431388855, + "learning_rate": 3.946462488129155e-05, + "loss": 0.0184, + "step": 28900 + }, + { + "epoch": 0.21369858963365956, + "grad_norm": 0.12389591336250305, + "learning_rate": 3.9460915242165244e-05, + "loss": 0.0229, + "step": 28910 + }, + { + "epoch": 0.2137725082049614, + "grad_norm": 0.07798313349485397, + "learning_rate": 3.945720560303894e-05, + "loss": 0.0181, + "step": 28920 + }, + { + "epoch": 0.21384642677626328, + "grad_norm": 0.07742154598236084, + "learning_rate": 3.9453495963912636e-05, + "loss": 0.0182, + "step": 28930 + }, + { + "epoch": 0.21392034534756513, + "grad_norm": 0.0996508002281189, + "learning_rate": 3.9449786324786325e-05, + "loss": 0.0205, + "step": 28940 + }, + { + "epoch": 0.21399426391886697, + "grad_norm": 0.07458839565515518, + "learning_rate": 3.944607668566002e-05, + "loss": 0.0176, + "step": 28950 + }, + { + "epoch": 0.21406818249016882, + "grad_norm": 0.11061214655637741, + "learning_rate": 3.944236704653371e-05, + "loss": 0.0197, + "step": 28960 + }, + { + "epoch": 0.2141421010614707, + "grad_norm": 0.07425292581319809, + "learning_rate": 3.9438657407407406e-05, + "loss": 0.0171, + "step": 28970 + }, + { + "epoch": 0.21421601963277254, + "grad_norm": 0.08154287934303284, + "learning_rate": 3.94349477682811e-05, + "loss": 0.0204, + "step": 28980 + }, + { + "epoch": 0.21428993820407438, + "grad_norm": 0.09038446098566055, + "learning_rate": 3.94312381291548e-05, + "loss": 0.0187, + "step": 28990 + }, + { + "epoch": 0.21436385677537625, + "grad_norm": 0.09763018041849136, + "learning_rate": 3.9427528490028494e-05, + "loss": 0.0195, + "step": 29000 + }, + { + "epoch": 0.2144377753466781, + "grad_norm": 0.07743502408266068, + "learning_rate": 3.942381885090219e-05, + "loss": 0.0194, + "step": 29010 + }, + { + "epoch": 0.21451169391797995, + "grad_norm": 0.08858030289411545, + "learning_rate": 3.942010921177588e-05, + "loss": 0.0191, + "step": 29020 + }, + { + "epoch": 0.21458561248928182, + "grad_norm": 0.07084911316633224, + "learning_rate": 3.9416399572649575e-05, + "loss": 0.0193, + "step": 29030 + }, + { + "epoch": 0.21465953106058366, + "grad_norm": 0.108488067984581, + "learning_rate": 3.9412689933523264e-05, + "loss": 0.0208, + "step": 29040 + }, + { + "epoch": 0.2147334496318855, + 
"grad_norm": 0.08137663453817368, + "learning_rate": 3.940898029439696e-05, + "loss": 0.0194, + "step": 29050 + }, + { + "epoch": 0.21480736820318738, + "grad_norm": 0.07932858914136887, + "learning_rate": 3.940527065527066e-05, + "loss": 0.02, + "step": 29060 + }, + { + "epoch": 0.21488128677448923, + "grad_norm": 0.0741783156991005, + "learning_rate": 3.940156101614435e-05, + "loss": 0.02, + "step": 29070 + }, + { + "epoch": 0.21495520534579107, + "grad_norm": 0.09059015661478043, + "learning_rate": 3.939785137701805e-05, + "loss": 0.0196, + "step": 29080 + }, + { + "epoch": 0.21502912391709292, + "grad_norm": 0.10432152450084686, + "learning_rate": 3.939414173789174e-05, + "loss": 0.0184, + "step": 29090 + }, + { + "epoch": 0.2151030424883948, + "grad_norm": 0.08571776747703552, + "learning_rate": 3.939043209876543e-05, + "loss": 0.0184, + "step": 29100 + }, + { + "epoch": 0.21517696105969664, + "grad_norm": 0.09835029393434525, + "learning_rate": 3.938672245963913e-05, + "loss": 0.0186, + "step": 29110 + }, + { + "epoch": 0.21525087963099848, + "grad_norm": 0.08376480638980865, + "learning_rate": 3.938301282051282e-05, + "loss": 0.0182, + "step": 29120 + }, + { + "epoch": 0.21532479820230035, + "grad_norm": 0.06342848390340805, + "learning_rate": 3.9379303181386514e-05, + "loss": 0.0168, + "step": 29130 + }, + { + "epoch": 0.2153987167736022, + "grad_norm": 0.08471592515707016, + "learning_rate": 3.937559354226021e-05, + "loss": 0.02, + "step": 29140 + }, + { + "epoch": 0.21547263534490405, + "grad_norm": 0.0840645506978035, + "learning_rate": 3.9371883903133906e-05, + "loss": 0.0193, + "step": 29150 + }, + { + "epoch": 0.21554655391620592, + "grad_norm": 0.11769651621580124, + "learning_rate": 3.93681742640076e-05, + "loss": 0.0203, + "step": 29160 + }, + { + "epoch": 0.21562047248750776, + "grad_norm": 0.1026725247502327, + "learning_rate": 3.936446462488129e-05, + "loss": 0.0216, + "step": 29170 + }, + { + "epoch": 0.2156943910588096, + "grad_norm": 0.10950805991888046, + "learning_rate": 3.936075498575499e-05, + "loss": 0.0175, + "step": 29180 + }, + { + "epoch": 0.21576830963011148, + "grad_norm": 0.08458021283149719, + "learning_rate": 3.9357045346628677e-05, + "loss": 0.0195, + "step": 29190 + }, + { + "epoch": 0.21584222820141333, + "grad_norm": 0.0908300057053566, + "learning_rate": 3.935333570750237e-05, + "loss": 0.0219, + "step": 29200 + }, + { + "epoch": 0.21591614677271517, + "grad_norm": 0.10464771091938019, + "learning_rate": 3.9349626068376075e-05, + "loss": 0.0195, + "step": 29210 + }, + { + "epoch": 0.21599006534401702, + "grad_norm": 0.07181484997272491, + "learning_rate": 3.9345916429249764e-05, + "loss": 0.0162, + "step": 29220 + }, + { + "epoch": 0.2160639839153189, + "grad_norm": 0.09997867047786713, + "learning_rate": 3.934220679012346e-05, + "loss": 0.021, + "step": 29230 + }, + { + "epoch": 0.21613790248662074, + "grad_norm": 0.07985897362232208, + "learning_rate": 3.9338497150997156e-05, + "loss": 0.0187, + "step": 29240 + }, + { + "epoch": 0.21621182105792258, + "grad_norm": 0.06396341323852539, + "learning_rate": 3.9334787511870846e-05, + "loss": 0.0191, + "step": 29250 + }, + { + "epoch": 0.21628573962922445, + "grad_norm": 0.0796850398182869, + "learning_rate": 3.933107787274454e-05, + "loss": 0.0202, + "step": 29260 + }, + { + "epoch": 0.2163596582005263, + "grad_norm": 0.06869375705718994, + "learning_rate": 3.932736823361823e-05, + "loss": 0.0193, + "step": 29270 + }, + { + "epoch": 0.21643357677182815, + "grad_norm": 0.08196059614419937, + 
"learning_rate": 3.932365859449193e-05, + "loss": 0.0177, + "step": 29280 + }, + { + "epoch": 0.21650749534313002, + "grad_norm": 0.12254177033901215, + "learning_rate": 3.931994895536563e-05, + "loss": 0.019, + "step": 29290 + }, + { + "epoch": 0.21658141391443186, + "grad_norm": 0.09668796509504318, + "learning_rate": 3.931623931623932e-05, + "loss": 0.0207, + "step": 29300 + }, + { + "epoch": 0.2166553324857337, + "grad_norm": 0.11711964011192322, + "learning_rate": 3.9312529677113015e-05, + "loss": 0.0196, + "step": 29310 + }, + { + "epoch": 0.21672925105703558, + "grad_norm": 0.09891097992658615, + "learning_rate": 3.9308820037986704e-05, + "loss": 0.0193, + "step": 29320 + }, + { + "epoch": 0.21680316962833743, + "grad_norm": 0.06463112682104111, + "learning_rate": 3.93051103988604e-05, + "loss": 0.0175, + "step": 29330 + }, + { + "epoch": 0.21687708819963927, + "grad_norm": 0.08490326255559921, + "learning_rate": 3.9301400759734096e-05, + "loss": 0.02, + "step": 29340 + }, + { + "epoch": 0.21695100677094112, + "grad_norm": 0.09126602858304977, + "learning_rate": 3.9297691120607785e-05, + "loss": 0.021, + "step": 29350 + }, + { + "epoch": 0.217024925342243, + "grad_norm": 0.08907146006822586, + "learning_rate": 3.929398148148149e-05, + "loss": 0.0173, + "step": 29360 + }, + { + "epoch": 0.21709884391354484, + "grad_norm": 0.08719177544116974, + "learning_rate": 3.929027184235518e-05, + "loss": 0.0214, + "step": 29370 + }, + { + "epoch": 0.21717276248484668, + "grad_norm": 0.08162589371204376, + "learning_rate": 3.928656220322887e-05, + "loss": 0.0213, + "step": 29380 + }, + { + "epoch": 0.21724668105614856, + "grad_norm": 0.0870555192232132, + "learning_rate": 3.928285256410257e-05, + "loss": 0.0205, + "step": 29390 + }, + { + "epoch": 0.2173205996274504, + "grad_norm": 0.08443516492843628, + "learning_rate": 3.927914292497626e-05, + "loss": 0.0177, + "step": 29400 + }, + { + "epoch": 0.21739451819875225, + "grad_norm": 0.12415960431098938, + "learning_rate": 3.9275433285849954e-05, + "loss": 0.0205, + "step": 29410 + }, + { + "epoch": 0.21746843677005412, + "grad_norm": 0.12497182935476303, + "learning_rate": 3.927172364672364e-05, + "loss": 0.0201, + "step": 29420 + }, + { + "epoch": 0.21754235534135596, + "grad_norm": 0.10130871087312698, + "learning_rate": 3.926801400759734e-05, + "loss": 0.016, + "step": 29430 + }, + { + "epoch": 0.2176162739126578, + "grad_norm": 0.08756314963102341, + "learning_rate": 3.926430436847104e-05, + "loss": 0.0166, + "step": 29440 + }, + { + "epoch": 0.21769019248395968, + "grad_norm": 0.10453139245510101, + "learning_rate": 3.926059472934473e-05, + "loss": 0.019, + "step": 29450 + }, + { + "epoch": 0.21776411105526153, + "grad_norm": 0.09802225977182388, + "learning_rate": 3.925688509021843e-05, + "loss": 0.0206, + "step": 29460 + }, + { + "epoch": 0.21783802962656337, + "grad_norm": 0.07709541916847229, + "learning_rate": 3.925317545109212e-05, + "loss": 0.0152, + "step": 29470 + }, + { + "epoch": 0.21791194819786522, + "grad_norm": 0.10449168086051941, + "learning_rate": 3.924946581196581e-05, + "loss": 0.0189, + "step": 29480 + }, + { + "epoch": 0.2179858667691671, + "grad_norm": 0.09322134405374527, + "learning_rate": 3.924575617283951e-05, + "loss": 0.0207, + "step": 29490 + }, + { + "epoch": 0.21805978534046894, + "grad_norm": 0.0893213301897049, + "learning_rate": 3.92420465337132e-05, + "loss": 0.0182, + "step": 29500 + }, + { + "epoch": 0.21813370391177078, + "grad_norm": 0.06929013133049011, + "learning_rate": 3.92383368945869e-05, + 
"loss": 0.0194, + "step": 29510 + }, + { + "epoch": 0.21820762248307266, + "grad_norm": 0.08194336295127869, + "learning_rate": 3.9234627255460596e-05, + "loss": 0.0211, + "step": 29520 + }, + { + "epoch": 0.2182815410543745, + "grad_norm": 0.09584349393844604, + "learning_rate": 3.9230917616334285e-05, + "loss": 0.0187, + "step": 29530 + }, + { + "epoch": 0.21835545962567635, + "grad_norm": 0.08808445930480957, + "learning_rate": 3.922720797720798e-05, + "loss": 0.018, + "step": 29540 + }, + { + "epoch": 0.21842937819697822, + "grad_norm": 0.08436179161071777, + "learning_rate": 3.922349833808167e-05, + "loss": 0.0182, + "step": 29550 + }, + { + "epoch": 0.21850329676828006, + "grad_norm": 0.09081083536148071, + "learning_rate": 3.9219788698955366e-05, + "loss": 0.0196, + "step": 29560 + }, + { + "epoch": 0.2185772153395819, + "grad_norm": 0.06510711461305618, + "learning_rate": 3.921607905982906e-05, + "loss": 0.0198, + "step": 29570 + }, + { + "epoch": 0.21865113391088378, + "grad_norm": 0.06192505732178688, + "learning_rate": 3.921236942070275e-05, + "loss": 0.0171, + "step": 29580 + }, + { + "epoch": 0.21872505248218563, + "grad_norm": 0.10911913961172104, + "learning_rate": 3.9208659781576454e-05, + "loss": 0.02, + "step": 29590 + }, + { + "epoch": 0.21879897105348747, + "grad_norm": 0.10316479951143265, + "learning_rate": 3.9204950142450143e-05, + "loss": 0.0207, + "step": 29600 + }, + { + "epoch": 0.21887288962478932, + "grad_norm": 0.09082043915987015, + "learning_rate": 3.920124050332384e-05, + "loss": 0.0199, + "step": 29610 + }, + { + "epoch": 0.2189468081960912, + "grad_norm": 0.07779832184314728, + "learning_rate": 3.9197530864197535e-05, + "loss": 0.0194, + "step": 29620 + }, + { + "epoch": 0.21902072676739304, + "grad_norm": 0.09248018264770508, + "learning_rate": 3.9193821225071225e-05, + "loss": 0.0179, + "step": 29630 + }, + { + "epoch": 0.21909464533869488, + "grad_norm": 0.08787126839160919, + "learning_rate": 3.919011158594492e-05, + "loss": 0.021, + "step": 29640 + }, + { + "epoch": 0.21916856390999676, + "grad_norm": 0.10116524994373322, + "learning_rate": 3.918640194681861e-05, + "loss": 0.0186, + "step": 29650 + }, + { + "epoch": 0.2192424824812986, + "grad_norm": 0.09683122485876083, + "learning_rate": 3.918269230769231e-05, + "loss": 0.0189, + "step": 29660 + }, + { + "epoch": 0.21931640105260045, + "grad_norm": 0.09836520254611969, + "learning_rate": 3.917898266856601e-05, + "loss": 0.019, + "step": 29670 + }, + { + "epoch": 0.21939031962390232, + "grad_norm": 0.080613873898983, + "learning_rate": 3.91752730294397e-05, + "loss": 0.0178, + "step": 29680 + }, + { + "epoch": 0.21946423819520416, + "grad_norm": 0.11065308004617691, + "learning_rate": 3.9171563390313394e-05, + "loss": 0.0179, + "step": 29690 + }, + { + "epoch": 0.219538156766506, + "grad_norm": 0.04144902527332306, + "learning_rate": 3.916785375118709e-05, + "loss": 0.0166, + "step": 29700 + }, + { + "epoch": 0.21961207533780788, + "grad_norm": 0.08354011923074722, + "learning_rate": 3.916414411206078e-05, + "loss": 0.0195, + "step": 29710 + }, + { + "epoch": 0.21968599390910973, + "grad_norm": 0.09685730189085007, + "learning_rate": 3.9160434472934475e-05, + "loss": 0.0199, + "step": 29720 + }, + { + "epoch": 0.21975991248041157, + "grad_norm": 0.07402803003787994, + "learning_rate": 3.9156724833808164e-05, + "loss": 0.0168, + "step": 29730 + }, + { + "epoch": 0.21983383105171342, + "grad_norm": 0.08626370131969452, + "learning_rate": 3.915301519468187e-05, + "loss": 0.0202, + "step": 29740 + }, + 
{ + "epoch": 0.2199077496230153, + "grad_norm": 0.12363871932029724, + "learning_rate": 3.914930555555556e-05, + "loss": 0.0193, + "step": 29750 + }, + { + "epoch": 0.21998166819431714, + "grad_norm": 0.07647421211004257, + "learning_rate": 3.914559591642925e-05, + "loss": 0.0187, + "step": 29760 + }, + { + "epoch": 0.22005558676561898, + "grad_norm": 0.10127396136522293, + "learning_rate": 3.914188627730295e-05, + "loss": 0.0185, + "step": 29770 + }, + { + "epoch": 0.22012950533692086, + "grad_norm": 0.11334076523780823, + "learning_rate": 3.913817663817664e-05, + "loss": 0.0225, + "step": 29780 + }, + { + "epoch": 0.2202034239082227, + "grad_norm": 0.08629689365625381, + "learning_rate": 3.913446699905033e-05, + "loss": 0.0193, + "step": 29790 + }, + { + "epoch": 0.22027734247952455, + "grad_norm": 0.07195857167243958, + "learning_rate": 3.913075735992403e-05, + "loss": 0.0191, + "step": 29800 + }, + { + "epoch": 0.22035126105082642, + "grad_norm": 0.09427592903375626, + "learning_rate": 3.9127047720797725e-05, + "loss": 0.0177, + "step": 29810 + }, + { + "epoch": 0.22042517962212826, + "grad_norm": 0.14630918204784393, + "learning_rate": 3.912333808167142e-05, + "loss": 0.0199, + "step": 29820 + }, + { + "epoch": 0.2204990981934301, + "grad_norm": 0.10568604618310928, + "learning_rate": 3.911962844254511e-05, + "loss": 0.018, + "step": 29830 + }, + { + "epoch": 0.22057301676473198, + "grad_norm": 0.09192904829978943, + "learning_rate": 3.9115918803418806e-05, + "loss": 0.0215, + "step": 29840 + }, + { + "epoch": 0.22064693533603383, + "grad_norm": 0.08686095476150513, + "learning_rate": 3.91122091642925e-05, + "loss": 0.0187, + "step": 29850 + }, + { + "epoch": 0.22072085390733567, + "grad_norm": 0.09952393174171448, + "learning_rate": 3.910849952516619e-05, + "loss": 0.0211, + "step": 29860 + }, + { + "epoch": 0.22079477247863755, + "grad_norm": 0.08013574033975601, + "learning_rate": 3.910478988603989e-05, + "loss": 0.0193, + "step": 29870 + }, + { + "epoch": 0.2208686910499394, + "grad_norm": 0.0664672777056694, + "learning_rate": 3.9101080246913576e-05, + "loss": 0.0183, + "step": 29880 + }, + { + "epoch": 0.22094260962124124, + "grad_norm": 0.07354767620563507, + "learning_rate": 3.909737060778728e-05, + "loss": 0.0181, + "step": 29890 + }, + { + "epoch": 0.22101652819254308, + "grad_norm": 0.0886886715888977, + "learning_rate": 3.9093660968660975e-05, + "loss": 0.0184, + "step": 29900 + }, + { + "epoch": 0.22109044676384496, + "grad_norm": 0.08277405798435211, + "learning_rate": 3.9089951329534664e-05, + "loss": 0.0204, + "step": 29910 + }, + { + "epoch": 0.2211643653351468, + "grad_norm": 0.10522552579641342, + "learning_rate": 3.908624169040836e-05, + "loss": 0.0213, + "step": 29920 + }, + { + "epoch": 0.22123828390644865, + "grad_norm": 0.08946707099676132, + "learning_rate": 3.9082532051282056e-05, + "loss": 0.0191, + "step": 29930 + }, + { + "epoch": 0.22131220247775052, + "grad_norm": 0.08845418691635132, + "learning_rate": 3.9078822412155745e-05, + "loss": 0.02, + "step": 29940 + }, + { + "epoch": 0.22138612104905236, + "grad_norm": 0.08584783226251602, + "learning_rate": 3.907511277302944e-05, + "loss": 0.0193, + "step": 29950 + }, + { + "epoch": 0.2214600396203542, + "grad_norm": 0.0865851640701294, + "learning_rate": 3.907140313390314e-05, + "loss": 0.0191, + "step": 29960 + }, + { + "epoch": 0.22153395819165608, + "grad_norm": 0.0629979595541954, + "learning_rate": 3.906769349477683e-05, + "loss": 0.0187, + "step": 29970 + }, + { + "epoch": 0.22160787676295793, + 
"grad_norm": 0.05814515799283981, + "learning_rate": 3.906398385565053e-05, + "loss": 0.0204, + "step": 29980 + }, + { + "epoch": 0.22168179533425977, + "grad_norm": 0.1256542056798935, + "learning_rate": 3.906027421652422e-05, + "loss": 0.0224, + "step": 29990 + }, + { + "epoch": 0.22175571390556165, + "grad_norm": 0.1209261491894722, + "learning_rate": 3.9056564577397914e-05, + "loss": 0.0185, + "step": 30000 + }, + { + "epoch": 0.22175571390556165, + "eval_f1": 0.5997543019336526, + "eval_loss": 0.019051436334848404, + "eval_precision": 0.47411606732788514, + "eval_recall": 0.8159863391440243, + "eval_runtime": 2664.1736, + "eval_samples_per_second": 203.115, + "eval_steps_per_second": 3.174, + "step": 30000 + }, + { + "epoch": 0.2218296324768635, + "grad_norm": 0.08469045907258987, + "learning_rate": 3.9052854938271604e-05, + "loss": 0.0206, + "step": 30010 + }, + { + "epoch": 0.22190355104816534, + "grad_norm": 0.09641801565885544, + "learning_rate": 3.90491452991453e-05, + "loss": 0.0168, + "step": 30020 + }, + { + "epoch": 0.22197746961946718, + "grad_norm": 0.08884450048208237, + "learning_rate": 3.9045435660018995e-05, + "loss": 0.019, + "step": 30030 + }, + { + "epoch": 0.22205138819076906, + "grad_norm": 0.09590624272823334, + "learning_rate": 3.904172602089269e-05, + "loss": 0.0207, + "step": 30040 + }, + { + "epoch": 0.2221253067620709, + "grad_norm": 0.08627372980117798, + "learning_rate": 3.903801638176639e-05, + "loss": 0.0161, + "step": 30050 + }, + { + "epoch": 0.22219922533337275, + "grad_norm": 0.10284058004617691, + "learning_rate": 3.9034306742640077e-05, + "loss": 0.0198, + "step": 30060 + }, + { + "epoch": 0.22227314390467462, + "grad_norm": 0.13199660181999207, + "learning_rate": 3.903059710351377e-05, + "loss": 0.0199, + "step": 30070 + }, + { + "epoch": 0.22234706247597646, + "grad_norm": 0.0878579244017601, + "learning_rate": 3.902688746438747e-05, + "loss": 0.0188, + "step": 30080 + }, + { + "epoch": 0.2224209810472783, + "grad_norm": 0.07251982390880585, + "learning_rate": 3.902317782526116e-05, + "loss": 0.0198, + "step": 30090 + }, + { + "epoch": 0.22249489961858018, + "grad_norm": 0.06770602613687515, + "learning_rate": 3.9019468186134854e-05, + "loss": 0.0186, + "step": 30100 + }, + { + "epoch": 0.22256881818988203, + "grad_norm": 0.08190783858299255, + "learning_rate": 3.901575854700854e-05, + "loss": 0.0161, + "step": 30110 + }, + { + "epoch": 0.22264273676118387, + "grad_norm": 0.07122600078582764, + "learning_rate": 3.9012048907882246e-05, + "loss": 0.0177, + "step": 30120 + }, + { + "epoch": 0.22271665533248575, + "grad_norm": 0.09126278758049011, + "learning_rate": 3.900833926875594e-05, + "loss": 0.0187, + "step": 30130 + }, + { + "epoch": 0.2227905739037876, + "grad_norm": 0.08449380099773407, + "learning_rate": 3.900462962962963e-05, + "loss": 0.0196, + "step": 30140 + }, + { + "epoch": 0.22286449247508944, + "grad_norm": 0.07444407790899277, + "learning_rate": 3.900091999050333e-05, + "loss": 0.0191, + "step": 30150 + }, + { + "epoch": 0.22293841104639128, + "grad_norm": 0.07632920145988464, + "learning_rate": 3.899721035137702e-05, + "loss": 0.0204, + "step": 30160 + }, + { + "epoch": 0.22301232961769316, + "grad_norm": 0.08453027904033661, + "learning_rate": 3.899350071225071e-05, + "loss": 0.0181, + "step": 30170 + }, + { + "epoch": 0.223086248188995, + "grad_norm": 0.09648805111646652, + "learning_rate": 3.898979107312441e-05, + "loss": 0.0194, + "step": 30180 + }, + { + "epoch": 0.22316016676029685, + "grad_norm": 0.09008285403251648, + 
"learning_rate": 3.8986081433998104e-05, + "loss": 0.0184, + "step": 30190 + }, + { + "epoch": 0.22323408533159872, + "grad_norm": 0.07204605638980865, + "learning_rate": 3.89823717948718e-05, + "loss": 0.0194, + "step": 30200 + }, + { + "epoch": 0.22330800390290056, + "grad_norm": 0.08470254391431808, + "learning_rate": 3.8978662155745496e-05, + "loss": 0.0191, + "step": 30210 + }, + { + "epoch": 0.2233819224742024, + "grad_norm": 0.1287703961133957, + "learning_rate": 3.8974952516619185e-05, + "loss": 0.0171, + "step": 30220 + }, + { + "epoch": 0.22345584104550428, + "grad_norm": 0.11528854072093964, + "learning_rate": 3.897124287749288e-05, + "loss": 0.0209, + "step": 30230 + }, + { + "epoch": 0.22352975961680613, + "grad_norm": 0.07534787058830261, + "learning_rate": 3.896753323836657e-05, + "loss": 0.0182, + "step": 30240 + }, + { + "epoch": 0.22360367818810797, + "grad_norm": 0.0873965248465538, + "learning_rate": 3.8963823599240266e-05, + "loss": 0.0155, + "step": 30250 + }, + { + "epoch": 0.22367759675940985, + "grad_norm": 0.11592572182416916, + "learning_rate": 3.896011396011396e-05, + "loss": 0.0183, + "step": 30260 + }, + { + "epoch": 0.2237515153307117, + "grad_norm": 0.1082519143819809, + "learning_rate": 3.895640432098766e-05, + "loss": 0.0169, + "step": 30270 + }, + { + "epoch": 0.22382543390201354, + "grad_norm": 0.0788358822464943, + "learning_rate": 3.8952694681861354e-05, + "loss": 0.0163, + "step": 30280 + }, + { + "epoch": 0.22389935247331538, + "grad_norm": 0.09216403216123581, + "learning_rate": 3.894898504273504e-05, + "loss": 0.0197, + "step": 30290 + }, + { + "epoch": 0.22397327104461726, + "grad_norm": 0.08517227321863174, + "learning_rate": 3.894527540360874e-05, + "loss": 0.0205, + "step": 30300 + }, + { + "epoch": 0.2240471896159191, + "grad_norm": 0.13697083294391632, + "learning_rate": 3.8941565764482435e-05, + "loss": 0.0206, + "step": 30310 + }, + { + "epoch": 0.22412110818722095, + "grad_norm": 0.07699858397245407, + "learning_rate": 3.8937856125356124e-05, + "loss": 0.0186, + "step": 30320 + }, + { + "epoch": 0.22419502675852282, + "grad_norm": 0.15653513371944427, + "learning_rate": 3.893414648622982e-05, + "loss": 0.0198, + "step": 30330 + }, + { + "epoch": 0.22426894532982466, + "grad_norm": 0.08664526790380478, + "learning_rate": 3.8930436847103516e-05, + "loss": 0.0208, + "step": 30340 + }, + { + "epoch": 0.2243428639011265, + "grad_norm": 0.06121116131544113, + "learning_rate": 3.892672720797721e-05, + "loss": 0.0189, + "step": 30350 + }, + { + "epoch": 0.22441678247242838, + "grad_norm": 0.09972760826349258, + "learning_rate": 3.892301756885091e-05, + "loss": 0.0181, + "step": 30360 + }, + { + "epoch": 0.22449070104373023, + "grad_norm": 0.09164098650217056, + "learning_rate": 3.89193079297246e-05, + "loss": 0.0168, + "step": 30370 + }, + { + "epoch": 0.22456461961503207, + "grad_norm": 0.1236223429441452, + "learning_rate": 3.891559829059829e-05, + "loss": 0.017, + "step": 30380 + }, + { + "epoch": 0.22463853818633395, + "grad_norm": 0.1081567257642746, + "learning_rate": 3.891188865147199e-05, + "loss": 0.0185, + "step": 30390 + }, + { + "epoch": 0.2247124567576358, + "grad_norm": 0.059913747012615204, + "learning_rate": 3.890817901234568e-05, + "loss": 0.0171, + "step": 30400 + }, + { + "epoch": 0.22478637532893764, + "grad_norm": 0.07820994406938553, + "learning_rate": 3.8904469373219374e-05, + "loss": 0.0172, + "step": 30410 + }, + { + "epoch": 0.22486029390023948, + "grad_norm": 0.08917850255966187, + "learning_rate": 3.890075973409307e-05, 
+ "loss": 0.0169, + "step": 30420 + }, + { + "epoch": 0.22493421247154136, + "grad_norm": 0.09755807369947433, + "learning_rate": 3.8897050094966766e-05, + "loss": 0.0196, + "step": 30430 + }, + { + "epoch": 0.2250081310428432, + "grad_norm": 0.09309454262256622, + "learning_rate": 3.889334045584046e-05, + "loss": 0.0209, + "step": 30440 + }, + { + "epoch": 0.22508204961414505, + "grad_norm": 0.09513840824365616, + "learning_rate": 3.888963081671415e-05, + "loss": 0.0218, + "step": 30450 + }, + { + "epoch": 0.22515596818544692, + "grad_norm": 0.09850829094648361, + "learning_rate": 3.888592117758785e-05, + "loss": 0.0185, + "step": 30460 + }, + { + "epoch": 0.22522988675674877, + "grad_norm": 0.0765216201543808, + "learning_rate": 3.888221153846154e-05, + "loss": 0.0163, + "step": 30470 + }, + { + "epoch": 0.2253038053280506, + "grad_norm": 0.08078419417142868, + "learning_rate": 3.887850189933523e-05, + "loss": 0.0191, + "step": 30480 + }, + { + "epoch": 0.22537772389935248, + "grad_norm": 0.09681311249732971, + "learning_rate": 3.887479226020893e-05, + "loss": 0.021, + "step": 30490 + }, + { + "epoch": 0.22545164247065433, + "grad_norm": 0.10332570225000381, + "learning_rate": 3.8871082621082625e-05, + "loss": 0.023, + "step": 30500 + }, + { + "epoch": 0.22552556104195617, + "grad_norm": 0.10160394012928009, + "learning_rate": 3.886737298195632e-05, + "loss": 0.0184, + "step": 30510 + }, + { + "epoch": 0.22559947961325805, + "grad_norm": 0.09615443646907806, + "learning_rate": 3.886366334283001e-05, + "loss": 0.0208, + "step": 30520 + }, + { + "epoch": 0.2256733981845599, + "grad_norm": 0.08312519639730453, + "learning_rate": 3.8859953703703706e-05, + "loss": 0.0201, + "step": 30530 + }, + { + "epoch": 0.22574731675586174, + "grad_norm": 0.07552629709243774, + "learning_rate": 3.88562440645774e-05, + "loss": 0.022, + "step": 30540 + }, + { + "epoch": 0.22582123532716358, + "grad_norm": 0.10100152343511581, + "learning_rate": 3.885253442545109e-05, + "loss": 0.02, + "step": 30550 + }, + { + "epoch": 0.22589515389846546, + "grad_norm": 0.08780404180288315, + "learning_rate": 3.884882478632479e-05, + "loss": 0.0177, + "step": 30560 + }, + { + "epoch": 0.2259690724697673, + "grad_norm": 0.09640546888113022, + "learning_rate": 3.884511514719848e-05, + "loss": 0.0195, + "step": 30570 + }, + { + "epoch": 0.22604299104106915, + "grad_norm": 0.08945133537054062, + "learning_rate": 3.884140550807218e-05, + "loss": 0.0218, + "step": 30580 + }, + { + "epoch": 0.22611690961237102, + "grad_norm": 0.1121789738535881, + "learning_rate": 3.8837695868945875e-05, + "loss": 0.0175, + "step": 30590 + }, + { + "epoch": 0.22619082818367287, + "grad_norm": 0.1048922911286354, + "learning_rate": 3.8833986229819564e-05, + "loss": 0.0203, + "step": 30600 + }, + { + "epoch": 0.2262647467549747, + "grad_norm": 0.1150166317820549, + "learning_rate": 3.883027659069326e-05, + "loss": 0.0216, + "step": 30610 + }, + { + "epoch": 0.22633866532627658, + "grad_norm": 0.10024099797010422, + "learning_rate": 3.8826566951566956e-05, + "loss": 0.0188, + "step": 30620 + }, + { + "epoch": 0.22641258389757843, + "grad_norm": 0.06918352097272873, + "learning_rate": 3.8822857312440645e-05, + "loss": 0.0209, + "step": 30630 + }, + { + "epoch": 0.22648650246888027, + "grad_norm": 0.06771031767129898, + "learning_rate": 3.881914767331434e-05, + "loss": 0.0196, + "step": 30640 + }, + { + "epoch": 0.22656042104018215, + "grad_norm": 0.12088067829608917, + "learning_rate": 3.881543803418804e-05, + "loss": 0.0182, + "step": 30650 + }, + { 
+ "epoch": 0.226634339611484, + "grad_norm": 0.07698526978492737, + "learning_rate": 3.881172839506173e-05, + "loss": 0.0217, + "step": 30660 + }, + { + "epoch": 0.22670825818278584, + "grad_norm": 0.13076117634773254, + "learning_rate": 3.880801875593543e-05, + "loss": 0.0202, + "step": 30670 + }, + { + "epoch": 0.22678217675408768, + "grad_norm": 0.0864979699254036, + "learning_rate": 3.880430911680912e-05, + "loss": 0.0209, + "step": 30680 + }, + { + "epoch": 0.22685609532538956, + "grad_norm": 0.08185715973377228, + "learning_rate": 3.8800599477682814e-05, + "loss": 0.0211, + "step": 30690 + }, + { + "epoch": 0.2269300138966914, + "grad_norm": 0.08243662863969803, + "learning_rate": 3.87968898385565e-05, + "loss": 0.0173, + "step": 30700 + }, + { + "epoch": 0.22700393246799325, + "grad_norm": 0.07913575321435928, + "learning_rate": 3.87931801994302e-05, + "loss": 0.0193, + "step": 30710 + }, + { + "epoch": 0.22707785103929512, + "grad_norm": 0.09552217274904251, + "learning_rate": 3.8789470560303895e-05, + "loss": 0.0203, + "step": 30720 + }, + { + "epoch": 0.22715176961059697, + "grad_norm": 0.11742965131998062, + "learning_rate": 3.878576092117759e-05, + "loss": 0.0207, + "step": 30730 + }, + { + "epoch": 0.2272256881818988, + "grad_norm": 0.06373470276594162, + "learning_rate": 3.878205128205129e-05, + "loss": 0.0186, + "step": 30740 + }, + { + "epoch": 0.22729960675320068, + "grad_norm": 0.07583777606487274, + "learning_rate": 3.8778341642924976e-05, + "loss": 0.0174, + "step": 30750 + }, + { + "epoch": 0.22737352532450253, + "grad_norm": 0.09138477593660355, + "learning_rate": 3.877463200379867e-05, + "loss": 0.0193, + "step": 30760 + }, + { + "epoch": 0.22744744389580437, + "grad_norm": 0.07085064053535461, + "learning_rate": 3.877092236467237e-05, + "loss": 0.0192, + "step": 30770 + }, + { + "epoch": 0.22752136246710625, + "grad_norm": 0.07788054645061493, + "learning_rate": 3.876721272554606e-05, + "loss": 0.0182, + "step": 30780 + }, + { + "epoch": 0.2275952810384081, + "grad_norm": 0.0838763490319252, + "learning_rate": 3.8763503086419753e-05, + "loss": 0.0187, + "step": 30790 + }, + { + "epoch": 0.22766919960970994, + "grad_norm": 0.1182074323296547, + "learning_rate": 3.875979344729345e-05, + "loss": 0.0206, + "step": 30800 + }, + { + "epoch": 0.2277431181810118, + "grad_norm": 0.1088971197605133, + "learning_rate": 3.8756083808167145e-05, + "loss": 0.0219, + "step": 30810 + }, + { + "epoch": 0.22781703675231366, + "grad_norm": 0.11365870386362076, + "learning_rate": 3.875237416904084e-05, + "loss": 0.0196, + "step": 30820 + }, + { + "epoch": 0.2278909553236155, + "grad_norm": 0.06740875542163849, + "learning_rate": 3.874866452991453e-05, + "loss": 0.0171, + "step": 30830 + }, + { + "epoch": 0.22796487389491735, + "grad_norm": 0.07138946652412415, + "learning_rate": 3.8744954890788227e-05, + "loss": 0.0211, + "step": 30840 + }, + { + "epoch": 0.22803879246621922, + "grad_norm": 0.0920095294713974, + "learning_rate": 3.874124525166192e-05, + "loss": 0.0183, + "step": 30850 + }, + { + "epoch": 0.22811271103752107, + "grad_norm": 0.13043396174907684, + "learning_rate": 3.873753561253561e-05, + "loss": 0.0222, + "step": 30860 + }, + { + "epoch": 0.2281866296088229, + "grad_norm": 0.12495102733373642, + "learning_rate": 3.873382597340931e-05, + "loss": 0.018, + "step": 30870 + }, + { + "epoch": 0.22826054818012478, + "grad_norm": 0.08440731465816498, + "learning_rate": 3.8730116334283004e-05, + "loss": 0.0178, + "step": 30880 + }, + { + "epoch": 0.22833446675142663, + 
"grad_norm": 0.07918287813663483, + "learning_rate": 3.87264066951567e-05, + "loss": 0.02, + "step": 30890 + }, + { + "epoch": 0.22840838532272847, + "grad_norm": 0.10504679381847382, + "learning_rate": 3.8722697056030396e-05, + "loss": 0.0189, + "step": 30900 + }, + { + "epoch": 0.22848230389403035, + "grad_norm": 0.09025320410728455, + "learning_rate": 3.8718987416904085e-05, + "loss": 0.0191, + "step": 30910 + }, + { + "epoch": 0.2285562224653322, + "grad_norm": 0.11715808510780334, + "learning_rate": 3.871527777777778e-05, + "loss": 0.0223, + "step": 30920 + }, + { + "epoch": 0.22863014103663404, + "grad_norm": 0.07258326560258865, + "learning_rate": 3.871156813865147e-05, + "loss": 0.0192, + "step": 30930 + }, + { + "epoch": 0.2287040596079359, + "grad_norm": 0.06756031513214111, + "learning_rate": 3.8707858499525166e-05, + "loss": 0.0202, + "step": 30940 + }, + { + "epoch": 0.22877797817923776, + "grad_norm": 0.09260854125022888, + "learning_rate": 3.870414886039886e-05, + "loss": 0.0219, + "step": 30950 + }, + { + "epoch": 0.2288518967505396, + "grad_norm": 0.0938676968216896, + "learning_rate": 3.870043922127256e-05, + "loss": 0.0182, + "step": 30960 + }, + { + "epoch": 0.22892581532184145, + "grad_norm": 0.09336868673563004, + "learning_rate": 3.8696729582146254e-05, + "loss": 0.0191, + "step": 30970 + }, + { + "epoch": 0.22899973389314332, + "grad_norm": 0.08895561844110489, + "learning_rate": 3.869301994301994e-05, + "loss": 0.0182, + "step": 30980 + }, + { + "epoch": 0.22907365246444517, + "grad_norm": 0.076514832675457, + "learning_rate": 3.868931030389364e-05, + "loss": 0.0172, + "step": 30990 + }, + { + "epoch": 0.229147571035747, + "grad_norm": 0.09007681161165237, + "learning_rate": 3.8685600664767335e-05, + "loss": 0.0202, + "step": 31000 + }, + { + "epoch": 0.22922148960704888, + "grad_norm": 0.10296738147735596, + "learning_rate": 3.8681891025641024e-05, + "loss": 0.0191, + "step": 31010 + }, + { + "epoch": 0.22929540817835073, + "grad_norm": 0.09268203377723694, + "learning_rate": 3.867818138651472e-05, + "loss": 0.0225, + "step": 31020 + }, + { + "epoch": 0.22936932674965257, + "grad_norm": 0.08601827919483185, + "learning_rate": 3.8674471747388416e-05, + "loss": 0.0174, + "step": 31030 + }, + { + "epoch": 0.22944324532095445, + "grad_norm": 0.13542363047599792, + "learning_rate": 3.867076210826211e-05, + "loss": 0.0215, + "step": 31040 + }, + { + "epoch": 0.2295171638922563, + "grad_norm": 0.07593340426683426, + "learning_rate": 3.866705246913581e-05, + "loss": 0.0193, + "step": 31050 + }, + { + "epoch": 0.22959108246355814, + "grad_norm": 0.1184159368276596, + "learning_rate": 3.86633428300095e-05, + "loss": 0.0195, + "step": 31060 + }, + { + "epoch": 0.22966500103486, + "grad_norm": 0.10774824768304825, + "learning_rate": 3.865963319088319e-05, + "loss": 0.0208, + "step": 31070 + }, + { + "epoch": 0.22973891960616186, + "grad_norm": 0.0968312919139862, + "learning_rate": 3.865592355175689e-05, + "loss": 0.0201, + "step": 31080 + }, + { + "epoch": 0.2298128381774637, + "grad_norm": 0.11818359047174454, + "learning_rate": 3.865221391263058e-05, + "loss": 0.02, + "step": 31090 + }, + { + "epoch": 0.22988675674876555, + "grad_norm": 0.054972726851701736, + "learning_rate": 3.8648504273504274e-05, + "loss": 0.0164, + "step": 31100 + }, + { + "epoch": 0.22996067532006742, + "grad_norm": 0.05757006257772446, + "learning_rate": 3.864479463437797e-05, + "loss": 0.0182, + "step": 31110 + }, + { + "epoch": 0.23003459389136927, + "grad_norm": 0.05938870832324028, + 
"learning_rate": 3.8641084995251666e-05, + "loss": 0.0202, + "step": 31120 + }, + { + "epoch": 0.2301085124626711, + "grad_norm": 0.09021522849798203, + "learning_rate": 3.863737535612536e-05, + "loss": 0.0206, + "step": 31130 + }, + { + "epoch": 0.23018243103397298, + "grad_norm": 0.08191125839948654, + "learning_rate": 3.863366571699905e-05, + "loss": 0.0175, + "step": 31140 + }, + { + "epoch": 0.23025634960527483, + "grad_norm": 0.13434763252735138, + "learning_rate": 3.862995607787275e-05, + "loss": 0.0241, + "step": 31150 + }, + { + "epoch": 0.23033026817657667, + "grad_norm": 0.09122100472450256, + "learning_rate": 3.8626246438746436e-05, + "loss": 0.0178, + "step": 31160 + }, + { + "epoch": 0.23040418674787855, + "grad_norm": 0.08553260564804077, + "learning_rate": 3.862253679962013e-05, + "loss": 0.018, + "step": 31170 + }, + { + "epoch": 0.2304781053191804, + "grad_norm": 0.06894198060035706, + "learning_rate": 3.861882716049383e-05, + "loss": 0.0196, + "step": 31180 + }, + { + "epoch": 0.23055202389048224, + "grad_norm": 0.09503644704818726, + "learning_rate": 3.8615117521367524e-05, + "loss": 0.0184, + "step": 31190 + }, + { + "epoch": 0.2306259424617841, + "grad_norm": 0.08598212897777557, + "learning_rate": 3.861140788224122e-05, + "loss": 0.0187, + "step": 31200 + }, + { + "epoch": 0.23069986103308596, + "grad_norm": 0.08239752054214478, + "learning_rate": 3.860769824311491e-05, + "loss": 0.0208, + "step": 31210 + }, + { + "epoch": 0.2307737796043878, + "grad_norm": 0.0904286578297615, + "learning_rate": 3.8603988603988605e-05, + "loss": 0.0194, + "step": 31220 + }, + { + "epoch": 0.23084769817568965, + "grad_norm": 0.08765274286270142, + "learning_rate": 3.86002789648623e-05, + "loss": 0.0196, + "step": 31230 + }, + { + "epoch": 0.23092161674699152, + "grad_norm": 0.0852227434515953, + "learning_rate": 3.859656932573599e-05, + "loss": 0.0213, + "step": 31240 + }, + { + "epoch": 0.23099553531829337, + "grad_norm": 0.09749691188335419, + "learning_rate": 3.859285968660969e-05, + "loss": 0.0202, + "step": 31250 + }, + { + "epoch": 0.2310694538895952, + "grad_norm": 0.06527355313301086, + "learning_rate": 3.858915004748338e-05, + "loss": 0.02, + "step": 31260 + }, + { + "epoch": 0.23114337246089708, + "grad_norm": 0.11276469379663467, + "learning_rate": 3.858544040835708e-05, + "loss": 0.0173, + "step": 31270 + }, + { + "epoch": 0.23121729103219893, + "grad_norm": 0.08993665128946304, + "learning_rate": 3.8581730769230775e-05, + "loss": 0.0184, + "step": 31280 + }, + { + "epoch": 0.23129120960350077, + "grad_norm": 0.0920916274189949, + "learning_rate": 3.8578021130104464e-05, + "loss": 0.0184, + "step": 31290 + }, + { + "epoch": 0.23136512817480265, + "grad_norm": 0.06178808957338333, + "learning_rate": 3.857431149097816e-05, + "loss": 0.0171, + "step": 31300 + }, + { + "epoch": 0.2314390467461045, + "grad_norm": 0.07721130549907684, + "learning_rate": 3.8570601851851856e-05, + "loss": 0.0172, + "step": 31310 + }, + { + "epoch": 0.23151296531740634, + "grad_norm": 0.0718604326248169, + "learning_rate": 3.8566892212725545e-05, + "loss": 0.0188, + "step": 31320 + }, + { + "epoch": 0.2315868838887082, + "grad_norm": 0.09962927550077438, + "learning_rate": 3.856318257359924e-05, + "loss": 0.0212, + "step": 31330 + }, + { + "epoch": 0.23166080246001006, + "grad_norm": 0.09302369505167007, + "learning_rate": 3.855947293447294e-05, + "loss": 0.0185, + "step": 31340 + }, + { + "epoch": 0.2317347210313119, + "grad_norm": 0.07690336555242538, + "learning_rate": 3.855576329534663e-05, + 
"loss": 0.0191, + "step": 31350 + }, + { + "epoch": 0.23180863960261375, + "grad_norm": 0.07962159067392349, + "learning_rate": 3.855205365622033e-05, + "loss": 0.0195, + "step": 31360 + }, + { + "epoch": 0.23188255817391562, + "grad_norm": 0.09690086543560028, + "learning_rate": 3.854834401709402e-05, + "loss": 0.018, + "step": 31370 + }, + { + "epoch": 0.23195647674521747, + "grad_norm": 0.0838049054145813, + "learning_rate": 3.8544634377967714e-05, + "loss": 0.0173, + "step": 31380 + }, + { + "epoch": 0.2320303953165193, + "grad_norm": 0.07686971127986908, + "learning_rate": 3.85409247388414e-05, + "loss": 0.0175, + "step": 31390 + }, + { + "epoch": 0.23210431388782118, + "grad_norm": 0.08736717700958252, + "learning_rate": 3.85372150997151e-05, + "loss": 0.0182, + "step": 31400 + }, + { + "epoch": 0.23217823245912303, + "grad_norm": 0.0625508725643158, + "learning_rate": 3.8533505460588795e-05, + "loss": 0.0204, + "step": 31410 + }, + { + "epoch": 0.23225215103042487, + "grad_norm": 0.09991180151700974, + "learning_rate": 3.852979582146249e-05, + "loss": 0.0179, + "step": 31420 + }, + { + "epoch": 0.23232606960172675, + "grad_norm": 0.06481295078992844, + "learning_rate": 3.852608618233619e-05, + "loss": 0.0209, + "step": 31430 + }, + { + "epoch": 0.2323999881730286, + "grad_norm": 0.07274527847766876, + "learning_rate": 3.8522376543209876e-05, + "loss": 0.018, + "step": 31440 + }, + { + "epoch": 0.23247390674433044, + "grad_norm": 0.04653900861740112, + "learning_rate": 3.851866690408357e-05, + "loss": 0.0149, + "step": 31450 + }, + { + "epoch": 0.2325478253156323, + "grad_norm": 0.0779101625084877, + "learning_rate": 3.851495726495727e-05, + "loss": 0.019, + "step": 31460 + }, + { + "epoch": 0.23262174388693416, + "grad_norm": 0.0786028727889061, + "learning_rate": 3.851124762583096e-05, + "loss": 0.0176, + "step": 31470 + }, + { + "epoch": 0.232695662458236, + "grad_norm": 0.07252313196659088, + "learning_rate": 3.850753798670465e-05, + "loss": 0.0193, + "step": 31480 + }, + { + "epoch": 0.23276958102953785, + "grad_norm": 0.0671439990401268, + "learning_rate": 3.850382834757835e-05, + "loss": 0.0187, + "step": 31490 + }, + { + "epoch": 0.23284349960083972, + "grad_norm": 0.08912762999534607, + "learning_rate": 3.8500118708452045e-05, + "loss": 0.0184, + "step": 31500 + }, + { + "epoch": 0.23291741817214157, + "grad_norm": 0.06486102193593979, + "learning_rate": 3.849640906932574e-05, + "loss": 0.0177, + "step": 31510 + }, + { + "epoch": 0.2329913367434434, + "grad_norm": 0.10911845415830612, + "learning_rate": 3.849269943019943e-05, + "loss": 0.02, + "step": 31520 + }, + { + "epoch": 0.23306525531474528, + "grad_norm": 0.07711423933506012, + "learning_rate": 3.8488989791073126e-05, + "loss": 0.0193, + "step": 31530 + }, + { + "epoch": 0.23313917388604713, + "grad_norm": 0.06304068863391876, + "learning_rate": 3.848528015194682e-05, + "loss": 0.0211, + "step": 31540 + }, + { + "epoch": 0.23321309245734897, + "grad_norm": 0.11020921915769577, + "learning_rate": 3.848157051282051e-05, + "loss": 0.0173, + "step": 31550 + }, + { + "epoch": 0.23328701102865085, + "grad_norm": 0.09332350641489029, + "learning_rate": 3.847786087369421e-05, + "loss": 0.0212, + "step": 31560 + }, + { + "epoch": 0.2333609295999527, + "grad_norm": 0.07746771723031998, + "learning_rate": 3.84741512345679e-05, + "loss": 0.0208, + "step": 31570 + }, + { + "epoch": 0.23343484817125454, + "grad_norm": 0.10996796935796738, + "learning_rate": 3.84704415954416e-05, + "loss": 0.0178, + "step": 31580 + }, + { + "epoch": 
0.2335087667425564, + "grad_norm": 0.07971123605966568, + "learning_rate": 3.8466731956315295e-05, + "loss": 0.0198, + "step": 31590 + }, + { + "epoch": 0.23358268531385826, + "grad_norm": 0.08604178577661514, + "learning_rate": 3.8463022317188984e-05, + "loss": 0.0196, + "step": 31600 + }, + { + "epoch": 0.2336566038851601, + "grad_norm": 0.11693210899829865, + "learning_rate": 3.845931267806268e-05, + "loss": 0.02, + "step": 31610 + }, + { + "epoch": 0.23373052245646195, + "grad_norm": 0.09861855953931808, + "learning_rate": 3.845560303893637e-05, + "loss": 0.0204, + "step": 31620 + }, + { + "epoch": 0.23380444102776382, + "grad_norm": 0.08212066441774368, + "learning_rate": 3.8451893399810066e-05, + "loss": 0.0166, + "step": 31630 + }, + { + "epoch": 0.23387835959906567, + "grad_norm": 0.0658751055598259, + "learning_rate": 3.844818376068376e-05, + "loss": 0.0163, + "step": 31640 + }, + { + "epoch": 0.2339522781703675, + "grad_norm": 0.06246474012732506, + "learning_rate": 3.844447412155746e-05, + "loss": 0.0189, + "step": 31650 + }, + { + "epoch": 0.23402619674166938, + "grad_norm": 0.07317464053630829, + "learning_rate": 3.8440764482431153e-05, + "loss": 0.0178, + "step": 31660 + }, + { + "epoch": 0.23410011531297123, + "grad_norm": 0.07486504316329956, + "learning_rate": 3.843705484330484e-05, + "loss": 0.0175, + "step": 31670 + }, + { + "epoch": 0.23417403388427308, + "grad_norm": 0.057501647621393204, + "learning_rate": 3.843334520417854e-05, + "loss": 0.018, + "step": 31680 + }, + { + "epoch": 0.23424795245557495, + "grad_norm": 0.10206136852502823, + "learning_rate": 3.8429635565052235e-05, + "loss": 0.0169, + "step": 31690 + }, + { + "epoch": 0.2343218710268768, + "grad_norm": 0.09707526117563248, + "learning_rate": 3.8425925925925924e-05, + "loss": 0.0202, + "step": 31700 + }, + { + "epoch": 0.23439578959817864, + "grad_norm": 0.08385404944419861, + "learning_rate": 3.842221628679962e-05, + "loss": 0.018, + "step": 31710 + }, + { + "epoch": 0.2344697081694805, + "grad_norm": 0.08981026709079742, + "learning_rate": 3.8418506647673316e-05, + "loss": 0.018, + "step": 31720 + }, + { + "epoch": 0.23454362674078236, + "grad_norm": 0.07878104597330093, + "learning_rate": 3.841479700854701e-05, + "loss": 0.0198, + "step": 31730 + }, + { + "epoch": 0.2346175453120842, + "grad_norm": 0.10871163010597229, + "learning_rate": 3.841108736942071e-05, + "loss": 0.0199, + "step": 31740 + }, + { + "epoch": 0.23469146388338608, + "grad_norm": 0.08035383373498917, + "learning_rate": 3.84073777302944e-05, + "loss": 0.0184, + "step": 31750 + }, + { + "epoch": 0.23476538245468792, + "grad_norm": 0.09240926057100296, + "learning_rate": 3.840366809116809e-05, + "loss": 0.0191, + "step": 31760 + }, + { + "epoch": 0.23483930102598977, + "grad_norm": 0.0863938182592392, + "learning_rate": 3.839995845204179e-05, + "loss": 0.0178, + "step": 31770 + }, + { + "epoch": 0.2349132195972916, + "grad_norm": 0.08668408542871475, + "learning_rate": 3.839624881291548e-05, + "loss": 0.0186, + "step": 31780 + }, + { + "epoch": 0.23498713816859348, + "grad_norm": 0.10141124576330185, + "learning_rate": 3.8392539173789174e-05, + "loss": 0.02, + "step": 31790 + }, + { + "epoch": 0.23506105673989533, + "grad_norm": 0.07474733889102936, + "learning_rate": 3.838882953466287e-05, + "loss": 0.0179, + "step": 31800 + }, + { + "epoch": 0.23513497531119718, + "grad_norm": 0.0803011879324913, + "learning_rate": 3.8385119895536566e-05, + "loss": 0.0183, + "step": 31810 + }, + { + "epoch": 0.23520889388249905, + "grad_norm": 
0.1086181104183197, + "learning_rate": 3.838141025641026e-05, + "loss": 0.0206, + "step": 31820 + }, + { + "epoch": 0.2352828124538009, + "grad_norm": 0.0771515890955925, + "learning_rate": 3.837770061728395e-05, + "loss": 0.0189, + "step": 31830 + }, + { + "epoch": 0.23535673102510274, + "grad_norm": 0.09756075590848923, + "learning_rate": 3.837399097815765e-05, + "loss": 0.0195, + "step": 31840 + }, + { + "epoch": 0.2354306495964046, + "grad_norm": 0.06898177415132523, + "learning_rate": 3.8370281339031336e-05, + "loss": 0.0182, + "step": 31850 + }, + { + "epoch": 0.23550456816770646, + "grad_norm": 0.08295425772666931, + "learning_rate": 3.836657169990503e-05, + "loss": 0.0191, + "step": 31860 + }, + { + "epoch": 0.2355784867390083, + "grad_norm": 0.08858896791934967, + "learning_rate": 3.836286206077873e-05, + "loss": 0.0207, + "step": 31870 + }, + { + "epoch": 0.23565240531031018, + "grad_norm": 0.06618060171604156, + "learning_rate": 3.8359152421652424e-05, + "loss": 0.0189, + "step": 31880 + }, + { + "epoch": 0.23572632388161202, + "grad_norm": 0.0754452645778656, + "learning_rate": 3.835544278252612e-05, + "loss": 0.0175, + "step": 31890 + }, + { + "epoch": 0.23580024245291387, + "grad_norm": 0.08633226156234741, + "learning_rate": 3.835173314339981e-05, + "loss": 0.0211, + "step": 31900 + }, + { + "epoch": 0.2358741610242157, + "grad_norm": 0.09160919487476349, + "learning_rate": 3.8348023504273505e-05, + "loss": 0.0222, + "step": 31910 + }, + { + "epoch": 0.23594807959551758, + "grad_norm": 0.08634735643863678, + "learning_rate": 3.83443138651472e-05, + "loss": 0.0163, + "step": 31920 + }, + { + "epoch": 0.23602199816681943, + "grad_norm": 0.13907606899738312, + "learning_rate": 3.834060422602089e-05, + "loss": 0.0188, + "step": 31930 + }, + { + "epoch": 0.23609591673812128, + "grad_norm": 0.07669810205698013, + "learning_rate": 3.8336894586894586e-05, + "loss": 0.0212, + "step": 31940 + }, + { + "epoch": 0.23616983530942315, + "grad_norm": 0.101229727268219, + "learning_rate": 3.833318494776828e-05, + "loss": 0.0192, + "step": 31950 + }, + { + "epoch": 0.236243753880725, + "grad_norm": 0.09813881665468216, + "learning_rate": 3.832947530864198e-05, + "loss": 0.0208, + "step": 31960 + }, + { + "epoch": 0.23631767245202684, + "grad_norm": 0.08786879479885101, + "learning_rate": 3.8325765669515674e-05, + "loss": 0.0196, + "step": 31970 + }, + { + "epoch": 0.2363915910233287, + "grad_norm": 0.06772688031196594, + "learning_rate": 3.8322056030389363e-05, + "loss": 0.018, + "step": 31980 + }, + { + "epoch": 0.23646550959463056, + "grad_norm": 0.09436634927988052, + "learning_rate": 3.831834639126306e-05, + "loss": 0.0207, + "step": 31990 + }, + { + "epoch": 0.2365394281659324, + "grad_norm": 0.09478554129600525, + "learning_rate": 3.8314636752136755e-05, + "loss": 0.0182, + "step": 32000 + }, + { + "epoch": 0.23661334673723428, + "grad_norm": 0.09436893463134766, + "learning_rate": 3.8310927113010445e-05, + "loss": 0.0174, + "step": 32010 + }, + { + "epoch": 0.23668726530853612, + "grad_norm": 0.12236055731773376, + "learning_rate": 3.830721747388414e-05, + "loss": 0.0205, + "step": 32020 + }, + { + "epoch": 0.23676118387983797, + "grad_norm": 0.09569685161113739, + "learning_rate": 3.8303507834757837e-05, + "loss": 0.0187, + "step": 32030 + }, + { + "epoch": 0.2368351024511398, + "grad_norm": 0.12519055604934692, + "learning_rate": 3.829979819563153e-05, + "loss": 0.0207, + "step": 32040 + }, + { + "epoch": 0.23690902102244168, + "grad_norm": 0.0853719487786293, + "learning_rate": 
3.829608855650523e-05, + "loss": 0.0163, + "step": 32050 + }, + { + "epoch": 0.23698293959374353, + "grad_norm": 0.0898117944598198, + "learning_rate": 3.829237891737892e-05, + "loss": 0.0218, + "step": 32060 + }, + { + "epoch": 0.23705685816504538, + "grad_norm": 0.10286859422922134, + "learning_rate": 3.8288669278252614e-05, + "loss": 0.0191, + "step": 32070 + }, + { + "epoch": 0.23713077673634725, + "grad_norm": 0.07973692566156387, + "learning_rate": 3.82849596391263e-05, + "loss": 0.02, + "step": 32080 + }, + { + "epoch": 0.2372046953076491, + "grad_norm": 0.0759081095457077, + "learning_rate": 3.828125e-05, + "loss": 0.018, + "step": 32090 + }, + { + "epoch": 0.23727861387895094, + "grad_norm": 0.10089462995529175, + "learning_rate": 3.82775403608737e-05, + "loss": 0.0183, + "step": 32100 + }, + { + "epoch": 0.2373525324502528, + "grad_norm": 0.06875733286142349, + "learning_rate": 3.827383072174739e-05, + "loss": 0.0188, + "step": 32110 + }, + { + "epoch": 0.23742645102155466, + "grad_norm": 0.09233024716377258, + "learning_rate": 3.827012108262109e-05, + "loss": 0.0168, + "step": 32120 + }, + { + "epoch": 0.2375003695928565, + "grad_norm": 0.11073601245880127, + "learning_rate": 3.8266411443494776e-05, + "loss": 0.0194, + "step": 32130 + }, + { + "epoch": 0.23757428816415838, + "grad_norm": 0.05894783139228821, + "learning_rate": 3.826270180436847e-05, + "loss": 0.0183, + "step": 32140 + }, + { + "epoch": 0.23764820673546022, + "grad_norm": 0.09136971086263657, + "learning_rate": 3.825899216524217e-05, + "loss": 0.0213, + "step": 32150 + }, + { + "epoch": 0.23772212530676207, + "grad_norm": 0.09493359178304672, + "learning_rate": 3.825528252611586e-05, + "loss": 0.0207, + "step": 32160 + }, + { + "epoch": 0.2377960438780639, + "grad_norm": 0.0764891654253006, + "learning_rate": 3.825157288698955e-05, + "loss": 0.0197, + "step": 32170 + }, + { + "epoch": 0.23786996244936578, + "grad_norm": 0.06478419899940491, + "learning_rate": 3.824786324786325e-05, + "loss": 0.0208, + "step": 32180 + }, + { + "epoch": 0.23794388102066763, + "grad_norm": 0.060552988201379776, + "learning_rate": 3.8244153608736945e-05, + "loss": 0.0186, + "step": 32190 + }, + { + "epoch": 0.23801779959196948, + "grad_norm": 0.09175571799278259, + "learning_rate": 3.824044396961064e-05, + "loss": 0.0194, + "step": 32200 + }, + { + "epoch": 0.23809171816327135, + "grad_norm": 0.07756716758012772, + "learning_rate": 3.823673433048433e-05, + "loss": 0.0198, + "step": 32210 + }, + { + "epoch": 0.2381656367345732, + "grad_norm": 0.07804546505212784, + "learning_rate": 3.8233024691358026e-05, + "loss": 0.0207, + "step": 32220 + }, + { + "epoch": 0.23823955530587504, + "grad_norm": 0.10484328866004944, + "learning_rate": 3.822931505223172e-05, + "loss": 0.0187, + "step": 32230 + }, + { + "epoch": 0.2383134738771769, + "grad_norm": 0.10777238756418228, + "learning_rate": 3.822560541310541e-05, + "loss": 0.0222, + "step": 32240 + }, + { + "epoch": 0.23838739244847876, + "grad_norm": 0.06873957812786102, + "learning_rate": 3.8221895773979114e-05, + "loss": 0.0215, + "step": 32250 + }, + { + "epoch": 0.2384613110197806, + "grad_norm": 0.06929781287908554, + "learning_rate": 3.82181861348528e-05, + "loss": 0.0206, + "step": 32260 + }, + { + "epoch": 0.23853522959108248, + "grad_norm": 0.09036051481962204, + "learning_rate": 3.82144764957265e-05, + "loss": 0.0181, + "step": 32270 + }, + { + "epoch": 0.23860914816238432, + "grad_norm": 0.07851787656545639, + "learning_rate": 3.8210766856600195e-05, + "loss": 0.0195, + "step": 
32280 + }, + { + "epoch": 0.23868306673368617, + "grad_norm": 0.07176248729228973, + "learning_rate": 3.8207057217473884e-05, + "loss": 0.0206, + "step": 32290 + }, + { + "epoch": 0.238756985304988, + "grad_norm": 0.0853743925690651, + "learning_rate": 3.820334757834758e-05, + "loss": 0.019, + "step": 32300 + }, + { + "epoch": 0.23883090387628989, + "grad_norm": 0.12627804279327393, + "learning_rate": 3.819963793922127e-05, + "loss": 0.0216, + "step": 32310 + }, + { + "epoch": 0.23890482244759173, + "grad_norm": 0.13927190005779266, + "learning_rate": 3.8195928300094965e-05, + "loss": 0.022, + "step": 32320 + }, + { + "epoch": 0.23897874101889358, + "grad_norm": 0.08586980402469635, + "learning_rate": 3.819221866096867e-05, + "loss": 0.0193, + "step": 32330 + }, + { + "epoch": 0.23905265959019545, + "grad_norm": 0.08638795465230942, + "learning_rate": 3.818850902184236e-05, + "loss": 0.0179, + "step": 32340 + }, + { + "epoch": 0.2391265781614973, + "grad_norm": 0.07057658582925797, + "learning_rate": 3.818479938271605e-05, + "loss": 0.0152, + "step": 32350 + }, + { + "epoch": 0.23920049673279914, + "grad_norm": 0.09457147121429443, + "learning_rate": 3.818108974358974e-05, + "loss": 0.019, + "step": 32360 + }, + { + "epoch": 0.239274415304101, + "grad_norm": 0.11094425618648529, + "learning_rate": 3.817738010446344e-05, + "loss": 0.0208, + "step": 32370 + }, + { + "epoch": 0.23934833387540286, + "grad_norm": 0.09708326309919357, + "learning_rate": 3.8173670465337134e-05, + "loss": 0.0198, + "step": 32380 + }, + { + "epoch": 0.2394222524467047, + "grad_norm": 0.07583857327699661, + "learning_rate": 3.8169960826210824e-05, + "loss": 0.0192, + "step": 32390 + }, + { + "epoch": 0.23949617101800658, + "grad_norm": 0.089440256357193, + "learning_rate": 3.8166251187084526e-05, + "loss": 0.0219, + "step": 32400 + }, + { + "epoch": 0.23957008958930842, + "grad_norm": 0.0617837980389595, + "learning_rate": 3.8162541547958215e-05, + "loss": 0.0188, + "step": 32410 + }, + { + "epoch": 0.23964400816061027, + "grad_norm": 0.09545888751745224, + "learning_rate": 3.815883190883191e-05, + "loss": 0.018, + "step": 32420 + }, + { + "epoch": 0.2397179267319121, + "grad_norm": 0.10647038370370865, + "learning_rate": 3.815512226970561e-05, + "loss": 0.0196, + "step": 32430 + }, + { + "epoch": 0.23979184530321399, + "grad_norm": 0.09234632551670074, + "learning_rate": 3.81514126305793e-05, + "loss": 0.0173, + "step": 32440 + }, + { + "epoch": 0.23986576387451583, + "grad_norm": 0.1066628023982048, + "learning_rate": 3.814770299145299e-05, + "loss": 0.0194, + "step": 32450 + }, + { + "epoch": 0.23993968244581768, + "grad_norm": 0.08322324603796005, + "learning_rate": 3.814399335232669e-05, + "loss": 0.019, + "step": 32460 + }, + { + "epoch": 0.24001360101711955, + "grad_norm": 0.116621233522892, + "learning_rate": 3.814028371320038e-05, + "loss": 0.0187, + "step": 32470 + }, + { + "epoch": 0.2400875195884214, + "grad_norm": 0.08321616798639297, + "learning_rate": 3.813657407407408e-05, + "loss": 0.019, + "step": 32480 + }, + { + "epoch": 0.24016143815972324, + "grad_norm": 0.1350875049829483, + "learning_rate": 3.813286443494777e-05, + "loss": 0.0192, + "step": 32490 + }, + { + "epoch": 0.2402353567310251, + "grad_norm": 0.12473758310079575, + "learning_rate": 3.8129154795821466e-05, + "loss": 0.0209, + "step": 32500 + }, + { + "epoch": 0.24030927530232696, + "grad_norm": 0.07671623677015305, + "learning_rate": 3.812544515669516e-05, + "loss": 0.0209, + "step": 32510 + }, + { + "epoch": 0.2403831938736288, + 
"grad_norm": 0.08903726935386658, + "learning_rate": 3.812173551756885e-05, + "loss": 0.0211, + "step": 32520 + }, + { + "epoch": 0.24045711244493068, + "grad_norm": 0.0904008224606514, + "learning_rate": 3.811802587844255e-05, + "loss": 0.0196, + "step": 32530 + }, + { + "epoch": 0.24053103101623252, + "grad_norm": 0.08499748259782791, + "learning_rate": 3.8114316239316236e-05, + "loss": 0.0218, + "step": 32540 + }, + { + "epoch": 0.24060494958753437, + "grad_norm": 0.09576483070850372, + "learning_rate": 3.811060660018994e-05, + "loss": 0.0205, + "step": 32550 + }, + { + "epoch": 0.2406788681588362, + "grad_norm": 0.07833150774240494, + "learning_rate": 3.8106896961063635e-05, + "loss": 0.0189, + "step": 32560 + }, + { + "epoch": 0.24075278673013809, + "grad_norm": 0.08177255839109421, + "learning_rate": 3.8103187321937324e-05, + "loss": 0.0172, + "step": 32570 + }, + { + "epoch": 0.24082670530143993, + "grad_norm": 0.11327052116394043, + "learning_rate": 3.809947768281102e-05, + "loss": 0.0219, + "step": 32580 + }, + { + "epoch": 0.24090062387274178, + "grad_norm": 0.08393348008394241, + "learning_rate": 3.809576804368471e-05, + "loss": 0.0218, + "step": 32590 + }, + { + "epoch": 0.24097454244404365, + "grad_norm": 0.09698115289211273, + "learning_rate": 3.8092058404558405e-05, + "loss": 0.0197, + "step": 32600 + }, + { + "epoch": 0.2410484610153455, + "grad_norm": 0.09822794049978256, + "learning_rate": 3.80883487654321e-05, + "loss": 0.0219, + "step": 32610 + }, + { + "epoch": 0.24112237958664734, + "grad_norm": 0.08049602806568146, + "learning_rate": 3.808463912630579e-05, + "loss": 0.0207, + "step": 32620 + }, + { + "epoch": 0.2411962981579492, + "grad_norm": 0.08989317715167999, + "learning_rate": 3.808092948717949e-05, + "loss": 0.0228, + "step": 32630 + }, + { + "epoch": 0.24127021672925106, + "grad_norm": 0.09174864739179611, + "learning_rate": 3.807721984805318e-05, + "loss": 0.0207, + "step": 32640 + }, + { + "epoch": 0.2413441353005529, + "grad_norm": 0.09022696316242218, + "learning_rate": 3.807351020892688e-05, + "loss": 0.0215, + "step": 32650 + }, + { + "epoch": 0.24141805387185478, + "grad_norm": 0.1705494076013565, + "learning_rate": 3.8069800569800574e-05, + "loss": 0.021, + "step": 32660 + }, + { + "epoch": 0.24149197244315662, + "grad_norm": 0.07814808189868927, + "learning_rate": 3.806609093067426e-05, + "loss": 0.021, + "step": 32670 + }, + { + "epoch": 0.24156589101445847, + "grad_norm": 0.09162690490484238, + "learning_rate": 3.806238129154796e-05, + "loss": 0.0193, + "step": 32680 + }, + { + "epoch": 0.24163980958576034, + "grad_norm": 0.11757416278123856, + "learning_rate": 3.8058671652421655e-05, + "loss": 0.0188, + "step": 32690 + }, + { + "epoch": 0.24171372815706219, + "grad_norm": 0.0703003853559494, + "learning_rate": 3.8054962013295344e-05, + "loss": 0.0179, + "step": 32700 + }, + { + "epoch": 0.24178764672836403, + "grad_norm": 0.0890263170003891, + "learning_rate": 3.805125237416905e-05, + "loss": 0.0191, + "step": 32710 + }, + { + "epoch": 0.24186156529966588, + "grad_norm": 0.09751693159341812, + "learning_rate": 3.8047542735042736e-05, + "loss": 0.017, + "step": 32720 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 0.10251424461603165, + "learning_rate": 3.804383309591643e-05, + "loss": 0.0225, + "step": 32730 + }, + { + "epoch": 0.2420094024422696, + "grad_norm": 0.09268451482057571, + "learning_rate": 3.804012345679013e-05, + "loss": 0.0175, + "step": 32740 + }, + { + "epoch": 0.24208332101357144, + "grad_norm": 0.08104787021875381, + 
"learning_rate": 3.803641381766382e-05, + "loss": 0.0206, + "step": 32750 + }, + { + "epoch": 0.2421572395848733, + "grad_norm": 0.11207576841115952, + "learning_rate": 3.803270417853751e-05, + "loss": 0.0215, + "step": 32760 + }, + { + "epoch": 0.24223115815617516, + "grad_norm": 0.10712762176990509, + "learning_rate": 3.80289945394112e-05, + "loss": 0.0206, + "step": 32770 + }, + { + "epoch": 0.242305076727477, + "grad_norm": 0.08218982070684433, + "learning_rate": 3.8025284900284905e-05, + "loss": 0.0187, + "step": 32780 + }, + { + "epoch": 0.24237899529877888, + "grad_norm": 0.10749506950378418, + "learning_rate": 3.80215752611586e-05, + "loss": 0.0174, + "step": 32790 + }, + { + "epoch": 0.24245291387008072, + "grad_norm": 0.11496833711862564, + "learning_rate": 3.801786562203229e-05, + "loss": 0.0211, + "step": 32800 + }, + { + "epoch": 0.24252683244138257, + "grad_norm": 0.09944915026426315, + "learning_rate": 3.8014155982905986e-05, + "loss": 0.0211, + "step": 32810 + }, + { + "epoch": 0.24260075101268444, + "grad_norm": 0.14544843137264252, + "learning_rate": 3.8010446343779676e-05, + "loss": 0.0189, + "step": 32820 + }, + { + "epoch": 0.24267466958398629, + "grad_norm": 0.146175816655159, + "learning_rate": 3.800673670465337e-05, + "loss": 0.0194, + "step": 32830 + }, + { + "epoch": 0.24274858815528813, + "grad_norm": 0.08695892989635468, + "learning_rate": 3.800302706552707e-05, + "loss": 0.0166, + "step": 32840 + }, + { + "epoch": 0.24282250672658998, + "grad_norm": 0.10960700362920761, + "learning_rate": 3.799931742640076e-05, + "loss": 0.0188, + "step": 32850 + }, + { + "epoch": 0.24289642529789185, + "grad_norm": 0.09043558686971664, + "learning_rate": 3.799560778727446e-05, + "loss": 0.0216, + "step": 32860 + }, + { + "epoch": 0.2429703438691937, + "grad_norm": 0.1071183830499649, + "learning_rate": 3.799189814814815e-05, + "loss": 0.0188, + "step": 32870 + }, + { + "epoch": 0.24304426244049554, + "grad_norm": 0.08563335239887238, + "learning_rate": 3.7988188509021845e-05, + "loss": 0.0203, + "step": 32880 + }, + { + "epoch": 0.2431181810117974, + "grad_norm": 0.09415535628795624, + "learning_rate": 3.798447886989554e-05, + "loss": 0.0195, + "step": 32890 + }, + { + "epoch": 0.24319209958309926, + "grad_norm": 0.10130775719881058, + "learning_rate": 3.798076923076923e-05, + "loss": 0.0217, + "step": 32900 + }, + { + "epoch": 0.2432660181544011, + "grad_norm": 0.075839102268219, + "learning_rate": 3.7977059591642926e-05, + "loss": 0.0192, + "step": 32910 + }, + { + "epoch": 0.24333993672570298, + "grad_norm": 0.0735127404332161, + "learning_rate": 3.797334995251662e-05, + "loss": 0.022, + "step": 32920 + }, + { + "epoch": 0.24341385529700482, + "grad_norm": 0.07995545864105225, + "learning_rate": 3.796964031339032e-05, + "loss": 0.0184, + "step": 32930 + }, + { + "epoch": 0.24348777386830667, + "grad_norm": 0.09814750403165817, + "learning_rate": 3.7965930674264014e-05, + "loss": 0.0188, + "step": 32940 + }, + { + "epoch": 0.24356169243960854, + "grad_norm": 0.09759753197431564, + "learning_rate": 3.79622210351377e-05, + "loss": 0.0175, + "step": 32950 + }, + { + "epoch": 0.24363561101091039, + "grad_norm": 0.0937381312251091, + "learning_rate": 3.79585113960114e-05, + "loss": 0.0185, + "step": 32960 + }, + { + "epoch": 0.24370952958221223, + "grad_norm": 0.06919185072183609, + "learning_rate": 3.7954801756885095e-05, + "loss": 0.0165, + "step": 32970 + }, + { + "epoch": 0.24378344815351408, + "grad_norm": 0.11352057009935379, + "learning_rate": 3.7951092117758784e-05, + 
"loss": 0.02, + "step": 32980 + }, + { + "epoch": 0.24385736672481595, + "grad_norm": 0.06584945321083069, + "learning_rate": 3.794738247863248e-05, + "loss": 0.0183, + "step": 32990 + }, + { + "epoch": 0.2439312852961178, + "grad_norm": 0.2477860003709793, + "learning_rate": 3.794367283950617e-05, + "loss": 0.0187, + "step": 33000 + }, + { + "epoch": 0.24400520386741964, + "grad_norm": 0.09093081206083298, + "learning_rate": 3.793996320037987e-05, + "loss": 0.0202, + "step": 33010 + }, + { + "epoch": 0.2440791224387215, + "grad_norm": 0.09048257768154144, + "learning_rate": 3.793625356125357e-05, + "loss": 0.0177, + "step": 33020 + }, + { + "epoch": 0.24415304101002336, + "grad_norm": 0.11075396090745926, + "learning_rate": 3.793254392212726e-05, + "loss": 0.0197, + "step": 33030 + }, + { + "epoch": 0.2442269595813252, + "grad_norm": 0.10718685388565063, + "learning_rate": 3.792883428300095e-05, + "loss": 0.0189, + "step": 33040 + }, + { + "epoch": 0.24430087815262708, + "grad_norm": 0.08964429050683975, + "learning_rate": 3.792512464387464e-05, + "loss": 0.0177, + "step": 33050 + }, + { + "epoch": 0.24437479672392892, + "grad_norm": 0.11210490763187408, + "learning_rate": 3.792141500474834e-05, + "loss": 0.0183, + "step": 33060 + }, + { + "epoch": 0.24444871529523077, + "grad_norm": 0.09917151182889938, + "learning_rate": 3.7917705365622034e-05, + "loss": 0.0195, + "step": 33070 + }, + { + "epoch": 0.24452263386653264, + "grad_norm": 0.07808961719274521, + "learning_rate": 3.791399572649573e-05, + "loss": 0.0188, + "step": 33080 + }, + { + "epoch": 0.24459655243783449, + "grad_norm": 0.09703001379966736, + "learning_rate": 3.7910286087369426e-05, + "loss": 0.0205, + "step": 33090 + }, + { + "epoch": 0.24467047100913633, + "grad_norm": 0.07967003434896469, + "learning_rate": 3.7906576448243115e-05, + "loss": 0.0174, + "step": 33100 + }, + { + "epoch": 0.24474438958043818, + "grad_norm": 0.08490297943353653, + "learning_rate": 3.790286680911681e-05, + "loss": 0.019, + "step": 33110 + }, + { + "epoch": 0.24481830815174005, + "grad_norm": 0.09356676787137985, + "learning_rate": 3.789915716999051e-05, + "loss": 0.0184, + "step": 33120 + }, + { + "epoch": 0.2448922267230419, + "grad_norm": 0.0811527818441391, + "learning_rate": 3.7895447530864196e-05, + "loss": 0.022, + "step": 33130 + }, + { + "epoch": 0.24496614529434374, + "grad_norm": 0.09434277564287186, + "learning_rate": 3.789173789173789e-05, + "loss": 0.0194, + "step": 33140 + }, + { + "epoch": 0.2450400638656456, + "grad_norm": 0.09916486591100693, + "learning_rate": 3.788802825261159e-05, + "loss": 0.0216, + "step": 33150 + }, + { + "epoch": 0.24511398243694746, + "grad_norm": 0.08763419836759567, + "learning_rate": 3.7884318613485284e-05, + "loss": 0.0202, + "step": 33160 + }, + { + "epoch": 0.2451879010082493, + "grad_norm": 0.08583448827266693, + "learning_rate": 3.788060897435898e-05, + "loss": 0.0179, + "step": 33170 + }, + { + "epoch": 0.24526181957955118, + "grad_norm": 0.09215591102838516, + "learning_rate": 3.787689933523267e-05, + "loss": 0.0187, + "step": 33180 + }, + { + "epoch": 0.24533573815085302, + "grad_norm": 0.09262657165527344, + "learning_rate": 3.7873189696106365e-05, + "loss": 0.0192, + "step": 33190 + }, + { + "epoch": 0.24540965672215487, + "grad_norm": 0.10066172480583191, + "learning_rate": 3.786948005698006e-05, + "loss": 0.0222, + "step": 33200 + }, + { + "epoch": 0.24548357529345674, + "grad_norm": 0.0908813551068306, + "learning_rate": 3.786577041785375e-05, + "loss": 0.018, + "step": 33210 + }, + { + 
"epoch": 0.24555749386475859, + "grad_norm": 0.11447655409574509, + "learning_rate": 3.7862060778727447e-05, + "loss": 0.02, + "step": 33220 + }, + { + "epoch": 0.24563141243606043, + "grad_norm": 0.07889176160097122, + "learning_rate": 3.785835113960114e-05, + "loss": 0.0196, + "step": 33230 + }, + { + "epoch": 0.24570533100736228, + "grad_norm": 0.06103529781103134, + "learning_rate": 3.785464150047484e-05, + "loss": 0.0175, + "step": 33240 + }, + { + "epoch": 0.24577924957866415, + "grad_norm": 0.0895182192325592, + "learning_rate": 3.7850931861348534e-05, + "loss": 0.0212, + "step": 33250 + }, + { + "epoch": 0.245853168149966, + "grad_norm": 0.065130814909935, + "learning_rate": 3.7847222222222224e-05, + "loss": 0.0193, + "step": 33260 + }, + { + "epoch": 0.24592708672126784, + "grad_norm": 0.09803667664527893, + "learning_rate": 3.784351258309592e-05, + "loss": 0.0185, + "step": 33270 + }, + { + "epoch": 0.2460010052925697, + "grad_norm": 0.08449755609035492, + "learning_rate": 3.783980294396961e-05, + "loss": 0.0218, + "step": 33280 + }, + { + "epoch": 0.24607492386387156, + "grad_norm": 0.0648382306098938, + "learning_rate": 3.7836093304843305e-05, + "loss": 0.017, + "step": 33290 + }, + { + "epoch": 0.2461488424351734, + "grad_norm": 0.2292952835559845, + "learning_rate": 3.7832383665717e-05, + "loss": 0.0192, + "step": 33300 + }, + { + "epoch": 0.24622276100647528, + "grad_norm": 0.08093922585248947, + "learning_rate": 3.78286740265907e-05, + "loss": 0.0194, + "step": 33310 + }, + { + "epoch": 0.24629667957777712, + "grad_norm": 0.09469078481197357, + "learning_rate": 3.782496438746439e-05, + "loss": 0.0202, + "step": 33320 + }, + { + "epoch": 0.24637059814907897, + "grad_norm": 0.10700713098049164, + "learning_rate": 3.782125474833808e-05, + "loss": 0.0192, + "step": 33330 + }, + { + "epoch": 0.24644451672038084, + "grad_norm": 0.10421323776245117, + "learning_rate": 3.781754510921178e-05, + "loss": 0.0202, + "step": 33340 + }, + { + "epoch": 0.24651843529168269, + "grad_norm": 0.07413748651742935, + "learning_rate": 3.7813835470085474e-05, + "loss": 0.0211, + "step": 33350 + }, + { + "epoch": 0.24659235386298453, + "grad_norm": 0.08727966248989105, + "learning_rate": 3.781012583095916e-05, + "loss": 0.0186, + "step": 33360 + }, + { + "epoch": 0.24666627243428638, + "grad_norm": 0.07526741921901703, + "learning_rate": 3.780641619183286e-05, + "loss": 0.0191, + "step": 33370 + }, + { + "epoch": 0.24674019100558825, + "grad_norm": 0.09076111763715744, + "learning_rate": 3.7802706552706555e-05, + "loss": 0.0204, + "step": 33380 + }, + { + "epoch": 0.2468141095768901, + "grad_norm": 0.12775136530399323, + "learning_rate": 3.779899691358025e-05, + "loss": 0.0211, + "step": 33390 + }, + { + "epoch": 0.24688802814819194, + "grad_norm": 0.13375797867774963, + "learning_rate": 3.779528727445395e-05, + "loss": 0.0203, + "step": 33400 + }, + { + "epoch": 0.2469619467194938, + "grad_norm": 0.09040780365467072, + "learning_rate": 3.7791577635327636e-05, + "loss": 0.0216, + "step": 33410 + }, + { + "epoch": 0.24703586529079566, + "grad_norm": 0.0828985720872879, + "learning_rate": 3.778786799620133e-05, + "loss": 0.0202, + "step": 33420 + }, + { + "epoch": 0.2471097838620975, + "grad_norm": 0.09508246928453445, + "learning_rate": 3.778415835707503e-05, + "loss": 0.0186, + "step": 33430 + }, + { + "epoch": 0.24718370243339938, + "grad_norm": 0.09850742667913437, + "learning_rate": 3.778044871794872e-05, + "loss": 0.0218, + "step": 33440 + }, + { + "epoch": 0.24725762100470122, + "grad_norm": 
0.0872756764292717, + "learning_rate": 3.777673907882241e-05, + "loss": 0.0152, + "step": 33450 + }, + { + "epoch": 0.24733153957600307, + "grad_norm": 0.11343777179718018, + "learning_rate": 3.777302943969611e-05, + "loss": 0.0186, + "step": 33460 + }, + { + "epoch": 0.24740545814730494, + "grad_norm": 0.10817458480596542, + "learning_rate": 3.7769319800569805e-05, + "loss": 0.0218, + "step": 33470 + }, + { + "epoch": 0.24747937671860679, + "grad_norm": 0.10540654510259628, + "learning_rate": 3.77656101614435e-05, + "loss": 0.0204, + "step": 33480 + }, + { + "epoch": 0.24755329528990863, + "grad_norm": 0.08479999005794525, + "learning_rate": 3.776190052231719e-05, + "loss": 0.0213, + "step": 33490 + }, + { + "epoch": 0.24762721386121048, + "grad_norm": 0.06634625047445297, + "learning_rate": 3.7758190883190886e-05, + "loss": 0.0208, + "step": 33500 + }, + { + "epoch": 0.24770113243251235, + "grad_norm": 0.10522954165935516, + "learning_rate": 3.7754481244064575e-05, + "loss": 0.021, + "step": 33510 + }, + { + "epoch": 0.2477750510038142, + "grad_norm": 0.11834339052438736, + "learning_rate": 3.775077160493827e-05, + "loss": 0.0217, + "step": 33520 + }, + { + "epoch": 0.24784896957511604, + "grad_norm": 0.09099873155355453, + "learning_rate": 3.774706196581197e-05, + "loss": 0.022, + "step": 33530 + }, + { + "epoch": 0.2479228881464179, + "grad_norm": 0.11641920357942581, + "learning_rate": 3.774335232668566e-05, + "loss": 0.021, + "step": 33540 + }, + { + "epoch": 0.24799680671771976, + "grad_norm": 0.10506580024957657, + "learning_rate": 3.773964268755936e-05, + "loss": 0.0198, + "step": 33550 + }, + { + "epoch": 0.2480707252890216, + "grad_norm": 0.09904909878969193, + "learning_rate": 3.773593304843305e-05, + "loss": 0.0187, + "step": 33560 + }, + { + "epoch": 0.24814464386032348, + "grad_norm": 0.08247719705104828, + "learning_rate": 3.7732223409306744e-05, + "loss": 0.0214, + "step": 33570 + }, + { + "epoch": 0.24821856243162532, + "grad_norm": 0.09621842950582504, + "learning_rate": 3.772851377018044e-05, + "loss": 0.0198, + "step": 33580 + }, + { + "epoch": 0.24829248100292717, + "grad_norm": 0.10325994342565536, + "learning_rate": 3.772480413105413e-05, + "loss": 0.0191, + "step": 33590 + }, + { + "epoch": 0.24836639957422904, + "grad_norm": 0.06651351600885391, + "learning_rate": 3.7721094491927826e-05, + "loss": 0.0198, + "step": 33600 + }, + { + "epoch": 0.2484403181455309, + "grad_norm": 0.10950721055269241, + "learning_rate": 3.771738485280152e-05, + "loss": 0.0203, + "step": 33610 + }, + { + "epoch": 0.24851423671683273, + "grad_norm": 0.07975035905838013, + "learning_rate": 3.771367521367522e-05, + "loss": 0.0195, + "step": 33620 + }, + { + "epoch": 0.2485881552881346, + "grad_norm": 0.14583900570869446, + "learning_rate": 3.7709965574548913e-05, + "loss": 0.0194, + "step": 33630 + }, + { + "epoch": 0.24866207385943645, + "grad_norm": 0.07780491560697556, + "learning_rate": 3.77062559354226e-05, + "loss": 0.0198, + "step": 33640 + }, + { + "epoch": 0.2487359924307383, + "grad_norm": 0.12104685604572296, + "learning_rate": 3.77025462962963e-05, + "loss": 0.0209, + "step": 33650 + }, + { + "epoch": 0.24880991100204014, + "grad_norm": 0.13882067799568176, + "learning_rate": 3.7698836657169995e-05, + "loss": 0.0234, + "step": 33660 + }, + { + "epoch": 0.248883829573342, + "grad_norm": 0.0761672854423523, + "learning_rate": 3.7695127018043684e-05, + "loss": 0.0202, + "step": 33670 + }, + { + "epoch": 0.24895774814464386, + "grad_norm": 0.07395438104867935, + "learning_rate": 
3.769141737891738e-05, + "loss": 0.0191, + "step": 33680 + }, + { + "epoch": 0.2490316667159457, + "grad_norm": 0.09155049920082092, + "learning_rate": 3.7687707739791076e-05, + "loss": 0.0193, + "step": 33690 + }, + { + "epoch": 0.24910558528724758, + "grad_norm": 0.10466806590557098, + "learning_rate": 3.768399810066477e-05, + "loss": 0.0198, + "step": 33700 + }, + { + "epoch": 0.24917950385854942, + "grad_norm": 0.0634426698088646, + "learning_rate": 3.768028846153847e-05, + "loss": 0.0168, + "step": 33710 + }, + { + "epoch": 0.24925342242985127, + "grad_norm": 0.08831311017274857, + "learning_rate": 3.767657882241216e-05, + "loss": 0.0199, + "step": 33720 + }, + { + "epoch": 0.24932734100115314, + "grad_norm": 0.078910231590271, + "learning_rate": 3.767286918328585e-05, + "loss": 0.0183, + "step": 33730 + }, + { + "epoch": 0.249401259572455, + "grad_norm": 0.07520467042922974, + "learning_rate": 3.766915954415954e-05, + "loss": 0.019, + "step": 33740 + }, + { + "epoch": 0.24947517814375683, + "grad_norm": 0.10446647554636002, + "learning_rate": 3.766544990503324e-05, + "loss": 0.0225, + "step": 33750 + }, + { + "epoch": 0.2495490967150587, + "grad_norm": 0.1083189845085144, + "learning_rate": 3.7661740265906934e-05, + "loss": 0.0193, + "step": 33760 + }, + { + "epoch": 0.24962301528636055, + "grad_norm": 0.07150457799434662, + "learning_rate": 3.765803062678063e-05, + "loss": 0.0198, + "step": 33770 + }, + { + "epoch": 0.2496969338576624, + "grad_norm": 0.08948767930269241, + "learning_rate": 3.7654320987654326e-05, + "loss": 0.0211, + "step": 33780 + }, + { + "epoch": 0.24977085242896424, + "grad_norm": 0.08297252655029297, + "learning_rate": 3.7650611348528015e-05, + "loss": 0.0225, + "step": 33790 + }, + { + "epoch": 0.2498447710002661, + "grad_norm": 0.07543346285820007, + "learning_rate": 3.764690170940171e-05, + "loss": 0.0198, + "step": 33800 + }, + { + "epoch": 0.24991868957156796, + "grad_norm": 0.07644642144441605, + "learning_rate": 3.764319207027541e-05, + "loss": 0.019, + "step": 33810 + }, + { + "epoch": 0.2499926081428698, + "grad_norm": 0.08885039389133453, + "learning_rate": 3.7639482431149096e-05, + "loss": 0.018, + "step": 33820 + }, + { + "epoch": 0.2500665267141717, + "grad_norm": 0.12531474232673645, + "learning_rate": 3.763577279202279e-05, + "loss": 0.0194, + "step": 33830 + }, + { + "epoch": 0.2501404452854735, + "grad_norm": 0.0952950045466423, + "learning_rate": 3.763206315289649e-05, + "loss": 0.0204, + "step": 33840 + }, + { + "epoch": 0.25021436385677537, + "grad_norm": 0.12818032503128052, + "learning_rate": 3.7628353513770184e-05, + "loss": 0.0208, + "step": 33850 + }, + { + "epoch": 0.2502882824280772, + "grad_norm": 0.08323697000741959, + "learning_rate": 3.762464387464388e-05, + "loss": 0.0207, + "step": 33860 + }, + { + "epoch": 0.25036220099937906, + "grad_norm": 0.10147396475076675, + "learning_rate": 3.762093423551757e-05, + "loss": 0.0202, + "step": 33870 + }, + { + "epoch": 0.25043611957068096, + "grad_norm": 0.07355233281850815, + "learning_rate": 3.7617224596391265e-05, + "loss": 0.0195, + "step": 33880 + }, + { + "epoch": 0.2505100381419828, + "grad_norm": 0.12211878597736359, + "learning_rate": 3.761351495726496e-05, + "loss": 0.0194, + "step": 33890 + }, + { + "epoch": 0.25058395671328465, + "grad_norm": 0.14431354403495789, + "learning_rate": 3.760980531813865e-05, + "loss": 0.0208, + "step": 33900 + }, + { + "epoch": 0.2506578752845865, + "grad_norm": 0.08047827333211899, + "learning_rate": 3.7606095679012346e-05, + "loss": 0.0184, + 
"step": 33910 + }, + { + "epoch": 0.25073179385588834, + "grad_norm": 0.07976054400205612, + "learning_rate": 3.760238603988604e-05, + "loss": 0.0191, + "step": 33920 + }, + { + "epoch": 0.2508057124271902, + "grad_norm": 0.17716683447360992, + "learning_rate": 3.759867640075974e-05, + "loss": 0.0213, + "step": 33930 + }, + { + "epoch": 0.2508796309984921, + "grad_norm": 0.12124445289373398, + "learning_rate": 3.7594966761633434e-05, + "loss": 0.0193, + "step": 33940 + }, + { + "epoch": 0.25095354956979393, + "grad_norm": 0.10458637028932571, + "learning_rate": 3.759125712250712e-05, + "loss": 0.0221, + "step": 33950 + }, + { + "epoch": 0.2510274681410958, + "grad_norm": 0.10960089415311813, + "learning_rate": 3.758754748338082e-05, + "loss": 0.0205, + "step": 33960 + }, + { + "epoch": 0.2511013867123976, + "grad_norm": 0.10792728513479233, + "learning_rate": 3.758383784425451e-05, + "loss": 0.0202, + "step": 33970 + }, + { + "epoch": 0.25117530528369947, + "grad_norm": 0.10539413243532181, + "learning_rate": 3.7580128205128204e-05, + "loss": 0.0201, + "step": 33980 + }, + { + "epoch": 0.2512492238550013, + "grad_norm": 0.09134382009506226, + "learning_rate": 3.75764185660019e-05, + "loss": 0.019, + "step": 33990 + }, + { + "epoch": 0.25132314242630316, + "grad_norm": 0.10139552503824234, + "learning_rate": 3.7572708926875596e-05, + "loss": 0.0223, + "step": 34000 + }, + { + "epoch": 0.25139706099760506, + "grad_norm": 0.08448898047208786, + "learning_rate": 3.756899928774929e-05, + "loss": 0.0183, + "step": 34010 + }, + { + "epoch": 0.2514709795689069, + "grad_norm": 0.11146911233663559, + "learning_rate": 3.756528964862298e-05, + "loss": 0.0189, + "step": 34020 + }, + { + "epoch": 0.25154489814020875, + "grad_norm": 0.07452386617660522, + "learning_rate": 3.756158000949668e-05, + "loss": 0.019, + "step": 34030 + }, + { + "epoch": 0.2516188167115106, + "grad_norm": 0.07857771217823029, + "learning_rate": 3.7557870370370374e-05, + "loss": 0.0162, + "step": 34040 + }, + { + "epoch": 0.25169273528281244, + "grad_norm": 0.08723234385251999, + "learning_rate": 3.755416073124406e-05, + "loss": 0.0205, + "step": 34050 + }, + { + "epoch": 0.2517666538541143, + "grad_norm": 0.07878848165273666, + "learning_rate": 3.755045109211776e-05, + "loss": 0.0174, + "step": 34060 + }, + { + "epoch": 0.2518405724254162, + "grad_norm": 0.05665014311671257, + "learning_rate": 3.7546741452991455e-05, + "loss": 0.0188, + "step": 34070 + }, + { + "epoch": 0.25191449099671803, + "grad_norm": 0.09765440225601196, + "learning_rate": 3.754303181386515e-05, + "loss": 0.0185, + "step": 34080 + }, + { + "epoch": 0.2519884095680199, + "grad_norm": 0.09128037840127945, + "learning_rate": 3.7539322174738847e-05, + "loss": 0.0203, + "step": 34090 + }, + { + "epoch": 0.2520623281393217, + "grad_norm": 0.06400024890899658, + "learning_rate": 3.7535612535612536e-05, + "loss": 0.0173, + "step": 34100 + }, + { + "epoch": 0.25213624671062357, + "grad_norm": 0.08637039363384247, + "learning_rate": 3.753190289648623e-05, + "loss": 0.0203, + "step": 34110 + }, + { + "epoch": 0.2522101652819254, + "grad_norm": 0.08356247842311859, + "learning_rate": 3.752819325735993e-05, + "loss": 0.0178, + "step": 34120 + }, + { + "epoch": 0.25228408385322726, + "grad_norm": 0.09152962267398834, + "learning_rate": 3.752448361823362e-05, + "loss": 0.0185, + "step": 34130 + }, + { + "epoch": 0.25235800242452916, + "grad_norm": 0.08049582690000534, + "learning_rate": 3.752077397910731e-05, + "loss": 0.0179, + "step": 34140 + }, + { + "epoch": 
0.252431920995831, + "grad_norm": 0.10080865025520325, + "learning_rate": 3.751706433998101e-05, + "loss": 0.0202, + "step": 34150 + }, + { + "epoch": 0.25250583956713285, + "grad_norm": 0.12849904596805573, + "learning_rate": 3.7513354700854705e-05, + "loss": 0.0212, + "step": 34160 + }, + { + "epoch": 0.2525797581384347, + "grad_norm": 0.3746621608734131, + "learning_rate": 3.75096450617284e-05, + "loss": 0.0201, + "step": 34170 + }, + { + "epoch": 0.25265367670973654, + "grad_norm": 0.08858539909124374, + "learning_rate": 3.750593542260209e-05, + "loss": 0.0205, + "step": 34180 + }, + { + "epoch": 0.2527275952810384, + "grad_norm": 0.08877479285001755, + "learning_rate": 3.7502225783475786e-05, + "loss": 0.0198, + "step": 34190 + }, + { + "epoch": 0.2528015138523403, + "grad_norm": 0.14612101018428802, + "learning_rate": 3.7498516144349475e-05, + "loss": 0.0185, + "step": 34200 + }, + { + "epoch": 0.25287543242364213, + "grad_norm": 0.09967604279518127, + "learning_rate": 3.749480650522317e-05, + "loss": 0.0198, + "step": 34210 + }, + { + "epoch": 0.252949350994944, + "grad_norm": 0.06724333763122559, + "learning_rate": 3.749109686609687e-05, + "loss": 0.0215, + "step": 34220 + }, + { + "epoch": 0.2530232695662458, + "grad_norm": 0.06407474726438522, + "learning_rate": 3.748738722697056e-05, + "loss": 0.019, + "step": 34230 + }, + { + "epoch": 0.25309718813754767, + "grad_norm": 0.0668005645275116, + "learning_rate": 3.748367758784426e-05, + "loss": 0.0157, + "step": 34240 + }, + { + "epoch": 0.2531711067088495, + "grad_norm": 0.43225792050361633, + "learning_rate": 3.747996794871795e-05, + "loss": 0.0213, + "step": 34250 + }, + { + "epoch": 0.25324502528015136, + "grad_norm": 0.08636850863695145, + "learning_rate": 3.7476258309591644e-05, + "loss": 0.0211, + "step": 34260 + }, + { + "epoch": 0.25331894385145326, + "grad_norm": 0.12886829674243927, + "learning_rate": 3.747254867046534e-05, + "loss": 0.0214, + "step": 34270 + }, + { + "epoch": 0.2533928624227551, + "grad_norm": 0.07349354028701782, + "learning_rate": 3.746883903133903e-05, + "loss": 0.0177, + "step": 34280 + }, + { + "epoch": 0.25346678099405695, + "grad_norm": 0.06146747246384621, + "learning_rate": 3.7465129392212725e-05, + "loss": 0.0184, + "step": 34290 + }, + { + "epoch": 0.2535406995653588, + "grad_norm": 0.08462537825107574, + "learning_rate": 3.746141975308642e-05, + "loss": 0.019, + "step": 34300 + }, + { + "epoch": 0.25361461813666064, + "grad_norm": 0.08384005725383759, + "learning_rate": 3.745771011396012e-05, + "loss": 0.0187, + "step": 34310 + }, + { + "epoch": 0.2536885367079625, + "grad_norm": 0.0576070211827755, + "learning_rate": 3.745400047483381e-05, + "loss": 0.0197, + "step": 34320 + }, + { + "epoch": 0.2537624552792644, + "grad_norm": 0.12527556717395782, + "learning_rate": 3.74502908357075e-05, + "loss": 0.0204, + "step": 34330 + }, + { + "epoch": 0.25383637385056623, + "grad_norm": 0.07750841975212097, + "learning_rate": 3.74465811965812e-05, + "loss": 0.019, + "step": 34340 + }, + { + "epoch": 0.2539102924218681, + "grad_norm": 0.0806896835565567, + "learning_rate": 3.7442871557454894e-05, + "loss": 0.0192, + "step": 34350 + }, + { + "epoch": 0.2539842109931699, + "grad_norm": 0.08760758489370346, + "learning_rate": 3.7439161918328583e-05, + "loss": 0.0205, + "step": 34360 + }, + { + "epoch": 0.25405812956447177, + "grad_norm": 0.1384943574666977, + "learning_rate": 3.743545227920228e-05, + "loss": 0.0213, + "step": 34370 + }, + { + "epoch": 0.2541320481357736, + "grad_norm": 
0.08599944412708282, + "learning_rate": 3.7431742640075975e-05, + "loss": 0.022, + "step": 34380 + }, + { + "epoch": 0.25420596670707546, + "grad_norm": 0.0776156410574913, + "learning_rate": 3.742803300094967e-05, + "loss": 0.0189, + "step": 34390 + }, + { + "epoch": 0.25427988527837736, + "grad_norm": 0.1002473384141922, + "learning_rate": 3.742432336182337e-05, + "loss": 0.0188, + "step": 34400 + }, + { + "epoch": 0.2543538038496792, + "grad_norm": 0.0879029631614685, + "learning_rate": 3.7420613722697057e-05, + "loss": 0.0177, + "step": 34410 + }, + { + "epoch": 0.25442772242098105, + "grad_norm": 0.09049376100301743, + "learning_rate": 3.741690408357075e-05, + "loss": 0.0179, + "step": 34420 + }, + { + "epoch": 0.2545016409922829, + "grad_norm": 0.07568518072366714, + "learning_rate": 3.741319444444444e-05, + "loss": 0.0191, + "step": 34430 + }, + { + "epoch": 0.25457555956358474, + "grad_norm": 0.09793508797883987, + "learning_rate": 3.740948480531814e-05, + "loss": 0.0186, + "step": 34440 + }, + { + "epoch": 0.2546494781348866, + "grad_norm": 0.129734605550766, + "learning_rate": 3.7405775166191834e-05, + "loss": 0.0192, + "step": 34450 + }, + { + "epoch": 0.2547233967061885, + "grad_norm": 0.1545630842447281, + "learning_rate": 3.740206552706553e-05, + "loss": 0.0217, + "step": 34460 + }, + { + "epoch": 0.25479731527749033, + "grad_norm": 0.10320380330085754, + "learning_rate": 3.7398355887939226e-05, + "loss": 0.0201, + "step": 34470 + }, + { + "epoch": 0.2548712338487922, + "grad_norm": 0.10976788401603699, + "learning_rate": 3.7394646248812915e-05, + "loss": 0.0191, + "step": 34480 + }, + { + "epoch": 0.254945152420094, + "grad_norm": 0.08091693371534348, + "learning_rate": 3.739093660968661e-05, + "loss": 0.0174, + "step": 34490 + }, + { + "epoch": 0.25501907099139587, + "grad_norm": 0.07782463729381561, + "learning_rate": 3.738722697056031e-05, + "loss": 0.0178, + "step": 34500 + }, + { + "epoch": 0.2550929895626977, + "grad_norm": 0.12280046194791794, + "learning_rate": 3.7383517331433996e-05, + "loss": 0.0192, + "step": 34510 + }, + { + "epoch": 0.25516690813399956, + "grad_norm": 0.08706457912921906, + "learning_rate": 3.737980769230769e-05, + "loss": 0.0178, + "step": 34520 + }, + { + "epoch": 0.25524082670530146, + "grad_norm": 0.08564306795597076, + "learning_rate": 3.737609805318139e-05, + "loss": 0.0184, + "step": 34530 + }, + { + "epoch": 0.2553147452766033, + "grad_norm": 0.08043599873781204, + "learning_rate": 3.7372388414055084e-05, + "loss": 0.0206, + "step": 34540 + }, + { + "epoch": 0.25538866384790515, + "grad_norm": 0.10819025337696075, + "learning_rate": 3.736867877492878e-05, + "loss": 0.0223, + "step": 34550 + }, + { + "epoch": 0.255462582419207, + "grad_norm": 0.08196381479501724, + "learning_rate": 3.736496913580247e-05, + "loss": 0.0186, + "step": 34560 + }, + { + "epoch": 0.25553650099050884, + "grad_norm": 0.1249900832772255, + "learning_rate": 3.7361259496676165e-05, + "loss": 0.0173, + "step": 34570 + }, + { + "epoch": 0.2556104195618107, + "grad_norm": 0.09837884455919266, + "learning_rate": 3.735754985754986e-05, + "loss": 0.0186, + "step": 34580 + }, + { + "epoch": 0.2556843381331126, + "grad_norm": 0.0960315689444542, + "learning_rate": 3.735384021842355e-05, + "loss": 0.0194, + "step": 34590 + }, + { + "epoch": 0.25575825670441443, + "grad_norm": 0.07749620825052261, + "learning_rate": 3.7350130579297246e-05, + "loss": 0.0192, + "step": 34600 + }, + { + "epoch": 0.2558321752757163, + "grad_norm": 0.07777617126703262, + "learning_rate": 
3.734642094017094e-05, + "loss": 0.0212, + "step": 34610 + }, + { + "epoch": 0.2559060938470181, + "grad_norm": 0.09821510314941406, + "learning_rate": 3.734271130104464e-05, + "loss": 0.0189, + "step": 34620 + }, + { + "epoch": 0.25598001241831997, + "grad_norm": 0.18012312054634094, + "learning_rate": 3.7339001661918334e-05, + "loss": 0.0181, + "step": 34630 + }, + { + "epoch": 0.2560539309896218, + "grad_norm": 0.10718531906604767, + "learning_rate": 3.733529202279202e-05, + "loss": 0.0201, + "step": 34640 + }, + { + "epoch": 0.25612784956092366, + "grad_norm": 0.09973935037851334, + "learning_rate": 3.733158238366572e-05, + "loss": 0.0186, + "step": 34650 + }, + { + "epoch": 0.25620176813222556, + "grad_norm": 0.096929170191288, + "learning_rate": 3.732787274453941e-05, + "loss": 0.0215, + "step": 34660 + }, + { + "epoch": 0.2562756867035274, + "grad_norm": 0.09448627382516861, + "learning_rate": 3.7324163105413104e-05, + "loss": 0.0194, + "step": 34670 + }, + { + "epoch": 0.25634960527482925, + "grad_norm": 0.09486164897680283, + "learning_rate": 3.73204534662868e-05, + "loss": 0.0176, + "step": 34680 + }, + { + "epoch": 0.2564235238461311, + "grad_norm": 0.09092456102371216, + "learning_rate": 3.7316743827160496e-05, + "loss": 0.0208, + "step": 34690 + }, + { + "epoch": 0.25649744241743294, + "grad_norm": 0.11252086609601974, + "learning_rate": 3.731303418803419e-05, + "loss": 0.0204, + "step": 34700 + }, + { + "epoch": 0.2565713609887348, + "grad_norm": 0.08080518245697021, + "learning_rate": 3.730932454890788e-05, + "loss": 0.0206, + "step": 34710 + }, + { + "epoch": 0.2566452795600367, + "grad_norm": 0.0721350684762001, + "learning_rate": 3.730561490978158e-05, + "loss": 0.0185, + "step": 34720 + }, + { + "epoch": 0.25671919813133853, + "grad_norm": 0.08283282071352005, + "learning_rate": 3.730190527065527e-05, + "loss": 0.0202, + "step": 34730 + }, + { + "epoch": 0.2567931167026404, + "grad_norm": 0.09533637017011642, + "learning_rate": 3.729819563152896e-05, + "loss": 0.0199, + "step": 34740 + }, + { + "epoch": 0.2568670352739422, + "grad_norm": 0.1281680017709732, + "learning_rate": 3.729448599240266e-05, + "loss": 0.0197, + "step": 34750 + }, + { + "epoch": 0.25694095384524407, + "grad_norm": 0.15431781113147736, + "learning_rate": 3.7290776353276354e-05, + "loss": 0.0173, + "step": 34760 + }, + { + "epoch": 0.2570148724165459, + "grad_norm": 0.09290771931409836, + "learning_rate": 3.728706671415005e-05, + "loss": 0.0191, + "step": 34770 + }, + { + "epoch": 0.25708879098784776, + "grad_norm": 0.07939286530017853, + "learning_rate": 3.7283357075023746e-05, + "loss": 0.021, + "step": 34780 + }, + { + "epoch": 0.25716270955914966, + "grad_norm": 0.07546821236610413, + "learning_rate": 3.7279647435897436e-05, + "loss": 0.0211, + "step": 34790 + }, + { + "epoch": 0.2572366281304515, + "grad_norm": 0.11875446140766144, + "learning_rate": 3.727593779677113e-05, + "loss": 0.0229, + "step": 34800 + }, + { + "epoch": 0.25731054670175335, + "grad_norm": 0.09297899901866913, + "learning_rate": 3.727222815764483e-05, + "loss": 0.0173, + "step": 34810 + }, + { + "epoch": 0.2573844652730552, + "grad_norm": 0.06684008240699768, + "learning_rate": 3.726851851851852e-05, + "loss": 0.0175, + "step": 34820 + }, + { + "epoch": 0.25745838384435704, + "grad_norm": 0.11314887553453445, + "learning_rate": 3.726480887939221e-05, + "loss": 0.02, + "step": 34830 + }, + { + "epoch": 0.2575323024156589, + "grad_norm": 0.12812693417072296, + "learning_rate": 3.726109924026591e-05, + "loss": 0.0196, + 
"step": 34840 + }, + { + "epoch": 0.2576062209869608, + "grad_norm": 0.10089819878339767, + "learning_rate": 3.7257389601139605e-05, + "loss": 0.0198, + "step": 34850 + }, + { + "epoch": 0.25768013955826263, + "grad_norm": 0.09206525981426239, + "learning_rate": 3.72536799620133e-05, + "loss": 0.0186, + "step": 34860 + }, + { + "epoch": 0.2577540581295645, + "grad_norm": 0.09052669256925583, + "learning_rate": 3.724997032288699e-05, + "loss": 0.0164, + "step": 34870 + }, + { + "epoch": 0.2578279767008663, + "grad_norm": 0.08012855798006058, + "learning_rate": 3.7246260683760686e-05, + "loss": 0.0191, + "step": 34880 + }, + { + "epoch": 0.25790189527216817, + "grad_norm": 0.10896433144807816, + "learning_rate": 3.7242551044634375e-05, + "loss": 0.0212, + "step": 34890 + }, + { + "epoch": 0.25797581384347, + "grad_norm": 0.05664210394024849, + "learning_rate": 3.723884140550807e-05, + "loss": 0.0189, + "step": 34900 + }, + { + "epoch": 0.25804973241477186, + "grad_norm": 0.07072228193283081, + "learning_rate": 3.723513176638177e-05, + "loss": 0.0203, + "step": 34910 + }, + { + "epoch": 0.25812365098607376, + "grad_norm": 0.07742172479629517, + "learning_rate": 3.723142212725546e-05, + "loss": 0.018, + "step": 34920 + }, + { + "epoch": 0.2581975695573756, + "grad_norm": 0.08487288653850555, + "learning_rate": 3.722771248812916e-05, + "loss": 0.0192, + "step": 34930 + }, + { + "epoch": 0.25827148812867745, + "grad_norm": 0.0894191637635231, + "learning_rate": 3.722400284900285e-05, + "loss": 0.0183, + "step": 34940 + }, + { + "epoch": 0.2583454066999793, + "grad_norm": 0.077961266040802, + "learning_rate": 3.7220293209876544e-05, + "loss": 0.0189, + "step": 34950 + }, + { + "epoch": 0.25841932527128114, + "grad_norm": 0.08691947162151337, + "learning_rate": 3.721658357075024e-05, + "loss": 0.0183, + "step": 34960 + }, + { + "epoch": 0.258493243842583, + "grad_norm": 0.06688518822193146, + "learning_rate": 3.721287393162393e-05, + "loss": 0.0218, + "step": 34970 + }, + { + "epoch": 0.2585671624138849, + "grad_norm": 0.10814563930034637, + "learning_rate": 3.7209164292497625e-05, + "loss": 0.0201, + "step": 34980 + }, + { + "epoch": 0.25864108098518673, + "grad_norm": 0.11186385154724121, + "learning_rate": 3.720545465337133e-05, + "loss": 0.0177, + "step": 34990 + }, + { + "epoch": 0.2587149995564886, + "grad_norm": 0.07740023732185364, + "learning_rate": 3.720174501424502e-05, + "loss": 0.0181, + "step": 35000 + }, + { + "epoch": 0.2587889181277904, + "grad_norm": 0.12069465219974518, + "learning_rate": 3.719803537511871e-05, + "loss": 0.0186, + "step": 35010 + }, + { + "epoch": 0.25886283669909227, + "grad_norm": 0.10102164000272751, + "learning_rate": 3.71943257359924e-05, + "loss": 0.0192, + "step": 35020 + }, + { + "epoch": 0.2589367552703941, + "grad_norm": 0.0714918002486229, + "learning_rate": 3.71906160968661e-05, + "loss": 0.0177, + "step": 35030 + }, + { + "epoch": 0.259010673841696, + "grad_norm": 0.09676463156938553, + "learning_rate": 3.7186906457739794e-05, + "loss": 0.0206, + "step": 35040 + }, + { + "epoch": 0.25908459241299786, + "grad_norm": 0.1151902824640274, + "learning_rate": 3.718319681861348e-05, + "loss": 0.0169, + "step": 35050 + }, + { + "epoch": 0.2591585109842997, + "grad_norm": 0.09498842060565948, + "learning_rate": 3.717948717948718e-05, + "loss": 0.0182, + "step": 35060 + }, + { + "epoch": 0.25923242955560155, + "grad_norm": 0.08307395130395889, + "learning_rate": 3.7175777540360875e-05, + "loss": 0.0182, + "step": 35070 + }, + { + "epoch": 0.2593063481269034, 
+ "grad_norm": 0.09639619290828705, + "learning_rate": 3.717206790123457e-05, + "loss": 0.0199, + "step": 35080 + }, + { + "epoch": 0.25938026669820524, + "grad_norm": 0.056522320955991745, + "learning_rate": 3.716835826210827e-05, + "loss": 0.017, + "step": 35090 + }, + { + "epoch": 0.2594541852695071, + "grad_norm": 0.09537209570407867, + "learning_rate": 3.7164648622981956e-05, + "loss": 0.0183, + "step": 35100 + }, + { + "epoch": 0.259528103840809, + "grad_norm": 0.12425485253334045, + "learning_rate": 3.716093898385565e-05, + "loss": 0.0199, + "step": 35110 + }, + { + "epoch": 0.25960202241211083, + "grad_norm": 0.08875080943107605, + "learning_rate": 3.715722934472934e-05, + "loss": 0.019, + "step": 35120 + }, + { + "epoch": 0.2596759409834127, + "grad_norm": 0.10081818699836731, + "learning_rate": 3.715351970560304e-05, + "loss": 0.0183, + "step": 35130 + }, + { + "epoch": 0.2597498595547145, + "grad_norm": 0.10890643298625946, + "learning_rate": 3.714981006647674e-05, + "loss": 0.0193, + "step": 35140 + }, + { + "epoch": 0.25982377812601637, + "grad_norm": 0.09250643849372864, + "learning_rate": 3.714610042735043e-05, + "loss": 0.0191, + "step": 35150 + }, + { + "epoch": 0.2598976966973182, + "grad_norm": 0.08782697468996048, + "learning_rate": 3.7142390788224125e-05, + "loss": 0.0179, + "step": 35160 + }, + { + "epoch": 0.2599716152686201, + "grad_norm": 0.09054393321275711, + "learning_rate": 3.7138681149097814e-05, + "loss": 0.0195, + "step": 35170 + }, + { + "epoch": 0.26004553383992196, + "grad_norm": 0.08586835116147995, + "learning_rate": 3.713497150997151e-05, + "loss": 0.018, + "step": 35180 + }, + { + "epoch": 0.2601194524112238, + "grad_norm": 0.07333735376596451, + "learning_rate": 3.7131261870845206e-05, + "loss": 0.0209, + "step": 35190 + }, + { + "epoch": 0.26019337098252565, + "grad_norm": 0.08953770250082016, + "learning_rate": 3.7127552231718896e-05, + "loss": 0.0163, + "step": 35200 + }, + { + "epoch": 0.2602672895538275, + "grad_norm": 0.09727450460195541, + "learning_rate": 3.712384259259259e-05, + "loss": 0.0164, + "step": 35210 + }, + { + "epoch": 0.26034120812512934, + "grad_norm": 0.1030101552605629, + "learning_rate": 3.7120132953466294e-05, + "loss": 0.0181, + "step": 35220 + }, + { + "epoch": 0.2604151266964312, + "grad_norm": 0.11227700859308243, + "learning_rate": 3.7116423314339984e-05, + "loss": 0.0218, + "step": 35230 + }, + { + "epoch": 0.2604890452677331, + "grad_norm": 0.09380971640348434, + "learning_rate": 3.711271367521368e-05, + "loss": 0.0196, + "step": 35240 + }, + { + "epoch": 0.26056296383903493, + "grad_norm": 0.07920172065496445, + "learning_rate": 3.710900403608737e-05, + "loss": 0.0183, + "step": 35250 + }, + { + "epoch": 0.2606368824103368, + "grad_norm": 0.10708235204219818, + "learning_rate": 3.7105294396961065e-05, + "loss": 0.0195, + "step": 35260 + }, + { + "epoch": 0.2607108009816386, + "grad_norm": 0.12499354779720306, + "learning_rate": 3.710158475783476e-05, + "loss": 0.0209, + "step": 35270 + }, + { + "epoch": 0.26078471955294047, + "grad_norm": 0.0819254145026207, + "learning_rate": 3.709787511870845e-05, + "loss": 0.0191, + "step": 35280 + }, + { + "epoch": 0.2608586381242423, + "grad_norm": 0.12078723311424255, + "learning_rate": 3.709416547958215e-05, + "loss": 0.0205, + "step": 35290 + }, + { + "epoch": 0.2609325566955442, + "grad_norm": 0.09493082016706467, + "learning_rate": 3.709045584045584e-05, + "loss": 0.0202, + "step": 35300 + }, + { + "epoch": 0.26100647526684606, + "grad_norm": 0.07896389067173004, + 
"learning_rate": 3.708674620132954e-05, + "loss": 0.0177, + "step": 35310 + }, + { + "epoch": 0.2610803938381479, + "grad_norm": 0.10148924589157104, + "learning_rate": 3.7083036562203234e-05, + "loss": 0.0205, + "step": 35320 + }, + { + "epoch": 0.26115431240944975, + "grad_norm": 0.08146346360445023, + "learning_rate": 3.707932692307692e-05, + "loss": 0.0219, + "step": 35330 + }, + { + "epoch": 0.2612282309807516, + "grad_norm": 0.0864454135298729, + "learning_rate": 3.707561728395062e-05, + "loss": 0.0219, + "step": 35340 + }, + { + "epoch": 0.26130214955205344, + "grad_norm": 0.08265972882509232, + "learning_rate": 3.707190764482431e-05, + "loss": 0.0205, + "step": 35350 + }, + { + "epoch": 0.2613760681233553, + "grad_norm": 0.0848066657781601, + "learning_rate": 3.7068198005698004e-05, + "loss": 0.0202, + "step": 35360 + }, + { + "epoch": 0.2614499866946572, + "grad_norm": 0.09225792437791824, + "learning_rate": 3.706448836657171e-05, + "loss": 0.0227, + "step": 35370 + }, + { + "epoch": 0.26152390526595903, + "grad_norm": 0.0770459994673729, + "learning_rate": 3.7060778727445396e-05, + "loss": 0.0186, + "step": 35380 + }, + { + "epoch": 0.2615978238372609, + "grad_norm": 0.07575425505638123, + "learning_rate": 3.705706908831909e-05, + "loss": 0.0169, + "step": 35390 + }, + { + "epoch": 0.2616717424085627, + "grad_norm": 0.07981394976377487, + "learning_rate": 3.705335944919278e-05, + "loss": 0.0178, + "step": 35400 + }, + { + "epoch": 0.26174566097986457, + "grad_norm": 0.10660523921251297, + "learning_rate": 3.704964981006648e-05, + "loss": 0.0179, + "step": 35410 + }, + { + "epoch": 0.2618195795511664, + "grad_norm": 0.11321929097175598, + "learning_rate": 3.704594017094017e-05, + "loss": 0.0192, + "step": 35420 + }, + { + "epoch": 0.2618934981224683, + "grad_norm": 0.10271138697862625, + "learning_rate": 3.704223053181386e-05, + "loss": 0.0183, + "step": 35430 + }, + { + "epoch": 0.26196741669377016, + "grad_norm": 0.08852849900722504, + "learning_rate": 3.703852089268756e-05, + "loss": 0.0188, + "step": 35440 + }, + { + "epoch": 0.262041335265072, + "grad_norm": 0.10573693364858627, + "learning_rate": 3.703481125356126e-05, + "loss": 0.0194, + "step": 35450 + }, + { + "epoch": 0.26211525383637385, + "grad_norm": 0.09053755551576614, + "learning_rate": 3.703110161443495e-05, + "loss": 0.0186, + "step": 35460 + }, + { + "epoch": 0.2621891724076757, + "grad_norm": 0.09393644332885742, + "learning_rate": 3.7027391975308646e-05, + "loss": 0.0188, + "step": 35470 + }, + { + "epoch": 0.26226309097897754, + "grad_norm": 0.0807633176445961, + "learning_rate": 3.7023682336182335e-05, + "loss": 0.0214, + "step": 35480 + }, + { + "epoch": 0.2623370095502794, + "grad_norm": 0.0962679386138916, + "learning_rate": 3.701997269705603e-05, + "loss": 0.019, + "step": 35490 + }, + { + "epoch": 0.2624109281215813, + "grad_norm": 0.08685987442731857, + "learning_rate": 3.701626305792973e-05, + "loss": 0.0189, + "step": 35500 + }, + { + "epoch": 0.26248484669288313, + "grad_norm": 0.08247281610965729, + "learning_rate": 3.7012553418803416e-05, + "loss": 0.0174, + "step": 35510 + }, + { + "epoch": 0.262558765264185, + "grad_norm": 0.09114781022071838, + "learning_rate": 3.700884377967712e-05, + "loss": 0.0195, + "step": 35520 + }, + { + "epoch": 0.2626326838354868, + "grad_norm": 0.09900009632110596, + "learning_rate": 3.700513414055081e-05, + "loss": 0.0218, + "step": 35530 + }, + { + "epoch": 0.26270660240678867, + "grad_norm": 0.08584482222795486, + "learning_rate": 3.7001424501424504e-05, + "loss": 
0.0194, + "step": 35540 + }, + { + "epoch": 0.2627805209780905, + "grad_norm": 0.1007232666015625, + "learning_rate": 3.69977148622982e-05, + "loss": 0.0177, + "step": 35550 + }, + { + "epoch": 0.2628544395493924, + "grad_norm": 0.08348452299833298, + "learning_rate": 3.699400522317189e-05, + "loss": 0.0199, + "step": 35560 + }, + { + "epoch": 0.26292835812069426, + "grad_norm": 0.08889731019735336, + "learning_rate": 3.6990295584045585e-05, + "loss": 0.0191, + "step": 35570 + }, + { + "epoch": 0.2630022766919961, + "grad_norm": 0.07925963401794434, + "learning_rate": 3.6986585944919275e-05, + "loss": 0.0189, + "step": 35580 + }, + { + "epoch": 0.26307619526329795, + "grad_norm": 0.06700103729963303, + "learning_rate": 3.698287630579297e-05, + "loss": 0.0235, + "step": 35590 + }, + { + "epoch": 0.2631501138345998, + "grad_norm": 0.09109890460968018, + "learning_rate": 3.697916666666667e-05, + "loss": 0.0164, + "step": 35600 + }, + { + "epoch": 0.26322403240590164, + "grad_norm": 0.07469119131565094, + "learning_rate": 3.697545702754036e-05, + "loss": 0.0208, + "step": 35610 + }, + { + "epoch": 0.2632979509772035, + "grad_norm": 0.09684962779283524, + "learning_rate": 3.697174738841406e-05, + "loss": 0.0178, + "step": 35620 + }, + { + "epoch": 0.2633718695485054, + "grad_norm": 0.09081660211086273, + "learning_rate": 3.696803774928775e-05, + "loss": 0.0171, + "step": 35630 + }, + { + "epoch": 0.26344578811980723, + "grad_norm": 0.07729621231555939, + "learning_rate": 3.6964328110161444e-05, + "loss": 0.0171, + "step": 35640 + }, + { + "epoch": 0.2635197066911091, + "grad_norm": 0.07513635605573654, + "learning_rate": 3.696061847103514e-05, + "loss": 0.0215, + "step": 35650 + }, + { + "epoch": 0.2635936252624109, + "grad_norm": 0.07328686863183975, + "learning_rate": 3.695690883190883e-05, + "loss": 0.0183, + "step": 35660 + }, + { + "epoch": 0.26366754383371277, + "grad_norm": 0.08256453275680542, + "learning_rate": 3.695319919278253e-05, + "loss": 0.0171, + "step": 35670 + }, + { + "epoch": 0.2637414624050146, + "grad_norm": 0.08420126140117645, + "learning_rate": 3.694948955365623e-05, + "loss": 0.02, + "step": 35680 + }, + { + "epoch": 0.2638153809763165, + "grad_norm": 0.09419424086809158, + "learning_rate": 3.694577991452992e-05, + "loss": 0.0203, + "step": 35690 + }, + { + "epoch": 0.26388929954761836, + "grad_norm": 0.07843416184186935, + "learning_rate": 3.694207027540361e-05, + "loss": 0.0207, + "step": 35700 + }, + { + "epoch": 0.2639632181189202, + "grad_norm": 0.06925562769174576, + "learning_rate": 3.69383606362773e-05, + "loss": 0.0204, + "step": 35710 + }, + { + "epoch": 0.26403713669022205, + "grad_norm": 0.06749237328767776, + "learning_rate": 3.6934650997151e-05, + "loss": 0.0186, + "step": 35720 + }, + { + "epoch": 0.2641110552615239, + "grad_norm": 0.0696081593632698, + "learning_rate": 3.6930941358024694e-05, + "loss": 0.0192, + "step": 35730 + }, + { + "epoch": 0.26418497383282574, + "grad_norm": 0.0760570541024208, + "learning_rate": 3.692723171889838e-05, + "loss": 0.0186, + "step": 35740 + }, + { + "epoch": 0.2642588924041276, + "grad_norm": 0.10477142781019211, + "learning_rate": 3.6923522079772086e-05, + "loss": 0.0184, + "step": 35750 + }, + { + "epoch": 0.2643328109754295, + "grad_norm": 0.103655606508255, + "learning_rate": 3.6919812440645775e-05, + "loss": 0.0204, + "step": 35760 + }, + { + "epoch": 0.26440672954673133, + "grad_norm": 0.09836665540933609, + "learning_rate": 3.691610280151947e-05, + "loss": 0.0199, + "step": 35770 + }, + { + "epoch": 
0.2644806481180332, + "grad_norm": 0.07217530161142349, + "learning_rate": 3.691239316239317e-05, + "loss": 0.021, + "step": 35780 + }, + { + "epoch": 0.264554566689335, + "grad_norm": 0.07730984687805176, + "learning_rate": 3.6908683523266856e-05, + "loss": 0.0208, + "step": 35790 + }, + { + "epoch": 0.26462848526063687, + "grad_norm": 0.09929768741130829, + "learning_rate": 3.690497388414055e-05, + "loss": 0.019, + "step": 35800 + }, + { + "epoch": 0.2647024038319387, + "grad_norm": 0.09074138849973679, + "learning_rate": 3.690126424501424e-05, + "loss": 0.0196, + "step": 35810 + }, + { + "epoch": 0.2647763224032406, + "grad_norm": 0.10163474828004837, + "learning_rate": 3.6897554605887944e-05, + "loss": 0.0182, + "step": 35820 + }, + { + "epoch": 0.26485024097454246, + "grad_norm": 0.08178673684597015, + "learning_rate": 3.689384496676164e-05, + "loss": 0.0172, + "step": 35830 + }, + { + "epoch": 0.2649241595458443, + "grad_norm": 0.09935182332992554, + "learning_rate": 3.689013532763533e-05, + "loss": 0.019, + "step": 35840 + }, + { + "epoch": 0.26499807811714615, + "grad_norm": 0.08761539310216904, + "learning_rate": 3.6886425688509025e-05, + "loss": 0.0227, + "step": 35850 + }, + { + "epoch": 0.265071996688448, + "grad_norm": 0.0659652128815651, + "learning_rate": 3.6882716049382714e-05, + "loss": 0.02, + "step": 35860 + }, + { + "epoch": 0.26514591525974984, + "grad_norm": 0.073598213493824, + "learning_rate": 3.687900641025641e-05, + "loss": 0.0177, + "step": 35870 + }, + { + "epoch": 0.2652198338310517, + "grad_norm": 0.104554682970047, + "learning_rate": 3.6875296771130106e-05, + "loss": 0.0196, + "step": 35880 + }, + { + "epoch": 0.2652937524023536, + "grad_norm": 0.06133726239204407, + "learning_rate": 3.6871587132003795e-05, + "loss": 0.0168, + "step": 35890 + }, + { + "epoch": 0.26536767097365543, + "grad_norm": 0.10068414360284805, + "learning_rate": 3.68678774928775e-05, + "loss": 0.0225, + "step": 35900 + }, + { + "epoch": 0.2654415895449573, + "grad_norm": 0.08246283233165741, + "learning_rate": 3.6864167853751194e-05, + "loss": 0.018, + "step": 35910 + }, + { + "epoch": 0.2655155081162591, + "grad_norm": 0.06949212402105331, + "learning_rate": 3.686045821462488e-05, + "loss": 0.0182, + "step": 35920 + }, + { + "epoch": 0.26558942668756097, + "grad_norm": 0.09852777421474457, + "learning_rate": 3.685674857549858e-05, + "loss": 0.0187, + "step": 35930 + }, + { + "epoch": 0.2656633452588628, + "grad_norm": 0.0703735500574112, + "learning_rate": 3.685303893637227e-05, + "loss": 0.02, + "step": 35940 + }, + { + "epoch": 0.2657372638301647, + "grad_norm": 0.11955846846103668, + "learning_rate": 3.6849329297245964e-05, + "loss": 0.0206, + "step": 35950 + }, + { + "epoch": 0.26581118240146656, + "grad_norm": 0.08075056225061417, + "learning_rate": 3.684561965811966e-05, + "loss": 0.0187, + "step": 35960 + }, + { + "epoch": 0.2658851009727684, + "grad_norm": 0.08222194015979767, + "learning_rate": 3.6841910018993356e-05, + "loss": 0.0175, + "step": 35970 + }, + { + "epoch": 0.26595901954407025, + "grad_norm": 0.10614724457263947, + "learning_rate": 3.683820037986705e-05, + "loss": 0.0207, + "step": 35980 + }, + { + "epoch": 0.2660329381153721, + "grad_norm": 0.09781161695718765, + "learning_rate": 3.683449074074074e-05, + "loss": 0.0203, + "step": 35990 + }, + { + "epoch": 0.26610685668667394, + "grad_norm": 0.08541706204414368, + "learning_rate": 3.683078110161444e-05, + "loss": 0.0184, + "step": 36000 + }, + { + "epoch": 0.2661807752579758, + "grad_norm": 0.09494499117136002, + 
"learning_rate": 3.6827071462488133e-05, + "loss": 0.0229, + "step": 36010 + }, + { + "epoch": 0.2662546938292777, + "grad_norm": 0.08918741345405579, + "learning_rate": 3.682336182336182e-05, + "loss": 0.0221, + "step": 36020 + }, + { + "epoch": 0.26632861240057953, + "grad_norm": 0.07742021977901459, + "learning_rate": 3.681965218423552e-05, + "loss": 0.0208, + "step": 36030 + }, + { + "epoch": 0.2664025309718814, + "grad_norm": 0.09315332770347595, + "learning_rate": 3.681594254510921e-05, + "loss": 0.0179, + "step": 36040 + }, + { + "epoch": 0.2664764495431832, + "grad_norm": 0.10973034054040909, + "learning_rate": 3.681223290598291e-05, + "loss": 0.0163, + "step": 36050 + }, + { + "epoch": 0.26655036811448507, + "grad_norm": 0.157131165266037, + "learning_rate": 3.6808523266856606e-05, + "loss": 0.0193, + "step": 36060 + }, + { + "epoch": 0.2666242866857869, + "grad_norm": 0.11281372606754303, + "learning_rate": 3.6804813627730296e-05, + "loss": 0.0201, + "step": 36070 + }, + { + "epoch": 0.2666982052570888, + "grad_norm": 0.08388475328683853, + "learning_rate": 3.680110398860399e-05, + "loss": 0.0187, + "step": 36080 + }, + { + "epoch": 0.26677212382839066, + "grad_norm": 0.11188846081495285, + "learning_rate": 3.679739434947768e-05, + "loss": 0.0178, + "step": 36090 + }, + { + "epoch": 0.2668460423996925, + "grad_norm": 0.07050701230764389, + "learning_rate": 3.679368471035138e-05, + "loss": 0.0155, + "step": 36100 + }, + { + "epoch": 0.26691996097099435, + "grad_norm": 0.09294600039720535, + "learning_rate": 3.678997507122507e-05, + "loss": 0.0162, + "step": 36110 + }, + { + "epoch": 0.2669938795422962, + "grad_norm": 0.1007222980260849, + "learning_rate": 3.678626543209877e-05, + "loss": 0.019, + "step": 36120 + }, + { + "epoch": 0.26706779811359804, + "grad_norm": 0.06697934865951538, + "learning_rate": 3.6782555792972465e-05, + "loss": 0.0204, + "step": 36130 + }, + { + "epoch": 0.2671417166848999, + "grad_norm": 0.07020814716815948, + "learning_rate": 3.677884615384616e-05, + "loss": 0.0203, + "step": 36140 + }, + { + "epoch": 0.2672156352562018, + "grad_norm": 0.0640861839056015, + "learning_rate": 3.677513651471985e-05, + "loss": 0.0198, + "step": 36150 + }, + { + "epoch": 0.26728955382750363, + "grad_norm": 0.08898360282182693, + "learning_rate": 3.6771426875593546e-05, + "loss": 0.0193, + "step": 36160 + }, + { + "epoch": 0.2673634723988055, + "grad_norm": 0.06888226419687271, + "learning_rate": 3.6767717236467235e-05, + "loss": 0.0182, + "step": 36170 + }, + { + "epoch": 0.2674373909701073, + "grad_norm": 0.09916161000728607, + "learning_rate": 3.676400759734093e-05, + "loss": 0.0194, + "step": 36180 + }, + { + "epoch": 0.26751130954140917, + "grad_norm": 0.08564165979623795, + "learning_rate": 3.676029795821463e-05, + "loss": 0.0211, + "step": 36190 + }, + { + "epoch": 0.267585228112711, + "grad_norm": 0.06861856579780579, + "learning_rate": 3.675658831908832e-05, + "loss": 0.0203, + "step": 36200 + }, + { + "epoch": 0.2676591466840129, + "grad_norm": 0.06755343079566956, + "learning_rate": 3.675287867996202e-05, + "loss": 0.02, + "step": 36210 + }, + { + "epoch": 0.26773306525531476, + "grad_norm": 0.0989122986793518, + "learning_rate": 3.674916904083571e-05, + "loss": 0.0172, + "step": 36220 + }, + { + "epoch": 0.2678069838266166, + "grad_norm": 0.09048120677471161, + "learning_rate": 3.6745459401709404e-05, + "loss": 0.0199, + "step": 36230 + }, + { + "epoch": 0.26788090239791845, + "grad_norm": 0.07613878697156906, + "learning_rate": 3.67417497625831e-05, + "loss": 
0.0189, + "step": 36240 + }, + { + "epoch": 0.2679548209692203, + "grad_norm": 0.08191327750682831, + "learning_rate": 3.673804012345679e-05, + "loss": 0.0185, + "step": 36250 + }, + { + "epoch": 0.26802873954052214, + "grad_norm": 0.06279822438955307, + "learning_rate": 3.6734330484330485e-05, + "loss": 0.0169, + "step": 36260 + }, + { + "epoch": 0.268102658111824, + "grad_norm": 0.07203217595815659, + "learning_rate": 3.673062084520418e-05, + "loss": 0.0191, + "step": 36270 + }, + { + "epoch": 0.2681765766831259, + "grad_norm": 0.10490775853395462, + "learning_rate": 3.672691120607788e-05, + "loss": 0.0186, + "step": 36280 + }, + { + "epoch": 0.26825049525442773, + "grad_norm": 0.1026383489370346, + "learning_rate": 3.672320156695157e-05, + "loss": 0.0197, + "step": 36290 + }, + { + "epoch": 0.2683244138257296, + "grad_norm": 0.07934411615133286, + "learning_rate": 3.671949192782526e-05, + "loss": 0.0179, + "step": 36300 + }, + { + "epoch": 0.2683983323970314, + "grad_norm": 0.07977404445409775, + "learning_rate": 3.671578228869896e-05, + "loss": 0.0181, + "step": 36310 + }, + { + "epoch": 0.26847225096833327, + "grad_norm": 0.10161271691322327, + "learning_rate": 3.671207264957265e-05, + "loss": 0.0204, + "step": 36320 + }, + { + "epoch": 0.2685461695396351, + "grad_norm": 0.07428612560033798, + "learning_rate": 3.670836301044634e-05, + "loss": 0.0162, + "step": 36330 + }, + { + "epoch": 0.268620088110937, + "grad_norm": 0.05228397622704506, + "learning_rate": 3.670465337132004e-05, + "loss": 0.0192, + "step": 36340 + }, + { + "epoch": 0.26869400668223886, + "grad_norm": 0.0897165909409523, + "learning_rate": 3.6700943732193735e-05, + "loss": 0.0188, + "step": 36350 + }, + { + "epoch": 0.2687679252535407, + "grad_norm": 0.09100687503814697, + "learning_rate": 3.669723409306743e-05, + "loss": 0.0192, + "step": 36360 + }, + { + "epoch": 0.26884184382484255, + "grad_norm": 0.12539775669574738, + "learning_rate": 3.669352445394113e-05, + "loss": 0.0173, + "step": 36370 + }, + { + "epoch": 0.2689157623961444, + "grad_norm": 0.10039008408784866, + "learning_rate": 3.6689814814814816e-05, + "loss": 0.0211, + "step": 36380 + }, + { + "epoch": 0.26898968096744624, + "grad_norm": 0.09370112419128418, + "learning_rate": 3.668610517568851e-05, + "loss": 0.0172, + "step": 36390 + }, + { + "epoch": 0.2690635995387481, + "grad_norm": 0.11496493220329285, + "learning_rate": 3.66823955365622e-05, + "loss": 0.0168, + "step": 36400 + }, + { + "epoch": 0.26913751811005, + "grad_norm": 0.07878755778074265, + "learning_rate": 3.66786858974359e-05, + "loss": 0.0183, + "step": 36410 + }, + { + "epoch": 0.26921143668135183, + "grad_norm": 0.09112108498811722, + "learning_rate": 3.6674976258309594e-05, + "loss": 0.0192, + "step": 36420 + }, + { + "epoch": 0.2692853552526537, + "grad_norm": 0.10041724890470505, + "learning_rate": 3.667126661918329e-05, + "loss": 0.0195, + "step": 36430 + }, + { + "epoch": 0.2693592738239555, + "grad_norm": 0.10924817621707916, + "learning_rate": 3.6667556980056985e-05, + "loss": 0.0206, + "step": 36440 + }, + { + "epoch": 0.26943319239525737, + "grad_norm": 0.09754710644483566, + "learning_rate": 3.6663847340930675e-05, + "loss": 0.0183, + "step": 36450 + }, + { + "epoch": 0.2695071109665592, + "grad_norm": 0.09620609879493713, + "learning_rate": 3.666013770180437e-05, + "loss": 0.0193, + "step": 36460 + }, + { + "epoch": 0.2695810295378611, + "grad_norm": 0.09440019726753235, + "learning_rate": 3.6656428062678067e-05, + "loss": 0.0175, + "step": 36470 + }, + { + "epoch": 
0.26965494810916296, + "grad_norm": 0.07512184977531433, + "learning_rate": 3.6652718423551756e-05, + "loss": 0.0198, + "step": 36480 + }, + { + "epoch": 0.2697288666804648, + "grad_norm": 0.08837522566318512, + "learning_rate": 3.664900878442545e-05, + "loss": 0.0201, + "step": 36490 + }, + { + "epoch": 0.26980278525176665, + "grad_norm": 0.16763469576835632, + "learning_rate": 3.664529914529915e-05, + "loss": 0.0219, + "step": 36500 + }, + { + "epoch": 0.2698767038230685, + "grad_norm": 0.1020568385720253, + "learning_rate": 3.6641589506172844e-05, + "loss": 0.0194, + "step": 36510 + }, + { + "epoch": 0.26995062239437034, + "grad_norm": 0.11948026716709137, + "learning_rate": 3.663787986704654e-05, + "loss": 0.0189, + "step": 36520 + }, + { + "epoch": 0.2700245409656722, + "grad_norm": 0.07943316549062729, + "learning_rate": 3.663417022792023e-05, + "loss": 0.0203, + "step": 36530 + }, + { + "epoch": 0.2700984595369741, + "grad_norm": 0.2049197405576706, + "learning_rate": 3.6630460588793925e-05, + "loss": 0.0192, + "step": 36540 + }, + { + "epoch": 0.27017237810827593, + "grad_norm": 0.09560728073120117, + "learning_rate": 3.6626750949667614e-05, + "loss": 0.0193, + "step": 36550 + }, + { + "epoch": 0.2702462966795778, + "grad_norm": 0.09262557327747345, + "learning_rate": 3.662304131054131e-05, + "loss": 0.02, + "step": 36560 + }, + { + "epoch": 0.2703202152508796, + "grad_norm": 0.13921691477298737, + "learning_rate": 3.6619331671415006e-05, + "loss": 0.0182, + "step": 36570 + }, + { + "epoch": 0.27039413382218147, + "grad_norm": 0.06821108609437943, + "learning_rate": 3.66156220322887e-05, + "loss": 0.0196, + "step": 36580 + }, + { + "epoch": 0.2704680523934833, + "grad_norm": 0.08378107100725174, + "learning_rate": 3.66119123931624e-05, + "loss": 0.0197, + "step": 36590 + }, + { + "epoch": 0.2705419709647852, + "grad_norm": 0.09226445853710175, + "learning_rate": 3.6608202754036094e-05, + "loss": 0.0187, + "step": 36600 + }, + { + "epoch": 0.27061588953608706, + "grad_norm": 0.07749415934085846, + "learning_rate": 3.660449311490978e-05, + "loss": 0.0191, + "step": 36610 + }, + { + "epoch": 0.2706898081073889, + "grad_norm": 0.06596098095178604, + "learning_rate": 3.660078347578348e-05, + "loss": 0.0179, + "step": 36620 + }, + { + "epoch": 0.27076372667869075, + "grad_norm": 0.12515129148960114, + "learning_rate": 3.659707383665717e-05, + "loss": 0.0242, + "step": 36630 + }, + { + "epoch": 0.2708376452499926, + "grad_norm": 0.12110026925802231, + "learning_rate": 3.6593364197530864e-05, + "loss": 0.0204, + "step": 36640 + }, + { + "epoch": 0.27091156382129444, + "grad_norm": 0.0709238350391388, + "learning_rate": 3.658965455840456e-05, + "loss": 0.0197, + "step": 36650 + }, + { + "epoch": 0.2709854823925963, + "grad_norm": 0.07414538413286209, + "learning_rate": 3.6585944919278256e-05, + "loss": 0.0213, + "step": 36660 + }, + { + "epoch": 0.2710594009638982, + "grad_norm": 0.07066690921783447, + "learning_rate": 3.658223528015195e-05, + "loss": 0.0189, + "step": 36670 + }, + { + "epoch": 0.27113331953520003, + "grad_norm": 0.08719604462385178, + "learning_rate": 3.657852564102564e-05, + "loss": 0.0179, + "step": 36680 + }, + { + "epoch": 0.2712072381065019, + "grad_norm": 0.08157003670930862, + "learning_rate": 3.657481600189934e-05, + "loss": 0.017, + "step": 36690 + }, + { + "epoch": 0.2712811566778037, + "grad_norm": 0.07565820217132568, + "learning_rate": 3.657110636277303e-05, + "loss": 0.0172, + "step": 36700 + }, + { + "epoch": 0.27135507524910557, + "grad_norm": 
0.13831768929958344, + "learning_rate": 3.656739672364672e-05, + "loss": 0.0219, + "step": 36710 + }, + { + "epoch": 0.2714289938204074, + "grad_norm": 0.09268336743116379, + "learning_rate": 3.656368708452042e-05, + "loss": 0.0169, + "step": 36720 + }, + { + "epoch": 0.2715029123917093, + "grad_norm": 0.08384677022695541, + "learning_rate": 3.6559977445394114e-05, + "loss": 0.0228, + "step": 36730 + }, + { + "epoch": 0.27157683096301116, + "grad_norm": 0.09165684878826141, + "learning_rate": 3.655626780626781e-05, + "loss": 0.0208, + "step": 36740 + }, + { + "epoch": 0.271650749534313, + "grad_norm": 0.07229489833116531, + "learning_rate": 3.6552558167141506e-05, + "loss": 0.0211, + "step": 36750 + }, + { + "epoch": 0.27172466810561485, + "grad_norm": 0.08151846379041672, + "learning_rate": 3.6548848528015195e-05, + "loss": 0.0168, + "step": 36760 + }, + { + "epoch": 0.2717985866769167, + "grad_norm": 0.0799284502863884, + "learning_rate": 3.654513888888889e-05, + "loss": 0.0174, + "step": 36770 + }, + { + "epoch": 0.27187250524821854, + "grad_norm": 0.11195094883441925, + "learning_rate": 3.654142924976258e-05, + "loss": 0.018, + "step": 36780 + }, + { + "epoch": 0.2719464238195204, + "grad_norm": 0.06137610971927643, + "learning_rate": 3.6537719610636277e-05, + "loss": 0.0194, + "step": 36790 + }, + { + "epoch": 0.2720203423908223, + "grad_norm": 0.10252392292022705, + "learning_rate": 3.653400997150997e-05, + "loss": 0.0169, + "step": 36800 + }, + { + "epoch": 0.27209426096212413, + "grad_norm": 0.0966234877705574, + "learning_rate": 3.653030033238367e-05, + "loss": 0.0178, + "step": 36810 + }, + { + "epoch": 0.272168179533426, + "grad_norm": 0.06209605187177658, + "learning_rate": 3.6526590693257364e-05, + "loss": 0.0201, + "step": 36820 + }, + { + "epoch": 0.2722420981047278, + "grad_norm": 0.0901181623339653, + "learning_rate": 3.652288105413106e-05, + "loss": 0.0178, + "step": 36830 + }, + { + "epoch": 0.27231601667602967, + "grad_norm": 0.11750251799821854, + "learning_rate": 3.651917141500475e-05, + "loss": 0.0179, + "step": 36840 + }, + { + "epoch": 0.2723899352473315, + "grad_norm": 0.10816629230976105, + "learning_rate": 3.6515461775878446e-05, + "loss": 0.022, + "step": 36850 + }, + { + "epoch": 0.2724638538186334, + "grad_norm": 0.06723657250404358, + "learning_rate": 3.6511752136752135e-05, + "loss": 0.0176, + "step": 36860 + }, + { + "epoch": 0.27253777238993526, + "grad_norm": 0.10735520720481873, + "learning_rate": 3.650804249762583e-05, + "loss": 0.0199, + "step": 36870 + }, + { + "epoch": 0.2726116909612371, + "grad_norm": 0.0668816789984703, + "learning_rate": 3.650433285849953e-05, + "loss": 0.0207, + "step": 36880 + }, + { + "epoch": 0.27268560953253895, + "grad_norm": 0.07750372588634491, + "learning_rate": 3.650062321937322e-05, + "loss": 0.0185, + "step": 36890 + }, + { + "epoch": 0.2727595281038408, + "grad_norm": 0.08497249335050583, + "learning_rate": 3.649691358024692e-05, + "loss": 0.0189, + "step": 36900 + }, + { + "epoch": 0.27283344667514264, + "grad_norm": 0.08565185219049454, + "learning_rate": 3.649320394112061e-05, + "loss": 0.0197, + "step": 36910 + }, + { + "epoch": 0.27290736524644454, + "grad_norm": 0.07891131937503815, + "learning_rate": 3.6489494301994304e-05, + "loss": 0.0182, + "step": 36920 + }, + { + "epoch": 0.2729812838177464, + "grad_norm": 0.07422411441802979, + "learning_rate": 3.6485784662868e-05, + "loss": 0.0177, + "step": 36930 + }, + { + "epoch": 0.27305520238904823, + "grad_norm": 0.11000849306583405, + "learning_rate": 
3.648207502374169e-05, + "loss": 0.0187, + "step": 36940 + }, + { + "epoch": 0.2731291209603501, + "grad_norm": 0.08401288837194443, + "learning_rate": 3.6478365384615385e-05, + "loss": 0.0206, + "step": 36950 + }, + { + "epoch": 0.2732030395316519, + "grad_norm": 0.12165091186761856, + "learning_rate": 3.647465574548908e-05, + "loss": 0.0198, + "step": 36960 + }, + { + "epoch": 0.27327695810295377, + "grad_norm": 0.10981786251068115, + "learning_rate": 3.647094610636278e-05, + "loss": 0.0195, + "step": 36970 + }, + { + "epoch": 0.2733508766742556, + "grad_norm": 0.09434209018945694, + "learning_rate": 3.646723646723647e-05, + "loss": 0.0182, + "step": 36980 + }, + { + "epoch": 0.2734247952455575, + "grad_norm": 0.17333875596523285, + "learning_rate": 3.646352682811016e-05, + "loss": 0.0197, + "step": 36990 + }, + { + "epoch": 0.27349871381685936, + "grad_norm": 0.09001073986291885, + "learning_rate": 3.645981718898386e-05, + "loss": 0.0201, + "step": 37000 + }, + { + "epoch": 0.2735726323881612, + "grad_norm": 0.08942365646362305, + "learning_rate": 3.645610754985755e-05, + "loss": 0.0236, + "step": 37010 + }, + { + "epoch": 0.27364655095946305, + "grad_norm": 0.08353909850120544, + "learning_rate": 3.645239791073124e-05, + "loss": 0.0175, + "step": 37020 + }, + { + "epoch": 0.2737204695307649, + "grad_norm": 0.08675676584243774, + "learning_rate": 3.644868827160494e-05, + "loss": 0.0167, + "step": 37030 + }, + { + "epoch": 0.27379438810206674, + "grad_norm": 0.12086187303066254, + "learning_rate": 3.6444978632478635e-05, + "loss": 0.022, + "step": 37040 + }, + { + "epoch": 0.27386830667336864, + "grad_norm": 0.11000093817710876, + "learning_rate": 3.644126899335233e-05, + "loss": 0.022, + "step": 37050 + }, + { + "epoch": 0.2739422252446705, + "grad_norm": 0.16909393668174744, + "learning_rate": 3.643755935422603e-05, + "loss": 0.0188, + "step": 37060 + }, + { + "epoch": 0.27401614381597234, + "grad_norm": 0.07956766337156296, + "learning_rate": 3.6433849715099716e-05, + "loss": 0.0206, + "step": 37070 + }, + { + "epoch": 0.2740900623872742, + "grad_norm": 0.10060791671276093, + "learning_rate": 3.643014007597341e-05, + "loss": 0.0184, + "step": 37080 + }, + { + "epoch": 0.274163980958576, + "grad_norm": 0.10526014119386673, + "learning_rate": 3.64264304368471e-05, + "loss": 0.0192, + "step": 37090 + }, + { + "epoch": 0.27423789952987787, + "grad_norm": 0.080055370926857, + "learning_rate": 3.64227207977208e-05, + "loss": 0.0191, + "step": 37100 + }, + { + "epoch": 0.2743118181011797, + "grad_norm": 0.12234311550855637, + "learning_rate": 3.641901115859449e-05, + "loss": 0.0184, + "step": 37110 + }, + { + "epoch": 0.2743857366724816, + "grad_norm": 0.08261439949274063, + "learning_rate": 3.641530151946819e-05, + "loss": 0.0171, + "step": 37120 + }, + { + "epoch": 0.27445965524378346, + "grad_norm": 0.07762432843446732, + "learning_rate": 3.6411591880341885e-05, + "loss": 0.0177, + "step": 37130 + }, + { + "epoch": 0.2745335738150853, + "grad_norm": 0.1242787316441536, + "learning_rate": 3.6407882241215574e-05, + "loss": 0.0215, + "step": 37140 + }, + { + "epoch": 0.27460749238638715, + "grad_norm": 0.1280965805053711, + "learning_rate": 3.640417260208927e-05, + "loss": 0.0171, + "step": 37150 + }, + { + "epoch": 0.274681410957689, + "grad_norm": 0.058584120124578476, + "learning_rate": 3.6400462962962966e-05, + "loss": 0.0214, + "step": 37160 + }, + { + "epoch": 0.27475532952899084, + "grad_norm": 0.07477357238531113, + "learning_rate": 3.6396753323836656e-05, + "loss": 0.0196, + "step": 
37170 + }, + { + "epoch": 0.27482924810029274, + "grad_norm": 0.10592483729124069, + "learning_rate": 3.639304368471035e-05, + "loss": 0.0194, + "step": 37180 + }, + { + "epoch": 0.2749031666715946, + "grad_norm": 0.09464053064584732, + "learning_rate": 3.638933404558405e-05, + "loss": 0.0217, + "step": 37190 + }, + { + "epoch": 0.27497708524289644, + "grad_norm": 0.09078387916088104, + "learning_rate": 3.6385624406457743e-05, + "loss": 0.02, + "step": 37200 + }, + { + "epoch": 0.2750510038141983, + "grad_norm": 0.1125599816441536, + "learning_rate": 3.638191476733144e-05, + "loss": 0.0182, + "step": 37210 + }, + { + "epoch": 0.2751249223855001, + "grad_norm": 0.08146911859512329, + "learning_rate": 3.637820512820513e-05, + "loss": 0.0216, + "step": 37220 + }, + { + "epoch": 0.27519884095680197, + "grad_norm": 0.07568617165088654, + "learning_rate": 3.6374495489078825e-05, + "loss": 0.0182, + "step": 37230 + }, + { + "epoch": 0.2752727595281038, + "grad_norm": 0.08872109651565552, + "learning_rate": 3.6370785849952514e-05, + "loss": 0.0184, + "step": 37240 + }, + { + "epoch": 0.2753466780994057, + "grad_norm": 0.07912348955869675, + "learning_rate": 3.636707621082621e-05, + "loss": 0.0188, + "step": 37250 + }, + { + "epoch": 0.27542059667070756, + "grad_norm": 0.09789282828569412, + "learning_rate": 3.6363366571699906e-05, + "loss": 0.0206, + "step": 37260 + }, + { + "epoch": 0.2754945152420094, + "grad_norm": 0.07566897571086884, + "learning_rate": 3.63596569325736e-05, + "loss": 0.0213, + "step": 37270 + }, + { + "epoch": 0.27556843381331125, + "grad_norm": 0.09172476083040237, + "learning_rate": 3.63559472934473e-05, + "loss": 0.0178, + "step": 37280 + }, + { + "epoch": 0.2756423523846131, + "grad_norm": 0.08475707471370697, + "learning_rate": 3.6352237654320994e-05, + "loss": 0.0189, + "step": 37290 + }, + { + "epoch": 0.27571627095591494, + "grad_norm": 0.08507103472948074, + "learning_rate": 3.634852801519468e-05, + "loss": 0.0201, + "step": 37300 + }, + { + "epoch": 0.27579018952721684, + "grad_norm": 0.10753806680440903, + "learning_rate": 3.634481837606838e-05, + "loss": 0.0186, + "step": 37310 + }, + { + "epoch": 0.2758641080985187, + "grad_norm": 0.0725020319223404, + "learning_rate": 3.634110873694207e-05, + "loss": 0.0194, + "step": 37320 + }, + { + "epoch": 0.27593802666982054, + "grad_norm": 0.0779128149151802, + "learning_rate": 3.6337399097815764e-05, + "loss": 0.0182, + "step": 37330 + }, + { + "epoch": 0.2760119452411224, + "grad_norm": 0.07879126816987991, + "learning_rate": 3.633368945868946e-05, + "loss": 0.0192, + "step": 37340 + }, + { + "epoch": 0.2760858638124242, + "grad_norm": 0.07782237976789474, + "learning_rate": 3.6329979819563156e-05, + "loss": 0.0184, + "step": 37350 + }, + { + "epoch": 0.27615978238372607, + "grad_norm": 0.07219192385673523, + "learning_rate": 3.632627018043685e-05, + "loss": 0.019, + "step": 37360 + }, + { + "epoch": 0.2762337009550279, + "grad_norm": 0.07820811122655869, + "learning_rate": 3.632256054131054e-05, + "loss": 0.0219, + "step": 37370 + }, + { + "epoch": 0.2763076195263298, + "grad_norm": 0.08021660149097443, + "learning_rate": 3.631885090218424e-05, + "loss": 0.0175, + "step": 37380 + }, + { + "epoch": 0.27638153809763166, + "grad_norm": 0.0879962146282196, + "learning_rate": 3.631514126305793e-05, + "loss": 0.0183, + "step": 37390 + }, + { + "epoch": 0.2764554566689335, + "grad_norm": 0.09419143944978714, + "learning_rate": 3.631143162393162e-05, + "loss": 0.0177, + "step": 37400 + }, + { + "epoch": 0.27652937524023535, + 
"grad_norm": 0.07984619587659836, + "learning_rate": 3.630772198480532e-05, + "loss": 0.0173, + "step": 37410 + }, + { + "epoch": 0.2766032938115372, + "grad_norm": 0.09308286011219025, + "learning_rate": 3.6304012345679014e-05, + "loss": 0.0198, + "step": 37420 + }, + { + "epoch": 0.27667721238283904, + "grad_norm": 0.09586650133132935, + "learning_rate": 3.630030270655271e-05, + "loss": 0.0163, + "step": 37430 + }, + { + "epoch": 0.27675113095414094, + "grad_norm": 0.10913722962141037, + "learning_rate": 3.6296593067426406e-05, + "loss": 0.0216, + "step": 37440 + }, + { + "epoch": 0.2768250495254428, + "grad_norm": 0.07150280475616455, + "learning_rate": 3.6292883428300095e-05, + "loss": 0.0197, + "step": 37450 + }, + { + "epoch": 0.27689896809674464, + "grad_norm": 0.0947035625576973, + "learning_rate": 3.628917378917379e-05, + "loss": 0.0185, + "step": 37460 + }, + { + "epoch": 0.2769728866680465, + "grad_norm": 0.07512058317661285, + "learning_rate": 3.628546415004749e-05, + "loss": 0.0193, + "step": 37470 + }, + { + "epoch": 0.2770468052393483, + "grad_norm": 0.07108630985021591, + "learning_rate": 3.6281754510921176e-05, + "loss": 0.0188, + "step": 37480 + }, + { + "epoch": 0.27712072381065017, + "grad_norm": 0.08340652287006378, + "learning_rate": 3.627804487179487e-05, + "loss": 0.0202, + "step": 37490 + }, + { + "epoch": 0.277194642381952, + "grad_norm": 0.09074801206588745, + "learning_rate": 3.627433523266857e-05, + "loss": 0.0181, + "step": 37500 + }, + { + "epoch": 0.2772685609532539, + "grad_norm": 0.09117832779884338, + "learning_rate": 3.6270625593542264e-05, + "loss": 0.018, + "step": 37510 + }, + { + "epoch": 0.27734247952455576, + "grad_norm": 0.06303833425045013, + "learning_rate": 3.626691595441596e-05, + "loss": 0.019, + "step": 37520 + }, + { + "epoch": 0.2774163980958576, + "grad_norm": 0.06354062259197235, + "learning_rate": 3.626320631528965e-05, + "loss": 0.0199, + "step": 37530 + }, + { + "epoch": 0.27749031666715945, + "grad_norm": 0.07251658290624619, + "learning_rate": 3.6259496676163345e-05, + "loss": 0.0162, + "step": 37540 + }, + { + "epoch": 0.2775642352384613, + "grad_norm": 0.09875177592039108, + "learning_rate": 3.6255787037037035e-05, + "loss": 0.0184, + "step": 37550 + }, + { + "epoch": 0.27763815380976314, + "grad_norm": 0.09109731763601303, + "learning_rate": 3.625207739791073e-05, + "loss": 0.0171, + "step": 37560 + }, + { + "epoch": 0.27771207238106504, + "grad_norm": 0.09304312616586685, + "learning_rate": 3.6248367758784426e-05, + "loss": 0.0169, + "step": 37570 + }, + { + "epoch": 0.2777859909523669, + "grad_norm": 0.07635471224784851, + "learning_rate": 3.624465811965812e-05, + "loss": 0.0183, + "step": 37580 + }, + { + "epoch": 0.27785990952366874, + "grad_norm": 0.10096585005521774, + "learning_rate": 3.624094848053182e-05, + "loss": 0.018, + "step": 37590 + }, + { + "epoch": 0.2779338280949706, + "grad_norm": 0.09930157661437988, + "learning_rate": 3.623723884140551e-05, + "loss": 0.0194, + "step": 37600 + }, + { + "epoch": 0.2780077466662724, + "grad_norm": 0.10406079143285751, + "learning_rate": 3.6233529202279204e-05, + "loss": 0.0177, + "step": 37610 + }, + { + "epoch": 0.27808166523757427, + "grad_norm": 0.05847761034965515, + "learning_rate": 3.62298195631529e-05, + "loss": 0.0164, + "step": 37620 + }, + { + "epoch": 0.2781555838088761, + "grad_norm": 0.07200673967599869, + "learning_rate": 3.622610992402659e-05, + "loss": 0.0189, + "step": 37630 + }, + { + "epoch": 0.278229502380178, + "grad_norm": 0.0837259292602539, + 
"learning_rate": 3.6222400284900285e-05, + "loss": 0.0186, + "step": 37640 + }, + { + "epoch": 0.27830342095147986, + "grad_norm": 0.07048111408948898, + "learning_rate": 3.621869064577398e-05, + "loss": 0.0176, + "step": 37650 + }, + { + "epoch": 0.2783773395227817, + "grad_norm": 0.12101917713880539, + "learning_rate": 3.6214981006647677e-05, + "loss": 0.0199, + "step": 37660 + }, + { + "epoch": 0.27845125809408355, + "grad_norm": 0.09494868665933609, + "learning_rate": 3.621127136752137e-05, + "loss": 0.0211, + "step": 37670 + }, + { + "epoch": 0.2785251766653854, + "grad_norm": 0.10044016689062119, + "learning_rate": 3.620756172839506e-05, + "loss": 0.0208, + "step": 37680 + }, + { + "epoch": 0.27859909523668724, + "grad_norm": 0.0923566222190857, + "learning_rate": 3.620385208926876e-05, + "loss": 0.0203, + "step": 37690 + }, + { + "epoch": 0.27867301380798915, + "grad_norm": 0.08192206919193268, + "learning_rate": 3.6200142450142454e-05, + "loss": 0.0179, + "step": 37700 + }, + { + "epoch": 0.278746932379291, + "grad_norm": 0.10374264419078827, + "learning_rate": 3.619643281101614e-05, + "loss": 0.0215, + "step": 37710 + }, + { + "epoch": 0.27882085095059284, + "grad_norm": 0.07484060525894165, + "learning_rate": 3.619272317188984e-05, + "loss": 0.02, + "step": 37720 + }, + { + "epoch": 0.2788947695218947, + "grad_norm": 0.09017164260149002, + "learning_rate": 3.6189013532763535e-05, + "loss": 0.0178, + "step": 37730 + }, + { + "epoch": 0.2789686880931965, + "grad_norm": 0.08458402752876282, + "learning_rate": 3.618530389363723e-05, + "loss": 0.0195, + "step": 37740 + }, + { + "epoch": 0.27904260666449837, + "grad_norm": 0.06826536357402802, + "learning_rate": 3.618159425451093e-05, + "loss": 0.0194, + "step": 37750 + }, + { + "epoch": 0.2791165252358002, + "grad_norm": 0.08888979256153107, + "learning_rate": 3.6177884615384616e-05, + "loss": 0.0203, + "step": 37760 + }, + { + "epoch": 0.2791904438071021, + "grad_norm": 0.11521486937999725, + "learning_rate": 3.617417497625831e-05, + "loss": 0.0218, + "step": 37770 + }, + { + "epoch": 0.27926436237840396, + "grad_norm": 0.07831291854381561, + "learning_rate": 3.6170465337132e-05, + "loss": 0.0212, + "step": 37780 + }, + { + "epoch": 0.2793382809497058, + "grad_norm": 0.10350217670202255, + "learning_rate": 3.61667556980057e-05, + "loss": 0.0196, + "step": 37790 + }, + { + "epoch": 0.27941219952100765, + "grad_norm": 0.10121927410364151, + "learning_rate": 3.616304605887939e-05, + "loss": 0.0186, + "step": 37800 + }, + { + "epoch": 0.2794861180923095, + "grad_norm": 0.08976984024047852, + "learning_rate": 3.615933641975309e-05, + "loss": 0.0199, + "step": 37810 + }, + { + "epoch": 0.27956003666361134, + "grad_norm": 0.09638135880231857, + "learning_rate": 3.6155626780626785e-05, + "loss": 0.0204, + "step": 37820 + }, + { + "epoch": 0.27963395523491325, + "grad_norm": 0.09168626368045807, + "learning_rate": 3.6151917141500474e-05, + "loss": 0.0197, + "step": 37830 + }, + { + "epoch": 0.2797078738062151, + "grad_norm": 0.0909101665019989, + "learning_rate": 3.614820750237417e-05, + "loss": 0.0203, + "step": 37840 + }, + { + "epoch": 0.27978179237751694, + "grad_norm": 0.10678108781576157, + "learning_rate": 3.6144497863247866e-05, + "loss": 0.0186, + "step": 37850 + }, + { + "epoch": 0.2798557109488188, + "grad_norm": 0.09719112515449524, + "learning_rate": 3.6140788224121555e-05, + "loss": 0.0193, + "step": 37860 + }, + { + "epoch": 0.2799296295201206, + "grad_norm": 0.11422860622406006, + "learning_rate": 3.613707858499525e-05, + 
"loss": 0.022, + "step": 37870 + }, + { + "epoch": 0.28000354809142247, + "grad_norm": 0.0775851309299469, + "learning_rate": 3.613336894586895e-05, + "loss": 0.0205, + "step": 37880 + }, + { + "epoch": 0.2800774666627243, + "grad_norm": 0.06096609681844711, + "learning_rate": 3.612965930674264e-05, + "loss": 0.0178, + "step": 37890 + }, + { + "epoch": 0.2801513852340262, + "grad_norm": 0.07704003155231476, + "learning_rate": 3.612594966761634e-05, + "loss": 0.0174, + "step": 37900 + }, + { + "epoch": 0.28022530380532806, + "grad_norm": 0.08237231522798538, + "learning_rate": 3.612224002849003e-05, + "loss": 0.0185, + "step": 37910 + }, + { + "epoch": 0.2802992223766299, + "grad_norm": 0.0777268186211586, + "learning_rate": 3.6118530389363724e-05, + "loss": 0.02, + "step": 37920 + }, + { + "epoch": 0.28037314094793175, + "grad_norm": 0.08269333094358444, + "learning_rate": 3.611482075023742e-05, + "loss": 0.0211, + "step": 37930 + }, + { + "epoch": 0.2804470595192336, + "grad_norm": 0.08607961237430573, + "learning_rate": 3.611111111111111e-05, + "loss": 0.0173, + "step": 37940 + }, + { + "epoch": 0.28052097809053544, + "grad_norm": 0.09240085631608963, + "learning_rate": 3.6107401471984805e-05, + "loss": 0.0189, + "step": 37950 + }, + { + "epoch": 0.28059489666183735, + "grad_norm": 0.10778304189443588, + "learning_rate": 3.61036918328585e-05, + "loss": 0.0193, + "step": 37960 + }, + { + "epoch": 0.2806688152331392, + "grad_norm": 0.07810915261507034, + "learning_rate": 3.60999821937322e-05, + "loss": 0.0186, + "step": 37970 + }, + { + "epoch": 0.28074273380444104, + "grad_norm": 0.07399895042181015, + "learning_rate": 3.609627255460589e-05, + "loss": 0.0181, + "step": 37980 + }, + { + "epoch": 0.2808166523757429, + "grad_norm": 0.0926373153924942, + "learning_rate": 3.609256291547958e-05, + "loss": 0.019, + "step": 37990 + }, + { + "epoch": 0.2808905709470447, + "grad_norm": 0.05467765033245087, + "learning_rate": 3.608885327635328e-05, + "loss": 0.021, + "step": 38000 + }, + { + "epoch": 0.28096448951834657, + "grad_norm": 0.09927170723676682, + "learning_rate": 3.608514363722697e-05, + "loss": 0.0197, + "step": 38010 + }, + { + "epoch": 0.2810384080896484, + "grad_norm": 0.09716068208217621, + "learning_rate": 3.6081433998100664e-05, + "loss": 0.0188, + "step": 38020 + }, + { + "epoch": 0.2811123266609503, + "grad_norm": 0.096413254737854, + "learning_rate": 3.607772435897436e-05, + "loss": 0.0194, + "step": 38030 + }, + { + "epoch": 0.28118624523225216, + "grad_norm": 0.08352841436862946, + "learning_rate": 3.6074014719848056e-05, + "loss": 0.0194, + "step": 38040 + }, + { + "epoch": 0.281260163803554, + "grad_norm": 0.07367278635501862, + "learning_rate": 3.607030508072175e-05, + "loss": 0.018, + "step": 38050 + }, + { + "epoch": 0.28133408237485585, + "grad_norm": 0.09048157185316086, + "learning_rate": 3.606659544159544e-05, + "loss": 0.0173, + "step": 38060 + }, + { + "epoch": 0.2814080009461577, + "grad_norm": 0.07373079657554626, + "learning_rate": 3.606288580246914e-05, + "loss": 0.0201, + "step": 38070 + }, + { + "epoch": 0.28148191951745954, + "grad_norm": 0.07853297889232635, + "learning_rate": 3.605917616334283e-05, + "loss": 0.018, + "step": 38080 + }, + { + "epoch": 0.28155583808876145, + "grad_norm": 0.10812287032604218, + "learning_rate": 3.605546652421652e-05, + "loss": 0.017, + "step": 38090 + }, + { + "epoch": 0.2816297566600633, + "grad_norm": 0.08013275265693665, + "learning_rate": 3.605175688509022e-05, + "loss": 0.0178, + "step": 38100 + }, + { + "epoch": 
0.28170367523136514, + "grad_norm": 0.08659382909536362, + "learning_rate": 3.6048047245963914e-05, + "loss": 0.0186, + "step": 38110 + }, + { + "epoch": 0.281777593802667, + "grad_norm": 0.08591907471418381, + "learning_rate": 3.604433760683761e-05, + "loss": 0.02, + "step": 38120 + }, + { + "epoch": 0.2818515123739688, + "grad_norm": 0.07610286772251129, + "learning_rate": 3.6040627967711306e-05, + "loss": 0.0153, + "step": 38130 + }, + { + "epoch": 0.28192543094527067, + "grad_norm": 0.07261490821838379, + "learning_rate": 3.6036918328584995e-05, + "loss": 0.019, + "step": 38140 + }, + { + "epoch": 0.2819993495165725, + "grad_norm": 0.10294881463050842, + "learning_rate": 3.603320868945869e-05, + "loss": 0.0183, + "step": 38150 + }, + { + "epoch": 0.2820732680878744, + "grad_norm": 0.09938614070415497, + "learning_rate": 3.602949905033239e-05, + "loss": 0.0205, + "step": 38160 + }, + { + "epoch": 0.28214718665917626, + "grad_norm": 0.07362860441207886, + "learning_rate": 3.6025789411206076e-05, + "loss": 0.021, + "step": 38170 + }, + { + "epoch": 0.2822211052304781, + "grad_norm": 0.056989945471286774, + "learning_rate": 3.602207977207977e-05, + "loss": 0.0196, + "step": 38180 + }, + { + "epoch": 0.28229502380177995, + "grad_norm": 0.18107661604881287, + "learning_rate": 3.601837013295347e-05, + "loss": 0.0174, + "step": 38190 + }, + { + "epoch": 0.2823689423730818, + "grad_norm": 0.08354820311069489, + "learning_rate": 3.6014660493827164e-05, + "loss": 0.0192, + "step": 38200 + }, + { + "epoch": 0.28244286094438364, + "grad_norm": 0.10664748400449753, + "learning_rate": 3.601095085470086e-05, + "loss": 0.0208, + "step": 38210 + }, + { + "epoch": 0.28251677951568555, + "grad_norm": 0.08411481976509094, + "learning_rate": 3.600724121557455e-05, + "loss": 0.0181, + "step": 38220 + }, + { + "epoch": 0.2825906980869874, + "grad_norm": 0.0959780141711235, + "learning_rate": 3.6003531576448245e-05, + "loss": 0.0183, + "step": 38230 + }, + { + "epoch": 0.28266461665828924, + "grad_norm": 0.05065147951245308, + "learning_rate": 3.5999821937321934e-05, + "loss": 0.0201, + "step": 38240 + }, + { + "epoch": 0.2827385352295911, + "grad_norm": 0.07555904239416122, + "learning_rate": 3.599611229819563e-05, + "loss": 0.0182, + "step": 38250 + }, + { + "epoch": 0.2828124538008929, + "grad_norm": 0.08089818060398102, + "learning_rate": 3.599240265906933e-05, + "loss": 0.02, + "step": 38260 + }, + { + "epoch": 0.28288637237219477, + "grad_norm": 0.09162278473377228, + "learning_rate": 3.598869301994302e-05, + "loss": 0.0191, + "step": 38270 + }, + { + "epoch": 0.2829602909434966, + "grad_norm": 0.08376139402389526, + "learning_rate": 3.598498338081672e-05, + "loss": 0.0163, + "step": 38280 + }, + { + "epoch": 0.2830342095147985, + "grad_norm": 0.10960046201944351, + "learning_rate": 3.598127374169041e-05, + "loss": 0.0216, + "step": 38290 + }, + { + "epoch": 0.28310812808610036, + "grad_norm": 0.08518026024103165, + "learning_rate": 3.59775641025641e-05, + "loss": 0.0205, + "step": 38300 + }, + { + "epoch": 0.2831820466574022, + "grad_norm": 0.08602633327245712, + "learning_rate": 3.59738544634378e-05, + "loss": 0.0192, + "step": 38310 + }, + { + "epoch": 0.28325596522870405, + "grad_norm": 0.091323621571064, + "learning_rate": 3.597014482431149e-05, + "loss": 0.0168, + "step": 38320 + }, + { + "epoch": 0.2833298838000059, + "grad_norm": 0.11848784238100052, + "learning_rate": 3.5966435185185184e-05, + "loss": 0.0181, + "step": 38330 + }, + { + "epoch": 0.28340380237130774, + "grad_norm": 
0.09957283735275269, + "learning_rate": 3.596272554605888e-05, + "loss": 0.0183, + "step": 38340 + }, + { + "epoch": 0.28347772094260965, + "grad_norm": 0.07813524454832077, + "learning_rate": 3.5959015906932576e-05, + "loss": 0.0196, + "step": 38350 + }, + { + "epoch": 0.2835516395139115, + "grad_norm": 0.11081357300281525, + "learning_rate": 3.595530626780627e-05, + "loss": 0.0178, + "step": 38360 + }, + { + "epoch": 0.28362555808521334, + "grad_norm": 0.08587471395730972, + "learning_rate": 3.595159662867996e-05, + "loss": 0.0214, + "step": 38370 + }, + { + "epoch": 0.2836994766565152, + "grad_norm": 0.07502496242523193, + "learning_rate": 3.594788698955366e-05, + "loss": 0.0188, + "step": 38380 + }, + { + "epoch": 0.283773395227817, + "grad_norm": 0.0899985060095787, + "learning_rate": 3.5944177350427353e-05, + "loss": 0.0185, + "step": 38390 + }, + { + "epoch": 0.28384731379911887, + "grad_norm": 0.08980336785316467, + "learning_rate": 3.594046771130104e-05, + "loss": 0.0179, + "step": 38400 + }, + { + "epoch": 0.2839212323704207, + "grad_norm": 0.10397088527679443, + "learning_rate": 3.5936758072174745e-05, + "loss": 0.0179, + "step": 38410 + }, + { + "epoch": 0.2839951509417226, + "grad_norm": 0.13703933358192444, + "learning_rate": 3.5933048433048435e-05, + "loss": 0.0192, + "step": 38420 + }, + { + "epoch": 0.28406906951302446, + "grad_norm": 0.07654924690723419, + "learning_rate": 3.592933879392213e-05, + "loss": 0.0186, + "step": 38430 + }, + { + "epoch": 0.2841429880843263, + "grad_norm": 0.09625089168548584, + "learning_rate": 3.5925629154795826e-05, + "loss": 0.0202, + "step": 38440 + }, + { + "epoch": 0.28421690665562815, + "grad_norm": 0.10750327259302139, + "learning_rate": 3.5921919515669516e-05, + "loss": 0.0191, + "step": 38450 + }, + { + "epoch": 0.28429082522693, + "grad_norm": 0.08809314668178558, + "learning_rate": 3.591820987654321e-05, + "loss": 0.018, + "step": 38460 + }, + { + "epoch": 0.28436474379823184, + "grad_norm": 0.09000466018915176, + "learning_rate": 3.59145002374169e-05, + "loss": 0.021, + "step": 38470 + }, + { + "epoch": 0.28443866236953375, + "grad_norm": 0.11071033775806427, + "learning_rate": 3.59107905982906e-05, + "loss": 0.0194, + "step": 38480 + }, + { + "epoch": 0.2845125809408356, + "grad_norm": 0.09824886918067932, + "learning_rate": 3.59070809591643e-05, + "loss": 0.0193, + "step": 38490 + }, + { + "epoch": 0.28458649951213744, + "grad_norm": 0.15032939612865448, + "learning_rate": 3.590337132003799e-05, + "loss": 0.0181, + "step": 38500 + }, + { + "epoch": 0.2846604180834393, + "grad_norm": 0.07095257192850113, + "learning_rate": 3.5899661680911685e-05, + "loss": 0.0154, + "step": 38510 + }, + { + "epoch": 0.2847343366547411, + "grad_norm": 0.07965070009231567, + "learning_rate": 3.5895952041785374e-05, + "loss": 0.0197, + "step": 38520 + }, + { + "epoch": 0.284808255226043, + "grad_norm": 0.09079215675592422, + "learning_rate": 3.589224240265907e-05, + "loss": 0.0179, + "step": 38530 + }, + { + "epoch": 0.2848821737973448, + "grad_norm": 0.09377455711364746, + "learning_rate": 3.5888532763532766e-05, + "loss": 0.0217, + "step": 38540 + }, + { + "epoch": 0.2849560923686467, + "grad_norm": 0.09389619529247284, + "learning_rate": 3.5884823124406455e-05, + "loss": 0.0196, + "step": 38550 + }, + { + "epoch": 0.28503001093994856, + "grad_norm": 0.09551979601383209, + "learning_rate": 3.588111348528016e-05, + "loss": 0.0208, + "step": 38560 + }, + { + "epoch": 0.2851039295112504, + "grad_norm": 0.07076023519039154, + "learning_rate": 
3.587740384615385e-05, + "loss": 0.0184, + "step": 38570 + }, + { + "epoch": 0.28517784808255225, + "grad_norm": 0.11054020375013351, + "learning_rate": 3.587369420702754e-05, + "loss": 0.0206, + "step": 38580 + }, + { + "epoch": 0.2852517666538541, + "grad_norm": 0.07012484967708588, + "learning_rate": 3.586998456790124e-05, + "loss": 0.017, + "step": 38590 + }, + { + "epoch": 0.28532568522515595, + "grad_norm": 0.06504105031490326, + "learning_rate": 3.586627492877493e-05, + "loss": 0.0154, + "step": 38600 + }, + { + "epoch": 0.28539960379645785, + "grad_norm": 0.0863649994134903, + "learning_rate": 3.5862565289648624e-05, + "loss": 0.0178, + "step": 38610 + }, + { + "epoch": 0.2854735223677597, + "grad_norm": 0.08585851639509201, + "learning_rate": 3.585885565052232e-05, + "loss": 0.0186, + "step": 38620 + }, + { + "epoch": 0.28554744093906154, + "grad_norm": 0.09086504578590393, + "learning_rate": 3.585514601139601e-05, + "loss": 0.0173, + "step": 38630 + }, + { + "epoch": 0.2856213595103634, + "grad_norm": 0.07614096999168396, + "learning_rate": 3.585143637226971e-05, + "loss": 0.0194, + "step": 38640 + }, + { + "epoch": 0.2856952780816652, + "grad_norm": 0.07605312764644623, + "learning_rate": 3.58477267331434e-05, + "loss": 0.0199, + "step": 38650 + }, + { + "epoch": 0.2857691966529671, + "grad_norm": 0.08855976164340973, + "learning_rate": 3.58440170940171e-05, + "loss": 0.0182, + "step": 38660 + }, + { + "epoch": 0.2858431152242689, + "grad_norm": 0.0881510004401207, + "learning_rate": 3.584030745489079e-05, + "loss": 0.0214, + "step": 38670 + }, + { + "epoch": 0.2859170337955708, + "grad_norm": 0.10402607917785645, + "learning_rate": 3.583659781576448e-05, + "loss": 0.0179, + "step": 38680 + }, + { + "epoch": 0.28599095236687266, + "grad_norm": 0.07266613841056824, + "learning_rate": 3.583288817663818e-05, + "loss": 0.0204, + "step": 38690 + }, + { + "epoch": 0.2860648709381745, + "grad_norm": 0.09164243936538696, + "learning_rate": 3.582917853751187e-05, + "loss": 0.0202, + "step": 38700 + }, + { + "epoch": 0.28613878950947635, + "grad_norm": 0.0948050394654274, + "learning_rate": 3.582546889838557e-05, + "loss": 0.0193, + "step": 38710 + }, + { + "epoch": 0.2862127080807782, + "grad_norm": 0.14507903158664703, + "learning_rate": 3.5821759259259266e-05, + "loss": 0.0188, + "step": 38720 + }, + { + "epoch": 0.28628662665208005, + "grad_norm": 0.08098135888576508, + "learning_rate": 3.5818049620132955e-05, + "loss": 0.0196, + "step": 38730 + }, + { + "epoch": 0.28636054522338195, + "grad_norm": 0.25409314036369324, + "learning_rate": 3.581433998100665e-05, + "loss": 0.018, + "step": 38740 + }, + { + "epoch": 0.2864344637946838, + "grad_norm": 0.07396399229764938, + "learning_rate": 3.581063034188034e-05, + "loss": 0.0176, + "step": 38750 + }, + { + "epoch": 0.28650838236598564, + "grad_norm": 0.07812829315662384, + "learning_rate": 3.5806920702754036e-05, + "loss": 0.0201, + "step": 38760 + }, + { + "epoch": 0.2865823009372875, + "grad_norm": 0.05777015537023544, + "learning_rate": 3.580321106362773e-05, + "loss": 0.0177, + "step": 38770 + }, + { + "epoch": 0.2866562195085893, + "grad_norm": 0.08417957276105881, + "learning_rate": 3.579950142450142e-05, + "loss": 0.0172, + "step": 38780 + }, + { + "epoch": 0.2867301380798912, + "grad_norm": 0.07578160613775253, + "learning_rate": 3.5795791785375124e-05, + "loss": 0.0192, + "step": 38790 + }, + { + "epoch": 0.2868040566511931, + "grad_norm": 0.10593099892139435, + "learning_rate": 3.5792082146248814e-05, + "loss": 0.0184, + "step": 
38800 + }, + { + "epoch": 0.2868779752224949, + "grad_norm": 0.05731048807501793, + "learning_rate": 3.578837250712251e-05, + "loss": 0.0173, + "step": 38810 + }, + { + "epoch": 0.28695189379379676, + "grad_norm": 0.08876123279333115, + "learning_rate": 3.5784662867996205e-05, + "loss": 0.0189, + "step": 38820 + }, + { + "epoch": 0.2870258123650986, + "grad_norm": 0.08627317100763321, + "learning_rate": 3.5780953228869895e-05, + "loss": 0.0198, + "step": 38830 + }, + { + "epoch": 0.28709973093640045, + "grad_norm": 0.0902194231748581, + "learning_rate": 3.577724358974359e-05, + "loss": 0.0174, + "step": 38840 + }, + { + "epoch": 0.2871736495077023, + "grad_norm": 0.08449438959360123, + "learning_rate": 3.5773533950617287e-05, + "loss": 0.0181, + "step": 38850 + }, + { + "epoch": 0.28724756807900415, + "grad_norm": 0.10555487871170044, + "learning_rate": 3.576982431149098e-05, + "loss": 0.019, + "step": 38860 + }, + { + "epoch": 0.28732148665030605, + "grad_norm": 0.10188709199428558, + "learning_rate": 3.576611467236468e-05, + "loss": 0.0212, + "step": 38870 + }, + { + "epoch": 0.2873954052216079, + "grad_norm": 0.08163904398679733, + "learning_rate": 3.576240503323837e-05, + "loss": 0.0226, + "step": 38880 + }, + { + "epoch": 0.28746932379290974, + "grad_norm": 0.05684854835271835, + "learning_rate": 3.5758695394112064e-05, + "loss": 0.0167, + "step": 38890 + }, + { + "epoch": 0.2875432423642116, + "grad_norm": 0.09646262973546982, + "learning_rate": 3.575498575498576e-05, + "loss": 0.0214, + "step": 38900 + }, + { + "epoch": 0.2876171609355134, + "grad_norm": 0.06920493394136429, + "learning_rate": 3.575127611585945e-05, + "loss": 0.0218, + "step": 38910 + }, + { + "epoch": 0.2876910795068153, + "grad_norm": 0.0737135261297226, + "learning_rate": 3.5747566476733145e-05, + "loss": 0.0183, + "step": 38920 + }, + { + "epoch": 0.2877649980781172, + "grad_norm": 0.09294036775827408, + "learning_rate": 3.5743856837606834e-05, + "loss": 0.0175, + "step": 38930 + }, + { + "epoch": 0.287838916649419, + "grad_norm": 0.11958757042884827, + "learning_rate": 3.574014719848054e-05, + "loss": 0.0208, + "step": 38940 + }, + { + "epoch": 0.28791283522072086, + "grad_norm": 0.10666726529598236, + "learning_rate": 3.573643755935423e-05, + "loss": 0.0227, + "step": 38950 + }, + { + "epoch": 0.2879867537920227, + "grad_norm": 0.08652665466070175, + "learning_rate": 3.573272792022792e-05, + "loss": 0.0173, + "step": 38960 + }, + { + "epoch": 0.28806067236332455, + "grad_norm": 0.07823482155799866, + "learning_rate": 3.572901828110162e-05, + "loss": 0.0174, + "step": 38970 + }, + { + "epoch": 0.2881345909346264, + "grad_norm": 0.07927388697862625, + "learning_rate": 3.572530864197531e-05, + "loss": 0.0187, + "step": 38980 + }, + { + "epoch": 0.28820850950592825, + "grad_norm": 0.06484010070562363, + "learning_rate": 3.5721599002849e-05, + "loss": 0.0182, + "step": 38990 + }, + { + "epoch": 0.28828242807723015, + "grad_norm": 0.11370483785867691, + "learning_rate": 3.57178893637227e-05, + "loss": 0.0179, + "step": 39000 + }, + { + "epoch": 0.288356346648532, + "grad_norm": 0.0888226181268692, + "learning_rate": 3.5714179724596395e-05, + "loss": 0.0217, + "step": 39010 + }, + { + "epoch": 0.28843026521983384, + "grad_norm": 0.09708042442798615, + "learning_rate": 3.571047008547009e-05, + "loss": 0.0189, + "step": 39020 + }, + { + "epoch": 0.2885041837911357, + "grad_norm": 0.12996414303779602, + "learning_rate": 3.570676044634378e-05, + "loss": 0.0166, + "step": 39030 + }, + { + "epoch": 0.2885781023624375, + 
"grad_norm": 0.079742431640625, + "learning_rate": 3.5703050807217476e-05, + "loss": 0.0177, + "step": 39040 + }, + { + "epoch": 0.2886520209337394, + "grad_norm": 0.07856535166501999, + "learning_rate": 3.569934116809117e-05, + "loss": 0.0163, + "step": 39050 + }, + { + "epoch": 0.2887259395050413, + "grad_norm": 0.09074665606021881, + "learning_rate": 3.569563152896486e-05, + "loss": 0.0182, + "step": 39060 + }, + { + "epoch": 0.2887998580763431, + "grad_norm": 0.0902465432882309, + "learning_rate": 3.569192188983856e-05, + "loss": 0.0192, + "step": 39070 + }, + { + "epoch": 0.28887377664764496, + "grad_norm": 0.09278262406587601, + "learning_rate": 3.568821225071225e-05, + "loss": 0.0189, + "step": 39080 + }, + { + "epoch": 0.2889476952189468, + "grad_norm": 0.09800709784030914, + "learning_rate": 3.568450261158595e-05, + "loss": 0.017, + "step": 39090 + }, + { + "epoch": 0.28902161379024865, + "grad_norm": 0.10501433163881302, + "learning_rate": 3.5680792972459645e-05, + "loss": 0.018, + "step": 39100 + }, + { + "epoch": 0.2890955323615505, + "grad_norm": 0.10222896188497543, + "learning_rate": 3.5677083333333334e-05, + "loss": 0.0192, + "step": 39110 + }, + { + "epoch": 0.28916945093285235, + "grad_norm": 0.11003319174051285, + "learning_rate": 3.567337369420703e-05, + "loss": 0.0189, + "step": 39120 + }, + { + "epoch": 0.28924336950415425, + "grad_norm": 0.08521962910890579, + "learning_rate": 3.5669664055080726e-05, + "loss": 0.0195, + "step": 39130 + }, + { + "epoch": 0.2893172880754561, + "grad_norm": 0.05133388191461563, + "learning_rate": 3.5665954415954415e-05, + "loss": 0.0175, + "step": 39140 + }, + { + "epoch": 0.28939120664675794, + "grad_norm": 0.10458425432443619, + "learning_rate": 3.566224477682811e-05, + "loss": 0.0223, + "step": 39150 + }, + { + "epoch": 0.2894651252180598, + "grad_norm": 0.09683454781770706, + "learning_rate": 3.565853513770181e-05, + "loss": 0.0195, + "step": 39160 + }, + { + "epoch": 0.2895390437893616, + "grad_norm": 0.08803705126047134, + "learning_rate": 3.56548254985755e-05, + "loss": 0.0181, + "step": 39170 + }, + { + "epoch": 0.2896129623606635, + "grad_norm": 0.07024317234754562, + "learning_rate": 3.56511158594492e-05, + "loss": 0.019, + "step": 39180 + }, + { + "epoch": 0.2896868809319654, + "grad_norm": 0.0780089944601059, + "learning_rate": 3.564740622032289e-05, + "loss": 0.0188, + "step": 39190 + }, + { + "epoch": 0.2897607995032672, + "grad_norm": 0.09941524267196655, + "learning_rate": 3.5643696581196584e-05, + "loss": 0.0184, + "step": 39200 + }, + { + "epoch": 0.28983471807456906, + "grad_norm": 0.06821256130933762, + "learning_rate": 3.5639986942070274e-05, + "loss": 0.02, + "step": 39210 + }, + { + "epoch": 0.2899086366458709, + "grad_norm": 0.11383607983589172, + "learning_rate": 3.563627730294397e-05, + "loss": 0.0219, + "step": 39220 + }, + { + "epoch": 0.28998255521717275, + "grad_norm": 0.0734940692782402, + "learning_rate": 3.5632567663817666e-05, + "loss": 0.0197, + "step": 39230 + }, + { + "epoch": 0.2900564737884746, + "grad_norm": 0.06543325632810593, + "learning_rate": 3.562885802469136e-05, + "loss": 0.016, + "step": 39240 + }, + { + "epoch": 0.29013039235977645, + "grad_norm": 0.08477847278118134, + "learning_rate": 3.562514838556506e-05, + "loss": 0.0199, + "step": 39250 + }, + { + "epoch": 0.29020431093107835, + "grad_norm": 0.10399157553911209, + "learning_rate": 3.562143874643875e-05, + "loss": 0.0202, + "step": 39260 + }, + { + "epoch": 0.2902782295023802, + "grad_norm": 0.10714534670114517, + "learning_rate": 
3.561772910731244e-05, + "loss": 0.0215, + "step": 39270 + }, + { + "epoch": 0.29035214807368204, + "grad_norm": 0.10957160592079163, + "learning_rate": 3.561401946818614e-05, + "loss": 0.0176, + "step": 39280 + }, + { + "epoch": 0.2904260666449839, + "grad_norm": 0.1137077659368515, + "learning_rate": 3.561030982905983e-05, + "loss": 0.0177, + "step": 39290 + }, + { + "epoch": 0.2904999852162857, + "grad_norm": 0.10111214220523834, + "learning_rate": 3.5606600189933524e-05, + "loss": 0.0197, + "step": 39300 + }, + { + "epoch": 0.2905739037875876, + "grad_norm": 0.08012203127145767, + "learning_rate": 3.560289055080722e-05, + "loss": 0.0212, + "step": 39310 + }, + { + "epoch": 0.2906478223588895, + "grad_norm": 0.07070279866456985, + "learning_rate": 3.5599180911680916e-05, + "loss": 0.0183, + "step": 39320 + }, + { + "epoch": 0.2907217409301913, + "grad_norm": 0.06599403917789459, + "learning_rate": 3.559547127255461e-05, + "loss": 0.0185, + "step": 39330 + }, + { + "epoch": 0.29079565950149316, + "grad_norm": 0.09379260987043381, + "learning_rate": 3.55917616334283e-05, + "loss": 0.0191, + "step": 39340 + }, + { + "epoch": 0.290869578072795, + "grad_norm": 0.08500421047210693, + "learning_rate": 3.5588051994302e-05, + "loss": 0.0179, + "step": 39350 + }, + { + "epoch": 0.29094349664409686, + "grad_norm": 0.0872545838356018, + "learning_rate": 3.558434235517569e-05, + "loss": 0.0195, + "step": 39360 + }, + { + "epoch": 0.2910174152153987, + "grad_norm": 0.09130999445915222, + "learning_rate": 3.558063271604938e-05, + "loss": 0.0214, + "step": 39370 + }, + { + "epoch": 0.29109133378670055, + "grad_norm": 0.09407318383455276, + "learning_rate": 3.557692307692308e-05, + "loss": 0.0177, + "step": 39380 + }, + { + "epoch": 0.29116525235800245, + "grad_norm": 0.09333614259958267, + "learning_rate": 3.5573213437796774e-05, + "loss": 0.0204, + "step": 39390 + }, + { + "epoch": 0.2912391709293043, + "grad_norm": 0.10681622475385666, + "learning_rate": 3.556950379867047e-05, + "loss": 0.0177, + "step": 39400 + }, + { + "epoch": 0.29131308950060614, + "grad_norm": 0.09072420746088028, + "learning_rate": 3.5565794159544166e-05, + "loss": 0.017, + "step": 39410 + }, + { + "epoch": 0.291387008071908, + "grad_norm": 0.07721372693777084, + "learning_rate": 3.5562084520417855e-05, + "loss": 0.0178, + "step": 39420 + }, + { + "epoch": 0.29146092664320983, + "grad_norm": 0.06005462631583214, + "learning_rate": 3.555837488129155e-05, + "loss": 0.0176, + "step": 39430 + }, + { + "epoch": 0.2915348452145117, + "grad_norm": 0.06264548003673553, + "learning_rate": 3.555466524216524e-05, + "loss": 0.0198, + "step": 39440 + }, + { + "epoch": 0.2916087637858136, + "grad_norm": 0.10125889629125595, + "learning_rate": 3.5550955603038936e-05, + "loss": 0.0177, + "step": 39450 + }, + { + "epoch": 0.2916826823571154, + "grad_norm": 0.11964855343103409, + "learning_rate": 3.554724596391263e-05, + "loss": 0.0229, + "step": 39460 + }, + { + "epoch": 0.29175660092841726, + "grad_norm": 0.09017671644687653, + "learning_rate": 3.554353632478633e-05, + "loss": 0.019, + "step": 39470 + }, + { + "epoch": 0.2918305194997191, + "grad_norm": 0.0720110610127449, + "learning_rate": 3.5539826685660024e-05, + "loss": 0.0185, + "step": 39480 + }, + { + "epoch": 0.29190443807102096, + "grad_norm": 0.06961306184530258, + "learning_rate": 3.553611704653371e-05, + "loss": 0.0202, + "step": 39490 + }, + { + "epoch": 0.2919783566423228, + "grad_norm": 0.07258889079093933, + "learning_rate": 3.553240740740741e-05, + "loss": 0.0218, + "step": 
39500 + }, + { + "epoch": 0.29205227521362465, + "grad_norm": 0.14533229172229767, + "learning_rate": 3.5528697768281105e-05, + "loss": 0.0195, + "step": 39510 + }, + { + "epoch": 0.29212619378492655, + "grad_norm": 0.07298830896615982, + "learning_rate": 3.5524988129154794e-05, + "loss": 0.0178, + "step": 39520 + }, + { + "epoch": 0.2922001123562284, + "grad_norm": 0.08939662575721741, + "learning_rate": 3.552127849002849e-05, + "loss": 0.0199, + "step": 39530 + }, + { + "epoch": 0.29227403092753024, + "grad_norm": 0.07126587629318237, + "learning_rate": 3.5517568850902186e-05, + "loss": 0.0151, + "step": 39540 + }, + { + "epoch": 0.2923479494988321, + "grad_norm": 0.11612686514854431, + "learning_rate": 3.551385921177588e-05, + "loss": 0.0199, + "step": 39550 + }, + { + "epoch": 0.29242186807013393, + "grad_norm": 0.08554835617542267, + "learning_rate": 3.551014957264958e-05, + "loss": 0.0186, + "step": 39560 + }, + { + "epoch": 0.2924957866414358, + "grad_norm": 0.0877324566245079, + "learning_rate": 3.550643993352327e-05, + "loss": 0.0206, + "step": 39570 + }, + { + "epoch": 0.2925697052127377, + "grad_norm": 0.25311246514320374, + "learning_rate": 3.5502730294396963e-05, + "loss": 0.0203, + "step": 39580 + }, + { + "epoch": 0.2926436237840395, + "grad_norm": 0.07756414264440536, + "learning_rate": 3.549902065527066e-05, + "loss": 0.0183, + "step": 39590 + }, + { + "epoch": 0.29271754235534136, + "grad_norm": 0.09529725462198257, + "learning_rate": 3.549531101614435e-05, + "loss": 0.0202, + "step": 39600 + }, + { + "epoch": 0.2927914609266432, + "grad_norm": 0.09326574951410294, + "learning_rate": 3.5491601377018045e-05, + "loss": 0.0172, + "step": 39610 + }, + { + "epoch": 0.29286537949794506, + "grad_norm": 0.11572094261646271, + "learning_rate": 3.548789173789174e-05, + "loss": 0.0225, + "step": 39620 + }, + { + "epoch": 0.2929392980692469, + "grad_norm": 0.08779244869947433, + "learning_rate": 3.5484182098765436e-05, + "loss": 0.0194, + "step": 39630 + }, + { + "epoch": 0.29301321664054875, + "grad_norm": 0.08656366169452667, + "learning_rate": 3.548047245963913e-05, + "loss": 0.0183, + "step": 39640 + }, + { + "epoch": 0.29308713521185065, + "grad_norm": 0.09075984358787537, + "learning_rate": 3.547676282051282e-05, + "loss": 0.0189, + "step": 39650 + }, + { + "epoch": 0.2931610537831525, + "grad_norm": 0.15344542264938354, + "learning_rate": 3.547305318138652e-05, + "loss": 0.0208, + "step": 39660 + }, + { + "epoch": 0.29323497235445434, + "grad_norm": 0.08538174629211426, + "learning_rate": 3.546934354226021e-05, + "loss": 0.0193, + "step": 39670 + }, + { + "epoch": 0.2933088909257562, + "grad_norm": 0.10595715790987015, + "learning_rate": 3.54656339031339e-05, + "loss": 0.0213, + "step": 39680 + }, + { + "epoch": 0.29338280949705803, + "grad_norm": 0.08453144133090973, + "learning_rate": 3.54619242640076e-05, + "loss": 0.0199, + "step": 39690 + }, + { + "epoch": 0.2934567280683599, + "grad_norm": 0.08898500353097916, + "learning_rate": 3.5458214624881295e-05, + "loss": 0.0189, + "step": 39700 + }, + { + "epoch": 0.2935306466396618, + "grad_norm": 0.10838479548692703, + "learning_rate": 3.545450498575499e-05, + "loss": 0.0179, + "step": 39710 + }, + { + "epoch": 0.2936045652109636, + "grad_norm": 0.07953338325023651, + "learning_rate": 3.545079534662868e-05, + "loss": 0.02, + "step": 39720 + }, + { + "epoch": 0.29367848378226546, + "grad_norm": 0.11616776883602142, + "learning_rate": 3.5447085707502376e-05, + "loss": 0.0224, + "step": 39730 + }, + { + "epoch": 
0.2937524023535673, + "grad_norm": 0.097626693546772, + "learning_rate": 3.544337606837607e-05, + "loss": 0.0175, + "step": 39740 + }, + { + "epoch": 0.29382632092486916, + "grad_norm": 0.09147676825523376, + "learning_rate": 3.543966642924976e-05, + "loss": 0.0188, + "step": 39750 + }, + { + "epoch": 0.293900239496171, + "grad_norm": 0.07421558350324631, + "learning_rate": 3.543595679012346e-05, + "loss": 0.0196, + "step": 39760 + }, + { + "epoch": 0.29397415806747285, + "grad_norm": 0.08205095678567886, + "learning_rate": 3.543224715099715e-05, + "loss": 0.0192, + "step": 39770 + }, + { + "epoch": 0.29404807663877475, + "grad_norm": 0.09343995898962021, + "learning_rate": 3.542853751187085e-05, + "loss": 0.0197, + "step": 39780 + }, + { + "epoch": 0.2941219952100766, + "grad_norm": 0.1180085763335228, + "learning_rate": 3.5424827872744545e-05, + "loss": 0.0207, + "step": 39790 + }, + { + "epoch": 0.29419591378137844, + "grad_norm": 0.06754688918590546, + "learning_rate": 3.5421118233618234e-05, + "loss": 0.0173, + "step": 39800 + }, + { + "epoch": 0.2942698323526803, + "grad_norm": 0.0712592601776123, + "learning_rate": 3.541740859449193e-05, + "loss": 0.0201, + "step": 39810 + }, + { + "epoch": 0.29434375092398213, + "grad_norm": 0.09996171295642853, + "learning_rate": 3.5413698955365626e-05, + "loss": 0.0204, + "step": 39820 + }, + { + "epoch": 0.294417669495284, + "grad_norm": 0.08990880846977234, + "learning_rate": 3.5409989316239315e-05, + "loss": 0.0211, + "step": 39830 + }, + { + "epoch": 0.2944915880665859, + "grad_norm": 0.07393600046634674, + "learning_rate": 3.540627967711301e-05, + "loss": 0.0226, + "step": 39840 + }, + { + "epoch": 0.2945655066378877, + "grad_norm": 0.06920602917671204, + "learning_rate": 3.540257003798671e-05, + "loss": 0.0175, + "step": 39850 + }, + { + "epoch": 0.29463942520918956, + "grad_norm": 0.08680330216884613, + "learning_rate": 3.53988603988604e-05, + "loss": 0.0181, + "step": 39860 + }, + { + "epoch": 0.2947133437804914, + "grad_norm": 0.10392973572015762, + "learning_rate": 3.53951507597341e-05, + "loss": 0.0192, + "step": 39870 + }, + { + "epoch": 0.29478726235179326, + "grad_norm": 0.07698529213666916, + "learning_rate": 3.539144112060779e-05, + "loss": 0.0189, + "step": 39880 + }, + { + "epoch": 0.2948611809230951, + "grad_norm": 0.0859031230211258, + "learning_rate": 3.5387731481481484e-05, + "loss": 0.0168, + "step": 39890 + }, + { + "epoch": 0.29493509949439695, + "grad_norm": 0.10237284749746323, + "learning_rate": 3.5384021842355173e-05, + "loss": 0.0186, + "step": 39900 + }, + { + "epoch": 0.29500901806569885, + "grad_norm": 0.09435314685106277, + "learning_rate": 3.538031220322887e-05, + "loss": 0.0192, + "step": 39910 + }, + { + "epoch": 0.2950829366370007, + "grad_norm": 0.06342162936925888, + "learning_rate": 3.5376602564102565e-05, + "loss": 0.0184, + "step": 39920 + }, + { + "epoch": 0.29515685520830254, + "grad_norm": 0.12394603341817856, + "learning_rate": 3.537289292497626e-05, + "loss": 0.0212, + "step": 39930 + }, + { + "epoch": 0.2952307737796044, + "grad_norm": 0.09578089416027069, + "learning_rate": 3.536918328584996e-05, + "loss": 0.0191, + "step": 39940 + }, + { + "epoch": 0.29530469235090623, + "grad_norm": 0.10095982998609543, + "learning_rate": 3.5365473646723646e-05, + "loss": 0.0176, + "step": 39950 + }, + { + "epoch": 0.2953786109222081, + "grad_norm": 0.09211903065443039, + "learning_rate": 3.536176400759734e-05, + "loss": 0.0172, + "step": 39960 + }, + { + "epoch": 0.29545252949351, + "grad_norm": 
0.21059109270572662, + "learning_rate": 3.535805436847104e-05, + "loss": 0.0201, + "step": 39970 + }, + { + "epoch": 0.2955264480648118, + "grad_norm": 0.09203150123357773, + "learning_rate": 3.535434472934473e-05, + "loss": 0.0217, + "step": 39980 + }, + { + "epoch": 0.29560036663611367, + "grad_norm": 0.08103719353675842, + "learning_rate": 3.5350635090218424e-05, + "loss": 0.019, + "step": 39990 + }, + { + "epoch": 0.2956742852074155, + "grad_norm": 0.0887136161327362, + "learning_rate": 3.534692545109212e-05, + "loss": 0.0197, + "step": 40000 + }, + { + "epoch": 0.2956742852074155, + "eval_f1": 0.6048634691567629, + "eval_loss": 0.018523240461945534, + "eval_precision": 0.47730154244425604, + "eval_recall": 0.8254777377525961, + "eval_runtime": 2662.2523, + "eval_samples_per_second": 203.262, + "eval_steps_per_second": 3.176, + "step": 40000 + }, + { + "epoch": 0.29574820377871736, + "grad_norm": 0.08944419771432877, + "learning_rate": 3.5343215811965815e-05, + "loss": 0.0191, + "step": 40010 + }, + { + "epoch": 0.2958221223500192, + "grad_norm": 0.10585649311542511, + "learning_rate": 3.533950617283951e-05, + "loss": 0.0191, + "step": 40020 + }, + { + "epoch": 0.29589604092132105, + "grad_norm": 0.09504344314336777, + "learning_rate": 3.53357965337132e-05, + "loss": 0.0199, + "step": 40030 + }, + { + "epoch": 0.29596995949262295, + "grad_norm": 0.0804176926612854, + "learning_rate": 3.5332086894586897e-05, + "loss": 0.0174, + "step": 40040 + }, + { + "epoch": 0.2960438780639248, + "grad_norm": 0.062001194804906845, + "learning_rate": 3.532837725546059e-05, + "loss": 0.0169, + "step": 40050 + }, + { + "epoch": 0.29611779663522664, + "grad_norm": 0.08773874491453171, + "learning_rate": 3.532466761633428e-05, + "loss": 0.0215, + "step": 40060 + }, + { + "epoch": 0.2961917152065285, + "grad_norm": 0.07420551031827927, + "learning_rate": 3.532095797720798e-05, + "loss": 0.0175, + "step": 40070 + }, + { + "epoch": 0.29626563377783033, + "grad_norm": 0.08795443922281265, + "learning_rate": 3.5317248338081674e-05, + "loss": 0.0173, + "step": 40080 + }, + { + "epoch": 0.2963395523491322, + "grad_norm": 0.07256721705198288, + "learning_rate": 3.531353869895537e-05, + "loss": 0.0194, + "step": 40090 + }, + { + "epoch": 0.2964134709204341, + "grad_norm": 0.06961095333099365, + "learning_rate": 3.5309829059829066e-05, + "loss": 0.0184, + "step": 40100 + }, + { + "epoch": 0.2964873894917359, + "grad_norm": 0.08534622192382812, + "learning_rate": 3.5306119420702755e-05, + "loss": 0.0181, + "step": 40110 + }, + { + "epoch": 0.29656130806303777, + "grad_norm": 0.11563605815172195, + "learning_rate": 3.530240978157645e-05, + "loss": 0.021, + "step": 40120 + }, + { + "epoch": 0.2966352266343396, + "grad_norm": 0.08745235949754715, + "learning_rate": 3.529870014245014e-05, + "loss": 0.0191, + "step": 40130 + }, + { + "epoch": 0.29670914520564146, + "grad_norm": 0.06488395482301712, + "learning_rate": 3.5294990503323836e-05, + "loss": 0.0207, + "step": 40140 + }, + { + "epoch": 0.2967830637769433, + "grad_norm": 0.08929703384637833, + "learning_rate": 3.529128086419753e-05, + "loss": 0.0189, + "step": 40150 + }, + { + "epoch": 0.29685698234824515, + "grad_norm": 0.08577121794223785, + "learning_rate": 3.528757122507123e-05, + "loss": 0.023, + "step": 40160 + }, + { + "epoch": 0.29693090091954705, + "grad_norm": 0.08573178201913834, + "learning_rate": 3.5283861585944924e-05, + "loss": 0.0186, + "step": 40170 + }, + { + "epoch": 0.2970048194908489, + "grad_norm": 0.11787261813879013, + "learning_rate": 
3.528015194681861e-05, + "loss": 0.018, + "step": 40180 + }, + { + "epoch": 0.29707873806215074, + "grad_norm": 0.08737242966890335, + "learning_rate": 3.527644230769231e-05, + "loss": 0.0198, + "step": 40190 + }, + { + "epoch": 0.2971526566334526, + "grad_norm": 0.08136210590600967, + "learning_rate": 3.5272732668566005e-05, + "loss": 0.0177, + "step": 40200 + }, + { + "epoch": 0.29722657520475443, + "grad_norm": 0.07731893658638, + "learning_rate": 3.5269023029439694e-05, + "loss": 0.0177, + "step": 40210 + }, + { + "epoch": 0.2973004937760563, + "grad_norm": 0.08985748142004013, + "learning_rate": 3.526531339031339e-05, + "loss": 0.0175, + "step": 40220 + }, + { + "epoch": 0.2973744123473582, + "grad_norm": 0.08974775671958923, + "learning_rate": 3.5261603751187086e-05, + "loss": 0.0202, + "step": 40230 + }, + { + "epoch": 0.29744833091866, + "grad_norm": 0.09473055601119995, + "learning_rate": 3.525789411206078e-05, + "loss": 0.0198, + "step": 40240 + }, + { + "epoch": 0.29752224948996187, + "grad_norm": 0.08488566428422928, + "learning_rate": 3.525418447293448e-05, + "loss": 0.0184, + "step": 40250 + }, + { + "epoch": 0.2975961680612637, + "grad_norm": 0.10066484659910202, + "learning_rate": 3.525047483380817e-05, + "loss": 0.0201, + "step": 40260 + }, + { + "epoch": 0.29767008663256556, + "grad_norm": 0.08238562941551208, + "learning_rate": 3.524676519468186e-05, + "loss": 0.0184, + "step": 40270 + }, + { + "epoch": 0.2977440052038674, + "grad_norm": 0.08526982367038727, + "learning_rate": 3.524305555555556e-05, + "loss": 0.0199, + "step": 40280 + }, + { + "epoch": 0.29781792377516925, + "grad_norm": 0.11167903244495392, + "learning_rate": 3.523934591642925e-05, + "loss": 0.0184, + "step": 40290 + }, + { + "epoch": 0.29789184234647115, + "grad_norm": 0.10231362283229828, + "learning_rate": 3.5235636277302944e-05, + "loss": 0.0203, + "step": 40300 + }, + { + "epoch": 0.297965760917773, + "grad_norm": 0.11291981488466263, + "learning_rate": 3.523192663817664e-05, + "loss": 0.0187, + "step": 40310 + }, + { + "epoch": 0.29803967948907484, + "grad_norm": 0.08946817368268967, + "learning_rate": 3.5228216999050336e-05, + "loss": 0.0174, + "step": 40320 + }, + { + "epoch": 0.2981135980603767, + "grad_norm": 0.08896126598119736, + "learning_rate": 3.522450735992403e-05, + "loss": 0.0186, + "step": 40330 + }, + { + "epoch": 0.29818751663167853, + "grad_norm": 0.06863762438297272, + "learning_rate": 3.522079772079772e-05, + "loss": 0.0178, + "step": 40340 + }, + { + "epoch": 0.2982614352029804, + "grad_norm": 0.09483463317155838, + "learning_rate": 3.521708808167142e-05, + "loss": 0.0197, + "step": 40350 + }, + { + "epoch": 0.2983353537742823, + "grad_norm": 0.07545123249292374, + "learning_rate": 3.5213378442545107e-05, + "loss": 0.0196, + "step": 40360 + }, + { + "epoch": 0.2984092723455841, + "grad_norm": 0.11172790080308914, + "learning_rate": 3.52096688034188e-05, + "loss": 0.0199, + "step": 40370 + }, + { + "epoch": 0.29848319091688597, + "grad_norm": 0.08551350235939026, + "learning_rate": 3.52059591642925e-05, + "loss": 0.0191, + "step": 40380 + }, + { + "epoch": 0.2985571094881878, + "grad_norm": 0.11399293690919876, + "learning_rate": 3.5202249525166194e-05, + "loss": 0.018, + "step": 40390 + }, + { + "epoch": 0.29863102805948966, + "grad_norm": 0.06697198003530502, + "learning_rate": 3.519853988603989e-05, + "loss": 0.0198, + "step": 40400 + }, + { + "epoch": 0.2987049466307915, + "grad_norm": 0.08352407813072205, + "learning_rate": 3.519483024691358e-05, + "loss": 0.0191, + "step": 
40410 + }, + { + "epoch": 0.29877886520209335, + "grad_norm": 0.09723468124866486, + "learning_rate": 3.5191120607787276e-05, + "loss": 0.0185, + "step": 40420 + }, + { + "epoch": 0.29885278377339525, + "grad_norm": 0.09839003533124924, + "learning_rate": 3.518741096866097e-05, + "loss": 0.0181, + "step": 40430 + }, + { + "epoch": 0.2989267023446971, + "grad_norm": 0.07423264533281326, + "learning_rate": 3.518370132953466e-05, + "loss": 0.0179, + "step": 40440 + }, + { + "epoch": 0.29900062091599894, + "grad_norm": 0.10543884336948395, + "learning_rate": 3.517999169040836e-05, + "loss": 0.0207, + "step": 40450 + }, + { + "epoch": 0.2990745394873008, + "grad_norm": 0.09270637482404709, + "learning_rate": 3.517628205128205e-05, + "loss": 0.0204, + "step": 40460 + }, + { + "epoch": 0.29914845805860263, + "grad_norm": 0.10229405760765076, + "learning_rate": 3.517257241215575e-05, + "loss": 0.0167, + "step": 40470 + }, + { + "epoch": 0.2992223766299045, + "grad_norm": 0.07626686990261078, + "learning_rate": 3.5168862773029445e-05, + "loss": 0.0211, + "step": 40480 + }, + { + "epoch": 0.2992962952012064, + "grad_norm": 0.09048005193471909, + "learning_rate": 3.5165153133903134e-05, + "loss": 0.0197, + "step": 40490 + }, + { + "epoch": 0.2993702137725082, + "grad_norm": 0.08271709829568863, + "learning_rate": 3.516144349477683e-05, + "loss": 0.0186, + "step": 40500 + }, + { + "epoch": 0.29944413234381007, + "grad_norm": 0.1680927276611328, + "learning_rate": 3.5157733855650526e-05, + "loss": 0.0232, + "step": 40510 + }, + { + "epoch": 0.2995180509151119, + "grad_norm": 0.0796268880367279, + "learning_rate": 3.5154024216524215e-05, + "loss": 0.0185, + "step": 40520 + }, + { + "epoch": 0.29959196948641376, + "grad_norm": 0.21023017168045044, + "learning_rate": 3.515031457739791e-05, + "loss": 0.0168, + "step": 40530 + }, + { + "epoch": 0.2996658880577156, + "grad_norm": 0.11400981992483139, + "learning_rate": 3.514660493827161e-05, + "loss": 0.0181, + "step": 40540 + }, + { + "epoch": 0.29973980662901745, + "grad_norm": 0.08473803102970123, + "learning_rate": 3.51428952991453e-05, + "loss": 0.0187, + "step": 40550 + }, + { + "epoch": 0.29981372520031935, + "grad_norm": 0.06288875639438629, + "learning_rate": 3.5139185660019e-05, + "loss": 0.0188, + "step": 40560 + }, + { + "epoch": 0.2998876437716212, + "grad_norm": 0.09498181939125061, + "learning_rate": 3.513547602089269e-05, + "loss": 0.0177, + "step": 40570 + }, + { + "epoch": 0.29996156234292304, + "grad_norm": 0.09277638047933578, + "learning_rate": 3.5131766381766384e-05, + "loss": 0.0173, + "step": 40580 + }, + { + "epoch": 0.3000354809142249, + "grad_norm": 0.097083680331707, + "learning_rate": 3.512805674264007e-05, + "loss": 0.0187, + "step": 40590 + }, + { + "epoch": 0.30010939948552673, + "grad_norm": 0.09916821122169495, + "learning_rate": 3.512434710351377e-05, + "loss": 0.0216, + "step": 40600 + }, + { + "epoch": 0.3001833180568286, + "grad_norm": 0.08453313261270523, + "learning_rate": 3.5120637464387465e-05, + "loss": 0.0192, + "step": 40610 + }, + { + "epoch": 0.3002572366281305, + "grad_norm": 0.07424086332321167, + "learning_rate": 3.511692782526116e-05, + "loss": 0.0171, + "step": 40620 + }, + { + "epoch": 0.3003311551994323, + "grad_norm": 0.07494385540485382, + "learning_rate": 3.511321818613486e-05, + "loss": 0.0194, + "step": 40630 + }, + { + "epoch": 0.30040507377073417, + "grad_norm": 0.10998072475194931, + "learning_rate": 3.5109508547008546e-05, + "loss": 0.0191, + "step": 40640 + }, + { + "epoch": 0.300478992342036, + 
"grad_norm": 0.0937514454126358, + "learning_rate": 3.510579890788224e-05, + "loss": 0.0193, + "step": 40650 + }, + { + "epoch": 0.30055291091333786, + "grad_norm": 0.09247662872076035, + "learning_rate": 3.510208926875594e-05, + "loss": 0.0184, + "step": 40660 + }, + { + "epoch": 0.3006268294846397, + "grad_norm": 0.07527650147676468, + "learning_rate": 3.509837962962963e-05, + "loss": 0.0191, + "step": 40670 + }, + { + "epoch": 0.3007007480559416, + "grad_norm": 0.11358445882797241, + "learning_rate": 3.509466999050332e-05, + "loss": 0.0185, + "step": 40680 + }, + { + "epoch": 0.30077466662724345, + "grad_norm": 0.06219291314482689, + "learning_rate": 3.509096035137702e-05, + "loss": 0.0181, + "step": 40690 + }, + { + "epoch": 0.3008485851985453, + "grad_norm": 0.08207332342863083, + "learning_rate": 3.5087250712250715e-05, + "loss": 0.0179, + "step": 40700 + }, + { + "epoch": 0.30092250376984714, + "grad_norm": 0.06086430326104164, + "learning_rate": 3.508354107312441e-05, + "loss": 0.0174, + "step": 40710 + }, + { + "epoch": 0.300996422341149, + "grad_norm": 0.0684448629617691, + "learning_rate": 3.50798314339981e-05, + "loss": 0.0188, + "step": 40720 + }, + { + "epoch": 0.30107034091245083, + "grad_norm": 0.051553964614868164, + "learning_rate": 3.5076121794871796e-05, + "loss": 0.0173, + "step": 40730 + }, + { + "epoch": 0.3011442594837527, + "grad_norm": 0.1044883280992508, + "learning_rate": 3.507241215574549e-05, + "loss": 0.0191, + "step": 40740 + }, + { + "epoch": 0.3012181780550546, + "grad_norm": 0.10106072574853897, + "learning_rate": 3.506870251661918e-05, + "loss": 0.0201, + "step": 40750 + }, + { + "epoch": 0.3012920966263564, + "grad_norm": 0.09131675213575363, + "learning_rate": 3.506499287749288e-05, + "loss": 0.0207, + "step": 40760 + }, + { + "epoch": 0.30136601519765827, + "grad_norm": 0.08383143693208694, + "learning_rate": 3.5061283238366573e-05, + "loss": 0.0232, + "step": 40770 + }, + { + "epoch": 0.3014399337689601, + "grad_norm": 0.08969314396381378, + "learning_rate": 3.505757359924027e-05, + "loss": 0.0203, + "step": 40780 + }, + { + "epoch": 0.30151385234026196, + "grad_norm": 0.12498711049556732, + "learning_rate": 3.5053863960113965e-05, + "loss": 0.0195, + "step": 40790 + }, + { + "epoch": 0.3015877709115638, + "grad_norm": 0.06831370294094086, + "learning_rate": 3.5050154320987655e-05, + "loss": 0.019, + "step": 40800 + }, + { + "epoch": 0.3016616894828657, + "grad_norm": 0.08495339006185532, + "learning_rate": 3.504644468186135e-05, + "loss": 0.0203, + "step": 40810 + }, + { + "epoch": 0.30173560805416755, + "grad_norm": 0.11111482977867126, + "learning_rate": 3.504273504273504e-05, + "loss": 0.0179, + "step": 40820 + }, + { + "epoch": 0.3018095266254694, + "grad_norm": 0.06847722828388214, + "learning_rate": 3.5039025403608736e-05, + "loss": 0.02, + "step": 40830 + }, + { + "epoch": 0.30188344519677124, + "grad_norm": 0.1520591825246811, + "learning_rate": 3.503531576448243e-05, + "loss": 0.0189, + "step": 40840 + }, + { + "epoch": 0.3019573637680731, + "grad_norm": 0.13091471791267395, + "learning_rate": 3.503160612535613e-05, + "loss": 0.0189, + "step": 40850 + }, + { + "epoch": 0.30203128233937493, + "grad_norm": 0.08369293808937073, + "learning_rate": 3.5027896486229824e-05, + "loss": 0.019, + "step": 40860 + }, + { + "epoch": 0.3021052009106768, + "grad_norm": 0.09184248745441437, + "learning_rate": 3.502418684710351e-05, + "loss": 0.0194, + "step": 40870 + }, + { + "epoch": 0.3021791194819787, + "grad_norm": 0.08807466179132462, + 
"learning_rate": 3.502047720797721e-05, + "loss": 0.0197, + "step": 40880 + }, + { + "epoch": 0.3022530380532805, + "grad_norm": 0.07857280224561691, + "learning_rate": 3.5016767568850905e-05, + "loss": 0.0186, + "step": 40890 + }, + { + "epoch": 0.30232695662458237, + "grad_norm": 0.09837421774864197, + "learning_rate": 3.5013057929724594e-05, + "loss": 0.0214, + "step": 40900 + }, + { + "epoch": 0.3024008751958842, + "grad_norm": 0.08540481328964233, + "learning_rate": 3.500934829059829e-05, + "loss": 0.0192, + "step": 40910 + }, + { + "epoch": 0.30247479376718606, + "grad_norm": 0.0964401438832283, + "learning_rate": 3.5005638651471986e-05, + "loss": 0.0191, + "step": 40920 + }, + { + "epoch": 0.3025487123384879, + "grad_norm": 0.07555688172578812, + "learning_rate": 3.500192901234568e-05, + "loss": 0.0182, + "step": 40930 + }, + { + "epoch": 0.3026226309097898, + "grad_norm": 0.07472950965166092, + "learning_rate": 3.499821937321938e-05, + "loss": 0.0152, + "step": 40940 + }, + { + "epoch": 0.30269654948109165, + "grad_norm": 0.0759170651435852, + "learning_rate": 3.499450973409307e-05, + "loss": 0.0199, + "step": 40950 + }, + { + "epoch": 0.3027704680523935, + "grad_norm": 0.13162405788898468, + "learning_rate": 3.499080009496676e-05, + "loss": 0.0205, + "step": 40960 + }, + { + "epoch": 0.30284438662369534, + "grad_norm": 0.08498091250658035, + "learning_rate": 3.498709045584046e-05, + "loss": 0.0189, + "step": 40970 + }, + { + "epoch": 0.3029183051949972, + "grad_norm": 0.06392598152160645, + "learning_rate": 3.498338081671415e-05, + "loss": 0.0179, + "step": 40980 + }, + { + "epoch": 0.30299222376629903, + "grad_norm": 0.08923903107643127, + "learning_rate": 3.4979671177587844e-05, + "loss": 0.0195, + "step": 40990 + }, + { + "epoch": 0.3030661423376009, + "grad_norm": 0.10305962711572647, + "learning_rate": 3.497596153846154e-05, + "loss": 0.0169, + "step": 41000 + }, + { + "epoch": 0.3031400609089028, + "grad_norm": 0.07298894971609116, + "learning_rate": 3.4972251899335236e-05, + "loss": 0.0166, + "step": 41010 + }, + { + "epoch": 0.3032139794802046, + "grad_norm": 0.07091324776411057, + "learning_rate": 3.496854226020893e-05, + "loss": 0.0177, + "step": 41020 + }, + { + "epoch": 0.30328789805150647, + "grad_norm": 0.2106444388628006, + "learning_rate": 3.496483262108262e-05, + "loss": 0.0219, + "step": 41030 + }, + { + "epoch": 0.3033618166228083, + "grad_norm": 0.06905341893434525, + "learning_rate": 3.496112298195632e-05, + "loss": 0.0164, + "step": 41040 + }, + { + "epoch": 0.30343573519411016, + "grad_norm": 0.06448206305503845, + "learning_rate": 3.4957413342830006e-05, + "loss": 0.02, + "step": 41050 + }, + { + "epoch": 0.303509653765412, + "grad_norm": 0.07077588140964508, + "learning_rate": 3.49537037037037e-05, + "loss": 0.0175, + "step": 41060 + }, + { + "epoch": 0.3035835723367139, + "grad_norm": 0.09758295118808746, + "learning_rate": 3.49499940645774e-05, + "loss": 0.0209, + "step": 41070 + }, + { + "epoch": 0.30365749090801575, + "grad_norm": 0.10992822051048279, + "learning_rate": 3.4946284425451094e-05, + "loss": 0.0178, + "step": 41080 + }, + { + "epoch": 0.3037314094793176, + "grad_norm": 0.09202392399311066, + "learning_rate": 3.494257478632479e-05, + "loss": 0.0202, + "step": 41090 + }, + { + "epoch": 0.30380532805061944, + "grad_norm": 0.10223499685525894, + "learning_rate": 3.493886514719848e-05, + "loss": 0.018, + "step": 41100 + }, + { + "epoch": 0.3038792466219213, + "grad_norm": 0.08392112702131271, + "learning_rate": 3.4935155508072175e-05, + "loss": 
0.0211, + "step": 41110 + }, + { + "epoch": 0.30395316519322313, + "grad_norm": 0.10613808035850525, + "learning_rate": 3.493144586894587e-05, + "loss": 0.0208, + "step": 41120 + }, + { + "epoch": 0.304027083764525, + "grad_norm": 0.07967979460954666, + "learning_rate": 3.492773622981956e-05, + "loss": 0.0183, + "step": 41130 + }, + { + "epoch": 0.3041010023358269, + "grad_norm": 0.09585115313529968, + "learning_rate": 3.4924026590693256e-05, + "loss": 0.0173, + "step": 41140 + }, + { + "epoch": 0.3041749209071287, + "grad_norm": 0.09077829867601395, + "learning_rate": 3.492031695156696e-05, + "loss": 0.0191, + "step": 41150 + }, + { + "epoch": 0.30424883947843057, + "grad_norm": 0.07524742186069489, + "learning_rate": 3.491660731244065e-05, + "loss": 0.0161, + "step": 41160 + }, + { + "epoch": 0.3043227580497324, + "grad_norm": 0.09042250365018845, + "learning_rate": 3.4912897673314344e-05, + "loss": 0.0212, + "step": 41170 + }, + { + "epoch": 0.30439667662103426, + "grad_norm": 0.10119567066431046, + "learning_rate": 3.4909188034188034e-05, + "loss": 0.0219, + "step": 41180 + }, + { + "epoch": 0.3044705951923361, + "grad_norm": 0.09681563824415207, + "learning_rate": 3.490547839506173e-05, + "loss": 0.0183, + "step": 41190 + }, + { + "epoch": 0.304544513763638, + "grad_norm": 0.10629399120807648, + "learning_rate": 3.4901768755935425e-05, + "loss": 0.0167, + "step": 41200 + }, + { + "epoch": 0.30461843233493985, + "grad_norm": 0.08098754286766052, + "learning_rate": 3.4898059116809115e-05, + "loss": 0.0213, + "step": 41210 + }, + { + "epoch": 0.3046923509062417, + "grad_norm": 0.09643225371837616, + "learning_rate": 3.489434947768281e-05, + "loss": 0.0211, + "step": 41220 + }, + { + "epoch": 0.30476626947754354, + "grad_norm": 0.09403149038553238, + "learning_rate": 3.4890639838556507e-05, + "loss": 0.0193, + "step": 41230 + }, + { + "epoch": 0.3048401880488454, + "grad_norm": 0.06702283024787903, + "learning_rate": 3.48869301994302e-05, + "loss": 0.0171, + "step": 41240 + }, + { + "epoch": 0.30491410662014723, + "grad_norm": 0.08411083370447159, + "learning_rate": 3.48832205603039e-05, + "loss": 0.021, + "step": 41250 + }, + { + "epoch": 0.3049880251914491, + "grad_norm": 0.08013994246721268, + "learning_rate": 3.487951092117759e-05, + "loss": 0.0192, + "step": 41260 + }, + { + "epoch": 0.305061943762751, + "grad_norm": 0.08897579461336136, + "learning_rate": 3.4875801282051284e-05, + "loss": 0.0197, + "step": 41270 + }, + { + "epoch": 0.3051358623340528, + "grad_norm": 0.08514079451560974, + "learning_rate": 3.487209164292497e-05, + "loss": 0.0197, + "step": 41280 + }, + { + "epoch": 0.30520978090535467, + "grad_norm": 0.09539547562599182, + "learning_rate": 3.486838200379867e-05, + "loss": 0.0178, + "step": 41290 + }, + { + "epoch": 0.3052836994766565, + "grad_norm": 0.086562380194664, + "learning_rate": 3.486467236467237e-05, + "loss": 0.0165, + "step": 41300 + }, + { + "epoch": 0.30535761804795836, + "grad_norm": 0.13276004791259766, + "learning_rate": 3.486096272554606e-05, + "loss": 0.0169, + "step": 41310 + }, + { + "epoch": 0.3054315366192602, + "grad_norm": 0.11075304448604584, + "learning_rate": 3.485725308641976e-05, + "loss": 0.0191, + "step": 41320 + }, + { + "epoch": 0.3055054551905621, + "grad_norm": 0.102408267557621, + "learning_rate": 3.4853543447293446e-05, + "loss": 0.0188, + "step": 41330 + }, + { + "epoch": 0.30557937376186395, + "grad_norm": 0.11963898688554764, + "learning_rate": 3.484983380816714e-05, + "loss": 0.0199, + "step": 41340 + }, + { + "epoch": 
0.3056532923331658, + "grad_norm": 0.10946375876665115, + "learning_rate": 3.484612416904084e-05, + "loss": 0.0193, + "step": 41350 + }, + { + "epoch": 0.30572721090446764, + "grad_norm": 0.10319077968597412, + "learning_rate": 3.484241452991453e-05, + "loss": 0.0178, + "step": 41360 + }, + { + "epoch": 0.3058011294757695, + "grad_norm": 0.09708920121192932, + "learning_rate": 3.483870489078822e-05, + "loss": 0.0172, + "step": 41370 + }, + { + "epoch": 0.30587504804707133, + "grad_norm": 0.06438388675451279, + "learning_rate": 3.4834995251661926e-05, + "loss": 0.0181, + "step": 41380 + }, + { + "epoch": 0.3059489666183732, + "grad_norm": 0.09373542666435242, + "learning_rate": 3.4831285612535615e-05, + "loss": 0.019, + "step": 41390 + }, + { + "epoch": 0.3060228851896751, + "grad_norm": 0.07884660363197327, + "learning_rate": 3.482757597340931e-05, + "loss": 0.0173, + "step": 41400 + }, + { + "epoch": 0.3060968037609769, + "grad_norm": 0.10232368856668472, + "learning_rate": 3.4823866334283e-05, + "loss": 0.02, + "step": 41410 + }, + { + "epoch": 0.30617072233227877, + "grad_norm": 0.08321083337068558, + "learning_rate": 3.4820156695156696e-05, + "loss": 0.0161, + "step": 41420 + }, + { + "epoch": 0.3062446409035806, + "grad_norm": 0.09922542423009872, + "learning_rate": 3.481644705603039e-05, + "loss": 0.0178, + "step": 41430 + }, + { + "epoch": 0.30631855947488246, + "grad_norm": 0.08980782330036163, + "learning_rate": 3.481273741690408e-05, + "loss": 0.0188, + "step": 41440 + }, + { + "epoch": 0.3063924780461843, + "grad_norm": 0.06885354965925217, + "learning_rate": 3.4809027777777784e-05, + "loss": 0.0186, + "step": 41450 + }, + { + "epoch": 0.3064663966174862, + "grad_norm": 0.11679676920175552, + "learning_rate": 3.480531813865147e-05, + "loss": 0.0178, + "step": 41460 + }, + { + "epoch": 0.30654031518878805, + "grad_norm": 0.09281063824892044, + "learning_rate": 3.480160849952517e-05, + "loss": 0.0199, + "step": 41470 + }, + { + "epoch": 0.3066142337600899, + "grad_norm": 0.06953068822622299, + "learning_rate": 3.4797898860398865e-05, + "loss": 0.0205, + "step": 41480 + }, + { + "epoch": 0.30668815233139174, + "grad_norm": 0.09276734292507172, + "learning_rate": 3.4794189221272554e-05, + "loss": 0.0194, + "step": 41490 + }, + { + "epoch": 0.3067620709026936, + "grad_norm": 0.08479952067136765, + "learning_rate": 3.479047958214625e-05, + "loss": 0.0178, + "step": 41500 + }, + { + "epoch": 0.30683598947399543, + "grad_norm": 0.07465128600597382, + "learning_rate": 3.478676994301994e-05, + "loss": 0.0187, + "step": 41510 + }, + { + "epoch": 0.3069099080452973, + "grad_norm": 0.11329672485589981, + "learning_rate": 3.4783060303893635e-05, + "loss": 0.0181, + "step": 41520 + }, + { + "epoch": 0.3069838266165992, + "grad_norm": 0.06531966477632523, + "learning_rate": 3.477935066476734e-05, + "loss": 0.019, + "step": 41530 + }, + { + "epoch": 0.307057745187901, + "grad_norm": 0.09387999027967453, + "learning_rate": 3.477564102564103e-05, + "loss": 0.0192, + "step": 41540 + }, + { + "epoch": 0.30713166375920287, + "grad_norm": 0.06443177163600922, + "learning_rate": 3.477193138651472e-05, + "loss": 0.0194, + "step": 41550 + }, + { + "epoch": 0.3072055823305047, + "grad_norm": 0.10215765982866287, + "learning_rate": 3.476822174738841e-05, + "loss": 0.0194, + "step": 41560 + }, + { + "epoch": 0.30727950090180656, + "grad_norm": 0.09269960969686508, + "learning_rate": 3.476451210826211e-05, + "loss": 0.0199, + "step": 41570 + }, + { + "epoch": 0.3073534194731084, + "grad_norm": 
0.08733227849006653, + "learning_rate": 3.4760802469135804e-05, + "loss": 0.0182, + "step": 41580 + }, + { + "epoch": 0.3074273380444103, + "grad_norm": 0.08594011515378952, + "learning_rate": 3.4757092830009494e-05, + "loss": 0.02, + "step": 41590 + }, + { + "epoch": 0.30750125661571215, + "grad_norm": 0.07571780681610107, + "learning_rate": 3.4753383190883196e-05, + "loss": 0.0209, + "step": 41600 + }, + { + "epoch": 0.307575175187014, + "grad_norm": 0.07805407047271729, + "learning_rate": 3.474967355175689e-05, + "loss": 0.0199, + "step": 41610 + }, + { + "epoch": 0.30764909375831584, + "grad_norm": 0.07116065174341202, + "learning_rate": 3.474596391263058e-05, + "loss": 0.0166, + "step": 41620 + }, + { + "epoch": 0.3077230123296177, + "grad_norm": 0.07087753713130951, + "learning_rate": 3.474225427350428e-05, + "loss": 0.0215, + "step": 41630 + }, + { + "epoch": 0.30779693090091953, + "grad_norm": 0.09777181595563889, + "learning_rate": 3.473854463437797e-05, + "loss": 0.0177, + "step": 41640 + }, + { + "epoch": 0.3078708494722214, + "grad_norm": 0.07996222376823425, + "learning_rate": 3.473483499525166e-05, + "loss": 0.0201, + "step": 41650 + }, + { + "epoch": 0.3079447680435233, + "grad_norm": 0.08959174901247025, + "learning_rate": 3.473112535612536e-05, + "loss": 0.0205, + "step": 41660 + }, + { + "epoch": 0.3080186866148251, + "grad_norm": 0.11086247861385345, + "learning_rate": 3.472741571699905e-05, + "loss": 0.0192, + "step": 41670 + }, + { + "epoch": 0.30809260518612697, + "grad_norm": 0.08845746517181396, + "learning_rate": 3.472370607787275e-05, + "loss": 0.0188, + "step": 41680 + }, + { + "epoch": 0.3081665237574288, + "grad_norm": 0.08229666203260422, + "learning_rate": 3.471999643874644e-05, + "loss": 0.0191, + "step": 41690 + }, + { + "epoch": 0.30824044232873066, + "grad_norm": 0.10134036093950272, + "learning_rate": 3.4716286799620136e-05, + "loss": 0.0199, + "step": 41700 + }, + { + "epoch": 0.3083143609000325, + "grad_norm": 0.09389904886484146, + "learning_rate": 3.471257716049383e-05, + "loss": 0.0169, + "step": 41710 + }, + { + "epoch": 0.3083882794713344, + "grad_norm": 0.1407119482755661, + "learning_rate": 3.470886752136752e-05, + "loss": 0.0218, + "step": 41720 + }, + { + "epoch": 0.30846219804263625, + "grad_norm": 0.08079833537340164, + "learning_rate": 3.470515788224122e-05, + "loss": 0.0166, + "step": 41730 + }, + { + "epoch": 0.3085361166139381, + "grad_norm": 0.07268788665533066, + "learning_rate": 3.4701448243114906e-05, + "loss": 0.0193, + "step": 41740 + }, + { + "epoch": 0.30861003518523994, + "grad_norm": 0.08180305361747742, + "learning_rate": 3.469773860398861e-05, + "loss": 0.0193, + "step": 41750 + }, + { + "epoch": 0.3086839537565418, + "grad_norm": 0.0704931765794754, + "learning_rate": 3.4694028964862305e-05, + "loss": 0.0174, + "step": 41760 + }, + { + "epoch": 0.30875787232784363, + "grad_norm": 0.12671099603176117, + "learning_rate": 3.4690319325735994e-05, + "loss": 0.0169, + "step": 41770 + }, + { + "epoch": 0.3088317908991455, + "grad_norm": 0.07578154653310776, + "learning_rate": 3.468660968660969e-05, + "loss": 0.016, + "step": 41780 + }, + { + "epoch": 0.3089057094704474, + "grad_norm": 0.05910816416144371, + "learning_rate": 3.468290004748338e-05, + "loss": 0.0177, + "step": 41790 + }, + { + "epoch": 0.3089796280417492, + "grad_norm": 0.1280171275138855, + "learning_rate": 3.4679190408357075e-05, + "loss": 0.0186, + "step": 41800 + }, + { + "epoch": 0.30905354661305107, + "grad_norm": 0.07991659641265869, + "learning_rate": 
3.467548076923077e-05, + "loss": 0.0178, + "step": 41810 + }, + { + "epoch": 0.3091274651843529, + "grad_norm": 0.10132498294115067, + "learning_rate": 3.467177113010446e-05, + "loss": 0.0168, + "step": 41820 + }, + { + "epoch": 0.30920138375565476, + "grad_norm": 0.08080621808767319, + "learning_rate": 3.466806149097816e-05, + "loss": 0.0168, + "step": 41830 + }, + { + "epoch": 0.3092753023269566, + "grad_norm": 0.08449237048625946, + "learning_rate": 3.466435185185186e-05, + "loss": 0.0185, + "step": 41840 + }, + { + "epoch": 0.3093492208982585, + "grad_norm": 0.08588672429323196, + "learning_rate": 3.466064221272555e-05, + "loss": 0.0179, + "step": 41850 + }, + { + "epoch": 0.30942313946956035, + "grad_norm": 0.08748660981655121, + "learning_rate": 3.4656932573599244e-05, + "loss": 0.0216, + "step": 41860 + }, + { + "epoch": 0.3094970580408622, + "grad_norm": 0.0776829645037651, + "learning_rate": 3.465322293447293e-05, + "loss": 0.0189, + "step": 41870 + }, + { + "epoch": 0.30957097661216404, + "grad_norm": 0.05737544223666191, + "learning_rate": 3.464951329534663e-05, + "loss": 0.0193, + "step": 41880 + }, + { + "epoch": 0.3096448951834659, + "grad_norm": 0.08774904906749725, + "learning_rate": 3.4645803656220325e-05, + "loss": 0.0207, + "step": 41890 + }, + { + "epoch": 0.30971881375476773, + "grad_norm": 0.10542219877243042, + "learning_rate": 3.4642094017094014e-05, + "loss": 0.0182, + "step": 41900 + }, + { + "epoch": 0.3097927323260696, + "grad_norm": 0.08650589734315872, + "learning_rate": 3.463838437796772e-05, + "loss": 0.0195, + "step": 41910 + }, + { + "epoch": 0.3098666508973715, + "grad_norm": 0.07177089154720306, + "learning_rate": 3.4634674738841406e-05, + "loss": 0.0187, + "step": 41920 + }, + { + "epoch": 0.3099405694686733, + "grad_norm": 0.09856868535280228, + "learning_rate": 3.46309650997151e-05, + "loss": 0.0192, + "step": 41930 + }, + { + "epoch": 0.31001448803997517, + "grad_norm": 0.09403856843709946, + "learning_rate": 3.46272554605888e-05, + "loss": 0.0198, + "step": 41940 + }, + { + "epoch": 0.310088406611277, + "grad_norm": 0.09185963124036789, + "learning_rate": 3.462354582146249e-05, + "loss": 0.0191, + "step": 41950 + }, + { + "epoch": 0.31016232518257886, + "grad_norm": 0.08827266097068787, + "learning_rate": 3.4619836182336183e-05, + "loss": 0.0171, + "step": 41960 + }, + { + "epoch": 0.3102362437538807, + "grad_norm": 0.10931427776813507, + "learning_rate": 3.461612654320987e-05, + "loss": 0.0182, + "step": 41970 + }, + { + "epoch": 0.3103101623251826, + "grad_norm": 0.08554702252149582, + "learning_rate": 3.4612416904083575e-05, + "loss": 0.017, + "step": 41980 + }, + { + "epoch": 0.31038408089648445, + "grad_norm": 0.0926201343536377, + "learning_rate": 3.460870726495727e-05, + "loss": 0.02, + "step": 41990 + }, + { + "epoch": 0.3104579994677863, + "grad_norm": 0.08251968026161194, + "learning_rate": 3.460499762583096e-05, + "loss": 0.021, + "step": 42000 + }, + { + "epoch": 0.31053191803908814, + "grad_norm": 0.1010434478521347, + "learning_rate": 3.4601287986704657e-05, + "loss": 0.0203, + "step": 42010 + }, + { + "epoch": 0.31060583661039, + "grad_norm": 0.07276196032762527, + "learning_rate": 3.4597578347578346e-05, + "loss": 0.0184, + "step": 42020 + }, + { + "epoch": 0.31067975518169183, + "grad_norm": 0.08228261023759842, + "learning_rate": 3.459386870845204e-05, + "loss": 0.0182, + "step": 42030 + }, + { + "epoch": 0.3107536737529937, + "grad_norm": 0.07231447100639343, + "learning_rate": 3.459015906932574e-05, + "loss": 0.0183, + "step": 
42040 + }, + { + "epoch": 0.3108275923242956, + "grad_norm": 0.08942629396915436, + "learning_rate": 3.458644943019943e-05, + "loss": 0.0182, + "step": 42050 + }, + { + "epoch": 0.3109015108955974, + "grad_norm": 0.07905402779579163, + "learning_rate": 3.458273979107313e-05, + "loss": 0.0201, + "step": 42060 + }, + { + "epoch": 0.31097542946689927, + "grad_norm": 0.08255128562450409, + "learning_rate": 3.4579030151946826e-05, + "loss": 0.016, + "step": 42070 + }, + { + "epoch": 0.3110493480382011, + "grad_norm": 0.07505086809396744, + "learning_rate": 3.4575320512820515e-05, + "loss": 0.0213, + "step": 42080 + }, + { + "epoch": 0.31112326660950296, + "grad_norm": 0.07689981907606125, + "learning_rate": 3.457161087369421e-05, + "loss": 0.0186, + "step": 42090 + }, + { + "epoch": 0.3111971851808048, + "grad_norm": 0.08315929025411606, + "learning_rate": 3.45679012345679e-05, + "loss": 0.0177, + "step": 42100 + }, + { + "epoch": 0.3112711037521067, + "grad_norm": 0.09151861816644669, + "learning_rate": 3.4564191595441596e-05, + "loss": 0.0172, + "step": 42110 + }, + { + "epoch": 0.31134502232340855, + "grad_norm": 0.11061935126781464, + "learning_rate": 3.456048195631529e-05, + "loss": 0.0223, + "step": 42120 + }, + { + "epoch": 0.3114189408947104, + "grad_norm": 0.061550162732601166, + "learning_rate": 3.455677231718899e-05, + "loss": 0.0176, + "step": 42130 + }, + { + "epoch": 0.31149285946601224, + "grad_norm": 0.11352083832025528, + "learning_rate": 3.4553062678062684e-05, + "loss": 0.0201, + "step": 42140 + }, + { + "epoch": 0.3115667780373141, + "grad_norm": 0.12292362749576569, + "learning_rate": 3.454935303893637e-05, + "loss": 0.0197, + "step": 42150 + }, + { + "epoch": 0.31164069660861593, + "grad_norm": 0.09067011624574661, + "learning_rate": 3.454564339981007e-05, + "loss": 0.0181, + "step": 42160 + }, + { + "epoch": 0.3117146151799178, + "grad_norm": 0.08751238137483597, + "learning_rate": 3.4541933760683765e-05, + "loss": 0.0175, + "step": 42170 + }, + { + "epoch": 0.3117885337512197, + "grad_norm": 0.11258953809738159, + "learning_rate": 3.4538224121557454e-05, + "loss": 0.0208, + "step": 42180 + }, + { + "epoch": 0.3118624523225215, + "grad_norm": 0.06748968362808228, + "learning_rate": 3.453451448243115e-05, + "loss": 0.0176, + "step": 42190 + }, + { + "epoch": 0.31193637089382337, + "grad_norm": 0.0773598849773407, + "learning_rate": 3.453080484330484e-05, + "loss": 0.0198, + "step": 42200 + }, + { + "epoch": 0.3120102894651252, + "grad_norm": 0.09891007095575333, + "learning_rate": 3.452709520417854e-05, + "loss": 0.0183, + "step": 42210 + }, + { + "epoch": 0.31208420803642706, + "grad_norm": 0.06439699977636337, + "learning_rate": 3.452338556505224e-05, + "loss": 0.0182, + "step": 42220 + }, + { + "epoch": 0.3121581266077289, + "grad_norm": 0.09395837038755417, + "learning_rate": 3.451967592592593e-05, + "loss": 0.0199, + "step": 42230 + }, + { + "epoch": 0.3122320451790308, + "grad_norm": 0.06609996408224106, + "learning_rate": 3.451596628679962e-05, + "loss": 0.0182, + "step": 42240 + }, + { + "epoch": 0.31230596375033265, + "grad_norm": 0.13297858834266663, + "learning_rate": 3.451225664767331e-05, + "loss": 0.0208, + "step": 42250 + }, + { + "epoch": 0.3123798823216345, + "grad_norm": 0.07555203139781952, + "learning_rate": 3.450854700854701e-05, + "loss": 0.017, + "step": 42260 + }, + { + "epoch": 0.31245380089293634, + "grad_norm": 0.09166613966226578, + "learning_rate": 3.4504837369420704e-05, + "loss": 0.0201, + "step": 42270 + }, + { + "epoch": 0.3125277194642382, 
+ "grad_norm": 0.08490381389856339, + "learning_rate": 3.45011277302944e-05, + "loss": 0.0175, + "step": 42280 + }, + { + "epoch": 0.31260163803554003, + "grad_norm": 0.09918981790542603, + "learning_rate": 3.4497418091168096e-05, + "loss": 0.0193, + "step": 42290 + }, + { + "epoch": 0.3126755566068419, + "grad_norm": 0.09711208939552307, + "learning_rate": 3.449370845204179e-05, + "loss": 0.022, + "step": 42300 + }, + { + "epoch": 0.3127494751781438, + "grad_norm": 0.08405983448028564, + "learning_rate": 3.448999881291548e-05, + "loss": 0.0179, + "step": 42310 + }, + { + "epoch": 0.3128233937494456, + "grad_norm": 0.0851854532957077, + "learning_rate": 3.448628917378918e-05, + "loss": 0.0159, + "step": 42320 + }, + { + "epoch": 0.31289731232074747, + "grad_norm": 0.09262275695800781, + "learning_rate": 3.4482579534662866e-05, + "loss": 0.0186, + "step": 42330 + }, + { + "epoch": 0.3129712308920493, + "grad_norm": 0.08604475110769272, + "learning_rate": 3.447886989553656e-05, + "loss": 0.0219, + "step": 42340 + }, + { + "epoch": 0.31304514946335116, + "grad_norm": 0.10185009241104126, + "learning_rate": 3.447516025641026e-05, + "loss": 0.0192, + "step": 42350 + }, + { + "epoch": 0.313119068034653, + "grad_norm": 0.10257411748170853, + "learning_rate": 3.4471450617283954e-05, + "loss": 0.0196, + "step": 42360 + }, + { + "epoch": 0.3131929866059549, + "grad_norm": 0.07268723100423813, + "learning_rate": 3.446774097815765e-05, + "loss": 0.0179, + "step": 42370 + }, + { + "epoch": 0.31326690517725675, + "grad_norm": 0.07031035423278809, + "learning_rate": 3.446403133903134e-05, + "loss": 0.0182, + "step": 42380 + }, + { + "epoch": 0.3133408237485586, + "grad_norm": 0.08224672079086304, + "learning_rate": 3.4460321699905035e-05, + "loss": 0.0175, + "step": 42390 + }, + { + "epoch": 0.31341474231986044, + "grad_norm": 0.10886122286319733, + "learning_rate": 3.445661206077873e-05, + "loss": 0.0185, + "step": 42400 + }, + { + "epoch": 0.3134886608911623, + "grad_norm": 0.10665310174226761, + "learning_rate": 3.445290242165242e-05, + "loss": 0.0197, + "step": 42410 + }, + { + "epoch": 0.31356257946246413, + "grad_norm": 0.08749476820230484, + "learning_rate": 3.444919278252612e-05, + "loss": 0.0183, + "step": 42420 + }, + { + "epoch": 0.313636498033766, + "grad_norm": 0.08360803872346878, + "learning_rate": 3.444548314339981e-05, + "loss": 0.0192, + "step": 42430 + }, + { + "epoch": 0.3137104166050679, + "grad_norm": 0.07620719075202942, + "learning_rate": 3.444177350427351e-05, + "loss": 0.0173, + "step": 42440 + }, + { + "epoch": 0.3137843351763697, + "grad_norm": 0.08754304051399231, + "learning_rate": 3.4438063865147205e-05, + "loss": 0.0189, + "step": 42450 + }, + { + "epoch": 0.31385825374767157, + "grad_norm": 0.09048190712928772, + "learning_rate": 3.4434354226020894e-05, + "loss": 0.0195, + "step": 42460 + }, + { + "epoch": 0.3139321723189734, + "grad_norm": 0.08618180453777313, + "learning_rate": 3.443064458689459e-05, + "loss": 0.0193, + "step": 42470 + }, + { + "epoch": 0.31400609089027526, + "grad_norm": 0.11144917458295822, + "learning_rate": 3.442693494776828e-05, + "loss": 0.0202, + "step": 42480 + }, + { + "epoch": 0.3140800094615771, + "grad_norm": 0.10908270627260208, + "learning_rate": 3.4423225308641975e-05, + "loss": 0.0192, + "step": 42490 + }, + { + "epoch": 0.314153928032879, + "grad_norm": 0.09473959356546402, + "learning_rate": 3.441951566951567e-05, + "loss": 0.0179, + "step": 42500 + }, + { + "epoch": 0.31422784660418085, + "grad_norm": 0.11607281863689423, + 
"learning_rate": 3.441580603038937e-05, + "loss": 0.0172, + "step": 42510 + }, + { + "epoch": 0.3143017651754827, + "grad_norm": 0.0758347287774086, + "learning_rate": 3.441209639126306e-05, + "loss": 0.0177, + "step": 42520 + }, + { + "epoch": 0.31437568374678454, + "grad_norm": 0.11561296880245209, + "learning_rate": 3.440838675213676e-05, + "loss": 0.0186, + "step": 42530 + }, + { + "epoch": 0.3144496023180864, + "grad_norm": 0.09926974773406982, + "learning_rate": 3.440467711301045e-05, + "loss": 0.0176, + "step": 42540 + }, + { + "epoch": 0.31452352088938823, + "grad_norm": 0.06947681307792664, + "learning_rate": 3.4400967473884144e-05, + "loss": 0.0181, + "step": 42550 + }, + { + "epoch": 0.31459743946069013, + "grad_norm": 0.08658704906702042, + "learning_rate": 3.439725783475783e-05, + "loss": 0.0196, + "step": 42560 + }, + { + "epoch": 0.314671358031992, + "grad_norm": 0.09495535492897034, + "learning_rate": 3.439354819563153e-05, + "loss": 0.0183, + "step": 42570 + }, + { + "epoch": 0.3147452766032938, + "grad_norm": 0.07387121021747589, + "learning_rate": 3.4389838556505225e-05, + "loss": 0.0173, + "step": 42580 + }, + { + "epoch": 0.31481919517459567, + "grad_norm": 0.08743169158697128, + "learning_rate": 3.438612891737892e-05, + "loss": 0.0203, + "step": 42590 + }, + { + "epoch": 0.3148931137458975, + "grad_norm": 0.06493943184614182, + "learning_rate": 3.438241927825262e-05, + "loss": 0.0181, + "step": 42600 + }, + { + "epoch": 0.31496703231719936, + "grad_norm": 0.09590958058834076, + "learning_rate": 3.4378709639126306e-05, + "loss": 0.0191, + "step": 42610 + }, + { + "epoch": 0.3150409508885012, + "grad_norm": 0.11012791097164154, + "learning_rate": 3.4375e-05, + "loss": 0.0212, + "step": 42620 + }, + { + "epoch": 0.3151148694598031, + "grad_norm": 0.09544768929481506, + "learning_rate": 3.43712903608737e-05, + "loss": 0.0197, + "step": 42630 + }, + { + "epoch": 0.31518878803110495, + "grad_norm": 0.09332182258367538, + "learning_rate": 3.436758072174739e-05, + "loss": 0.0205, + "step": 42640 + }, + { + "epoch": 0.3152627066024068, + "grad_norm": 0.0923343226313591, + "learning_rate": 3.436387108262108e-05, + "loss": 0.0183, + "step": 42650 + }, + { + "epoch": 0.31533662517370864, + "grad_norm": 0.0833817794919014, + "learning_rate": 3.436016144349478e-05, + "loss": 0.0217, + "step": 42660 + }, + { + "epoch": 0.3154105437450105, + "grad_norm": 0.08704043924808502, + "learning_rate": 3.4356451804368475e-05, + "loss": 0.02, + "step": 42670 + }, + { + "epoch": 0.31548446231631233, + "grad_norm": 0.0813233032822609, + "learning_rate": 3.435274216524217e-05, + "loss": 0.0198, + "step": 42680 + }, + { + "epoch": 0.31555838088761423, + "grad_norm": 0.08583715558052063, + "learning_rate": 3.434903252611586e-05, + "loss": 0.0181, + "step": 42690 + }, + { + "epoch": 0.3156322994589161, + "grad_norm": 0.06066381186246872, + "learning_rate": 3.4345322886989556e-05, + "loss": 0.0191, + "step": 42700 + }, + { + "epoch": 0.3157062180302179, + "grad_norm": 0.10361328721046448, + "learning_rate": 3.4341613247863245e-05, + "loss": 0.0198, + "step": 42710 + }, + { + "epoch": 0.31578013660151977, + "grad_norm": 0.09133805334568024, + "learning_rate": 3.433790360873694e-05, + "loss": 0.0206, + "step": 42720 + }, + { + "epoch": 0.3158540551728216, + "grad_norm": 0.09569969773292542, + "learning_rate": 3.433419396961064e-05, + "loss": 0.0211, + "step": 42730 + }, + { + "epoch": 0.31592797374412346, + "grad_norm": 0.0903606042265892, + "learning_rate": 3.433048433048433e-05, + "loss": 0.0195, + 
"step": 42740 + }, + { + "epoch": 0.3160018923154253, + "grad_norm": 0.06401552259922028, + "learning_rate": 3.432677469135803e-05, + "loss": 0.0175, + "step": 42750 + }, + { + "epoch": 0.3160758108867272, + "grad_norm": 0.11987832933664322, + "learning_rate": 3.4323065052231725e-05, + "loss": 0.0171, + "step": 42760 + }, + { + "epoch": 0.31614972945802905, + "grad_norm": 0.10897868871688843, + "learning_rate": 3.4319355413105414e-05, + "loss": 0.0185, + "step": 42770 + }, + { + "epoch": 0.3162236480293309, + "grad_norm": 0.15359020233154297, + "learning_rate": 3.431564577397911e-05, + "loss": 0.0183, + "step": 42780 + }, + { + "epoch": 0.31629756660063274, + "grad_norm": 0.06444056332111359, + "learning_rate": 3.43119361348528e-05, + "loss": 0.0184, + "step": 42790 + }, + { + "epoch": 0.3163714851719346, + "grad_norm": 0.10911993682384491, + "learning_rate": 3.4308226495726496e-05, + "loss": 0.0175, + "step": 42800 + }, + { + "epoch": 0.31644540374323643, + "grad_norm": 0.07998649030923843, + "learning_rate": 3.430451685660019e-05, + "loss": 0.0189, + "step": 42810 + }, + { + "epoch": 0.31651932231453833, + "grad_norm": 0.09834294766187668, + "learning_rate": 3.430080721747389e-05, + "loss": 0.0177, + "step": 42820 + }, + { + "epoch": 0.3165932408858402, + "grad_norm": 0.08719082176685333, + "learning_rate": 3.4297097578347583e-05, + "loss": 0.0195, + "step": 42830 + }, + { + "epoch": 0.316667159457142, + "grad_norm": 0.12308837473392487, + "learning_rate": 3.429338793922127e-05, + "loss": 0.0178, + "step": 42840 + }, + { + "epoch": 0.31674107802844387, + "grad_norm": 0.11948662251234055, + "learning_rate": 3.428967830009497e-05, + "loss": 0.0175, + "step": 42850 + }, + { + "epoch": 0.3168149965997457, + "grad_norm": 0.07762668281793594, + "learning_rate": 3.4285968660968665e-05, + "loss": 0.0192, + "step": 42860 + }, + { + "epoch": 0.31688891517104756, + "grad_norm": 0.09460771828889847, + "learning_rate": 3.4282259021842354e-05, + "loss": 0.0186, + "step": 42870 + }, + { + "epoch": 0.3169628337423494, + "grad_norm": 0.07193564623594284, + "learning_rate": 3.427854938271605e-05, + "loss": 0.0189, + "step": 42880 + }, + { + "epoch": 0.3170367523136513, + "grad_norm": 0.09276462346315384, + "learning_rate": 3.4274839743589746e-05, + "loss": 0.0175, + "step": 42890 + }, + { + "epoch": 0.31711067088495315, + "grad_norm": 0.0885123535990715, + "learning_rate": 3.427113010446344e-05, + "loss": 0.0195, + "step": 42900 + }, + { + "epoch": 0.317184589456255, + "grad_norm": 0.09436597675085068, + "learning_rate": 3.426742046533714e-05, + "loss": 0.0188, + "step": 42910 + }, + { + "epoch": 0.31725850802755684, + "grad_norm": 0.07258325815200806, + "learning_rate": 3.426371082621083e-05, + "loss": 0.0178, + "step": 42920 + }, + { + "epoch": 0.3173324265988587, + "grad_norm": 0.06716993451118469, + "learning_rate": 3.426000118708452e-05, + "loss": 0.0203, + "step": 42930 + }, + { + "epoch": 0.31740634517016053, + "grad_norm": 0.08646200597286224, + "learning_rate": 3.425629154795821e-05, + "loss": 0.0198, + "step": 42940 + }, + { + "epoch": 0.31748026374146243, + "grad_norm": 0.06386014074087143, + "learning_rate": 3.425258190883191e-05, + "loss": 0.017, + "step": 42950 + }, + { + "epoch": 0.3175541823127643, + "grad_norm": 0.06515224277973175, + "learning_rate": 3.4248872269705604e-05, + "loss": 0.0179, + "step": 42960 + }, + { + "epoch": 0.3176281008840661, + "grad_norm": 0.08746074140071869, + "learning_rate": 3.42451626305793e-05, + "loss": 0.0181, + "step": 42970 + }, + { + "epoch": 
0.31770201945536797, + "grad_norm": 0.08505991101264954, + "learning_rate": 3.4241452991452996e-05, + "loss": 0.0207, + "step": 42980 + }, + { + "epoch": 0.3177759380266698, + "grad_norm": 0.08368141204118729, + "learning_rate": 3.423774335232669e-05, + "loss": 0.0183, + "step": 42990 + }, + { + "epoch": 0.31784985659797166, + "grad_norm": 0.09414701908826828, + "learning_rate": 3.423403371320038e-05, + "loss": 0.0212, + "step": 43000 + }, + { + "epoch": 0.3179237751692735, + "grad_norm": 0.09063467383384705, + "learning_rate": 3.423032407407408e-05, + "loss": 0.0183, + "step": 43010 + }, + { + "epoch": 0.3179976937405754, + "grad_norm": 0.07970460504293442, + "learning_rate": 3.4226614434947766e-05, + "loss": 0.0227, + "step": 43020 + }, + { + "epoch": 0.31807161231187725, + "grad_norm": 0.09081530570983887, + "learning_rate": 3.422290479582146e-05, + "loss": 0.0208, + "step": 43030 + }, + { + "epoch": 0.3181455308831791, + "grad_norm": 0.08632205426692963, + "learning_rate": 3.421919515669516e-05, + "loss": 0.018, + "step": 43040 + }, + { + "epoch": 0.31821944945448094, + "grad_norm": 0.06210676580667496, + "learning_rate": 3.4215485517568854e-05, + "loss": 0.0173, + "step": 43050 + }, + { + "epoch": 0.3182933680257828, + "grad_norm": 0.09494701772928238, + "learning_rate": 3.421177587844255e-05, + "loss": 0.0188, + "step": 43060 + }, + { + "epoch": 0.31836728659708463, + "grad_norm": 0.11246080696582794, + "learning_rate": 3.420806623931624e-05, + "loss": 0.0165, + "step": 43070 + }, + { + "epoch": 0.31844120516838653, + "grad_norm": 0.11079894006252289, + "learning_rate": 3.4204356600189935e-05, + "loss": 0.0204, + "step": 43080 + }, + { + "epoch": 0.3185151237396884, + "grad_norm": 0.30354100465774536, + "learning_rate": 3.420064696106363e-05, + "loss": 0.0207, + "step": 43090 + }, + { + "epoch": 0.3185890423109902, + "grad_norm": 0.11234261095523834, + "learning_rate": 3.419693732193732e-05, + "loss": 0.0185, + "step": 43100 + }, + { + "epoch": 0.31866296088229207, + "grad_norm": 0.09679894149303436, + "learning_rate": 3.4193227682811016e-05, + "loss": 0.0205, + "step": 43110 + }, + { + "epoch": 0.3187368794535939, + "grad_norm": 0.10255374014377594, + "learning_rate": 3.418951804368471e-05, + "loss": 0.019, + "step": 43120 + }, + { + "epoch": 0.31881079802489576, + "grad_norm": 0.09295099973678589, + "learning_rate": 3.418580840455841e-05, + "loss": 0.0179, + "step": 43130 + }, + { + "epoch": 0.3188847165961976, + "grad_norm": 0.06588831543922424, + "learning_rate": 3.4182098765432104e-05, + "loss": 0.0185, + "step": 43140 + }, + { + "epoch": 0.3189586351674995, + "grad_norm": 0.0991397351026535, + "learning_rate": 3.4178389126305793e-05, + "loss": 0.0199, + "step": 43150 + }, + { + "epoch": 0.31903255373880135, + "grad_norm": 0.10081097483634949, + "learning_rate": 3.417467948717949e-05, + "loss": 0.0228, + "step": 43160 + }, + { + "epoch": 0.3191064723101032, + "grad_norm": 0.09069965034723282, + "learning_rate": 3.417096984805318e-05, + "loss": 0.0166, + "step": 43170 + }, + { + "epoch": 0.31918039088140504, + "grad_norm": 0.1002868041396141, + "learning_rate": 3.4167260208926875e-05, + "loss": 0.0216, + "step": 43180 + }, + { + "epoch": 0.3192543094527069, + "grad_norm": 0.07414255291223526, + "learning_rate": 3.416355056980057e-05, + "loss": 0.0209, + "step": 43190 + }, + { + "epoch": 0.31932822802400873, + "grad_norm": 0.08768635988235474, + "learning_rate": 3.4159840930674267e-05, + "loss": 0.0202, + "step": 43200 + }, + { + "epoch": 0.31940214659531063, + "grad_norm": 
0.08148328959941864, + "learning_rate": 3.415613129154796e-05, + "loss": 0.0211, + "step": 43210 + }, + { + "epoch": 0.3194760651666125, + "grad_norm": 0.07470358908176422, + "learning_rate": 3.415242165242166e-05, + "loss": 0.0181, + "step": 43220 + }, + { + "epoch": 0.3195499837379143, + "grad_norm": 0.07791826874017715, + "learning_rate": 3.414871201329535e-05, + "loss": 0.019, + "step": 43230 + }, + { + "epoch": 0.31962390230921617, + "grad_norm": 0.07294123619794846, + "learning_rate": 3.4145002374169044e-05, + "loss": 0.0179, + "step": 43240 + }, + { + "epoch": 0.319697820880518, + "grad_norm": 0.08214742690324783, + "learning_rate": 3.414129273504273e-05, + "loss": 0.0196, + "step": 43250 + }, + { + "epoch": 0.31977173945181986, + "grad_norm": 0.08051939308643341, + "learning_rate": 3.413758309591643e-05, + "loss": 0.0198, + "step": 43260 + }, + { + "epoch": 0.3198456580231217, + "grad_norm": 0.06528766453266144, + "learning_rate": 3.4133873456790125e-05, + "loss": 0.0174, + "step": 43270 + }, + { + "epoch": 0.3199195765944236, + "grad_norm": 0.06888923048973083, + "learning_rate": 3.413016381766382e-05, + "loss": 0.0186, + "step": 43280 + }, + { + "epoch": 0.31999349516572545, + "grad_norm": 0.06578348577022552, + "learning_rate": 3.412645417853752e-05, + "loss": 0.0164, + "step": 43290 + }, + { + "epoch": 0.3200674137370273, + "grad_norm": 0.1435423642396927, + "learning_rate": 3.4122744539411206e-05, + "loss": 0.0201, + "step": 43300 + }, + { + "epoch": 0.32014133230832914, + "grad_norm": 0.09891865402460098, + "learning_rate": 3.41190349002849e-05, + "loss": 0.0185, + "step": 43310 + }, + { + "epoch": 0.320215250879631, + "grad_norm": 0.09307089447975159, + "learning_rate": 3.41153252611586e-05, + "loss": 0.0183, + "step": 43320 + }, + { + "epoch": 0.32028916945093283, + "grad_norm": 0.14205126464366913, + "learning_rate": 3.411161562203229e-05, + "loss": 0.0214, + "step": 43330 + }, + { + "epoch": 0.32036308802223473, + "grad_norm": 0.08174362778663635, + "learning_rate": 3.410790598290598e-05, + "loss": 0.0191, + "step": 43340 + }, + { + "epoch": 0.3204370065935366, + "grad_norm": 0.06301737576723099, + "learning_rate": 3.410419634377968e-05, + "loss": 0.0156, + "step": 43350 + }, + { + "epoch": 0.3205109251648384, + "grad_norm": 0.07327570766210556, + "learning_rate": 3.4100486704653375e-05, + "loss": 0.0186, + "step": 43360 + }, + { + "epoch": 0.32058484373614027, + "grad_norm": 0.07892131060361862, + "learning_rate": 3.409677706552707e-05, + "loss": 0.0183, + "step": 43370 + }, + { + "epoch": 0.3206587623074421, + "grad_norm": 0.10537906736135483, + "learning_rate": 3.409306742640076e-05, + "loss": 0.0181, + "step": 43380 + }, + { + "epoch": 0.32073268087874396, + "grad_norm": 0.07635634392499924, + "learning_rate": 3.4089357787274456e-05, + "loss": 0.0193, + "step": 43390 + }, + { + "epoch": 0.3208065994500458, + "grad_norm": 0.09027914702892303, + "learning_rate": 3.4085648148148145e-05, + "loss": 0.0216, + "step": 43400 + }, + { + "epoch": 0.3208805180213477, + "grad_norm": 0.09778694808483124, + "learning_rate": 3.408193850902184e-05, + "loss": 0.0193, + "step": 43410 + }, + { + "epoch": 0.32095443659264955, + "grad_norm": 0.071172334253788, + "learning_rate": 3.407822886989554e-05, + "loss": 0.0182, + "step": 43420 + }, + { + "epoch": 0.3210283551639514, + "grad_norm": 0.08770966529846191, + "learning_rate": 3.407451923076923e-05, + "loss": 0.0195, + "step": 43430 + }, + { + "epoch": 0.32110227373525324, + "grad_norm": 0.08739007264375687, + "learning_rate": 
3.407080959164293e-05, + "loss": 0.0172, + "step": 43440 + }, + { + "epoch": 0.3211761923065551, + "grad_norm": 0.0868479385972023, + "learning_rate": 3.4067099952516625e-05, + "loss": 0.017, + "step": 43450 + }, + { + "epoch": 0.32125011087785693, + "grad_norm": 0.09316948801279068, + "learning_rate": 3.4063390313390314e-05, + "loss": 0.0195, + "step": 43460 + }, + { + "epoch": 0.32132402944915883, + "grad_norm": 0.07854902744293213, + "learning_rate": 3.405968067426401e-05, + "loss": 0.0193, + "step": 43470 + }, + { + "epoch": 0.3213979480204607, + "grad_norm": 0.08497950434684753, + "learning_rate": 3.40559710351377e-05, + "loss": 0.0192, + "step": 43480 + }, + { + "epoch": 0.3214718665917625, + "grad_norm": 0.10147658735513687, + "learning_rate": 3.4052261396011395e-05, + "loss": 0.0187, + "step": 43490 + }, + { + "epoch": 0.32154578516306437, + "grad_norm": 0.06862959265708923, + "learning_rate": 3.404855175688509e-05, + "loss": 0.0167, + "step": 43500 + }, + { + "epoch": 0.3216197037343662, + "grad_norm": 0.1020599827170372, + "learning_rate": 3.404484211775879e-05, + "loss": 0.018, + "step": 43510 + }, + { + "epoch": 0.32169362230566806, + "grad_norm": 0.09441424161195755, + "learning_rate": 3.404113247863248e-05, + "loss": 0.0197, + "step": 43520 + }, + { + "epoch": 0.3217675408769699, + "grad_norm": 0.0880255475640297, + "learning_rate": 3.403742283950617e-05, + "loss": 0.0199, + "step": 43530 + }, + { + "epoch": 0.3218414594482718, + "grad_norm": 0.0700983926653862, + "learning_rate": 3.403371320037987e-05, + "loss": 0.0181, + "step": 43540 + }, + { + "epoch": 0.32191537801957365, + "grad_norm": 0.09148949384689331, + "learning_rate": 3.4030003561253564e-05, + "loss": 0.0192, + "step": 43550 + }, + { + "epoch": 0.3219892965908755, + "grad_norm": 0.1262521594762802, + "learning_rate": 3.4026293922127254e-05, + "loss": 0.0187, + "step": 43560 + }, + { + "epoch": 0.32206321516217734, + "grad_norm": 0.11587070673704147, + "learning_rate": 3.402258428300095e-05, + "loss": 0.0188, + "step": 43570 + }, + { + "epoch": 0.3221371337334792, + "grad_norm": 0.09726160764694214, + "learning_rate": 3.4018874643874645e-05, + "loss": 0.0173, + "step": 43580 + }, + { + "epoch": 0.32221105230478103, + "grad_norm": 0.09375026077032089, + "learning_rate": 3.401516500474834e-05, + "loss": 0.0172, + "step": 43590 + }, + { + "epoch": 0.32228497087608293, + "grad_norm": 0.08319017291069031, + "learning_rate": 3.401145536562204e-05, + "loss": 0.0194, + "step": 43600 + }, + { + "epoch": 0.3223588894473848, + "grad_norm": 0.07451638579368591, + "learning_rate": 3.400774572649573e-05, + "loss": 0.018, + "step": 43610 + }, + { + "epoch": 0.3224328080186866, + "grad_norm": 0.07658302038908005, + "learning_rate": 3.400403608736942e-05, + "loss": 0.0192, + "step": 43620 + }, + { + "epoch": 0.32250672658998847, + "grad_norm": 0.11052345484495163, + "learning_rate": 3.400032644824311e-05, + "loss": 0.0204, + "step": 43630 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.09942183643579483, + "learning_rate": 3.399661680911681e-05, + "loss": 0.0198, + "step": 43640 + }, + { + "epoch": 0.32265456373259216, + "grad_norm": 0.08108251541852951, + "learning_rate": 3.3992907169990504e-05, + "loss": 0.0183, + "step": 43650 + }, + { + "epoch": 0.322728482303894, + "grad_norm": 0.10380920022726059, + "learning_rate": 3.39891975308642e-05, + "loss": 0.0213, + "step": 43660 + }, + { + "epoch": 0.3228024008751959, + "grad_norm": 0.08956963568925858, + "learning_rate": 3.3985487891737896e-05, + "loss": 0.0186, + "step": 
43670 + }, + { + "epoch": 0.32287631944649775, + "grad_norm": 0.11362124979496002, + "learning_rate": 3.398177825261159e-05, + "loss": 0.0178, + "step": 43680 + }, + { + "epoch": 0.3229502380177996, + "grad_norm": 0.05954618379473686, + "learning_rate": 3.397806861348528e-05, + "loss": 0.0185, + "step": 43690 + }, + { + "epoch": 0.32302415658910144, + "grad_norm": 0.06360287964344025, + "learning_rate": 3.397435897435898e-05, + "loss": 0.018, + "step": 43700 + }, + { + "epoch": 0.3230980751604033, + "grad_norm": 0.08753181248903275, + "learning_rate": 3.3970649335232666e-05, + "loss": 0.0178, + "step": 43710 + }, + { + "epoch": 0.32317199373170513, + "grad_norm": 0.09147054702043533, + "learning_rate": 3.396693969610636e-05, + "loss": 0.0201, + "step": 43720 + }, + { + "epoch": 0.32324591230300703, + "grad_norm": 0.0729442685842514, + "learning_rate": 3.396323005698006e-05, + "loss": 0.0169, + "step": 43730 + }, + { + "epoch": 0.3233198308743089, + "grad_norm": 0.08728668093681335, + "learning_rate": 3.3959520417853754e-05, + "loss": 0.0201, + "step": 43740 + }, + { + "epoch": 0.3233937494456107, + "grad_norm": 0.07614386081695557, + "learning_rate": 3.395581077872745e-05, + "loss": 0.0204, + "step": 43750 + }, + { + "epoch": 0.32346766801691257, + "grad_norm": 0.09705346822738647, + "learning_rate": 3.395210113960114e-05, + "loss": 0.0211, + "step": 43760 + }, + { + "epoch": 0.3235415865882144, + "grad_norm": 0.07552920281887054, + "learning_rate": 3.3948391500474835e-05, + "loss": 0.0189, + "step": 43770 + }, + { + "epoch": 0.32361550515951626, + "grad_norm": 0.08856528252363205, + "learning_rate": 3.394468186134853e-05, + "loss": 0.0183, + "step": 43780 + }, + { + "epoch": 0.3236894237308181, + "grad_norm": 0.07442531734704971, + "learning_rate": 3.394097222222222e-05, + "loss": 0.019, + "step": 43790 + }, + { + "epoch": 0.32376334230212, + "grad_norm": 0.06279998272657394, + "learning_rate": 3.3937262583095916e-05, + "loss": 0.0189, + "step": 43800 + }, + { + "epoch": 0.32383726087342185, + "grad_norm": 0.060555700212717056, + "learning_rate": 3.393355294396961e-05, + "loss": 0.0185, + "step": 43810 + }, + { + "epoch": 0.3239111794447237, + "grad_norm": 0.06599309295415878, + "learning_rate": 3.392984330484331e-05, + "loss": 0.02, + "step": 43820 + }, + { + "epoch": 0.32398509801602554, + "grad_norm": 0.05925067141652107, + "learning_rate": 3.3926133665717004e-05, + "loss": 0.0166, + "step": 43830 + }, + { + "epoch": 0.3240590165873274, + "grad_norm": 0.0998731404542923, + "learning_rate": 3.392242402659069e-05, + "loss": 0.0179, + "step": 43840 + }, + { + "epoch": 0.32413293515862923, + "grad_norm": 0.12187574058771133, + "learning_rate": 3.391871438746439e-05, + "loss": 0.0197, + "step": 43850 + }, + { + "epoch": 0.32420685372993113, + "grad_norm": 0.07652764022350311, + "learning_rate": 3.391500474833808e-05, + "loss": 0.0178, + "step": 43860 + }, + { + "epoch": 0.324280772301233, + "grad_norm": 0.08620169758796692, + "learning_rate": 3.3911295109211774e-05, + "loss": 0.0178, + "step": 43870 + }, + { + "epoch": 0.3243546908725348, + "grad_norm": 0.09348164498806, + "learning_rate": 3.390758547008547e-05, + "loss": 0.0191, + "step": 43880 + }, + { + "epoch": 0.32442860944383667, + "grad_norm": 0.07452642172574997, + "learning_rate": 3.3903875830959166e-05, + "loss": 0.0185, + "step": 43890 + }, + { + "epoch": 0.3245025280151385, + "grad_norm": 0.07998265326023102, + "learning_rate": 3.390016619183286e-05, + "loss": 0.0176, + "step": 43900 + }, + { + "epoch": 0.32457644658644036, + 
"grad_norm": 0.10004127770662308, + "learning_rate": 3.389645655270656e-05, + "loss": 0.0188, + "step": 43910 + }, + { + "epoch": 0.3246503651577422, + "grad_norm": 0.0779428631067276, + "learning_rate": 3.389274691358025e-05, + "loss": 0.0186, + "step": 43920 + }, + { + "epoch": 0.3247242837290441, + "grad_norm": 0.08592027425765991, + "learning_rate": 3.388903727445394e-05, + "loss": 0.0187, + "step": 43930 + }, + { + "epoch": 0.32479820230034595, + "grad_norm": 0.09659634530544281, + "learning_rate": 3.388532763532763e-05, + "loss": 0.0218, + "step": 43940 + }, + { + "epoch": 0.3248721208716478, + "grad_norm": 0.08549503237009048, + "learning_rate": 3.388161799620133e-05, + "loss": 0.0172, + "step": 43950 + }, + { + "epoch": 0.32494603944294964, + "grad_norm": 0.07176053524017334, + "learning_rate": 3.3877908357075024e-05, + "loss": 0.0169, + "step": 43960 + }, + { + "epoch": 0.3250199580142515, + "grad_norm": 0.09597663581371307, + "learning_rate": 3.387419871794872e-05, + "loss": 0.0182, + "step": 43970 + }, + { + "epoch": 0.32509387658555333, + "grad_norm": 0.09322880953550339, + "learning_rate": 3.3870489078822416e-05, + "loss": 0.0192, + "step": 43980 + }, + { + "epoch": 0.32516779515685523, + "grad_norm": 0.08309384435415268, + "learning_rate": 3.3866779439696106e-05, + "loss": 0.019, + "step": 43990 + }, + { + "epoch": 0.3252417137281571, + "grad_norm": 0.08086520433425903, + "learning_rate": 3.38630698005698e-05, + "loss": 0.0196, + "step": 44000 + }, + { + "epoch": 0.3253156322994589, + "grad_norm": 0.07314729690551758, + "learning_rate": 3.38593601614435e-05, + "loss": 0.0203, + "step": 44010 + }, + { + "epoch": 0.32538955087076077, + "grad_norm": 0.08843156695365906, + "learning_rate": 3.385565052231719e-05, + "loss": 0.0184, + "step": 44020 + }, + { + "epoch": 0.3254634694420626, + "grad_norm": 0.13710302114486694, + "learning_rate": 3.385194088319088e-05, + "loss": 0.0187, + "step": 44030 + }, + { + "epoch": 0.32553738801336446, + "grad_norm": 0.06512115895748138, + "learning_rate": 3.384823124406458e-05, + "loss": 0.0186, + "step": 44040 + }, + { + "epoch": 0.3256113065846663, + "grad_norm": 0.09785745292901993, + "learning_rate": 3.3844521604938275e-05, + "loss": 0.0159, + "step": 44050 + }, + { + "epoch": 0.3256852251559682, + "grad_norm": 0.09728438407182693, + "learning_rate": 3.384081196581197e-05, + "loss": 0.0203, + "step": 44060 + }, + { + "epoch": 0.32575914372727005, + "grad_norm": 0.0847766101360321, + "learning_rate": 3.383710232668566e-05, + "loss": 0.0181, + "step": 44070 + }, + { + "epoch": 0.3258330622985719, + "grad_norm": 0.06976700574159622, + "learning_rate": 3.3833392687559356e-05, + "loss": 0.0194, + "step": 44080 + }, + { + "epoch": 0.32590698086987374, + "grad_norm": 0.08276432007551193, + "learning_rate": 3.3829683048433045e-05, + "loss": 0.0192, + "step": 44090 + }, + { + "epoch": 0.3259808994411756, + "grad_norm": 0.0673220157623291, + "learning_rate": 3.382597340930674e-05, + "loss": 0.0191, + "step": 44100 + }, + { + "epoch": 0.32605481801247743, + "grad_norm": 0.07937823235988617, + "learning_rate": 3.382226377018044e-05, + "loss": 0.0177, + "step": 44110 + }, + { + "epoch": 0.32612873658377933, + "grad_norm": 0.07397346198558807, + "learning_rate": 3.381855413105413e-05, + "loss": 0.019, + "step": 44120 + }, + { + "epoch": 0.3262026551550812, + "grad_norm": 0.07269817590713501, + "learning_rate": 3.381484449192783e-05, + "loss": 0.0164, + "step": 44130 + }, + { + "epoch": 0.326276573726383, + "grad_norm": 0.07143591344356537, + 
"learning_rate": 3.3811134852801525e-05, + "loss": 0.0194, + "step": 44140 + }, + { + "epoch": 0.32635049229768487, + "grad_norm": 0.10848372429609299, + "learning_rate": 3.3807425213675214e-05, + "loss": 0.019, + "step": 44150 + }, + { + "epoch": 0.3264244108689867, + "grad_norm": 0.08963832259178162, + "learning_rate": 3.380371557454891e-05, + "loss": 0.0202, + "step": 44160 + }, + { + "epoch": 0.32649832944028856, + "grad_norm": 0.0764172300696373, + "learning_rate": 3.38000059354226e-05, + "loss": 0.0172, + "step": 44170 + }, + { + "epoch": 0.3265722480115904, + "grad_norm": 0.08552178740501404, + "learning_rate": 3.3796296296296295e-05, + "loss": 0.0185, + "step": 44180 + }, + { + "epoch": 0.3266461665828923, + "grad_norm": 0.1096053346991539, + "learning_rate": 3.379258665717e-05, + "loss": 0.0171, + "step": 44190 + }, + { + "epoch": 0.32672008515419415, + "grad_norm": 0.16093607246875763, + "learning_rate": 3.378887701804369e-05, + "loss": 0.0178, + "step": 44200 + }, + { + "epoch": 0.326794003725496, + "grad_norm": 0.07053223252296448, + "learning_rate": 3.378516737891738e-05, + "loss": 0.02, + "step": 44210 + }, + { + "epoch": 0.32686792229679784, + "grad_norm": 0.1485041230916977, + "learning_rate": 3.378145773979107e-05, + "loss": 0.019, + "step": 44220 + }, + { + "epoch": 0.3269418408680997, + "grad_norm": 0.07003764808177948, + "learning_rate": 3.377774810066477e-05, + "loss": 0.0166, + "step": 44230 + }, + { + "epoch": 0.32701575943940153, + "grad_norm": 0.07502686232328415, + "learning_rate": 3.3774038461538464e-05, + "loss": 0.02, + "step": 44240 + }, + { + "epoch": 0.32708967801070343, + "grad_norm": 0.08239313215017319, + "learning_rate": 3.377032882241215e-05, + "loss": 0.0203, + "step": 44250 + }, + { + "epoch": 0.3271635965820053, + "grad_norm": 0.09772242605686188, + "learning_rate": 3.376661918328585e-05, + "loss": 0.0189, + "step": 44260 + }, + { + "epoch": 0.3272375151533071, + "grad_norm": 0.11429288238286972, + "learning_rate": 3.3762909544159545e-05, + "loss": 0.0182, + "step": 44270 + }, + { + "epoch": 0.32731143372460897, + "grad_norm": 0.0794249027967453, + "learning_rate": 3.375919990503324e-05, + "loss": 0.0176, + "step": 44280 + }, + { + "epoch": 0.3273853522959108, + "grad_norm": 0.07520075887441635, + "learning_rate": 3.375549026590694e-05, + "loss": 0.0162, + "step": 44290 + }, + { + "epoch": 0.32745927086721266, + "grad_norm": 0.0852094441652298, + "learning_rate": 3.3751780626780626e-05, + "loss": 0.0177, + "step": 44300 + }, + { + "epoch": 0.3275331894385145, + "grad_norm": 0.09648241102695465, + "learning_rate": 3.374807098765432e-05, + "loss": 0.0215, + "step": 44310 + }, + { + "epoch": 0.3276071080098164, + "grad_norm": 0.06515844166278839, + "learning_rate": 3.374436134852801e-05, + "loss": 0.0176, + "step": 44320 + }, + { + "epoch": 0.32768102658111825, + "grad_norm": 0.1179385706782341, + "learning_rate": 3.374065170940171e-05, + "loss": 0.0173, + "step": 44330 + }, + { + "epoch": 0.3277549451524201, + "grad_norm": 0.0831533819437027, + "learning_rate": 3.373694207027541e-05, + "loss": 0.0195, + "step": 44340 + }, + { + "epoch": 0.32782886372372194, + "grad_norm": 0.11352211236953735, + "learning_rate": 3.37332324311491e-05, + "loss": 0.0205, + "step": 44350 + }, + { + "epoch": 0.3279027822950238, + "grad_norm": 0.09942604601383209, + "learning_rate": 3.3729522792022795e-05, + "loss": 0.0167, + "step": 44360 + }, + { + "epoch": 0.32797670086632563, + "grad_norm": 0.10662192106246948, + "learning_rate": 3.372581315289649e-05, + "loss": 0.0189, + 
"step": 44370 + }, + { + "epoch": 0.32805061943762753, + "grad_norm": 0.11508975178003311, + "learning_rate": 3.372210351377018e-05, + "loss": 0.018, + "step": 44380 + }, + { + "epoch": 0.3281245380089294, + "grad_norm": 0.1185436099767685, + "learning_rate": 3.3718393874643877e-05, + "loss": 0.0184, + "step": 44390 + }, + { + "epoch": 0.3281984565802312, + "grad_norm": 0.08700943738222122, + "learning_rate": 3.3714684235517566e-05, + "loss": 0.0203, + "step": 44400 + }, + { + "epoch": 0.32827237515153307, + "grad_norm": 0.0730283334851265, + "learning_rate": 3.371097459639126e-05, + "loss": 0.018, + "step": 44410 + }, + { + "epoch": 0.3283462937228349, + "grad_norm": 0.067600317299366, + "learning_rate": 3.3707264957264964e-05, + "loss": 0.0175, + "step": 44420 + }, + { + "epoch": 0.32842021229413676, + "grad_norm": 0.6665230989456177, + "learning_rate": 3.3703555318138654e-05, + "loss": 0.0235, + "step": 44430 + }, + { + "epoch": 0.32849413086543866, + "grad_norm": 0.08568233251571655, + "learning_rate": 3.369984567901235e-05, + "loss": 0.0174, + "step": 44440 + }, + { + "epoch": 0.3285680494367405, + "grad_norm": 0.060386039316654205, + "learning_rate": 3.369613603988604e-05, + "loss": 0.0179, + "step": 44450 + }, + { + "epoch": 0.32864196800804235, + "grad_norm": 0.10326115787029266, + "learning_rate": 3.3692426400759735e-05, + "loss": 0.02, + "step": 44460 + }, + { + "epoch": 0.3287158865793442, + "grad_norm": 0.06419151276350021, + "learning_rate": 3.368871676163343e-05, + "loss": 0.0203, + "step": 44470 + }, + { + "epoch": 0.32878980515064604, + "grad_norm": 0.11964228004217148, + "learning_rate": 3.368500712250712e-05, + "loss": 0.0186, + "step": 44480 + }, + { + "epoch": 0.3288637237219479, + "grad_norm": 0.08400668948888779, + "learning_rate": 3.368129748338082e-05, + "loss": 0.0185, + "step": 44490 + }, + { + "epoch": 0.32893764229324973, + "grad_norm": 0.06105771288275719, + "learning_rate": 3.367758784425451e-05, + "loss": 0.0189, + "step": 44500 + }, + { + "epoch": 0.32901156086455163, + "grad_norm": 0.09602080285549164, + "learning_rate": 3.367387820512821e-05, + "loss": 0.0164, + "step": 44510 + }, + { + "epoch": 0.3290854794358535, + "grad_norm": 0.09508220851421356, + "learning_rate": 3.3670168566001904e-05, + "loss": 0.0196, + "step": 44520 + }, + { + "epoch": 0.3291593980071553, + "grad_norm": 0.06339730322360992, + "learning_rate": 3.366645892687559e-05, + "loss": 0.0169, + "step": 44530 + }, + { + "epoch": 0.32923331657845717, + "grad_norm": 0.10049261897802353, + "learning_rate": 3.366274928774929e-05, + "loss": 0.0179, + "step": 44540 + }, + { + "epoch": 0.329307235149759, + "grad_norm": 0.07951238006353378, + "learning_rate": 3.365903964862298e-05, + "loss": 0.0209, + "step": 44550 + }, + { + "epoch": 0.32938115372106086, + "grad_norm": 0.0696750357747078, + "learning_rate": 3.3655330009496674e-05, + "loss": 0.0184, + "step": 44560 + }, + { + "epoch": 0.32945507229236276, + "grad_norm": 0.07983843982219696, + "learning_rate": 3.365162037037038e-05, + "loss": 0.0183, + "step": 44570 + }, + { + "epoch": 0.3295289908636646, + "grad_norm": 0.08632298558950424, + "learning_rate": 3.3647910731244066e-05, + "loss": 0.0198, + "step": 44580 + }, + { + "epoch": 0.32960290943496645, + "grad_norm": 0.08244680613279343, + "learning_rate": 3.364420109211776e-05, + "loss": 0.0165, + "step": 44590 + }, + { + "epoch": 0.3296768280062683, + "grad_norm": 0.07452484965324402, + "learning_rate": 3.364049145299146e-05, + "loss": 0.0175, + "step": 44600 + }, + { + "epoch": 
0.32975074657757014, + "grad_norm": 0.06344713270664215, + "learning_rate": 3.363678181386515e-05, + "loss": 0.0182, + "step": 44610 + }, + { + "epoch": 0.329824665148872, + "grad_norm": 0.13326257467269897, + "learning_rate": 3.363307217473884e-05, + "loss": 0.0213, + "step": 44620 + }, + { + "epoch": 0.32989858372017383, + "grad_norm": 0.08266934752464294, + "learning_rate": 3.362936253561253e-05, + "loss": 0.0192, + "step": 44630 + }, + { + "epoch": 0.32997250229147573, + "grad_norm": 0.07077519595623016, + "learning_rate": 3.362565289648623e-05, + "loss": 0.0199, + "step": 44640 + }, + { + "epoch": 0.3300464208627776, + "grad_norm": 0.06713325530290604, + "learning_rate": 3.362194325735993e-05, + "loss": 0.0177, + "step": 44650 + }, + { + "epoch": 0.3301203394340794, + "grad_norm": 0.10795174539089203, + "learning_rate": 3.361823361823362e-05, + "loss": 0.021, + "step": 44660 + }, + { + "epoch": 0.33019425800538127, + "grad_norm": 0.11786088347434998, + "learning_rate": 3.3614523979107316e-05, + "loss": 0.0188, + "step": 44670 + }, + { + "epoch": 0.3302681765766831, + "grad_norm": 0.09778054803609848, + "learning_rate": 3.3610814339981005e-05, + "loss": 0.0181, + "step": 44680 + }, + { + "epoch": 0.33034209514798496, + "grad_norm": 0.10623161494731903, + "learning_rate": 3.36071047008547e-05, + "loss": 0.0195, + "step": 44690 + }, + { + "epoch": 0.33041601371928686, + "grad_norm": 0.06341532617807388, + "learning_rate": 3.36033950617284e-05, + "loss": 0.0162, + "step": 44700 + }, + { + "epoch": 0.3304899322905887, + "grad_norm": 0.08555193245410919, + "learning_rate": 3.3599685422602086e-05, + "loss": 0.0205, + "step": 44710 + }, + { + "epoch": 0.33056385086189055, + "grad_norm": 0.10713338106870651, + "learning_rate": 3.359597578347579e-05, + "loss": 0.019, + "step": 44720 + }, + { + "epoch": 0.3306377694331924, + "grad_norm": 0.0889672115445137, + "learning_rate": 3.359226614434948e-05, + "loss": 0.017, + "step": 44730 + }, + { + "epoch": 0.33071168800449424, + "grad_norm": 0.1061701849102974, + "learning_rate": 3.3588556505223174e-05, + "loss": 0.0199, + "step": 44740 + }, + { + "epoch": 0.3307856065757961, + "grad_norm": 0.0591684989631176, + "learning_rate": 3.358484686609687e-05, + "loss": 0.0203, + "step": 44750 + }, + { + "epoch": 0.33085952514709793, + "grad_norm": 0.12138590216636658, + "learning_rate": 3.358113722697056e-05, + "loss": 0.0188, + "step": 44760 + }, + { + "epoch": 0.33093344371839983, + "grad_norm": 0.06759916245937347, + "learning_rate": 3.3577427587844256e-05, + "loss": 0.0168, + "step": 44770 + }, + { + "epoch": 0.3310073622897017, + "grad_norm": 0.07580891996622086, + "learning_rate": 3.3573717948717945e-05, + "loss": 0.0194, + "step": 44780 + }, + { + "epoch": 0.3310812808610035, + "grad_norm": 0.10968596488237381, + "learning_rate": 3.357000830959164e-05, + "loss": 0.0163, + "step": 44790 + }, + { + "epoch": 0.33115519943230537, + "grad_norm": 0.0971807911992073, + "learning_rate": 3.3566298670465343e-05, + "loss": 0.0205, + "step": 44800 + }, + { + "epoch": 0.3312291180036072, + "grad_norm": 0.1008811816573143, + "learning_rate": 3.356258903133903e-05, + "loss": 0.0167, + "step": 44810 + }, + { + "epoch": 0.33130303657490906, + "grad_norm": 0.09538344293832779, + "learning_rate": 3.355887939221273e-05, + "loss": 0.0166, + "step": 44820 + }, + { + "epoch": 0.33137695514621096, + "grad_norm": 0.07427355647087097, + "learning_rate": 3.3555169753086425e-05, + "loss": 0.0202, + "step": 44830 + }, + { + "epoch": 0.3314508737175128, + "grad_norm": 
0.09792878478765488, + "learning_rate": 3.3551460113960114e-05, + "loss": 0.0205, + "step": 44840 + }, + { + "epoch": 0.33152479228881465, + "grad_norm": 0.07082540541887283, + "learning_rate": 3.354775047483381e-05, + "loss": 0.0192, + "step": 44850 + }, + { + "epoch": 0.3315987108601165, + "grad_norm": 0.08246816694736481, + "learning_rate": 3.35440408357075e-05, + "loss": 0.0182, + "step": 44860 + }, + { + "epoch": 0.33167262943141834, + "grad_norm": 0.09148363769054413, + "learning_rate": 3.35403311965812e-05, + "loss": 0.0187, + "step": 44870 + }, + { + "epoch": 0.3317465480027202, + "grad_norm": 0.07163092494010925, + "learning_rate": 3.35366215574549e-05, + "loss": 0.0208, + "step": 44880 + }, + { + "epoch": 0.33182046657402203, + "grad_norm": 0.05248361453413963, + "learning_rate": 3.353291191832859e-05, + "loss": 0.0186, + "step": 44890 + }, + { + "epoch": 0.33189438514532393, + "grad_norm": 0.08499059826135635, + "learning_rate": 3.352920227920228e-05, + "loss": 0.021, + "step": 44900 + }, + { + "epoch": 0.3319683037166258, + "grad_norm": 0.09064117074012756, + "learning_rate": 3.352549264007597e-05, + "loss": 0.0226, + "step": 44910 + }, + { + "epoch": 0.3320422222879276, + "grad_norm": 0.09755630791187286, + "learning_rate": 3.352178300094967e-05, + "loss": 0.0193, + "step": 44920 + }, + { + "epoch": 0.33211614085922947, + "grad_norm": 0.1002260372042656, + "learning_rate": 3.3518073361823364e-05, + "loss": 0.0193, + "step": 44930 + }, + { + "epoch": 0.3321900594305313, + "grad_norm": 0.0829191580414772, + "learning_rate": 3.351436372269705e-05, + "loss": 0.0199, + "step": 44940 + }, + { + "epoch": 0.33226397800183316, + "grad_norm": 0.1327805370092392, + "learning_rate": 3.3510654083570756e-05, + "loss": 0.0187, + "step": 44950 + }, + { + "epoch": 0.33233789657313506, + "grad_norm": 0.09947414696216583, + "learning_rate": 3.3506944444444445e-05, + "loss": 0.0203, + "step": 44960 + }, + { + "epoch": 0.3324118151444369, + "grad_norm": 0.0643693134188652, + "learning_rate": 3.350323480531814e-05, + "loss": 0.0225, + "step": 44970 + }, + { + "epoch": 0.33248573371573875, + "grad_norm": 0.10599484294652939, + "learning_rate": 3.349952516619184e-05, + "loss": 0.0188, + "step": 44980 + }, + { + "epoch": 0.3325596522870406, + "grad_norm": 0.0734916552901268, + "learning_rate": 3.3495815527065526e-05, + "loss": 0.0186, + "step": 44990 + }, + { + "epoch": 0.33263357085834244, + "grad_norm": 0.08003656566143036, + "learning_rate": 3.349210588793922e-05, + "loss": 0.0176, + "step": 45000 + }, + { + "epoch": 0.3327074894296443, + "grad_norm": 0.08142461627721786, + "learning_rate": 3.348839624881291e-05, + "loss": 0.0183, + "step": 45010 + }, + { + "epoch": 0.33278140800094613, + "grad_norm": 0.08227168768644333, + "learning_rate": 3.3484686609686614e-05, + "loss": 0.0182, + "step": 45020 + }, + { + "epoch": 0.33285532657224803, + "grad_norm": 0.10435879230499268, + "learning_rate": 3.348097697056031e-05, + "loss": 0.0207, + "step": 45030 + }, + { + "epoch": 0.3329292451435499, + "grad_norm": 0.09841688722372055, + "learning_rate": 3.3477267331434e-05, + "loss": 0.0174, + "step": 45040 + }, + { + "epoch": 0.3330031637148517, + "grad_norm": 0.12566670775413513, + "learning_rate": 3.3473557692307695e-05, + "loss": 0.0212, + "step": 45050 + }, + { + "epoch": 0.33307708228615357, + "grad_norm": 0.06397977471351624, + "learning_rate": 3.346984805318139e-05, + "loss": 0.0152, + "step": 45060 + }, + { + "epoch": 0.3331510008574554, + "grad_norm": 0.10317044705152512, + "learning_rate": 
3.346613841405508e-05, + "loss": 0.0219, + "step": 45070 + }, + { + "epoch": 0.33322491942875726, + "grad_norm": 0.11175418645143509, + "learning_rate": 3.3462428774928776e-05, + "loss": 0.018, + "step": 45080 + }, + { + "epoch": 0.33329883800005916, + "grad_norm": 0.09719778597354889, + "learning_rate": 3.3458719135802465e-05, + "loss": 0.0186, + "step": 45090 + }, + { + "epoch": 0.333372756571361, + "grad_norm": 0.09944400191307068, + "learning_rate": 3.345500949667617e-05, + "loss": 0.0196, + "step": 45100 + }, + { + "epoch": 0.33344667514266285, + "grad_norm": 0.1216149777173996, + "learning_rate": 3.3451299857549864e-05, + "loss": 0.019, + "step": 45110 + }, + { + "epoch": 0.3335205937139647, + "grad_norm": 0.07389708608388901, + "learning_rate": 3.344759021842355e-05, + "loss": 0.0178, + "step": 45120 + }, + { + "epoch": 0.33359451228526654, + "grad_norm": 0.13755720853805542, + "learning_rate": 3.344388057929725e-05, + "loss": 0.0182, + "step": 45130 + }, + { + "epoch": 0.3336684308565684, + "grad_norm": 0.0727863758802414, + "learning_rate": 3.344017094017094e-05, + "loss": 0.0176, + "step": 45140 + }, + { + "epoch": 0.33374234942787023, + "grad_norm": 0.1026555746793747, + "learning_rate": 3.3436461301044634e-05, + "loss": 0.0185, + "step": 45150 + }, + { + "epoch": 0.33381626799917213, + "grad_norm": 0.06983400881290436, + "learning_rate": 3.343275166191833e-05, + "loss": 0.0173, + "step": 45160 + }, + { + "epoch": 0.333890186570474, + "grad_norm": 0.11022666841745377, + "learning_rate": 3.3429042022792026e-05, + "loss": 0.0186, + "step": 45170 + }, + { + "epoch": 0.3339641051417758, + "grad_norm": 0.06704510748386383, + "learning_rate": 3.342533238366572e-05, + "loss": 0.0195, + "step": 45180 + }, + { + "epoch": 0.33403802371307767, + "grad_norm": 0.09733101725578308, + "learning_rate": 3.342162274453941e-05, + "loss": 0.0188, + "step": 45190 + }, + { + "epoch": 0.3341119422843795, + "grad_norm": 0.11707833409309387, + "learning_rate": 3.341791310541311e-05, + "loss": 0.019, + "step": 45200 + }, + { + "epoch": 0.33418586085568136, + "grad_norm": 0.08068402111530304, + "learning_rate": 3.3414203466286804e-05, + "loss": 0.02, + "step": 45210 + }, + { + "epoch": 0.33425977942698326, + "grad_norm": 0.08782497048377991, + "learning_rate": 3.341049382716049e-05, + "loss": 0.0181, + "step": 45220 + }, + { + "epoch": 0.3343336979982851, + "grad_norm": 0.0925891250371933, + "learning_rate": 3.340678418803419e-05, + "loss": 0.0188, + "step": 45230 + }, + { + "epoch": 0.33440761656958695, + "grad_norm": 0.10016071051359177, + "learning_rate": 3.340307454890788e-05, + "loss": 0.0207, + "step": 45240 + }, + { + "epoch": 0.3344815351408888, + "grad_norm": 0.09942296147346497, + "learning_rate": 3.339936490978158e-05, + "loss": 0.018, + "step": 45250 + }, + { + "epoch": 0.33455545371219064, + "grad_norm": 0.07284510135650635, + "learning_rate": 3.3395655270655277e-05, + "loss": 0.0201, + "step": 45260 + }, + { + "epoch": 0.3346293722834925, + "grad_norm": 0.08632287383079529, + "learning_rate": 3.3391945631528966e-05, + "loss": 0.0187, + "step": 45270 + }, + { + "epoch": 0.33470329085479433, + "grad_norm": 0.07172413915395737, + "learning_rate": 3.338823599240266e-05, + "loss": 0.0208, + "step": 45280 + }, + { + "epoch": 0.33477720942609623, + "grad_norm": 0.06224941462278366, + "learning_rate": 3.338452635327636e-05, + "loss": 0.0162, + "step": 45290 + }, + { + "epoch": 0.3348511279973981, + "grad_norm": 0.09879124909639359, + "learning_rate": 3.338081671415005e-05, + "loss": 0.02, + "step": 
45300 + }, + { + "epoch": 0.3349250465686999, + "grad_norm": 0.1068207249045372, + "learning_rate": 3.337710707502374e-05, + "loss": 0.0176, + "step": 45310 + }, + { + "epoch": 0.33499896514000177, + "grad_norm": 0.08688890188932419, + "learning_rate": 3.337339743589744e-05, + "loss": 0.021, + "step": 45320 + }, + { + "epoch": 0.3350728837113036, + "grad_norm": 0.07991108298301697, + "learning_rate": 3.3369687796771135e-05, + "loss": 0.0176, + "step": 45330 + }, + { + "epoch": 0.33514680228260546, + "grad_norm": 0.07760392129421234, + "learning_rate": 3.336597815764483e-05, + "loss": 0.0162, + "step": 45340 + }, + { + "epoch": 0.33522072085390736, + "grad_norm": 0.10568118095397949, + "learning_rate": 3.336226851851852e-05, + "loss": 0.0175, + "step": 45350 + }, + { + "epoch": 0.3352946394252092, + "grad_norm": 0.14464734494686127, + "learning_rate": 3.3358558879392216e-05, + "loss": 0.0178, + "step": 45360 + }, + { + "epoch": 0.33536855799651105, + "grad_norm": 0.10452807694673538, + "learning_rate": 3.3354849240265905e-05, + "loss": 0.021, + "step": 45370 + }, + { + "epoch": 0.3354424765678129, + "grad_norm": 0.06442765146493912, + "learning_rate": 3.33511396011396e-05, + "loss": 0.0183, + "step": 45380 + }, + { + "epoch": 0.33551639513911474, + "grad_norm": 0.08903558552265167, + "learning_rate": 3.33474299620133e-05, + "loss": 0.0192, + "step": 45390 + }, + { + "epoch": 0.3355903137104166, + "grad_norm": 0.09148462116718292, + "learning_rate": 3.334372032288699e-05, + "loss": 0.0186, + "step": 45400 + }, + { + "epoch": 0.33566423228171843, + "grad_norm": 0.10632915049791336, + "learning_rate": 3.334001068376069e-05, + "loss": 0.0195, + "step": 45410 + }, + { + "epoch": 0.33573815085302033, + "grad_norm": 0.0853031724691391, + "learning_rate": 3.333630104463438e-05, + "loss": 0.019, + "step": 45420 + }, + { + "epoch": 0.3358120694243222, + "grad_norm": 0.08535119891166687, + "learning_rate": 3.3332591405508074e-05, + "loss": 0.0196, + "step": 45430 + }, + { + "epoch": 0.335885987995624, + "grad_norm": 0.07558152079582214, + "learning_rate": 3.332888176638177e-05, + "loss": 0.0177, + "step": 45440 + }, + { + "epoch": 0.33595990656692587, + "grad_norm": 0.0958716869354248, + "learning_rate": 3.332517212725546e-05, + "loss": 0.0176, + "step": 45450 + }, + { + "epoch": 0.3360338251382277, + "grad_norm": 0.06695028394460678, + "learning_rate": 3.3321462488129155e-05, + "loss": 0.0174, + "step": 45460 + }, + { + "epoch": 0.33610774370952956, + "grad_norm": 0.08938926458358765, + "learning_rate": 3.331775284900285e-05, + "loss": 0.0199, + "step": 45470 + }, + { + "epoch": 0.33618166228083146, + "grad_norm": 0.07885266840457916, + "learning_rate": 3.331404320987655e-05, + "loss": 0.0167, + "step": 45480 + }, + { + "epoch": 0.3362555808521333, + "grad_norm": 0.0742824599146843, + "learning_rate": 3.331033357075024e-05, + "loss": 0.0195, + "step": 45490 + }, + { + "epoch": 0.33632949942343515, + "grad_norm": 0.07376478612422943, + "learning_rate": 3.330662393162393e-05, + "loss": 0.0191, + "step": 45500 + }, + { + "epoch": 0.336403417994737, + "grad_norm": 0.06707164645195007, + "learning_rate": 3.330291429249763e-05, + "loss": 0.0183, + "step": 45510 + }, + { + "epoch": 0.33647733656603884, + "grad_norm": 0.08750803023576736, + "learning_rate": 3.3299204653371324e-05, + "loss": 0.0181, + "step": 45520 + }, + { + "epoch": 0.3365512551373407, + "grad_norm": 0.06902256608009338, + "learning_rate": 3.3295495014245013e-05, + "loss": 0.0197, + "step": 45530 + }, + { + "epoch": 0.33662517370864253, + 
"grad_norm": 0.09635186940431595, + "learning_rate": 3.329178537511871e-05, + "loss": 0.0196, + "step": 45540 + }, + { + "epoch": 0.33669909227994443, + "grad_norm": 0.12827034294605255, + "learning_rate": 3.3288075735992405e-05, + "loss": 0.0166, + "step": 45550 + }, + { + "epoch": 0.3367730108512463, + "grad_norm": 0.07784906029701233, + "learning_rate": 3.32843660968661e-05, + "loss": 0.018, + "step": 45560 + }, + { + "epoch": 0.3368469294225481, + "grad_norm": 0.06968273967504501, + "learning_rate": 3.32806564577398e-05, + "loss": 0.0177, + "step": 45570 + }, + { + "epoch": 0.33692084799384997, + "grad_norm": 0.10143133252859116, + "learning_rate": 3.3276946818613487e-05, + "loss": 0.02, + "step": 45580 + }, + { + "epoch": 0.3369947665651518, + "grad_norm": 0.06562886387109756, + "learning_rate": 3.327323717948718e-05, + "loss": 0.0186, + "step": 45590 + }, + { + "epoch": 0.33706868513645366, + "grad_norm": 0.08129129558801651, + "learning_rate": 3.326952754036087e-05, + "loss": 0.0186, + "step": 45600 + }, + { + "epoch": 0.33714260370775556, + "grad_norm": 0.09293606877326965, + "learning_rate": 3.326581790123457e-05, + "loss": 0.0169, + "step": 45610 + }, + { + "epoch": 0.3372165222790574, + "grad_norm": 0.1033020094037056, + "learning_rate": 3.3262108262108264e-05, + "loss": 0.0198, + "step": 45620 + }, + { + "epoch": 0.33729044085035925, + "grad_norm": 0.08983422070741653, + "learning_rate": 3.325839862298196e-05, + "loss": 0.0184, + "step": 45630 + }, + { + "epoch": 0.3373643594216611, + "grad_norm": 0.10300349444150925, + "learning_rate": 3.3254688983855656e-05, + "loss": 0.0176, + "step": 45640 + }, + { + "epoch": 0.33743827799296294, + "grad_norm": 0.07111994177103043, + "learning_rate": 3.3250979344729345e-05, + "loss": 0.0191, + "step": 45650 + }, + { + "epoch": 0.3375121965642648, + "grad_norm": 0.06482949107885361, + "learning_rate": 3.324726970560304e-05, + "loss": 0.0156, + "step": 45660 + }, + { + "epoch": 0.33758611513556663, + "grad_norm": 0.09989634156227112, + "learning_rate": 3.324356006647674e-05, + "loss": 0.019, + "step": 45670 + }, + { + "epoch": 0.33766003370686853, + "grad_norm": 0.07140668481588364, + "learning_rate": 3.3239850427350426e-05, + "loss": 0.0186, + "step": 45680 + }, + { + "epoch": 0.3377339522781704, + "grad_norm": 0.07863501459360123, + "learning_rate": 3.323614078822412e-05, + "loss": 0.0172, + "step": 45690 + }, + { + "epoch": 0.3378078708494722, + "grad_norm": 0.0761060044169426, + "learning_rate": 3.323243114909782e-05, + "loss": 0.0181, + "step": 45700 + }, + { + "epoch": 0.33788178942077407, + "grad_norm": 0.07722659409046173, + "learning_rate": 3.3228721509971514e-05, + "loss": 0.0177, + "step": 45710 + }, + { + "epoch": 0.3379557079920759, + "grad_norm": 0.0757056325674057, + "learning_rate": 3.322501187084521e-05, + "loss": 0.0185, + "step": 45720 + }, + { + "epoch": 0.33802962656337776, + "grad_norm": 0.09015494585037231, + "learning_rate": 3.32213022317189e-05, + "loss": 0.0162, + "step": 45730 + }, + { + "epoch": 0.33810354513467966, + "grad_norm": 0.09859200567007065, + "learning_rate": 3.3217592592592595e-05, + "loss": 0.0176, + "step": 45740 + }, + { + "epoch": 0.3381774637059815, + "grad_norm": 0.08525702357292175, + "learning_rate": 3.321388295346629e-05, + "loss": 0.0214, + "step": 45750 + }, + { + "epoch": 0.33825138227728335, + "grad_norm": 0.07817837595939636, + "learning_rate": 3.321017331433998e-05, + "loss": 0.0172, + "step": 45760 + }, + { + "epoch": 0.3383253008485852, + "grad_norm": 0.06586714833974838, + 
"learning_rate": 3.3206463675213676e-05, + "loss": 0.0174, + "step": 45770 + }, + { + "epoch": 0.33839921941988704, + "grad_norm": 0.07378669828176498, + "learning_rate": 3.320275403608737e-05, + "loss": 0.0167, + "step": 45780 + }, + { + "epoch": 0.3384731379911889, + "grad_norm": 0.09944503009319305, + "learning_rate": 3.319904439696107e-05, + "loss": 0.0178, + "step": 45790 + }, + { + "epoch": 0.33854705656249073, + "grad_norm": 0.10161613672971725, + "learning_rate": 3.3195334757834764e-05, + "loss": 0.0202, + "step": 45800 + }, + { + "epoch": 0.33862097513379263, + "grad_norm": 0.09542983025312424, + "learning_rate": 3.319162511870845e-05, + "loss": 0.0191, + "step": 45810 + }, + { + "epoch": 0.3386948937050945, + "grad_norm": 0.0869336798787117, + "learning_rate": 3.318791547958215e-05, + "loss": 0.0191, + "step": 45820 + }, + { + "epoch": 0.3387688122763963, + "grad_norm": 0.13735035061836243, + "learning_rate": 3.318420584045584e-05, + "loss": 0.021, + "step": 45830 + }, + { + "epoch": 0.33884273084769817, + "grad_norm": 0.14309273660182953, + "learning_rate": 3.3180496201329534e-05, + "loss": 0.0202, + "step": 45840 + }, + { + "epoch": 0.338916649419, + "grad_norm": 0.08051399141550064, + "learning_rate": 3.317678656220323e-05, + "loss": 0.0196, + "step": 45850 + }, + { + "epoch": 0.33899056799030186, + "grad_norm": 0.12576155364513397, + "learning_rate": 3.3173076923076926e-05, + "loss": 0.019, + "step": 45860 + }, + { + "epoch": 0.33906448656160376, + "grad_norm": 0.0772315189242363, + "learning_rate": 3.316936728395062e-05, + "loss": 0.0179, + "step": 45870 + }, + { + "epoch": 0.3391384051329056, + "grad_norm": 0.0837082490324974, + "learning_rate": 3.316565764482431e-05, + "loss": 0.0191, + "step": 45880 + }, + { + "epoch": 0.33921232370420745, + "grad_norm": 0.0778370201587677, + "learning_rate": 3.316194800569801e-05, + "loss": 0.017, + "step": 45890 + }, + { + "epoch": 0.3392862422755093, + "grad_norm": 0.07121980935335159, + "learning_rate": 3.31582383665717e-05, + "loss": 0.0203, + "step": 45900 + }, + { + "epoch": 0.33936016084681114, + "grad_norm": 0.1336478441953659, + "learning_rate": 3.315452872744539e-05, + "loss": 0.0178, + "step": 45910 + }, + { + "epoch": 0.339434079418113, + "grad_norm": 0.10090693086385727, + "learning_rate": 3.315081908831909e-05, + "loss": 0.0193, + "step": 45920 + }, + { + "epoch": 0.33950799798941483, + "grad_norm": 0.06936877965927124, + "learning_rate": 3.3147109449192784e-05, + "loss": 0.0164, + "step": 45930 + }, + { + "epoch": 0.33958191656071673, + "grad_norm": 0.09035806357860565, + "learning_rate": 3.314339981006648e-05, + "loss": 0.0173, + "step": 45940 + }, + { + "epoch": 0.3396558351320186, + "grad_norm": 0.08105256408452988, + "learning_rate": 3.3139690170940176e-05, + "loss": 0.0169, + "step": 45950 + }, + { + "epoch": 0.3397297537033204, + "grad_norm": 0.10572589933872223, + "learning_rate": 3.3135980531813866e-05, + "loss": 0.0184, + "step": 45960 + }, + { + "epoch": 0.33980367227462227, + "grad_norm": 0.1222388744354248, + "learning_rate": 3.313227089268756e-05, + "loss": 0.0185, + "step": 45970 + }, + { + "epoch": 0.3398775908459241, + "grad_norm": 0.08362412452697754, + "learning_rate": 3.312856125356126e-05, + "loss": 0.0184, + "step": 45980 + }, + { + "epoch": 0.33995150941722596, + "grad_norm": 0.0899990126490593, + "learning_rate": 3.312485161443495e-05, + "loss": 0.0184, + "step": 45990 + }, + { + "epoch": 0.34002542798852786, + "grad_norm": 0.06433310359716415, + "learning_rate": 3.312114197530864e-05, + "loss": 
0.0165, + "step": 46000 + }, + { + "epoch": 0.3400993465598297, + "grad_norm": 0.08536859601736069, + "learning_rate": 3.311743233618234e-05, + "loss": 0.0173, + "step": 46010 + }, + { + "epoch": 0.34017326513113155, + "grad_norm": 0.10119964927434921, + "learning_rate": 3.3113722697056035e-05, + "loss": 0.018, + "step": 46020 + }, + { + "epoch": 0.3402471837024334, + "grad_norm": 0.07824676483869553, + "learning_rate": 3.311001305792973e-05, + "loss": 0.0159, + "step": 46030 + }, + { + "epoch": 0.34032110227373524, + "grad_norm": 0.0952010378241539, + "learning_rate": 3.310630341880342e-05, + "loss": 0.0178, + "step": 46040 + }, + { + "epoch": 0.3403950208450371, + "grad_norm": 0.09411787986755371, + "learning_rate": 3.3102593779677116e-05, + "loss": 0.0201, + "step": 46050 + }, + { + "epoch": 0.34046893941633893, + "grad_norm": 0.12072297930717468, + "learning_rate": 3.3098884140550805e-05, + "loss": 0.0205, + "step": 46060 + }, + { + "epoch": 0.34054285798764083, + "grad_norm": 0.07515337318181992, + "learning_rate": 3.30951745014245e-05, + "loss": 0.019, + "step": 46070 + }, + { + "epoch": 0.3406167765589427, + "grad_norm": 0.07915613800287247, + "learning_rate": 3.30914648622982e-05, + "loss": 0.0167, + "step": 46080 + }, + { + "epoch": 0.3406906951302445, + "grad_norm": 0.06810886412858963, + "learning_rate": 3.308775522317189e-05, + "loss": 0.0187, + "step": 46090 + }, + { + "epoch": 0.34076461370154637, + "grad_norm": 0.05875418707728386, + "learning_rate": 3.308404558404559e-05, + "loss": 0.0191, + "step": 46100 + }, + { + "epoch": 0.3408385322728482, + "grad_norm": 0.11320916563272476, + "learning_rate": 3.308033594491928e-05, + "loss": 0.0191, + "step": 46110 + }, + { + "epoch": 0.34091245084415006, + "grad_norm": 0.10753299295902252, + "learning_rate": 3.3076626305792974e-05, + "loss": 0.0196, + "step": 46120 + }, + { + "epoch": 0.34098636941545196, + "grad_norm": 0.06757346540689468, + "learning_rate": 3.307291666666667e-05, + "loss": 0.0214, + "step": 46130 + }, + { + "epoch": 0.3410602879867538, + "grad_norm": 0.06744559854269028, + "learning_rate": 3.306920702754036e-05, + "loss": 0.022, + "step": 46140 + }, + { + "epoch": 0.34113420655805565, + "grad_norm": 0.09945414960384369, + "learning_rate": 3.3065497388414055e-05, + "loss": 0.0208, + "step": 46150 + }, + { + "epoch": 0.3412081251293575, + "grad_norm": 0.07971841096878052, + "learning_rate": 3.306178774928775e-05, + "loss": 0.0184, + "step": 46160 + }, + { + "epoch": 0.34128204370065934, + "grad_norm": 0.09220457077026367, + "learning_rate": 3.305807811016145e-05, + "loss": 0.0186, + "step": 46170 + }, + { + "epoch": 0.3413559622719612, + "grad_norm": 0.07157941162586212, + "learning_rate": 3.305436847103514e-05, + "loss": 0.0177, + "step": 46180 + }, + { + "epoch": 0.34142988084326303, + "grad_norm": 0.08177167177200317, + "learning_rate": 3.305065883190883e-05, + "loss": 0.0193, + "step": 46190 + }, + { + "epoch": 0.34150379941456493, + "grad_norm": 0.09292397648096085, + "learning_rate": 3.304694919278253e-05, + "loss": 0.0183, + "step": 46200 + }, + { + "epoch": 0.3415777179858668, + "grad_norm": 0.0946432575583458, + "learning_rate": 3.3043239553656224e-05, + "loss": 0.0162, + "step": 46210 + }, + { + "epoch": 0.3416516365571686, + "grad_norm": 0.0911048948764801, + "learning_rate": 3.303952991452991e-05, + "loss": 0.0205, + "step": 46220 + }, + { + "epoch": 0.34172555512847047, + "grad_norm": 0.0834997296333313, + "learning_rate": 3.303582027540361e-05, + "loss": 0.0195, + "step": 46230 + }, + { + "epoch": 
0.3417994736997723, + "grad_norm": 0.12757158279418945, + "learning_rate": 3.3032110636277305e-05, + "loss": 0.0181, + "step": 46240 + }, + { + "epoch": 0.34187339227107416, + "grad_norm": 0.10090693831443787, + "learning_rate": 3.3028400997151e-05, + "loss": 0.0188, + "step": 46250 + }, + { + "epoch": 0.34194731084237606, + "grad_norm": 0.11772280186414719, + "learning_rate": 3.30246913580247e-05, + "loss": 0.0188, + "step": 46260 + }, + { + "epoch": 0.3420212294136779, + "grad_norm": 0.0676715150475502, + "learning_rate": 3.3020981718898386e-05, + "loss": 0.0188, + "step": 46270 + }, + { + "epoch": 0.34209514798497975, + "grad_norm": 0.09699052572250366, + "learning_rate": 3.301727207977208e-05, + "loss": 0.0171, + "step": 46280 + }, + { + "epoch": 0.3421690665562816, + "grad_norm": 0.07975872606039047, + "learning_rate": 3.301356244064577e-05, + "loss": 0.0213, + "step": 46290 + }, + { + "epoch": 0.34224298512758344, + "grad_norm": 0.045385707169771194, + "learning_rate": 3.300985280151947e-05, + "loss": 0.0188, + "step": 46300 + }, + { + "epoch": 0.3423169036988853, + "grad_norm": 0.10349560528993607, + "learning_rate": 3.300614316239316e-05, + "loss": 0.0191, + "step": 46310 + }, + { + "epoch": 0.3423908222701872, + "grad_norm": 0.08033833652734756, + "learning_rate": 3.300243352326686e-05, + "loss": 0.02, + "step": 46320 + }, + { + "epoch": 0.34246474084148903, + "grad_norm": 0.0725812166929245, + "learning_rate": 3.2998723884140555e-05, + "loss": 0.0161, + "step": 46330 + }, + { + "epoch": 0.3425386594127909, + "grad_norm": 0.08852728456258774, + "learning_rate": 3.2995014245014244e-05, + "loss": 0.0195, + "step": 46340 + }, + { + "epoch": 0.3426125779840927, + "grad_norm": 0.07608164101839066, + "learning_rate": 3.299130460588794e-05, + "loss": 0.0171, + "step": 46350 + }, + { + "epoch": 0.34268649655539457, + "grad_norm": 0.08210303634405136, + "learning_rate": 3.2987594966761636e-05, + "loss": 0.0181, + "step": 46360 + }, + { + "epoch": 0.3427604151266964, + "grad_norm": 0.0832902044057846, + "learning_rate": 3.2983885327635326e-05, + "loss": 0.0186, + "step": 46370 + }, + { + "epoch": 0.34283433369799826, + "grad_norm": 0.11009570956230164, + "learning_rate": 3.298017568850902e-05, + "loss": 0.0189, + "step": 46380 + }, + { + "epoch": 0.34290825226930016, + "grad_norm": 0.078762948513031, + "learning_rate": 3.297646604938272e-05, + "loss": 0.018, + "step": 46390 + }, + { + "epoch": 0.342982170840602, + "grad_norm": 0.10385294258594513, + "learning_rate": 3.2972756410256414e-05, + "loss": 0.0226, + "step": 46400 + }, + { + "epoch": 0.34305608941190385, + "grad_norm": 0.08638278394937515, + "learning_rate": 3.296904677113011e-05, + "loss": 0.0166, + "step": 46410 + }, + { + "epoch": 0.3431300079832057, + "grad_norm": 0.0991319864988327, + "learning_rate": 3.29653371320038e-05, + "loss": 0.019, + "step": 46420 + }, + { + "epoch": 0.34320392655450754, + "grad_norm": 0.09847695380449295, + "learning_rate": 3.2961627492877495e-05, + "loss": 0.0165, + "step": 46430 + }, + { + "epoch": 0.3432778451258094, + "grad_norm": 0.07403053343296051, + "learning_rate": 3.295791785375119e-05, + "loss": 0.0189, + "step": 46440 + }, + { + "epoch": 0.3433517636971113, + "grad_norm": 0.06902497261762619, + "learning_rate": 3.295420821462488e-05, + "loss": 0.0177, + "step": 46450 + }, + { + "epoch": 0.34342568226841313, + "grad_norm": 0.07637327909469604, + "learning_rate": 3.2950498575498576e-05, + "loss": 0.0189, + "step": 46460 + }, + { + "epoch": 0.343499600839715, + "grad_norm": 
0.06147047504782677, + "learning_rate": 3.294678893637227e-05, + "loss": 0.018, + "step": 46470 + }, + { + "epoch": 0.3435735194110168, + "grad_norm": 0.08808895945549011, + "learning_rate": 3.294307929724597e-05, + "loss": 0.0177, + "step": 46480 + }, + { + "epoch": 0.34364743798231867, + "grad_norm": 0.0873732939362526, + "learning_rate": 3.2939369658119664e-05, + "loss": 0.0157, + "step": 46490 + }, + { + "epoch": 0.3437213565536205, + "grad_norm": 0.06639257818460464, + "learning_rate": 3.293566001899335e-05, + "loss": 0.0181, + "step": 46500 + }, + { + "epoch": 0.34379527512492236, + "grad_norm": 0.08449820429086685, + "learning_rate": 3.293195037986705e-05, + "loss": 0.0199, + "step": 46510 + }, + { + "epoch": 0.34386919369622426, + "grad_norm": 0.10189925134181976, + "learning_rate": 3.292824074074074e-05, + "loss": 0.0188, + "step": 46520 + }, + { + "epoch": 0.3439431122675261, + "grad_norm": 0.08540836721658707, + "learning_rate": 3.2924531101614434e-05, + "loss": 0.0196, + "step": 46530 + }, + { + "epoch": 0.34401703083882795, + "grad_norm": 0.08655638247728348, + "learning_rate": 3.292082146248813e-05, + "loss": 0.02, + "step": 46540 + }, + { + "epoch": 0.3440909494101298, + "grad_norm": 0.08147939294576645, + "learning_rate": 3.2917111823361826e-05, + "loss": 0.0185, + "step": 46550 + }, + { + "epoch": 0.34416486798143164, + "grad_norm": 0.07434861361980438, + "learning_rate": 3.291340218423552e-05, + "loss": 0.0194, + "step": 46560 + }, + { + "epoch": 0.3442387865527335, + "grad_norm": 0.07616791874170303, + "learning_rate": 3.290969254510921e-05, + "loss": 0.0186, + "step": 46570 + }, + { + "epoch": 0.3443127051240354, + "grad_norm": 0.08321139961481094, + "learning_rate": 3.290598290598291e-05, + "loss": 0.0224, + "step": 46580 + }, + { + "epoch": 0.34438662369533724, + "grad_norm": 0.11349760740995407, + "learning_rate": 3.29022732668566e-05, + "loss": 0.0188, + "step": 46590 + }, + { + "epoch": 0.3444605422666391, + "grad_norm": 0.09545985609292984, + "learning_rate": 3.289856362773029e-05, + "loss": 0.0193, + "step": 46600 + }, + { + "epoch": 0.3445344608379409, + "grad_norm": 0.0886853039264679, + "learning_rate": 3.289485398860399e-05, + "loss": 0.0206, + "step": 46610 + }, + { + "epoch": 0.34460837940924277, + "grad_norm": 0.07644405961036682, + "learning_rate": 3.2891144349477684e-05, + "loss": 0.0169, + "step": 46620 + }, + { + "epoch": 0.3446822979805446, + "grad_norm": 0.0853935107588768, + "learning_rate": 3.288743471035138e-05, + "loss": 0.0193, + "step": 46630 + }, + { + "epoch": 0.34475621655184646, + "grad_norm": 0.061725448817014694, + "learning_rate": 3.2883725071225076e-05, + "loss": 0.0175, + "step": 46640 + }, + { + "epoch": 0.34483013512314836, + "grad_norm": 0.0924752801656723, + "learning_rate": 3.2880015432098765e-05, + "loss": 0.0184, + "step": 46650 + }, + { + "epoch": 0.3449040536944502, + "grad_norm": 0.10381954163312912, + "learning_rate": 3.287630579297246e-05, + "loss": 0.0181, + "step": 46660 + }, + { + "epoch": 0.34497797226575205, + "grad_norm": 0.09819740802049637, + "learning_rate": 3.287259615384616e-05, + "loss": 0.0181, + "step": 46670 + }, + { + "epoch": 0.3450518908370539, + "grad_norm": 0.08267750591039658, + "learning_rate": 3.2868886514719846e-05, + "loss": 0.0196, + "step": 46680 + }, + { + "epoch": 0.34512580940835574, + "grad_norm": 0.054967526346445084, + "learning_rate": 3.286517687559354e-05, + "loss": 0.0175, + "step": 46690 + }, + { + "epoch": 0.3451997279796576, + "grad_norm": 0.09669110924005508, + "learning_rate": 
3.286146723646724e-05, + "loss": 0.0196, + "step": 46700 + }, + { + "epoch": 0.3452736465509595, + "grad_norm": 0.07660909742116928, + "learning_rate": 3.2857757597340934e-05, + "loss": 0.02, + "step": 46710 + }, + { + "epoch": 0.34534756512226134, + "grad_norm": 0.09221714735031128, + "learning_rate": 3.285404795821463e-05, + "loss": 0.0155, + "step": 46720 + }, + { + "epoch": 0.3454214836935632, + "grad_norm": 0.08598586916923523, + "learning_rate": 3.285033831908832e-05, + "loss": 0.0184, + "step": 46730 + }, + { + "epoch": 0.345495402264865, + "grad_norm": 0.08627812564373016, + "learning_rate": 3.2846628679962015e-05, + "loss": 0.0188, + "step": 46740 + }, + { + "epoch": 0.34556932083616687, + "grad_norm": 0.10398365557193756, + "learning_rate": 3.2842919040835705e-05, + "loss": 0.0162, + "step": 46750 + }, + { + "epoch": 0.3456432394074687, + "grad_norm": 0.08908989280462265, + "learning_rate": 3.28392094017094e-05, + "loss": 0.0209, + "step": 46760 + }, + { + "epoch": 0.34571715797877056, + "grad_norm": 0.08196178078651428, + "learning_rate": 3.2835499762583097e-05, + "loss": 0.019, + "step": 46770 + }, + { + "epoch": 0.34579107655007246, + "grad_norm": 0.09578578919172287, + "learning_rate": 3.283179012345679e-05, + "loss": 0.0177, + "step": 46780 + }, + { + "epoch": 0.3458649951213743, + "grad_norm": 0.0772024616599083, + "learning_rate": 3.282808048433049e-05, + "loss": 0.018, + "step": 46790 + }, + { + "epoch": 0.34593891369267615, + "grad_norm": 0.06309277564287186, + "learning_rate": 3.282437084520418e-05, + "loss": 0.0181, + "step": 46800 + }, + { + "epoch": 0.346012832263978, + "grad_norm": 0.10176903009414673, + "learning_rate": 3.2820661206077874e-05, + "loss": 0.0224, + "step": 46810 + }, + { + "epoch": 0.34608675083527984, + "grad_norm": 0.10570260137319565, + "learning_rate": 3.281695156695157e-05, + "loss": 0.0172, + "step": 46820 + }, + { + "epoch": 0.3461606694065817, + "grad_norm": 0.10192044824361801, + "learning_rate": 3.281324192782526e-05, + "loss": 0.0191, + "step": 46830 + }, + { + "epoch": 0.3462345879778836, + "grad_norm": 0.07488798350095749, + "learning_rate": 3.2809532288698955e-05, + "loss": 0.02, + "step": 46840 + }, + { + "epoch": 0.34630850654918544, + "grad_norm": 0.08586698770523071, + "learning_rate": 3.280582264957265e-05, + "loss": 0.0185, + "step": 46850 + }, + { + "epoch": 0.3463824251204873, + "grad_norm": 0.09512021392583847, + "learning_rate": 3.280211301044635e-05, + "loss": 0.0178, + "step": 46860 + }, + { + "epoch": 0.3464563436917891, + "grad_norm": 0.09688975661993027, + "learning_rate": 3.279840337132004e-05, + "loss": 0.02, + "step": 46870 + }, + { + "epoch": 0.34653026226309097, + "grad_norm": 0.08890626579523087, + "learning_rate": 3.279469373219373e-05, + "loss": 0.0184, + "step": 46880 + }, + { + "epoch": 0.3466041808343928, + "grad_norm": 0.09725047647953033, + "learning_rate": 3.279098409306743e-05, + "loss": 0.02, + "step": 46890 + }, + { + "epoch": 0.34667809940569466, + "grad_norm": 0.124251589179039, + "learning_rate": 3.2787274453941124e-05, + "loss": 0.0179, + "step": 46900 + }, + { + "epoch": 0.34675201797699656, + "grad_norm": 0.08104971796274185, + "learning_rate": 3.278356481481481e-05, + "loss": 0.0173, + "step": 46910 + }, + { + "epoch": 0.3468259365482984, + "grad_norm": 0.119538314640522, + "learning_rate": 3.277985517568851e-05, + "loss": 0.0217, + "step": 46920 + }, + { + "epoch": 0.34689985511960025, + "grad_norm": 0.0804719477891922, + "learning_rate": 3.2776145536562205e-05, + "loss": 0.0204, + "step": 46930 + 
}, + { + "epoch": 0.3469737736909021, + "grad_norm": 0.09438547492027283, + "learning_rate": 3.27724358974359e-05, + "loss": 0.02, + "step": 46940 + }, + { + "epoch": 0.34704769226220394, + "grad_norm": 0.08524240553379059, + "learning_rate": 3.27687262583096e-05, + "loss": 0.0192, + "step": 46950 + }, + { + "epoch": 0.3471216108335058, + "grad_norm": 0.06681135296821594, + "learning_rate": 3.2765016619183286e-05, + "loss": 0.0194, + "step": 46960 + }, + { + "epoch": 0.3471955294048077, + "grad_norm": 0.11396799981594086, + "learning_rate": 3.276130698005698e-05, + "loss": 0.0201, + "step": 46970 + }, + { + "epoch": 0.34726944797610954, + "grad_norm": 0.09392783045768738, + "learning_rate": 3.275759734093067e-05, + "loss": 0.0192, + "step": 46980 + }, + { + "epoch": 0.3473433665474114, + "grad_norm": 0.08238954097032547, + "learning_rate": 3.275388770180437e-05, + "loss": 0.0192, + "step": 46990 + }, + { + "epoch": 0.3474172851187132, + "grad_norm": 0.08259283751249313, + "learning_rate": 3.275017806267806e-05, + "loss": 0.0193, + "step": 47000 + }, + { + "epoch": 0.34749120369001507, + "grad_norm": 0.06521821767091751, + "learning_rate": 3.274646842355176e-05, + "loss": 0.0184, + "step": 47010 + }, + { + "epoch": 0.3475651222613169, + "grad_norm": 0.08316667377948761, + "learning_rate": 3.2742758784425455e-05, + "loss": 0.017, + "step": 47020 + }, + { + "epoch": 0.34763904083261876, + "grad_norm": 0.09221500903367996, + "learning_rate": 3.2739049145299144e-05, + "loss": 0.019, + "step": 47030 + }, + { + "epoch": 0.34771295940392066, + "grad_norm": 0.1091422289609909, + "learning_rate": 3.273533950617284e-05, + "loss": 0.0195, + "step": 47040 + }, + { + "epoch": 0.3477868779752225, + "grad_norm": 0.06781918555498123, + "learning_rate": 3.2731629867046536e-05, + "loss": 0.0171, + "step": 47050 + }, + { + "epoch": 0.34786079654652435, + "grad_norm": 0.07065023481845856, + "learning_rate": 3.2727920227920225e-05, + "loss": 0.0187, + "step": 47060 + }, + { + "epoch": 0.3479347151178262, + "grad_norm": 0.060498230159282684, + "learning_rate": 3.272421058879392e-05, + "loss": 0.02, + "step": 47070 + }, + { + "epoch": 0.34800863368912804, + "grad_norm": 0.08753509074449539, + "learning_rate": 3.272050094966762e-05, + "loss": 0.0205, + "step": 47080 + }, + { + "epoch": 0.3480825522604299, + "grad_norm": 0.08763568848371506, + "learning_rate": 3.271679131054131e-05, + "loss": 0.0193, + "step": 47090 + }, + { + "epoch": 0.3481564708317318, + "grad_norm": 0.08908195793628693, + "learning_rate": 3.271308167141501e-05, + "loss": 0.0199, + "step": 47100 + }, + { + "epoch": 0.34823038940303364, + "grad_norm": 0.07962878048419952, + "learning_rate": 3.27093720322887e-05, + "loss": 0.0176, + "step": 47110 + }, + { + "epoch": 0.3483043079743355, + "grad_norm": 0.08842107653617859, + "learning_rate": 3.2705662393162394e-05, + "loss": 0.0206, + "step": 47120 + }, + { + "epoch": 0.3483782265456373, + "grad_norm": 0.1007806733250618, + "learning_rate": 3.270195275403609e-05, + "loss": 0.0183, + "step": 47130 + }, + { + "epoch": 0.34845214511693917, + "grad_norm": 0.05309021845459938, + "learning_rate": 3.269824311490978e-05, + "loss": 0.0192, + "step": 47140 + }, + { + "epoch": 0.348526063688241, + "grad_norm": 0.07942581921815872, + "learning_rate": 3.2694533475783476e-05, + "loss": 0.0179, + "step": 47150 + }, + { + "epoch": 0.34859998225954286, + "grad_norm": 0.08619439601898193, + "learning_rate": 3.269082383665717e-05, + "loss": 0.0179, + "step": 47160 + }, + { + "epoch": 0.34867390083084476, + 
"grad_norm": 0.1479090005159378, + "learning_rate": 3.268711419753087e-05, + "loss": 0.0215, + "step": 47170 + }, + { + "epoch": 0.3487478194021466, + "grad_norm": 0.07243809849023819, + "learning_rate": 3.2683404558404563e-05, + "loss": 0.0173, + "step": 47180 + }, + { + "epoch": 0.34882173797344845, + "grad_norm": 0.059807922691106796, + "learning_rate": 3.267969491927825e-05, + "loss": 0.0174, + "step": 47190 + }, + { + "epoch": 0.3488956565447503, + "grad_norm": 0.12337792664766312, + "learning_rate": 3.267598528015195e-05, + "loss": 0.0172, + "step": 47200 + }, + { + "epoch": 0.34896957511605214, + "grad_norm": 0.09791093319654465, + "learning_rate": 3.267227564102564e-05, + "loss": 0.0193, + "step": 47210 + }, + { + "epoch": 0.349043493687354, + "grad_norm": 0.15573714673519135, + "learning_rate": 3.2668566001899334e-05, + "loss": 0.0203, + "step": 47220 + }, + { + "epoch": 0.3491174122586559, + "grad_norm": 0.06387767195701599, + "learning_rate": 3.266485636277303e-05, + "loss": 0.0181, + "step": 47230 + }, + { + "epoch": 0.34919133082995774, + "grad_norm": 0.06677080690860748, + "learning_rate": 3.2661146723646726e-05, + "loss": 0.0192, + "step": 47240 + }, + { + "epoch": 0.3492652494012596, + "grad_norm": 0.10616503655910492, + "learning_rate": 3.265743708452042e-05, + "loss": 0.0209, + "step": 47250 + }, + { + "epoch": 0.3493391679725614, + "grad_norm": 0.0729021355509758, + "learning_rate": 3.265372744539411e-05, + "loss": 0.0166, + "step": 47260 + }, + { + "epoch": 0.34941308654386327, + "grad_norm": 0.06525515764951706, + "learning_rate": 3.265001780626781e-05, + "loss": 0.0172, + "step": 47270 + }, + { + "epoch": 0.3494870051151651, + "grad_norm": 0.0829407349228859, + "learning_rate": 3.26463081671415e-05, + "loss": 0.0221, + "step": 47280 + }, + { + "epoch": 0.34956092368646696, + "grad_norm": 0.08970022201538086, + "learning_rate": 3.264259852801519e-05, + "loss": 0.0174, + "step": 47290 + }, + { + "epoch": 0.34963484225776886, + "grad_norm": 0.09719527512788773, + "learning_rate": 3.263888888888889e-05, + "loss": 0.0178, + "step": 47300 + }, + { + "epoch": 0.3497087608290707, + "grad_norm": 0.1067635715007782, + "learning_rate": 3.2635179249762584e-05, + "loss": 0.0175, + "step": 47310 + }, + { + "epoch": 0.34978267940037255, + "grad_norm": 0.10435649007558823, + "learning_rate": 3.263146961063628e-05, + "loss": 0.0201, + "step": 47320 + }, + { + "epoch": 0.3498565979716744, + "grad_norm": 0.10568960011005402, + "learning_rate": 3.2627759971509976e-05, + "loss": 0.0205, + "step": 47330 + }, + { + "epoch": 0.34993051654297624, + "grad_norm": 0.08821310102939606, + "learning_rate": 3.2624050332383665e-05, + "loss": 0.0188, + "step": 47340 + }, + { + "epoch": 0.3500044351142781, + "grad_norm": 0.07589121907949448, + "learning_rate": 3.262034069325736e-05, + "loss": 0.0189, + "step": 47350 + }, + { + "epoch": 0.35007835368558, + "grad_norm": 0.07190071791410446, + "learning_rate": 3.261663105413106e-05, + "loss": 0.0165, + "step": 47360 + }, + { + "epoch": 0.35015227225688184, + "grad_norm": 0.09277673810720444, + "learning_rate": 3.2612921415004746e-05, + "loss": 0.0185, + "step": 47370 + }, + { + "epoch": 0.3502261908281837, + "grad_norm": 0.06976188719272614, + "learning_rate": 3.260921177587844e-05, + "loss": 0.0198, + "step": 47380 + }, + { + "epoch": 0.3503001093994855, + "grad_norm": 0.09157153218984604, + "learning_rate": 3.260550213675214e-05, + "loss": 0.0178, + "step": 47390 + }, + { + "epoch": 0.35037402797078737, + "grad_norm": 0.07097479701042175, + 
"learning_rate": 3.2601792497625834e-05, + "loss": 0.0202, + "step": 47400 + }, + { + "epoch": 0.3504479465420892, + "grad_norm": 0.06325814872980118, + "learning_rate": 3.259808285849953e-05, + "loss": 0.0182, + "step": 47410 + }, + { + "epoch": 0.35052186511339106, + "grad_norm": 0.08848169445991516, + "learning_rate": 3.259437321937322e-05, + "loss": 0.0196, + "step": 47420 + }, + { + "epoch": 0.35059578368469296, + "grad_norm": 0.07658163458108902, + "learning_rate": 3.2590663580246915e-05, + "loss": 0.0199, + "step": 47430 + }, + { + "epoch": 0.3506697022559948, + "grad_norm": 0.09558193385601044, + "learning_rate": 3.2586953941120604e-05, + "loss": 0.0185, + "step": 47440 + }, + { + "epoch": 0.35074362082729665, + "grad_norm": 0.07930658757686615, + "learning_rate": 3.25832443019943e-05, + "loss": 0.0165, + "step": 47450 + }, + { + "epoch": 0.3508175393985985, + "grad_norm": 0.099174365401268, + "learning_rate": 3.2579534662868e-05, + "loss": 0.0205, + "step": 47460 + }, + { + "epoch": 0.35089145796990034, + "grad_norm": 0.08491765707731247, + "learning_rate": 3.257582502374169e-05, + "loss": 0.0192, + "step": 47470 + }, + { + "epoch": 0.3509653765412022, + "grad_norm": 0.09877867996692657, + "learning_rate": 3.257211538461539e-05, + "loss": 0.0166, + "step": 47480 + }, + { + "epoch": 0.3510392951125041, + "grad_norm": 0.09346239268779755, + "learning_rate": 3.256840574548908e-05, + "loss": 0.017, + "step": 47490 + }, + { + "epoch": 0.35111321368380594, + "grad_norm": 0.11570316553115845, + "learning_rate": 3.256469610636277e-05, + "loss": 0.018, + "step": 47500 + }, + { + "epoch": 0.3511871322551078, + "grad_norm": 0.0938878133893013, + "learning_rate": 3.256098646723647e-05, + "loss": 0.0201, + "step": 47510 + }, + { + "epoch": 0.3512610508264096, + "grad_norm": 0.10068412870168686, + "learning_rate": 3.255727682811016e-05, + "loss": 0.0203, + "step": 47520 + }, + { + "epoch": 0.35133496939771147, + "grad_norm": 0.08617176115512848, + "learning_rate": 3.2553567188983854e-05, + "loss": 0.0175, + "step": 47530 + }, + { + "epoch": 0.3514088879690133, + "grad_norm": 0.07788795977830887, + "learning_rate": 3.254985754985755e-05, + "loss": 0.0194, + "step": 47540 + }, + { + "epoch": 0.35148280654031516, + "grad_norm": 0.08169343322515488, + "learning_rate": 3.2546147910731246e-05, + "loss": 0.0186, + "step": 47550 + }, + { + "epoch": 0.35155672511161706, + "grad_norm": 0.1000077873468399, + "learning_rate": 3.254243827160494e-05, + "loss": 0.0195, + "step": 47560 + }, + { + "epoch": 0.3516306436829189, + "grad_norm": 0.05659700557589531, + "learning_rate": 3.253872863247863e-05, + "loss": 0.0183, + "step": 47570 + }, + { + "epoch": 0.35170456225422075, + "grad_norm": 0.10793798416852951, + "learning_rate": 3.253501899335233e-05, + "loss": 0.0182, + "step": 47580 + }, + { + "epoch": 0.3517784808255226, + "grad_norm": 0.07970179617404938, + "learning_rate": 3.2531309354226024e-05, + "loss": 0.0214, + "step": 47590 + }, + { + "epoch": 0.35185239939682444, + "grad_norm": 0.07447035610675812, + "learning_rate": 3.252759971509971e-05, + "loss": 0.0174, + "step": 47600 + }, + { + "epoch": 0.3519263179681263, + "grad_norm": 0.11135456711053848, + "learning_rate": 3.2523890075973415e-05, + "loss": 0.0156, + "step": 47610 + }, + { + "epoch": 0.3520002365394282, + "grad_norm": 0.07851218432188034, + "learning_rate": 3.2520180436847105e-05, + "loss": 0.0194, + "step": 47620 + }, + { + "epoch": 0.35207415511073004, + "grad_norm": 0.07464441657066345, + "learning_rate": 3.25164707977208e-05, + "loss": 
0.0177, + "step": 47630 + }, + { + "epoch": 0.3521480736820319, + "grad_norm": 0.07230711728334427, + "learning_rate": 3.2512761158594497e-05, + "loss": 0.0174, + "step": 47640 + }, + { + "epoch": 0.3522219922533337, + "grad_norm": 0.08653979748487473, + "learning_rate": 3.2509051519468186e-05, + "loss": 0.0204, + "step": 47650 + }, + { + "epoch": 0.35229591082463557, + "grad_norm": 0.08425447344779968, + "learning_rate": 3.250534188034188e-05, + "loss": 0.0186, + "step": 47660 + }, + { + "epoch": 0.3523698293959374, + "grad_norm": 0.11556941270828247, + "learning_rate": 3.250163224121557e-05, + "loss": 0.0205, + "step": 47670 + }, + { + "epoch": 0.35244374796723926, + "grad_norm": 0.11919292062520981, + "learning_rate": 3.249792260208927e-05, + "loss": 0.0178, + "step": 47680 + }, + { + "epoch": 0.35251766653854116, + "grad_norm": 0.07132294774055481, + "learning_rate": 3.249421296296297e-05, + "loss": 0.0195, + "step": 47690 + }, + { + "epoch": 0.352591585109843, + "grad_norm": 0.06593780219554901, + "learning_rate": 3.249050332383666e-05, + "loss": 0.0178, + "step": 47700 + }, + { + "epoch": 0.35266550368114485, + "grad_norm": 0.1030346155166626, + "learning_rate": 3.2486793684710355e-05, + "loss": 0.0205, + "step": 47710 + }, + { + "epoch": 0.3527394222524467, + "grad_norm": 0.07651791721582413, + "learning_rate": 3.2483084045584044e-05, + "loss": 0.016, + "step": 47720 + }, + { + "epoch": 0.35281334082374854, + "grad_norm": 0.09694407880306244, + "learning_rate": 3.247937440645774e-05, + "loss": 0.0178, + "step": 47730 + }, + { + "epoch": 0.3528872593950504, + "grad_norm": 0.09967559576034546, + "learning_rate": 3.2475664767331436e-05, + "loss": 0.0191, + "step": 47740 + }, + { + "epoch": 0.3529611779663523, + "grad_norm": 0.0916508138179779, + "learning_rate": 3.2471955128205125e-05, + "loss": 0.0193, + "step": 47750 + }, + { + "epoch": 0.35303509653765414, + "grad_norm": 0.08274427056312561, + "learning_rate": 3.246824548907883e-05, + "loss": 0.02, + "step": 47760 + }, + { + "epoch": 0.353109015108956, + "grad_norm": 0.08642439544200897, + "learning_rate": 3.246453584995252e-05, + "loss": 0.0174, + "step": 47770 + }, + { + "epoch": 0.3531829336802578, + "grad_norm": 0.09799844026565552, + "learning_rate": 3.246082621082621e-05, + "loss": 0.0176, + "step": 47780 + }, + { + "epoch": 0.35325685225155967, + "grad_norm": 0.10590720176696777, + "learning_rate": 3.245711657169991e-05, + "loss": 0.0157, + "step": 47790 + }, + { + "epoch": 0.3533307708228615, + "grad_norm": 0.07792959362268448, + "learning_rate": 3.24534069325736e-05, + "loss": 0.0158, + "step": 47800 + }, + { + "epoch": 0.35340468939416336, + "grad_norm": 0.0911719799041748, + "learning_rate": 3.2449697293447294e-05, + "loss": 0.0195, + "step": 47810 + }, + { + "epoch": 0.35347860796546526, + "grad_norm": 0.06966444104909897, + "learning_rate": 3.244598765432099e-05, + "loss": 0.0199, + "step": 47820 + }, + { + "epoch": 0.3535525265367671, + "grad_norm": 0.07988017052412033, + "learning_rate": 3.244227801519468e-05, + "loss": 0.0177, + "step": 47830 + }, + { + "epoch": 0.35362644510806895, + "grad_norm": 0.10488210618495941, + "learning_rate": 3.243856837606838e-05, + "loss": 0.0169, + "step": 47840 + }, + { + "epoch": 0.3537003636793708, + "grad_norm": 0.06732849776744843, + "learning_rate": 3.243485873694207e-05, + "loss": 0.0183, + "step": 47850 + }, + { + "epoch": 0.35377428225067264, + "grad_norm": 0.09752733260393143, + "learning_rate": 3.243114909781577e-05, + "loss": 0.0195, + "step": 47860 + }, + { + "epoch": 
0.3538482008219745, + "grad_norm": 0.09057949483394623, + "learning_rate": 3.242743945868946e-05, + "loss": 0.0191, + "step": 47870 + }, + { + "epoch": 0.3539221193932764, + "grad_norm": 0.1418922394514084, + "learning_rate": 3.242372981956315e-05, + "loss": 0.0167, + "step": 47880 + }, + { + "epoch": 0.35399603796457824, + "grad_norm": 0.10503596067428589, + "learning_rate": 3.242002018043685e-05, + "loss": 0.0193, + "step": 47890 + }, + { + "epoch": 0.3540699565358801, + "grad_norm": 0.07770757377147675, + "learning_rate": 3.241631054131054e-05, + "loss": 0.0181, + "step": 47900 + }, + { + "epoch": 0.3541438751071819, + "grad_norm": 0.08247490972280502, + "learning_rate": 3.241260090218424e-05, + "loss": 0.0171, + "step": 47910 + }, + { + "epoch": 0.35421779367848377, + "grad_norm": 0.07183904200792313, + "learning_rate": 3.2408891263057936e-05, + "loss": 0.0157, + "step": 47920 + }, + { + "epoch": 0.3542917122497856, + "grad_norm": 0.07240360975265503, + "learning_rate": 3.2405181623931625e-05, + "loss": 0.0166, + "step": 47930 + }, + { + "epoch": 0.35436563082108746, + "grad_norm": 0.07742757350206375, + "learning_rate": 3.240147198480532e-05, + "loss": 0.0178, + "step": 47940 + }, + { + "epoch": 0.35443954939238936, + "grad_norm": 0.05356835573911667, + "learning_rate": 3.239776234567901e-05, + "loss": 0.0175, + "step": 47950 + }, + { + "epoch": 0.3545134679636912, + "grad_norm": 0.05533919483423233, + "learning_rate": 3.2394052706552707e-05, + "loss": 0.0178, + "step": 47960 + }, + { + "epoch": 0.35458738653499305, + "grad_norm": 0.07981298863887787, + "learning_rate": 3.23903430674264e-05, + "loss": 0.0193, + "step": 47970 + }, + { + "epoch": 0.3546613051062949, + "grad_norm": 0.09771832078695297, + "learning_rate": 3.238663342830009e-05, + "loss": 0.0176, + "step": 47980 + }, + { + "epoch": 0.35473522367759674, + "grad_norm": 0.07720401883125305, + "learning_rate": 3.2382923789173794e-05, + "loss": 0.0156, + "step": 47990 + }, + { + "epoch": 0.3548091422488986, + "grad_norm": 0.09286272525787354, + "learning_rate": 3.237921415004749e-05, + "loss": 0.0215, + "step": 48000 + }, + { + "epoch": 0.3548830608202005, + "grad_norm": 0.08557448536157608, + "learning_rate": 3.237550451092118e-05, + "loss": 0.0195, + "step": 48010 + }, + { + "epoch": 0.35495697939150234, + "grad_norm": 0.06496861577033997, + "learning_rate": 3.2371794871794876e-05, + "loss": 0.0205, + "step": 48020 + }, + { + "epoch": 0.3550308979628042, + "grad_norm": 0.07169222831726074, + "learning_rate": 3.2368085232668565e-05, + "loss": 0.0194, + "step": 48030 + }, + { + "epoch": 0.355104816534106, + "grad_norm": 0.11181885749101639, + "learning_rate": 3.236437559354226e-05, + "loss": 0.0198, + "step": 48040 + }, + { + "epoch": 0.3551787351054079, + "grad_norm": 0.07448727637529373, + "learning_rate": 3.236066595441596e-05, + "loss": 0.0176, + "step": 48050 + }, + { + "epoch": 0.3552526536767097, + "grad_norm": 0.11262239515781403, + "learning_rate": 3.235695631528965e-05, + "loss": 0.0184, + "step": 48060 + }, + { + "epoch": 0.35532657224801156, + "grad_norm": 0.061267927289009094, + "learning_rate": 3.235324667616335e-05, + "loss": 0.0185, + "step": 48070 + }, + { + "epoch": 0.35540049081931346, + "grad_norm": 0.08542980998754501, + "learning_rate": 3.234953703703704e-05, + "loss": 0.0186, + "step": 48080 + }, + { + "epoch": 0.3554744093906153, + "grad_norm": 0.08174847811460495, + "learning_rate": 3.2345827397910734e-05, + "loss": 0.0199, + "step": 48090 + }, + { + "epoch": 0.35554832796191715, + "grad_norm": 
0.0893312469124794, + "learning_rate": 3.234211775878443e-05, + "loss": 0.02, + "step": 48100 + }, + { + "epoch": 0.355622246533219, + "grad_norm": 0.07482835650444031, + "learning_rate": 3.233840811965812e-05, + "loss": 0.0193, + "step": 48110 + }, + { + "epoch": 0.35569616510452085, + "grad_norm": 0.1367911845445633, + "learning_rate": 3.2334698480531815e-05, + "loss": 0.019, + "step": 48120 + }, + { + "epoch": 0.3557700836758227, + "grad_norm": 0.09481962025165558, + "learning_rate": 3.2330988841405504e-05, + "loss": 0.0186, + "step": 48130 + }, + { + "epoch": 0.3558440022471246, + "grad_norm": 0.09325818717479706, + "learning_rate": 3.232727920227921e-05, + "loss": 0.0208, + "step": 48140 + }, + { + "epoch": 0.35591792081842644, + "grad_norm": 0.07967712730169296, + "learning_rate": 3.23235695631529e-05, + "loss": 0.0191, + "step": 48150 + }, + { + "epoch": 0.3559918393897283, + "grad_norm": 0.07193194329738617, + "learning_rate": 3.231985992402659e-05, + "loss": 0.0167, + "step": 48160 + }, + { + "epoch": 0.3560657579610301, + "grad_norm": 0.07769019901752472, + "learning_rate": 3.231615028490029e-05, + "loss": 0.0177, + "step": 48170 + }, + { + "epoch": 0.356139676532332, + "grad_norm": 0.07792861014604568, + "learning_rate": 3.231244064577398e-05, + "loss": 0.0189, + "step": 48180 + }, + { + "epoch": 0.3562135951036338, + "grad_norm": 0.0690118744969368, + "learning_rate": 3.230873100664767e-05, + "loss": 0.0186, + "step": 48190 + }, + { + "epoch": 0.3562875136749357, + "grad_norm": 0.09299138933420181, + "learning_rate": 3.230502136752137e-05, + "loss": 0.0224, + "step": 48200 + }, + { + "epoch": 0.35636143224623756, + "grad_norm": 0.07170496135950089, + "learning_rate": 3.2301311728395065e-05, + "loss": 0.0163, + "step": 48210 + }, + { + "epoch": 0.3564353508175394, + "grad_norm": 0.07200492918491364, + "learning_rate": 3.229760208926876e-05, + "loss": 0.0187, + "step": 48220 + }, + { + "epoch": 0.35650926938884125, + "grad_norm": 0.10119190812110901, + "learning_rate": 3.229389245014246e-05, + "loss": 0.0177, + "step": 48230 + }, + { + "epoch": 0.3565831879601431, + "grad_norm": 0.08835913985967636, + "learning_rate": 3.2290182811016146e-05, + "loss": 0.0193, + "step": 48240 + }, + { + "epoch": 0.35665710653144495, + "grad_norm": 0.0860733613371849, + "learning_rate": 3.228647317188984e-05, + "loss": 0.0182, + "step": 48250 + }, + { + "epoch": 0.3567310251027468, + "grad_norm": 0.06780107319355011, + "learning_rate": 3.228276353276353e-05, + "loss": 0.0178, + "step": 48260 + }, + { + "epoch": 0.3568049436740487, + "grad_norm": 0.09218813478946686, + "learning_rate": 3.227905389363723e-05, + "loss": 0.0201, + "step": 48270 + }, + { + "epoch": 0.35687886224535054, + "grad_norm": 0.07747295498847961, + "learning_rate": 3.227534425451092e-05, + "loss": 0.0165, + "step": 48280 + }, + { + "epoch": 0.3569527808166524, + "grad_norm": 0.0689418762922287, + "learning_rate": 3.227163461538462e-05, + "loss": 0.0191, + "step": 48290 + }, + { + "epoch": 0.3570266993879542, + "grad_norm": 0.0954119861125946, + "learning_rate": 3.2267924976258315e-05, + "loss": 0.0161, + "step": 48300 + }, + { + "epoch": 0.3571006179592561, + "grad_norm": 0.07031914591789246, + "learning_rate": 3.2264215337132004e-05, + "loss": 0.0195, + "step": 48310 + }, + { + "epoch": 0.3571745365305579, + "grad_norm": 0.11735131591558456, + "learning_rate": 3.22605056980057e-05, + "loss": 0.0191, + "step": 48320 + }, + { + "epoch": 0.3572484551018598, + "grad_norm": 0.08334904909133911, + "learning_rate": 
3.2256796058879396e-05, + "loss": 0.0178, + "step": 48330 + }, + { + "epoch": 0.35732237367316166, + "grad_norm": 0.10128627717494965, + "learning_rate": 3.2253086419753086e-05, + "loss": 0.0183, + "step": 48340 + }, + { + "epoch": 0.3573962922444635, + "grad_norm": 0.07888349890708923, + "learning_rate": 3.224937678062678e-05, + "loss": 0.0182, + "step": 48350 + }, + { + "epoch": 0.35747021081576535, + "grad_norm": 0.06927074491977692, + "learning_rate": 3.224566714150048e-05, + "loss": 0.0182, + "step": 48360 + }, + { + "epoch": 0.3575441293870672, + "grad_norm": 0.10130536556243896, + "learning_rate": 3.2241957502374173e-05, + "loss": 0.018, + "step": 48370 + }, + { + "epoch": 0.35761804795836905, + "grad_norm": 0.0818759873509407, + "learning_rate": 3.223824786324787e-05, + "loss": 0.0204, + "step": 48380 + }, + { + "epoch": 0.3576919665296709, + "grad_norm": 0.0981663390994072, + "learning_rate": 3.223453822412156e-05, + "loss": 0.0196, + "step": 48390 + }, + { + "epoch": 0.3577658851009728, + "grad_norm": 0.06864052265882492, + "learning_rate": 3.2230828584995255e-05, + "loss": 0.0188, + "step": 48400 + }, + { + "epoch": 0.35783980367227464, + "grad_norm": 0.05629339441657066, + "learning_rate": 3.2227118945868944e-05, + "loss": 0.0183, + "step": 48410 + }, + { + "epoch": 0.3579137222435765, + "grad_norm": 0.0950944796204567, + "learning_rate": 3.222340930674264e-05, + "loss": 0.0173, + "step": 48420 + }, + { + "epoch": 0.3579876408148783, + "grad_norm": 0.0920860767364502, + "learning_rate": 3.2219699667616336e-05, + "loss": 0.0195, + "step": 48430 + }, + { + "epoch": 0.3580615593861802, + "grad_norm": 0.09546150267124176, + "learning_rate": 3.221599002849003e-05, + "loss": 0.0202, + "step": 48440 + }, + { + "epoch": 0.358135477957482, + "grad_norm": 0.14299006760120392, + "learning_rate": 3.221228038936373e-05, + "loss": 0.021, + "step": 48450 + }, + { + "epoch": 0.3582093965287839, + "grad_norm": 0.10482155531644821, + "learning_rate": 3.2208570750237424e-05, + "loss": 0.0184, + "step": 48460 + }, + { + "epoch": 0.35828331510008576, + "grad_norm": 0.1320812702178955, + "learning_rate": 3.220486111111111e-05, + "loss": 0.023, + "step": 48470 + }, + { + "epoch": 0.3583572336713876, + "grad_norm": 0.09531274437904358, + "learning_rate": 3.220115147198481e-05, + "loss": 0.0211, + "step": 48480 + }, + { + "epoch": 0.35843115224268945, + "grad_norm": 0.05454854667186737, + "learning_rate": 3.21974418328585e-05, + "loss": 0.0179, + "step": 48490 + }, + { + "epoch": 0.3585050708139913, + "grad_norm": 0.10306868702173233, + "learning_rate": 3.2193732193732194e-05, + "loss": 0.018, + "step": 48500 + }, + { + "epoch": 0.35857898938529315, + "grad_norm": 0.08248790353536606, + "learning_rate": 3.219002255460589e-05, + "loss": 0.0186, + "step": 48510 + }, + { + "epoch": 0.358652907956595, + "grad_norm": 0.0634683296084404, + "learning_rate": 3.2186312915479586e-05, + "loss": 0.0178, + "step": 48520 + }, + { + "epoch": 0.3587268265278969, + "grad_norm": 0.06007637456059456, + "learning_rate": 3.218260327635328e-05, + "loss": 0.0173, + "step": 48530 + }, + { + "epoch": 0.35880074509919874, + "grad_norm": 0.10380519181489944, + "learning_rate": 3.217889363722697e-05, + "loss": 0.0181, + "step": 48540 + }, + { + "epoch": 0.3588746636705006, + "grad_norm": 0.08306507021188736, + "learning_rate": 3.217518399810067e-05, + "loss": 0.0188, + "step": 48550 + }, + { + "epoch": 0.3589485822418024, + "grad_norm": 0.07880030572414398, + "learning_rate": 3.217147435897436e-05, + "loss": 0.0199, + "step": 
48560 + }, + { + "epoch": 0.3590225008131043, + "grad_norm": 0.12690183520317078, + "learning_rate": 3.216776471984805e-05, + "loss": 0.0184, + "step": 48570 + }, + { + "epoch": 0.3590964193844061, + "grad_norm": 0.06855998933315277, + "learning_rate": 3.216405508072175e-05, + "loss": 0.0201, + "step": 48580 + }, + { + "epoch": 0.359170337955708, + "grad_norm": 0.11447902023792267, + "learning_rate": 3.2160345441595444e-05, + "loss": 0.0188, + "step": 48590 + }, + { + "epoch": 0.35924425652700986, + "grad_norm": 0.09545428305864334, + "learning_rate": 3.215663580246914e-05, + "loss": 0.0177, + "step": 48600 + }, + { + "epoch": 0.3593181750983117, + "grad_norm": 0.11311125010251999, + "learning_rate": 3.2152926163342836e-05, + "loss": 0.0168, + "step": 48610 + }, + { + "epoch": 0.35939209366961355, + "grad_norm": 0.06668959558010101, + "learning_rate": 3.2149216524216525e-05, + "loss": 0.0187, + "step": 48620 + }, + { + "epoch": 0.3594660122409154, + "grad_norm": 0.07591219991445541, + "learning_rate": 3.214550688509022e-05, + "loss": 0.0194, + "step": 48630 + }, + { + "epoch": 0.35953993081221725, + "grad_norm": 0.05712525174021721, + "learning_rate": 3.214179724596391e-05, + "loss": 0.0188, + "step": 48640 + }, + { + "epoch": 0.3596138493835191, + "grad_norm": 0.11006911098957062, + "learning_rate": 3.2138087606837606e-05, + "loss": 0.0202, + "step": 48650 + }, + { + "epoch": 0.359687767954821, + "grad_norm": 0.1147850751876831, + "learning_rate": 3.21343779677113e-05, + "loss": 0.0193, + "step": 48660 + }, + { + "epoch": 0.35976168652612284, + "grad_norm": 0.08865036815404892, + "learning_rate": 3.2130668328585e-05, + "loss": 0.0162, + "step": 48670 + }, + { + "epoch": 0.3598356050974247, + "grad_norm": 0.10027143359184265, + "learning_rate": 3.2126958689458694e-05, + "loss": 0.0179, + "step": 48680 + }, + { + "epoch": 0.3599095236687265, + "grad_norm": 0.08432824164628983, + "learning_rate": 3.212324905033239e-05, + "loss": 0.019, + "step": 48690 + }, + { + "epoch": 0.3599834422400284, + "grad_norm": 0.09327958524227142, + "learning_rate": 3.211953941120608e-05, + "loss": 0.0181, + "step": 48700 + }, + { + "epoch": 0.3600573608113302, + "grad_norm": 0.09955098479986191, + "learning_rate": 3.2115829772079775e-05, + "loss": 0.0179, + "step": 48710 + }, + { + "epoch": 0.3601312793826321, + "grad_norm": 0.0701267421245575, + "learning_rate": 3.2112120132953465e-05, + "loss": 0.0187, + "step": 48720 + }, + { + "epoch": 0.36020519795393396, + "grad_norm": 0.13661158084869385, + "learning_rate": 3.210841049382716e-05, + "loss": 0.0187, + "step": 48730 + }, + { + "epoch": 0.3602791165252358, + "grad_norm": 0.0596172958612442, + "learning_rate": 3.2104700854700856e-05, + "loss": 0.017, + "step": 48740 + }, + { + "epoch": 0.36035303509653765, + "grad_norm": 0.1898808777332306, + "learning_rate": 3.210099121557455e-05, + "loss": 0.019, + "step": 48750 + }, + { + "epoch": 0.3604269536678395, + "grad_norm": 0.08930271118879318, + "learning_rate": 3.209728157644825e-05, + "loss": 0.0182, + "step": 48760 + }, + { + "epoch": 0.36050087223914135, + "grad_norm": 0.11357209086418152, + "learning_rate": 3.209357193732194e-05, + "loss": 0.0183, + "step": 48770 + }, + { + "epoch": 0.3605747908104432, + "grad_norm": 0.09769640117883682, + "learning_rate": 3.2089862298195634e-05, + "loss": 0.0166, + "step": 48780 + }, + { + "epoch": 0.3606487093817451, + "grad_norm": 0.08066027611494064, + "learning_rate": 3.208615265906933e-05, + "loss": 0.0191, + "step": 48790 + }, + { + "epoch": 0.36072262795304694, + 
"grad_norm": 0.06597844511270523, + "learning_rate": 3.208244301994302e-05, + "loss": 0.0163, + "step": 48800 + }, + { + "epoch": 0.3607965465243488, + "grad_norm": 0.07833616435527802, + "learning_rate": 3.2078733380816715e-05, + "loss": 0.0185, + "step": 48810 + }, + { + "epoch": 0.3608704650956506, + "grad_norm": 0.08092334121465683, + "learning_rate": 3.207502374169041e-05, + "loss": 0.0194, + "step": 48820 + }, + { + "epoch": 0.3609443836669525, + "grad_norm": 0.07439634203910828, + "learning_rate": 3.2071314102564107e-05, + "loss": 0.0161, + "step": 48830 + }, + { + "epoch": 0.3610183022382543, + "grad_norm": 0.10406707227230072, + "learning_rate": 3.20676044634378e-05, + "loss": 0.0202, + "step": 48840 + }, + { + "epoch": 0.3610922208095562, + "grad_norm": 0.09756065160036087, + "learning_rate": 3.206389482431149e-05, + "loss": 0.0198, + "step": 48850 + }, + { + "epoch": 0.36116613938085806, + "grad_norm": 0.07719270884990692, + "learning_rate": 3.206018518518519e-05, + "loss": 0.0199, + "step": 48860 + }, + { + "epoch": 0.3612400579521599, + "grad_norm": 0.07252097129821777, + "learning_rate": 3.205647554605888e-05, + "loss": 0.0174, + "step": 48870 + }, + { + "epoch": 0.36131397652346176, + "grad_norm": 0.08581475913524628, + "learning_rate": 3.205276590693257e-05, + "loss": 0.0187, + "step": 48880 + }, + { + "epoch": 0.3613878950947636, + "grad_norm": 0.0722767785191536, + "learning_rate": 3.204905626780627e-05, + "loss": 0.02, + "step": 48890 + }, + { + "epoch": 0.36146181366606545, + "grad_norm": 0.06997499614953995, + "learning_rate": 3.2045346628679965e-05, + "loss": 0.019, + "step": 48900 + }, + { + "epoch": 0.3615357322373673, + "grad_norm": 0.07591132074594498, + "learning_rate": 3.204163698955366e-05, + "loss": 0.0178, + "step": 48910 + }, + { + "epoch": 0.3616096508086692, + "grad_norm": 0.11934647709131241, + "learning_rate": 3.203792735042736e-05, + "loss": 0.0228, + "step": 48920 + }, + { + "epoch": 0.36168356937997104, + "grad_norm": 0.07847454398870468, + "learning_rate": 3.2034217711301046e-05, + "loss": 0.019, + "step": 48930 + }, + { + "epoch": 0.3617574879512729, + "grad_norm": 0.09507045894861221, + "learning_rate": 3.203050807217474e-05, + "loss": 0.0202, + "step": 48940 + }, + { + "epoch": 0.36183140652257473, + "grad_norm": 0.08507265895605087, + "learning_rate": 3.202679843304843e-05, + "loss": 0.0209, + "step": 48950 + }, + { + "epoch": 0.3619053250938766, + "grad_norm": 0.08112446963787079, + "learning_rate": 3.202308879392213e-05, + "loss": 0.0162, + "step": 48960 + }, + { + "epoch": 0.3619792436651784, + "grad_norm": 0.09993197023868561, + "learning_rate": 3.201937915479582e-05, + "loss": 0.0169, + "step": 48970 + }, + { + "epoch": 0.3620531622364803, + "grad_norm": 0.07737724483013153, + "learning_rate": 3.201566951566952e-05, + "loss": 0.0185, + "step": 48980 + }, + { + "epoch": 0.36212708080778216, + "grad_norm": 0.21678310632705688, + "learning_rate": 3.2011959876543215e-05, + "loss": 0.0181, + "step": 48990 + }, + { + "epoch": 0.362200999379084, + "grad_norm": 0.08399459719657898, + "learning_rate": 3.2008250237416904e-05, + "loss": 0.0179, + "step": 49000 + }, + { + "epoch": 0.36227491795038586, + "grad_norm": 0.06960813701152802, + "learning_rate": 3.20045405982906e-05, + "loss": 0.0172, + "step": 49010 + }, + { + "epoch": 0.3623488365216877, + "grad_norm": 0.08563554286956787, + "learning_rate": 3.2000830959164296e-05, + "loss": 0.0171, + "step": 49020 + }, + { + "epoch": 0.36242275509298955, + "grad_norm": 0.11998796463012695, + 
"learning_rate": 3.1997121320037985e-05, + "loss": 0.0192, + "step": 49030 + }, + { + "epoch": 0.3624966736642914, + "grad_norm": 0.07512512058019638, + "learning_rate": 3.199341168091168e-05, + "loss": 0.0176, + "step": 49040 + }, + { + "epoch": 0.3625705922355933, + "grad_norm": 0.10795829445123672, + "learning_rate": 3.198970204178538e-05, + "loss": 0.0189, + "step": 49050 + }, + { + "epoch": 0.36264451080689514, + "grad_norm": 0.06630425155162811, + "learning_rate": 3.198599240265907e-05, + "loss": 0.0189, + "step": 49060 + }, + { + "epoch": 0.362718429378197, + "grad_norm": 0.06694331020116806, + "learning_rate": 3.198228276353277e-05, + "loss": 0.0169, + "step": 49070 + }, + { + "epoch": 0.36279234794949883, + "grad_norm": 0.06272326409816742, + "learning_rate": 3.197857312440646e-05, + "loss": 0.0182, + "step": 49080 + }, + { + "epoch": 0.3628662665208007, + "grad_norm": 0.0822906568646431, + "learning_rate": 3.1974863485280154e-05, + "loss": 0.0172, + "step": 49090 + }, + { + "epoch": 0.3629401850921025, + "grad_norm": 0.1090676337480545, + "learning_rate": 3.1971153846153843e-05, + "loss": 0.0205, + "step": 49100 + }, + { + "epoch": 0.3630141036634044, + "grad_norm": 0.09612789005041122, + "learning_rate": 3.196744420702754e-05, + "loss": 0.0192, + "step": 49110 + }, + { + "epoch": 0.36308802223470626, + "grad_norm": 0.0747082456946373, + "learning_rate": 3.1963734567901235e-05, + "loss": 0.0188, + "step": 49120 + }, + { + "epoch": 0.3631619408060081, + "grad_norm": 0.10992246866226196, + "learning_rate": 3.196002492877493e-05, + "loss": 0.0177, + "step": 49130 + }, + { + "epoch": 0.36323585937730996, + "grad_norm": 0.06742114573717117, + "learning_rate": 3.195631528964863e-05, + "loss": 0.0184, + "step": 49140 + }, + { + "epoch": 0.3633097779486118, + "grad_norm": 0.07350549101829529, + "learning_rate": 3.195260565052232e-05, + "loss": 0.0216, + "step": 49150 + }, + { + "epoch": 0.36338369651991365, + "grad_norm": 0.07814744114875793, + "learning_rate": 3.194889601139601e-05, + "loss": 0.0164, + "step": 49160 + }, + { + "epoch": 0.3634576150912155, + "grad_norm": 0.08196965605020523, + "learning_rate": 3.194518637226971e-05, + "loss": 0.0161, + "step": 49170 + }, + { + "epoch": 0.3635315336625174, + "grad_norm": 0.06950008124113083, + "learning_rate": 3.19414767331434e-05, + "loss": 0.0189, + "step": 49180 + }, + { + "epoch": 0.36360545223381924, + "grad_norm": 0.09519726783037186, + "learning_rate": 3.1937767094017094e-05, + "loss": 0.0184, + "step": 49190 + }, + { + "epoch": 0.3636793708051211, + "grad_norm": 0.08564166724681854, + "learning_rate": 3.193405745489079e-05, + "loss": 0.0178, + "step": 49200 + }, + { + "epoch": 0.36375328937642293, + "grad_norm": 0.07054101675748825, + "learning_rate": 3.1930347815764486e-05, + "loss": 0.0182, + "step": 49210 + }, + { + "epoch": 0.3638272079477248, + "grad_norm": 0.07411817461252213, + "learning_rate": 3.192663817663818e-05, + "loss": 0.0186, + "step": 49220 + }, + { + "epoch": 0.3639011265190266, + "grad_norm": 0.07335742563009262, + "learning_rate": 3.192292853751187e-05, + "loss": 0.0176, + "step": 49230 + }, + { + "epoch": 0.3639750450903285, + "grad_norm": 0.09444653242826462, + "learning_rate": 3.191921889838557e-05, + "loss": 0.02, + "step": 49240 + }, + { + "epoch": 0.36404896366163036, + "grad_norm": 0.12061870843172073, + "learning_rate": 3.191550925925926e-05, + "loss": 0.0195, + "step": 49250 + }, + { + "epoch": 0.3641228822329322, + "grad_norm": 0.0709986537694931, + "learning_rate": 3.191179962013295e-05, + "loss": 
0.0189, + "step": 49260 + }, + { + "epoch": 0.36419680080423406, + "grad_norm": 0.10596373677253723, + "learning_rate": 3.190808998100665e-05, + "loss": 0.0223, + "step": 49270 + }, + { + "epoch": 0.3642707193755359, + "grad_norm": 0.080999456346035, + "learning_rate": 3.1904380341880344e-05, + "loss": 0.0177, + "step": 49280 + }, + { + "epoch": 0.36434463794683775, + "grad_norm": 0.10911361128091812, + "learning_rate": 3.190067070275404e-05, + "loss": 0.0186, + "step": 49290 + }, + { + "epoch": 0.3644185565181396, + "grad_norm": 0.0921257883310318, + "learning_rate": 3.1896961063627736e-05, + "loss": 0.0199, + "step": 49300 + }, + { + "epoch": 0.3644924750894415, + "grad_norm": 0.06992916762828827, + "learning_rate": 3.1893251424501425e-05, + "loss": 0.019, + "step": 49310 + }, + { + "epoch": 0.36456639366074334, + "grad_norm": 0.07500123232603073, + "learning_rate": 3.188954178537512e-05, + "loss": 0.0187, + "step": 49320 + }, + { + "epoch": 0.3646403122320452, + "grad_norm": 0.11022918671369553, + "learning_rate": 3.188583214624881e-05, + "loss": 0.0205, + "step": 49330 + }, + { + "epoch": 0.36471423080334703, + "grad_norm": 0.08734451234340668, + "learning_rate": 3.1882122507122506e-05, + "loss": 0.0191, + "step": 49340 + }, + { + "epoch": 0.3647881493746489, + "grad_norm": 0.0821605697274208, + "learning_rate": 3.18784128679962e-05, + "loss": 0.0208, + "step": 49350 + }, + { + "epoch": 0.3648620679459507, + "grad_norm": 0.09052004665136337, + "learning_rate": 3.18747032288699e-05, + "loss": 0.0195, + "step": 49360 + }, + { + "epoch": 0.3649359865172526, + "grad_norm": 0.12369329482316971, + "learning_rate": 3.1870993589743594e-05, + "loss": 0.022, + "step": 49370 + }, + { + "epoch": 0.36500990508855446, + "grad_norm": 0.0797930434346199, + "learning_rate": 3.186728395061729e-05, + "loss": 0.0195, + "step": 49380 + }, + { + "epoch": 0.3650838236598563, + "grad_norm": 0.11744547635316849, + "learning_rate": 3.186357431149098e-05, + "loss": 0.0213, + "step": 49390 + }, + { + "epoch": 0.36515774223115816, + "grad_norm": 0.07720588147640228, + "learning_rate": 3.1859864672364675e-05, + "loss": 0.0219, + "step": 49400 + }, + { + "epoch": 0.36523166080246, + "grad_norm": 0.07627621293067932, + "learning_rate": 3.1856155033238364e-05, + "loss": 0.0186, + "step": 49410 + }, + { + "epoch": 0.36530557937376185, + "grad_norm": 0.06858126819133759, + "learning_rate": 3.185244539411206e-05, + "loss": 0.0157, + "step": 49420 + }, + { + "epoch": 0.3653794979450637, + "grad_norm": 0.08645808696746826, + "learning_rate": 3.1848735754985756e-05, + "loss": 0.0206, + "step": 49430 + }, + { + "epoch": 0.3654534165163656, + "grad_norm": 0.07037283480167389, + "learning_rate": 3.184502611585945e-05, + "loss": 0.0184, + "step": 49440 + }, + { + "epoch": 0.36552733508766744, + "grad_norm": 0.09279239922761917, + "learning_rate": 3.184131647673315e-05, + "loss": 0.0162, + "step": 49450 + }, + { + "epoch": 0.3656012536589693, + "grad_norm": 0.10505162179470062, + "learning_rate": 3.183760683760684e-05, + "loss": 0.0185, + "step": 49460 + }, + { + "epoch": 0.36567517223027113, + "grad_norm": 0.10786660760641098, + "learning_rate": 3.183389719848053e-05, + "loss": 0.0195, + "step": 49470 + }, + { + "epoch": 0.365749090801573, + "grad_norm": 0.08201677352190018, + "learning_rate": 3.183018755935423e-05, + "loss": 0.0176, + "step": 49480 + }, + { + "epoch": 0.3658230093728748, + "grad_norm": 0.09974339604377747, + "learning_rate": 3.182647792022792e-05, + "loss": 0.0175, + "step": 49490 + }, + { + "epoch": 
0.3658969279441767, + "grad_norm": 0.05509033426642418, + "learning_rate": 3.1822768281101614e-05, + "loss": 0.0183, + "step": 49500 + }, + { + "epoch": 0.36597084651547857, + "grad_norm": 0.07690016180276871, + "learning_rate": 3.181905864197531e-05, + "loss": 0.0176, + "step": 49510 + }, + { + "epoch": 0.3660447650867804, + "grad_norm": 0.09903372079133987, + "learning_rate": 3.1815349002849006e-05, + "loss": 0.0195, + "step": 49520 + }, + { + "epoch": 0.36611868365808226, + "grad_norm": 0.09054466336965561, + "learning_rate": 3.18116393637227e-05, + "loss": 0.0177, + "step": 49530 + }, + { + "epoch": 0.3661926022293841, + "grad_norm": 0.06345314532518387, + "learning_rate": 3.180792972459639e-05, + "loss": 0.0157, + "step": 49540 + }, + { + "epoch": 0.36626652080068595, + "grad_norm": 0.08934221416711807, + "learning_rate": 3.180422008547009e-05, + "loss": 0.0204, + "step": 49550 + }, + { + "epoch": 0.3663404393719878, + "grad_norm": 0.10263720154762268, + "learning_rate": 3.180051044634378e-05, + "loss": 0.0176, + "step": 49560 + }, + { + "epoch": 0.3664143579432897, + "grad_norm": 0.08406967669725418, + "learning_rate": 3.179680080721747e-05, + "loss": 0.0212, + "step": 49570 + }, + { + "epoch": 0.36648827651459154, + "grad_norm": 0.10318008810281754, + "learning_rate": 3.179309116809117e-05, + "loss": 0.0176, + "step": 49580 + }, + { + "epoch": 0.3665621950858934, + "grad_norm": 0.11742877215147018, + "learning_rate": 3.1789381528964865e-05, + "loss": 0.0183, + "step": 49590 + }, + { + "epoch": 0.36663611365719523, + "grad_norm": 0.11578115075826645, + "learning_rate": 3.178567188983856e-05, + "loss": 0.0211, + "step": 49600 + }, + { + "epoch": 0.3667100322284971, + "grad_norm": 0.07098636031150818, + "learning_rate": 3.1781962250712256e-05, + "loss": 0.0191, + "step": 49610 + }, + { + "epoch": 0.3667839507997989, + "grad_norm": 0.09956346452236176, + "learning_rate": 3.1778252611585946e-05, + "loss": 0.0177, + "step": 49620 + }, + { + "epoch": 0.3668578693711008, + "grad_norm": 0.07314296066761017, + "learning_rate": 3.177454297245964e-05, + "loss": 0.0186, + "step": 49630 + }, + { + "epoch": 0.36693178794240267, + "grad_norm": 0.06977906078100204, + "learning_rate": 3.177083333333333e-05, + "loss": 0.0163, + "step": 49640 + }, + { + "epoch": 0.3670057065137045, + "grad_norm": 0.07374892383813858, + "learning_rate": 3.176712369420703e-05, + "loss": 0.0198, + "step": 49650 + }, + { + "epoch": 0.36707962508500636, + "grad_norm": 0.08748723566532135, + "learning_rate": 3.176341405508072e-05, + "loss": 0.0195, + "step": 49660 + }, + { + "epoch": 0.3671535436563082, + "grad_norm": 0.08709771931171417, + "learning_rate": 3.175970441595442e-05, + "loss": 0.0168, + "step": 49670 + }, + { + "epoch": 0.36722746222761005, + "grad_norm": 0.09110180288553238, + "learning_rate": 3.1755994776828115e-05, + "loss": 0.0203, + "step": 49680 + }, + { + "epoch": 0.3673013807989119, + "grad_norm": 0.04776851460337639, + "learning_rate": 3.1752285137701804e-05, + "loss": 0.0189, + "step": 49690 + }, + { + "epoch": 0.3673752993702138, + "grad_norm": 0.08647187799215317, + "learning_rate": 3.17485754985755e-05, + "loss": 0.0205, + "step": 49700 + }, + { + "epoch": 0.36744921794151564, + "grad_norm": 0.08102229982614517, + "learning_rate": 3.1744865859449196e-05, + "loss": 0.0197, + "step": 49710 + }, + { + "epoch": 0.3675231365128175, + "grad_norm": 0.11056692153215408, + "learning_rate": 3.1741156220322885e-05, + "loss": 0.0187, + "step": 49720 + }, + { + "epoch": 0.36759705508411933, + "grad_norm": 
0.16177508234977722, + "learning_rate": 3.173744658119658e-05, + "loss": 0.0199, + "step": 49730 + }, + { + "epoch": 0.3676709736554212, + "grad_norm": 0.07288125157356262, + "learning_rate": 3.173373694207028e-05, + "loss": 0.0188, + "step": 49740 + }, + { + "epoch": 0.367744892226723, + "grad_norm": 0.07510792464017868, + "learning_rate": 3.173002730294397e-05, + "loss": 0.0175, + "step": 49750 + }, + { + "epoch": 0.3678188107980249, + "grad_norm": 0.10319831222295761, + "learning_rate": 3.172631766381767e-05, + "loss": 0.022, + "step": 49760 + }, + { + "epoch": 0.36789272936932677, + "grad_norm": 0.10422424226999283, + "learning_rate": 3.172260802469136e-05, + "loss": 0.0174, + "step": 49770 + }, + { + "epoch": 0.3679666479406286, + "grad_norm": 0.08082633465528488, + "learning_rate": 3.1718898385565054e-05, + "loss": 0.0174, + "step": 49780 + }, + { + "epoch": 0.36804056651193046, + "grad_norm": 0.09931709617376328, + "learning_rate": 3.171518874643874e-05, + "loss": 0.0188, + "step": 49790 + }, + { + "epoch": 0.3681144850832323, + "grad_norm": 0.07735582441091537, + "learning_rate": 3.171147910731244e-05, + "loss": 0.0158, + "step": 49800 + }, + { + "epoch": 0.36818840365453415, + "grad_norm": 0.12492542713880539, + "learning_rate": 3.1707769468186135e-05, + "loss": 0.0223, + "step": 49810 + }, + { + "epoch": 0.368262322225836, + "grad_norm": 0.1031833365559578, + "learning_rate": 3.170405982905983e-05, + "loss": 0.0224, + "step": 49820 + }, + { + "epoch": 0.3683362407971379, + "grad_norm": 0.06540018320083618, + "learning_rate": 3.170035018993353e-05, + "loss": 0.0191, + "step": 49830 + }, + { + "epoch": 0.36841015936843974, + "grad_norm": 0.07900650799274445, + "learning_rate": 3.169664055080722e-05, + "loss": 0.0196, + "step": 49840 + }, + { + "epoch": 0.3684840779397416, + "grad_norm": 0.10357868671417236, + "learning_rate": 3.169293091168091e-05, + "loss": 0.0193, + "step": 49850 + }, + { + "epoch": 0.36855799651104343, + "grad_norm": 0.07287750393152237, + "learning_rate": 3.168922127255461e-05, + "loss": 0.0162, + "step": 49860 + }, + { + "epoch": 0.3686319150823453, + "grad_norm": 0.07523094117641449, + "learning_rate": 3.16855116334283e-05, + "loss": 0.0184, + "step": 49870 + }, + { + "epoch": 0.3687058336536471, + "grad_norm": 0.09581726044416428, + "learning_rate": 3.168180199430199e-05, + "loss": 0.0202, + "step": 49880 + }, + { + "epoch": 0.368779752224949, + "grad_norm": 0.07129789143800735, + "learning_rate": 3.167809235517569e-05, + "loss": 0.019, + "step": 49890 + }, + { + "epoch": 0.36885367079625087, + "grad_norm": 0.07590305805206299, + "learning_rate": 3.1674382716049385e-05, + "loss": 0.0171, + "step": 49900 + }, + { + "epoch": 0.3689275893675527, + "grad_norm": 0.09437566995620728, + "learning_rate": 3.167067307692308e-05, + "loss": 0.021, + "step": 49910 + }, + { + "epoch": 0.36900150793885456, + "grad_norm": 0.06181221082806587, + "learning_rate": 3.166696343779677e-05, + "loss": 0.0199, + "step": 49920 + }, + { + "epoch": 0.3690754265101564, + "grad_norm": 0.09170825034379959, + "learning_rate": 3.1663253798670466e-05, + "loss": 0.0188, + "step": 49930 + }, + { + "epoch": 0.36914934508145825, + "grad_norm": 0.08520179986953735, + "learning_rate": 3.165954415954416e-05, + "loss": 0.0167, + "step": 49940 + }, + { + "epoch": 0.3692232636527601, + "grad_norm": 0.07948411256074905, + "learning_rate": 3.165583452041785e-05, + "loss": 0.0173, + "step": 49950 + }, + { + "epoch": 0.369297182224062, + "grad_norm": 0.09082730859518051, + "learning_rate": 
3.165212488129155e-05, + "loss": 0.017, + "step": 49960 + }, + { + "epoch": 0.36937110079536384, + "grad_norm": 0.07359523326158524, + "learning_rate": 3.1648415242165244e-05, + "loss": 0.018, + "step": 49970 + }, + { + "epoch": 0.3694450193666657, + "grad_norm": 0.06806252151727676, + "learning_rate": 3.164470560303894e-05, + "loss": 0.0188, + "step": 49980 + }, + { + "epoch": 0.36951893793796753, + "grad_norm": 0.10926640778779984, + "learning_rate": 3.1640995963912635e-05, + "loss": 0.0186, + "step": 49990 + }, + { + "epoch": 0.3695928565092694, + "grad_norm": 0.09719149023294449, + "learning_rate": 3.1637286324786325e-05, + "loss": 0.0194, + "step": 50000 + }, + { + "epoch": 0.3695928565092694, + "eval_f1": 0.6127174177207287, + "eval_loss": 0.018170252442359924, + "eval_precision": 0.48591030292742615, + "eval_recall": 0.8290812108924811, + "eval_runtime": 2665.6263, + "eval_samples_per_second": 203.004, + "eval_steps_per_second": 3.172, + "step": 50000 + }, + { + "epoch": 0.3696667750805712, + "grad_norm": 0.06750772893428802, + "learning_rate": 3.163357668566002e-05, + "loss": 0.0181, + "step": 50010 + }, + { + "epoch": 0.3697406936518731, + "grad_norm": 0.13104230165481567, + "learning_rate": 3.162986704653371e-05, + "loss": 0.0194, + "step": 50020 + }, + { + "epoch": 0.36981461222317497, + "grad_norm": 0.07477361708879471, + "learning_rate": 3.1626157407407406e-05, + "loss": 0.0161, + "step": 50030 + }, + { + "epoch": 0.3698885307944768, + "grad_norm": 0.09455181658267975, + "learning_rate": 3.16224477682811e-05, + "loss": 0.0168, + "step": 50040 + }, + { + "epoch": 0.36996244936577866, + "grad_norm": 0.0634753629565239, + "learning_rate": 3.16187381291548e-05, + "loss": 0.0178, + "step": 50050 + }, + { + "epoch": 0.3700363679370805, + "grad_norm": 0.10524077713489532, + "learning_rate": 3.1615028490028494e-05, + "loss": 0.0176, + "step": 50060 + }, + { + "epoch": 0.37011028650838235, + "grad_norm": 0.07643004506826401, + "learning_rate": 3.161131885090219e-05, + "loss": 0.0187, + "step": 50070 + }, + { + "epoch": 0.37018420507968425, + "grad_norm": 0.10781189054250717, + "learning_rate": 3.160760921177588e-05, + "loss": 0.0161, + "step": 50080 + }, + { + "epoch": 0.3702581236509861, + "grad_norm": 0.0713755190372467, + "learning_rate": 3.1603899572649575e-05, + "loss": 0.0202, + "step": 50090 + }, + { + "epoch": 0.37033204222228794, + "grad_norm": 0.07827604562044144, + "learning_rate": 3.1600189933523264e-05, + "loss": 0.0165, + "step": 50100 + }, + { + "epoch": 0.3704059607935898, + "grad_norm": 0.0943358838558197, + "learning_rate": 3.159648029439696e-05, + "loss": 0.0191, + "step": 50110 + }, + { + "epoch": 0.37047987936489163, + "grad_norm": 0.07688167691230774, + "learning_rate": 3.1592770655270656e-05, + "loss": 0.0207, + "step": 50120 + }, + { + "epoch": 0.3705537979361935, + "grad_norm": 0.1343929022550583, + "learning_rate": 3.158906101614435e-05, + "loss": 0.019, + "step": 50130 + }, + { + "epoch": 0.3706277165074953, + "grad_norm": 0.10538404434919357, + "learning_rate": 3.158535137701805e-05, + "loss": 0.0221, + "step": 50140 + }, + { + "epoch": 0.3707016350787972, + "grad_norm": 0.13039255142211914, + "learning_rate": 3.158164173789174e-05, + "loss": 0.0195, + "step": 50150 + }, + { + "epoch": 0.37077555365009907, + "grad_norm": 0.07820204645395279, + "learning_rate": 3.157793209876543e-05, + "loss": 0.0164, + "step": 50160 + }, + { + "epoch": 0.3708494722214009, + "grad_norm": 0.09720060974359512, + "learning_rate": 3.157422245963913e-05, + "loss": 0.0199, + 
"step": 50170 + }, + { + "epoch": 0.37092339079270276, + "grad_norm": 0.10176794975996017, + "learning_rate": 3.157051282051282e-05, + "loss": 0.017, + "step": 50180 + }, + { + "epoch": 0.3709973093640046, + "grad_norm": 0.06773918122053146, + "learning_rate": 3.1566803181386514e-05, + "loss": 0.0199, + "step": 50190 + }, + { + "epoch": 0.37107122793530645, + "grad_norm": 0.07437018305063248, + "learning_rate": 3.156309354226021e-05, + "loss": 0.0172, + "step": 50200 + }, + { + "epoch": 0.37114514650660835, + "grad_norm": 0.08163496106863022, + "learning_rate": 3.1559383903133906e-05, + "loss": 0.0158, + "step": 50210 + }, + { + "epoch": 0.3712190650779102, + "grad_norm": 0.0832415372133255, + "learning_rate": 3.15556742640076e-05, + "loss": 0.0187, + "step": 50220 + }, + { + "epoch": 0.37129298364921204, + "grad_norm": 0.09557832777500153, + "learning_rate": 3.155196462488129e-05, + "loss": 0.0195, + "step": 50230 + }, + { + "epoch": 0.3713669022205139, + "grad_norm": 0.06662998348474503, + "learning_rate": 3.154825498575499e-05, + "loss": 0.0179, + "step": 50240 + }, + { + "epoch": 0.37144082079181573, + "grad_norm": 0.10676044225692749, + "learning_rate": 3.1544545346628676e-05, + "loss": 0.0204, + "step": 50250 + }, + { + "epoch": 0.3715147393631176, + "grad_norm": 0.1034659594297409, + "learning_rate": 3.154083570750237e-05, + "loss": 0.0197, + "step": 50260 + }, + { + "epoch": 0.3715886579344194, + "grad_norm": 0.08106374740600586, + "learning_rate": 3.153712606837607e-05, + "loss": 0.0199, + "step": 50270 + }, + { + "epoch": 0.3716625765057213, + "grad_norm": 0.08168378472328186, + "learning_rate": 3.1533416429249764e-05, + "loss": 0.0192, + "step": 50280 + }, + { + "epoch": 0.37173649507702317, + "grad_norm": 0.08763410896062851, + "learning_rate": 3.152970679012346e-05, + "loss": 0.0212, + "step": 50290 + }, + { + "epoch": 0.371810413648325, + "grad_norm": 0.1779336780309677, + "learning_rate": 3.1525997150997156e-05, + "loss": 0.019, + "step": 50300 + }, + { + "epoch": 0.37188433221962686, + "grad_norm": 0.05893019586801529, + "learning_rate": 3.1522287511870845e-05, + "loss": 0.0187, + "step": 50310 + }, + { + "epoch": 0.3719582507909287, + "grad_norm": 0.08330386877059937, + "learning_rate": 3.151857787274454e-05, + "loss": 0.0204, + "step": 50320 + }, + { + "epoch": 0.37203216936223055, + "grad_norm": 0.0755939707159996, + "learning_rate": 3.151486823361823e-05, + "loss": 0.0194, + "step": 50330 + }, + { + "epoch": 0.37210608793353245, + "grad_norm": 0.08258412033319473, + "learning_rate": 3.1511158594491927e-05, + "loss": 0.0188, + "step": 50340 + }, + { + "epoch": 0.3721800065048343, + "grad_norm": 0.09173570573329926, + "learning_rate": 3.150744895536563e-05, + "loss": 0.0174, + "step": 50350 + }, + { + "epoch": 0.37225392507613614, + "grad_norm": 0.06343158334493637, + "learning_rate": 3.150373931623932e-05, + "loss": 0.0153, + "step": 50360 + }, + { + "epoch": 0.372327843647438, + "grad_norm": 0.075467549264431, + "learning_rate": 3.1500029677113014e-05, + "loss": 0.0198, + "step": 50370 + }, + { + "epoch": 0.37240176221873983, + "grad_norm": 0.06660743802785873, + "learning_rate": 3.1496320037986704e-05, + "loss": 0.0171, + "step": 50380 + }, + { + "epoch": 0.3724756807900417, + "grad_norm": 0.07386814802885056, + "learning_rate": 3.14926103988604e-05, + "loss": 0.0171, + "step": 50390 + }, + { + "epoch": 0.3725495993613435, + "grad_norm": 0.09839634597301483, + "learning_rate": 3.1488900759734096e-05, + "loss": 0.0195, + "step": 50400 + }, + { + "epoch": 
0.3726235179326454, + "grad_norm": 0.0697672888636589, + "learning_rate": 3.1485191120607785e-05, + "loss": 0.0207, + "step": 50410 + }, + { + "epoch": 0.37269743650394727, + "grad_norm": 0.12948425114154816, + "learning_rate": 3.148148148148148e-05, + "loss": 0.019, + "step": 50420 + }, + { + "epoch": 0.3727713550752491, + "grad_norm": 0.0835069864988327, + "learning_rate": 3.147777184235518e-05, + "loss": 0.0195, + "step": 50430 + }, + { + "epoch": 0.37284527364655096, + "grad_norm": 0.07875664532184601, + "learning_rate": 3.147406220322887e-05, + "loss": 0.018, + "step": 50440 + }, + { + "epoch": 0.3729191922178528, + "grad_norm": 0.09094227105379105, + "learning_rate": 3.147035256410257e-05, + "loss": 0.0164, + "step": 50450 + }, + { + "epoch": 0.37299311078915465, + "grad_norm": 0.07479475438594818, + "learning_rate": 3.146664292497626e-05, + "loss": 0.0176, + "step": 50460 + }, + { + "epoch": 0.37306702936045655, + "grad_norm": 0.0831725150346756, + "learning_rate": 3.1462933285849954e-05, + "loss": 0.0189, + "step": 50470 + }, + { + "epoch": 0.3731409479317584, + "grad_norm": 0.08730162680149078, + "learning_rate": 3.145922364672364e-05, + "loss": 0.021, + "step": 50480 + }, + { + "epoch": 0.37321486650306024, + "grad_norm": 0.06646716594696045, + "learning_rate": 3.145551400759734e-05, + "loss": 0.0182, + "step": 50490 + }, + { + "epoch": 0.3732887850743621, + "grad_norm": 0.07522834837436676, + "learning_rate": 3.145180436847104e-05, + "loss": 0.0185, + "step": 50500 + }, + { + "epoch": 0.37336270364566393, + "grad_norm": 0.09267633408308029, + "learning_rate": 3.144809472934473e-05, + "loss": 0.0163, + "step": 50510 + }, + { + "epoch": 0.3734366222169658, + "grad_norm": 0.07390173524618149, + "learning_rate": 3.144438509021843e-05, + "loss": 0.019, + "step": 50520 + }, + { + "epoch": 0.3735105407882676, + "grad_norm": 0.10241732001304626, + "learning_rate": 3.144067545109212e-05, + "loss": 0.0175, + "step": 50530 + }, + { + "epoch": 0.3735844593595695, + "grad_norm": 0.07036875188350677, + "learning_rate": 3.143696581196581e-05, + "loss": 0.0195, + "step": 50540 + }, + { + "epoch": 0.37365837793087137, + "grad_norm": 0.09159451723098755, + "learning_rate": 3.143325617283951e-05, + "loss": 0.0177, + "step": 50550 + }, + { + "epoch": 0.3737322965021732, + "grad_norm": 0.07133959978818893, + "learning_rate": 3.14295465337132e-05, + "loss": 0.0196, + "step": 50560 + }, + { + "epoch": 0.37380621507347506, + "grad_norm": 0.07279335707426071, + "learning_rate": 3.142583689458689e-05, + "loss": 0.0189, + "step": 50570 + }, + { + "epoch": 0.3738801336447769, + "grad_norm": 0.13607998192310333, + "learning_rate": 3.1422127255460596e-05, + "loss": 0.0176, + "step": 50580 + }, + { + "epoch": 0.37395405221607875, + "grad_norm": 0.07464282959699631, + "learning_rate": 3.1418417616334285e-05, + "loss": 0.0187, + "step": 50590 + }, + { + "epoch": 0.37402797078738065, + "grad_norm": 0.08845172077417374, + "learning_rate": 3.141470797720798e-05, + "loss": 0.0196, + "step": 50600 + }, + { + "epoch": 0.3741018893586825, + "grad_norm": 0.10331210494041443, + "learning_rate": 3.141099833808167e-05, + "loss": 0.0186, + "step": 50610 + }, + { + "epoch": 0.37417580792998434, + "grad_norm": 0.06014440208673477, + "learning_rate": 3.1407288698955366e-05, + "loss": 0.0191, + "step": 50620 + }, + { + "epoch": 0.3742497265012862, + "grad_norm": 0.09451748430728912, + "learning_rate": 3.140357905982906e-05, + "loss": 0.0167, + "step": 50630 + }, + { + "epoch": 0.37432364507258803, + "grad_norm": 
0.07870625704526901, + "learning_rate": 3.139986942070275e-05, + "loss": 0.0199, + "step": 50640 + }, + { + "epoch": 0.3743975636438899, + "grad_norm": 0.0743972659111023, + "learning_rate": 3.1396159781576454e-05, + "loss": 0.018, + "step": 50650 + }, + { + "epoch": 0.3744714822151917, + "grad_norm": 0.08390262722969055, + "learning_rate": 3.139245014245014e-05, + "loss": 0.0196, + "step": 50660 + }, + { + "epoch": 0.3745454007864936, + "grad_norm": 0.10581986606121063, + "learning_rate": 3.138874050332384e-05, + "loss": 0.0177, + "step": 50670 + }, + { + "epoch": 0.37461931935779547, + "grad_norm": 0.07332141697406769, + "learning_rate": 3.1385030864197535e-05, + "loss": 0.0173, + "step": 50680 + }, + { + "epoch": 0.3746932379290973, + "grad_norm": 0.06791075319051743, + "learning_rate": 3.1381321225071224e-05, + "loss": 0.0213, + "step": 50690 + }, + { + "epoch": 0.37476715650039916, + "grad_norm": 0.0779709741473198, + "learning_rate": 3.137761158594492e-05, + "loss": 0.0196, + "step": 50700 + }, + { + "epoch": 0.374841075071701, + "grad_norm": 0.06821703165769577, + "learning_rate": 3.137390194681861e-05, + "loss": 0.0172, + "step": 50710 + }, + { + "epoch": 0.37491499364300285, + "grad_norm": 0.07088646292686462, + "learning_rate": 3.1370192307692306e-05, + "loss": 0.0178, + "step": 50720 + }, + { + "epoch": 0.37498891221430475, + "grad_norm": 0.09667670726776123, + "learning_rate": 3.136648266856601e-05, + "loss": 0.0187, + "step": 50730 + }, + { + "epoch": 0.3750628307856066, + "grad_norm": 0.0873975083231926, + "learning_rate": 3.13627730294397e-05, + "loss": 0.0212, + "step": 50740 + }, + { + "epoch": 0.37513674935690844, + "grad_norm": 0.10300181061029434, + "learning_rate": 3.1359063390313393e-05, + "loss": 0.0185, + "step": 50750 + }, + { + "epoch": 0.3752106679282103, + "grad_norm": 0.16665935516357422, + "learning_rate": 3.135535375118709e-05, + "loss": 0.0179, + "step": 50760 + }, + { + "epoch": 0.37528458649951213, + "grad_norm": 0.10472545772790909, + "learning_rate": 3.135164411206078e-05, + "loss": 0.0169, + "step": 50770 + }, + { + "epoch": 0.375358505070814, + "grad_norm": 0.08101044595241547, + "learning_rate": 3.1347934472934475e-05, + "loss": 0.0174, + "step": 50780 + }, + { + "epoch": 0.3754324236421158, + "grad_norm": 0.09952845424413681, + "learning_rate": 3.1344224833808164e-05, + "loss": 0.0192, + "step": 50790 + }, + { + "epoch": 0.3755063422134177, + "grad_norm": 0.09925219416618347, + "learning_rate": 3.1340515194681866e-05, + "loss": 0.0175, + "step": 50800 + }, + { + "epoch": 0.37558026078471957, + "grad_norm": 0.07916640490293503, + "learning_rate": 3.133680555555556e-05, + "loss": 0.0183, + "step": 50810 + }, + { + "epoch": 0.3756541793560214, + "grad_norm": 0.08429214358329773, + "learning_rate": 3.133309591642925e-05, + "loss": 0.0168, + "step": 50820 + }, + { + "epoch": 0.37572809792732326, + "grad_norm": 0.08063418418169022, + "learning_rate": 3.132938627730295e-05, + "loss": 0.0172, + "step": 50830 + }, + { + "epoch": 0.3758020164986251, + "grad_norm": 0.07345176488161087, + "learning_rate": 3.132567663817664e-05, + "loss": 0.0187, + "step": 50840 + }, + { + "epoch": 0.37587593506992695, + "grad_norm": 0.07181081175804138, + "learning_rate": 3.132196699905033e-05, + "loss": 0.0185, + "step": 50850 + }, + { + "epoch": 0.37594985364122885, + "grad_norm": 0.0684572383761406, + "learning_rate": 3.131825735992403e-05, + "loss": 0.0179, + "step": 50860 + }, + { + "epoch": 0.3760237722125307, + "grad_norm": 0.07158481329679489, + "learning_rate": 
3.131454772079772e-05, + "loss": 0.0178, + "step": 50870 + }, + { + "epoch": 0.37609769078383254, + "grad_norm": 0.08474763482809067, + "learning_rate": 3.131083808167142e-05, + "loss": 0.0188, + "step": 50880 + }, + { + "epoch": 0.3761716093551344, + "grad_norm": 0.10242009907960892, + "learning_rate": 3.130712844254511e-05, + "loss": 0.0212, + "step": 50890 + }, + { + "epoch": 0.37624552792643623, + "grad_norm": 0.11419926583766937, + "learning_rate": 3.1303418803418806e-05, + "loss": 0.0198, + "step": 50900 + }, + { + "epoch": 0.3763194464977381, + "grad_norm": 0.07014594227075577, + "learning_rate": 3.12997091642925e-05, + "loss": 0.0187, + "step": 50910 + }, + { + "epoch": 0.3763933650690399, + "grad_norm": 0.09786113351583481, + "learning_rate": 3.129599952516619e-05, + "loss": 0.0184, + "step": 50920 + }, + { + "epoch": 0.3764672836403418, + "grad_norm": 0.0822451189160347, + "learning_rate": 3.129228988603989e-05, + "loss": 0.0177, + "step": 50930 + }, + { + "epoch": 0.37654120221164367, + "grad_norm": 0.09000653773546219, + "learning_rate": 3.1288580246913576e-05, + "loss": 0.0201, + "step": 50940 + }, + { + "epoch": 0.3766151207829455, + "grad_norm": 0.11525629460811615, + "learning_rate": 3.128487060778728e-05, + "loss": 0.0199, + "step": 50950 + }, + { + "epoch": 0.37668903935424736, + "grad_norm": 0.08533230423927307, + "learning_rate": 3.1281160968660975e-05, + "loss": 0.0184, + "step": 50960 + }, + { + "epoch": 0.3767629579255492, + "grad_norm": 0.1094772070646286, + "learning_rate": 3.1277451329534664e-05, + "loss": 0.0199, + "step": 50970 + }, + { + "epoch": 0.37683687649685105, + "grad_norm": 0.06830190867185593, + "learning_rate": 3.127374169040836e-05, + "loss": 0.0168, + "step": 50980 + }, + { + "epoch": 0.37691079506815295, + "grad_norm": 0.09465599805116653, + "learning_rate": 3.1270032051282056e-05, + "loss": 0.0198, + "step": 50990 + }, + { + "epoch": 0.3769847136394548, + "grad_norm": 0.07513085007667542, + "learning_rate": 3.1266322412155745e-05, + "loss": 0.0185, + "step": 51000 + }, + { + "epoch": 0.37705863221075664, + "grad_norm": 0.06580290198326111, + "learning_rate": 3.126261277302944e-05, + "loss": 0.0201, + "step": 51010 + }, + { + "epoch": 0.3771325507820585, + "grad_norm": 0.10376956313848495, + "learning_rate": 3.125890313390313e-05, + "loss": 0.0182, + "step": 51020 + }, + { + "epoch": 0.37720646935336033, + "grad_norm": 0.11452928185462952, + "learning_rate": 3.125519349477683e-05, + "loss": 0.0215, + "step": 51030 + }, + { + "epoch": 0.3772803879246622, + "grad_norm": 0.08027191460132599, + "learning_rate": 3.125148385565053e-05, + "loss": 0.0195, + "step": 51040 + }, + { + "epoch": 0.377354306495964, + "grad_norm": 0.10047151148319244, + "learning_rate": 3.124777421652422e-05, + "loss": 0.0197, + "step": 51050 + }, + { + "epoch": 0.3774282250672659, + "grad_norm": 0.08046058565378189, + "learning_rate": 3.1244064577397914e-05, + "loss": 0.0186, + "step": 51060 + }, + { + "epoch": 0.37750214363856777, + "grad_norm": 0.06543967872858047, + "learning_rate": 3.1240354938271603e-05, + "loss": 0.0184, + "step": 51070 + }, + { + "epoch": 0.3775760622098696, + "grad_norm": 0.0672696903347969, + "learning_rate": 3.12366452991453e-05, + "loss": 0.0197, + "step": 51080 + }, + { + "epoch": 0.37764998078117146, + "grad_norm": 0.08534809947013855, + "learning_rate": 3.1232935660018995e-05, + "loss": 0.0174, + "step": 51090 + }, + { + "epoch": 0.3777238993524733, + "grad_norm": 0.11928018182516098, + "learning_rate": 3.1229226020892685e-05, + "loss": 0.0186, + 
"step": 51100 + }, + { + "epoch": 0.37779781792377515, + "grad_norm": 0.10310760885477066, + "learning_rate": 3.122551638176639e-05, + "loss": 0.0179, + "step": 51110 + }, + { + "epoch": 0.37787173649507705, + "grad_norm": 0.08315908908843994, + "learning_rate": 3.1221806742640076e-05, + "loss": 0.0187, + "step": 51120 + }, + { + "epoch": 0.3779456550663789, + "grad_norm": 0.11481058597564697, + "learning_rate": 3.121809710351377e-05, + "loss": 0.0205, + "step": 51130 + }, + { + "epoch": 0.37801957363768074, + "grad_norm": 0.07198639214038849, + "learning_rate": 3.121438746438747e-05, + "loss": 0.016, + "step": 51140 + }, + { + "epoch": 0.3780934922089826, + "grad_norm": 0.09661037474870682, + "learning_rate": 3.121067782526116e-05, + "loss": 0.018, + "step": 51150 + }, + { + "epoch": 0.37816741078028443, + "grad_norm": 0.08421141654253006, + "learning_rate": 3.1206968186134854e-05, + "loss": 0.0195, + "step": 51160 + }, + { + "epoch": 0.3782413293515863, + "grad_norm": 0.07664620876312256, + "learning_rate": 3.120325854700854e-05, + "loss": 0.0207, + "step": 51170 + }, + { + "epoch": 0.3783152479228881, + "grad_norm": 0.07323388755321503, + "learning_rate": 3.1199548907882245e-05, + "loss": 0.0181, + "step": 51180 + }, + { + "epoch": 0.37838916649419, + "grad_norm": 0.09078080207109451, + "learning_rate": 3.119583926875594e-05, + "loss": 0.0192, + "step": 51190 + }, + { + "epoch": 0.37846308506549187, + "grad_norm": 0.13930262625217438, + "learning_rate": 3.119212962962963e-05, + "loss": 0.0175, + "step": 51200 + }, + { + "epoch": 0.3785370036367937, + "grad_norm": 0.0816483274102211, + "learning_rate": 3.1188419990503327e-05, + "loss": 0.0191, + "step": 51210 + }, + { + "epoch": 0.37861092220809556, + "grad_norm": 0.10349602997303009, + "learning_rate": 3.118471035137702e-05, + "loss": 0.0184, + "step": 51220 + }, + { + "epoch": 0.3786848407793974, + "grad_norm": 0.06943171471357346, + "learning_rate": 3.118100071225071e-05, + "loss": 0.0194, + "step": 51230 + }, + { + "epoch": 0.37875875935069925, + "grad_norm": 0.11047179996967316, + "learning_rate": 3.117729107312441e-05, + "loss": 0.0176, + "step": 51240 + }, + { + "epoch": 0.37883267792200115, + "grad_norm": 0.13045692443847656, + "learning_rate": 3.11735814339981e-05, + "loss": 0.0202, + "step": 51250 + }, + { + "epoch": 0.378906596493303, + "grad_norm": 0.10083399713039398, + "learning_rate": 3.11698717948718e-05, + "loss": 0.0187, + "step": 51260 + }, + { + "epoch": 0.37898051506460484, + "grad_norm": 0.09727805107831955, + "learning_rate": 3.1166162155745496e-05, + "loss": 0.0176, + "step": 51270 + }, + { + "epoch": 0.3790544336359067, + "grad_norm": 0.08823196589946747, + "learning_rate": 3.1162452516619185e-05, + "loss": 0.019, + "step": 51280 + }, + { + "epoch": 0.37912835220720853, + "grad_norm": 0.09118539839982986, + "learning_rate": 3.115874287749288e-05, + "loss": 0.0187, + "step": 51290 + }, + { + "epoch": 0.3792022707785104, + "grad_norm": 0.09754306823015213, + "learning_rate": 3.115503323836657e-05, + "loss": 0.0173, + "step": 51300 + }, + { + "epoch": 0.3792761893498122, + "grad_norm": 0.08700606226921082, + "learning_rate": 3.1151323599240266e-05, + "loss": 0.0218, + "step": 51310 + }, + { + "epoch": 0.3793501079211141, + "grad_norm": 0.10025543719530106, + "learning_rate": 3.114761396011396e-05, + "loss": 0.0181, + "step": 51320 + }, + { + "epoch": 0.37942402649241597, + "grad_norm": 0.0970856323838234, + "learning_rate": 3.114390432098766e-05, + "loss": 0.0196, + "step": 51330 + }, + { + "epoch": 
0.3794979450637178, + "grad_norm": 0.08487201482057571, + "learning_rate": 3.1140194681861354e-05, + "loss": 0.0178, + "step": 51340 + }, + { + "epoch": 0.37957186363501966, + "grad_norm": 0.10383712500333786, + "learning_rate": 3.113648504273504e-05, + "loss": 0.0186, + "step": 51350 + }, + { + "epoch": 0.3796457822063215, + "grad_norm": 0.07665007561445236, + "learning_rate": 3.113277540360874e-05, + "loss": 0.0181, + "step": 51360 + }, + { + "epoch": 0.37971970077762335, + "grad_norm": 0.10688511282205582, + "learning_rate": 3.1129065764482435e-05, + "loss": 0.0202, + "step": 51370 + }, + { + "epoch": 0.37979361934892525, + "grad_norm": 0.12255487591028214, + "learning_rate": 3.1125356125356124e-05, + "loss": 0.0173, + "step": 51380 + }, + { + "epoch": 0.3798675379202271, + "grad_norm": 0.12634336948394775, + "learning_rate": 3.112164648622982e-05, + "loss": 0.0175, + "step": 51390 + }, + { + "epoch": 0.37994145649152894, + "grad_norm": 0.09347711503505707, + "learning_rate": 3.111793684710351e-05, + "loss": 0.0184, + "step": 51400 + }, + { + "epoch": 0.3800153750628308, + "grad_norm": 0.10192801058292389, + "learning_rate": 3.111422720797721e-05, + "loss": 0.0202, + "step": 51410 + }, + { + "epoch": 0.38008929363413263, + "grad_norm": 0.09505399316549301, + "learning_rate": 3.111051756885091e-05, + "loss": 0.0166, + "step": 51420 + }, + { + "epoch": 0.3801632122054345, + "grad_norm": 0.10079313069581985, + "learning_rate": 3.11068079297246e-05, + "loss": 0.0196, + "step": 51430 + }, + { + "epoch": 0.3802371307767363, + "grad_norm": 0.08078725636005402, + "learning_rate": 3.110309829059829e-05, + "loss": 0.0184, + "step": 51440 + }, + { + "epoch": 0.3803110493480382, + "grad_norm": 0.08401284366846085, + "learning_rate": 3.109938865147199e-05, + "loss": 0.0168, + "step": 51450 + }, + { + "epoch": 0.38038496791934007, + "grad_norm": 0.08584748953580856, + "learning_rate": 3.109567901234568e-05, + "loss": 0.0203, + "step": 51460 + }, + { + "epoch": 0.3804588864906419, + "grad_norm": 0.07659079879522324, + "learning_rate": 3.1091969373219374e-05, + "loss": 0.0186, + "step": 51470 + }, + { + "epoch": 0.38053280506194376, + "grad_norm": 0.11942635476589203, + "learning_rate": 3.108825973409307e-05, + "loss": 0.0183, + "step": 51480 + }, + { + "epoch": 0.3806067236332456, + "grad_norm": 0.10971416532993317, + "learning_rate": 3.1084550094966766e-05, + "loss": 0.0171, + "step": 51490 + }, + { + "epoch": 0.38068064220454745, + "grad_norm": 0.10077308118343353, + "learning_rate": 3.108084045584046e-05, + "loss": 0.0182, + "step": 51500 + }, + { + "epoch": 0.38075456077584935, + "grad_norm": 0.0781073048710823, + "learning_rate": 3.107713081671415e-05, + "loss": 0.0181, + "step": 51510 + }, + { + "epoch": 0.3808284793471512, + "grad_norm": 0.156025230884552, + "learning_rate": 3.107342117758785e-05, + "loss": 0.0191, + "step": 51520 + }, + { + "epoch": 0.38090239791845304, + "grad_norm": 0.10990814119577408, + "learning_rate": 3.1069711538461537e-05, + "loss": 0.018, + "step": 51530 + }, + { + "epoch": 0.3809763164897549, + "grad_norm": 0.10066544264554977, + "learning_rate": 3.106600189933523e-05, + "loss": 0.0194, + "step": 51540 + }, + { + "epoch": 0.38105023506105673, + "grad_norm": 0.08037177473306656, + "learning_rate": 3.106229226020893e-05, + "loss": 0.0208, + "step": 51550 + }, + { + "epoch": 0.3811241536323586, + "grad_norm": 0.0665728896856308, + "learning_rate": 3.1058582621082624e-05, + "loss": 0.0185, + "step": 51560 + }, + { + "epoch": 0.3811980722036604, + "grad_norm": 
0.07329696416854858, + "learning_rate": 3.105487298195632e-05, + "loss": 0.02, + "step": 51570 + }, + { + "epoch": 0.3812719907749623, + "grad_norm": 0.07585390657186508, + "learning_rate": 3.105116334283001e-05, + "loss": 0.0192, + "step": 51580 + }, + { + "epoch": 0.38134590934626417, + "grad_norm": 0.07885745167732239, + "learning_rate": 3.1047453703703706e-05, + "loss": 0.017, + "step": 51590 + }, + { + "epoch": 0.381419827917566, + "grad_norm": 0.18103165924549103, + "learning_rate": 3.10437440645774e-05, + "loss": 0.0157, + "step": 51600 + }, + { + "epoch": 0.38149374648886786, + "grad_norm": 0.07363313436508179, + "learning_rate": 3.104003442545109e-05, + "loss": 0.0169, + "step": 51610 + }, + { + "epoch": 0.3815676650601697, + "grad_norm": 0.11146856844425201, + "learning_rate": 3.103632478632479e-05, + "loss": 0.0192, + "step": 51620 + }, + { + "epoch": 0.38164158363147155, + "grad_norm": 0.14162760972976685, + "learning_rate": 3.103261514719848e-05, + "loss": 0.0204, + "step": 51630 + }, + { + "epoch": 0.38171550220277345, + "grad_norm": 0.1353944092988968, + "learning_rate": 3.102890550807218e-05, + "loss": 0.018, + "step": 51640 + }, + { + "epoch": 0.3817894207740753, + "grad_norm": 0.10039281100034714, + "learning_rate": 3.1025195868945875e-05, + "loss": 0.0181, + "step": 51650 + }, + { + "epoch": 0.38186333934537714, + "grad_norm": 0.08758821338415146, + "learning_rate": 3.1021486229819564e-05, + "loss": 0.0198, + "step": 51660 + }, + { + "epoch": 0.381937257916679, + "grad_norm": 0.08009107410907745, + "learning_rate": 3.101777659069326e-05, + "loss": 0.0217, + "step": 51670 + }, + { + "epoch": 0.38201117648798083, + "grad_norm": 0.07778944820165634, + "learning_rate": 3.1014066951566956e-05, + "loss": 0.0172, + "step": 51680 + }, + { + "epoch": 0.3820850950592827, + "grad_norm": 0.09274286776781082, + "learning_rate": 3.1010357312440645e-05, + "loss": 0.0179, + "step": 51690 + }, + { + "epoch": 0.3821590136305845, + "grad_norm": 0.08399894088506699, + "learning_rate": 3.100664767331434e-05, + "loss": 0.0183, + "step": 51700 + }, + { + "epoch": 0.3822329322018864, + "grad_norm": 0.12259208410978317, + "learning_rate": 3.100293803418804e-05, + "loss": 0.0181, + "step": 51710 + }, + { + "epoch": 0.38230685077318827, + "grad_norm": 0.06090375408530235, + "learning_rate": 3.099922839506173e-05, + "loss": 0.0198, + "step": 51720 + }, + { + "epoch": 0.3823807693444901, + "grad_norm": 0.09268014878034592, + "learning_rate": 3.099551875593543e-05, + "loss": 0.0191, + "step": 51730 + }, + { + "epoch": 0.38245468791579196, + "grad_norm": 0.09483334422111511, + "learning_rate": 3.099180911680912e-05, + "loss": 0.0176, + "step": 51740 + }, + { + "epoch": 0.3825286064870938, + "grad_norm": 0.07935747504234314, + "learning_rate": 3.0988099477682814e-05, + "loss": 0.0184, + "step": 51750 + }, + { + "epoch": 0.38260252505839565, + "grad_norm": 0.08421669155359268, + "learning_rate": 3.09843898385565e-05, + "loss": 0.0192, + "step": 51760 + }, + { + "epoch": 0.38267644362969755, + "grad_norm": 0.06987878680229187, + "learning_rate": 3.09806801994302e-05, + "loss": 0.0177, + "step": 51770 + }, + { + "epoch": 0.3827503622009994, + "grad_norm": 0.09059000015258789, + "learning_rate": 3.0976970560303895e-05, + "loss": 0.0181, + "step": 51780 + }, + { + "epoch": 0.38282428077230124, + "grad_norm": 0.12444675713777542, + "learning_rate": 3.097326092117759e-05, + "loss": 0.0189, + "step": 51790 + }, + { + "epoch": 0.3828981993436031, + "grad_norm": 0.08249559253454208, + "learning_rate": 
3.096955128205129e-05, + "loss": 0.0184, + "step": 51800 + }, + { + "epoch": 0.38297211791490493, + "grad_norm": 0.056822020560503006, + "learning_rate": 3.0965841642924976e-05, + "loss": 0.0166, + "step": 51810 + }, + { + "epoch": 0.3830460364862068, + "grad_norm": 0.092983677983284, + "learning_rate": 3.096213200379867e-05, + "loss": 0.0194, + "step": 51820 + }, + { + "epoch": 0.3831199550575086, + "grad_norm": 0.10161788016557693, + "learning_rate": 3.095842236467237e-05, + "loss": 0.0178, + "step": 51830 + }, + { + "epoch": 0.3831938736288105, + "grad_norm": 0.09972761571407318, + "learning_rate": 3.095471272554606e-05, + "loss": 0.018, + "step": 51840 + }, + { + "epoch": 0.38326779220011237, + "grad_norm": 0.05881828814744949, + "learning_rate": 3.095100308641975e-05, + "loss": 0.018, + "step": 51850 + }, + { + "epoch": 0.3833417107714142, + "grad_norm": 0.1028822734951973, + "learning_rate": 3.094729344729345e-05, + "loss": 0.0179, + "step": 51860 + }, + { + "epoch": 0.38341562934271606, + "grad_norm": 0.07917283475399017, + "learning_rate": 3.0943583808167145e-05, + "loss": 0.0198, + "step": 51870 + }, + { + "epoch": 0.3834895479140179, + "grad_norm": 0.08172168582677841, + "learning_rate": 3.093987416904084e-05, + "loss": 0.0183, + "step": 51880 + }, + { + "epoch": 0.38356346648531975, + "grad_norm": 0.06272972375154495, + "learning_rate": 3.093616452991453e-05, + "loss": 0.0188, + "step": 51890 + }, + { + "epoch": 0.38363738505662165, + "grad_norm": 0.093136265873909, + "learning_rate": 3.0932454890788226e-05, + "loss": 0.0203, + "step": 51900 + }, + { + "epoch": 0.3837113036279235, + "grad_norm": 0.0781591534614563, + "learning_rate": 3.092874525166192e-05, + "loss": 0.0197, + "step": 51910 + }, + { + "epoch": 0.38378522219922534, + "grad_norm": 0.08194848895072937, + "learning_rate": 3.092503561253561e-05, + "loss": 0.0194, + "step": 51920 + }, + { + "epoch": 0.3838591407705272, + "grad_norm": 0.06483061611652374, + "learning_rate": 3.092132597340931e-05, + "loss": 0.0175, + "step": 51930 + }, + { + "epoch": 0.38393305934182903, + "grad_norm": 0.09434029459953308, + "learning_rate": 3.0917616334283003e-05, + "loss": 0.0216, + "step": 51940 + }, + { + "epoch": 0.3840069779131309, + "grad_norm": 0.09067126363515854, + "learning_rate": 3.09139066951567e-05, + "loss": 0.0207, + "step": 51950 + }, + { + "epoch": 0.3840808964844328, + "grad_norm": 0.10259506106376648, + "learning_rate": 3.0910197056030395e-05, + "loss": 0.0221, + "step": 51960 + }, + { + "epoch": 0.3841548150557346, + "grad_norm": 0.11004582047462463, + "learning_rate": 3.0906487416904085e-05, + "loss": 0.0194, + "step": 51970 + }, + { + "epoch": 0.38422873362703647, + "grad_norm": 0.09257286041975021, + "learning_rate": 3.090277777777778e-05, + "loss": 0.0182, + "step": 51980 + }, + { + "epoch": 0.3843026521983383, + "grad_norm": 0.07792217284440994, + "learning_rate": 3.089906813865147e-05, + "loss": 0.0188, + "step": 51990 + }, + { + "epoch": 0.38437657076964016, + "grad_norm": 0.0765538364648819, + "learning_rate": 3.0895358499525166e-05, + "loss": 0.0185, + "step": 52000 + }, + { + "epoch": 0.384450489340942, + "grad_norm": 0.09007059782743454, + "learning_rate": 3.089164886039886e-05, + "loss": 0.0203, + "step": 52010 + }, + { + "epoch": 0.38452440791224385, + "grad_norm": 0.08688057959079742, + "learning_rate": 3.088793922127256e-05, + "loss": 0.0184, + "step": 52020 + }, + { + "epoch": 0.38459832648354575, + "grad_norm": 0.08209887892007828, + "learning_rate": 3.0884229582146254e-05, + "loss": 0.0177, + 
"step": 52030 + }, + { + "epoch": 0.3846722450548476, + "grad_norm": 0.07236919552087784, + "learning_rate": 3.088051994301994e-05, + "loss": 0.0171, + "step": 52040 + }, + { + "epoch": 0.38474616362614944, + "grad_norm": 0.08763102442026138, + "learning_rate": 3.087681030389364e-05, + "loss": 0.018, + "step": 52050 + }, + { + "epoch": 0.3848200821974513, + "grad_norm": 0.07670623809099197, + "learning_rate": 3.0873100664767335e-05, + "loss": 0.0179, + "step": 52060 + }, + { + "epoch": 0.38489400076875313, + "grad_norm": 0.10598905384540558, + "learning_rate": 3.0869391025641024e-05, + "loss": 0.0188, + "step": 52070 + }, + { + "epoch": 0.384967919340055, + "grad_norm": 0.08035948127508163, + "learning_rate": 3.086568138651472e-05, + "loss": 0.0157, + "step": 52080 + }, + { + "epoch": 0.3850418379113569, + "grad_norm": 0.09968321025371552, + "learning_rate": 3.0861971747388416e-05, + "loss": 0.0205, + "step": 52090 + }, + { + "epoch": 0.3851157564826587, + "grad_norm": 0.08303764462471008, + "learning_rate": 3.085826210826211e-05, + "loss": 0.0196, + "step": 52100 + }, + { + "epoch": 0.38518967505396057, + "grad_norm": 0.09184552729129791, + "learning_rate": 3.085455246913581e-05, + "loss": 0.0189, + "step": 52110 + }, + { + "epoch": 0.3852635936252624, + "grad_norm": 0.07583629339933395, + "learning_rate": 3.08508428300095e-05, + "loss": 0.0219, + "step": 52120 + }, + { + "epoch": 0.38533751219656426, + "grad_norm": 0.06147291883826256, + "learning_rate": 3.084713319088319e-05, + "loss": 0.0174, + "step": 52130 + }, + { + "epoch": 0.3854114307678661, + "grad_norm": 0.09958159178495407, + "learning_rate": 3.084342355175689e-05, + "loss": 0.0172, + "step": 52140 + }, + { + "epoch": 0.38548534933916795, + "grad_norm": 0.11025216430425644, + "learning_rate": 3.083971391263058e-05, + "loss": 0.019, + "step": 52150 + }, + { + "epoch": 0.38555926791046985, + "grad_norm": 0.08332915604114532, + "learning_rate": 3.0836004273504274e-05, + "loss": 0.0196, + "step": 52160 + }, + { + "epoch": 0.3856331864817717, + "grad_norm": 0.10837563872337341, + "learning_rate": 3.083229463437797e-05, + "loss": 0.0192, + "step": 52170 + }, + { + "epoch": 0.38570710505307354, + "grad_norm": 0.08369705826044083, + "learning_rate": 3.0828584995251666e-05, + "loss": 0.0158, + "step": 52180 + }, + { + "epoch": 0.3857810236243754, + "grad_norm": 0.09527096152305603, + "learning_rate": 3.082487535612536e-05, + "loss": 0.0188, + "step": 52190 + }, + { + "epoch": 0.38585494219567723, + "grad_norm": 0.06685808300971985, + "learning_rate": 3.082116571699905e-05, + "loss": 0.0173, + "step": 52200 + }, + { + "epoch": 0.3859288607669791, + "grad_norm": 0.07597069442272186, + "learning_rate": 3.081745607787275e-05, + "loss": 0.0208, + "step": 52210 + }, + { + "epoch": 0.386002779338281, + "grad_norm": 0.06681588292121887, + "learning_rate": 3.0813746438746436e-05, + "loss": 0.018, + "step": 52220 + }, + { + "epoch": 0.3860766979095828, + "grad_norm": 0.07467056810855865, + "learning_rate": 3.081003679962013e-05, + "loss": 0.0224, + "step": 52230 + }, + { + "epoch": 0.38615061648088467, + "grad_norm": 0.0811832919716835, + "learning_rate": 3.080632716049383e-05, + "loss": 0.0204, + "step": 52240 + }, + { + "epoch": 0.3862245350521865, + "grad_norm": 0.09243257343769073, + "learning_rate": 3.0802617521367524e-05, + "loss": 0.0188, + "step": 52250 + }, + { + "epoch": 0.38629845362348836, + "grad_norm": 0.08218041807413101, + "learning_rate": 3.079890788224122e-05, + "loss": 0.0175, + "step": 52260 + }, + { + "epoch": 
0.3863723721947902, + "grad_norm": 0.07658617943525314, + "learning_rate": 3.079519824311491e-05, + "loss": 0.018, + "step": 52270 + }, + { + "epoch": 0.38644629076609205, + "grad_norm": 0.08019266277551651, + "learning_rate": 3.0791488603988605e-05, + "loss": 0.019, + "step": 52280 + }, + { + "epoch": 0.38652020933739395, + "grad_norm": 0.0836150050163269, + "learning_rate": 3.07877789648623e-05, + "loss": 0.0198, + "step": 52290 + }, + { + "epoch": 0.3865941279086958, + "grad_norm": 0.07567259669303894, + "learning_rate": 3.078406932573599e-05, + "loss": 0.021, + "step": 52300 + }, + { + "epoch": 0.38666804647999764, + "grad_norm": 0.07380937784910202, + "learning_rate": 3.0780359686609686e-05, + "loss": 0.0185, + "step": 52310 + }, + { + "epoch": 0.3867419650512995, + "grad_norm": 0.07615092396736145, + "learning_rate": 3.077665004748338e-05, + "loss": 0.0176, + "step": 52320 + }, + { + "epoch": 0.38681588362260133, + "grad_norm": 0.08575232326984406, + "learning_rate": 3.077294040835708e-05, + "loss": 0.0175, + "step": 52330 + }, + { + "epoch": 0.3868898021939032, + "grad_norm": 0.08019275218248367, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.0184, + "step": 52340 + }, + { + "epoch": 0.3869637207652051, + "grad_norm": 0.08791986107826233, + "learning_rate": 3.0765521130104464e-05, + "loss": 0.0184, + "step": 52350 + }, + { + "epoch": 0.3870376393365069, + "grad_norm": 0.08571562170982361, + "learning_rate": 3.076181149097816e-05, + "loss": 0.0181, + "step": 52360 + }, + { + "epoch": 0.38711155790780877, + "grad_norm": 0.13191141188144684, + "learning_rate": 3.0758101851851855e-05, + "loss": 0.0197, + "step": 52370 + }, + { + "epoch": 0.3871854764791106, + "grad_norm": 0.07944611459970474, + "learning_rate": 3.0754392212725545e-05, + "loss": 0.0192, + "step": 52380 + }, + { + "epoch": 0.38725939505041246, + "grad_norm": 0.06848800927400589, + "learning_rate": 3.075068257359924e-05, + "loss": 0.0166, + "step": 52390 + }, + { + "epoch": 0.3873333136217143, + "grad_norm": 0.07772406935691833, + "learning_rate": 3.0746972934472937e-05, + "loss": 0.0179, + "step": 52400 + }, + { + "epoch": 0.38740723219301615, + "grad_norm": 0.08584585040807724, + "learning_rate": 3.074326329534663e-05, + "loss": 0.0176, + "step": 52410 + }, + { + "epoch": 0.38748115076431805, + "grad_norm": 0.08768728375434875, + "learning_rate": 3.073955365622033e-05, + "loss": 0.0169, + "step": 52420 + }, + { + "epoch": 0.3875550693356199, + "grad_norm": 0.06692370772361755, + "learning_rate": 3.073584401709402e-05, + "loss": 0.0179, + "step": 52430 + }, + { + "epoch": 0.38762898790692174, + "grad_norm": 0.11034579575061798, + "learning_rate": 3.0732134377967714e-05, + "loss": 0.0178, + "step": 52440 + }, + { + "epoch": 0.3877029064782236, + "grad_norm": 0.08563010394573212, + "learning_rate": 3.07284247388414e-05, + "loss": 0.0193, + "step": 52450 + }, + { + "epoch": 0.38777682504952543, + "grad_norm": 0.0796736404299736, + "learning_rate": 3.07247150997151e-05, + "loss": 0.0177, + "step": 52460 + }, + { + "epoch": 0.3878507436208273, + "grad_norm": 0.10029114782810211, + "learning_rate": 3.0721005460588795e-05, + "loss": 0.0157, + "step": 52470 + }, + { + "epoch": 0.3879246621921292, + "grad_norm": 0.07146019488573074, + "learning_rate": 3.071729582146249e-05, + "loss": 0.0209, + "step": 52480 + }, + { + "epoch": 0.387998580763431, + "grad_norm": 0.07550966739654541, + "learning_rate": 3.071358618233619e-05, + "loss": 0.0204, + "step": 52490 + }, + { + "epoch": 0.38807249933473287, + "grad_norm": 
0.08018456399440765, + "learning_rate": 3.0709876543209876e-05, + "loss": 0.0173, + "step": 52500 + }, + { + "epoch": 0.3881464179060347, + "grad_norm": 0.12307467311620712, + "learning_rate": 3.070616690408357e-05, + "loss": 0.0196, + "step": 52510 + }, + { + "epoch": 0.38822033647733656, + "grad_norm": 0.10490459203720093, + "learning_rate": 3.070245726495727e-05, + "loss": 0.0182, + "step": 52520 + }, + { + "epoch": 0.3882942550486384, + "grad_norm": 0.07732898741960526, + "learning_rate": 3.069874762583096e-05, + "loss": 0.0204, + "step": 52530 + }, + { + "epoch": 0.38836817361994025, + "grad_norm": 0.09091013669967651, + "learning_rate": 3.069503798670465e-05, + "loss": 0.019, + "step": 52540 + }, + { + "epoch": 0.38844209219124215, + "grad_norm": 0.07816635072231293, + "learning_rate": 3.069132834757835e-05, + "loss": 0.0173, + "step": 52550 + }, + { + "epoch": 0.388516010762544, + "grad_norm": 0.06775733083486557, + "learning_rate": 3.0687618708452045e-05, + "loss": 0.0186, + "step": 52560 + }, + { + "epoch": 0.38858992933384584, + "grad_norm": 0.0946992039680481, + "learning_rate": 3.068390906932574e-05, + "loss": 0.0188, + "step": 52570 + }, + { + "epoch": 0.3886638479051477, + "grad_norm": 0.06431379169225693, + "learning_rate": 3.068019943019943e-05, + "loss": 0.0165, + "step": 52580 + }, + { + "epoch": 0.38873776647644953, + "grad_norm": 0.08713816851377487, + "learning_rate": 3.0676489791073126e-05, + "loss": 0.0181, + "step": 52590 + }, + { + "epoch": 0.3888116850477514, + "grad_norm": 0.08138715475797653, + "learning_rate": 3.067278015194682e-05, + "loss": 0.0171, + "step": 52600 + }, + { + "epoch": 0.3888856036190533, + "grad_norm": 0.06727848201990128, + "learning_rate": 3.066907051282051e-05, + "loss": 0.0196, + "step": 52610 + }, + { + "epoch": 0.3889595221903551, + "grad_norm": 0.09258706122636795, + "learning_rate": 3.066536087369421e-05, + "loss": 0.0193, + "step": 52620 + }, + { + "epoch": 0.38903344076165697, + "grad_norm": 0.05659174174070358, + "learning_rate": 3.06616512345679e-05, + "loss": 0.0173, + "step": 52630 + }, + { + "epoch": 0.3891073593329588, + "grad_norm": 0.09105684608221054, + "learning_rate": 3.06579415954416e-05, + "loss": 0.0191, + "step": 52640 + }, + { + "epoch": 0.38918127790426066, + "grad_norm": 0.08389654755592346, + "learning_rate": 3.0654231956315295e-05, + "loss": 0.0193, + "step": 52650 + }, + { + "epoch": 0.3892551964755625, + "grad_norm": 0.06399132311344147, + "learning_rate": 3.0650522317188984e-05, + "loss": 0.02, + "step": 52660 + }, + { + "epoch": 0.38932911504686435, + "grad_norm": 0.06903476268053055, + "learning_rate": 3.064681267806268e-05, + "loss": 0.0171, + "step": 52670 + }, + { + "epoch": 0.38940303361816625, + "grad_norm": 0.10140169411897659, + "learning_rate": 3.064310303893637e-05, + "loss": 0.0177, + "step": 52680 + }, + { + "epoch": 0.3894769521894681, + "grad_norm": 0.09307069331407547, + "learning_rate": 3.0639393399810065e-05, + "loss": 0.0168, + "step": 52690 + }, + { + "epoch": 0.38955087076076994, + "grad_norm": 0.08971722424030304, + "learning_rate": 3.063568376068376e-05, + "loss": 0.0181, + "step": 52700 + }, + { + "epoch": 0.3896247893320718, + "grad_norm": 0.06755110621452332, + "learning_rate": 3.063197412155746e-05, + "loss": 0.018, + "step": 52710 + }, + { + "epoch": 0.38969870790337363, + "grad_norm": 0.07892455160617828, + "learning_rate": 3.062826448243115e-05, + "loss": 0.0206, + "step": 52720 + }, + { + "epoch": 0.3897726264746755, + "grad_norm": 0.09181396663188934, + "learning_rate": 
3.062455484330484e-05, + "loss": 0.0182, + "step": 52730 + }, + { + "epoch": 0.3898465450459774, + "grad_norm": 0.07258135080337524, + "learning_rate": 3.062084520417854e-05, + "loss": 0.0201, + "step": 52740 + }, + { + "epoch": 0.3899204636172792, + "grad_norm": 0.1641390025615692, + "learning_rate": 3.0617135565052234e-05, + "loss": 0.0193, + "step": 52750 + }, + { + "epoch": 0.38999438218858107, + "grad_norm": 0.05282848700881004, + "learning_rate": 3.0613425925925924e-05, + "loss": 0.0176, + "step": 52760 + }, + { + "epoch": 0.3900683007598829, + "grad_norm": 0.10620077699422836, + "learning_rate": 3.060971628679962e-05, + "loss": 0.0164, + "step": 52770 + }, + { + "epoch": 0.39014221933118476, + "grad_norm": 0.07189995795488358, + "learning_rate": 3.0606006647673316e-05, + "loss": 0.019, + "step": 52780 + }, + { + "epoch": 0.3902161379024866, + "grad_norm": 0.06073181703686714, + "learning_rate": 3.060229700854701e-05, + "loss": 0.016, + "step": 52790 + }, + { + "epoch": 0.39029005647378845, + "grad_norm": 0.10170228779315948, + "learning_rate": 3.059858736942071e-05, + "loss": 0.021, + "step": 52800 + }, + { + "epoch": 0.39036397504509035, + "grad_norm": 0.07040001451969147, + "learning_rate": 3.05948777302944e-05, + "loss": 0.0186, + "step": 52810 + }, + { + "epoch": 0.3904378936163922, + "grad_norm": 0.10885073989629745, + "learning_rate": 3.059116809116809e-05, + "loss": 0.0195, + "step": 52820 + }, + { + "epoch": 0.39051181218769404, + "grad_norm": 0.09115403145551682, + "learning_rate": 3.058745845204179e-05, + "loss": 0.0187, + "step": 52830 + }, + { + "epoch": 0.3905857307589959, + "grad_norm": 0.07751933485269547, + "learning_rate": 3.058374881291548e-05, + "loss": 0.0186, + "step": 52840 + }, + { + "epoch": 0.39065964933029773, + "grad_norm": 0.0606808103621006, + "learning_rate": 3.0580039173789174e-05, + "loss": 0.018, + "step": 52850 + }, + { + "epoch": 0.3907335679015996, + "grad_norm": 0.0965239480137825, + "learning_rate": 3.057632953466287e-05, + "loss": 0.0188, + "step": 52860 + }, + { + "epoch": 0.3908074864729015, + "grad_norm": 0.11499795317649841, + "learning_rate": 3.0572619895536566e-05, + "loss": 0.0172, + "step": 52870 + }, + { + "epoch": 0.3908814050442033, + "grad_norm": 0.08791990578174591, + "learning_rate": 3.056891025641026e-05, + "loss": 0.0185, + "step": 52880 + }, + { + "epoch": 0.39095532361550517, + "grad_norm": 0.09780432283878326, + "learning_rate": 3.056520061728395e-05, + "loss": 0.0198, + "step": 52890 + }, + { + "epoch": 0.391029242186807, + "grad_norm": 0.1021294891834259, + "learning_rate": 3.056149097815765e-05, + "loss": 0.0165, + "step": 52900 + }, + { + "epoch": 0.39110316075810886, + "grad_norm": 0.0787302777171135, + "learning_rate": 3.0557781339031336e-05, + "loss": 0.0197, + "step": 52910 + }, + { + "epoch": 0.3911770793294107, + "grad_norm": 0.08401231467723846, + "learning_rate": 3.055407169990503e-05, + "loss": 0.0187, + "step": 52920 + }, + { + "epoch": 0.39125099790071255, + "grad_norm": 0.09179053455591202, + "learning_rate": 3.055036206077873e-05, + "loss": 0.0179, + "step": 52930 + }, + { + "epoch": 0.39132491647201445, + "grad_norm": 0.09800249338150024, + "learning_rate": 3.0546652421652424e-05, + "loss": 0.0184, + "step": 52940 + }, + { + "epoch": 0.3913988350433163, + "grad_norm": 0.07815613597631454, + "learning_rate": 3.054294278252612e-05, + "loss": 0.0183, + "step": 52950 + }, + { + "epoch": 0.39147275361461814, + "grad_norm": 0.08728394657373428, + "learning_rate": 3.053923314339981e-05, + "loss": 0.017, + "step": 
52960 + }, + { + "epoch": 0.39154667218592, + "grad_norm": 0.11514657735824585, + "learning_rate": 3.0535523504273505e-05, + "loss": 0.0173, + "step": 52970 + }, + { + "epoch": 0.39162059075722183, + "grad_norm": 0.08455294370651245, + "learning_rate": 3.05318138651472e-05, + "loss": 0.0199, + "step": 52980 + }, + { + "epoch": 0.3916945093285237, + "grad_norm": 0.06525518745183945, + "learning_rate": 3.052810422602089e-05, + "loss": 0.017, + "step": 52990 + }, + { + "epoch": 0.3917684278998256, + "grad_norm": 0.06673488020896912, + "learning_rate": 3.0524394586894586e-05, + "loss": 0.0182, + "step": 53000 + }, + { + "epoch": 0.3918423464711274, + "grad_norm": 0.09368862211704254, + "learning_rate": 3.052068494776828e-05, + "loss": 0.0175, + "step": 53010 + }, + { + "epoch": 0.39191626504242927, + "grad_norm": 0.09598971903324127, + "learning_rate": 3.0516975308641975e-05, + "loss": 0.0188, + "step": 53020 + }, + { + "epoch": 0.3919901836137311, + "grad_norm": 0.06615760177373886, + "learning_rate": 3.0513265669515674e-05, + "loss": 0.0186, + "step": 53030 + }, + { + "epoch": 0.39206410218503296, + "grad_norm": 0.08294472843408585, + "learning_rate": 3.0509556030389363e-05, + "loss": 0.0198, + "step": 53040 + }, + { + "epoch": 0.3921380207563348, + "grad_norm": 0.10596353560686111, + "learning_rate": 3.050584639126306e-05, + "loss": 0.0186, + "step": 53050 + }, + { + "epoch": 0.39221193932763665, + "grad_norm": 0.07891129702329636, + "learning_rate": 3.0502136752136755e-05, + "loss": 0.0185, + "step": 53060 + }, + { + "epoch": 0.39228585789893855, + "grad_norm": 0.08801103383302689, + "learning_rate": 3.0498427113010448e-05, + "loss": 0.0199, + "step": 53070 + }, + { + "epoch": 0.3923597764702404, + "grad_norm": 0.06925628334283829, + "learning_rate": 3.0494717473884144e-05, + "loss": 0.0202, + "step": 53080 + }, + { + "epoch": 0.39243369504154224, + "grad_norm": 0.08155371993780136, + "learning_rate": 3.0491007834757833e-05, + "loss": 0.0207, + "step": 53090 + }, + { + "epoch": 0.3925076136128441, + "grad_norm": 0.07738133519887924, + "learning_rate": 3.0487298195631532e-05, + "loss": 0.0202, + "step": 53100 + }, + { + "epoch": 0.39258153218414593, + "grad_norm": 0.06856966018676758, + "learning_rate": 3.0483588556505228e-05, + "loss": 0.0183, + "step": 53110 + }, + { + "epoch": 0.3926554507554478, + "grad_norm": 0.07440409064292908, + "learning_rate": 3.0479878917378917e-05, + "loss": 0.0197, + "step": 53120 + }, + { + "epoch": 0.3927293693267497, + "grad_norm": 0.07958266139030457, + "learning_rate": 3.0476169278252613e-05, + "loss": 0.0183, + "step": 53130 + }, + { + "epoch": 0.3928032878980515, + "grad_norm": 0.05523325875401497, + "learning_rate": 3.0472459639126306e-05, + "loss": 0.0184, + "step": 53140 + }, + { + "epoch": 0.39287720646935337, + "grad_norm": 0.07466422021389008, + "learning_rate": 3.0468750000000002e-05, + "loss": 0.0183, + "step": 53150 + }, + { + "epoch": 0.3929511250406552, + "grad_norm": 0.10524232685565948, + "learning_rate": 3.0465040360873698e-05, + "loss": 0.0184, + "step": 53160 + }, + { + "epoch": 0.39302504361195706, + "grad_norm": 0.06600213795900345, + "learning_rate": 3.0461330721747387e-05, + "loss": 0.02, + "step": 53170 + }, + { + "epoch": 0.3930989621832589, + "grad_norm": 0.08506739139556885, + "learning_rate": 3.0457621082621087e-05, + "loss": 0.0179, + "step": 53180 + }, + { + "epoch": 0.39317288075456075, + "grad_norm": 0.07443349063396454, + "learning_rate": 3.0453911443494776e-05, + "loss": 0.0167, + "step": 53190 + }, + { + "epoch": 
0.39324679932586265, + "grad_norm": 0.07037489116191864, + "learning_rate": 3.045020180436847e-05, + "loss": 0.0183, + "step": 53200 + }, + { + "epoch": 0.3933207178971645, + "grad_norm": 0.08329469710588455, + "learning_rate": 3.0446492165242168e-05, + "loss": 0.0151, + "step": 53210 + }, + { + "epoch": 0.39339463646846634, + "grad_norm": 0.0837264209985733, + "learning_rate": 3.044278252611586e-05, + "loss": 0.0157, + "step": 53220 + }, + { + "epoch": 0.3934685550397682, + "grad_norm": 0.0812053307890892, + "learning_rate": 3.0439072886989556e-05, + "loss": 0.0194, + "step": 53230 + }, + { + "epoch": 0.39354247361107003, + "grad_norm": 0.09882737696170807, + "learning_rate": 3.0435363247863245e-05, + "loss": 0.0186, + "step": 53240 + }, + { + "epoch": 0.3936163921823719, + "grad_norm": 0.08482872694730759, + "learning_rate": 3.0431653608736945e-05, + "loss": 0.0196, + "step": 53250 + }, + { + "epoch": 0.3936903107536738, + "grad_norm": 0.07293414324522018, + "learning_rate": 3.042794396961064e-05, + "loss": 0.0189, + "step": 53260 + }, + { + "epoch": 0.3937642293249756, + "grad_norm": 0.08307154476642609, + "learning_rate": 3.042423433048433e-05, + "loss": 0.0218, + "step": 53270 + }, + { + "epoch": 0.39383814789627747, + "grad_norm": 0.10339733213186264, + "learning_rate": 3.0420524691358026e-05, + "loss": 0.0188, + "step": 53280 + }, + { + "epoch": 0.3939120664675793, + "grad_norm": 0.056059882044792175, + "learning_rate": 3.0416815052231722e-05, + "loss": 0.0169, + "step": 53290 + }, + { + "epoch": 0.39398598503888116, + "grad_norm": 0.08329902589321136, + "learning_rate": 3.0413105413105414e-05, + "loss": 0.0173, + "step": 53300 + }, + { + "epoch": 0.394059903610183, + "grad_norm": 0.06202562153339386, + "learning_rate": 3.040939577397911e-05, + "loss": 0.0187, + "step": 53310 + }, + { + "epoch": 0.39413382218148485, + "grad_norm": 0.07509329169988632, + "learning_rate": 3.04056861348528e-05, + "loss": 0.0164, + "step": 53320 + }, + { + "epoch": 0.39420774075278675, + "grad_norm": 0.09368224442005157, + "learning_rate": 3.04019764957265e-05, + "loss": 0.0208, + "step": 53330 + }, + { + "epoch": 0.3942816593240886, + "grad_norm": 0.06418855488300323, + "learning_rate": 3.0398266856600195e-05, + "loss": 0.0185, + "step": 53340 + }, + { + "epoch": 0.39435557789539044, + "grad_norm": 0.07906321436166763, + "learning_rate": 3.0394557217473884e-05, + "loss": 0.021, + "step": 53350 + }, + { + "epoch": 0.3944294964666923, + "grad_norm": 0.09090852737426758, + "learning_rate": 3.039084757834758e-05, + "loss": 0.018, + "step": 53360 + }, + { + "epoch": 0.39450341503799413, + "grad_norm": 0.12433502078056335, + "learning_rate": 3.0387137939221273e-05, + "loss": 0.0155, + "step": 53370 + }, + { + "epoch": 0.394577333609296, + "grad_norm": 0.08517279475927353, + "learning_rate": 3.038342830009497e-05, + "loss": 0.0201, + "step": 53380 + }, + { + "epoch": 0.3946512521805979, + "grad_norm": 0.0872577503323555, + "learning_rate": 3.0379718660968665e-05, + "loss": 0.0205, + "step": 53390 + }, + { + "epoch": 0.3947251707518997, + "grad_norm": 0.11957985907793045, + "learning_rate": 3.0376009021842354e-05, + "loss": 0.0175, + "step": 53400 + }, + { + "epoch": 0.39479908932320157, + "grad_norm": 0.07976693660020828, + "learning_rate": 3.0372299382716053e-05, + "loss": 0.0185, + "step": 53410 + }, + { + "epoch": 0.3948730078945034, + "grad_norm": 0.08289938420057297, + "learning_rate": 3.0368589743589742e-05, + "loss": 0.018, + "step": 53420 + }, + { + "epoch": 0.39494692646580526, + "grad_norm": 
0.10434596985578537, + "learning_rate": 3.0364880104463438e-05, + "loss": 0.0184, + "step": 53430 + }, + { + "epoch": 0.3950208450371071, + "grad_norm": 0.08924178779125214, + "learning_rate": 3.0361170465337134e-05, + "loss": 0.0202, + "step": 53440 + }, + { + "epoch": 0.39509476360840895, + "grad_norm": 0.05690544471144676, + "learning_rate": 3.0357460826210827e-05, + "loss": 0.0178, + "step": 53450 + }, + { + "epoch": 0.39516868217971085, + "grad_norm": 0.11244209110736847, + "learning_rate": 3.0353751187084523e-05, + "loss": 0.0216, + "step": 53460 + }, + { + "epoch": 0.3952426007510127, + "grad_norm": 0.08175085484981537, + "learning_rate": 3.0350041547958212e-05, + "loss": 0.0172, + "step": 53470 + }, + { + "epoch": 0.39531651932231454, + "grad_norm": 0.10606854408979416, + "learning_rate": 3.034633190883191e-05, + "loss": 0.0195, + "step": 53480 + }, + { + "epoch": 0.3953904378936164, + "grad_norm": 0.07251104712486267, + "learning_rate": 3.0342622269705607e-05, + "loss": 0.0247, + "step": 53490 + }, + { + "epoch": 0.39546435646491823, + "grad_norm": 0.07370023429393768, + "learning_rate": 3.0338912630579296e-05, + "loss": 0.0215, + "step": 53500 + }, + { + "epoch": 0.3955382750362201, + "grad_norm": 0.08910705149173737, + "learning_rate": 3.0335202991452992e-05, + "loss": 0.0189, + "step": 53510 + }, + { + "epoch": 0.395612193607522, + "grad_norm": 0.06901726871728897, + "learning_rate": 3.033149335232669e-05, + "loss": 0.0195, + "step": 53520 + }, + { + "epoch": 0.3956861121788238, + "grad_norm": 0.08980687707662582, + "learning_rate": 3.032778371320038e-05, + "loss": 0.0193, + "step": 53530 + }, + { + "epoch": 0.39576003075012567, + "grad_norm": 0.0794142335653305, + "learning_rate": 3.0324074074074077e-05, + "loss": 0.0198, + "step": 53540 + }, + { + "epoch": 0.3958339493214275, + "grad_norm": 0.07814609259366989, + "learning_rate": 3.0320364434947766e-05, + "loss": 0.0205, + "step": 53550 + }, + { + "epoch": 0.39590786789272936, + "grad_norm": 0.07413819432258606, + "learning_rate": 3.0316654795821465e-05, + "loss": 0.018, + "step": 53560 + }, + { + "epoch": 0.3959817864640312, + "grad_norm": 0.08249972015619278, + "learning_rate": 3.031294515669516e-05, + "loss": 0.016, + "step": 53570 + }, + { + "epoch": 0.39605570503533305, + "grad_norm": 0.08316448330879211, + "learning_rate": 3.030923551756885e-05, + "loss": 0.0181, + "step": 53580 + }, + { + "epoch": 0.39612962360663495, + "grad_norm": 0.055324457585811615, + "learning_rate": 3.0305525878442547e-05, + "loss": 0.0165, + "step": 53590 + }, + { + "epoch": 0.3962035421779368, + "grad_norm": 0.07442904263734818, + "learning_rate": 3.030181623931624e-05, + "loss": 0.0186, + "step": 53600 + }, + { + "epoch": 0.39627746074923864, + "grad_norm": 0.09970244020223618, + "learning_rate": 3.0298106600189935e-05, + "loss": 0.0171, + "step": 53610 + }, + { + "epoch": 0.3963513793205405, + "grad_norm": 0.06909671425819397, + "learning_rate": 3.029439696106363e-05, + "loss": 0.0188, + "step": 53620 + }, + { + "epoch": 0.39642529789184233, + "grad_norm": 0.08010876178741455, + "learning_rate": 3.0290687321937324e-05, + "loss": 0.0168, + "step": 53630 + }, + { + "epoch": 0.3964992164631442, + "grad_norm": 0.072038933634758, + "learning_rate": 3.028697768281102e-05, + "loss": 0.0174, + "step": 53640 + }, + { + "epoch": 0.3965731350344461, + "grad_norm": 0.04955233260989189, + "learning_rate": 3.028326804368471e-05, + "loss": 0.0167, + "step": 53650 + }, + { + "epoch": 0.3966470536057479, + "grad_norm": 0.09004979580640793, + "learning_rate": 
3.0279558404558405e-05, + "loss": 0.0192, + "step": 53660 + }, + { + "epoch": 0.39672097217704977, + "grad_norm": 0.07755633443593979, + "learning_rate": 3.02758487654321e-05, + "loss": 0.0184, + "step": 53670 + }, + { + "epoch": 0.3967948907483516, + "grad_norm": 0.06454595178365707, + "learning_rate": 3.0272139126305793e-05, + "loss": 0.0165, + "step": 53680 + }, + { + "epoch": 0.39686880931965346, + "grad_norm": 0.09947235137224197, + "learning_rate": 3.026842948717949e-05, + "loss": 0.0174, + "step": 53690 + }, + { + "epoch": 0.3969427278909553, + "grad_norm": 0.10691077262163162, + "learning_rate": 3.026471984805318e-05, + "loss": 0.0183, + "step": 53700 + }, + { + "epoch": 0.39701664646225715, + "grad_norm": 0.1857234388589859, + "learning_rate": 3.0261010208926878e-05, + "loss": 0.0171, + "step": 53710 + }, + { + "epoch": 0.39709056503355905, + "grad_norm": 0.07437913864850998, + "learning_rate": 3.0257300569800574e-05, + "loss": 0.0187, + "step": 53720 + }, + { + "epoch": 0.3971644836048609, + "grad_norm": 0.09386677294969559, + "learning_rate": 3.0253590930674263e-05, + "loss": 0.0193, + "step": 53730 + }, + { + "epoch": 0.39723840217616274, + "grad_norm": 0.06534472852945328, + "learning_rate": 3.024988129154796e-05, + "loss": 0.0173, + "step": 53740 + }, + { + "epoch": 0.3973123207474646, + "grad_norm": 0.09258140623569489, + "learning_rate": 3.0246171652421655e-05, + "loss": 0.0181, + "step": 53750 + }, + { + "epoch": 0.39738623931876643, + "grad_norm": 0.09052963554859161, + "learning_rate": 3.0242462013295348e-05, + "loss": 0.0187, + "step": 53760 + }, + { + "epoch": 0.3974601578900683, + "grad_norm": 0.06417059898376465, + "learning_rate": 3.0238752374169044e-05, + "loss": 0.0168, + "step": 53770 + }, + { + "epoch": 0.3975340764613702, + "grad_norm": 0.10106581449508667, + "learning_rate": 3.0235042735042736e-05, + "loss": 0.0211, + "step": 53780 + }, + { + "epoch": 0.397607995032672, + "grad_norm": 0.10131117701530457, + "learning_rate": 3.0231333095916432e-05, + "loss": 0.0192, + "step": 53790 + }, + { + "epoch": 0.39768191360397387, + "grad_norm": 0.1002507209777832, + "learning_rate": 3.0227623456790128e-05, + "loss": 0.0202, + "step": 53800 + }, + { + "epoch": 0.3977558321752757, + "grad_norm": 0.09136881679296494, + "learning_rate": 3.0223913817663817e-05, + "loss": 0.0182, + "step": 53810 + }, + { + "epoch": 0.39782975074657756, + "grad_norm": 0.09476058185100555, + "learning_rate": 3.0220204178537513e-05, + "loss": 0.0183, + "step": 53820 + }, + { + "epoch": 0.3979036693178794, + "grad_norm": 0.08330568671226501, + "learning_rate": 3.0216494539411206e-05, + "loss": 0.0202, + "step": 53830 + }, + { + "epoch": 0.39797758788918125, + "grad_norm": 0.06252734363079071, + "learning_rate": 3.0212784900284902e-05, + "loss": 0.0162, + "step": 53840 + }, + { + "epoch": 0.39805150646048315, + "grad_norm": 0.11372721940279007, + "learning_rate": 3.0209075261158598e-05, + "loss": 0.0187, + "step": 53850 + }, + { + "epoch": 0.398125425031785, + "grad_norm": 0.09041762351989746, + "learning_rate": 3.020536562203229e-05, + "loss": 0.0239, + "step": 53860 + }, + { + "epoch": 0.39819934360308684, + "grad_norm": 0.07595834136009216, + "learning_rate": 3.0201655982905986e-05, + "loss": 0.0175, + "step": 53870 + }, + { + "epoch": 0.3982732621743887, + "grad_norm": 0.10471068322658539, + "learning_rate": 3.0197946343779675e-05, + "loss": 0.0181, + "step": 53880 + }, + { + "epoch": 0.39834718074569053, + "grad_norm": 0.08905491232872009, + "learning_rate": 3.019423670465337e-05, + "loss": 
0.0169, + "step": 53890 + }, + { + "epoch": 0.3984210993169924, + "grad_norm": 0.10129161924123764, + "learning_rate": 3.0190527065527067e-05, + "loss": 0.0203, + "step": 53900 + }, + { + "epoch": 0.3984950178882943, + "grad_norm": 0.08781618624925613, + "learning_rate": 3.018681742640076e-05, + "loss": 0.0177, + "step": 53910 + }, + { + "epoch": 0.3985689364595961, + "grad_norm": 0.05924617126584053, + "learning_rate": 3.0183107787274456e-05, + "loss": 0.0184, + "step": 53920 + }, + { + "epoch": 0.39864285503089797, + "grad_norm": 0.10525096952915192, + "learning_rate": 3.017939814814815e-05, + "loss": 0.0204, + "step": 53930 + }, + { + "epoch": 0.3987167736021998, + "grad_norm": 0.09503110498189926, + "learning_rate": 3.0175688509021844e-05, + "loss": 0.019, + "step": 53940 + }, + { + "epoch": 0.39879069217350166, + "grad_norm": 0.0813848227262497, + "learning_rate": 3.017197886989554e-05, + "loss": 0.0194, + "step": 53950 + }, + { + "epoch": 0.3988646107448035, + "grad_norm": 0.10762012004852295, + "learning_rate": 3.016826923076923e-05, + "loss": 0.0177, + "step": 53960 + }, + { + "epoch": 0.3989385293161054, + "grad_norm": 0.13751506805419922, + "learning_rate": 3.0164559591642926e-05, + "loss": 0.0177, + "step": 53970 + }, + { + "epoch": 0.39901244788740725, + "grad_norm": 0.0912880152463913, + "learning_rate": 3.0160849952516625e-05, + "loss": 0.0198, + "step": 53980 + }, + { + "epoch": 0.3990863664587091, + "grad_norm": 0.06373404711484909, + "learning_rate": 3.0157140313390314e-05, + "loss": 0.0175, + "step": 53990 + }, + { + "epoch": 0.39916028503001094, + "grad_norm": 0.08119054138660431, + "learning_rate": 3.015343067426401e-05, + "loss": 0.0155, + "step": 54000 + }, + { + "epoch": 0.3992342036013128, + "grad_norm": 0.07526402920484543, + "learning_rate": 3.0149721035137703e-05, + "loss": 0.0175, + "step": 54010 + }, + { + "epoch": 0.39930812217261463, + "grad_norm": 0.0822368636727333, + "learning_rate": 3.01460113960114e-05, + "loss": 0.0165, + "step": 54020 + }, + { + "epoch": 0.3993820407439165, + "grad_norm": 0.07308053225278854, + "learning_rate": 3.0142301756885095e-05, + "loss": 0.0176, + "step": 54030 + }, + { + "epoch": 0.3994559593152184, + "grad_norm": 0.11768469959497452, + "learning_rate": 3.0138592117758784e-05, + "loss": 0.0206, + "step": 54040 + }, + { + "epoch": 0.3995298778865202, + "grad_norm": 0.1068728119134903, + "learning_rate": 3.013488247863248e-05, + "loss": 0.0204, + "step": 54050 + }, + { + "epoch": 0.39960379645782207, + "grad_norm": 0.08505821973085403, + "learning_rate": 3.0131172839506172e-05, + "loss": 0.0184, + "step": 54060 + }, + { + "epoch": 0.3996777150291239, + "grad_norm": 0.10482881218194962, + "learning_rate": 3.012746320037987e-05, + "loss": 0.0197, + "step": 54070 + }, + { + "epoch": 0.39975163360042576, + "grad_norm": 0.07285966724157333, + "learning_rate": 3.0123753561253564e-05, + "loss": 0.0171, + "step": 54080 + }, + { + "epoch": 0.3998255521717276, + "grad_norm": 0.0907202884554863, + "learning_rate": 3.0120043922127257e-05, + "loss": 0.0174, + "step": 54090 + }, + { + "epoch": 0.3998994707430295, + "grad_norm": 0.07394234091043472, + "learning_rate": 3.0116334283000953e-05, + "loss": 0.0185, + "step": 54100 + }, + { + "epoch": 0.39997338931433135, + "grad_norm": 0.06811315566301346, + "learning_rate": 3.0112624643874642e-05, + "loss": 0.0167, + "step": 54110 + }, + { + "epoch": 0.4000473078856332, + "grad_norm": 0.0763428807258606, + "learning_rate": 3.0108915004748338e-05, + "loss": 0.017, + "step": 54120 + }, + { + "epoch": 
0.40012122645693504, + "grad_norm": 0.07965312153100967, + "learning_rate": 3.0105205365622034e-05, + "loss": 0.0203, + "step": 54130 + }, + { + "epoch": 0.4001951450282369, + "grad_norm": 0.1002349779009819, + "learning_rate": 3.0101495726495727e-05, + "loss": 0.0171, + "step": 54140 + }, + { + "epoch": 0.40026906359953873, + "grad_norm": 0.07759466022253036, + "learning_rate": 3.0097786087369423e-05, + "loss": 0.0169, + "step": 54150 + }, + { + "epoch": 0.4003429821708406, + "grad_norm": 0.06216445192694664, + "learning_rate": 3.0094076448243115e-05, + "loss": 0.0176, + "step": 54160 + }, + { + "epoch": 0.4004169007421425, + "grad_norm": 0.07447425276041031, + "learning_rate": 3.009036680911681e-05, + "loss": 0.0164, + "step": 54170 + }, + { + "epoch": 0.4004908193134443, + "grad_norm": 0.06533452123403549, + "learning_rate": 3.0086657169990507e-05, + "loss": 0.0177, + "step": 54180 + }, + { + "epoch": 0.40056473788474617, + "grad_norm": 0.07110826671123505, + "learning_rate": 3.0082947530864196e-05, + "loss": 0.017, + "step": 54190 + }, + { + "epoch": 0.400638656456048, + "grad_norm": 0.08527481555938721, + "learning_rate": 3.0079237891737892e-05, + "loss": 0.0182, + "step": 54200 + }, + { + "epoch": 0.40071257502734986, + "grad_norm": 0.07174352556467056, + "learning_rate": 3.007552825261159e-05, + "loss": 0.0194, + "step": 54210 + }, + { + "epoch": 0.4007864935986517, + "grad_norm": 0.09062623977661133, + "learning_rate": 3.007181861348528e-05, + "loss": 0.0188, + "step": 54220 + }, + { + "epoch": 0.4008604121699536, + "grad_norm": 0.08902092278003693, + "learning_rate": 3.0068108974358977e-05, + "loss": 0.017, + "step": 54230 + }, + { + "epoch": 0.40093433074125545, + "grad_norm": 0.11530375480651855, + "learning_rate": 3.006439933523267e-05, + "loss": 0.0204, + "step": 54240 + }, + { + "epoch": 0.4010082493125573, + "grad_norm": 0.06766349822282791, + "learning_rate": 3.0060689696106365e-05, + "loss": 0.0155, + "step": 54250 + }, + { + "epoch": 0.40108216788385914, + "grad_norm": 0.07901377230882645, + "learning_rate": 3.005698005698006e-05, + "loss": 0.0159, + "step": 54260 + }, + { + "epoch": 0.401156086455161, + "grad_norm": 0.11029311269521713, + "learning_rate": 3.005327041785375e-05, + "loss": 0.0182, + "step": 54270 + }, + { + "epoch": 0.40123000502646283, + "grad_norm": 0.09365560859441757, + "learning_rate": 3.0049560778727446e-05, + "loss": 0.0207, + "step": 54280 + }, + { + "epoch": 0.4013039235977647, + "grad_norm": 0.06550610065460205, + "learning_rate": 3.004585113960114e-05, + "loss": 0.0153, + "step": 54290 + }, + { + "epoch": 0.4013778421690666, + "grad_norm": 0.06464651226997375, + "learning_rate": 3.0042141500474835e-05, + "loss": 0.0171, + "step": 54300 + }, + { + "epoch": 0.4014517607403684, + "grad_norm": 0.0901111364364624, + "learning_rate": 3.003843186134853e-05, + "loss": 0.0199, + "step": 54310 + }, + { + "epoch": 0.40152567931167027, + "grad_norm": 0.097609743475914, + "learning_rate": 3.0034722222222223e-05, + "loss": 0.0182, + "step": 54320 + }, + { + "epoch": 0.4015995978829721, + "grad_norm": 0.0751517117023468, + "learning_rate": 3.003101258309592e-05, + "loss": 0.0202, + "step": 54330 + }, + { + "epoch": 0.40167351645427396, + "grad_norm": 0.08785123378038406, + "learning_rate": 3.002730294396961e-05, + "loss": 0.0205, + "step": 54340 + }, + { + "epoch": 0.4017474350255758, + "grad_norm": 0.07688210904598236, + "learning_rate": 3.0023593304843305e-05, + "loss": 0.019, + "step": 54350 + }, + { + "epoch": 0.4018213535968777, + "grad_norm": 
0.0797678530216217, + "learning_rate": 3.0019883665717004e-05, + "loss": 0.0177, + "step": 54360 + }, + { + "epoch": 0.40189527216817955, + "grad_norm": 0.08396304398775101, + "learning_rate": 3.0016174026590693e-05, + "loss": 0.0165, + "step": 54370 + }, + { + "epoch": 0.4019691907394814, + "grad_norm": 0.07126887887716293, + "learning_rate": 3.001246438746439e-05, + "loss": 0.0161, + "step": 54380 + }, + { + "epoch": 0.40204310931078324, + "grad_norm": 0.09733361005783081, + "learning_rate": 3.000875474833808e-05, + "loss": 0.021, + "step": 54390 + }, + { + "epoch": 0.4021170278820851, + "grad_norm": 0.07474087178707123, + "learning_rate": 3.0005045109211778e-05, + "loss": 0.0205, + "step": 54400 + }, + { + "epoch": 0.40219094645338693, + "grad_norm": 0.08255986124277115, + "learning_rate": 3.0001335470085474e-05, + "loss": 0.018, + "step": 54410 + }, + { + "epoch": 0.4022648650246888, + "grad_norm": 0.0718206912279129, + "learning_rate": 2.9997625830959163e-05, + "loss": 0.0185, + "step": 54420 + }, + { + "epoch": 0.4023387835959907, + "grad_norm": 0.07734379172325134, + "learning_rate": 2.999391619183286e-05, + "loss": 0.0165, + "step": 54430 + }, + { + "epoch": 0.4024127021672925, + "grad_norm": 0.06918898969888687, + "learning_rate": 2.9990206552706558e-05, + "loss": 0.0174, + "step": 54440 + }, + { + "epoch": 0.40248662073859437, + "grad_norm": 0.08389578759670258, + "learning_rate": 2.9986496913580247e-05, + "loss": 0.0161, + "step": 54450 + }, + { + "epoch": 0.4025605393098962, + "grad_norm": 0.0872507318854332, + "learning_rate": 2.9982787274453943e-05, + "loss": 0.0177, + "step": 54460 + }, + { + "epoch": 0.40263445788119806, + "grad_norm": 0.09838316589593887, + "learning_rate": 2.9979077635327636e-05, + "loss": 0.0189, + "step": 54470 + }, + { + "epoch": 0.4027083764524999, + "grad_norm": 0.06993544846773148, + "learning_rate": 2.9975367996201332e-05, + "loss": 0.0193, + "step": 54480 + }, + { + "epoch": 0.4027822950238018, + "grad_norm": 0.07519206404685974, + "learning_rate": 2.9971658357075028e-05, + "loss": 0.018, + "step": 54490 + }, + { + "epoch": 0.40285621359510365, + "grad_norm": 0.09367363154888153, + "learning_rate": 2.9967948717948717e-05, + "loss": 0.0169, + "step": 54500 + }, + { + "epoch": 0.4029301321664055, + "grad_norm": 0.09509658068418503, + "learning_rate": 2.9964239078822416e-05, + "loss": 0.017, + "step": 54510 + }, + { + "epoch": 0.40300405073770734, + "grad_norm": 0.0789688378572464, + "learning_rate": 2.9960529439696106e-05, + "loss": 0.0172, + "step": 54520 + }, + { + "epoch": 0.4030779693090092, + "grad_norm": 0.07211098819971085, + "learning_rate": 2.99568198005698e-05, + "loss": 0.0155, + "step": 54530 + }, + { + "epoch": 0.40315188788031103, + "grad_norm": 0.0857715830206871, + "learning_rate": 2.9953110161443497e-05, + "loss": 0.0183, + "step": 54540 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 0.10314106941223145, + "learning_rate": 2.994940052231719e-05, + "loss": 0.0178, + "step": 54550 + }, + { + "epoch": 0.4032997250229148, + "grad_norm": 0.08754850178956985, + "learning_rate": 2.9945690883190886e-05, + "loss": 0.0176, + "step": 54560 + }, + { + "epoch": 0.4033736435942166, + "grad_norm": 0.10044217109680176, + "learning_rate": 2.9941981244064575e-05, + "loss": 0.0192, + "step": 54570 + }, + { + "epoch": 0.40344756216551847, + "grad_norm": 0.09112729877233505, + "learning_rate": 2.993827160493827e-05, + "loss": 0.0174, + "step": 54580 + }, + { + "epoch": 0.4035214807368203, + "grad_norm": 0.08507723361253738, + "learning_rate": 
2.993456196581197e-05, + "loss": 0.0188, + "step": 54590 + }, + { + "epoch": 0.40359539930812216, + "grad_norm": 0.08809395879507065, + "learning_rate": 2.993085232668566e-05, + "loss": 0.017, + "step": 54600 + }, + { + "epoch": 0.403669317879424, + "grad_norm": 0.09437878429889679, + "learning_rate": 2.9927142687559356e-05, + "loss": 0.0178, + "step": 54610 + }, + { + "epoch": 0.4037432364507259, + "grad_norm": 0.09309881180524826, + "learning_rate": 2.9923433048433048e-05, + "loss": 0.0197, + "step": 54620 + }, + { + "epoch": 0.40381715502202775, + "grad_norm": 0.07976792752742767, + "learning_rate": 2.9919723409306744e-05, + "loss": 0.0184, + "step": 54630 + }, + { + "epoch": 0.4038910735933296, + "grad_norm": 0.05811803787946701, + "learning_rate": 2.991601377018044e-05, + "loss": 0.0184, + "step": 54640 + }, + { + "epoch": 0.40396499216463144, + "grad_norm": 0.07902861386537552, + "learning_rate": 2.991230413105413e-05, + "loss": 0.0185, + "step": 54650 + }, + { + "epoch": 0.4040389107359333, + "grad_norm": 0.0864984542131424, + "learning_rate": 2.990859449192783e-05, + "loss": 0.0174, + "step": 54660 + }, + { + "epoch": 0.40411282930723513, + "grad_norm": 0.08877560496330261, + "learning_rate": 2.9904884852801525e-05, + "loss": 0.0161, + "step": 54670 + }, + { + "epoch": 0.404186747878537, + "grad_norm": 0.10576141625642776, + "learning_rate": 2.9901175213675214e-05, + "loss": 0.018, + "step": 54680 + }, + { + "epoch": 0.4042606664498389, + "grad_norm": 0.10026395320892334, + "learning_rate": 2.989746557454891e-05, + "loss": 0.0186, + "step": 54690 + }, + { + "epoch": 0.4043345850211407, + "grad_norm": 0.06324445456266403, + "learning_rate": 2.9893755935422602e-05, + "loss": 0.0189, + "step": 54700 + }, + { + "epoch": 0.40440850359244257, + "grad_norm": 0.09299612790346146, + "learning_rate": 2.98900462962963e-05, + "loss": 0.0184, + "step": 54710 + }, + { + "epoch": 0.4044824221637444, + "grad_norm": 0.07469546794891357, + "learning_rate": 2.9886336657169994e-05, + "loss": 0.0172, + "step": 54720 + }, + { + "epoch": 0.40455634073504626, + "grad_norm": 0.11556144058704376, + "learning_rate": 2.9882627018043684e-05, + "loss": 0.022, + "step": 54730 + }, + { + "epoch": 0.4046302593063481, + "grad_norm": 0.08120883256196976, + "learning_rate": 2.9878917378917383e-05, + "loss": 0.0181, + "step": 54740 + }, + { + "epoch": 0.40470417787765, + "grad_norm": 0.08154403418302536, + "learning_rate": 2.9875207739791072e-05, + "loss": 0.0176, + "step": 54750 + }, + { + "epoch": 0.40477809644895185, + "grad_norm": 0.09956072270870209, + "learning_rate": 2.9871498100664768e-05, + "loss": 0.0165, + "step": 54760 + }, + { + "epoch": 0.4048520150202537, + "grad_norm": 0.09262163192033768, + "learning_rate": 2.9867788461538464e-05, + "loss": 0.0161, + "step": 54770 + }, + { + "epoch": 0.40492593359155554, + "grad_norm": 0.09680727869272232, + "learning_rate": 2.9864078822412157e-05, + "loss": 0.0196, + "step": 54780 + }, + { + "epoch": 0.4049998521628574, + "grad_norm": 0.0783548429608345, + "learning_rate": 2.9860369183285853e-05, + "loss": 0.0208, + "step": 54790 + }, + { + "epoch": 0.40507377073415923, + "grad_norm": 0.06337272375822067, + "learning_rate": 2.9856659544159542e-05, + "loss": 0.0171, + "step": 54800 + }, + { + "epoch": 0.4051476893054611, + "grad_norm": 0.13881878554821014, + "learning_rate": 2.985294990503324e-05, + "loss": 0.0201, + "step": 54810 + }, + { + "epoch": 0.405221607876763, + "grad_norm": 0.0798603743314743, + "learning_rate": 2.9849240265906937e-05, + "loss": 0.0203, + 
"step": 54820 + }, + { + "epoch": 0.4052955264480648, + "grad_norm": 0.13624916970729828, + "learning_rate": 2.9845530626780626e-05, + "loss": 0.0183, + "step": 54830 + }, + { + "epoch": 0.40536944501936667, + "grad_norm": 0.09620372951030731, + "learning_rate": 2.9841820987654322e-05, + "loss": 0.0171, + "step": 54840 + }, + { + "epoch": 0.4054433635906685, + "grad_norm": 0.15520921349525452, + "learning_rate": 2.9838111348528015e-05, + "loss": 0.0192, + "step": 54850 + }, + { + "epoch": 0.40551728216197036, + "grad_norm": 0.06886200606822968, + "learning_rate": 2.983440170940171e-05, + "loss": 0.018, + "step": 54860 + }, + { + "epoch": 0.4055912007332722, + "grad_norm": 0.07065249234437943, + "learning_rate": 2.9830692070275407e-05, + "loss": 0.0208, + "step": 54870 + }, + { + "epoch": 0.4056651193045741, + "grad_norm": 0.09980648010969162, + "learning_rate": 2.9826982431149096e-05, + "loss": 0.0208, + "step": 54880 + }, + { + "epoch": 0.40573903787587595, + "grad_norm": 0.07251753658056259, + "learning_rate": 2.9823272792022795e-05, + "loss": 0.018, + "step": 54890 + }, + { + "epoch": 0.4058129564471778, + "grad_norm": 0.08873682469129562, + "learning_rate": 2.981956315289649e-05, + "loss": 0.0193, + "step": 54900 + }, + { + "epoch": 0.40588687501847964, + "grad_norm": 0.09693753719329834, + "learning_rate": 2.981585351377018e-05, + "loss": 0.0185, + "step": 54910 + }, + { + "epoch": 0.4059607935897815, + "grad_norm": 0.08978056907653809, + "learning_rate": 2.9812143874643876e-05, + "loss": 0.0185, + "step": 54920 + }, + { + "epoch": 0.40603471216108333, + "grad_norm": 0.0747421607375145, + "learning_rate": 2.980843423551757e-05, + "loss": 0.0184, + "step": 54930 + }, + { + "epoch": 0.4061086307323852, + "grad_norm": 0.11142931133508682, + "learning_rate": 2.9804724596391265e-05, + "loss": 0.0195, + "step": 54940 + }, + { + "epoch": 0.4061825493036871, + "grad_norm": 0.07661478221416473, + "learning_rate": 2.980101495726496e-05, + "loss": 0.0201, + "step": 54950 + }, + { + "epoch": 0.4062564678749889, + "grad_norm": 0.04711652547121048, + "learning_rate": 2.9797305318138654e-05, + "loss": 0.0165, + "step": 54960 + }, + { + "epoch": 0.40633038644629077, + "grad_norm": 0.11913223564624786, + "learning_rate": 2.979359567901235e-05, + "loss": 0.0183, + "step": 54970 + }, + { + "epoch": 0.4064043050175926, + "grad_norm": 0.08878781646490097, + "learning_rate": 2.978988603988604e-05, + "loss": 0.0164, + "step": 54980 + }, + { + "epoch": 0.40647822358889446, + "grad_norm": 0.07752066105604172, + "learning_rate": 2.9786176400759735e-05, + "loss": 0.0175, + "step": 54990 + }, + { + "epoch": 0.4065521421601963, + "grad_norm": 0.08861127495765686, + "learning_rate": 2.978246676163343e-05, + "loss": 0.02, + "step": 55000 + }, + { + "epoch": 0.4066260607314982, + "grad_norm": 0.0983710065484047, + "learning_rate": 2.9778757122507123e-05, + "loss": 0.0206, + "step": 55010 + }, + { + "epoch": 0.40669997930280005, + "grad_norm": 0.10089907050132751, + "learning_rate": 2.977504748338082e-05, + "loss": 0.0187, + "step": 55020 + }, + { + "epoch": 0.4067738978741019, + "grad_norm": 0.0748949870467186, + "learning_rate": 2.977133784425451e-05, + "loss": 0.0198, + "step": 55030 + }, + { + "epoch": 0.40684781644540374, + "grad_norm": 0.09311806410551071, + "learning_rate": 2.9767628205128208e-05, + "loss": 0.0178, + "step": 55040 + }, + { + "epoch": 0.4069217350167056, + "grad_norm": 0.09688904881477356, + "learning_rate": 2.9763918566001904e-05, + "loss": 0.0174, + "step": 55050 + }, + { + "epoch": 
0.40699565358800743, + "grad_norm": 0.0686623826622963, + "learning_rate": 2.9760208926875593e-05, + "loss": 0.0165, + "step": 55060 + }, + { + "epoch": 0.4070695721593093, + "grad_norm": 0.08978747576475143, + "learning_rate": 2.975649928774929e-05, + "loss": 0.015, + "step": 55070 + }, + { + "epoch": 0.4071434907306112, + "grad_norm": 0.0859551951289177, + "learning_rate": 2.975278964862298e-05, + "loss": 0.0168, + "step": 55080 + }, + { + "epoch": 0.407217409301913, + "grad_norm": 0.06242772564291954, + "learning_rate": 2.9749080009496677e-05, + "loss": 0.0171, + "step": 55090 + }, + { + "epoch": 0.40729132787321487, + "grad_norm": 0.06802783161401749, + "learning_rate": 2.9745370370370373e-05, + "loss": 0.0168, + "step": 55100 + }, + { + "epoch": 0.4073652464445167, + "grad_norm": 0.06960264593362808, + "learning_rate": 2.9741660731244066e-05, + "loss": 0.0172, + "step": 55110 + }, + { + "epoch": 0.40743916501581856, + "grad_norm": 0.07149052619934082, + "learning_rate": 2.9737951092117762e-05, + "loss": 0.0189, + "step": 55120 + }, + { + "epoch": 0.4075130835871204, + "grad_norm": 0.0835036188364029, + "learning_rate": 2.9734241452991458e-05, + "loss": 0.0165, + "step": 55130 + }, + { + "epoch": 0.4075870021584223, + "grad_norm": 0.09829945862293243, + "learning_rate": 2.9730531813865147e-05, + "loss": 0.0189, + "step": 55140 + }, + { + "epoch": 0.40766092072972415, + "grad_norm": 0.2645808458328247, + "learning_rate": 2.9726822174738843e-05, + "loss": 0.0192, + "step": 55150 + }, + { + "epoch": 0.407734839301026, + "grad_norm": 0.08904475718736649, + "learning_rate": 2.9723112535612536e-05, + "loss": 0.0181, + "step": 55160 + }, + { + "epoch": 0.40780875787232784, + "grad_norm": 0.0872875228524208, + "learning_rate": 2.971940289648623e-05, + "loss": 0.0202, + "step": 55170 + }, + { + "epoch": 0.4078826764436297, + "grad_norm": 0.0831475630402565, + "learning_rate": 2.9715693257359928e-05, + "loss": 0.0196, + "step": 55180 + }, + { + "epoch": 0.40795659501493153, + "grad_norm": 0.0917244628071785, + "learning_rate": 2.971198361823362e-05, + "loss": 0.0208, + "step": 55190 + }, + { + "epoch": 0.4080305135862334, + "grad_norm": 0.07903681695461273, + "learning_rate": 2.9708273979107316e-05, + "loss": 0.0183, + "step": 55200 + }, + { + "epoch": 0.4081044321575353, + "grad_norm": 0.0862165316939354, + "learning_rate": 2.9704564339981005e-05, + "loss": 0.0189, + "step": 55210 + }, + { + "epoch": 0.4081783507288371, + "grad_norm": 0.09082507342100143, + "learning_rate": 2.97008547008547e-05, + "loss": 0.0176, + "step": 55220 + }, + { + "epoch": 0.40825226930013897, + "grad_norm": 0.07527061551809311, + "learning_rate": 2.9697145061728397e-05, + "loss": 0.0149, + "step": 55230 + }, + { + "epoch": 0.4083261878714408, + "grad_norm": 0.11855072528123856, + "learning_rate": 2.969343542260209e-05, + "loss": 0.02, + "step": 55240 + }, + { + "epoch": 0.40840010644274266, + "grad_norm": 0.07656430453062057, + "learning_rate": 2.9689725783475786e-05, + "loss": 0.0188, + "step": 55250 + }, + { + "epoch": 0.4084740250140445, + "grad_norm": 0.11509408801794052, + "learning_rate": 2.968601614434948e-05, + "loss": 0.0196, + "step": 55260 + }, + { + "epoch": 0.4085479435853464, + "grad_norm": 0.0942547544836998, + "learning_rate": 2.9682306505223174e-05, + "loss": 0.0181, + "step": 55270 + }, + { + "epoch": 0.40862186215664825, + "grad_norm": 0.09625694900751114, + "learning_rate": 2.967859686609687e-05, + "loss": 0.0189, + "step": 55280 + }, + { + "epoch": 0.4086957807279501, + "grad_norm": 
0.07003076374530792, + "learning_rate": 2.967488722697056e-05, + "loss": 0.0166, + "step": 55290 + }, + { + "epoch": 0.40876969929925194, + "grad_norm": 0.091498002409935, + "learning_rate": 2.9671177587844255e-05, + "loss": 0.0187, + "step": 55300 + }, + { + "epoch": 0.4088436178705538, + "grad_norm": 0.11632296442985535, + "learning_rate": 2.9667467948717948e-05, + "loss": 0.0199, + "step": 55310 + }, + { + "epoch": 0.40891753644185563, + "grad_norm": 0.0970294401049614, + "learning_rate": 2.9663758309591644e-05, + "loss": 0.0198, + "step": 55320 + }, + { + "epoch": 0.4089914550131575, + "grad_norm": 0.11141879856586456, + "learning_rate": 2.966004867046534e-05, + "loss": 0.017, + "step": 55330 + }, + { + "epoch": 0.4090653735844594, + "grad_norm": 0.07932562381029129, + "learning_rate": 2.9656339031339033e-05, + "loss": 0.0171, + "step": 55340 + }, + { + "epoch": 0.4091392921557612, + "grad_norm": 0.07699783146381378, + "learning_rate": 2.965262939221273e-05, + "loss": 0.0161, + "step": 55350 + }, + { + "epoch": 0.40921321072706307, + "grad_norm": 0.09021428227424622, + "learning_rate": 2.9648919753086424e-05, + "loss": 0.017, + "step": 55360 + }, + { + "epoch": 0.4092871292983649, + "grad_norm": 0.09824664145708084, + "learning_rate": 2.9645210113960114e-05, + "loss": 0.0192, + "step": 55370 + }, + { + "epoch": 0.40936104786966676, + "grad_norm": 0.0917426124215126, + "learning_rate": 2.964150047483381e-05, + "loss": 0.0203, + "step": 55380 + }, + { + "epoch": 0.4094349664409686, + "grad_norm": 0.0881839171051979, + "learning_rate": 2.9637790835707502e-05, + "loss": 0.0191, + "step": 55390 + }, + { + "epoch": 0.4095088850122705, + "grad_norm": 0.07064758241176605, + "learning_rate": 2.9634081196581198e-05, + "loss": 0.0179, + "step": 55400 + }, + { + "epoch": 0.40958280358357235, + "grad_norm": 0.07534152269363403, + "learning_rate": 2.9630371557454894e-05, + "loss": 0.0177, + "step": 55410 + }, + { + "epoch": 0.4096567221548742, + "grad_norm": 0.09132669866085052, + "learning_rate": 2.9626661918328587e-05, + "loss": 0.0168, + "step": 55420 + }, + { + "epoch": 0.40973064072617604, + "grad_norm": 0.08473557233810425, + "learning_rate": 2.9622952279202283e-05, + "loss": 0.0192, + "step": 55430 + }, + { + "epoch": 0.4098045592974779, + "grad_norm": 0.07497388124465942, + "learning_rate": 2.9619242640075972e-05, + "loss": 0.018, + "step": 55440 + }, + { + "epoch": 0.40987847786877973, + "grad_norm": 0.07634835690259933, + "learning_rate": 2.9615533000949668e-05, + "loss": 0.0169, + "step": 55450 + }, + { + "epoch": 0.4099523964400816, + "grad_norm": 0.1072976365685463, + "learning_rate": 2.9611823361823364e-05, + "loss": 0.0183, + "step": 55460 + }, + { + "epoch": 0.4100263150113835, + "grad_norm": 0.08737269788980484, + "learning_rate": 2.9608113722697056e-05, + "loss": 0.0168, + "step": 55470 + }, + { + "epoch": 0.4101002335826853, + "grad_norm": 0.08957663178443909, + "learning_rate": 2.9604404083570752e-05, + "loss": 0.0191, + "step": 55480 + }, + { + "epoch": 0.41017415215398717, + "grad_norm": 0.0685604140162468, + "learning_rate": 2.9600694444444445e-05, + "loss": 0.0159, + "step": 55490 + }, + { + "epoch": 0.410248070725289, + "grad_norm": 0.08200903236865997, + "learning_rate": 2.959698480531814e-05, + "loss": 0.0187, + "step": 55500 + }, + { + "epoch": 0.41032198929659086, + "grad_norm": 0.08628015220165253, + "learning_rate": 2.9593275166191837e-05, + "loss": 0.0196, + "step": 55510 + }, + { + "epoch": 0.4103959078678927, + "grad_norm": 0.0769340842962265, + "learning_rate": 
2.9589565527065526e-05, + "loss": 0.0189, + "step": 55520 + }, + { + "epoch": 0.4104698264391946, + "grad_norm": 0.08223969489336014, + "learning_rate": 2.9585855887939222e-05, + "loss": 0.0169, + "step": 55530 + }, + { + "epoch": 0.41054374501049645, + "grad_norm": 0.0858936756849289, + "learning_rate": 2.9582146248812915e-05, + "loss": 0.0185, + "step": 55540 + }, + { + "epoch": 0.4106176635817983, + "grad_norm": 0.0734199583530426, + "learning_rate": 2.957843660968661e-05, + "loss": 0.019, + "step": 55550 + }, + { + "epoch": 0.41069158215310014, + "grad_norm": 0.08149746805429459, + "learning_rate": 2.9574726970560307e-05, + "loss": 0.0144, + "step": 55560 + }, + { + "epoch": 0.410765500724402, + "grad_norm": 0.1099155992269516, + "learning_rate": 2.9571017331434e-05, + "loss": 0.0166, + "step": 55570 + }, + { + "epoch": 0.41083941929570383, + "grad_norm": 0.11669523268938065, + "learning_rate": 2.9567307692307695e-05, + "loss": 0.018, + "step": 55580 + }, + { + "epoch": 0.4109133378670057, + "grad_norm": 0.11657500267028809, + "learning_rate": 2.956359805318139e-05, + "loss": 0.0214, + "step": 55590 + }, + { + "epoch": 0.4109872564383076, + "grad_norm": 0.08685796707868576, + "learning_rate": 2.955988841405508e-05, + "loss": 0.0181, + "step": 55600 + }, + { + "epoch": 0.4110611750096094, + "grad_norm": 0.06064879149198532, + "learning_rate": 2.9556178774928776e-05, + "loss": 0.0199, + "step": 55610 + }, + { + "epoch": 0.41113509358091127, + "grad_norm": 0.0598427839577198, + "learning_rate": 2.955246913580247e-05, + "loss": 0.0181, + "step": 55620 + }, + { + "epoch": 0.4112090121522131, + "grad_norm": 0.06348814815282822, + "learning_rate": 2.9548759496676165e-05, + "loss": 0.0181, + "step": 55630 + }, + { + "epoch": 0.41128293072351496, + "grad_norm": 0.07413514703512192, + "learning_rate": 2.954504985754986e-05, + "loss": 0.0192, + "step": 55640 + }, + { + "epoch": 0.4113568492948168, + "grad_norm": 0.0760175809264183, + "learning_rate": 2.9541340218423553e-05, + "loss": 0.0164, + "step": 55650 + }, + { + "epoch": 0.4114307678661187, + "grad_norm": 0.05769358202815056, + "learning_rate": 2.953763057929725e-05, + "loss": 0.0184, + "step": 55660 + }, + { + "epoch": 0.41150468643742055, + "grad_norm": 0.08280821144580841, + "learning_rate": 2.953392094017094e-05, + "loss": 0.0168, + "step": 55670 + }, + { + "epoch": 0.4115786050087224, + "grad_norm": 0.07674267143011093, + "learning_rate": 2.9530211301044634e-05, + "loss": 0.0162, + "step": 55680 + }, + { + "epoch": 0.41165252358002424, + "grad_norm": 0.1518862247467041, + "learning_rate": 2.9526501661918334e-05, + "loss": 0.0197, + "step": 55690 + }, + { + "epoch": 0.4117264421513261, + "grad_norm": 0.048735495656728745, + "learning_rate": 2.9522792022792023e-05, + "loss": 0.0181, + "step": 55700 + }, + { + "epoch": 0.41180036072262793, + "grad_norm": 0.10562967509031296, + "learning_rate": 2.951908238366572e-05, + "loss": 0.0164, + "step": 55710 + }, + { + "epoch": 0.4118742792939298, + "grad_norm": 0.1096380278468132, + "learning_rate": 2.951537274453941e-05, + "loss": 0.0205, + "step": 55720 + }, + { + "epoch": 0.4119481978652317, + "grad_norm": 0.07708892971277237, + "learning_rate": 2.9511663105413107e-05, + "loss": 0.0177, + "step": 55730 + }, + { + "epoch": 0.4120221164365335, + "grad_norm": 0.10286412388086319, + "learning_rate": 2.9507953466286803e-05, + "loss": 0.0177, + "step": 55740 + }, + { + "epoch": 0.41209603500783537, + "grad_norm": 0.06790533661842346, + "learning_rate": 2.9504243827160493e-05, + "loss": 0.0171, + 
"step": 55750 + }, + { + "epoch": 0.4121699535791372, + "grad_norm": 0.08822519332170486, + "learning_rate": 2.950053418803419e-05, + "loss": 0.0166, + "step": 55760 + }, + { + "epoch": 0.41224387215043906, + "grad_norm": 0.08776558935642242, + "learning_rate": 2.949682454890788e-05, + "loss": 0.018, + "step": 55770 + }, + { + "epoch": 0.4123177907217409, + "grad_norm": 0.12816807627677917, + "learning_rate": 2.9493114909781577e-05, + "loss": 0.0217, + "step": 55780 + }, + { + "epoch": 0.4123917092930428, + "grad_norm": 0.07932336628437042, + "learning_rate": 2.9489405270655273e-05, + "loss": 0.0166, + "step": 55790 + }, + { + "epoch": 0.41246562786434465, + "grad_norm": 0.0879678800702095, + "learning_rate": 2.9485695631528966e-05, + "loss": 0.0181, + "step": 55800 + }, + { + "epoch": 0.4125395464356465, + "grad_norm": 0.20386385917663574, + "learning_rate": 2.948198599240266e-05, + "loss": 0.0167, + "step": 55810 + }, + { + "epoch": 0.41261346500694834, + "grad_norm": 0.1288457214832306, + "learning_rate": 2.9478276353276358e-05, + "loss": 0.0188, + "step": 55820 + }, + { + "epoch": 0.4126873835782502, + "grad_norm": 0.06845947355031967, + "learning_rate": 2.9474566714150047e-05, + "loss": 0.02, + "step": 55830 + }, + { + "epoch": 0.41276130214955203, + "grad_norm": 0.08572788536548615, + "learning_rate": 2.9470857075023746e-05, + "loss": 0.0172, + "step": 55840 + }, + { + "epoch": 0.41283522072085393, + "grad_norm": 0.0841023325920105, + "learning_rate": 2.9467147435897435e-05, + "loss": 0.0196, + "step": 55850 + }, + { + "epoch": 0.4129091392921558, + "grad_norm": 0.0607466846704483, + "learning_rate": 2.946343779677113e-05, + "loss": 0.0181, + "step": 55860 + }, + { + "epoch": 0.4129830578634576, + "grad_norm": 0.10207097232341766, + "learning_rate": 2.9459728157644827e-05, + "loss": 0.0186, + "step": 55870 + }, + { + "epoch": 0.41305697643475947, + "grad_norm": 0.08442854881286621, + "learning_rate": 2.945601851851852e-05, + "loss": 0.0156, + "step": 55880 + }, + { + "epoch": 0.4131308950060613, + "grad_norm": 0.06380892544984818, + "learning_rate": 2.9452308879392216e-05, + "loss": 0.0185, + "step": 55890 + }, + { + "epoch": 0.41320481357736316, + "grad_norm": 0.07364775240421295, + "learning_rate": 2.9448599240265905e-05, + "loss": 0.0184, + "step": 55900 + }, + { + "epoch": 0.413278732148665, + "grad_norm": 0.04778318852186203, + "learning_rate": 2.94448896011396e-05, + "loss": 0.0153, + "step": 55910 + }, + { + "epoch": 0.4133526507199669, + "grad_norm": 0.10692009329795837, + "learning_rate": 2.94411799620133e-05, + "loss": 0.0191, + "step": 55920 + }, + { + "epoch": 0.41342656929126875, + "grad_norm": 0.10312798619270325, + "learning_rate": 2.943747032288699e-05, + "loss": 0.0179, + "step": 55930 + }, + { + "epoch": 0.4135004878625706, + "grad_norm": 0.07171052694320679, + "learning_rate": 2.9433760683760685e-05, + "loss": 0.0219, + "step": 55940 + }, + { + "epoch": 0.41357440643387244, + "grad_norm": 0.09593594819307327, + "learning_rate": 2.9430051044634378e-05, + "loss": 0.0183, + "step": 55950 + }, + { + "epoch": 0.4136483250051743, + "grad_norm": 0.0906202644109726, + "learning_rate": 2.9426341405508074e-05, + "loss": 0.0154, + "step": 55960 + }, + { + "epoch": 0.41372224357647613, + "grad_norm": 0.06475282460451126, + "learning_rate": 2.942263176638177e-05, + "loss": 0.0177, + "step": 55970 + }, + { + "epoch": 0.41379616214777803, + "grad_norm": 0.08594939112663269, + "learning_rate": 2.941892212725546e-05, + "loss": 0.0175, + "step": 55980 + }, + { + "epoch": 
0.4138700807190799, + "grad_norm": 0.0895848423242569, + "learning_rate": 2.941521248812916e-05, + "loss": 0.0176, + "step": 55990 + }, + { + "epoch": 0.4139439992903817, + "grad_norm": 0.07915901392698288, + "learning_rate": 2.9411502849002848e-05, + "loss": 0.0164, + "step": 56000 + }, + { + "epoch": 0.41401791786168357, + "grad_norm": 0.07280030101537704, + "learning_rate": 2.9407793209876544e-05, + "loss": 0.0181, + "step": 56010 + }, + { + "epoch": 0.4140918364329854, + "grad_norm": 0.09190954267978668, + "learning_rate": 2.940408357075024e-05, + "loss": 0.0181, + "step": 56020 + }, + { + "epoch": 0.41416575500428726, + "grad_norm": 0.1099943295121193, + "learning_rate": 2.9400373931623932e-05, + "loss": 0.019, + "step": 56030 + }, + { + "epoch": 0.4142396735755891, + "grad_norm": 0.10087905079126358, + "learning_rate": 2.9396664292497628e-05, + "loss": 0.0195, + "step": 56040 + }, + { + "epoch": 0.414313592146891, + "grad_norm": 0.061354316771030426, + "learning_rate": 2.9392954653371324e-05, + "loss": 0.0173, + "step": 56050 + }, + { + "epoch": 0.41438751071819285, + "grad_norm": 0.08477345108985901, + "learning_rate": 2.9389245014245013e-05, + "loss": 0.0156, + "step": 56060 + }, + { + "epoch": 0.4144614292894947, + "grad_norm": 0.08678663522005081, + "learning_rate": 2.9385535375118713e-05, + "loss": 0.0182, + "step": 56070 + }, + { + "epoch": 0.41453534786079654, + "grad_norm": 0.07783781737089157, + "learning_rate": 2.9381825735992402e-05, + "loss": 0.0168, + "step": 56080 + }, + { + "epoch": 0.4146092664320984, + "grad_norm": 0.11021009087562561, + "learning_rate": 2.9378116096866098e-05, + "loss": 0.0183, + "step": 56090 + }, + { + "epoch": 0.41468318500340023, + "grad_norm": 0.0836622565984726, + "learning_rate": 2.9374406457739794e-05, + "loss": 0.0159, + "step": 56100 + }, + { + "epoch": 0.41475710357470214, + "grad_norm": 0.1709643006324768, + "learning_rate": 2.9370696818613486e-05, + "loss": 0.0182, + "step": 56110 + }, + { + "epoch": 0.414831022146004, + "grad_norm": 0.12158837914466858, + "learning_rate": 2.9366987179487182e-05, + "loss": 0.0176, + "step": 56120 + }, + { + "epoch": 0.4149049407173058, + "grad_norm": 0.08476385474205017, + "learning_rate": 2.936327754036087e-05, + "loss": 0.0159, + "step": 56130 + }, + { + "epoch": 0.41497885928860767, + "grad_norm": 0.0901803970336914, + "learning_rate": 2.9359567901234568e-05, + "loss": 0.0183, + "step": 56140 + }, + { + "epoch": 0.4150527778599095, + "grad_norm": 0.07867557555437088, + "learning_rate": 2.9355858262108267e-05, + "loss": 0.0183, + "step": 56150 + }, + { + "epoch": 0.41512669643121136, + "grad_norm": 0.106829933822155, + "learning_rate": 2.9352148622981956e-05, + "loss": 0.0178, + "step": 56160 + }, + { + "epoch": 0.4152006150025132, + "grad_norm": 0.07354728132486343, + "learning_rate": 2.9348438983855652e-05, + "loss": 0.0176, + "step": 56170 + }, + { + "epoch": 0.4152745335738151, + "grad_norm": 0.11522946506738663, + "learning_rate": 2.9344729344729345e-05, + "loss": 0.0177, + "step": 56180 + }, + { + "epoch": 0.41534845214511695, + "grad_norm": 0.09632186591625214, + "learning_rate": 2.934101970560304e-05, + "loss": 0.0218, + "step": 56190 + }, + { + "epoch": 0.4154223707164188, + "grad_norm": 0.06593929976224899, + "learning_rate": 2.9337310066476737e-05, + "loss": 0.0169, + "step": 56200 + }, + { + "epoch": 0.41549628928772064, + "grad_norm": 0.08895174413919449, + "learning_rate": 2.9333600427350426e-05, + "loss": 0.0193, + "step": 56210 + }, + { + "epoch": 0.4155702078590225, + "grad_norm": 
0.0696493610739708, + "learning_rate": 2.9329890788224125e-05, + "loss": 0.0195, + "step": 56220 + }, + { + "epoch": 0.41564412643032433, + "grad_norm": 0.08290284126996994, + "learning_rate": 2.9326181149097814e-05, + "loss": 0.0204, + "step": 56230 + }, + { + "epoch": 0.41571804500162624, + "grad_norm": 0.09379585832357407, + "learning_rate": 2.932247150997151e-05, + "loss": 0.0198, + "step": 56240 + }, + { + "epoch": 0.4157919635729281, + "grad_norm": 0.08410067856311798, + "learning_rate": 2.9318761870845206e-05, + "loss": 0.0182, + "step": 56250 + }, + { + "epoch": 0.4158658821442299, + "grad_norm": 0.09510476887226105, + "learning_rate": 2.93150522317189e-05, + "loss": 0.0154, + "step": 56260 + }, + { + "epoch": 0.41593980071553177, + "grad_norm": 0.0716487243771553, + "learning_rate": 2.9311342592592595e-05, + "loss": 0.0163, + "step": 56270 + }, + { + "epoch": 0.4160137192868336, + "grad_norm": 0.08590196073055267, + "learning_rate": 2.930763295346629e-05, + "loss": 0.0196, + "step": 56280 + }, + { + "epoch": 0.41608763785813546, + "grad_norm": 0.09150087088346481, + "learning_rate": 2.930392331433998e-05, + "loss": 0.02, + "step": 56290 + }, + { + "epoch": 0.4161615564294373, + "grad_norm": 0.07709812372922897, + "learning_rate": 2.930021367521368e-05, + "loss": 0.0184, + "step": 56300 + }, + { + "epoch": 0.4162354750007392, + "grad_norm": 0.05744616687297821, + "learning_rate": 2.929650403608737e-05, + "loss": 0.0174, + "step": 56310 + }, + { + "epoch": 0.41630939357204105, + "grad_norm": 0.08779750764369965, + "learning_rate": 2.9292794396961064e-05, + "loss": 0.0163, + "step": 56320 + }, + { + "epoch": 0.4163833121433429, + "grad_norm": 0.08599414676427841, + "learning_rate": 2.928908475783476e-05, + "loss": 0.0161, + "step": 56330 + }, + { + "epoch": 0.41645723071464474, + "grad_norm": 0.06551361083984375, + "learning_rate": 2.9285375118708453e-05, + "loss": 0.0159, + "step": 56340 + }, + { + "epoch": 0.4165311492859466, + "grad_norm": 0.0936962142586708, + "learning_rate": 2.928166547958215e-05, + "loss": 0.0186, + "step": 56350 + }, + { + "epoch": 0.41660506785724843, + "grad_norm": 0.09517485648393631, + "learning_rate": 2.9277955840455838e-05, + "loss": 0.0186, + "step": 56360 + }, + { + "epoch": 0.41667898642855034, + "grad_norm": 0.08648562431335449, + "learning_rate": 2.9274246201329538e-05, + "loss": 0.018, + "step": 56370 + }, + { + "epoch": 0.4167529049998522, + "grad_norm": 0.08034750819206238, + "learning_rate": 2.9270536562203234e-05, + "loss": 0.0165, + "step": 56380 + }, + { + "epoch": 0.416826823571154, + "grad_norm": 0.07642500102519989, + "learning_rate": 2.9266826923076923e-05, + "loss": 0.018, + "step": 56390 + }, + { + "epoch": 0.41690074214245587, + "grad_norm": 0.08972880244255066, + "learning_rate": 2.926311728395062e-05, + "loss": 0.018, + "step": 56400 + }, + { + "epoch": 0.4169746607137577, + "grad_norm": 0.07712242007255554, + "learning_rate": 2.925940764482431e-05, + "loss": 0.0184, + "step": 56410 + }, + { + "epoch": 0.41704857928505956, + "grad_norm": 0.08815699815750122, + "learning_rate": 2.9255698005698007e-05, + "loss": 0.0193, + "step": 56420 + }, + { + "epoch": 0.4171224978563614, + "grad_norm": 0.08716235309839249, + "learning_rate": 2.9251988366571703e-05, + "loss": 0.0204, + "step": 56430 + }, + { + "epoch": 0.4171964164276633, + "grad_norm": 0.10045552253723145, + "learning_rate": 2.9248278727445392e-05, + "loss": 0.0182, + "step": 56440 + }, + { + "epoch": 0.41727033499896515, + "grad_norm": 0.0753190815448761, + "learning_rate": 
2.9244569088319092e-05, + "loss": 0.0201, + "step": 56450 + }, + { + "epoch": 0.417344253570267, + "grad_norm": 0.07696306705474854, + "learning_rate": 2.924085944919278e-05, + "loss": 0.0177, + "step": 56460 + }, + { + "epoch": 0.41741817214156884, + "grad_norm": 0.08237896114587784, + "learning_rate": 2.9237149810066477e-05, + "loss": 0.0206, + "step": 56470 + }, + { + "epoch": 0.4174920907128707, + "grad_norm": 0.06589295715093613, + "learning_rate": 2.9233440170940173e-05, + "loss": 0.0219, + "step": 56480 + }, + { + "epoch": 0.41756600928417253, + "grad_norm": 0.059493862092494965, + "learning_rate": 2.9229730531813865e-05, + "loss": 0.0196, + "step": 56490 + }, + { + "epoch": 0.41763992785547444, + "grad_norm": 0.07635711878538132, + "learning_rate": 2.922602089268756e-05, + "loss": 0.019, + "step": 56500 + }, + { + "epoch": 0.4177138464267763, + "grad_norm": 0.07192423194646835, + "learning_rate": 2.9222311253561257e-05, + "loss": 0.0174, + "step": 56510 + }, + { + "epoch": 0.4177877649980781, + "grad_norm": 0.1241598054766655, + "learning_rate": 2.921860161443495e-05, + "loss": 0.0187, + "step": 56520 + }, + { + "epoch": 0.41786168356937997, + "grad_norm": 0.0738677978515625, + "learning_rate": 2.9214891975308646e-05, + "loss": 0.0207, + "step": 56530 + }, + { + "epoch": 0.4179356021406818, + "grad_norm": 0.09094661474227905, + "learning_rate": 2.9211182336182335e-05, + "loss": 0.0209, + "step": 56540 + }, + { + "epoch": 0.41800952071198366, + "grad_norm": 0.07835207134485245, + "learning_rate": 2.920747269705603e-05, + "loss": 0.0175, + "step": 56550 + }, + { + "epoch": 0.4180834392832855, + "grad_norm": 0.05614442005753517, + "learning_rate": 2.9203763057929727e-05, + "loss": 0.0173, + "step": 56560 + }, + { + "epoch": 0.4181573578545874, + "grad_norm": 0.0806455984711647, + "learning_rate": 2.920005341880342e-05, + "loss": 0.0183, + "step": 56570 + }, + { + "epoch": 0.41823127642588925, + "grad_norm": 0.10308822244405746, + "learning_rate": 2.9196343779677116e-05, + "loss": 0.0188, + "step": 56580 + }, + { + "epoch": 0.4183051949971911, + "grad_norm": 0.10335249453783035, + "learning_rate": 2.9192634140550805e-05, + "loss": 0.0187, + "step": 56590 + }, + { + "epoch": 0.41837911356849294, + "grad_norm": 0.08708662539720535, + "learning_rate": 2.9188924501424504e-05, + "loss": 0.0183, + "step": 56600 + }, + { + "epoch": 0.4184530321397948, + "grad_norm": 0.08335819840431213, + "learning_rate": 2.91852148622982e-05, + "loss": 0.0191, + "step": 56610 + }, + { + "epoch": 0.41852695071109663, + "grad_norm": 0.07016626000404358, + "learning_rate": 2.918150522317189e-05, + "loss": 0.0186, + "step": 56620 + }, + { + "epoch": 0.41860086928239854, + "grad_norm": 0.11232458800077438, + "learning_rate": 2.9177795584045585e-05, + "loss": 0.0189, + "step": 56630 + }, + { + "epoch": 0.4186747878537004, + "grad_norm": 0.07821941375732422, + "learning_rate": 2.9174085944919278e-05, + "loss": 0.0178, + "step": 56640 + }, + { + "epoch": 0.4187487064250022, + "grad_norm": 0.07746779173612595, + "learning_rate": 2.9170376305792974e-05, + "loss": 0.0164, + "step": 56650 + }, + { + "epoch": 0.41882262499630407, + "grad_norm": 0.10343199223279953, + "learning_rate": 2.916666666666667e-05, + "loss": 0.0174, + "step": 56660 + }, + { + "epoch": 0.4188965435676059, + "grad_norm": 0.07732991129159927, + "learning_rate": 2.9162957027540362e-05, + "loss": 0.0186, + "step": 56670 + }, + { + "epoch": 0.41897046213890776, + "grad_norm": 0.09331337362527847, + "learning_rate": 2.9159247388414058e-05, + "loss": 
0.0183, + "step": 56680 + }, + { + "epoch": 0.4190443807102096, + "grad_norm": 0.06747223436832428, + "learning_rate": 2.9155537749287747e-05, + "loss": 0.0163, + "step": 56690 + }, + { + "epoch": 0.4191182992815115, + "grad_norm": 0.0804247334599495, + "learning_rate": 2.9151828110161443e-05, + "loss": 0.0176, + "step": 56700 + }, + { + "epoch": 0.41919221785281335, + "grad_norm": 0.0986623466014862, + "learning_rate": 2.914811847103514e-05, + "loss": 0.0179, + "step": 56710 + }, + { + "epoch": 0.4192661364241152, + "grad_norm": 0.08656472712755203, + "learning_rate": 2.9144408831908832e-05, + "loss": 0.0198, + "step": 56720 + }, + { + "epoch": 0.41934005499541704, + "grad_norm": 0.093471959233284, + "learning_rate": 2.9140699192782528e-05, + "loss": 0.0188, + "step": 56730 + }, + { + "epoch": 0.4194139735667189, + "grad_norm": 0.09669660776853561, + "learning_rate": 2.9136989553656224e-05, + "loss": 0.0178, + "step": 56740 + }, + { + "epoch": 0.41948789213802073, + "grad_norm": 0.08791998028755188, + "learning_rate": 2.9133279914529917e-05, + "loss": 0.0193, + "step": 56750 + }, + { + "epoch": 0.41956181070932264, + "grad_norm": 0.08240630477666855, + "learning_rate": 2.9129570275403612e-05, + "loss": 0.0167, + "step": 56760 + }, + { + "epoch": 0.4196357292806245, + "grad_norm": 0.0766507163643837, + "learning_rate": 2.91258606362773e-05, + "loss": 0.0175, + "step": 56770 + }, + { + "epoch": 0.4197096478519263, + "grad_norm": 0.08248403668403625, + "learning_rate": 2.9122150997150998e-05, + "loss": 0.0172, + "step": 56780 + }, + { + "epoch": 0.41978356642322817, + "grad_norm": 0.07111876457929611, + "learning_rate": 2.9118441358024694e-05, + "loss": 0.0171, + "step": 56790 + }, + { + "epoch": 0.41985748499453, + "grad_norm": 0.10291147977113724, + "learning_rate": 2.9114731718898386e-05, + "loss": 0.0203, + "step": 56800 + }, + { + "epoch": 0.41993140356583186, + "grad_norm": 0.09333407133817673, + "learning_rate": 2.9111022079772082e-05, + "loss": 0.0201, + "step": 56810 + }, + { + "epoch": 0.4200053221371337, + "grad_norm": 0.08243946731090546, + "learning_rate": 2.9107312440645775e-05, + "loss": 0.0192, + "step": 56820 + }, + { + "epoch": 0.4200792407084356, + "grad_norm": 0.07780137658119202, + "learning_rate": 2.910360280151947e-05, + "loss": 0.0199, + "step": 56830 + }, + { + "epoch": 0.42015315927973745, + "grad_norm": 0.0727728083729744, + "learning_rate": 2.9099893162393167e-05, + "loss": 0.0165, + "step": 56840 + }, + { + "epoch": 0.4202270778510393, + "grad_norm": 0.08291462063789368, + "learning_rate": 2.9096183523266856e-05, + "loss": 0.0169, + "step": 56850 + }, + { + "epoch": 0.42030099642234114, + "grad_norm": 0.07736489921808243, + "learning_rate": 2.9092473884140552e-05, + "loss": 0.0197, + "step": 56860 + }, + { + "epoch": 0.420374914993643, + "grad_norm": 0.06759285926818848, + "learning_rate": 2.9088764245014244e-05, + "loss": 0.0158, + "step": 56870 + }, + { + "epoch": 0.42044883356494483, + "grad_norm": 0.08929736167192459, + "learning_rate": 2.908505460588794e-05, + "loss": 0.0179, + "step": 56880 + }, + { + "epoch": 0.42052275213624674, + "grad_norm": 0.07094036787748337, + "learning_rate": 2.9081344966761636e-05, + "loss": 0.0193, + "step": 56890 + }, + { + "epoch": 0.4205966707075486, + "grad_norm": 0.09569530189037323, + "learning_rate": 2.907763532763533e-05, + "loss": 0.0209, + "step": 56900 + }, + { + "epoch": 0.4206705892788504, + "grad_norm": 0.09825005382299423, + "learning_rate": 2.9073925688509025e-05, + "loss": 0.0192, + "step": 56910 + }, + { + 
"epoch": 0.42074450785015227, + "grad_norm": 0.11080006510019302, + "learning_rate": 2.9070216049382714e-05, + "loss": 0.0175, + "step": 56920 + }, + { + "epoch": 0.4208184264214541, + "grad_norm": 0.09301717579364777, + "learning_rate": 2.906650641025641e-05, + "loss": 0.0194, + "step": 56930 + }, + { + "epoch": 0.42089234499275596, + "grad_norm": 0.09418229013681412, + "learning_rate": 2.9062796771130106e-05, + "loss": 0.0149, + "step": 56940 + }, + { + "epoch": 0.4209662635640578, + "grad_norm": 0.09676161408424377, + "learning_rate": 2.90590871320038e-05, + "loss": 0.0179, + "step": 56950 + }, + { + "epoch": 0.4210401821353597, + "grad_norm": 0.07113750278949738, + "learning_rate": 2.9055377492877495e-05, + "loss": 0.0183, + "step": 56960 + }, + { + "epoch": 0.42111410070666155, + "grad_norm": 0.06378157436847687, + "learning_rate": 2.905166785375119e-05, + "loss": 0.0167, + "step": 56970 + }, + { + "epoch": 0.4211880192779634, + "grad_norm": 0.06854438036680222, + "learning_rate": 2.9047958214624883e-05, + "loss": 0.0164, + "step": 56980 + }, + { + "epoch": 0.42126193784926524, + "grad_norm": 0.08005507290363312, + "learning_rate": 2.904424857549858e-05, + "loss": 0.0169, + "step": 56990 + }, + { + "epoch": 0.4213358564205671, + "grad_norm": 0.09289931505918503, + "learning_rate": 2.9040538936372268e-05, + "loss": 0.0185, + "step": 57000 + }, + { + "epoch": 0.42140977499186894, + "grad_norm": 0.08229994028806686, + "learning_rate": 2.9036829297245964e-05, + "loss": 0.0172, + "step": 57010 + }, + { + "epoch": 0.42148369356317084, + "grad_norm": 0.0777730792760849, + "learning_rate": 2.903311965811966e-05, + "loss": 0.0181, + "step": 57020 + }, + { + "epoch": 0.4215576121344727, + "grad_norm": 0.12674884498119354, + "learning_rate": 2.9029410018993353e-05, + "loss": 0.0186, + "step": 57030 + }, + { + "epoch": 0.4216315307057745, + "grad_norm": 0.07563526928424835, + "learning_rate": 2.902570037986705e-05, + "loss": 0.0182, + "step": 57040 + }, + { + "epoch": 0.42170544927707637, + "grad_norm": 0.07964854687452316, + "learning_rate": 2.902199074074074e-05, + "loss": 0.017, + "step": 57050 + }, + { + "epoch": 0.4217793678483782, + "grad_norm": 0.07769711315631866, + "learning_rate": 2.9018281101614437e-05, + "loss": 0.0168, + "step": 57060 + }, + { + "epoch": 0.42185328641968006, + "grad_norm": 0.0685645267367363, + "learning_rate": 2.9014571462488133e-05, + "loss": 0.0203, + "step": 57070 + }, + { + "epoch": 0.4219272049909819, + "grad_norm": 0.07990463823080063, + "learning_rate": 2.9010861823361822e-05, + "loss": 0.0201, + "step": 57080 + }, + { + "epoch": 0.4220011235622838, + "grad_norm": 0.08478770405054092, + "learning_rate": 2.900715218423552e-05, + "loss": 0.0172, + "step": 57090 + }, + { + "epoch": 0.42207504213358565, + "grad_norm": 0.06837252527475357, + "learning_rate": 2.900344254510921e-05, + "loss": 0.0237, + "step": 57100 + }, + { + "epoch": 0.4221489607048875, + "grad_norm": 0.08297235518693924, + "learning_rate": 2.8999732905982907e-05, + "loss": 0.0204, + "step": 57110 + }, + { + "epoch": 0.42222287927618934, + "grad_norm": 0.10914500057697296, + "learning_rate": 2.8996023266856603e-05, + "loss": 0.0172, + "step": 57120 + }, + { + "epoch": 0.4222967978474912, + "grad_norm": 0.07343258708715439, + "learning_rate": 2.8992313627730296e-05, + "loss": 0.0165, + "step": 57130 + }, + { + "epoch": 0.42237071641879304, + "grad_norm": 0.09652519226074219, + "learning_rate": 2.898860398860399e-05, + "loss": 0.0192, + "step": 57140 + }, + { + "epoch": 0.42244463499009494, + 
"grad_norm": 0.072759710252285, + "learning_rate": 2.898489434947768e-05, + "loss": 0.0171, + "step": 57150 + }, + { + "epoch": 0.4225185535613968, + "grad_norm": 0.09772517532110214, + "learning_rate": 2.8981184710351377e-05, + "loss": 0.0192, + "step": 57160 + }, + { + "epoch": 0.4225924721326986, + "grad_norm": 0.08044306188821793, + "learning_rate": 2.8977475071225073e-05, + "loss": 0.0189, + "step": 57170 + }, + { + "epoch": 0.42266639070400047, + "grad_norm": 0.1011492982506752, + "learning_rate": 2.8973765432098765e-05, + "loss": 0.0184, + "step": 57180 + }, + { + "epoch": 0.4227403092753023, + "grad_norm": 0.08796268701553345, + "learning_rate": 2.897005579297246e-05, + "loss": 0.0209, + "step": 57190 + }, + { + "epoch": 0.42281422784660416, + "grad_norm": 0.09029748290777206, + "learning_rate": 2.8966346153846157e-05, + "loss": 0.0181, + "step": 57200 + }, + { + "epoch": 0.422888146417906, + "grad_norm": 0.09961996972560883, + "learning_rate": 2.896263651471985e-05, + "loss": 0.0161, + "step": 57210 + }, + { + "epoch": 0.4229620649892079, + "grad_norm": 0.23429933190345764, + "learning_rate": 2.8958926875593546e-05, + "loss": 0.019, + "step": 57220 + }, + { + "epoch": 0.42303598356050975, + "grad_norm": 0.11873859912157059, + "learning_rate": 2.8955217236467235e-05, + "loss": 0.0222, + "step": 57230 + }, + { + "epoch": 0.4231099021318116, + "grad_norm": 0.0865563228726387, + "learning_rate": 2.895150759734093e-05, + "loss": 0.0181, + "step": 57240 + }, + { + "epoch": 0.42318382070311344, + "grad_norm": 0.14462807774543762, + "learning_rate": 2.894779795821463e-05, + "loss": 0.0176, + "step": 57250 + }, + { + "epoch": 0.4232577392744153, + "grad_norm": 0.0750247985124588, + "learning_rate": 2.894408831908832e-05, + "loss": 0.0182, + "step": 57260 + }, + { + "epoch": 0.42333165784571714, + "grad_norm": 0.0788230374455452, + "learning_rate": 2.8940378679962015e-05, + "loss": 0.0198, + "step": 57270 + }, + { + "epoch": 0.42340557641701904, + "grad_norm": 0.08862007409334183, + "learning_rate": 2.8936669040835708e-05, + "loss": 0.017, + "step": 57280 + }, + { + "epoch": 0.4234794949883209, + "grad_norm": 0.09727243334054947, + "learning_rate": 2.8932959401709404e-05, + "loss": 0.0178, + "step": 57290 + }, + { + "epoch": 0.4235534135596227, + "grad_norm": 0.08772630244493484, + "learning_rate": 2.89292497625831e-05, + "loss": 0.0182, + "step": 57300 + }, + { + "epoch": 0.42362733213092457, + "grad_norm": 0.07961007952690125, + "learning_rate": 2.892554012345679e-05, + "loss": 0.0181, + "step": 57310 + }, + { + "epoch": 0.4237012507022264, + "grad_norm": 0.07917802780866623, + "learning_rate": 2.8921830484330485e-05, + "loss": 0.0178, + "step": 57320 + }, + { + "epoch": 0.42377516927352826, + "grad_norm": 0.08240603655576706, + "learning_rate": 2.8918120845204178e-05, + "loss": 0.0192, + "step": 57330 + }, + { + "epoch": 0.4238490878448301, + "grad_norm": 0.11383040994405746, + "learning_rate": 2.8914411206077874e-05, + "loss": 0.0196, + "step": 57340 + }, + { + "epoch": 0.423923006416132, + "grad_norm": 0.09868566691875458, + "learning_rate": 2.891070156695157e-05, + "loss": 0.0194, + "step": 57350 + }, + { + "epoch": 0.42399692498743385, + "grad_norm": 0.1159425750374794, + "learning_rate": 2.8906991927825262e-05, + "loss": 0.0196, + "step": 57360 + }, + { + "epoch": 0.4240708435587357, + "grad_norm": 0.07365076243877411, + "learning_rate": 2.8903282288698958e-05, + "loss": 0.016, + "step": 57370 + }, + { + "epoch": 0.42414476213003754, + "grad_norm": 0.07025440782308578, + 
"learning_rate": 2.8899572649572647e-05, + "loss": 0.0173, + "step": 57380 + }, + { + "epoch": 0.4242186807013394, + "grad_norm": 0.08012454211711884, + "learning_rate": 2.8895863010446343e-05, + "loss": 0.0183, + "step": 57390 + }, + { + "epoch": 0.42429259927264124, + "grad_norm": 0.087140753865242, + "learning_rate": 2.8892153371320043e-05, + "loss": 0.0189, + "step": 57400 + }, + { + "epoch": 0.42436651784394314, + "grad_norm": 0.1057136058807373, + "learning_rate": 2.8888443732193732e-05, + "loss": 0.0193, + "step": 57410 + }, + { + "epoch": 0.424440436415245, + "grad_norm": 0.08255941420793533, + "learning_rate": 2.8884734093067428e-05, + "loss": 0.018, + "step": 57420 + }, + { + "epoch": 0.4245143549865468, + "grad_norm": 0.07151252031326294, + "learning_rate": 2.8881024453941124e-05, + "loss": 0.0178, + "step": 57430 + }, + { + "epoch": 0.42458827355784867, + "grad_norm": 0.08573801070451736, + "learning_rate": 2.8877314814814816e-05, + "loss": 0.0172, + "step": 57440 + }, + { + "epoch": 0.4246621921291505, + "grad_norm": 0.11681819707155228, + "learning_rate": 2.8873605175688512e-05, + "loss": 0.0208, + "step": 57450 + }, + { + "epoch": 0.42473611070045236, + "grad_norm": 0.0816139355301857, + "learning_rate": 2.88698955365622e-05, + "loss": 0.0222, + "step": 57460 + }, + { + "epoch": 0.4248100292717542, + "grad_norm": 0.06773439049720764, + "learning_rate": 2.8866185897435897e-05, + "loss": 0.018, + "step": 57470 + }, + { + "epoch": 0.4248839478430561, + "grad_norm": 0.06856571137905121, + "learning_rate": 2.8862476258309597e-05, + "loss": 0.0174, + "step": 57480 + }, + { + "epoch": 0.42495786641435795, + "grad_norm": 0.09887688606977463, + "learning_rate": 2.8858766619183286e-05, + "loss": 0.0194, + "step": 57490 + }, + { + "epoch": 0.4250317849856598, + "grad_norm": 0.09817370027303696, + "learning_rate": 2.8855056980056982e-05, + "loss": 0.0205, + "step": 57500 + }, + { + "epoch": 0.42510570355696164, + "grad_norm": 0.09108339995145798, + "learning_rate": 2.8851347340930674e-05, + "loss": 0.017, + "step": 57510 + }, + { + "epoch": 0.4251796221282635, + "grad_norm": 0.08016406744718552, + "learning_rate": 2.884763770180437e-05, + "loss": 0.0189, + "step": 57520 + }, + { + "epoch": 0.42525354069956534, + "grad_norm": 0.07251052558422089, + "learning_rate": 2.8843928062678066e-05, + "loss": 0.0176, + "step": 57530 + }, + { + "epoch": 0.42532745927086724, + "grad_norm": 0.0984114408493042, + "learning_rate": 2.8840218423551756e-05, + "loss": 0.0204, + "step": 57540 + }, + { + "epoch": 0.4254013778421691, + "grad_norm": 0.0761522427201271, + "learning_rate": 2.8836508784425455e-05, + "loss": 0.0185, + "step": 57550 + }, + { + "epoch": 0.4254752964134709, + "grad_norm": 0.07486771047115326, + "learning_rate": 2.8832799145299144e-05, + "loss": 0.0186, + "step": 57560 + }, + { + "epoch": 0.4255492149847728, + "grad_norm": 0.07122237235307693, + "learning_rate": 2.882908950617284e-05, + "loss": 0.0168, + "step": 57570 + }, + { + "epoch": 0.4256231335560746, + "grad_norm": 0.08447059243917465, + "learning_rate": 2.8825379867046536e-05, + "loss": 0.0209, + "step": 57580 + }, + { + "epoch": 0.42569705212737646, + "grad_norm": 0.09231428802013397, + "learning_rate": 2.882167022792023e-05, + "loss": 0.0179, + "step": 57590 + }, + { + "epoch": 0.4257709706986783, + "grad_norm": 0.09985219687223434, + "learning_rate": 2.8817960588793925e-05, + "loss": 0.0175, + "step": 57600 + }, + { + "epoch": 0.4258448892699802, + "grad_norm": 0.08764240145683289, + "learning_rate": 2.8814250949667614e-05, 
+ "loss": 0.0188, + "step": 57610 + }, + { + "epoch": 0.42591880784128205, + "grad_norm": 0.1160874217748642, + "learning_rate": 2.881054131054131e-05, + "loss": 0.0165, + "step": 57620 + }, + { + "epoch": 0.4259927264125839, + "grad_norm": 0.09844554215669632, + "learning_rate": 2.880683167141501e-05, + "loss": 0.0203, + "step": 57630 + }, + { + "epoch": 0.42606664498388575, + "grad_norm": 0.12586788833141327, + "learning_rate": 2.88031220322887e-05, + "loss": 0.0173, + "step": 57640 + }, + { + "epoch": 0.4261405635551876, + "grad_norm": 0.07267776131629944, + "learning_rate": 2.8799412393162394e-05, + "loss": 0.0197, + "step": 57650 + }, + { + "epoch": 0.42621448212648944, + "grad_norm": 0.11026757210493088, + "learning_rate": 2.879570275403609e-05, + "loss": 0.0195, + "step": 57660 + }, + { + "epoch": 0.42628840069779134, + "grad_norm": 0.08216289430856705, + "learning_rate": 2.8791993114909783e-05, + "loss": 0.019, + "step": 57670 + }, + { + "epoch": 0.4263623192690932, + "grad_norm": 0.07248413562774658, + "learning_rate": 2.878828347578348e-05, + "loss": 0.0191, + "step": 57680 + }, + { + "epoch": 0.426436237840395, + "grad_norm": 0.0735250785946846, + "learning_rate": 2.8784573836657168e-05, + "loss": 0.0158, + "step": 57690 + }, + { + "epoch": 0.4265101564116969, + "grad_norm": 0.08926840871572495, + "learning_rate": 2.8780864197530867e-05, + "loss": 0.0174, + "step": 57700 + }, + { + "epoch": 0.4265840749829987, + "grad_norm": 0.0892130509018898, + "learning_rate": 2.8777154558404563e-05, + "loss": 0.0167, + "step": 57710 + }, + { + "epoch": 0.42665799355430056, + "grad_norm": 0.08792570978403091, + "learning_rate": 2.8773444919278253e-05, + "loss": 0.0197, + "step": 57720 + }, + { + "epoch": 0.42673191212560246, + "grad_norm": 0.08881917595863342, + "learning_rate": 2.876973528015195e-05, + "loss": 0.018, + "step": 57730 + }, + { + "epoch": 0.4268058306969043, + "grad_norm": 0.08950222283601761, + "learning_rate": 2.876602564102564e-05, + "loss": 0.0174, + "step": 57740 + }, + { + "epoch": 0.42687974926820615, + "grad_norm": 0.0893772542476654, + "learning_rate": 2.8762316001899337e-05, + "loss": 0.0218, + "step": 57750 + }, + { + "epoch": 0.426953667839508, + "grad_norm": 0.09056003391742706, + "learning_rate": 2.8758606362773033e-05, + "loss": 0.0193, + "step": 57760 + }, + { + "epoch": 0.42702758641080985, + "grad_norm": 0.11356296390295029, + "learning_rate": 2.8754896723646722e-05, + "loss": 0.0212, + "step": 57770 + }, + { + "epoch": 0.4271015049821117, + "grad_norm": 0.06659174710512161, + "learning_rate": 2.875118708452042e-05, + "loss": 0.0192, + "step": 57780 + }, + { + "epoch": 0.42717542355341354, + "grad_norm": 0.09439995139837265, + "learning_rate": 2.874747744539411e-05, + "loss": 0.0207, + "step": 57790 + }, + { + "epoch": 0.42724934212471544, + "grad_norm": 0.07848507910966873, + "learning_rate": 2.8743767806267807e-05, + "loss": 0.0182, + "step": 57800 + }, + { + "epoch": 0.4273232606960173, + "grad_norm": 0.07541602104902267, + "learning_rate": 2.8740058167141503e-05, + "loss": 0.0217, + "step": 57810 + }, + { + "epoch": 0.4273971792673191, + "grad_norm": 0.10256219655275345, + "learning_rate": 2.8736348528015195e-05, + "loss": 0.0207, + "step": 57820 + }, + { + "epoch": 0.427471097838621, + "grad_norm": 0.08818720281124115, + "learning_rate": 2.873263888888889e-05, + "loss": 0.0173, + "step": 57830 + }, + { + "epoch": 0.4275450164099228, + "grad_norm": 0.06569219380617142, + "learning_rate": 2.872892924976258e-05, + "loss": 0.019, + "step": 57840 + }, + { + 
"epoch": 0.42761893498122466, + "grad_norm": 0.05568908900022507, + "learning_rate": 2.872521961063628e-05, + "loss": 0.0217, + "step": 57850 + }, + { + "epoch": 0.42769285355252656, + "grad_norm": 0.08012406527996063, + "learning_rate": 2.8721509971509976e-05, + "loss": 0.0176, + "step": 57860 + }, + { + "epoch": 0.4277667721238284, + "grad_norm": 0.07560218870639801, + "learning_rate": 2.8717800332383665e-05, + "loss": 0.0165, + "step": 57870 + }, + { + "epoch": 0.42784069069513025, + "grad_norm": 0.0802362784743309, + "learning_rate": 2.871409069325736e-05, + "loss": 0.017, + "step": 57880 + }, + { + "epoch": 0.4279146092664321, + "grad_norm": 0.0996783971786499, + "learning_rate": 2.8710381054131057e-05, + "loss": 0.018, + "step": 57890 + }, + { + "epoch": 0.42798852783773395, + "grad_norm": 0.06278420239686966, + "learning_rate": 2.870667141500475e-05, + "loss": 0.0183, + "step": 57900 + }, + { + "epoch": 0.4280624464090358, + "grad_norm": 0.09411557763814926, + "learning_rate": 2.8702961775878445e-05, + "loss": 0.0184, + "step": 57910 + }, + { + "epoch": 0.42813636498033764, + "grad_norm": 0.1253633052110672, + "learning_rate": 2.8699252136752135e-05, + "loss": 0.0186, + "step": 57920 + }, + { + "epoch": 0.42821028355163954, + "grad_norm": 0.06833294779062271, + "learning_rate": 2.8695542497625834e-05, + "loss": 0.0194, + "step": 57930 + }, + { + "epoch": 0.4282842021229414, + "grad_norm": 0.08020038157701492, + "learning_rate": 2.869183285849953e-05, + "loss": 0.0196, + "step": 57940 + }, + { + "epoch": 0.4283581206942432, + "grad_norm": 0.09458767622709274, + "learning_rate": 2.868812321937322e-05, + "loss": 0.0175, + "step": 57950 + }, + { + "epoch": 0.4284320392655451, + "grad_norm": 0.07971557974815369, + "learning_rate": 2.8684413580246915e-05, + "loss": 0.018, + "step": 57960 + }, + { + "epoch": 0.4285059578368469, + "grad_norm": 0.07327605783939362, + "learning_rate": 2.8680703941120608e-05, + "loss": 0.0178, + "step": 57970 + }, + { + "epoch": 0.42857987640814876, + "grad_norm": 0.08138163387775421, + "learning_rate": 2.8676994301994304e-05, + "loss": 0.0177, + "step": 57980 + }, + { + "epoch": 0.42865379497945066, + "grad_norm": 0.06313429772853851, + "learning_rate": 2.8673284662868e-05, + "loss": 0.0181, + "step": 57990 + }, + { + "epoch": 0.4287277135507525, + "grad_norm": 0.09797652810811996, + "learning_rate": 2.866957502374169e-05, + "loss": 0.0179, + "step": 58000 + }, + { + "epoch": 0.42880163212205435, + "grad_norm": 0.06211160123348236, + "learning_rate": 2.8665865384615388e-05, + "loss": 0.0162, + "step": 58010 + }, + { + "epoch": 0.4288755506933562, + "grad_norm": 0.0639074519276619, + "learning_rate": 2.8662155745489077e-05, + "loss": 0.0171, + "step": 58020 + }, + { + "epoch": 0.42894946926465805, + "grad_norm": 0.0698406845331192, + "learning_rate": 2.8658446106362773e-05, + "loss": 0.0185, + "step": 58030 + }, + { + "epoch": 0.4290233878359599, + "grad_norm": 0.09111010283231735, + "learning_rate": 2.865473646723647e-05, + "loss": 0.0184, + "step": 58040 + }, + { + "epoch": 0.42909730640726174, + "grad_norm": 0.057579610496759415, + "learning_rate": 2.8651026828110162e-05, + "loss": 0.0184, + "step": 58050 + }, + { + "epoch": 0.42917122497856364, + "grad_norm": 0.08983967453241348, + "learning_rate": 2.8647317188983858e-05, + "loss": 0.0173, + "step": 58060 + }, + { + "epoch": 0.4292451435498655, + "grad_norm": 0.07949908822774887, + "learning_rate": 2.8643607549857547e-05, + "loss": 0.0138, + "step": 58070 + }, + { + "epoch": 0.4293190621211673, + 
"grad_norm": 0.11375358700752258, + "learning_rate": 2.8639897910731246e-05, + "loss": 0.02, + "step": 58080 + }, + { + "epoch": 0.4293929806924692, + "grad_norm": 0.07009422779083252, + "learning_rate": 2.8636188271604942e-05, + "loss": 0.0155, + "step": 58090 + }, + { + "epoch": 0.429466899263771, + "grad_norm": 0.07353521138429642, + "learning_rate": 2.863247863247863e-05, + "loss": 0.0176, + "step": 58100 + }, + { + "epoch": 0.42954081783507286, + "grad_norm": 0.0705774649977684, + "learning_rate": 2.8628768993352327e-05, + "loss": 0.0191, + "step": 58110 + }, + { + "epoch": 0.42961473640637476, + "grad_norm": 0.07235126942396164, + "learning_rate": 2.8625059354226023e-05, + "loss": 0.0205, + "step": 58120 + }, + { + "epoch": 0.4296886549776766, + "grad_norm": 0.07953356206417084, + "learning_rate": 2.8621349715099716e-05, + "loss": 0.0201, + "step": 58130 + }, + { + "epoch": 0.42976257354897845, + "grad_norm": 0.09690140932798386, + "learning_rate": 2.8617640075973412e-05, + "loss": 0.0179, + "step": 58140 + }, + { + "epoch": 0.4298364921202803, + "grad_norm": 0.09740715473890305, + "learning_rate": 2.86139304368471e-05, + "loss": 0.0201, + "step": 58150 + }, + { + "epoch": 0.42991041069158215, + "grad_norm": 0.12054110318422318, + "learning_rate": 2.86102207977208e-05, + "loss": 0.0207, + "step": 58160 + }, + { + "epoch": 0.429984329262884, + "grad_norm": 0.08145933598279953, + "learning_rate": 2.8606511158594496e-05, + "loss": 0.02, + "step": 58170 + }, + { + "epoch": 0.43005824783418584, + "grad_norm": 0.06964617967605591, + "learning_rate": 2.8602801519468186e-05, + "loss": 0.0201, + "step": 58180 + }, + { + "epoch": 0.43013216640548774, + "grad_norm": 0.09489192813634872, + "learning_rate": 2.859909188034188e-05, + "loss": 0.0183, + "step": 58190 + }, + { + "epoch": 0.4302060849767896, + "grad_norm": 0.06682495027780533, + "learning_rate": 2.8595382241215574e-05, + "loss": 0.0169, + "step": 58200 + }, + { + "epoch": 0.4302800035480914, + "grad_norm": 0.08641725778579712, + "learning_rate": 2.859167260208927e-05, + "loss": 0.0193, + "step": 58210 + }, + { + "epoch": 0.4303539221193933, + "grad_norm": 0.0909150168299675, + "learning_rate": 2.8587962962962966e-05, + "loss": 0.0188, + "step": 58220 + }, + { + "epoch": 0.4304278406906951, + "grad_norm": 0.07001394778490067, + "learning_rate": 2.858425332383666e-05, + "loss": 0.0182, + "step": 58230 + }, + { + "epoch": 0.43050175926199696, + "grad_norm": 0.0732155293226242, + "learning_rate": 2.8580543684710355e-05, + "loss": 0.0217, + "step": 58240 + }, + { + "epoch": 0.43057567783329886, + "grad_norm": 0.07230205088853836, + "learning_rate": 2.8576834045584044e-05, + "loss": 0.0177, + "step": 58250 + }, + { + "epoch": 0.4306495964046007, + "grad_norm": 0.08895523101091385, + "learning_rate": 2.857312440645774e-05, + "loss": 0.0194, + "step": 58260 + }, + { + "epoch": 0.43072351497590255, + "grad_norm": 0.07853704690933228, + "learning_rate": 2.8569414767331436e-05, + "loss": 0.0182, + "step": 58270 + }, + { + "epoch": 0.4307974335472044, + "grad_norm": 0.10844014585018158, + "learning_rate": 2.856570512820513e-05, + "loss": 0.02, + "step": 58280 + }, + { + "epoch": 0.43087135211850625, + "grad_norm": 0.08113034069538116, + "learning_rate": 2.8561995489078824e-05, + "loss": 0.0177, + "step": 58290 + }, + { + "epoch": 0.4309452706898081, + "grad_norm": 0.08964955061674118, + "learning_rate": 2.8558285849952514e-05, + "loss": 0.0177, + "step": 58300 + }, + { + "epoch": 0.43101918926110994, + "grad_norm": 0.07938321679830551, + 
"learning_rate": 2.8554576210826213e-05, + "loss": 0.0179, + "step": 58310 + }, + { + "epoch": 0.43109310783241184, + "grad_norm": 0.07115291059017181, + "learning_rate": 2.855086657169991e-05, + "loss": 0.0177, + "step": 58320 + }, + { + "epoch": 0.4311670264037137, + "grad_norm": 0.06883087009191513, + "learning_rate": 2.8547156932573598e-05, + "loss": 0.0194, + "step": 58330 + }, + { + "epoch": 0.4312409449750155, + "grad_norm": 0.09511996060609818, + "learning_rate": 2.8543447293447294e-05, + "loss": 0.02, + "step": 58340 + }, + { + "epoch": 0.4313148635463174, + "grad_norm": 0.13075599074363708, + "learning_rate": 2.853973765432099e-05, + "loss": 0.0222, + "step": 58350 + }, + { + "epoch": 0.4313887821176192, + "grad_norm": 0.06507648527622223, + "learning_rate": 2.8536028015194683e-05, + "loss": 0.0159, + "step": 58360 + }, + { + "epoch": 0.43146270068892106, + "grad_norm": 0.09812938421964645, + "learning_rate": 2.853231837606838e-05, + "loss": 0.0166, + "step": 58370 + }, + { + "epoch": 0.43153661926022296, + "grad_norm": 0.08186332136392593, + "learning_rate": 2.852860873694207e-05, + "loss": 0.0215, + "step": 58380 + }, + { + "epoch": 0.4316105378315248, + "grad_norm": 0.07731013000011444, + "learning_rate": 2.8524899097815767e-05, + "loss": 0.0202, + "step": 58390 + }, + { + "epoch": 0.43168445640282666, + "grad_norm": 0.094798244535923, + "learning_rate": 2.8521189458689463e-05, + "loss": 0.0171, + "step": 58400 + }, + { + "epoch": 0.4317583749741285, + "grad_norm": 0.110369011759758, + "learning_rate": 2.8517479819563152e-05, + "loss": 0.0198, + "step": 58410 + }, + { + "epoch": 0.43183229354543035, + "grad_norm": 0.10678127408027649, + "learning_rate": 2.8513770180436848e-05, + "loss": 0.0169, + "step": 58420 + }, + { + "epoch": 0.4319062121167322, + "grad_norm": 0.09165314584970474, + "learning_rate": 2.851006054131054e-05, + "loss": 0.02, + "step": 58430 + }, + { + "epoch": 0.43198013068803404, + "grad_norm": 0.06183528155088425, + "learning_rate": 2.8506350902184237e-05, + "loss": 0.0204, + "step": 58440 + }, + { + "epoch": 0.43205404925933594, + "grad_norm": 0.0831647589802742, + "learning_rate": 2.8502641263057933e-05, + "loss": 0.0208, + "step": 58450 + }, + { + "epoch": 0.4321279678306378, + "grad_norm": 0.09043709933757782, + "learning_rate": 2.8498931623931625e-05, + "loss": 0.0183, + "step": 58460 + }, + { + "epoch": 0.43220188640193963, + "grad_norm": 0.07497021555900574, + "learning_rate": 2.849522198480532e-05, + "loss": 0.0181, + "step": 58470 + }, + { + "epoch": 0.4322758049732415, + "grad_norm": 0.09095343947410583, + "learning_rate": 2.849151234567901e-05, + "loss": 0.0193, + "step": 58480 + }, + { + "epoch": 0.4323497235445433, + "grad_norm": 0.09853128343820572, + "learning_rate": 2.8487802706552706e-05, + "loss": 0.0186, + "step": 58490 + }, + { + "epoch": 0.43242364211584516, + "grad_norm": 0.08832711726427078, + "learning_rate": 2.8484093067426402e-05, + "loss": 0.0159, + "step": 58500 + }, + { + "epoch": 0.43249756068714706, + "grad_norm": 0.07951664924621582, + "learning_rate": 2.8480383428300095e-05, + "loss": 0.02, + "step": 58510 + }, + { + "epoch": 0.4325714792584489, + "grad_norm": 0.08814312517642975, + "learning_rate": 2.847667378917379e-05, + "loss": 0.0187, + "step": 58520 + }, + { + "epoch": 0.43264539782975076, + "grad_norm": 0.09159665554761887, + "learning_rate": 2.8472964150047487e-05, + "loss": 0.0197, + "step": 58530 + }, + { + "epoch": 0.4327193164010526, + "grad_norm": 0.0654560849070549, + "learning_rate": 2.846925451092118e-05, + 
"loss": 0.0194, + "step": 58540 + }, + { + "epoch": 0.43279323497235445, + "grad_norm": 0.06398443132638931, + "learning_rate": 2.8465544871794875e-05, + "loss": 0.0148, + "step": 58550 + }, + { + "epoch": 0.4328671535436563, + "grad_norm": 0.10145048797130585, + "learning_rate": 2.8461835232668565e-05, + "loss": 0.0164, + "step": 58560 + }, + { + "epoch": 0.43294107211495814, + "grad_norm": 0.07037309557199478, + "learning_rate": 2.845812559354226e-05, + "loss": 0.0171, + "step": 58570 + }, + { + "epoch": 0.43301499068626004, + "grad_norm": 0.0784565657377243, + "learning_rate": 2.845441595441596e-05, + "loss": 0.0188, + "step": 58580 + }, + { + "epoch": 0.4330889092575619, + "grad_norm": 0.10798093676567078, + "learning_rate": 2.845070631528965e-05, + "loss": 0.0171, + "step": 58590 + }, + { + "epoch": 0.43316282782886373, + "grad_norm": 0.14126542210578918, + "learning_rate": 2.8446996676163345e-05, + "loss": 0.0192, + "step": 58600 + }, + { + "epoch": 0.4332367464001656, + "grad_norm": 0.06410064548254013, + "learning_rate": 2.8443287037037038e-05, + "loss": 0.0185, + "step": 58610 + }, + { + "epoch": 0.4333106649714674, + "grad_norm": 0.10798434168100357, + "learning_rate": 2.8439577397910734e-05, + "loss": 0.0187, + "step": 58620 + }, + { + "epoch": 0.43338458354276926, + "grad_norm": 0.09027906507253647, + "learning_rate": 2.843586775878443e-05, + "loss": 0.0188, + "step": 58630 + }, + { + "epoch": 0.43345850211407116, + "grad_norm": 0.06169156730175018, + "learning_rate": 2.843215811965812e-05, + "loss": 0.0181, + "step": 58640 + }, + { + "epoch": 0.433532420685373, + "grad_norm": 0.0730864405632019, + "learning_rate": 2.8428448480531815e-05, + "loss": 0.017, + "step": 58650 + }, + { + "epoch": 0.43360633925667486, + "grad_norm": 0.08099040389060974, + "learning_rate": 2.8424738841405507e-05, + "loss": 0.0187, + "step": 58660 + }, + { + "epoch": 0.4336802578279767, + "grad_norm": 0.06756632030010223, + "learning_rate": 2.8421029202279203e-05, + "loss": 0.0157, + "step": 58670 + }, + { + "epoch": 0.43375417639927855, + "grad_norm": 0.06606796383857727, + "learning_rate": 2.84173195631529e-05, + "loss": 0.016, + "step": 58680 + }, + { + "epoch": 0.4338280949705804, + "grad_norm": 0.08598122000694275, + "learning_rate": 2.8413609924026592e-05, + "loss": 0.017, + "step": 58690 + }, + { + "epoch": 0.43390201354188224, + "grad_norm": 0.07801267504692078, + "learning_rate": 2.8409900284900288e-05, + "loss": 0.0192, + "step": 58700 + }, + { + "epoch": 0.43397593211318414, + "grad_norm": 0.11814634501934052, + "learning_rate": 2.8406190645773977e-05, + "loss": 0.0186, + "step": 58710 + }, + { + "epoch": 0.434049850684486, + "grad_norm": 0.08184555917978287, + "learning_rate": 2.8402481006647673e-05, + "loss": 0.0178, + "step": 58720 + }, + { + "epoch": 0.43412376925578783, + "grad_norm": 0.10462668538093567, + "learning_rate": 2.839877136752137e-05, + "loss": 0.016, + "step": 58730 + }, + { + "epoch": 0.4341976878270897, + "grad_norm": 0.09804099798202515, + "learning_rate": 2.839506172839506e-05, + "loss": 0.0181, + "step": 58740 + }, + { + "epoch": 0.4342716063983915, + "grad_norm": 0.06667347997426987, + "learning_rate": 2.8391352089268758e-05, + "loss": 0.0162, + "step": 58750 + }, + { + "epoch": 0.43434552496969336, + "grad_norm": 0.07495291531085968, + "learning_rate": 2.8387642450142454e-05, + "loss": 0.0195, + "step": 58760 + }, + { + "epoch": 0.43441944354099526, + "grad_norm": 0.07768017053604126, + "learning_rate": 2.8383932811016146e-05, + "loss": 0.0178, + "step": 58770 + }, + 
{ + "epoch": 0.4344933621122971, + "grad_norm": 0.09335466474294662, + "learning_rate": 2.8380223171889842e-05, + "loss": 0.0209, + "step": 58780 + }, + { + "epoch": 0.43456728068359896, + "grad_norm": 0.0606859028339386, + "learning_rate": 2.837651353276353e-05, + "loss": 0.0195, + "step": 58790 + }, + { + "epoch": 0.4346411992549008, + "grad_norm": 0.0708116963505745, + "learning_rate": 2.8372803893637227e-05, + "loss": 0.017, + "step": 58800 + }, + { + "epoch": 0.43471511782620265, + "grad_norm": 0.07581387460231781, + "learning_rate": 2.8369094254510927e-05, + "loss": 0.0169, + "step": 58810 + }, + { + "epoch": 0.4347890363975045, + "grad_norm": 0.10559257864952087, + "learning_rate": 2.8365384615384616e-05, + "loss": 0.016, + "step": 58820 + }, + { + "epoch": 0.43486295496880634, + "grad_norm": 0.06055699661374092, + "learning_rate": 2.8361674976258312e-05, + "loss": 0.0178, + "step": 58830 + }, + { + "epoch": 0.43493687354010824, + "grad_norm": 0.07731713354587555, + "learning_rate": 2.8357965337132004e-05, + "loss": 0.0185, + "step": 58840 + }, + { + "epoch": 0.4350107921114101, + "grad_norm": 0.07417374104261398, + "learning_rate": 2.83542556980057e-05, + "loss": 0.0179, + "step": 58850 + }, + { + "epoch": 0.43508471068271193, + "grad_norm": 0.08462899923324585, + "learning_rate": 2.8350546058879396e-05, + "loss": 0.0167, + "step": 58860 + }, + { + "epoch": 0.4351586292540138, + "grad_norm": 0.16679640114307404, + "learning_rate": 2.8346836419753085e-05, + "loss": 0.0206, + "step": 58870 + }, + { + "epoch": 0.4352325478253156, + "grad_norm": 0.08410433679819107, + "learning_rate": 2.834312678062678e-05, + "loss": 0.0173, + "step": 58880 + }, + { + "epoch": 0.43530646639661746, + "grad_norm": 0.08903812617063522, + "learning_rate": 2.8339417141500474e-05, + "loss": 0.0206, + "step": 58890 + }, + { + "epoch": 0.43538038496791936, + "grad_norm": 0.08129949867725372, + "learning_rate": 2.833570750237417e-05, + "loss": 0.0188, + "step": 58900 + }, + { + "epoch": 0.4354543035392212, + "grad_norm": 0.0971398800611496, + "learning_rate": 2.8331997863247866e-05, + "loss": 0.0175, + "step": 58910 + }, + { + "epoch": 0.43552822211052306, + "grad_norm": 0.05689968168735504, + "learning_rate": 2.832828822412156e-05, + "loss": 0.0185, + "step": 58920 + }, + { + "epoch": 0.4356021406818249, + "grad_norm": 0.06802728027105331, + "learning_rate": 2.8324578584995254e-05, + "loss": 0.0172, + "step": 58930 + }, + { + "epoch": 0.43567605925312675, + "grad_norm": 0.07055903226137161, + "learning_rate": 2.8320868945868944e-05, + "loss": 0.0158, + "step": 58940 + }, + { + "epoch": 0.4357499778244286, + "grad_norm": 0.08107168227434158, + "learning_rate": 2.831715930674264e-05, + "loss": 0.0201, + "step": 58950 + }, + { + "epoch": 0.43582389639573044, + "grad_norm": 0.08352484554052353, + "learning_rate": 2.831344966761634e-05, + "loss": 0.02, + "step": 58960 + }, + { + "epoch": 0.43589781496703234, + "grad_norm": 0.08295414596796036, + "learning_rate": 2.8309740028490028e-05, + "loss": 0.0176, + "step": 58970 + }, + { + "epoch": 0.4359717335383342, + "grad_norm": 0.0799393355846405, + "learning_rate": 2.8306030389363724e-05, + "loss": 0.0171, + "step": 58980 + }, + { + "epoch": 0.43604565210963603, + "grad_norm": 0.060312747955322266, + "learning_rate": 2.830232075023742e-05, + "loss": 0.0172, + "step": 58990 + }, + { + "epoch": 0.4361195706809379, + "grad_norm": 0.11155330389738083, + "learning_rate": 2.8298611111111113e-05, + "loss": 0.0206, + "step": 59000 + }, + { + "epoch": 0.4361934892522397, + 
"grad_norm": 0.08497704565525055, + "learning_rate": 2.829490147198481e-05, + "loss": 0.0157, + "step": 59010 + }, + { + "epoch": 0.43626740782354156, + "grad_norm": 0.08191950619220734, + "learning_rate": 2.8291191832858498e-05, + "loss": 0.0186, + "step": 59020 + }, + { + "epoch": 0.43634132639484347, + "grad_norm": 0.09134622663259506, + "learning_rate": 2.8287482193732194e-05, + "loss": 0.0178, + "step": 59030 + }, + { + "epoch": 0.4364152449661453, + "grad_norm": 0.061829108744859695, + "learning_rate": 2.8283772554605893e-05, + "loss": 0.0176, + "step": 59040 + }, + { + "epoch": 0.43648916353744716, + "grad_norm": 0.07822591811418533, + "learning_rate": 2.8280062915479582e-05, + "loss": 0.0158, + "step": 59050 + }, + { + "epoch": 0.436563082108749, + "grad_norm": 0.07874293625354767, + "learning_rate": 2.827635327635328e-05, + "loss": 0.018, + "step": 59060 + }, + { + "epoch": 0.43663700068005085, + "grad_norm": 0.09312419593334198, + "learning_rate": 2.827264363722697e-05, + "loss": 0.0191, + "step": 59070 + }, + { + "epoch": 0.4367109192513527, + "grad_norm": 0.08162961900234222, + "learning_rate": 2.8268933998100667e-05, + "loss": 0.0188, + "step": 59080 + }, + { + "epoch": 0.43678483782265454, + "grad_norm": 0.06353994458913803, + "learning_rate": 2.8265224358974363e-05, + "loss": 0.0174, + "step": 59090 + }, + { + "epoch": 0.43685875639395644, + "grad_norm": 0.08950216323137283, + "learning_rate": 2.8261514719848052e-05, + "loss": 0.0191, + "step": 59100 + }, + { + "epoch": 0.4369326749652583, + "grad_norm": 0.06819460541009903, + "learning_rate": 2.825780508072175e-05, + "loss": 0.0155, + "step": 59110 + }, + { + "epoch": 0.43700659353656013, + "grad_norm": 0.09616361558437347, + "learning_rate": 2.825409544159544e-05, + "loss": 0.0176, + "step": 59120 + }, + { + "epoch": 0.437080512107862, + "grad_norm": 0.10181725025177002, + "learning_rate": 2.8250385802469137e-05, + "loss": 0.0205, + "step": 59130 + }, + { + "epoch": 0.4371544306791638, + "grad_norm": 0.12475231289863586, + "learning_rate": 2.8246676163342832e-05, + "loss": 0.0194, + "step": 59140 + }, + { + "epoch": 0.43722834925046566, + "grad_norm": 0.06839167326688766, + "learning_rate": 2.8242966524216525e-05, + "loss": 0.018, + "step": 59150 + }, + { + "epoch": 0.43730226782176757, + "grad_norm": 0.07144494354724884, + "learning_rate": 2.823925688509022e-05, + "loss": 0.0219, + "step": 59160 + }, + { + "epoch": 0.4373761863930694, + "grad_norm": 0.08841611444950104, + "learning_rate": 2.823554724596391e-05, + "loss": 0.0189, + "step": 59170 + }, + { + "epoch": 0.43745010496437126, + "grad_norm": 0.07900545001029968, + "learning_rate": 2.8231837606837606e-05, + "loss": 0.019, + "step": 59180 + }, + { + "epoch": 0.4375240235356731, + "grad_norm": 0.0752866268157959, + "learning_rate": 2.8228127967711306e-05, + "loss": 0.0168, + "step": 59190 + }, + { + "epoch": 0.43759794210697495, + "grad_norm": 0.08983873575925827, + "learning_rate": 2.8224418328584995e-05, + "loss": 0.0163, + "step": 59200 + }, + { + "epoch": 0.4376718606782768, + "grad_norm": 0.08169794827699661, + "learning_rate": 2.822070868945869e-05, + "loss": 0.0175, + "step": 59210 + }, + { + "epoch": 0.43774577924957864, + "grad_norm": 0.08061221241950989, + "learning_rate": 2.8216999050332387e-05, + "loss": 0.0197, + "step": 59220 + }, + { + "epoch": 0.43781969782088054, + "grad_norm": 0.0905076116323471, + "learning_rate": 2.821328941120608e-05, + "loss": 0.0185, + "step": 59230 + }, + { + "epoch": 0.4378936163921824, + "grad_norm": 0.07927828282117844, + 
"learning_rate": 2.8209579772079775e-05, + "loss": 0.0176, + "step": 59240 + }, + { + "epoch": 0.43796753496348423, + "grad_norm": 0.07110433280467987, + "learning_rate": 2.8205870132953464e-05, + "loss": 0.0181, + "step": 59250 + }, + { + "epoch": 0.4380414535347861, + "grad_norm": 0.08170510083436966, + "learning_rate": 2.8202160493827164e-05, + "loss": 0.0169, + "step": 59260 + }, + { + "epoch": 0.4381153721060879, + "grad_norm": 0.0884263664484024, + "learning_rate": 2.819845085470086e-05, + "loss": 0.0201, + "step": 59270 + }, + { + "epoch": 0.43818929067738976, + "grad_norm": 0.0725260078907013, + "learning_rate": 2.819474121557455e-05, + "loss": 0.0198, + "step": 59280 + }, + { + "epoch": 0.43826320924869167, + "grad_norm": 0.12197090685367584, + "learning_rate": 2.8191031576448245e-05, + "loss": 0.0175, + "step": 59290 + }, + { + "epoch": 0.4383371278199935, + "grad_norm": 0.09408222138881683, + "learning_rate": 2.8187321937321937e-05, + "loss": 0.0192, + "step": 59300 + }, + { + "epoch": 0.43841104639129536, + "grad_norm": 0.07606709003448486, + "learning_rate": 2.8183612298195633e-05, + "loss": 0.0166, + "step": 59310 + }, + { + "epoch": 0.4384849649625972, + "grad_norm": 0.12872089445590973, + "learning_rate": 2.817990265906933e-05, + "loss": 0.0203, + "step": 59320 + }, + { + "epoch": 0.43855888353389905, + "grad_norm": 0.07069522142410278, + "learning_rate": 2.817619301994302e-05, + "loss": 0.0201, + "step": 59330 + }, + { + "epoch": 0.4386328021052009, + "grad_norm": 0.10497741401195526, + "learning_rate": 2.8172483380816718e-05, + "loss": 0.0177, + "step": 59340 + }, + { + "epoch": 0.43870672067650274, + "grad_norm": 0.0735153928399086, + "learning_rate": 2.8168773741690407e-05, + "loss": 0.0193, + "step": 59350 + }, + { + "epoch": 0.43878063924780464, + "grad_norm": 0.06192095950245857, + "learning_rate": 2.8165064102564103e-05, + "loss": 0.0209, + "step": 59360 + }, + { + "epoch": 0.4388545578191065, + "grad_norm": 0.08346061408519745, + "learning_rate": 2.81613544634378e-05, + "loss": 0.0179, + "step": 59370 + }, + { + "epoch": 0.43892847639040833, + "grad_norm": 0.08042008429765701, + "learning_rate": 2.815764482431149e-05, + "loss": 0.0174, + "step": 59380 + }, + { + "epoch": 0.4390023949617102, + "grad_norm": 0.082971952855587, + "learning_rate": 2.8153935185185188e-05, + "loss": 0.0181, + "step": 59390 + }, + { + "epoch": 0.439076313533012, + "grad_norm": 0.08295343071222305, + "learning_rate": 2.8150225546058877e-05, + "loss": 0.019, + "step": 59400 + }, + { + "epoch": 0.43915023210431386, + "grad_norm": 0.07845450192689896, + "learning_rate": 2.8146515906932576e-05, + "loss": 0.0181, + "step": 59410 + }, + { + "epoch": 0.43922415067561577, + "grad_norm": 0.059175584465265274, + "learning_rate": 2.8142806267806272e-05, + "loss": 0.0168, + "step": 59420 + }, + { + "epoch": 0.4392980692469176, + "grad_norm": 0.07663719356060028, + "learning_rate": 2.813909662867996e-05, + "loss": 0.0161, + "step": 59430 + }, + { + "epoch": 0.43937198781821946, + "grad_norm": 0.06469923257827759, + "learning_rate": 2.8135386989553657e-05, + "loss": 0.0193, + "step": 59440 + }, + { + "epoch": 0.4394459063895213, + "grad_norm": 0.09643541276454926, + "learning_rate": 2.8131677350427353e-05, + "loss": 0.0197, + "step": 59450 + }, + { + "epoch": 0.43951982496082315, + "grad_norm": 0.08100558072328568, + "learning_rate": 2.8127967711301046e-05, + "loss": 0.0173, + "step": 59460 + }, + { + "epoch": 0.439593743532125, + "grad_norm": 0.08516040444374084, + "learning_rate": 
2.8124258072174742e-05, + "loss": 0.0199, + "step": 59470 + }, + { + "epoch": 0.43966766210342684, + "grad_norm": 0.07431662082672119, + "learning_rate": 2.812054843304843e-05, + "loss": 0.0192, + "step": 59480 + }, + { + "epoch": 0.43974158067472874, + "grad_norm": 0.06716176122426987, + "learning_rate": 2.811683879392213e-05, + "loss": 0.0189, + "step": 59490 + }, + { + "epoch": 0.4398154992460306, + "grad_norm": 0.11926719546318054, + "learning_rate": 2.8113129154795826e-05, + "loss": 0.0174, + "step": 59500 + }, + { + "epoch": 0.43988941781733243, + "grad_norm": 0.07346758246421814, + "learning_rate": 2.8109419515669516e-05, + "loss": 0.0171, + "step": 59510 + }, + { + "epoch": 0.4399633363886343, + "grad_norm": 0.09154437482357025, + "learning_rate": 2.810570987654321e-05, + "loss": 0.0183, + "step": 59520 + }, + { + "epoch": 0.4400372549599361, + "grad_norm": 0.08122612535953522, + "learning_rate": 2.8102000237416904e-05, + "loss": 0.018, + "step": 59530 + }, + { + "epoch": 0.44011117353123796, + "grad_norm": 0.10390602797269821, + "learning_rate": 2.80982905982906e-05, + "loss": 0.0192, + "step": 59540 + }, + { + "epoch": 0.44018509210253987, + "grad_norm": 0.08174613118171692, + "learning_rate": 2.8094580959164296e-05, + "loss": 0.0185, + "step": 59550 + }, + { + "epoch": 0.4402590106738417, + "grad_norm": 0.09750137478113174, + "learning_rate": 2.809087132003799e-05, + "loss": 0.0193, + "step": 59560 + }, + { + "epoch": 0.44033292924514356, + "grad_norm": 0.08275096863508224, + "learning_rate": 2.8087161680911685e-05, + "loss": 0.0187, + "step": 59570 + }, + { + "epoch": 0.4404068478164454, + "grad_norm": 0.07879745960235596, + "learning_rate": 2.8083452041785374e-05, + "loss": 0.0193, + "step": 59580 + }, + { + "epoch": 0.44048076638774725, + "grad_norm": 0.08212180435657501, + "learning_rate": 2.807974240265907e-05, + "loss": 0.0212, + "step": 59590 + }, + { + "epoch": 0.4405546849590491, + "grad_norm": 0.09315769374370575, + "learning_rate": 2.8076032763532766e-05, + "loss": 0.0193, + "step": 59600 + }, + { + "epoch": 0.440628603530351, + "grad_norm": 0.0979996994137764, + "learning_rate": 2.8072323124406458e-05, + "loss": 0.0167, + "step": 59610 + }, + { + "epoch": 0.44070252210165284, + "grad_norm": 0.08392339199781418, + "learning_rate": 2.8068613485280154e-05, + "loss": 0.0184, + "step": 59620 + }, + { + "epoch": 0.4407764406729547, + "grad_norm": 0.07921017706394196, + "learning_rate": 2.8064903846153843e-05, + "loss": 0.0151, + "step": 59630 + }, + { + "epoch": 0.44085035924425653, + "grad_norm": 0.08482234179973602, + "learning_rate": 2.8061194207027543e-05, + "loss": 0.0195, + "step": 59640 + }, + { + "epoch": 0.4409242778155584, + "grad_norm": 0.057127732783555984, + "learning_rate": 2.805748456790124e-05, + "loss": 0.0194, + "step": 59650 + }, + { + "epoch": 0.4409981963868602, + "grad_norm": 0.08310925215482712, + "learning_rate": 2.8053774928774928e-05, + "loss": 0.0181, + "step": 59660 + }, + { + "epoch": 0.44107211495816206, + "grad_norm": 0.16174446046352386, + "learning_rate": 2.8050065289648624e-05, + "loss": 0.0172, + "step": 59670 + }, + { + "epoch": 0.44114603352946397, + "grad_norm": 0.09357830137014389, + "learning_rate": 2.804635565052232e-05, + "loss": 0.0161, + "step": 59680 + }, + { + "epoch": 0.4412199521007658, + "grad_norm": 0.07025822252035141, + "learning_rate": 2.8042646011396012e-05, + "loss": 0.018, + "step": 59690 + }, + { + "epoch": 0.44129387067206766, + "grad_norm": 0.06780959665775299, + "learning_rate": 2.803893637226971e-05, + "loss": 
0.0158, + "step": 59700 + }, + { + "epoch": 0.4413677892433695, + "grad_norm": 0.0780034065246582, + "learning_rate": 2.80352267331434e-05, + "loss": 0.0178, + "step": 59710 + }, + { + "epoch": 0.44144170781467135, + "grad_norm": 0.1173756867647171, + "learning_rate": 2.8031517094017097e-05, + "loss": 0.0183, + "step": 59720 + }, + { + "epoch": 0.4415156263859732, + "grad_norm": 0.07703826576471329, + "learning_rate": 2.8027807454890793e-05, + "loss": 0.0184, + "step": 59730 + }, + { + "epoch": 0.4415895449572751, + "grad_norm": 0.07214858382940292, + "learning_rate": 2.8024097815764482e-05, + "loss": 0.0185, + "step": 59740 + }, + { + "epoch": 0.44166346352857694, + "grad_norm": 0.06699478626251221, + "learning_rate": 2.8020388176638178e-05, + "loss": 0.018, + "step": 59750 + }, + { + "epoch": 0.4417373820998788, + "grad_norm": 0.08458127826452255, + "learning_rate": 2.801667853751187e-05, + "loss": 0.017, + "step": 59760 + }, + { + "epoch": 0.44181130067118063, + "grad_norm": 0.05389302968978882, + "learning_rate": 2.8012968898385567e-05, + "loss": 0.0171, + "step": 59770 + }, + { + "epoch": 0.4418852192424825, + "grad_norm": 0.07580448687076569, + "learning_rate": 2.8009259259259263e-05, + "loss": 0.0162, + "step": 59780 + }, + { + "epoch": 0.4419591378137843, + "grad_norm": 0.06884662061929703, + "learning_rate": 2.8005549620132955e-05, + "loss": 0.0185, + "step": 59790 + }, + { + "epoch": 0.44203305638508616, + "grad_norm": 0.07314516603946686, + "learning_rate": 2.800183998100665e-05, + "loss": 0.0198, + "step": 59800 + }, + { + "epoch": 0.44210697495638807, + "grad_norm": 0.062169380486011505, + "learning_rate": 2.799813034188034e-05, + "loss": 0.0175, + "step": 59810 + }, + { + "epoch": 0.4421808935276899, + "grad_norm": 0.09424956887960434, + "learning_rate": 2.7994420702754036e-05, + "loss": 0.0192, + "step": 59820 + }, + { + "epoch": 0.44225481209899176, + "grad_norm": 0.08688298612833023, + "learning_rate": 2.7990711063627732e-05, + "loss": 0.0183, + "step": 59830 + }, + { + "epoch": 0.4423287306702936, + "grad_norm": 0.07653351128101349, + "learning_rate": 2.7987001424501425e-05, + "loss": 0.0177, + "step": 59840 + }, + { + "epoch": 0.44240264924159545, + "grad_norm": 0.08118182420730591, + "learning_rate": 2.798329178537512e-05, + "loss": 0.0196, + "step": 59850 + }, + { + "epoch": 0.4424765678128973, + "grad_norm": 0.1382313370704651, + "learning_rate": 2.7979582146248813e-05, + "loss": 0.0182, + "step": 59860 + }, + { + "epoch": 0.4425504863841992, + "grad_norm": 0.09287630766630173, + "learning_rate": 2.797587250712251e-05, + "loss": 0.0188, + "step": 59870 + }, + { + "epoch": 0.44262440495550104, + "grad_norm": 0.1005270704627037, + "learning_rate": 2.7972162867996205e-05, + "loss": 0.0186, + "step": 59880 + }, + { + "epoch": 0.4426983235268029, + "grad_norm": 0.0674884170293808, + "learning_rate": 2.7968453228869894e-05, + "loss": 0.0191, + "step": 59890 + }, + { + "epoch": 0.44277224209810473, + "grad_norm": 0.12094124406576157, + "learning_rate": 2.796474358974359e-05, + "loss": 0.0178, + "step": 59900 + }, + { + "epoch": 0.4428461606694066, + "grad_norm": 0.06646429002285004, + "learning_rate": 2.7961033950617286e-05, + "loss": 0.0171, + "step": 59910 + }, + { + "epoch": 0.4429200792407084, + "grad_norm": 0.06779291480779648, + "learning_rate": 2.795732431149098e-05, + "loss": 0.0179, + "step": 59920 + }, + { + "epoch": 0.44299399781201027, + "grad_norm": 0.09634333848953247, + "learning_rate": 2.7953614672364675e-05, + "loss": 0.0187, + "step": 59930 + }, + { + 
"epoch": 0.44306791638331217, + "grad_norm": 0.06265617907047272, + "learning_rate": 2.7949905033238368e-05, + "loss": 0.0177, + "step": 59940 + }, + { + "epoch": 0.443141834954614, + "grad_norm": 0.06748286634683609, + "learning_rate": 2.7946195394112064e-05, + "loss": 0.0179, + "step": 59950 + }, + { + "epoch": 0.44321575352591586, + "grad_norm": 0.10775701701641083, + "learning_rate": 2.794248575498576e-05, + "loss": 0.0165, + "step": 59960 + }, + { + "epoch": 0.4432896720972177, + "grad_norm": 0.06812064349651337, + "learning_rate": 2.793877611585945e-05, + "loss": 0.0193, + "step": 59970 + }, + { + "epoch": 0.44336359066851955, + "grad_norm": 0.10246866941452026, + "learning_rate": 2.7935066476733145e-05, + "loss": 0.0176, + "step": 59980 + }, + { + "epoch": 0.4434375092398214, + "grad_norm": 0.0841251090168953, + "learning_rate": 2.7931356837606837e-05, + "loss": 0.019, + "step": 59990 + }, + { + "epoch": 0.4435114278111233, + "grad_norm": 0.07392372936010361, + "learning_rate": 2.7927647198480533e-05, + "loss": 0.0177, + "step": 60000 + }, + { + "epoch": 0.4435114278111233, + "eval_f1": 0.6183157522063305, + "eval_loss": 0.01798221655189991, + "eval_precision": 0.49327030049358794, + "eval_recall": 0.8282895570479518, + "eval_runtime": 2668.5091, + "eval_samples_per_second": 202.785, + "eval_steps_per_second": 3.169, + "step": 60000 + }, + { + "epoch": 0.44358534638242514, + "grad_norm": 0.08475377410650253, + "learning_rate": 2.792393755935423e-05, + "loss": 0.0168, + "step": 60010 + }, + { + "epoch": 0.443659264953727, + "grad_norm": 0.06998006999492645, + "learning_rate": 2.7920227920227922e-05, + "loss": 0.0165, + "step": 60020 + }, + { + "epoch": 0.44373318352502883, + "grad_norm": 0.0669107660651207, + "learning_rate": 2.7916518281101618e-05, + "loss": 0.0149, + "step": 60030 + }, + { + "epoch": 0.4438071020963307, + "grad_norm": 0.09289882332086563, + "learning_rate": 2.7912808641975307e-05, + "loss": 0.0174, + "step": 60040 + }, + { + "epoch": 0.4438810206676325, + "grad_norm": 0.07927216589450836, + "learning_rate": 2.7909099002849003e-05, + "loss": 0.0181, + "step": 60050 + }, + { + "epoch": 0.44395493923893437, + "grad_norm": 0.07733915001153946, + "learning_rate": 2.79053893637227e-05, + "loss": 0.0214, + "step": 60060 + }, + { + "epoch": 0.44402885781023627, + "grad_norm": 0.06538470089435577, + "learning_rate": 2.790167972459639e-05, + "loss": 0.0154, + "step": 60070 + }, + { + "epoch": 0.4441027763815381, + "grad_norm": 0.08921214938163757, + "learning_rate": 2.7897970085470087e-05, + "loss": 0.0208, + "step": 60080 + }, + { + "epoch": 0.44417669495283996, + "grad_norm": 0.09106167405843735, + "learning_rate": 2.789426044634378e-05, + "loss": 0.0161, + "step": 60090 + }, + { + "epoch": 0.4442506135241418, + "grad_norm": 0.05333057418465614, + "learning_rate": 2.7890550807217476e-05, + "loss": 0.0163, + "step": 60100 + }, + { + "epoch": 0.44432453209544365, + "grad_norm": 0.08506513386964798, + "learning_rate": 2.7886841168091172e-05, + "loss": 0.0187, + "step": 60110 + }, + { + "epoch": 0.4443984506667455, + "grad_norm": 0.06936834007501602, + "learning_rate": 2.788313152896486e-05, + "loss": 0.0166, + "step": 60120 + }, + { + "epoch": 0.4444723692380474, + "grad_norm": 0.07555869221687317, + "learning_rate": 2.7879421889838557e-05, + "loss": 0.0149, + "step": 60130 + }, + { + "epoch": 0.44454628780934924, + "grad_norm": 0.08665190637111664, + "learning_rate": 2.7875712250712256e-05, + "loss": 0.0189, + "step": 60140 + }, + { + "epoch": 0.4446202063806511, + 
"grad_norm": 0.08473851531744003, + "learning_rate": 2.7872002611585946e-05, + "loss": 0.0182, + "step": 60150 + }, + { + "epoch": 0.44469412495195293, + "grad_norm": 0.09297851473093033, + "learning_rate": 2.786829297245964e-05, + "loss": 0.0209, + "step": 60160 + }, + { + "epoch": 0.4447680435232548, + "grad_norm": 0.08520183712244034, + "learning_rate": 2.7864583333333334e-05, + "loss": 0.0198, + "step": 60170 + }, + { + "epoch": 0.4448419620945566, + "grad_norm": 0.0894201397895813, + "learning_rate": 2.786087369420703e-05, + "loss": 0.0202, + "step": 60180 + }, + { + "epoch": 0.44491588066585847, + "grad_norm": 0.063407301902771, + "learning_rate": 2.7857164055080726e-05, + "loss": 0.0199, + "step": 60190 + }, + { + "epoch": 0.44498979923716037, + "grad_norm": 0.08296534419059753, + "learning_rate": 2.7853454415954415e-05, + "loss": 0.0185, + "step": 60200 + }, + { + "epoch": 0.4450637178084622, + "grad_norm": 0.09972469508647919, + "learning_rate": 2.784974477682811e-05, + "loss": 0.0194, + "step": 60210 + }, + { + "epoch": 0.44513763637976406, + "grad_norm": 0.07479657977819443, + "learning_rate": 2.7846035137701804e-05, + "loss": 0.0213, + "step": 60220 + }, + { + "epoch": 0.4452115549510659, + "grad_norm": 0.09490302205085754, + "learning_rate": 2.78423254985755e-05, + "loss": 0.0187, + "step": 60230 + }, + { + "epoch": 0.44528547352236775, + "grad_norm": 0.08586356043815613, + "learning_rate": 2.7838615859449196e-05, + "loss": 0.0185, + "step": 60240 + }, + { + "epoch": 0.4453593920936696, + "grad_norm": 0.0650666207075119, + "learning_rate": 2.783490622032289e-05, + "loss": 0.0192, + "step": 60250 + }, + { + "epoch": 0.4454333106649715, + "grad_norm": 0.05304092913866043, + "learning_rate": 2.7831196581196584e-05, + "loss": 0.0174, + "step": 60260 + }, + { + "epoch": 0.44550722923627334, + "grad_norm": 0.07560031116008759, + "learning_rate": 2.7827486942070273e-05, + "loss": 0.0176, + "step": 60270 + }, + { + "epoch": 0.4455811478075752, + "grad_norm": 0.05344974994659424, + "learning_rate": 2.782377730294397e-05, + "loss": 0.0177, + "step": 60280 + }, + { + "epoch": 0.44565506637887703, + "grad_norm": 0.07482201606035233, + "learning_rate": 2.782006766381767e-05, + "loss": 0.0181, + "step": 60290 + }, + { + "epoch": 0.4457289849501789, + "grad_norm": 0.06870289891958237, + "learning_rate": 2.7816358024691358e-05, + "loss": 0.0186, + "step": 60300 + }, + { + "epoch": 0.4458029035214807, + "grad_norm": 0.08564918488264084, + "learning_rate": 2.7812648385565054e-05, + "loss": 0.0181, + "step": 60310 + }, + { + "epoch": 0.44587682209278257, + "grad_norm": 0.058514028787612915, + "learning_rate": 2.7808938746438747e-05, + "loss": 0.0166, + "step": 60320 + }, + { + "epoch": 0.44595074066408447, + "grad_norm": 0.07964633405208588, + "learning_rate": 2.7805229107312443e-05, + "loss": 0.0192, + "step": 60330 + }, + { + "epoch": 0.4460246592353863, + "grad_norm": 0.07813328504562378, + "learning_rate": 2.780151946818614e-05, + "loss": 0.0186, + "step": 60340 + }, + { + "epoch": 0.44609857780668816, + "grad_norm": 0.10391835123300552, + "learning_rate": 2.7797809829059828e-05, + "loss": 0.0191, + "step": 60350 + }, + { + "epoch": 0.44617249637799, + "grad_norm": 0.0750124379992485, + "learning_rate": 2.7794100189933524e-05, + "loss": 0.0192, + "step": 60360 + }, + { + "epoch": 0.44624641494929185, + "grad_norm": 0.07106636464595795, + "learning_rate": 2.7790390550807223e-05, + "loss": 0.0184, + "step": 60370 + }, + { + "epoch": 0.4463203335205937, + "grad_norm": 0.10053660720586777, + 
"learning_rate": 2.7786680911680912e-05, + "loss": 0.0193, + "step": 60380 + }, + { + "epoch": 0.4463942520918956, + "grad_norm": 0.07444123178720474, + "learning_rate": 2.7782971272554608e-05, + "loss": 0.0186, + "step": 60390 + }, + { + "epoch": 0.44646817066319744, + "grad_norm": 0.07024513185024261, + "learning_rate": 2.77792616334283e-05, + "loss": 0.0198, + "step": 60400 + }, + { + "epoch": 0.4465420892344993, + "grad_norm": 0.07613866776227951, + "learning_rate": 2.7775551994301997e-05, + "loss": 0.0172, + "step": 60410 + }, + { + "epoch": 0.44661600780580113, + "grad_norm": 0.08213798701763153, + "learning_rate": 2.7771842355175693e-05, + "loss": 0.0188, + "step": 60420 + }, + { + "epoch": 0.446689926377103, + "grad_norm": 0.10978935658931732, + "learning_rate": 2.7768132716049382e-05, + "loss": 0.0203, + "step": 60430 + }, + { + "epoch": 0.4467638449484048, + "grad_norm": 0.07833638787269592, + "learning_rate": 2.776442307692308e-05, + "loss": 0.0182, + "step": 60440 + }, + { + "epoch": 0.44683776351970667, + "grad_norm": 0.07589908689260483, + "learning_rate": 2.776071343779677e-05, + "loss": 0.0194, + "step": 60450 + }, + { + "epoch": 0.44691168209100857, + "grad_norm": 0.09376027435064316, + "learning_rate": 2.7757003798670466e-05, + "loss": 0.0205, + "step": 60460 + }, + { + "epoch": 0.4469856006623104, + "grad_norm": 0.09234382212162018, + "learning_rate": 2.7753294159544162e-05, + "loss": 0.0177, + "step": 60470 + }, + { + "epoch": 0.44705951923361226, + "grad_norm": 0.09196515381336212, + "learning_rate": 2.7749584520417855e-05, + "loss": 0.0183, + "step": 60480 + }, + { + "epoch": 0.4471334378049141, + "grad_norm": 0.07494409382343292, + "learning_rate": 2.774587488129155e-05, + "loss": 0.0164, + "step": 60490 + }, + { + "epoch": 0.44720735637621595, + "grad_norm": 0.07437138259410858, + "learning_rate": 2.774216524216524e-05, + "loss": 0.019, + "step": 60500 + }, + { + "epoch": 0.4472812749475178, + "grad_norm": 0.06940478831529617, + "learning_rate": 2.7738455603038936e-05, + "loss": 0.0193, + "step": 60510 + }, + { + "epoch": 0.4473551935188197, + "grad_norm": 0.08766179531812668, + "learning_rate": 2.7734745963912635e-05, + "loss": 0.017, + "step": 60520 + }, + { + "epoch": 0.44742911209012154, + "grad_norm": 0.08161088824272156, + "learning_rate": 2.7731036324786325e-05, + "loss": 0.0186, + "step": 60530 + }, + { + "epoch": 0.4475030306614234, + "grad_norm": 0.08334532380104065, + "learning_rate": 2.772732668566002e-05, + "loss": 0.0181, + "step": 60540 + }, + { + "epoch": 0.44757694923272523, + "grad_norm": 0.08141965419054031, + "learning_rate": 2.7723617046533713e-05, + "loss": 0.0157, + "step": 60550 + }, + { + "epoch": 0.4476508678040271, + "grad_norm": 0.07597937434911728, + "learning_rate": 2.771990740740741e-05, + "loss": 0.0175, + "step": 60560 + }, + { + "epoch": 0.4477247863753289, + "grad_norm": 0.09826496243476868, + "learning_rate": 2.7716197768281105e-05, + "loss": 0.0191, + "step": 60570 + }, + { + "epoch": 0.44779870494663077, + "grad_norm": 0.07023176550865173, + "learning_rate": 2.7712488129154794e-05, + "loss": 0.0184, + "step": 60580 + }, + { + "epoch": 0.44787262351793267, + "grad_norm": 0.07832328975200653, + "learning_rate": 2.7708778490028494e-05, + "loss": 0.0178, + "step": 60590 + }, + { + "epoch": 0.4479465420892345, + "grad_norm": 0.11813930422067642, + "learning_rate": 2.770506885090219e-05, + "loss": 0.0174, + "step": 60600 + }, + { + "epoch": 0.44802046066053636, + "grad_norm": 0.08903170377016068, + "learning_rate": 
2.770135921177588e-05, + "loss": 0.0183, + "step": 60610 + }, + { + "epoch": 0.4480943792318382, + "grad_norm": 0.10360957682132721, + "learning_rate": 2.7697649572649575e-05, + "loss": 0.0189, + "step": 60620 + }, + { + "epoch": 0.44816829780314005, + "grad_norm": 0.060528405010700226, + "learning_rate": 2.7693939933523267e-05, + "loss": 0.019, + "step": 60630 + }, + { + "epoch": 0.4482422163744419, + "grad_norm": 0.07136612385511398, + "learning_rate": 2.7690230294396963e-05, + "loss": 0.0183, + "step": 60640 + }, + { + "epoch": 0.4483161349457438, + "grad_norm": 0.08754134178161621, + "learning_rate": 2.768652065527066e-05, + "loss": 0.0182, + "step": 60650 + }, + { + "epoch": 0.44839005351704564, + "grad_norm": 0.08291976898908615, + "learning_rate": 2.768281101614435e-05, + "loss": 0.0181, + "step": 60660 + }, + { + "epoch": 0.4484639720883475, + "grad_norm": 0.0794183537364006, + "learning_rate": 2.7679101377018048e-05, + "loss": 0.0184, + "step": 60670 + }, + { + "epoch": 0.44853789065964933, + "grad_norm": 0.05834812670946121, + "learning_rate": 2.7675391737891737e-05, + "loss": 0.0184, + "step": 60680 + }, + { + "epoch": 0.4486118092309512, + "grad_norm": 0.06786110252141953, + "learning_rate": 2.7671682098765433e-05, + "loss": 0.0157, + "step": 60690 + }, + { + "epoch": 0.448685727802253, + "grad_norm": 0.06590061634778976, + "learning_rate": 2.766797245963913e-05, + "loss": 0.0156, + "step": 60700 + }, + { + "epoch": 0.44875964637355487, + "grad_norm": 0.07546421140432358, + "learning_rate": 2.766426282051282e-05, + "loss": 0.0176, + "step": 60710 + }, + { + "epoch": 0.44883356494485677, + "grad_norm": 0.06898737698793411, + "learning_rate": 2.7660553181386517e-05, + "loss": 0.0175, + "step": 60720 + }, + { + "epoch": 0.4489074835161586, + "grad_norm": 0.08143886923789978, + "learning_rate": 2.7656843542260207e-05, + "loss": 0.0187, + "step": 60730 + }, + { + "epoch": 0.44898140208746046, + "grad_norm": 0.08089787513017654, + "learning_rate": 2.7653133903133903e-05, + "loss": 0.0178, + "step": 60740 + }, + { + "epoch": 0.4490553206587623, + "grad_norm": 0.08173342049121857, + "learning_rate": 2.7649424264007602e-05, + "loss": 0.0174, + "step": 60750 + }, + { + "epoch": 0.44912923923006415, + "grad_norm": 0.07083459198474884, + "learning_rate": 2.764571462488129e-05, + "loss": 0.0176, + "step": 60760 + }, + { + "epoch": 0.449203157801366, + "grad_norm": 0.08658960461616516, + "learning_rate": 2.7642004985754987e-05, + "loss": 0.0172, + "step": 60770 + }, + { + "epoch": 0.4492770763726679, + "grad_norm": 0.1044207438826561, + "learning_rate": 2.763829534662868e-05, + "loss": 0.0182, + "step": 60780 + }, + { + "epoch": 0.44935099494396974, + "grad_norm": 0.08168955147266388, + "learning_rate": 2.7634585707502376e-05, + "loss": 0.0188, + "step": 60790 + }, + { + "epoch": 0.4494249135152716, + "grad_norm": 0.07407943159341812, + "learning_rate": 2.763087606837607e-05, + "loss": 0.0175, + "step": 60800 + }, + { + "epoch": 0.44949883208657343, + "grad_norm": 0.09754371643066406, + "learning_rate": 2.762716642924976e-05, + "loss": 0.0186, + "step": 60810 + }, + { + "epoch": 0.4495727506578753, + "grad_norm": 0.13254565000534058, + "learning_rate": 2.762345679012346e-05, + "loss": 0.0203, + "step": 60820 + }, + { + "epoch": 0.4496466692291771, + "grad_norm": 0.0885326936841011, + "learning_rate": 2.7619747150997156e-05, + "loss": 0.018, + "step": 60830 + }, + { + "epoch": 0.44972058780047897, + "grad_norm": 0.08535313606262207, + "learning_rate": 2.7616037511870845e-05, + "loss": 0.0192, 
+ "step": 60840 + }, + { + "epoch": 0.44979450637178087, + "grad_norm": 0.08815433084964752, + "learning_rate": 2.761232787274454e-05, + "loss": 0.0198, + "step": 60850 + }, + { + "epoch": 0.4498684249430827, + "grad_norm": 0.16081440448760986, + "learning_rate": 2.7608618233618234e-05, + "loss": 0.0175, + "step": 60860 + }, + { + "epoch": 0.44994234351438456, + "grad_norm": 0.06050868704915047, + "learning_rate": 2.760490859449193e-05, + "loss": 0.018, + "step": 60870 + }, + { + "epoch": 0.4500162620856864, + "grad_norm": 0.08731406927108765, + "learning_rate": 2.7601198955365626e-05, + "loss": 0.0209, + "step": 60880 + }, + { + "epoch": 0.45009018065698825, + "grad_norm": 0.08373596519231796, + "learning_rate": 2.7597489316239315e-05, + "loss": 0.0179, + "step": 60890 + }, + { + "epoch": 0.4501640992282901, + "grad_norm": 0.10616223514080048, + "learning_rate": 2.7593779677113014e-05, + "loss": 0.0177, + "step": 60900 + }, + { + "epoch": 0.450238017799592, + "grad_norm": 0.06489362567663193, + "learning_rate": 2.7590070037986704e-05, + "loss": 0.0174, + "step": 60910 + }, + { + "epoch": 0.45031193637089384, + "grad_norm": 0.07929858565330505, + "learning_rate": 2.75863603988604e-05, + "loss": 0.02, + "step": 60920 + }, + { + "epoch": 0.4503858549421957, + "grad_norm": 0.08207009732723236, + "learning_rate": 2.7582650759734095e-05, + "loss": 0.0182, + "step": 60930 + }, + { + "epoch": 0.45045977351349753, + "grad_norm": 0.06570993363857269, + "learning_rate": 2.7578941120607788e-05, + "loss": 0.0176, + "step": 60940 + }, + { + "epoch": 0.4505336920847994, + "grad_norm": 0.08798123151063919, + "learning_rate": 2.7575231481481484e-05, + "loss": 0.0155, + "step": 60950 + }, + { + "epoch": 0.4506076106561012, + "grad_norm": 0.09389277547597885, + "learning_rate": 2.7571521842355173e-05, + "loss": 0.0168, + "step": 60960 + }, + { + "epoch": 0.45068152922740307, + "grad_norm": 0.05513633042573929, + "learning_rate": 2.7567812203228873e-05, + "loss": 0.0163, + "step": 60970 + }, + { + "epoch": 0.45075544779870497, + "grad_norm": 0.0851559042930603, + "learning_rate": 2.756410256410257e-05, + "loss": 0.0177, + "step": 60980 + }, + { + "epoch": 0.4508293663700068, + "grad_norm": 0.08380331844091415, + "learning_rate": 2.7560392924976258e-05, + "loss": 0.0181, + "step": 60990 + }, + { + "epoch": 0.45090328494130866, + "grad_norm": 0.08465258032083511, + "learning_rate": 2.7556683285849954e-05, + "loss": 0.0178, + "step": 61000 + }, + { + "epoch": 0.4509772035126105, + "grad_norm": 0.07717887312173843, + "learning_rate": 2.7552973646723646e-05, + "loss": 0.0176, + "step": 61010 + }, + { + "epoch": 0.45105112208391235, + "grad_norm": 0.10460257530212402, + "learning_rate": 2.7549264007597342e-05, + "loss": 0.0191, + "step": 61020 + }, + { + "epoch": 0.4511250406552142, + "grad_norm": 0.08913242071866989, + "learning_rate": 2.7545554368471038e-05, + "loss": 0.0194, + "step": 61030 + }, + { + "epoch": 0.4511989592265161, + "grad_norm": 0.059481166303157806, + "learning_rate": 2.7541844729344727e-05, + "loss": 0.0172, + "step": 61040 + }, + { + "epoch": 0.45127287779781794, + "grad_norm": 0.0636981949210167, + "learning_rate": 2.7538135090218427e-05, + "loss": 0.0181, + "step": 61050 + }, + { + "epoch": 0.4513467963691198, + "grad_norm": 0.08157453685998917, + "learning_rate": 2.7534425451092123e-05, + "loss": 0.0193, + "step": 61060 + }, + { + "epoch": 0.45142071494042163, + "grad_norm": 0.08309981226921082, + "learning_rate": 2.7530715811965812e-05, + "loss": 0.0169, + "step": 61070 + }, + { + 
"epoch": 0.4514946335117235, + "grad_norm": 0.08034002035856247, + "learning_rate": 2.7527006172839508e-05, + "loss": 0.0189, + "step": 61080 + }, + { + "epoch": 0.4515685520830253, + "grad_norm": 0.08259232342243195, + "learning_rate": 2.75232965337132e-05, + "loss": 0.0171, + "step": 61090 + }, + { + "epoch": 0.45164247065432717, + "grad_norm": 0.10203670710325241, + "learning_rate": 2.7519586894586896e-05, + "loss": 0.0193, + "step": 61100 + }, + { + "epoch": 0.45171638922562907, + "grad_norm": 0.08340781182050705, + "learning_rate": 2.7515877255460592e-05, + "loss": 0.0179, + "step": 61110 + }, + { + "epoch": 0.4517903077969309, + "grad_norm": 0.11332692950963974, + "learning_rate": 2.7512167616334285e-05, + "loss": 0.0221, + "step": 61120 + }, + { + "epoch": 0.45186422636823276, + "grad_norm": 0.07806441187858582, + "learning_rate": 2.750845797720798e-05, + "loss": 0.0176, + "step": 61130 + }, + { + "epoch": 0.4519381449395346, + "grad_norm": 0.0680718943476677, + "learning_rate": 2.750474833808167e-05, + "loss": 0.0173, + "step": 61140 + }, + { + "epoch": 0.45201206351083645, + "grad_norm": 0.07029244303703308, + "learning_rate": 2.7501038698955366e-05, + "loss": 0.0149, + "step": 61150 + }, + { + "epoch": 0.4520859820821383, + "grad_norm": 0.08795084804296494, + "learning_rate": 2.7497329059829062e-05, + "loss": 0.0221, + "step": 61160 + }, + { + "epoch": 0.4521599006534402, + "grad_norm": 0.08229811489582062, + "learning_rate": 2.7493619420702755e-05, + "loss": 0.0172, + "step": 61170 + }, + { + "epoch": 0.45223381922474204, + "grad_norm": 0.07843748480081558, + "learning_rate": 2.748990978157645e-05, + "loss": 0.0183, + "step": 61180 + }, + { + "epoch": 0.4523077377960439, + "grad_norm": 0.11385442316532135, + "learning_rate": 2.748620014245014e-05, + "loss": 0.0184, + "step": 61190 + }, + { + "epoch": 0.45238165636734573, + "grad_norm": 0.08859772235155106, + "learning_rate": 2.748249050332384e-05, + "loss": 0.0185, + "step": 61200 + }, + { + "epoch": 0.4524555749386476, + "grad_norm": 0.07936745882034302, + "learning_rate": 2.7478780864197535e-05, + "loss": 0.0178, + "step": 61210 + }, + { + "epoch": 0.4525294935099494, + "grad_norm": 0.07612825930118561, + "learning_rate": 2.7475071225071224e-05, + "loss": 0.0189, + "step": 61220 + }, + { + "epoch": 0.45260341208125127, + "grad_norm": 0.0908486470580101, + "learning_rate": 2.747136158594492e-05, + "loss": 0.0182, + "step": 61230 + }, + { + "epoch": 0.45267733065255317, + "grad_norm": 0.059742387384176254, + "learning_rate": 2.7467651946818613e-05, + "loss": 0.0187, + "step": 61240 + }, + { + "epoch": 0.452751249223855, + "grad_norm": 0.08679183572530746, + "learning_rate": 2.746394230769231e-05, + "loss": 0.0179, + "step": 61250 + }, + { + "epoch": 0.45282516779515686, + "grad_norm": 0.08411939442157745, + "learning_rate": 2.7460232668566005e-05, + "loss": 0.0185, + "step": 61260 + }, + { + "epoch": 0.4528990863664587, + "grad_norm": 0.08937957137823105, + "learning_rate": 2.7456523029439697e-05, + "loss": 0.0188, + "step": 61270 + }, + { + "epoch": 0.45297300493776055, + "grad_norm": 0.0620824359357357, + "learning_rate": 2.7452813390313393e-05, + "loss": 0.0191, + "step": 61280 + }, + { + "epoch": 0.4530469235090624, + "grad_norm": 0.06696116924285889, + "learning_rate": 2.744910375118709e-05, + "loss": 0.0188, + "step": 61290 + }, + { + "epoch": 0.4531208420803643, + "grad_norm": 0.08053241670131683, + "learning_rate": 2.744539411206078e-05, + "loss": 0.0159, + "step": 61300 + }, + { + "epoch": 0.45319476065166614, + 
"grad_norm": 0.07864797115325928, + "learning_rate": 2.7441684472934474e-05, + "loss": 0.0171, + "step": 61310 + }, + { + "epoch": 0.453268679222968, + "grad_norm": 0.08154179155826569, + "learning_rate": 2.7437974833808167e-05, + "loss": 0.0181, + "step": 61320 + }, + { + "epoch": 0.45334259779426983, + "grad_norm": 0.07514984160661697, + "learning_rate": 2.7434265194681863e-05, + "loss": 0.0186, + "step": 61330 + }, + { + "epoch": 0.4534165163655717, + "grad_norm": 0.07200212776660919, + "learning_rate": 2.743055555555556e-05, + "loss": 0.0171, + "step": 61340 + }, + { + "epoch": 0.4534904349368735, + "grad_norm": 0.07293741405010223, + "learning_rate": 2.742684591642925e-05, + "loss": 0.02, + "step": 61350 + }, + { + "epoch": 0.45356435350817537, + "grad_norm": 0.0856717899441719, + "learning_rate": 2.7423136277302948e-05, + "loss": 0.0193, + "step": 61360 + }, + { + "epoch": 0.45363827207947727, + "grad_norm": 0.0812024474143982, + "learning_rate": 2.7419426638176637e-05, + "loss": 0.021, + "step": 61370 + }, + { + "epoch": 0.4537121906507791, + "grad_norm": 0.0680481418967247, + "learning_rate": 2.7415716999050333e-05, + "loss": 0.0187, + "step": 61380 + }, + { + "epoch": 0.45378610922208096, + "grad_norm": 0.09765014797449112, + "learning_rate": 2.741200735992403e-05, + "loss": 0.015, + "step": 61390 + }, + { + "epoch": 0.4538600277933828, + "grad_norm": 0.07712357491254807, + "learning_rate": 2.740829772079772e-05, + "loss": 0.0179, + "step": 61400 + }, + { + "epoch": 0.45393394636468465, + "grad_norm": 0.10133178532123566, + "learning_rate": 2.7404588081671417e-05, + "loss": 0.02, + "step": 61410 + }, + { + "epoch": 0.4540078649359865, + "grad_norm": 0.09235849231481552, + "learning_rate": 2.740087844254511e-05, + "loss": 0.0188, + "step": 61420 + }, + { + "epoch": 0.4540817835072884, + "grad_norm": 0.11862324178218842, + "learning_rate": 2.7397168803418806e-05, + "loss": 0.0204, + "step": 61430 + }, + { + "epoch": 0.45415570207859024, + "grad_norm": 0.06792068481445312, + "learning_rate": 2.7393459164292502e-05, + "loss": 0.017, + "step": 61440 + }, + { + "epoch": 0.4542296206498921, + "grad_norm": 0.08101335167884827, + "learning_rate": 2.738974952516619e-05, + "loss": 0.015, + "step": 61450 + }, + { + "epoch": 0.45430353922119393, + "grad_norm": 0.09211039543151855, + "learning_rate": 2.7386039886039887e-05, + "loss": 0.0207, + "step": 61460 + }, + { + "epoch": 0.4543774577924958, + "grad_norm": 0.11689606308937073, + "learning_rate": 2.738233024691358e-05, + "loss": 0.02, + "step": 61470 + }, + { + "epoch": 0.4544513763637976, + "grad_norm": 0.09053738415241241, + "learning_rate": 2.7378620607787275e-05, + "loss": 0.0166, + "step": 61480 + }, + { + "epoch": 0.4545252949350995, + "grad_norm": 0.09697338938713074, + "learning_rate": 2.737491096866097e-05, + "loss": 0.0202, + "step": 61490 + }, + { + "epoch": 0.45459921350640137, + "grad_norm": 0.06055251881480217, + "learning_rate": 2.7371201329534664e-05, + "loss": 0.0178, + "step": 61500 + }, + { + "epoch": 0.4546731320777032, + "grad_norm": 0.06681554764509201, + "learning_rate": 2.736749169040836e-05, + "loss": 0.0193, + "step": 61510 + }, + { + "epoch": 0.45474705064900506, + "grad_norm": 0.08950188755989075, + "learning_rate": 2.7363782051282056e-05, + "loss": 0.0187, + "step": 61520 + }, + { + "epoch": 0.4548209692203069, + "grad_norm": 0.0626884400844574, + "learning_rate": 2.7360072412155745e-05, + "loss": 0.021, + "step": 61530 + }, + { + "epoch": 0.45489488779160875, + "grad_norm": 0.09036307781934738, + 
"learning_rate": 2.735636277302944e-05, + "loss": 0.0216, + "step": 61540 + }, + { + "epoch": 0.4549688063629106, + "grad_norm": 0.08309593796730042, + "learning_rate": 2.7352653133903134e-05, + "loss": 0.0161, + "step": 61550 + }, + { + "epoch": 0.4550427249342125, + "grad_norm": 0.07348351180553436, + "learning_rate": 2.734894349477683e-05, + "loss": 0.0209, + "step": 61560 + }, + { + "epoch": 0.45511664350551434, + "grad_norm": 0.0639307051897049, + "learning_rate": 2.7345233855650526e-05, + "loss": 0.0164, + "step": 61570 + }, + { + "epoch": 0.4551905620768162, + "grad_norm": 0.07979018986225128, + "learning_rate": 2.7341524216524218e-05, + "loss": 0.021, + "step": 61580 + }, + { + "epoch": 0.45526448064811803, + "grad_norm": 0.08083287626504898, + "learning_rate": 2.7337814577397914e-05, + "loss": 0.016, + "step": 61590 + }, + { + "epoch": 0.4553383992194199, + "grad_norm": 0.07112161815166473, + "learning_rate": 2.7334104938271603e-05, + "loss": 0.018, + "step": 61600 + }, + { + "epoch": 0.4554123177907217, + "grad_norm": 0.08322005718946457, + "learning_rate": 2.73303952991453e-05, + "loss": 0.0198, + "step": 61610 + }, + { + "epoch": 0.4554862363620236, + "grad_norm": 0.08761461824178696, + "learning_rate": 2.7326685660018995e-05, + "loss": 0.0187, + "step": 61620 + }, + { + "epoch": 0.45556015493332547, + "grad_norm": 0.11122740060091019, + "learning_rate": 2.7322976020892688e-05, + "loss": 0.0196, + "step": 61630 + }, + { + "epoch": 0.4556340735046273, + "grad_norm": 0.10976991802453995, + "learning_rate": 2.7319266381766384e-05, + "loss": 0.0182, + "step": 61640 + }, + { + "epoch": 0.45570799207592916, + "grad_norm": 0.0777834877371788, + "learning_rate": 2.7315556742640076e-05, + "loss": 0.0195, + "step": 61650 + }, + { + "epoch": 0.455781910647231, + "grad_norm": 0.07317525148391724, + "learning_rate": 2.7311847103513772e-05, + "loss": 0.0203, + "step": 61660 + }, + { + "epoch": 0.45585582921853285, + "grad_norm": 0.06810017675161362, + "learning_rate": 2.7308137464387468e-05, + "loss": 0.0192, + "step": 61670 + }, + { + "epoch": 0.4559297477898347, + "grad_norm": 0.08570457994937897, + "learning_rate": 2.7304427825261157e-05, + "loss": 0.0175, + "step": 61680 + }, + { + "epoch": 0.4560036663611366, + "grad_norm": 0.08259614557027817, + "learning_rate": 2.7300718186134853e-05, + "loss": 0.0166, + "step": 61690 + }, + { + "epoch": 0.45607758493243844, + "grad_norm": 0.22294946014881134, + "learning_rate": 2.7297008547008546e-05, + "loss": 0.0199, + "step": 61700 + }, + { + "epoch": 0.4561515035037403, + "grad_norm": 0.0658465325832367, + "learning_rate": 2.7293298907882242e-05, + "loss": 0.0199, + "step": 61710 + }, + { + "epoch": 0.45622542207504213, + "grad_norm": 0.06378777325153351, + "learning_rate": 2.7289589268755938e-05, + "loss": 0.0183, + "step": 61720 + }, + { + "epoch": 0.456299340646344, + "grad_norm": 0.08054514229297638, + "learning_rate": 2.728587962962963e-05, + "loss": 0.0171, + "step": 61730 + }, + { + "epoch": 0.4563732592176458, + "grad_norm": 0.08308053761720657, + "learning_rate": 2.7282169990503327e-05, + "loss": 0.017, + "step": 61740 + }, + { + "epoch": 0.4564471777889477, + "grad_norm": 0.08155302703380585, + "learning_rate": 2.7278460351377022e-05, + "loss": 0.0184, + "step": 61750 + }, + { + "epoch": 0.45652109636024957, + "grad_norm": 0.10983440279960632, + "learning_rate": 2.727475071225071e-05, + "loss": 0.0193, + "step": 61760 + }, + { + "epoch": 0.4565950149315514, + "grad_norm": 0.08785505592823029, + "learning_rate": 2.7271041073124408e-05, + 
"loss": 0.0184, + "step": 61770 + }, + { + "epoch": 0.45666893350285326, + "grad_norm": 0.07460859417915344, + "learning_rate": 2.72673314339981e-05, + "loss": 0.0166, + "step": 61780 + }, + { + "epoch": 0.4567428520741551, + "grad_norm": 0.07602991908788681, + "learning_rate": 2.7263621794871796e-05, + "loss": 0.0188, + "step": 61790 + }, + { + "epoch": 0.45681677064545695, + "grad_norm": 0.09512347728013992, + "learning_rate": 2.7259912155745492e-05, + "loss": 0.0202, + "step": 61800 + }, + { + "epoch": 0.4568906892167588, + "grad_norm": 0.06678026914596558, + "learning_rate": 2.7256202516619185e-05, + "loss": 0.0179, + "step": 61810 + }, + { + "epoch": 0.4569646077880607, + "grad_norm": 0.06316298246383667, + "learning_rate": 2.725249287749288e-05, + "loss": 0.0154, + "step": 61820 + }, + { + "epoch": 0.45703852635936254, + "grad_norm": 0.0684802457690239, + "learning_rate": 2.724878323836657e-05, + "loss": 0.0175, + "step": 61830 + }, + { + "epoch": 0.4571124449306644, + "grad_norm": 0.07379800081253052, + "learning_rate": 2.7245073599240266e-05, + "loss": 0.0202, + "step": 61840 + }, + { + "epoch": 0.45718636350196623, + "grad_norm": 0.09847801923751831, + "learning_rate": 2.7241363960113965e-05, + "loss": 0.0216, + "step": 61850 + }, + { + "epoch": 0.4572602820732681, + "grad_norm": 0.06762059777975082, + "learning_rate": 2.7237654320987654e-05, + "loss": 0.0169, + "step": 61860 + }, + { + "epoch": 0.4573342006445699, + "grad_norm": 0.08554498851299286, + "learning_rate": 2.723394468186135e-05, + "loss": 0.0154, + "step": 61870 + }, + { + "epoch": 0.4574081192158718, + "grad_norm": 0.06983321160078049, + "learning_rate": 2.7230235042735043e-05, + "loss": 0.0185, + "step": 61880 + }, + { + "epoch": 0.45748203778717367, + "grad_norm": 0.06776176393032074, + "learning_rate": 2.722652540360874e-05, + "loss": 0.0164, + "step": 61890 + }, + { + "epoch": 0.4575559563584755, + "grad_norm": 0.08174600452184677, + "learning_rate": 2.7222815764482435e-05, + "loss": 0.0175, + "step": 61900 + }, + { + "epoch": 0.45762987492977736, + "grad_norm": 0.08814748376607895, + "learning_rate": 2.7219106125356124e-05, + "loss": 0.0182, + "step": 61910 + }, + { + "epoch": 0.4577037935010792, + "grad_norm": 0.08945896476507187, + "learning_rate": 2.721539648622982e-05, + "loss": 0.0186, + "step": 61920 + }, + { + "epoch": 0.45777771207238105, + "grad_norm": 0.06929168850183487, + "learning_rate": 2.7211686847103513e-05, + "loss": 0.017, + "step": 61930 + }, + { + "epoch": 0.4578516306436829, + "grad_norm": 0.08452693372964859, + "learning_rate": 2.720797720797721e-05, + "loss": 0.0187, + "step": 61940 + }, + { + "epoch": 0.4579255492149848, + "grad_norm": 0.07486118376255035, + "learning_rate": 2.7204267568850905e-05, + "loss": 0.019, + "step": 61950 + }, + { + "epoch": 0.45799946778628664, + "grad_norm": 0.10226127505302429, + "learning_rate": 2.7200557929724597e-05, + "loss": 0.0182, + "step": 61960 + }, + { + "epoch": 0.4580733863575885, + "grad_norm": 0.07045385986566544, + "learning_rate": 2.7196848290598293e-05, + "loss": 0.0201, + "step": 61970 + }, + { + "epoch": 0.45814730492889033, + "grad_norm": 0.08606812357902527, + "learning_rate": 2.719313865147199e-05, + "loss": 0.018, + "step": 61980 + }, + { + "epoch": 0.4582212235001922, + "grad_norm": 0.05335699021816254, + "learning_rate": 2.7189429012345678e-05, + "loss": 0.0196, + "step": 61990 + }, + { + "epoch": 0.458295142071494, + "grad_norm": 0.09203033149242401, + "learning_rate": 2.7185719373219378e-05, + "loss": 0.0163, + "step": 62000 + }, + { 
+ "epoch": 0.4583690606427959, + "grad_norm": 0.08226708322763443, + "learning_rate": 2.7182009734093067e-05, + "loss": 0.019, + "step": 62010 + }, + { + "epoch": 0.45844297921409777, + "grad_norm": 0.08296380192041397, + "learning_rate": 2.7178300094966763e-05, + "loss": 0.0196, + "step": 62020 + }, + { + "epoch": 0.4585168977853996, + "grad_norm": 0.07300078123807907, + "learning_rate": 2.717459045584046e-05, + "loss": 0.0208, + "step": 62030 + }, + { + "epoch": 0.45859081635670146, + "grad_norm": 0.0772596001625061, + "learning_rate": 2.717088081671415e-05, + "loss": 0.0205, + "step": 62040 + }, + { + "epoch": 0.4586647349280033, + "grad_norm": 0.0756126269698143, + "learning_rate": 2.7167171177587847e-05, + "loss": 0.0166, + "step": 62050 + }, + { + "epoch": 0.45873865349930515, + "grad_norm": 0.0758863240480423, + "learning_rate": 2.7163461538461536e-05, + "loss": 0.017, + "step": 62060 + }, + { + "epoch": 0.458812572070607, + "grad_norm": 0.05326606705784798, + "learning_rate": 2.7159751899335232e-05, + "loss": 0.0176, + "step": 62070 + }, + { + "epoch": 0.4588864906419089, + "grad_norm": 0.09534876048564911, + "learning_rate": 2.7156042260208932e-05, + "loss": 0.0181, + "step": 62080 + }, + { + "epoch": 0.45896040921321074, + "grad_norm": 0.09859345853328705, + "learning_rate": 2.715233262108262e-05, + "loss": 0.0182, + "step": 62090 + }, + { + "epoch": 0.4590343277845126, + "grad_norm": 0.07618529349565506, + "learning_rate": 2.7148622981956317e-05, + "loss": 0.0172, + "step": 62100 + }, + { + "epoch": 0.45910824635581443, + "grad_norm": 0.08859498053789139, + "learning_rate": 2.714491334283001e-05, + "loss": 0.0163, + "step": 62110 + }, + { + "epoch": 0.4591821649271163, + "grad_norm": 0.07485493272542953, + "learning_rate": 2.7141203703703705e-05, + "loss": 0.0185, + "step": 62120 + }, + { + "epoch": 0.4592560834984181, + "grad_norm": 0.08691943436861038, + "learning_rate": 2.71374940645774e-05, + "loss": 0.0177, + "step": 62130 + }, + { + "epoch": 0.45933000206972, + "grad_norm": 0.10383077710866928, + "learning_rate": 2.713378442545109e-05, + "loss": 0.0177, + "step": 62140 + }, + { + "epoch": 0.45940392064102187, + "grad_norm": 0.05785920470952988, + "learning_rate": 2.713007478632479e-05, + "loss": 0.0184, + "step": 62150 + }, + { + "epoch": 0.4594778392123237, + "grad_norm": 0.0689610168337822, + "learning_rate": 2.712636514719848e-05, + "loss": 0.019, + "step": 62160 + }, + { + "epoch": 0.45955175778362556, + "grad_norm": 0.10667440295219421, + "learning_rate": 2.7122655508072175e-05, + "loss": 0.0171, + "step": 62170 + }, + { + "epoch": 0.4596256763549274, + "grad_norm": 0.08950226753950119, + "learning_rate": 2.711894586894587e-05, + "loss": 0.0173, + "step": 62180 + }, + { + "epoch": 0.45969959492622925, + "grad_norm": 0.0803423747420311, + "learning_rate": 2.7115236229819564e-05, + "loss": 0.0182, + "step": 62190 + }, + { + "epoch": 0.4597735134975311, + "grad_norm": 0.07441220432519913, + "learning_rate": 2.711152659069326e-05, + "loss": 0.0176, + "step": 62200 + }, + { + "epoch": 0.459847432068833, + "grad_norm": 0.06424054503440857, + "learning_rate": 2.7107816951566956e-05, + "loss": 0.0168, + "step": 62210 + }, + { + "epoch": 0.45992135064013484, + "grad_norm": 0.08061058074235916, + "learning_rate": 2.7104107312440645e-05, + "loss": 0.0153, + "step": 62220 + }, + { + "epoch": 0.4599952692114367, + "grad_norm": 0.06604152172803879, + "learning_rate": 2.7100397673314344e-05, + "loss": 0.0164, + "step": 62230 + }, + { + "epoch": 0.46006918778273853, + "grad_norm": 
0.08139542490243912, + "learning_rate": 2.7096688034188033e-05, + "loss": 0.0186, + "step": 62240 + }, + { + "epoch": 0.4601431063540404, + "grad_norm": 0.04587428644299507, + "learning_rate": 2.709297839506173e-05, + "loss": 0.0157, + "step": 62250 + }, + { + "epoch": 0.4602170249253422, + "grad_norm": 0.10360642522573471, + "learning_rate": 2.7089268755935425e-05, + "loss": 0.0163, + "step": 62260 + }, + { + "epoch": 0.4602909434966441, + "grad_norm": 0.09443079680204391, + "learning_rate": 2.7085559116809118e-05, + "loss": 0.0169, + "step": 62270 + }, + { + "epoch": 0.46036486206794597, + "grad_norm": 0.06919102370738983, + "learning_rate": 2.7081849477682814e-05, + "loss": 0.0189, + "step": 62280 + }, + { + "epoch": 0.4604387806392478, + "grad_norm": 0.09010902047157288, + "learning_rate": 2.7078139838556503e-05, + "loss": 0.0176, + "step": 62290 + }, + { + "epoch": 0.46051269921054966, + "grad_norm": 0.06500197947025299, + "learning_rate": 2.7074430199430202e-05, + "loss": 0.017, + "step": 62300 + }, + { + "epoch": 0.4605866177818515, + "grad_norm": 0.08220690488815308, + "learning_rate": 2.70707205603039e-05, + "loss": 0.0191, + "step": 62310 + }, + { + "epoch": 0.46066053635315335, + "grad_norm": 0.08635219186544418, + "learning_rate": 2.7067010921177588e-05, + "loss": 0.018, + "step": 62320 + }, + { + "epoch": 0.4607344549244552, + "grad_norm": 0.0828336849808693, + "learning_rate": 2.7063301282051284e-05, + "loss": 0.0203, + "step": 62330 + }, + { + "epoch": 0.4608083734957571, + "grad_norm": 0.07880325615406036, + "learning_rate": 2.7059591642924976e-05, + "loss": 0.0177, + "step": 62340 + }, + { + "epoch": 0.46088229206705894, + "grad_norm": 0.0864029973745346, + "learning_rate": 2.7055882003798672e-05, + "loss": 0.0177, + "step": 62350 + }, + { + "epoch": 0.4609562106383608, + "grad_norm": 0.07870983332395554, + "learning_rate": 2.7052172364672368e-05, + "loss": 0.0158, + "step": 62360 + }, + { + "epoch": 0.46103012920966263, + "grad_norm": 0.07564816623926163, + "learning_rate": 2.7048462725546057e-05, + "loss": 0.0186, + "step": 62370 + }, + { + "epoch": 0.4611040477809645, + "grad_norm": 0.0991857647895813, + "learning_rate": 2.7044753086419757e-05, + "loss": 0.0188, + "step": 62380 + }, + { + "epoch": 0.4611779663522663, + "grad_norm": 0.06728631258010864, + "learning_rate": 2.7041043447293446e-05, + "loss": 0.0188, + "step": 62390 + }, + { + "epoch": 0.4612518849235682, + "grad_norm": 0.07630317658185959, + "learning_rate": 2.7037333808167142e-05, + "loss": 0.0188, + "step": 62400 + }, + { + "epoch": 0.46132580349487007, + "grad_norm": 0.08242637664079666, + "learning_rate": 2.7033624169040838e-05, + "loss": 0.0157, + "step": 62410 + }, + { + "epoch": 0.4613997220661719, + "grad_norm": 0.06703434139490128, + "learning_rate": 2.702991452991453e-05, + "loss": 0.0191, + "step": 62420 + }, + { + "epoch": 0.46147364063747376, + "grad_norm": 0.08433108776807785, + "learning_rate": 2.7026204890788226e-05, + "loss": 0.0179, + "step": 62430 + }, + { + "epoch": 0.4615475592087756, + "grad_norm": 0.09839016944169998, + "learning_rate": 2.7022495251661922e-05, + "loss": 0.018, + "step": 62440 + }, + { + "epoch": 0.46162147778007745, + "grad_norm": 0.06323008984327316, + "learning_rate": 2.7018785612535615e-05, + "loss": 0.0177, + "step": 62450 + }, + { + "epoch": 0.4616953963513793, + "grad_norm": 0.09598717838525772, + "learning_rate": 2.701507597340931e-05, + "loss": 0.0196, + "step": 62460 + }, + { + "epoch": 0.4617693149226812, + "grad_norm": 0.0499408058822155, + "learning_rate": 
2.7011366334283e-05, + "loss": 0.016, + "step": 62470 + }, + { + "epoch": 0.46184323349398304, + "grad_norm": 0.06782973557710648, + "learning_rate": 2.7007656695156696e-05, + "loss": 0.0176, + "step": 62480 + }, + { + "epoch": 0.4619171520652849, + "grad_norm": 0.06388473510742188, + "learning_rate": 2.7003947056030392e-05, + "loss": 0.0165, + "step": 62490 + }, + { + "epoch": 0.46199107063658673, + "grad_norm": 0.08375228941440582, + "learning_rate": 2.7000237416904084e-05, + "loss": 0.0161, + "step": 62500 + }, + { + "epoch": 0.4620649892078886, + "grad_norm": 0.09047287702560425, + "learning_rate": 2.699652777777778e-05, + "loss": 0.0216, + "step": 62510 + }, + { + "epoch": 0.4621389077791904, + "grad_norm": 0.08288053423166275, + "learning_rate": 2.699281813865147e-05, + "loss": 0.0181, + "step": 62520 + }, + { + "epoch": 0.4622128263504923, + "grad_norm": 0.08707530796527863, + "learning_rate": 2.698910849952517e-05, + "loss": 0.018, + "step": 62530 + }, + { + "epoch": 0.46228674492179417, + "grad_norm": 0.06646700948476791, + "learning_rate": 2.6985398860398865e-05, + "loss": 0.019, + "step": 62540 + }, + { + "epoch": 0.462360663493096, + "grad_norm": 0.08140867203474045, + "learning_rate": 2.6981689221272554e-05, + "loss": 0.0171, + "step": 62550 + }, + { + "epoch": 0.46243458206439786, + "grad_norm": 0.0692920982837677, + "learning_rate": 2.697797958214625e-05, + "loss": 0.0179, + "step": 62560 + }, + { + "epoch": 0.4625085006356997, + "grad_norm": 0.07965762168169022, + "learning_rate": 2.6974269943019943e-05, + "loss": 0.018, + "step": 62570 + }, + { + "epoch": 0.46258241920700155, + "grad_norm": 0.07014095783233643, + "learning_rate": 2.697056030389364e-05, + "loss": 0.0174, + "step": 62580 + }, + { + "epoch": 0.4626563377783034, + "grad_norm": 0.07212305814027786, + "learning_rate": 2.6966850664767335e-05, + "loss": 0.0203, + "step": 62590 + }, + { + "epoch": 0.4627302563496053, + "grad_norm": 0.10186107456684113, + "learning_rate": 2.6963141025641024e-05, + "loss": 0.0193, + "step": 62600 + }, + { + "epoch": 0.46280417492090714, + "grad_norm": 0.07537295669317245, + "learning_rate": 2.6959431386514723e-05, + "loss": 0.0173, + "step": 62610 + }, + { + "epoch": 0.462878093492209, + "grad_norm": 0.08256471157073975, + "learning_rate": 2.6955721747388412e-05, + "loss": 0.0193, + "step": 62620 + }, + { + "epoch": 0.46295201206351083, + "grad_norm": 0.08015892654657364, + "learning_rate": 2.695201210826211e-05, + "loss": 0.0185, + "step": 62630 + }, + { + "epoch": 0.4630259306348127, + "grad_norm": 0.0764988586306572, + "learning_rate": 2.6948302469135804e-05, + "loss": 0.0184, + "step": 62640 + }, + { + "epoch": 0.4630998492061145, + "grad_norm": 0.07432615756988525, + "learning_rate": 2.6944592830009497e-05, + "loss": 0.0194, + "step": 62650 + }, + { + "epoch": 0.4631737677774164, + "grad_norm": 0.0674583911895752, + "learning_rate": 2.6940883190883193e-05, + "loss": 0.0194, + "step": 62660 + }, + { + "epoch": 0.46324768634871827, + "grad_norm": 0.07448495924472809, + "learning_rate": 2.693717355175689e-05, + "loss": 0.0195, + "step": 62670 + }, + { + "epoch": 0.4633216049200201, + "grad_norm": 0.07558556646108627, + "learning_rate": 2.693346391263058e-05, + "loss": 0.0208, + "step": 62680 + }, + { + "epoch": 0.46339552349132196, + "grad_norm": 0.07623834908008575, + "learning_rate": 2.6929754273504277e-05, + "loss": 0.0209, + "step": 62690 + }, + { + "epoch": 0.4634694420626238, + "grad_norm": 0.07406779378652573, + "learning_rate": 2.6926044634377967e-05, + "loss": 0.0182, + 
"step": 62700 + }, + { + "epoch": 0.46354336063392565, + "grad_norm": 0.07742155343294144, + "learning_rate": 2.6922334995251663e-05, + "loss": 0.019, + "step": 62710 + }, + { + "epoch": 0.4636172792052275, + "grad_norm": 0.07569239288568497, + "learning_rate": 2.691862535612536e-05, + "loss": 0.0173, + "step": 62720 + }, + { + "epoch": 0.4636911977765294, + "grad_norm": 0.07936278730630875, + "learning_rate": 2.691491571699905e-05, + "loss": 0.0164, + "step": 62730 + }, + { + "epoch": 0.46376511634783124, + "grad_norm": 0.08103445172309875, + "learning_rate": 2.6911206077872747e-05, + "loss": 0.0168, + "step": 62740 + }, + { + "epoch": 0.4638390349191331, + "grad_norm": 0.0931391492486, + "learning_rate": 2.6907496438746436e-05, + "loss": 0.019, + "step": 62750 + }, + { + "epoch": 0.46391295349043493, + "grad_norm": 0.07881567627191544, + "learning_rate": 2.6903786799620136e-05, + "loss": 0.0194, + "step": 62760 + }, + { + "epoch": 0.4639868720617368, + "grad_norm": 0.08158383518457413, + "learning_rate": 2.690007716049383e-05, + "loss": 0.0186, + "step": 62770 + }, + { + "epoch": 0.4640607906330386, + "grad_norm": 0.07594239711761475, + "learning_rate": 2.689636752136752e-05, + "loss": 0.0181, + "step": 62780 + }, + { + "epoch": 0.4641347092043405, + "grad_norm": 0.07770699262619019, + "learning_rate": 2.6892657882241217e-05, + "loss": 0.0187, + "step": 62790 + }, + { + "epoch": 0.46420862777564237, + "grad_norm": 0.07621417939662933, + "learning_rate": 2.688894824311491e-05, + "loss": 0.0191, + "step": 62800 + }, + { + "epoch": 0.4642825463469442, + "grad_norm": 0.06834164261817932, + "learning_rate": 2.6885238603988605e-05, + "loss": 0.0182, + "step": 62810 + }, + { + "epoch": 0.46435646491824606, + "grad_norm": 0.16739051043987274, + "learning_rate": 2.68815289648623e-05, + "loss": 0.0187, + "step": 62820 + }, + { + "epoch": 0.4644303834895479, + "grad_norm": 0.09170664846897125, + "learning_rate": 2.6877819325735994e-05, + "loss": 0.0219, + "step": 62830 + }, + { + "epoch": 0.46450430206084975, + "grad_norm": 0.09461186826229095, + "learning_rate": 2.687410968660969e-05, + "loss": 0.0176, + "step": 62840 + }, + { + "epoch": 0.4645782206321516, + "grad_norm": 0.06674101203680038, + "learning_rate": 2.687040004748338e-05, + "loss": 0.0186, + "step": 62850 + }, + { + "epoch": 0.4646521392034535, + "grad_norm": 0.058075059205293655, + "learning_rate": 2.6866690408357075e-05, + "loss": 0.0176, + "step": 62860 + }, + { + "epoch": 0.46472605777475534, + "grad_norm": 0.07084377855062485, + "learning_rate": 2.686298076923077e-05, + "loss": 0.019, + "step": 62870 + }, + { + "epoch": 0.4647999763460572, + "grad_norm": 0.11315616220235825, + "learning_rate": 2.6859271130104463e-05, + "loss": 0.0186, + "step": 62880 + }, + { + "epoch": 0.46487389491735903, + "grad_norm": 0.09645523875951767, + "learning_rate": 2.685556149097816e-05, + "loss": 0.021, + "step": 62890 + }, + { + "epoch": 0.4649478134886609, + "grad_norm": 0.07578028738498688, + "learning_rate": 2.6851851851851855e-05, + "loss": 0.0175, + "step": 62900 + }, + { + "epoch": 0.4650217320599627, + "grad_norm": 0.09306954592466354, + "learning_rate": 2.6848142212725548e-05, + "loss": 0.018, + "step": 62910 + }, + { + "epoch": 0.4650956506312646, + "grad_norm": 0.06260194629430771, + "learning_rate": 2.6844432573599244e-05, + "loss": 0.0188, + "step": 62920 + }, + { + "epoch": 0.46516956920256647, + "grad_norm": 0.08187243342399597, + "learning_rate": 2.6840722934472933e-05, + "loss": 0.0174, + "step": 62930 + }, + { + "epoch": 
0.4652434877738683, + "grad_norm": 0.10176604241132736, + "learning_rate": 2.683701329534663e-05, + "loss": 0.0168, + "step": 62940 + }, + { + "epoch": 0.46531740634517016, + "grad_norm": 0.10157504677772522, + "learning_rate": 2.6833303656220325e-05, + "loss": 0.0174, + "step": 62950 + }, + { + "epoch": 0.465391324916472, + "grad_norm": 0.09087589383125305, + "learning_rate": 2.6829594017094018e-05, + "loss": 0.0179, + "step": 62960 + }, + { + "epoch": 0.46546524348777385, + "grad_norm": 0.07912372052669525, + "learning_rate": 2.6825884377967714e-05, + "loss": 0.0164, + "step": 62970 + }, + { + "epoch": 0.4655391620590757, + "grad_norm": 0.09425844252109528, + "learning_rate": 2.6822174738841406e-05, + "loss": 0.0203, + "step": 62980 + }, + { + "epoch": 0.4656130806303776, + "grad_norm": 0.08376140147447586, + "learning_rate": 2.6818465099715102e-05, + "loss": 0.0182, + "step": 62990 + }, + { + "epoch": 0.46568699920167944, + "grad_norm": 0.0748540386557579, + "learning_rate": 2.6814755460588798e-05, + "loss": 0.0187, + "step": 63000 + }, + { + "epoch": 0.4657609177729813, + "grad_norm": 0.09936424344778061, + "learning_rate": 2.6811045821462487e-05, + "loss": 0.0183, + "step": 63010 + }, + { + "epoch": 0.46583483634428313, + "grad_norm": 0.0781758725643158, + "learning_rate": 2.6807336182336183e-05, + "loss": 0.0171, + "step": 63020 + }, + { + "epoch": 0.465908754915585, + "grad_norm": 0.10308913141489029, + "learning_rate": 2.6803626543209876e-05, + "loss": 0.0205, + "step": 63030 + }, + { + "epoch": 0.4659826734868868, + "grad_norm": 0.0855834037065506, + "learning_rate": 2.6799916904083572e-05, + "loss": 0.0196, + "step": 63040 + }, + { + "epoch": 0.4660565920581887, + "grad_norm": 0.06840619444847107, + "learning_rate": 2.6796207264957268e-05, + "loss": 0.0175, + "step": 63050 + }, + { + "epoch": 0.46613051062949057, + "grad_norm": 0.0917033851146698, + "learning_rate": 2.679249762583096e-05, + "loss": 0.0176, + "step": 63060 + }, + { + "epoch": 0.4662044292007924, + "grad_norm": 0.07346449047327042, + "learning_rate": 2.6788787986704656e-05, + "loss": 0.0178, + "step": 63070 + }, + { + "epoch": 0.46627834777209426, + "grad_norm": 0.06600905954837799, + "learning_rate": 2.6785078347578346e-05, + "loss": 0.0178, + "step": 63080 + }, + { + "epoch": 0.4663522663433961, + "grad_norm": 0.08728162944316864, + "learning_rate": 2.678136870845204e-05, + "loss": 0.0185, + "step": 63090 + }, + { + "epoch": 0.46642618491469795, + "grad_norm": 0.08154621720314026, + "learning_rate": 2.6777659069325737e-05, + "loss": 0.0183, + "step": 63100 + }, + { + "epoch": 0.4665001034859998, + "grad_norm": 0.06465992331504822, + "learning_rate": 2.677394943019943e-05, + "loss": 0.0185, + "step": 63110 + }, + { + "epoch": 0.4665740220573017, + "grad_norm": 0.07442975044250488, + "learning_rate": 2.6770239791073126e-05, + "loss": 0.0179, + "step": 63120 + }, + { + "epoch": 0.46664794062860354, + "grad_norm": 0.07067244499921799, + "learning_rate": 2.6766530151946822e-05, + "loss": 0.0165, + "step": 63130 + }, + { + "epoch": 0.4667218591999054, + "grad_norm": 0.09633144736289978, + "learning_rate": 2.6762820512820515e-05, + "loss": 0.0173, + "step": 63140 + }, + { + "epoch": 0.46679577777120723, + "grad_norm": 0.08338624984025955, + "learning_rate": 2.675911087369421e-05, + "loss": 0.0185, + "step": 63150 + }, + { + "epoch": 0.4668696963425091, + "grad_norm": 0.07794088125228882, + "learning_rate": 2.67554012345679e-05, + "loss": 0.017, + "step": 63160 + }, + { + "epoch": 0.4669436149138109, + "grad_norm": 
0.08031114935874939, + "learning_rate": 2.6751691595441596e-05, + "loss": 0.0172, + "step": 63170 + }, + { + "epoch": 0.4670175334851128, + "grad_norm": 0.07404765486717224, + "learning_rate": 2.6747981956315295e-05, + "loss": 0.0185, + "step": 63180 + }, + { + "epoch": 0.46709145205641467, + "grad_norm": 0.06223677843809128, + "learning_rate": 2.6744272317188984e-05, + "loss": 0.0161, + "step": 63190 + }, + { + "epoch": 0.4671653706277165, + "grad_norm": 0.07571319490671158, + "learning_rate": 2.674056267806268e-05, + "loss": 0.0185, + "step": 63200 + }, + { + "epoch": 0.46723928919901836, + "grad_norm": 0.10422144085168839, + "learning_rate": 2.6736853038936373e-05, + "loss": 0.0171, + "step": 63210 + }, + { + "epoch": 0.4673132077703202, + "grad_norm": 0.08425986766815186, + "learning_rate": 2.673314339981007e-05, + "loss": 0.0172, + "step": 63220 + }, + { + "epoch": 0.46738712634162205, + "grad_norm": 0.08318064361810684, + "learning_rate": 2.6729433760683765e-05, + "loss": 0.0193, + "step": 63230 + }, + { + "epoch": 0.4674610449129239, + "grad_norm": 0.07843171060085297, + "learning_rate": 2.6725724121557454e-05, + "loss": 0.0177, + "step": 63240 + }, + { + "epoch": 0.4675349634842258, + "grad_norm": 0.08089744299650192, + "learning_rate": 2.672201448243115e-05, + "loss": 0.0176, + "step": 63250 + }, + { + "epoch": 0.46760888205552764, + "grad_norm": 0.05883701890707016, + "learning_rate": 2.6718304843304842e-05, + "loss": 0.0149, + "step": 63260 + }, + { + "epoch": 0.4676828006268295, + "grad_norm": 0.09115418046712875, + "learning_rate": 2.671459520417854e-05, + "loss": 0.0153, + "step": 63270 + }, + { + "epoch": 0.46775671919813133, + "grad_norm": 0.08480945974588394, + "learning_rate": 2.6710885565052234e-05, + "loss": 0.0188, + "step": 63280 + }, + { + "epoch": 0.4678306377694332, + "grad_norm": 0.08573106676340103, + "learning_rate": 2.6707175925925927e-05, + "loss": 0.0178, + "step": 63290 + }, + { + "epoch": 0.467904556340735, + "grad_norm": 0.07894410192966461, + "learning_rate": 2.6703466286799623e-05, + "loss": 0.0188, + "step": 63300 + }, + { + "epoch": 0.4679784749120369, + "grad_norm": 0.07888448983430862, + "learning_rate": 2.6699756647673312e-05, + "loss": 0.0183, + "step": 63310 + }, + { + "epoch": 0.46805239348333877, + "grad_norm": 0.08809691667556763, + "learning_rate": 2.6696047008547008e-05, + "loss": 0.0157, + "step": 63320 + }, + { + "epoch": 0.4681263120546406, + "grad_norm": 0.06534068286418915, + "learning_rate": 2.6692337369420704e-05, + "loss": 0.0186, + "step": 63330 + }, + { + "epoch": 0.46820023062594246, + "grad_norm": 0.11392059177160263, + "learning_rate": 2.6688627730294397e-05, + "loss": 0.0192, + "step": 63340 + }, + { + "epoch": 0.4682741491972443, + "grad_norm": 0.09349965304136276, + "learning_rate": 2.6684918091168093e-05, + "loss": 0.0158, + "step": 63350 + }, + { + "epoch": 0.46834806776854615, + "grad_norm": 0.0638246163725853, + "learning_rate": 2.668120845204179e-05, + "loss": 0.0188, + "step": 63360 + }, + { + "epoch": 0.46842198633984805, + "grad_norm": 0.06115562841296196, + "learning_rate": 2.667749881291548e-05, + "loss": 0.0187, + "step": 63370 + }, + { + "epoch": 0.4684959049111499, + "grad_norm": 0.0755188837647438, + "learning_rate": 2.6673789173789177e-05, + "loss": 0.0184, + "step": 63380 + }, + { + "epoch": 0.46856982348245174, + "grad_norm": 0.051312826573848724, + "learning_rate": 2.6670079534662866e-05, + "loss": 0.0169, + "step": 63390 + }, + { + "epoch": 0.4686437420537536, + "grad_norm": 0.08142546564340591, + 
"learning_rate": 2.6666369895536562e-05, + "loss": 0.0193, + "step": 63400 + }, + { + "epoch": 0.46871766062505543, + "grad_norm": 0.07243930548429489, + "learning_rate": 2.666266025641026e-05, + "loss": 0.0169, + "step": 63410 + }, + { + "epoch": 0.4687915791963573, + "grad_norm": 0.09887338429689407, + "learning_rate": 2.665895061728395e-05, + "loss": 0.0185, + "step": 63420 + }, + { + "epoch": 0.4688654977676591, + "grad_norm": 0.08401845395565033, + "learning_rate": 2.6655240978157647e-05, + "loss": 0.0194, + "step": 63430 + }, + { + "epoch": 0.468939416338961, + "grad_norm": 0.049609072506427765, + "learning_rate": 2.665153133903134e-05, + "loss": 0.0178, + "step": 63440 + }, + { + "epoch": 0.46901333491026287, + "grad_norm": 0.06521576642990112, + "learning_rate": 2.6647821699905035e-05, + "loss": 0.0167, + "step": 63450 + }, + { + "epoch": 0.4690872534815647, + "grad_norm": 0.08452492952346802, + "learning_rate": 2.664411206077873e-05, + "loss": 0.0183, + "step": 63460 + }, + { + "epoch": 0.46916117205286656, + "grad_norm": 0.10923508554697037, + "learning_rate": 2.664040242165242e-05, + "loss": 0.0197, + "step": 63470 + }, + { + "epoch": 0.4692350906241684, + "grad_norm": 0.07494436204433441, + "learning_rate": 2.6636692782526116e-05, + "loss": 0.0167, + "step": 63480 + }, + { + "epoch": 0.46930900919547025, + "grad_norm": 0.09253532439470291, + "learning_rate": 2.663298314339981e-05, + "loss": 0.0204, + "step": 63490 + }, + { + "epoch": 0.46938292776677215, + "grad_norm": 0.07485422492027283, + "learning_rate": 2.6629273504273505e-05, + "loss": 0.0193, + "step": 63500 + }, + { + "epoch": 0.469456846338074, + "grad_norm": 0.07859625667333603, + "learning_rate": 2.66255638651472e-05, + "loss": 0.0181, + "step": 63510 + }, + { + "epoch": 0.46953076490937584, + "grad_norm": 0.06669975072145462, + "learning_rate": 2.6621854226020894e-05, + "loss": 0.0181, + "step": 63520 + }, + { + "epoch": 0.4696046834806777, + "grad_norm": 0.08856157213449478, + "learning_rate": 2.661814458689459e-05, + "loss": 0.017, + "step": 63530 + }, + { + "epoch": 0.46967860205197953, + "grad_norm": 0.07480046898126602, + "learning_rate": 2.661443494776828e-05, + "loss": 0.0184, + "step": 63540 + }, + { + "epoch": 0.4697525206232814, + "grad_norm": 0.08537238091230392, + "learning_rate": 2.6610725308641975e-05, + "loss": 0.0164, + "step": 63550 + }, + { + "epoch": 0.4698264391945832, + "grad_norm": 0.07791563868522644, + "learning_rate": 2.6607015669515674e-05, + "loss": 0.0169, + "step": 63560 + }, + { + "epoch": 0.4699003577658851, + "grad_norm": 0.07880368083715439, + "learning_rate": 2.6603306030389363e-05, + "loss": 0.0165, + "step": 63570 + }, + { + "epoch": 0.46997427633718697, + "grad_norm": 0.09982677549123764, + "learning_rate": 2.659959639126306e-05, + "loss": 0.0179, + "step": 63580 + }, + { + "epoch": 0.4700481949084888, + "grad_norm": 0.058600496500730515, + "learning_rate": 2.6595886752136755e-05, + "loss": 0.016, + "step": 63590 + }, + { + "epoch": 0.47012211347979066, + "grad_norm": 0.07470900565385818, + "learning_rate": 2.6592177113010448e-05, + "loss": 0.0177, + "step": 63600 + }, + { + "epoch": 0.4701960320510925, + "grad_norm": 0.08938571810722351, + "learning_rate": 2.6588467473884144e-05, + "loss": 0.0179, + "step": 63610 + }, + { + "epoch": 0.47026995062239435, + "grad_norm": 0.08260288089513779, + "learning_rate": 2.6584757834757833e-05, + "loss": 0.0191, + "step": 63620 + }, + { + "epoch": 0.47034386919369625, + "grad_norm": 0.09578394889831543, + "learning_rate": 
2.658104819563153e-05, + "loss": 0.0189, + "step": 63630 + }, + { + "epoch": 0.4704177877649981, + "grad_norm": 0.06751023232936859, + "learning_rate": 2.6577338556505228e-05, + "loss": 0.0165, + "step": 63640 + }, + { + "epoch": 0.47049170633629994, + "grad_norm": 0.07542560994625092, + "learning_rate": 2.6573628917378917e-05, + "loss": 0.0214, + "step": 63650 + }, + { + "epoch": 0.4705656249076018, + "grad_norm": 0.09560023248195648, + "learning_rate": 2.6569919278252613e-05, + "loss": 0.02, + "step": 63660 + }, + { + "epoch": 0.47063954347890363, + "grad_norm": 0.07671035826206207, + "learning_rate": 2.6566209639126306e-05, + "loss": 0.0159, + "step": 63670 + }, + { + "epoch": 0.4707134620502055, + "grad_norm": 0.07942305505275726, + "learning_rate": 2.6562500000000002e-05, + "loss": 0.016, + "step": 63680 + }, + { + "epoch": 0.4707873806215073, + "grad_norm": 0.09094572812318802, + "learning_rate": 2.6558790360873698e-05, + "loss": 0.0182, + "step": 63690 + }, + { + "epoch": 0.4708612991928092, + "grad_norm": 0.05231654644012451, + "learning_rate": 2.6555080721747387e-05, + "loss": 0.0175, + "step": 63700 + }, + { + "epoch": 0.47093521776411107, + "grad_norm": 0.056431982666254044, + "learning_rate": 2.6551371082621086e-05, + "loss": 0.0173, + "step": 63710 + }, + { + "epoch": 0.4710091363354129, + "grad_norm": 0.13144607841968536, + "learning_rate": 2.6547661443494776e-05, + "loss": 0.0214, + "step": 63720 + }, + { + "epoch": 0.47108305490671476, + "grad_norm": 0.08269277960062027, + "learning_rate": 2.654395180436847e-05, + "loss": 0.0193, + "step": 63730 + }, + { + "epoch": 0.4711569734780166, + "grad_norm": 0.12055590748786926, + "learning_rate": 2.6540242165242168e-05, + "loss": 0.0189, + "step": 63740 + }, + { + "epoch": 0.47123089204931845, + "grad_norm": 0.08253350853919983, + "learning_rate": 2.653653252611586e-05, + "loss": 0.0195, + "step": 63750 + }, + { + "epoch": 0.47130481062062035, + "grad_norm": 0.09564819186925888, + "learning_rate": 2.6532822886989556e-05, + "loss": 0.0175, + "step": 63760 + }, + { + "epoch": 0.4713787291919222, + "grad_norm": 0.08060647547245026, + "learning_rate": 2.6529113247863245e-05, + "loss": 0.0173, + "step": 63770 + }, + { + "epoch": 0.47145264776322404, + "grad_norm": 0.09288964420557022, + "learning_rate": 2.652540360873694e-05, + "loss": 0.0183, + "step": 63780 + }, + { + "epoch": 0.4715265663345259, + "grad_norm": 0.061360739171504974, + "learning_rate": 2.652169396961064e-05, + "loss": 0.0182, + "step": 63790 + }, + { + "epoch": 0.47160048490582773, + "grad_norm": 0.08212579041719437, + "learning_rate": 2.651798433048433e-05, + "loss": 0.0188, + "step": 63800 + }, + { + "epoch": 0.4716744034771296, + "grad_norm": 0.07967258244752884, + "learning_rate": 2.6514274691358026e-05, + "loss": 0.0179, + "step": 63810 + }, + { + "epoch": 0.4717483220484314, + "grad_norm": 0.07890528440475464, + "learning_rate": 2.6510565052231722e-05, + "loss": 0.0168, + "step": 63820 + }, + { + "epoch": 0.4718222406197333, + "grad_norm": 0.06696062535047531, + "learning_rate": 2.6506855413105414e-05, + "loss": 0.0165, + "step": 63830 + }, + { + "epoch": 0.47189615919103517, + "grad_norm": 0.0909370705485344, + "learning_rate": 2.650314577397911e-05, + "loss": 0.019, + "step": 63840 + }, + { + "epoch": 0.471970077762337, + "grad_norm": 0.06163240224123001, + "learning_rate": 2.64994361348528e-05, + "loss": 0.0213, + "step": 63850 + }, + { + "epoch": 0.47204399633363886, + "grad_norm": 0.05091671273112297, + "learning_rate": 2.64957264957265e-05, + "loss": 
0.0191, + "step": 63860 + }, + { + "epoch": 0.4721179149049407, + "grad_norm": 0.07003694027662277, + "learning_rate": 2.6492016856600195e-05, + "loss": 0.0182, + "step": 63870 + }, + { + "epoch": 0.47219183347624255, + "grad_norm": 0.07868874818086624, + "learning_rate": 2.6488307217473884e-05, + "loss": 0.0208, + "step": 63880 + }, + { + "epoch": 0.47226575204754445, + "grad_norm": 0.08938395231962204, + "learning_rate": 2.648459757834758e-05, + "loss": 0.0187, + "step": 63890 + }, + { + "epoch": 0.4723396706188463, + "grad_norm": 0.0641806572675705, + "learning_rate": 2.6480887939221273e-05, + "loss": 0.0189, + "step": 63900 + }, + { + "epoch": 0.47241358919014814, + "grad_norm": 0.06578119844198227, + "learning_rate": 2.647717830009497e-05, + "loss": 0.0172, + "step": 63910 + }, + { + "epoch": 0.47248750776145, + "grad_norm": 0.061670076102018356, + "learning_rate": 2.6473468660968664e-05, + "loss": 0.0183, + "step": 63920 + }, + { + "epoch": 0.47256142633275183, + "grad_norm": 0.06204846128821373, + "learning_rate": 2.6469759021842354e-05, + "loss": 0.0191, + "step": 63930 + }, + { + "epoch": 0.4726353449040537, + "grad_norm": 0.0723707377910614, + "learning_rate": 2.6466049382716053e-05, + "loss": 0.0158, + "step": 63940 + }, + { + "epoch": 0.4727092634753555, + "grad_norm": 0.08021928369998932, + "learning_rate": 2.6462339743589742e-05, + "loss": 0.019, + "step": 63950 + }, + { + "epoch": 0.4727831820466574, + "grad_norm": 0.09613403677940369, + "learning_rate": 2.6458630104463438e-05, + "loss": 0.0193, + "step": 63960 + }, + { + "epoch": 0.47285710061795927, + "grad_norm": 0.09475836902856827, + "learning_rate": 2.6454920465337134e-05, + "loss": 0.0179, + "step": 63970 + }, + { + "epoch": 0.4729310191892611, + "grad_norm": 0.08046221733093262, + "learning_rate": 2.6451210826210827e-05, + "loss": 0.0176, + "step": 63980 + }, + { + "epoch": 0.47300493776056296, + "grad_norm": 0.09605638682842255, + "learning_rate": 2.6447501187084523e-05, + "loss": 0.0179, + "step": 63990 + }, + { + "epoch": 0.4730788563318648, + "grad_norm": 0.08003576844930649, + "learning_rate": 2.6443791547958212e-05, + "loss": 0.0208, + "step": 64000 + }, + { + "epoch": 0.47315277490316665, + "grad_norm": 0.11510287970304489, + "learning_rate": 2.644008190883191e-05, + "loss": 0.0172, + "step": 64010 + }, + { + "epoch": 0.47322669347446855, + "grad_norm": 0.07279416173696518, + "learning_rate": 2.6436372269705607e-05, + "loss": 0.0179, + "step": 64020 + }, + { + "epoch": 0.4733006120457704, + "grad_norm": 0.10133165121078491, + "learning_rate": 2.6432662630579296e-05, + "loss": 0.0184, + "step": 64030 + }, + { + "epoch": 0.47337453061707224, + "grad_norm": 0.08167675882577896, + "learning_rate": 2.6428952991452992e-05, + "loss": 0.0179, + "step": 64040 + }, + { + "epoch": 0.4734484491883741, + "grad_norm": 0.07276114076375961, + "learning_rate": 2.642524335232669e-05, + "loss": 0.018, + "step": 64050 + }, + { + "epoch": 0.47352236775967593, + "grad_norm": 0.07859620451927185, + "learning_rate": 2.642153371320038e-05, + "loss": 0.0191, + "step": 64060 + }, + { + "epoch": 0.4735962863309778, + "grad_norm": 0.06081186980009079, + "learning_rate": 2.6417824074074077e-05, + "loss": 0.0161, + "step": 64070 + }, + { + "epoch": 0.4736702049022796, + "grad_norm": 0.07537996768951416, + "learning_rate": 2.6414114434947766e-05, + "loss": 0.0182, + "step": 64080 + }, + { + "epoch": 0.4737441234735815, + "grad_norm": 0.07903088629245758, + "learning_rate": 2.6410404795821465e-05, + "loss": 0.0175, + "step": 64090 + }, + { + 
"epoch": 0.47381804204488337, + "grad_norm": 0.05258747562766075, + "learning_rate": 2.640669515669516e-05, + "loss": 0.0182, + "step": 64100 + }, + { + "epoch": 0.4738919606161852, + "grad_norm": 0.1332976073026657, + "learning_rate": 2.640298551756885e-05, + "loss": 0.0171, + "step": 64110 + }, + { + "epoch": 0.47396587918748706, + "grad_norm": 0.12791724503040314, + "learning_rate": 2.6399275878442547e-05, + "loss": 0.0184, + "step": 64120 + }, + { + "epoch": 0.4740397977587889, + "grad_norm": 0.0703354924917221, + "learning_rate": 2.639556623931624e-05, + "loss": 0.0185, + "step": 64130 + }, + { + "epoch": 0.47411371633009075, + "grad_norm": 0.12382179498672485, + "learning_rate": 2.6391856600189935e-05, + "loss": 0.0175, + "step": 64140 + }, + { + "epoch": 0.47418763490139265, + "grad_norm": 0.068964883685112, + "learning_rate": 2.638814696106363e-05, + "loss": 0.0171, + "step": 64150 + }, + { + "epoch": 0.4742615534726945, + "grad_norm": 0.1036754921078682, + "learning_rate": 2.6384437321937324e-05, + "loss": 0.0182, + "step": 64160 + }, + { + "epoch": 0.47433547204399634, + "grad_norm": 0.0769418254494667, + "learning_rate": 2.638072768281102e-05, + "loss": 0.0167, + "step": 64170 + }, + { + "epoch": 0.4744093906152982, + "grad_norm": 0.09884065389633179, + "learning_rate": 2.637701804368471e-05, + "loss": 0.0187, + "step": 64180 + }, + { + "epoch": 0.47448330918660003, + "grad_norm": 0.07469066977500916, + "learning_rate": 2.6373308404558405e-05, + "loss": 0.017, + "step": 64190 + }, + { + "epoch": 0.4745572277579019, + "grad_norm": 0.08337382227182388, + "learning_rate": 2.63695987654321e-05, + "loss": 0.0189, + "step": 64200 + }, + { + "epoch": 0.4746311463292037, + "grad_norm": 0.0834554135799408, + "learning_rate": 2.6365889126305793e-05, + "loss": 0.0149, + "step": 64210 + }, + { + "epoch": 0.4747050649005056, + "grad_norm": 0.07497718185186386, + "learning_rate": 2.636217948717949e-05, + "loss": 0.0178, + "step": 64220 + }, + { + "epoch": 0.47477898347180747, + "grad_norm": 0.10598395764827728, + "learning_rate": 2.635846984805318e-05, + "loss": 0.0186, + "step": 64230 + }, + { + "epoch": 0.4748529020431093, + "grad_norm": 0.09169488400220871, + "learning_rate": 2.6354760208926878e-05, + "loss": 0.0178, + "step": 64240 + }, + { + "epoch": 0.47492682061441116, + "grad_norm": 0.07592468708753586, + "learning_rate": 2.6351050569800574e-05, + "loss": 0.0178, + "step": 64250 + }, + { + "epoch": 0.475000739185713, + "grad_norm": 0.09536651521921158, + "learning_rate": 2.6347340930674263e-05, + "loss": 0.0168, + "step": 64260 + }, + { + "epoch": 0.47507465775701485, + "grad_norm": 0.09624287486076355, + "learning_rate": 2.634363129154796e-05, + "loss": 0.02, + "step": 64270 + }, + { + "epoch": 0.47514857632831675, + "grad_norm": 0.10279426723718643, + "learning_rate": 2.6339921652421655e-05, + "loss": 0.0183, + "step": 64280 + }, + { + "epoch": 0.4752224948996186, + "grad_norm": 0.0842171236872673, + "learning_rate": 2.6336212013295347e-05, + "loss": 0.0172, + "step": 64290 + }, + { + "epoch": 0.47529641347092044, + "grad_norm": 0.07284293323755264, + "learning_rate": 2.6332502374169043e-05, + "loss": 0.0167, + "step": 64300 + }, + { + "epoch": 0.4753703320422223, + "grad_norm": 0.07489843666553497, + "learning_rate": 2.6328792735042736e-05, + "loss": 0.0153, + "step": 64310 + }, + { + "epoch": 0.47544425061352413, + "grad_norm": 0.07663732767105103, + "learning_rate": 2.6325083095916432e-05, + "loss": 0.0182, + "step": 64320 + }, + { + "epoch": 0.475518169184826, + "grad_norm": 
0.08088131248950958, + "learning_rate": 2.6321373456790128e-05, + "loss": 0.0194, + "step": 64330 + }, + { + "epoch": 0.4755920877561278, + "grad_norm": 0.07426482439041138, + "learning_rate": 2.6317663817663817e-05, + "loss": 0.0158, + "step": 64340 + }, + { + "epoch": 0.4756660063274297, + "grad_norm": 0.08960974961519241, + "learning_rate": 2.6313954178537513e-05, + "loss": 0.02, + "step": 64350 + }, + { + "epoch": 0.47573992489873157, + "grad_norm": 0.062308117747306824, + "learning_rate": 2.6310244539411206e-05, + "loss": 0.0223, + "step": 64360 + }, + { + "epoch": 0.4758138434700334, + "grad_norm": 0.08103105425834656, + "learning_rate": 2.63065349002849e-05, + "loss": 0.0193, + "step": 64370 + }, + { + "epoch": 0.47588776204133526, + "grad_norm": 0.0753159448504448, + "learning_rate": 2.6302825261158598e-05, + "loss": 0.0173, + "step": 64380 + }, + { + "epoch": 0.4759616806126371, + "grad_norm": 0.08921549469232559, + "learning_rate": 2.629911562203229e-05, + "loss": 0.0203, + "step": 64390 + }, + { + "epoch": 0.47603559918393895, + "grad_norm": 0.055965226143598557, + "learning_rate": 2.6295405982905986e-05, + "loss": 0.0157, + "step": 64400 + }, + { + "epoch": 0.47610951775524085, + "grad_norm": 0.05769721791148186, + "learning_rate": 2.6291696343779675e-05, + "loss": 0.0153, + "step": 64410 + }, + { + "epoch": 0.4761834363265427, + "grad_norm": 0.1230938732624054, + "learning_rate": 2.628798670465337e-05, + "loss": 0.0191, + "step": 64420 + }, + { + "epoch": 0.47625735489784454, + "grad_norm": 0.10393833369016647, + "learning_rate": 2.6284277065527067e-05, + "loss": 0.0194, + "step": 64430 + }, + { + "epoch": 0.4763312734691464, + "grad_norm": 0.0997467115521431, + "learning_rate": 2.628056742640076e-05, + "loss": 0.0198, + "step": 64440 + }, + { + "epoch": 0.47640519204044823, + "grad_norm": 0.06631512194871902, + "learning_rate": 2.6276857787274456e-05, + "loss": 0.0194, + "step": 64450 + }, + { + "epoch": 0.4764791106117501, + "grad_norm": 0.06374955177307129, + "learning_rate": 2.627314814814815e-05, + "loss": 0.0161, + "step": 64460 + }, + { + "epoch": 0.4765530291830519, + "grad_norm": 0.06800312548875809, + "learning_rate": 2.6269438509021844e-05, + "loss": 0.0174, + "step": 64470 + }, + { + "epoch": 0.4766269477543538, + "grad_norm": 0.0736674815416336, + "learning_rate": 2.626572886989554e-05, + "loss": 0.0204, + "step": 64480 + }, + { + "epoch": 0.47670086632565567, + "grad_norm": 0.07592196017503738, + "learning_rate": 2.626201923076923e-05, + "loss": 0.0178, + "step": 64490 + }, + { + "epoch": 0.4767747848969575, + "grad_norm": 0.08925735205411911, + "learning_rate": 2.6258309591642926e-05, + "loss": 0.0186, + "step": 64500 + }, + { + "epoch": 0.47684870346825936, + "grad_norm": 0.10544586926698685, + "learning_rate": 2.625459995251662e-05, + "loss": 0.0187, + "step": 64510 + }, + { + "epoch": 0.4769226220395612, + "grad_norm": 0.06558398902416229, + "learning_rate": 2.6250890313390314e-05, + "loss": 0.0191, + "step": 64520 + }, + { + "epoch": 0.47699654061086305, + "grad_norm": 0.11240635812282562, + "learning_rate": 2.624718067426401e-05, + "loss": 0.0176, + "step": 64530 + }, + { + "epoch": 0.47707045918216495, + "grad_norm": 0.07124663889408112, + "learning_rate": 2.6243471035137703e-05, + "loss": 0.0172, + "step": 64540 + }, + { + "epoch": 0.4771443777534668, + "grad_norm": 0.10384904593229294, + "learning_rate": 2.62397613960114e-05, + "loss": 0.0163, + "step": 64550 + }, + { + "epoch": 0.47721829632476864, + "grad_norm": 0.11825846135616302, + "learning_rate": 
2.6236051756885095e-05, + "loss": 0.0204, + "step": 64560 + }, + { + "epoch": 0.4772922148960705, + "grad_norm": 0.09247847646474838, + "learning_rate": 2.6232342117758784e-05, + "loss": 0.019, + "step": 64570 + }, + { + "epoch": 0.47736613346737233, + "grad_norm": 0.0669577345252037, + "learning_rate": 2.622863247863248e-05, + "loss": 0.0198, + "step": 64580 + }, + { + "epoch": 0.4774400520386742, + "grad_norm": 0.08727841079235077, + "learning_rate": 2.6224922839506172e-05, + "loss": 0.0174, + "step": 64590 + }, + { + "epoch": 0.477513970609976, + "grad_norm": 0.09160082042217255, + "learning_rate": 2.6221213200379868e-05, + "loss": 0.0194, + "step": 64600 + }, + { + "epoch": 0.4775878891812779, + "grad_norm": 0.09534512460231781, + "learning_rate": 2.6217503561253564e-05, + "loss": 0.019, + "step": 64610 + }, + { + "epoch": 0.47766180775257977, + "grad_norm": 0.05581028386950493, + "learning_rate": 2.6213793922127257e-05, + "loss": 0.0173, + "step": 64620 + }, + { + "epoch": 0.4777357263238816, + "grad_norm": 0.07642550021409988, + "learning_rate": 2.6210084283000953e-05, + "loss": 0.0168, + "step": 64630 + }, + { + "epoch": 0.47780964489518346, + "grad_norm": 0.05983065813779831, + "learning_rate": 2.6206374643874642e-05, + "loss": 0.0186, + "step": 64640 + }, + { + "epoch": 0.4778835634664853, + "grad_norm": 0.08361926674842834, + "learning_rate": 2.6202665004748338e-05, + "loss": 0.0193, + "step": 64650 + }, + { + "epoch": 0.47795748203778715, + "grad_norm": 0.08084291964769363, + "learning_rate": 2.6198955365622034e-05, + "loss": 0.0196, + "step": 64660 + }, + { + "epoch": 0.47803140060908905, + "grad_norm": 0.08104509115219116, + "learning_rate": 2.6195245726495726e-05, + "loss": 0.019, + "step": 64670 + }, + { + "epoch": 0.4781053191803909, + "grad_norm": 0.08967190235853195, + "learning_rate": 2.6191536087369422e-05, + "loss": 0.0218, + "step": 64680 + }, + { + "epoch": 0.47817923775169274, + "grad_norm": 0.07969162613153458, + "learning_rate": 2.6187826448243115e-05, + "loss": 0.0171, + "step": 64690 + }, + { + "epoch": 0.4782531563229946, + "grad_norm": 0.09354029595851898, + "learning_rate": 2.618411680911681e-05, + "loss": 0.0186, + "step": 64700 + }, + { + "epoch": 0.47832707489429643, + "grad_norm": 0.06691830605268478, + "learning_rate": 2.6180407169990507e-05, + "loss": 0.0159, + "step": 64710 + }, + { + "epoch": 0.4784009934655983, + "grad_norm": 0.08824547380208969, + "learning_rate": 2.6176697530864196e-05, + "loss": 0.0187, + "step": 64720 + }, + { + "epoch": 0.4784749120369001, + "grad_norm": 0.10307306051254272, + "learning_rate": 2.6172987891737892e-05, + "loss": 0.0165, + "step": 64730 + }, + { + "epoch": 0.478548830608202, + "grad_norm": 0.06971874833106995, + "learning_rate": 2.616927825261159e-05, + "loss": 0.0158, + "step": 64740 + }, + { + "epoch": 0.47862274917950387, + "grad_norm": 0.10057180374860764, + "learning_rate": 2.616556861348528e-05, + "loss": 0.0166, + "step": 64750 + }, + { + "epoch": 0.4786966677508057, + "grad_norm": 0.0871298536658287, + "learning_rate": 2.6161858974358977e-05, + "loss": 0.0183, + "step": 64760 + }, + { + "epoch": 0.47877058632210756, + "grad_norm": 0.06503120064735413, + "learning_rate": 2.615814933523267e-05, + "loss": 0.0177, + "step": 64770 + }, + { + "epoch": 0.4788445048934094, + "grad_norm": 0.08198139071464539, + "learning_rate": 2.6154439696106365e-05, + "loss": 0.018, + "step": 64780 + }, + { + "epoch": 0.47891842346471125, + "grad_norm": 0.07585715502500534, + "learning_rate": 2.615073005698006e-05, + "loss": 
0.0182, + "step": 64790 + }, + { + "epoch": 0.47899234203601315, + "grad_norm": 0.07827455550432205, + "learning_rate": 2.614702041785375e-05, + "loss": 0.0204, + "step": 64800 + }, + { + "epoch": 0.479066260607315, + "grad_norm": 0.06340257823467255, + "learning_rate": 2.6143310778727446e-05, + "loss": 0.018, + "step": 64810 + }, + { + "epoch": 0.47914017917861684, + "grad_norm": 0.12032327800989151, + "learning_rate": 2.613960113960114e-05, + "loss": 0.0199, + "step": 64820 + }, + { + "epoch": 0.4792140977499187, + "grad_norm": 0.09568923711776733, + "learning_rate": 2.6135891500474835e-05, + "loss": 0.0188, + "step": 64830 + }, + { + "epoch": 0.47928801632122053, + "grad_norm": 0.07581949234008789, + "learning_rate": 2.613218186134853e-05, + "loss": 0.0187, + "step": 64840 + }, + { + "epoch": 0.4793619348925224, + "grad_norm": 0.07674810290336609, + "learning_rate": 2.6128472222222223e-05, + "loss": 0.0156, + "step": 64850 + }, + { + "epoch": 0.4794358534638242, + "grad_norm": 0.09926360845565796, + "learning_rate": 2.612476258309592e-05, + "loss": 0.0179, + "step": 64860 + }, + { + "epoch": 0.4795097720351261, + "grad_norm": 0.07553702592849731, + "learning_rate": 2.612105294396961e-05, + "loss": 0.0183, + "step": 64870 + }, + { + "epoch": 0.47958369060642797, + "grad_norm": 0.10049743950366974, + "learning_rate": 2.6117343304843304e-05, + "loss": 0.0174, + "step": 64880 + }, + { + "epoch": 0.4796576091777298, + "grad_norm": 0.05749649181962013, + "learning_rate": 2.6113633665717004e-05, + "loss": 0.0176, + "step": 64890 + }, + { + "epoch": 0.47973152774903166, + "grad_norm": 0.07312509417533875, + "learning_rate": 2.6109924026590693e-05, + "loss": 0.0179, + "step": 64900 + }, + { + "epoch": 0.4798054463203335, + "grad_norm": 0.07548145204782486, + "learning_rate": 2.610621438746439e-05, + "loss": 0.0172, + "step": 64910 + }, + { + "epoch": 0.47987936489163535, + "grad_norm": 0.09609496593475342, + "learning_rate": 2.610250474833808e-05, + "loss": 0.0163, + "step": 64920 + }, + { + "epoch": 0.47995328346293725, + "grad_norm": 0.08333881199359894, + "learning_rate": 2.6098795109211778e-05, + "loss": 0.0176, + "step": 64930 + }, + { + "epoch": 0.4800272020342391, + "grad_norm": 0.08614440262317657, + "learning_rate": 2.6095085470085474e-05, + "loss": 0.0165, + "step": 64940 + }, + { + "epoch": 0.48010112060554094, + "grad_norm": 0.11465088278055191, + "learning_rate": 2.6091375830959163e-05, + "loss": 0.017, + "step": 64950 + }, + { + "epoch": 0.4801750391768428, + "grad_norm": 0.10436538606882095, + "learning_rate": 2.608766619183286e-05, + "loss": 0.0188, + "step": 64960 + }, + { + "epoch": 0.48024895774814463, + "grad_norm": 0.09603813290596008, + "learning_rate": 2.6083956552706558e-05, + "loss": 0.0175, + "step": 64970 + }, + { + "epoch": 0.4803228763194465, + "grad_norm": 0.08472411334514618, + "learning_rate": 2.6080246913580247e-05, + "loss": 0.0207, + "step": 64980 + }, + { + "epoch": 0.4803967948907483, + "grad_norm": 0.07732685655355453, + "learning_rate": 2.6076537274453943e-05, + "loss": 0.017, + "step": 64990 + }, + { + "epoch": 0.4804707134620502, + "grad_norm": 0.1053856760263443, + "learning_rate": 2.6072827635327636e-05, + "loss": 0.0203, + "step": 65000 + }, + { + "epoch": 0.48054463203335207, + "grad_norm": 0.07103192061185837, + "learning_rate": 2.6069117996201332e-05, + "loss": 0.018, + "step": 65010 + }, + { + "epoch": 0.4806185506046539, + "grad_norm": 0.08402489125728607, + "learning_rate": 2.6065408357075028e-05, + "loss": 0.0198, + "step": 65020 + }, + { + 
"epoch": 0.48069246917595576, + "grad_norm": 0.06711668521165848, + "learning_rate": 2.6061698717948717e-05, + "loss": 0.0174, + "step": 65030 + }, + { + "epoch": 0.4807663877472576, + "grad_norm": 0.10626384615898132, + "learning_rate": 2.6057989078822416e-05, + "loss": 0.0169, + "step": 65040 + }, + { + "epoch": 0.48084030631855945, + "grad_norm": 0.08938641101121902, + "learning_rate": 2.6054279439696105e-05, + "loss": 0.0173, + "step": 65050 + }, + { + "epoch": 0.48091422488986135, + "grad_norm": 0.07563548535108566, + "learning_rate": 2.60505698005698e-05, + "loss": 0.0182, + "step": 65060 + }, + { + "epoch": 0.4809881434611632, + "grad_norm": 0.0857568308711052, + "learning_rate": 2.6046860161443497e-05, + "loss": 0.0189, + "step": 65070 + }, + { + "epoch": 0.48106206203246504, + "grad_norm": 0.11830922961235046, + "learning_rate": 2.604315052231719e-05, + "loss": 0.0183, + "step": 65080 + }, + { + "epoch": 0.4811359806037669, + "grad_norm": 0.09156577289104462, + "learning_rate": 2.6039440883190886e-05, + "loss": 0.019, + "step": 65090 + }, + { + "epoch": 0.48120989917506873, + "grad_norm": 0.07325790077447891, + "learning_rate": 2.6035731244064575e-05, + "loss": 0.0179, + "step": 65100 + }, + { + "epoch": 0.4812838177463706, + "grad_norm": 0.06196491792798042, + "learning_rate": 2.603202160493827e-05, + "loss": 0.0173, + "step": 65110 + }, + { + "epoch": 0.4813577363176724, + "grad_norm": 0.09580127149820328, + "learning_rate": 2.602831196581197e-05, + "loss": 0.0175, + "step": 65120 + }, + { + "epoch": 0.4814316548889743, + "grad_norm": 0.07817379385232925, + "learning_rate": 2.602460232668566e-05, + "loss": 0.0183, + "step": 65130 + }, + { + "epoch": 0.48150557346027617, + "grad_norm": 0.0812036395072937, + "learning_rate": 2.6020892687559356e-05, + "loss": 0.0159, + "step": 65140 + }, + { + "epoch": 0.481579492031578, + "grad_norm": 0.08802223205566406, + "learning_rate": 2.6017183048433048e-05, + "loss": 0.0186, + "step": 65150 + }, + { + "epoch": 0.48165341060287986, + "grad_norm": 0.08315332978963852, + "learning_rate": 2.6013473409306744e-05, + "loss": 0.0184, + "step": 65160 + }, + { + "epoch": 0.4817273291741817, + "grad_norm": 0.08085574954748154, + "learning_rate": 2.600976377018044e-05, + "loss": 0.018, + "step": 65170 + }, + { + "epoch": 0.48180124774548355, + "grad_norm": 0.05223085731267929, + "learning_rate": 2.600605413105413e-05, + "loss": 0.0178, + "step": 65180 + }, + { + "epoch": 0.48187516631678545, + "grad_norm": 0.055687062442302704, + "learning_rate": 2.600234449192783e-05, + "loss": 0.0167, + "step": 65190 + }, + { + "epoch": 0.4819490848880873, + "grad_norm": 0.08231914043426514, + "learning_rate": 2.5998634852801525e-05, + "loss": 0.0167, + "step": 65200 + }, + { + "epoch": 0.48202300345938914, + "grad_norm": 0.08846697211265564, + "learning_rate": 2.5994925213675214e-05, + "loss": 0.0171, + "step": 65210 + }, + { + "epoch": 0.482096922030691, + "grad_norm": 0.09297716617584229, + "learning_rate": 2.599121557454891e-05, + "loss": 0.0183, + "step": 65220 + }, + { + "epoch": 0.48217084060199283, + "grad_norm": 0.07962552458047867, + "learning_rate": 2.5987505935422602e-05, + "loss": 0.0168, + "step": 65230 + }, + { + "epoch": 0.4822447591732947, + "grad_norm": 0.07073140889406204, + "learning_rate": 2.59837962962963e-05, + "loss": 0.017, + "step": 65240 + }, + { + "epoch": 0.4823186777445966, + "grad_norm": 0.09182487428188324, + "learning_rate": 2.5980086657169994e-05, + "loss": 0.0176, + "step": 65250 + }, + { + "epoch": 0.4823925963158984, + 
"grad_norm": 0.06719227135181427, + "learning_rate": 2.5976377018043683e-05, + "loss": 0.0188, + "step": 65260 + }, + { + "epoch": 0.48246651488720027, + "grad_norm": 0.07798336446285248, + "learning_rate": 2.5972667378917383e-05, + "loss": 0.0192, + "step": 65270 + }, + { + "epoch": 0.4825404334585021, + "grad_norm": 0.11315786093473434, + "learning_rate": 2.5968957739791072e-05, + "loss": 0.0191, + "step": 65280 + }, + { + "epoch": 0.48261435202980396, + "grad_norm": 0.0917963758111, + "learning_rate": 2.5965248100664768e-05, + "loss": 0.0174, + "step": 65290 + }, + { + "epoch": 0.4826882706011058, + "grad_norm": 0.07520270347595215, + "learning_rate": 2.5961538461538464e-05, + "loss": 0.017, + "step": 65300 + }, + { + "epoch": 0.48276218917240765, + "grad_norm": 0.08580703288316727, + "learning_rate": 2.5957828822412157e-05, + "loss": 0.0193, + "step": 65310 + }, + { + "epoch": 0.48283610774370955, + "grad_norm": 0.15712198615074158, + "learning_rate": 2.5954119183285853e-05, + "loss": 0.019, + "step": 65320 + }, + { + "epoch": 0.4829100263150114, + "grad_norm": 0.0746866911649704, + "learning_rate": 2.595040954415954e-05, + "loss": 0.0155, + "step": 65330 + }, + { + "epoch": 0.48298394488631324, + "grad_norm": 0.08219867199659348, + "learning_rate": 2.5946699905033238e-05, + "loss": 0.016, + "step": 65340 + }, + { + "epoch": 0.4830578634576151, + "grad_norm": 0.10122178494930267, + "learning_rate": 2.5942990265906937e-05, + "loss": 0.0167, + "step": 65350 + }, + { + "epoch": 0.48313178202891693, + "grad_norm": 0.11777858436107635, + "learning_rate": 2.5939280626780626e-05, + "loss": 0.0165, + "step": 65360 + }, + { + "epoch": 0.4832057006002188, + "grad_norm": 0.0590934231877327, + "learning_rate": 2.5935570987654322e-05, + "loss": 0.0181, + "step": 65370 + }, + { + "epoch": 0.4832796191715207, + "grad_norm": 0.07778100669384003, + "learning_rate": 2.5931861348528015e-05, + "loss": 0.0169, + "step": 65380 + }, + { + "epoch": 0.4833535377428225, + "grad_norm": 0.09313920885324478, + "learning_rate": 2.592815170940171e-05, + "loss": 0.0186, + "step": 65390 + }, + { + "epoch": 0.48342745631412437, + "grad_norm": 0.11338415741920471, + "learning_rate": 2.5924442070275407e-05, + "loss": 0.0173, + "step": 65400 + }, + { + "epoch": 0.4835013748854262, + "grad_norm": 0.0711422711610794, + "learning_rate": 2.5920732431149096e-05, + "loss": 0.0174, + "step": 65410 + }, + { + "epoch": 0.48357529345672806, + "grad_norm": 0.0966838076710701, + "learning_rate": 2.5917022792022795e-05, + "loss": 0.0188, + "step": 65420 + }, + { + "epoch": 0.4836492120280299, + "grad_norm": 0.0847175195813179, + "learning_rate": 2.591331315289649e-05, + "loss": 0.0198, + "step": 65430 + }, + { + "epoch": 0.48372313059933175, + "grad_norm": 0.08860213309526443, + "learning_rate": 2.590960351377018e-05, + "loss": 0.0184, + "step": 65440 + }, + { + "epoch": 0.48379704917063365, + "grad_norm": 0.0929391011595726, + "learning_rate": 2.5905893874643876e-05, + "loss": 0.0184, + "step": 65450 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.08175451308488846, + "learning_rate": 2.590218423551757e-05, + "loss": 0.0175, + "step": 65460 + }, + { + "epoch": 0.48394488631323734, + "grad_norm": 0.0830913782119751, + "learning_rate": 2.5898474596391265e-05, + "loss": 0.0177, + "step": 65470 + }, + { + "epoch": 0.4840188048845392, + "grad_norm": 0.09538637101650238, + "learning_rate": 2.589476495726496e-05, + "loss": 0.0178, + "step": 65480 + }, + { + "epoch": 0.48409272345584103, + "grad_norm": 0.09078861773014069, + 
"learning_rate": 2.589105531813865e-05, + "loss": 0.0193, + "step": 65490 + }, + { + "epoch": 0.4841666420271429, + "grad_norm": 0.08665713667869568, + "learning_rate": 2.588734567901235e-05, + "loss": 0.0172, + "step": 65500 + }, + { + "epoch": 0.4842405605984448, + "grad_norm": 0.0846015214920044, + "learning_rate": 2.588363603988604e-05, + "loss": 0.0192, + "step": 65510 + }, + { + "epoch": 0.4843144791697466, + "grad_norm": 0.09109742194414139, + "learning_rate": 2.5879926400759735e-05, + "loss": 0.0157, + "step": 65520 + }, + { + "epoch": 0.48438839774104847, + "grad_norm": 0.10539749264717102, + "learning_rate": 2.587621676163343e-05, + "loss": 0.0199, + "step": 65530 + }, + { + "epoch": 0.4844623163123503, + "grad_norm": 0.11487936973571777, + "learning_rate": 2.5872507122507123e-05, + "loss": 0.0195, + "step": 65540 + }, + { + "epoch": 0.48453623488365216, + "grad_norm": 0.05182930827140808, + "learning_rate": 2.586879748338082e-05, + "loss": 0.016, + "step": 65550 + }, + { + "epoch": 0.484610153454954, + "grad_norm": 0.0592120923101902, + "learning_rate": 2.5865087844254508e-05, + "loss": 0.0174, + "step": 65560 + }, + { + "epoch": 0.48468407202625585, + "grad_norm": 0.07573354989290237, + "learning_rate": 2.5861378205128208e-05, + "loss": 0.0167, + "step": 65570 + }, + { + "epoch": 0.48475799059755775, + "grad_norm": 0.07868777960538864, + "learning_rate": 2.5857668566001904e-05, + "loss": 0.0179, + "step": 65580 + }, + { + "epoch": 0.4848319091688596, + "grad_norm": 0.1018095463514328, + "learning_rate": 2.5853958926875593e-05, + "loss": 0.0155, + "step": 65590 + }, + { + "epoch": 0.48490582774016144, + "grad_norm": 0.0721440240740776, + "learning_rate": 2.585024928774929e-05, + "loss": 0.0175, + "step": 65600 + }, + { + "epoch": 0.4849797463114633, + "grad_norm": 0.08099162578582764, + "learning_rate": 2.584653964862298e-05, + "loss": 0.0164, + "step": 65610 + }, + { + "epoch": 0.48505366488276513, + "grad_norm": 0.11158620566129684, + "learning_rate": 2.5842830009496677e-05, + "loss": 0.0181, + "step": 65620 + }, + { + "epoch": 0.485127583454067, + "grad_norm": 0.09026879817247391, + "learning_rate": 2.5839120370370373e-05, + "loss": 0.0187, + "step": 65630 + }, + { + "epoch": 0.4852015020253689, + "grad_norm": 0.07620834559202194, + "learning_rate": 2.5835410731244062e-05, + "loss": 0.0217, + "step": 65640 + }, + { + "epoch": 0.4852754205966707, + "grad_norm": 0.08644992113113403, + "learning_rate": 2.5831701092117762e-05, + "loss": 0.0176, + "step": 65650 + }, + { + "epoch": 0.48534933916797257, + "grad_norm": 0.08692828565835953, + "learning_rate": 2.5827991452991458e-05, + "loss": 0.0202, + "step": 65660 + }, + { + "epoch": 0.4854232577392744, + "grad_norm": 0.08011046051979065, + "learning_rate": 2.5824281813865147e-05, + "loss": 0.02, + "step": 65670 + }, + { + "epoch": 0.48549717631057626, + "grad_norm": 0.057609450072050095, + "learning_rate": 2.5820572174738843e-05, + "loss": 0.0172, + "step": 65680 + }, + { + "epoch": 0.4855710948818781, + "grad_norm": 0.09666851162910461, + "learning_rate": 2.5816862535612536e-05, + "loss": 0.0178, + "step": 65690 + }, + { + "epoch": 0.48564501345317995, + "grad_norm": 0.0653696209192276, + "learning_rate": 2.581315289648623e-05, + "loss": 0.0163, + "step": 65700 + }, + { + "epoch": 0.48571893202448185, + "grad_norm": 0.07197672873735428, + "learning_rate": 2.5809443257359927e-05, + "loss": 0.0205, + "step": 65710 + }, + { + "epoch": 0.4857928505957837, + "grad_norm": 0.08986418694257736, + "learning_rate": 2.580573361823362e-05, + 
"loss": 0.0215, + "step": 65720 + }, + { + "epoch": 0.48586676916708554, + "grad_norm": 0.09233395010232925, + "learning_rate": 2.5802023979107316e-05, + "loss": 0.0175, + "step": 65730 + }, + { + "epoch": 0.4859406877383874, + "grad_norm": 0.08180242031812668, + "learning_rate": 2.5798314339981005e-05, + "loss": 0.0217, + "step": 65740 + }, + { + "epoch": 0.48601460630968923, + "grad_norm": 0.08367449790239334, + "learning_rate": 2.57946047008547e-05, + "loss": 0.018, + "step": 65750 + }, + { + "epoch": 0.4860885248809911, + "grad_norm": 0.07709220051765442, + "learning_rate": 2.5790895061728397e-05, + "loss": 0.0182, + "step": 65760 + }, + { + "epoch": 0.486162443452293, + "grad_norm": 0.09754809737205505, + "learning_rate": 2.578718542260209e-05, + "loss": 0.0159, + "step": 65770 + }, + { + "epoch": 0.4862363620235948, + "grad_norm": 0.0685078427195549, + "learning_rate": 2.5783475783475786e-05, + "loss": 0.0165, + "step": 65780 + }, + { + "epoch": 0.48631028059489667, + "grad_norm": 0.1023360937833786, + "learning_rate": 2.5779766144349475e-05, + "loss": 0.0204, + "step": 65790 + }, + { + "epoch": 0.4863841991661985, + "grad_norm": 0.09142835438251495, + "learning_rate": 2.5776056505223174e-05, + "loss": 0.0156, + "step": 65800 + }, + { + "epoch": 0.48645811773750036, + "grad_norm": 0.06241421401500702, + "learning_rate": 2.577234686609687e-05, + "loss": 0.0186, + "step": 65810 + }, + { + "epoch": 0.4865320363088022, + "grad_norm": 0.09255488216876984, + "learning_rate": 2.576863722697056e-05, + "loss": 0.0191, + "step": 65820 + }, + { + "epoch": 0.48660595488010405, + "grad_norm": 0.06651777029037476, + "learning_rate": 2.5764927587844255e-05, + "loss": 0.0155, + "step": 65830 + }, + { + "epoch": 0.48667987345140595, + "grad_norm": 0.07002021372318268, + "learning_rate": 2.5761217948717948e-05, + "loss": 0.0167, + "step": 65840 + }, + { + "epoch": 0.4867537920227078, + "grad_norm": 0.07770857959985733, + "learning_rate": 2.5757508309591644e-05, + "loss": 0.0208, + "step": 65850 + }, + { + "epoch": 0.48682771059400964, + "grad_norm": 0.10046552866697311, + "learning_rate": 2.575379867046534e-05, + "loss": 0.019, + "step": 65860 + }, + { + "epoch": 0.4869016291653115, + "grad_norm": 0.06843043118715286, + "learning_rate": 2.5750089031339032e-05, + "loss": 0.0192, + "step": 65870 + }, + { + "epoch": 0.48697554773661333, + "grad_norm": 0.05567110329866409, + "learning_rate": 2.574637939221273e-05, + "loss": 0.0201, + "step": 65880 + }, + { + "epoch": 0.4870494663079152, + "grad_norm": 0.10436822474002838, + "learning_rate": 2.5742669753086424e-05, + "loss": 0.0164, + "step": 65890 + }, + { + "epoch": 0.4871233848792171, + "grad_norm": 0.07619208842515945, + "learning_rate": 2.5738960113960114e-05, + "loss": 0.0176, + "step": 65900 + }, + { + "epoch": 0.4871973034505189, + "grad_norm": 0.08201929926872253, + "learning_rate": 2.573525047483381e-05, + "loss": 0.0211, + "step": 65910 + }, + { + "epoch": 0.48727122202182077, + "grad_norm": 0.08895740658044815, + "learning_rate": 2.5731540835707502e-05, + "loss": 0.0165, + "step": 65920 + }, + { + "epoch": 0.4873451405931226, + "grad_norm": 0.05612145736813545, + "learning_rate": 2.5727831196581198e-05, + "loss": 0.016, + "step": 65930 + }, + { + "epoch": 0.48741905916442446, + "grad_norm": 0.0678725466132164, + "learning_rate": 2.5724121557454894e-05, + "loss": 0.0181, + "step": 65940 + }, + { + "epoch": 0.4874929777357263, + "grad_norm": 0.0889897495508194, + "learning_rate": 2.5720411918328587e-05, + "loss": 0.016, + "step": 65950 + }, + { + 
"epoch": 0.48756689630702815, + "grad_norm": 0.09029419720172882, + "learning_rate": 2.5716702279202283e-05, + "loss": 0.0201, + "step": 65960 + }, + { + "epoch": 0.48764081487833005, + "grad_norm": 0.057307250797748566, + "learning_rate": 2.5712992640075972e-05, + "loss": 0.0169, + "step": 65970 + }, + { + "epoch": 0.4877147334496319, + "grad_norm": 0.08193568885326385, + "learning_rate": 2.5709283000949668e-05, + "loss": 0.0196, + "step": 65980 + }, + { + "epoch": 0.48778865202093374, + "grad_norm": 0.055389512330293655, + "learning_rate": 2.5705573361823364e-05, + "loss": 0.0203, + "step": 65990 + }, + { + "epoch": 0.4878625705922356, + "grad_norm": 0.07593169063329697, + "learning_rate": 2.5701863722697056e-05, + "loss": 0.0202, + "step": 66000 + }, + { + "epoch": 0.48793648916353743, + "grad_norm": 0.06627099961042404, + "learning_rate": 2.5698154083570752e-05, + "loss": 0.0186, + "step": 66010 + }, + { + "epoch": 0.4880104077348393, + "grad_norm": 0.07474975287914276, + "learning_rate": 2.5694444444444445e-05, + "loss": 0.0158, + "step": 66020 + }, + { + "epoch": 0.4880843263061412, + "grad_norm": 0.08697369694709778, + "learning_rate": 2.569073480531814e-05, + "loss": 0.0179, + "step": 66030 + }, + { + "epoch": 0.488158244877443, + "grad_norm": 0.06758727878332138, + "learning_rate": 2.5687025166191837e-05, + "loss": 0.0178, + "step": 66040 + }, + { + "epoch": 0.48823216344874487, + "grad_norm": 0.1088317334651947, + "learning_rate": 2.5683315527065526e-05, + "loss": 0.0219, + "step": 66050 + }, + { + "epoch": 0.4883060820200467, + "grad_norm": 0.11058960855007172, + "learning_rate": 2.5679605887939222e-05, + "loss": 0.0184, + "step": 66060 + }, + { + "epoch": 0.48838000059134856, + "grad_norm": 0.07780216634273529, + "learning_rate": 2.5675896248812914e-05, + "loss": 0.018, + "step": 66070 + }, + { + "epoch": 0.4884539191626504, + "grad_norm": 0.07846556603908539, + "learning_rate": 2.567218660968661e-05, + "loss": 0.0183, + "step": 66080 + }, + { + "epoch": 0.48852783773395225, + "grad_norm": 0.0712955892086029, + "learning_rate": 2.5668476970560306e-05, + "loss": 0.0164, + "step": 66090 + }, + { + "epoch": 0.48860175630525415, + "grad_norm": 0.09872201085090637, + "learning_rate": 2.5664767331434e-05, + "loss": 0.0198, + "step": 66100 + }, + { + "epoch": 0.488675674876556, + "grad_norm": 0.09058482199907303, + "learning_rate": 2.5661057692307695e-05, + "loss": 0.0178, + "step": 66110 + }, + { + "epoch": 0.48874959344785784, + "grad_norm": 0.07849286496639252, + "learning_rate": 2.565734805318139e-05, + "loss": 0.0198, + "step": 66120 + }, + { + "epoch": 0.4888235120191597, + "grad_norm": 0.11207176744937897, + "learning_rate": 2.565363841405508e-05, + "loss": 0.0195, + "step": 66130 + }, + { + "epoch": 0.48889743059046153, + "grad_norm": 0.06829576194286346, + "learning_rate": 2.5649928774928776e-05, + "loss": 0.0183, + "step": 66140 + }, + { + "epoch": 0.4889713491617634, + "grad_norm": 0.07670718431472778, + "learning_rate": 2.564621913580247e-05, + "loss": 0.0173, + "step": 66150 + }, + { + "epoch": 0.4890452677330653, + "grad_norm": 0.10271691530942917, + "learning_rate": 2.5642509496676165e-05, + "loss": 0.0162, + "step": 66160 + }, + { + "epoch": 0.4891191863043671, + "grad_norm": 0.10622168332338333, + "learning_rate": 2.563879985754986e-05, + "loss": 0.0191, + "step": 66170 + }, + { + "epoch": 0.48919310487566897, + "grad_norm": 0.07407471537590027, + "learning_rate": 2.5635090218423553e-05, + "loss": 0.0172, + "step": 66180 + }, + { + "epoch": 0.4892670234469708, + 
"grad_norm": 0.08040529489517212, + "learning_rate": 2.563138057929725e-05, + "loss": 0.019, + "step": 66190 + }, + { + "epoch": 0.48934094201827266, + "grad_norm": 0.06908878684043884, + "learning_rate": 2.562767094017094e-05, + "loss": 0.0196, + "step": 66200 + }, + { + "epoch": 0.4894148605895745, + "grad_norm": 0.07580314576625824, + "learning_rate": 2.5623961301044634e-05, + "loss": 0.0162, + "step": 66210 + }, + { + "epoch": 0.48948877916087635, + "grad_norm": 0.09771912544965744, + "learning_rate": 2.562025166191833e-05, + "loss": 0.0197, + "step": 66220 + }, + { + "epoch": 0.48956269773217825, + "grad_norm": 0.0960688367486, + "learning_rate": 2.5616542022792023e-05, + "loss": 0.0199, + "step": 66230 + }, + { + "epoch": 0.4896366163034801, + "grad_norm": 0.06303451955318451, + "learning_rate": 2.561283238366572e-05, + "loss": 0.0188, + "step": 66240 + }, + { + "epoch": 0.48971053487478194, + "grad_norm": 0.06757359206676483, + "learning_rate": 2.560912274453941e-05, + "loss": 0.0188, + "step": 66250 + }, + { + "epoch": 0.4897844534460838, + "grad_norm": 0.10776185989379883, + "learning_rate": 2.5605413105413107e-05, + "loss": 0.0163, + "step": 66260 + }, + { + "epoch": 0.48985837201738563, + "grad_norm": 0.08810808509588242, + "learning_rate": 2.5601703466286803e-05, + "loss": 0.0182, + "step": 66270 + }, + { + "epoch": 0.4899322905886875, + "grad_norm": 0.09690174460411072, + "learning_rate": 2.5597993827160493e-05, + "loss": 0.0191, + "step": 66280 + }, + { + "epoch": 0.4900062091599894, + "grad_norm": 0.07141640037298203, + "learning_rate": 2.559428418803419e-05, + "loss": 0.0175, + "step": 66290 + }, + { + "epoch": 0.4900801277312912, + "grad_norm": 0.065648153424263, + "learning_rate": 2.559057454890788e-05, + "loss": 0.0197, + "step": 66300 + }, + { + "epoch": 0.49015404630259307, + "grad_norm": 0.07435780018568039, + "learning_rate": 2.5586864909781577e-05, + "loss": 0.0179, + "step": 66310 + }, + { + "epoch": 0.4902279648738949, + "grad_norm": 0.0778043195605278, + "learning_rate": 2.5583155270655273e-05, + "loss": 0.0173, + "step": 66320 + }, + { + "epoch": 0.49030188344519676, + "grad_norm": 0.05645797401666641, + "learning_rate": 2.5579445631528966e-05, + "loss": 0.0169, + "step": 66330 + }, + { + "epoch": 0.4903758020164986, + "grad_norm": 0.09132159501314163, + "learning_rate": 2.557573599240266e-05, + "loss": 0.0182, + "step": 66340 + }, + { + "epoch": 0.49044972058780045, + "grad_norm": 0.08254007250070572, + "learning_rate": 2.5572026353276358e-05, + "loss": 0.0241, + "step": 66350 + }, + { + "epoch": 0.49052363915910235, + "grad_norm": 0.0711667537689209, + "learning_rate": 2.5568316714150047e-05, + "loss": 0.02, + "step": 66360 + }, + { + "epoch": 0.4905975577304042, + "grad_norm": 0.07004100829362869, + "learning_rate": 2.5564607075023743e-05, + "loss": 0.0152, + "step": 66370 + }, + { + "epoch": 0.49067147630170604, + "grad_norm": 0.07659637928009033, + "learning_rate": 2.5560897435897435e-05, + "loss": 0.0194, + "step": 66380 + }, + { + "epoch": 0.4907453948730079, + "grad_norm": 0.06178012117743492, + "learning_rate": 2.555718779677113e-05, + "loss": 0.0195, + "step": 66390 + }, + { + "epoch": 0.49081931344430973, + "grad_norm": 0.09294130653142929, + "learning_rate": 2.5553478157644827e-05, + "loss": 0.0196, + "step": 66400 + }, + { + "epoch": 0.4908932320156116, + "grad_norm": 0.08098200708627701, + "learning_rate": 2.554976851851852e-05, + "loss": 0.0171, + "step": 66410 + }, + { + "epoch": 0.4909671505869135, + "grad_norm": 0.09157669544219971, + 
"learning_rate": 2.5546058879392216e-05, + "loss": 0.0164, + "step": 66420 + }, + { + "epoch": 0.4910410691582153, + "grad_norm": 0.10379356145858765, + "learning_rate": 2.5542349240265905e-05, + "loss": 0.0181, + "step": 66430 + }, + { + "epoch": 0.49111498772951717, + "grad_norm": 0.08127181231975555, + "learning_rate": 2.55386396011396e-05, + "loss": 0.0178, + "step": 66440 + }, + { + "epoch": 0.491188906300819, + "grad_norm": 0.07030443102121353, + "learning_rate": 2.55349299620133e-05, + "loss": 0.0178, + "step": 66450 + }, + { + "epoch": 0.49126282487212086, + "grad_norm": 0.061049479991197586, + "learning_rate": 2.553122032288699e-05, + "loss": 0.0193, + "step": 66460 + }, + { + "epoch": 0.4913367434434227, + "grad_norm": 0.04734016954898834, + "learning_rate": 2.5527510683760685e-05, + "loss": 0.0191, + "step": 66470 + }, + { + "epoch": 0.49141066201472455, + "grad_norm": 0.13547852635383606, + "learning_rate": 2.5523801044634378e-05, + "loss": 0.0201, + "step": 66480 + }, + { + "epoch": 0.49148458058602645, + "grad_norm": 0.05827900767326355, + "learning_rate": 2.5520091405508074e-05, + "loss": 0.0155, + "step": 66490 + }, + { + "epoch": 0.4915584991573283, + "grad_norm": 0.10678491741418839, + "learning_rate": 2.551638176638177e-05, + "loss": 0.0174, + "step": 66500 + }, + { + "epoch": 0.49163241772863014, + "grad_norm": 0.07167919725179672, + "learning_rate": 2.551267212725546e-05, + "loss": 0.0186, + "step": 66510 + }, + { + "epoch": 0.491706336299932, + "grad_norm": 0.08683153241872787, + "learning_rate": 2.5508962488129155e-05, + "loss": 0.0168, + "step": 66520 + }, + { + "epoch": 0.49178025487123384, + "grad_norm": 0.1263267695903778, + "learning_rate": 2.5505252849002848e-05, + "loss": 0.0196, + "step": 66530 + }, + { + "epoch": 0.4918541734425357, + "grad_norm": 0.07182847708463669, + "learning_rate": 2.5501543209876544e-05, + "loss": 0.0159, + "step": 66540 + }, + { + "epoch": 0.4919280920138376, + "grad_norm": 0.11993493884801865, + "learning_rate": 2.549783357075024e-05, + "loss": 0.0175, + "step": 66550 + }, + { + "epoch": 0.4920020105851394, + "grad_norm": 0.088954858481884, + "learning_rate": 2.5494123931623932e-05, + "loss": 0.0182, + "step": 66560 + }, + { + "epoch": 0.49207592915644127, + "grad_norm": 0.08541052788496017, + "learning_rate": 2.5490414292497628e-05, + "loss": 0.0194, + "step": 66570 + }, + { + "epoch": 0.4921498477277431, + "grad_norm": 0.09256555885076523, + "learning_rate": 2.5486704653371324e-05, + "loss": 0.0189, + "step": 66580 + }, + { + "epoch": 0.49222376629904496, + "grad_norm": 0.06345752626657486, + "learning_rate": 2.5482995014245013e-05, + "loss": 0.018, + "step": 66590 + }, + { + "epoch": 0.4922976848703468, + "grad_norm": 0.0659131333231926, + "learning_rate": 2.5479285375118713e-05, + "loss": 0.0175, + "step": 66600 + }, + { + "epoch": 0.49237160344164865, + "grad_norm": 0.08424822241067886, + "learning_rate": 2.5475575735992402e-05, + "loss": 0.0172, + "step": 66610 + }, + { + "epoch": 0.49244552201295055, + "grad_norm": 0.10027677565813065, + "learning_rate": 2.5471866096866098e-05, + "loss": 0.0189, + "step": 66620 + }, + { + "epoch": 0.4925194405842524, + "grad_norm": 0.05486908182501793, + "learning_rate": 2.5468156457739794e-05, + "loss": 0.016, + "step": 66630 + }, + { + "epoch": 0.49259335915555424, + "grad_norm": 0.09274695813655853, + "learning_rate": 2.5464446818613486e-05, + "loss": 0.0183, + "step": 66640 + }, + { + "epoch": 0.4926672777268561, + "grad_norm": 0.06558877974748611, + "learning_rate": 
2.5460737179487182e-05, + "loss": 0.0201, + "step": 66650 + }, + { + "epoch": 0.49274119629815794, + "grad_norm": 0.10087820142507553, + "learning_rate": 2.545702754036087e-05, + "loss": 0.0193, + "step": 66660 + }, + { + "epoch": 0.4928151148694598, + "grad_norm": 0.0879107117652893, + "learning_rate": 2.5453317901234567e-05, + "loss": 0.0167, + "step": 66670 + }, + { + "epoch": 0.4928890334407617, + "grad_norm": 0.06862018257379532, + "learning_rate": 2.5449608262108267e-05, + "loss": 0.0197, + "step": 66680 + }, + { + "epoch": 0.4929629520120635, + "grad_norm": 0.10974738746881485, + "learning_rate": 2.5445898622981956e-05, + "loss": 0.0182, + "step": 66690 + }, + { + "epoch": 0.49303687058336537, + "grad_norm": 0.08736329525709152, + "learning_rate": 2.5442188983855652e-05, + "loss": 0.0187, + "step": 66700 + }, + { + "epoch": 0.4931107891546672, + "grad_norm": 0.10087980329990387, + "learning_rate": 2.5438479344729345e-05, + "loss": 0.0171, + "step": 66710 + }, + { + "epoch": 0.49318470772596906, + "grad_norm": 0.08211347460746765, + "learning_rate": 2.543476970560304e-05, + "loss": 0.0157, + "step": 66720 + }, + { + "epoch": 0.4932586262972709, + "grad_norm": 0.06540828943252563, + "learning_rate": 2.5431060066476737e-05, + "loss": 0.0204, + "step": 66730 + }, + { + "epoch": 0.49333254486857275, + "grad_norm": 0.07201921194791794, + "learning_rate": 2.5427350427350426e-05, + "loss": 0.018, + "step": 66740 + }, + { + "epoch": 0.49340646343987465, + "grad_norm": 0.09930144995450974, + "learning_rate": 2.5423640788224125e-05, + "loss": 0.0171, + "step": 66750 + }, + { + "epoch": 0.4934803820111765, + "grad_norm": 0.08865474909543991, + "learning_rate": 2.5419931149097814e-05, + "loss": 0.0207, + "step": 66760 + }, + { + "epoch": 0.49355430058247834, + "grad_norm": 0.0578952357172966, + "learning_rate": 2.541622150997151e-05, + "loss": 0.0183, + "step": 66770 + }, + { + "epoch": 0.4936282191537802, + "grad_norm": 0.1250295639038086, + "learning_rate": 2.5412511870845206e-05, + "loss": 0.016, + "step": 66780 + }, + { + "epoch": 0.49370213772508204, + "grad_norm": 0.08131686598062515, + "learning_rate": 2.54088022317189e-05, + "loss": 0.0164, + "step": 66790 + }, + { + "epoch": 0.4937760562963839, + "grad_norm": 0.08497151732444763, + "learning_rate": 2.5405092592592595e-05, + "loss": 0.0169, + "step": 66800 + }, + { + "epoch": 0.4938499748676858, + "grad_norm": 0.09198103845119476, + "learning_rate": 2.540138295346629e-05, + "loss": 0.0175, + "step": 66810 + }, + { + "epoch": 0.4939238934389876, + "grad_norm": 0.054499588906764984, + "learning_rate": 2.539767331433998e-05, + "loss": 0.0178, + "step": 66820 + }, + { + "epoch": 0.49399781201028947, + "grad_norm": 0.06767275184392929, + "learning_rate": 2.539396367521368e-05, + "loss": 0.016, + "step": 66830 + }, + { + "epoch": 0.4940717305815913, + "grad_norm": 0.10051191598176956, + "learning_rate": 2.539025403608737e-05, + "loss": 0.0194, + "step": 66840 + }, + { + "epoch": 0.49414564915289316, + "grad_norm": 0.09332738071680069, + "learning_rate": 2.5386544396961064e-05, + "loss": 0.0168, + "step": 66850 + }, + { + "epoch": 0.494219567724195, + "grad_norm": 0.06618684530258179, + "learning_rate": 2.538283475783476e-05, + "loss": 0.0156, + "step": 66860 + }, + { + "epoch": 0.49429348629549685, + "grad_norm": 0.07552550733089447, + "learning_rate": 2.5379125118708453e-05, + "loss": 0.0214, + "step": 66870 + }, + { + "epoch": 0.49436740486679875, + "grad_norm": 0.07698984444141388, + "learning_rate": 2.537541547958215e-05, + "loss": 0.0165, 
+ "step": 66880 + }, + { + "epoch": 0.4944413234381006, + "grad_norm": 0.06162414327263832, + "learning_rate": 2.5371705840455838e-05, + "loss": 0.0163, + "step": 66890 + }, + { + "epoch": 0.49451524200940244, + "grad_norm": 0.09170342981815338, + "learning_rate": 2.5367996201329537e-05, + "loss": 0.0201, + "step": 66900 + }, + { + "epoch": 0.4945891605807043, + "grad_norm": 0.08989348262548447, + "learning_rate": 2.5364286562203233e-05, + "loss": 0.0194, + "step": 66910 + }, + { + "epoch": 0.49466307915200614, + "grad_norm": 0.0776243656873703, + "learning_rate": 2.5360576923076923e-05, + "loss": 0.0178, + "step": 66920 + }, + { + "epoch": 0.494736997723308, + "grad_norm": 0.11429768055677414, + "learning_rate": 2.535686728395062e-05, + "loss": 0.0183, + "step": 66930 + }, + { + "epoch": 0.4948109162946099, + "grad_norm": 0.07270469516515732, + "learning_rate": 2.535315764482431e-05, + "loss": 0.0176, + "step": 66940 + }, + { + "epoch": 0.4948848348659117, + "grad_norm": 0.0862298533320427, + "learning_rate": 2.5349448005698007e-05, + "loss": 0.0193, + "step": 66950 + }, + { + "epoch": 0.49495875343721357, + "grad_norm": 0.09826846420764923, + "learning_rate": 2.5345738366571703e-05, + "loss": 0.0185, + "step": 66960 + }, + { + "epoch": 0.4950326720085154, + "grad_norm": 0.10540689527988434, + "learning_rate": 2.5342028727445392e-05, + "loss": 0.0172, + "step": 66970 + }, + { + "epoch": 0.49510659057981726, + "grad_norm": 0.0765228345990181, + "learning_rate": 2.533831908831909e-05, + "loss": 0.0186, + "step": 66980 + }, + { + "epoch": 0.4951805091511191, + "grad_norm": 0.08622103184461594, + "learning_rate": 2.533460944919278e-05, + "loss": 0.0177, + "step": 66990 + }, + { + "epoch": 0.49525442772242095, + "grad_norm": 0.07351600378751755, + "learning_rate": 2.5330899810066477e-05, + "loss": 0.0183, + "step": 67000 + }, + { + "epoch": 0.49532834629372285, + "grad_norm": 0.0732818990945816, + "learning_rate": 2.5327190170940173e-05, + "loss": 0.018, + "step": 67010 + }, + { + "epoch": 0.4954022648650247, + "grad_norm": 0.10038938373327255, + "learning_rate": 2.5323480531813865e-05, + "loss": 0.0201, + "step": 67020 + }, + { + "epoch": 0.49547618343632654, + "grad_norm": 0.11697548627853394, + "learning_rate": 2.531977089268756e-05, + "loss": 0.0178, + "step": 67030 + }, + { + "epoch": 0.4955501020076284, + "grad_norm": 0.0824909508228302, + "learning_rate": 2.5316061253561257e-05, + "loss": 0.0187, + "step": 67040 + }, + { + "epoch": 0.49562402057893024, + "grad_norm": 0.07028558850288391, + "learning_rate": 2.531235161443495e-05, + "loss": 0.0155, + "step": 67050 + }, + { + "epoch": 0.4956979391502321, + "grad_norm": 0.06478890776634216, + "learning_rate": 2.5308641975308646e-05, + "loss": 0.0179, + "step": 67060 + }, + { + "epoch": 0.495771857721534, + "grad_norm": 0.08976764231920242, + "learning_rate": 2.5304932336182335e-05, + "loss": 0.0169, + "step": 67070 + }, + { + "epoch": 0.4958457762928358, + "grad_norm": 0.07186666876077652, + "learning_rate": 2.530122269705603e-05, + "loss": 0.0186, + "step": 67080 + }, + { + "epoch": 0.4959196948641377, + "grad_norm": 0.06850361078977585, + "learning_rate": 2.5297513057929727e-05, + "loss": 0.0194, + "step": 67090 + }, + { + "epoch": 0.4959936134354395, + "grad_norm": 0.09590274095535278, + "learning_rate": 2.529380341880342e-05, + "loss": 0.0198, + "step": 67100 + }, + { + "epoch": 0.49606753200674136, + "grad_norm": 0.094276562333107, + "learning_rate": 2.5290093779677115e-05, + "loss": 0.019, + "step": 67110 + }, + { + "epoch": 
0.4961414505780432, + "grad_norm": 0.11390369385480881, + "learning_rate": 2.5286384140550805e-05, + "loss": 0.0173, + "step": 67120 + }, + { + "epoch": 0.4962153691493451, + "grad_norm": 0.0800207182765007, + "learning_rate": 2.5282674501424504e-05, + "loss": 0.0161, + "step": 67130 + }, + { + "epoch": 0.49628928772064695, + "grad_norm": 0.08522894233465195, + "learning_rate": 2.52789648622982e-05, + "loss": 0.0189, + "step": 67140 + }, + { + "epoch": 0.4963632062919488, + "grad_norm": 0.0826270580291748, + "learning_rate": 2.527525522317189e-05, + "loss": 0.0213, + "step": 67150 + }, + { + "epoch": 0.49643712486325065, + "grad_norm": 0.09851718693971634, + "learning_rate": 2.5271545584045585e-05, + "loss": 0.018, + "step": 67160 + }, + { + "epoch": 0.4965110434345525, + "grad_norm": 0.08281811326742172, + "learning_rate": 2.5267835944919278e-05, + "loss": 0.0179, + "step": 67170 + }, + { + "epoch": 0.49658496200585434, + "grad_norm": 0.05920829251408577, + "learning_rate": 2.5264126305792974e-05, + "loss": 0.0161, + "step": 67180 + }, + { + "epoch": 0.4966588805771562, + "grad_norm": 0.09232311695814133, + "learning_rate": 2.526041666666667e-05, + "loss": 0.018, + "step": 67190 + }, + { + "epoch": 0.4967327991484581, + "grad_norm": 0.05791931599378586, + "learning_rate": 2.525670702754036e-05, + "loss": 0.0182, + "step": 67200 + }, + { + "epoch": 0.4968067177197599, + "grad_norm": 0.08446840941905975, + "learning_rate": 2.5252997388414058e-05, + "loss": 0.018, + "step": 67210 + }, + { + "epoch": 0.4968806362910618, + "grad_norm": 0.09007827937602997, + "learning_rate": 2.5249287749287747e-05, + "loss": 0.0191, + "step": 67220 + }, + { + "epoch": 0.4969545548623636, + "grad_norm": 0.09295041114091873, + "learning_rate": 2.5245578110161443e-05, + "loss": 0.0189, + "step": 67230 + }, + { + "epoch": 0.49702847343366546, + "grad_norm": 0.12847813963890076, + "learning_rate": 2.524186847103514e-05, + "loss": 0.0159, + "step": 67240 + }, + { + "epoch": 0.4971023920049673, + "grad_norm": 0.07954330742359161, + "learning_rate": 2.5238158831908832e-05, + "loss": 0.0182, + "step": 67250 + }, + { + "epoch": 0.4971763105762692, + "grad_norm": 0.0992896780371666, + "learning_rate": 2.5234449192782528e-05, + "loss": 0.0198, + "step": 67260 + }, + { + "epoch": 0.49725022914757105, + "grad_norm": 0.07747029513120651, + "learning_rate": 2.5230739553656224e-05, + "loss": 0.0188, + "step": 67270 + }, + { + "epoch": 0.4973241477188729, + "grad_norm": 0.07811928540468216, + "learning_rate": 2.5227029914529916e-05, + "loss": 0.0141, + "step": 67280 + }, + { + "epoch": 0.49739806629017475, + "grad_norm": 0.06636947393417358, + "learning_rate": 2.5223320275403612e-05, + "loss": 0.0151, + "step": 67290 + }, + { + "epoch": 0.4974719848614766, + "grad_norm": 0.06473857909440994, + "learning_rate": 2.52196106362773e-05, + "loss": 0.0157, + "step": 67300 + }, + { + "epoch": 0.49754590343277844, + "grad_norm": 0.13399842381477356, + "learning_rate": 2.5215900997150998e-05, + "loss": 0.0179, + "step": 67310 + }, + { + "epoch": 0.4976198220040803, + "grad_norm": 0.07125803083181381, + "learning_rate": 2.5212191358024694e-05, + "loss": 0.0177, + "step": 67320 + }, + { + "epoch": 0.4976937405753822, + "grad_norm": 0.06952224671840668, + "learning_rate": 2.5208481718898386e-05, + "loss": 0.0176, + "step": 67330 + }, + { + "epoch": 0.497767659146684, + "grad_norm": 0.09094695746898651, + "learning_rate": 2.5204772079772082e-05, + "loss": 0.0214, + "step": 67340 + }, + { + "epoch": 0.4978415777179859, + "grad_norm": 
0.08997868001461029, + "learning_rate": 2.520106244064577e-05, + "loss": 0.0205, + "step": 67350 + }, + { + "epoch": 0.4979154962892877, + "grad_norm": 0.05725601315498352, + "learning_rate": 2.519735280151947e-05, + "loss": 0.0172, + "step": 67360 + }, + { + "epoch": 0.49798941486058956, + "grad_norm": 0.10619509220123291, + "learning_rate": 2.5193643162393167e-05, + "loss": 0.022, + "step": 67370 + }, + { + "epoch": 0.4980633334318914, + "grad_norm": 0.07303507626056671, + "learning_rate": 2.5189933523266856e-05, + "loss": 0.018, + "step": 67380 + }, + { + "epoch": 0.4981372520031933, + "grad_norm": 0.0775846466422081, + "learning_rate": 2.5186223884140552e-05, + "loss": 0.0177, + "step": 67390 + }, + { + "epoch": 0.49821117057449515, + "grad_norm": 0.07945775240659714, + "learning_rate": 2.5182514245014244e-05, + "loss": 0.0182, + "step": 67400 + }, + { + "epoch": 0.498285089145797, + "grad_norm": 0.07879578322172165, + "learning_rate": 2.517880460588794e-05, + "loss": 0.0179, + "step": 67410 + }, + { + "epoch": 0.49835900771709885, + "grad_norm": 0.09689656645059586, + "learning_rate": 2.5175094966761636e-05, + "loss": 0.0189, + "step": 67420 + }, + { + "epoch": 0.4984329262884007, + "grad_norm": 0.0677686408162117, + "learning_rate": 2.517138532763533e-05, + "loss": 0.0169, + "step": 67430 + }, + { + "epoch": 0.49850684485970254, + "grad_norm": 0.09172812849283218, + "learning_rate": 2.5167675688509025e-05, + "loss": 0.0182, + "step": 67440 + }, + { + "epoch": 0.4985807634310044, + "grad_norm": 0.08993425965309143, + "learning_rate": 2.5163966049382714e-05, + "loss": 0.019, + "step": 67450 + }, + { + "epoch": 0.4986546820023063, + "grad_norm": 0.10504340380430222, + "learning_rate": 2.516025641025641e-05, + "loss": 0.019, + "step": 67460 + }, + { + "epoch": 0.4987286005736081, + "grad_norm": 0.09816096723079681, + "learning_rate": 2.5156546771130106e-05, + "loss": 0.0169, + "step": 67470 + }, + { + "epoch": 0.49880251914491, + "grad_norm": 0.08428593724966049, + "learning_rate": 2.51528371320038e-05, + "loss": 0.0182, + "step": 67480 + }, + { + "epoch": 0.4988764377162118, + "grad_norm": 0.0799790471792221, + "learning_rate": 2.5149127492877494e-05, + "loss": 0.0194, + "step": 67490 + }, + { + "epoch": 0.49895035628751366, + "grad_norm": 0.07538869976997375, + "learning_rate": 2.514541785375119e-05, + "loss": 0.0235, + "step": 67500 + }, + { + "epoch": 0.4990242748588155, + "grad_norm": 0.07655777782201767, + "learning_rate": 2.5141708214624883e-05, + "loss": 0.018, + "step": 67510 + }, + { + "epoch": 0.4990981934301174, + "grad_norm": 0.08150215446949005, + "learning_rate": 2.513799857549858e-05, + "loss": 0.0178, + "step": 67520 + }, + { + "epoch": 0.49917211200141925, + "grad_norm": 0.08197018504142761, + "learning_rate": 2.5134288936372268e-05, + "loss": 0.0193, + "step": 67530 + }, + { + "epoch": 0.4992460305727211, + "grad_norm": 0.09561475366353989, + "learning_rate": 2.5130579297245964e-05, + "loss": 0.0163, + "step": 67540 + }, + { + "epoch": 0.49931994914402295, + "grad_norm": 0.06492964178323746, + "learning_rate": 2.512686965811966e-05, + "loss": 0.0156, + "step": 67550 + }, + { + "epoch": 0.4993938677153248, + "grad_norm": 0.06733527779579163, + "learning_rate": 2.5123160018993353e-05, + "loss": 0.0188, + "step": 67560 + }, + { + "epoch": 0.49946778628662664, + "grad_norm": 0.055312179028987885, + "learning_rate": 2.511945037986705e-05, + "loss": 0.0173, + "step": 67570 + }, + { + "epoch": 0.4995417048579285, + "grad_norm": 0.096860371530056, + "learning_rate": 
2.511574074074074e-05, + "loss": 0.0179, + "step": 67580 + }, + { + "epoch": 0.4996156234292304, + "grad_norm": 0.06684661656618118, + "learning_rate": 2.5112031101614437e-05, + "loss": 0.0195, + "step": 67590 + }, + { + "epoch": 0.4996895420005322, + "grad_norm": 0.10767112672328949, + "learning_rate": 2.5108321462488133e-05, + "loss": 0.0173, + "step": 67600 + }, + { + "epoch": 0.4997634605718341, + "grad_norm": 0.07578767836093903, + "learning_rate": 2.5104611823361822e-05, + "loss": 0.0161, + "step": 67610 + }, + { + "epoch": 0.4998373791431359, + "grad_norm": 0.07601311802864075, + "learning_rate": 2.510090218423552e-05, + "loss": 0.0188, + "step": 67620 + }, + { + "epoch": 0.49991129771443776, + "grad_norm": 0.07916391640901566, + "learning_rate": 2.509719254510921e-05, + "loss": 0.0206, + "step": 67630 + }, + { + "epoch": 0.4999852162857396, + "grad_norm": 0.09040073305368423, + "learning_rate": 2.5093482905982907e-05, + "loss": 0.018, + "step": 67640 + }, + { + "epoch": 0.5000591348570415, + "grad_norm": 0.05715951323509216, + "learning_rate": 2.5089773266856603e-05, + "loss": 0.0172, + "step": 67650 + }, + { + "epoch": 0.5001330534283434, + "grad_norm": 0.05888355150818825, + "learning_rate": 2.5086063627730295e-05, + "loss": 0.0167, + "step": 67660 + }, + { + "epoch": 0.5002069719996451, + "grad_norm": 0.07443065196275711, + "learning_rate": 2.508235398860399e-05, + "loss": 0.0175, + "step": 67670 + }, + { + "epoch": 0.500280890570947, + "grad_norm": 0.05773517116904259, + "learning_rate": 2.507864434947768e-05, + "loss": 0.0157, + "step": 67680 + }, + { + "epoch": 0.500354809142249, + "grad_norm": 0.08666082471609116, + "learning_rate": 2.5074934710351377e-05, + "loss": 0.0182, + "step": 67690 + }, + { + "epoch": 0.5004287277135507, + "grad_norm": 0.0956631749868393, + "learning_rate": 2.5071225071225073e-05, + "loss": 0.0195, + "step": 67700 + }, + { + "epoch": 0.5005026462848526, + "grad_norm": 0.07875441014766693, + "learning_rate": 2.5067515432098765e-05, + "loss": 0.0169, + "step": 67710 + }, + { + "epoch": 0.5005765648561544, + "grad_norm": 0.07937600463628769, + "learning_rate": 2.506380579297246e-05, + "loss": 0.0195, + "step": 67720 + }, + { + "epoch": 0.5006504834274563, + "grad_norm": 0.0602254644036293, + "learning_rate": 2.5060096153846157e-05, + "loss": 0.0159, + "step": 67730 + }, + { + "epoch": 0.5007244019987581, + "grad_norm": 0.10217604786157608, + "learning_rate": 2.505638651471985e-05, + "loss": 0.0178, + "step": 67740 + }, + { + "epoch": 0.50079832057006, + "grad_norm": 0.08687470853328705, + "learning_rate": 2.5052676875593546e-05, + "loss": 0.0191, + "step": 67750 + }, + { + "epoch": 0.5008722391413619, + "grad_norm": 0.0878107100725174, + "learning_rate": 2.5048967236467235e-05, + "loss": 0.0184, + "step": 67760 + }, + { + "epoch": 0.5009461577126637, + "grad_norm": 0.0794096440076828, + "learning_rate": 2.504525759734093e-05, + "loss": 0.0172, + "step": 67770 + }, + { + "epoch": 0.5010200762839656, + "grad_norm": 0.0839376449584961, + "learning_rate": 2.504154795821463e-05, + "loss": 0.0158, + "step": 67780 + }, + { + "epoch": 0.5010939948552674, + "grad_norm": 0.09538474678993225, + "learning_rate": 2.503783831908832e-05, + "loss": 0.0222, + "step": 67790 + }, + { + "epoch": 0.5011679134265693, + "grad_norm": 0.08887049555778503, + "learning_rate": 2.5034128679962015e-05, + "loss": 0.0189, + "step": 67800 + }, + { + "epoch": 0.5012418319978711, + "grad_norm": 0.12286271154880524, + "learning_rate": 2.5030419040835708e-05, + "loss": 0.0162, + "step": 
67810 + }, + { + "epoch": 0.501315750569173, + "grad_norm": 0.0786716490983963, + "learning_rate": 2.5026709401709404e-05, + "loss": 0.0179, + "step": 67820 + }, + { + "epoch": 0.5013896691404749, + "grad_norm": 0.08333033323287964, + "learning_rate": 2.50229997625831e-05, + "loss": 0.0183, + "step": 67830 + }, + { + "epoch": 0.5014635877117767, + "grad_norm": 0.06486833095550537, + "learning_rate": 2.501929012345679e-05, + "loss": 0.0157, + "step": 67840 + }, + { + "epoch": 0.5015375062830786, + "grad_norm": 0.07939372211694717, + "learning_rate": 2.5015580484330485e-05, + "loss": 0.0196, + "step": 67850 + }, + { + "epoch": 0.5016114248543804, + "grad_norm": 0.08343151211738586, + "learning_rate": 2.5011870845204177e-05, + "loss": 0.0168, + "step": 67860 + }, + { + "epoch": 0.5016853434256823, + "grad_norm": 0.0886319950222969, + "learning_rate": 2.5008161206077873e-05, + "loss": 0.0181, + "step": 67870 + }, + { + "epoch": 0.5017592619969842, + "grad_norm": 0.07745710760354996, + "learning_rate": 2.500445156695157e-05, + "loss": 0.0174, + "step": 67880 + }, + { + "epoch": 0.501833180568286, + "grad_norm": 0.10162079334259033, + "learning_rate": 2.5000741927825262e-05, + "loss": 0.0184, + "step": 67890 + }, + { + "epoch": 0.5019070991395879, + "grad_norm": 0.06939810514450073, + "learning_rate": 2.4997032288698958e-05, + "loss": 0.0191, + "step": 67900 + }, + { + "epoch": 0.5019810177108897, + "grad_norm": 0.09388607740402222, + "learning_rate": 2.499332264957265e-05, + "loss": 0.0212, + "step": 67910 + }, + { + "epoch": 0.5020549362821916, + "grad_norm": 0.06481907516717911, + "learning_rate": 2.4989613010446343e-05, + "loss": 0.0169, + "step": 67920 + }, + { + "epoch": 0.5021288548534933, + "grad_norm": 0.05753227323293686, + "learning_rate": 2.498590337132004e-05, + "loss": 0.0194, + "step": 67930 + }, + { + "epoch": 0.5022027734247952, + "grad_norm": 0.07289828360080719, + "learning_rate": 2.4982193732193735e-05, + "loss": 0.0179, + "step": 67940 + }, + { + "epoch": 0.5022766919960971, + "grad_norm": 0.08217739313840866, + "learning_rate": 2.4978484093067428e-05, + "loss": 0.0171, + "step": 67950 + }, + { + "epoch": 0.5023506105673989, + "grad_norm": 0.07248316705226898, + "learning_rate": 2.497477445394112e-05, + "loss": 0.0168, + "step": 67960 + }, + { + "epoch": 0.5024245291387008, + "grad_norm": 0.07131918519735336, + "learning_rate": 2.4971064814814816e-05, + "loss": 0.0181, + "step": 67970 + }, + { + "epoch": 0.5024984477100026, + "grad_norm": 0.08347027003765106, + "learning_rate": 2.4967355175688512e-05, + "loss": 0.0149, + "step": 67980 + }, + { + "epoch": 0.5025723662813045, + "grad_norm": 0.08220400661230087, + "learning_rate": 2.4963645536562205e-05, + "loss": 0.0185, + "step": 67990 + }, + { + "epoch": 0.5026462848526063, + "grad_norm": 0.092999666929245, + "learning_rate": 2.4959935897435897e-05, + "loss": 0.0197, + "step": 68000 + }, + { + "epoch": 0.5027202034239082, + "grad_norm": 0.0755041241645813, + "learning_rate": 2.4956226258309593e-05, + "loss": 0.0196, + "step": 68010 + }, + { + "epoch": 0.5027941219952101, + "grad_norm": 0.07427439838647842, + "learning_rate": 2.4952516619183286e-05, + "loss": 0.0169, + "step": 68020 + }, + { + "epoch": 0.5028680405665119, + "grad_norm": 0.08319801092147827, + "learning_rate": 2.4948806980056982e-05, + "loss": 0.0162, + "step": 68030 + }, + { + "epoch": 0.5029419591378138, + "grad_norm": 0.1142725721001625, + "learning_rate": 2.4945097340930674e-05, + "loss": 0.0151, + "step": 68040 + }, + { + "epoch": 0.5030158777091156, + 
"grad_norm": 0.0719955712556839, + "learning_rate": 2.494138770180437e-05, + "loss": 0.0169, + "step": 68050 + }, + { + "epoch": 0.5030897962804175, + "grad_norm": 0.07777555286884308, + "learning_rate": 2.4937678062678063e-05, + "loss": 0.0193, + "step": 68060 + }, + { + "epoch": 0.5031637148517194, + "grad_norm": 0.09181669354438782, + "learning_rate": 2.4933968423551756e-05, + "loss": 0.0168, + "step": 68070 + }, + { + "epoch": 0.5032376334230212, + "grad_norm": 0.06580276042222977, + "learning_rate": 2.493025878442545e-05, + "loss": 0.0184, + "step": 68080 + }, + { + "epoch": 0.5033115519943231, + "grad_norm": 0.07432732731103897, + "learning_rate": 2.4926549145299147e-05, + "loss": 0.0161, + "step": 68090 + }, + { + "epoch": 0.5033854705656249, + "grad_norm": 0.07752696424722672, + "learning_rate": 2.492283950617284e-05, + "loss": 0.017, + "step": 68100 + }, + { + "epoch": 0.5034593891369268, + "grad_norm": 0.07764051109552383, + "learning_rate": 2.4919129867046533e-05, + "loss": 0.0162, + "step": 68110 + }, + { + "epoch": 0.5035333077082286, + "grad_norm": 0.09618353098630905, + "learning_rate": 2.4915420227920232e-05, + "loss": 0.0182, + "step": 68120 + }, + { + "epoch": 0.5036072262795305, + "grad_norm": 0.07791148126125336, + "learning_rate": 2.4911710588793925e-05, + "loss": 0.0187, + "step": 68130 + }, + { + "epoch": 0.5036811448508324, + "grad_norm": 0.09849604964256287, + "learning_rate": 2.4908000949667617e-05, + "loss": 0.0176, + "step": 68140 + }, + { + "epoch": 0.5037550634221342, + "grad_norm": 0.1010863184928894, + "learning_rate": 2.490429131054131e-05, + "loss": 0.0206, + "step": 68150 + }, + { + "epoch": 0.5038289819934361, + "grad_norm": 0.07524994015693665, + "learning_rate": 2.4900581671415006e-05, + "loss": 0.017, + "step": 68160 + }, + { + "epoch": 0.5039029005647379, + "grad_norm": 0.0941774845123291, + "learning_rate": 2.48968720322887e-05, + "loss": 0.0173, + "step": 68170 + }, + { + "epoch": 0.5039768191360398, + "grad_norm": 0.10298559069633484, + "learning_rate": 2.4893162393162394e-05, + "loss": 0.0204, + "step": 68180 + }, + { + "epoch": 0.5040507377073415, + "grad_norm": 0.08465954661369324, + "learning_rate": 2.4889452754036087e-05, + "loss": 0.0192, + "step": 68190 + }, + { + "epoch": 0.5041246562786434, + "grad_norm": 0.10369189828634262, + "learning_rate": 2.4885743114909783e-05, + "loss": 0.0203, + "step": 68200 + }, + { + "epoch": 0.5041985748499453, + "grad_norm": 0.05515586957335472, + "learning_rate": 2.488203347578348e-05, + "loss": 0.0186, + "step": 68210 + }, + { + "epoch": 0.5042724934212471, + "grad_norm": 0.0919659435749054, + "learning_rate": 2.487832383665717e-05, + "loss": 0.0164, + "step": 68220 + }, + { + "epoch": 0.504346411992549, + "grad_norm": 0.10131487995386124, + "learning_rate": 2.4874614197530864e-05, + "loss": 0.0175, + "step": 68230 + }, + { + "epoch": 0.5044203305638508, + "grad_norm": 0.10229624807834625, + "learning_rate": 2.487090455840456e-05, + "loss": 0.0195, + "step": 68240 + }, + { + "epoch": 0.5044942491351527, + "grad_norm": 0.08007878810167313, + "learning_rate": 2.4867194919278252e-05, + "loss": 0.0202, + "step": 68250 + }, + { + "epoch": 0.5045681677064545, + "grad_norm": 0.07810080796480179, + "learning_rate": 2.486348528015195e-05, + "loss": 0.018, + "step": 68260 + }, + { + "epoch": 0.5046420862777564, + "grad_norm": 0.09035582840442657, + "learning_rate": 2.4859775641025644e-05, + "loss": 0.0174, + "step": 68270 + }, + { + "epoch": 0.5047160048490583, + "grad_norm": 0.09939395636320114, + "learning_rate": 
2.4856066001899337e-05, + "loss": 0.0157, + "step": 68280 + }, + { + "epoch": 0.5047899234203601, + "grad_norm": 0.06271559000015259, + "learning_rate": 2.485235636277303e-05, + "loss": 0.0194, + "step": 68290 + }, + { + "epoch": 0.504863841991662, + "grad_norm": 0.05262134224176407, + "learning_rate": 2.4848646723646722e-05, + "loss": 0.0168, + "step": 68300 + }, + { + "epoch": 0.5049377605629638, + "grad_norm": 0.07836416363716125, + "learning_rate": 2.484493708452042e-05, + "loss": 0.0181, + "step": 68310 + }, + { + "epoch": 0.5050116791342657, + "grad_norm": 0.08511856943368912, + "learning_rate": 2.4841227445394114e-05, + "loss": 0.0167, + "step": 68320 + }, + { + "epoch": 0.5050855977055676, + "grad_norm": 0.1118873730301857, + "learning_rate": 2.4837517806267807e-05, + "loss": 0.02, + "step": 68330 + }, + { + "epoch": 0.5051595162768694, + "grad_norm": 0.06002812087535858, + "learning_rate": 2.48338081671415e-05, + "loss": 0.0156, + "step": 68340 + }, + { + "epoch": 0.5052334348481713, + "grad_norm": 0.07912244647741318, + "learning_rate": 2.48300985280152e-05, + "loss": 0.0178, + "step": 68350 + }, + { + "epoch": 0.5053073534194731, + "grad_norm": 0.07713264971971512, + "learning_rate": 2.482638888888889e-05, + "loss": 0.0176, + "step": 68360 + }, + { + "epoch": 0.505381271990775, + "grad_norm": 0.08661041408777237, + "learning_rate": 2.4822679249762584e-05, + "loss": 0.0157, + "step": 68370 + }, + { + "epoch": 0.5054551905620768, + "grad_norm": 0.07685781270265579, + "learning_rate": 2.4818969610636276e-05, + "loss": 0.0185, + "step": 68380 + }, + { + "epoch": 0.5055291091333787, + "grad_norm": 0.06656645238399506, + "learning_rate": 2.4815259971509972e-05, + "loss": 0.0198, + "step": 68390 + }, + { + "epoch": 0.5056030277046806, + "grad_norm": 0.06481455266475677, + "learning_rate": 2.4811550332383668e-05, + "loss": 0.0166, + "step": 68400 + }, + { + "epoch": 0.5056769462759824, + "grad_norm": 0.10537232458591461, + "learning_rate": 2.480784069325736e-05, + "loss": 0.0204, + "step": 68410 + }, + { + "epoch": 0.5057508648472843, + "grad_norm": 0.08557964116334915, + "learning_rate": 2.4804131054131057e-05, + "loss": 0.0187, + "step": 68420 + }, + { + "epoch": 0.505824783418586, + "grad_norm": 0.08855307102203369, + "learning_rate": 2.480042141500475e-05, + "loss": 0.0172, + "step": 68430 + }, + { + "epoch": 0.505898701989888, + "grad_norm": 0.07538998126983643, + "learning_rate": 2.4796711775878445e-05, + "loss": 0.0177, + "step": 68440 + }, + { + "epoch": 0.5059726205611897, + "grad_norm": 0.07634703814983368, + "learning_rate": 2.4793002136752138e-05, + "loss": 0.0205, + "step": 68450 + }, + { + "epoch": 0.5060465391324916, + "grad_norm": 0.08731398731470108, + "learning_rate": 2.4789292497625834e-05, + "loss": 0.0167, + "step": 68460 + }, + { + "epoch": 0.5061204577037935, + "grad_norm": 0.09150371700525284, + "learning_rate": 2.4785582858499526e-05, + "loss": 0.0184, + "step": 68470 + }, + { + "epoch": 0.5061943762750953, + "grad_norm": 0.07076684385538101, + "learning_rate": 2.478187321937322e-05, + "loss": 0.0154, + "step": 68480 + }, + { + "epoch": 0.5062682948463972, + "grad_norm": 0.07761862128973007, + "learning_rate": 2.4778163580246915e-05, + "loss": 0.0177, + "step": 68490 + }, + { + "epoch": 0.506342213417699, + "grad_norm": 0.10504137724637985, + "learning_rate": 2.477445394112061e-05, + "loss": 0.0206, + "step": 68500 + }, + { + "epoch": 0.5064161319890009, + "grad_norm": 0.08980455249547958, + "learning_rate": 2.4770744301994304e-05, + "loss": 0.0176, + "step": 
68510 + }, + { + "epoch": 0.5064900505603027, + "grad_norm": 0.09038835763931274, + "learning_rate": 2.4767034662867996e-05, + "loss": 0.0198, + "step": 68520 + }, + { + "epoch": 0.5065639691316046, + "grad_norm": 0.08619700372219086, + "learning_rate": 2.476332502374169e-05, + "loss": 0.0187, + "step": 68530 + }, + { + "epoch": 0.5066378877029065, + "grad_norm": 0.0925818458199501, + "learning_rate": 2.4759615384615388e-05, + "loss": 0.0177, + "step": 68540 + }, + { + "epoch": 0.5067118062742083, + "grad_norm": 0.07353539019823074, + "learning_rate": 2.475590574548908e-05, + "loss": 0.0186, + "step": 68550 + }, + { + "epoch": 0.5067857248455102, + "grad_norm": 0.09178286045789719, + "learning_rate": 2.4752196106362773e-05, + "loss": 0.0226, + "step": 68560 + }, + { + "epoch": 0.506859643416812, + "grad_norm": 0.08274193108081818, + "learning_rate": 2.4748486467236466e-05, + "loss": 0.0154, + "step": 68570 + }, + { + "epoch": 0.5069335619881139, + "grad_norm": 0.09752582758665085, + "learning_rate": 2.4744776828110165e-05, + "loss": 0.0205, + "step": 68580 + }, + { + "epoch": 0.5070074805594158, + "grad_norm": 0.06460082530975342, + "learning_rate": 2.4741067188983858e-05, + "loss": 0.0171, + "step": 68590 + }, + { + "epoch": 0.5070813991307176, + "grad_norm": 0.09168283641338348, + "learning_rate": 2.473735754985755e-05, + "loss": 0.0209, + "step": 68600 + }, + { + "epoch": 0.5071553177020195, + "grad_norm": 0.09616568684577942, + "learning_rate": 2.4733647910731246e-05, + "loss": 0.0194, + "step": 68610 + }, + { + "epoch": 0.5072292362733213, + "grad_norm": 0.090309739112854, + "learning_rate": 2.472993827160494e-05, + "loss": 0.0178, + "step": 68620 + }, + { + "epoch": 0.5073031548446232, + "grad_norm": 0.07238759845495224, + "learning_rate": 2.4726228632478635e-05, + "loss": 0.0195, + "step": 68630 + }, + { + "epoch": 0.507377073415925, + "grad_norm": 0.07042551040649414, + "learning_rate": 2.4722518993352327e-05, + "loss": 0.0191, + "step": 68640 + }, + { + "epoch": 0.5074509919872269, + "grad_norm": 0.09275923669338226, + "learning_rate": 2.4718809354226023e-05, + "loss": 0.0169, + "step": 68650 + }, + { + "epoch": 0.5075249105585288, + "grad_norm": 0.09293848276138306, + "learning_rate": 2.4715099715099716e-05, + "loss": 0.0169, + "step": 68660 + }, + { + "epoch": 0.5075988291298306, + "grad_norm": 0.09748068451881409, + "learning_rate": 2.4711390075973412e-05, + "loss": 0.0167, + "step": 68670 + }, + { + "epoch": 0.5076727477011325, + "grad_norm": 0.07259243726730347, + "learning_rate": 2.4707680436847104e-05, + "loss": 0.0202, + "step": 68680 + }, + { + "epoch": 0.5077466662724343, + "grad_norm": 0.09704194217920303, + "learning_rate": 2.47039707977208e-05, + "loss": 0.0181, + "step": 68690 + }, + { + "epoch": 0.5078205848437362, + "grad_norm": 0.09361874312162399, + "learning_rate": 2.4700261158594493e-05, + "loss": 0.0184, + "step": 68700 + }, + { + "epoch": 0.507894503415038, + "grad_norm": 0.07901400327682495, + "learning_rate": 2.4696551519468186e-05, + "loss": 0.0179, + "step": 68710 + }, + { + "epoch": 0.5079684219863398, + "grad_norm": 0.10706298798322678, + "learning_rate": 2.469284188034188e-05, + "loss": 0.0187, + "step": 68720 + }, + { + "epoch": 0.5080423405576417, + "grad_norm": 0.07765945047140121, + "learning_rate": 2.4689132241215578e-05, + "loss": 0.0174, + "step": 68730 + }, + { + "epoch": 0.5081162591289435, + "grad_norm": 0.06914977729320526, + "learning_rate": 2.468542260208927e-05, + "loss": 0.0168, + "step": 68740 + }, + { + "epoch": 0.5081901777002454, + 
"grad_norm": 0.1388658732175827, + "learning_rate": 2.4681712962962963e-05, + "loss": 0.0191, + "step": 68750 + }, + { + "epoch": 0.5082640962715472, + "grad_norm": 0.06330770254135132, + "learning_rate": 2.467800332383666e-05, + "loss": 0.0163, + "step": 68760 + }, + { + "epoch": 0.5083380148428491, + "grad_norm": 0.08374302834272385, + "learning_rate": 2.4674293684710355e-05, + "loss": 0.0164, + "step": 68770 + }, + { + "epoch": 0.5084119334141509, + "grad_norm": 0.09005650132894516, + "learning_rate": 2.4670584045584047e-05, + "loss": 0.0172, + "step": 68780 + }, + { + "epoch": 0.5084858519854528, + "grad_norm": 0.08126549422740936, + "learning_rate": 2.466687440645774e-05, + "loss": 0.0193, + "step": 68790 + }, + { + "epoch": 0.5085597705567547, + "grad_norm": 0.0886392742395401, + "learning_rate": 2.4663164767331436e-05, + "loss": 0.0177, + "step": 68800 + }, + { + "epoch": 0.5086336891280565, + "grad_norm": 0.09654468297958374, + "learning_rate": 2.4659455128205132e-05, + "loss": 0.0198, + "step": 68810 + }, + { + "epoch": 0.5087076076993584, + "grad_norm": 0.09124394506216049, + "learning_rate": 2.4655745489078824e-05, + "loss": 0.0177, + "step": 68820 + }, + { + "epoch": 0.5087815262706602, + "grad_norm": 0.07470245659351349, + "learning_rate": 2.4652035849952517e-05, + "loss": 0.0168, + "step": 68830 + }, + { + "epoch": 0.5088554448419621, + "grad_norm": 0.06541818380355835, + "learning_rate": 2.4648326210826213e-05, + "loss": 0.0172, + "step": 68840 + }, + { + "epoch": 0.508929363413264, + "grad_norm": 0.06566719710826874, + "learning_rate": 2.4644616571699905e-05, + "loss": 0.0188, + "step": 68850 + }, + { + "epoch": 0.5090032819845658, + "grad_norm": 0.08970692753791809, + "learning_rate": 2.46409069325736e-05, + "loss": 0.0173, + "step": 68860 + }, + { + "epoch": 0.5090772005558677, + "grad_norm": 0.08380686491727829, + "learning_rate": 2.4637197293447294e-05, + "loss": 0.0174, + "step": 68870 + }, + { + "epoch": 0.5091511191271695, + "grad_norm": 0.0961533933877945, + "learning_rate": 2.463348765432099e-05, + "loss": 0.019, + "step": 68880 + }, + { + "epoch": 0.5092250376984714, + "grad_norm": 0.07376634329557419, + "learning_rate": 2.4629778015194683e-05, + "loss": 0.0175, + "step": 68890 + }, + { + "epoch": 0.5092989562697732, + "grad_norm": 0.08185489475727081, + "learning_rate": 2.462606837606838e-05, + "loss": 0.0181, + "step": 68900 + }, + { + "epoch": 0.5093728748410751, + "grad_norm": 0.11931279301643372, + "learning_rate": 2.462235873694207e-05, + "loss": 0.0192, + "step": 68910 + }, + { + "epoch": 0.509446793412377, + "grad_norm": 0.09398669004440308, + "learning_rate": 2.4618649097815767e-05, + "loss": 0.0184, + "step": 68920 + }, + { + "epoch": 0.5095207119836788, + "grad_norm": 0.08356818556785583, + "learning_rate": 2.461493945868946e-05, + "loss": 0.0189, + "step": 68930 + }, + { + "epoch": 0.5095946305549807, + "grad_norm": 0.06756184250116348, + "learning_rate": 2.4611229819563152e-05, + "loss": 0.0179, + "step": 68940 + }, + { + "epoch": 0.5096685491262825, + "grad_norm": 0.048982974141836166, + "learning_rate": 2.4607520180436848e-05, + "loss": 0.0182, + "step": 68950 + }, + { + "epoch": 0.5097424676975844, + "grad_norm": 0.06728595495223999, + "learning_rate": 2.4603810541310544e-05, + "loss": 0.0163, + "step": 68960 + }, + { + "epoch": 0.5098163862688861, + "grad_norm": 0.09846184402704239, + "learning_rate": 2.4600100902184237e-05, + "loss": 0.0166, + "step": 68970 + }, + { + "epoch": 0.509890304840188, + "grad_norm": 0.0789187103509903, + 
"learning_rate": 2.459639126305793e-05, + "loss": 0.0183, + "step": 68980 + }, + { + "epoch": 0.50996422341149, + "grad_norm": 0.06714697182178497, + "learning_rate": 2.4592681623931625e-05, + "loss": 0.0185, + "step": 68990 + }, + { + "epoch": 0.5100381419827917, + "grad_norm": 0.06789720058441162, + "learning_rate": 2.458897198480532e-05, + "loss": 0.0162, + "step": 69000 + }, + { + "epoch": 0.5101120605540936, + "grad_norm": 0.12073642015457153, + "learning_rate": 2.4585262345679014e-05, + "loss": 0.018, + "step": 69010 + }, + { + "epoch": 0.5101859791253954, + "grad_norm": 0.07985580712556839, + "learning_rate": 2.4581552706552706e-05, + "loss": 0.017, + "step": 69020 + }, + { + "epoch": 0.5102598976966973, + "grad_norm": 0.0621945783495903, + "learning_rate": 2.4577843067426402e-05, + "loss": 0.0187, + "step": 69030 + }, + { + "epoch": 0.5103338162679991, + "grad_norm": 0.05953545123338699, + "learning_rate": 2.45741334283001e-05, + "loss": 0.017, + "step": 69040 + }, + { + "epoch": 0.510407734839301, + "grad_norm": 0.08608721941709518, + "learning_rate": 2.457042378917379e-05, + "loss": 0.0192, + "step": 69050 + }, + { + "epoch": 0.5104816534106029, + "grad_norm": 0.08489928394556046, + "learning_rate": 2.4566714150047483e-05, + "loss": 0.0173, + "step": 69060 + }, + { + "epoch": 0.5105555719819047, + "grad_norm": 0.06985501945018768, + "learning_rate": 2.456300451092118e-05, + "loss": 0.0179, + "step": 69070 + }, + { + "epoch": 0.5106294905532066, + "grad_norm": 0.07255925983190536, + "learning_rate": 2.4559294871794872e-05, + "loss": 0.0172, + "step": 69080 + }, + { + "epoch": 0.5107034091245084, + "grad_norm": 0.09363655745983124, + "learning_rate": 2.4555585232668568e-05, + "loss": 0.0187, + "step": 69090 + }, + { + "epoch": 0.5107773276958103, + "grad_norm": 0.07645261287689209, + "learning_rate": 2.455187559354226e-05, + "loss": 0.017, + "step": 69100 + }, + { + "epoch": 0.5108512462671122, + "grad_norm": 0.07369133830070496, + "learning_rate": 2.4548165954415957e-05, + "loss": 0.0186, + "step": 69110 + }, + { + "epoch": 0.510925164838414, + "grad_norm": 0.07639345526695251, + "learning_rate": 2.454445631528965e-05, + "loss": 0.0164, + "step": 69120 + }, + { + "epoch": 0.5109990834097159, + "grad_norm": 0.05209139734506607, + "learning_rate": 2.4540746676163345e-05, + "loss": 0.0172, + "step": 69130 + }, + { + "epoch": 0.5110730019810177, + "grad_norm": 0.09408107399940491, + "learning_rate": 2.4537037037037038e-05, + "loss": 0.0195, + "step": 69140 + }, + { + "epoch": 0.5111469205523196, + "grad_norm": 0.07027251273393631, + "learning_rate": 2.4533327397910734e-05, + "loss": 0.0182, + "step": 69150 + }, + { + "epoch": 0.5112208391236214, + "grad_norm": 0.10147867351770401, + "learning_rate": 2.4529617758784426e-05, + "loss": 0.0204, + "step": 69160 + }, + { + "epoch": 0.5112947576949233, + "grad_norm": 0.07984784245491028, + "learning_rate": 2.452590811965812e-05, + "loss": 0.0187, + "step": 69170 + }, + { + "epoch": 0.5113686762662252, + "grad_norm": 0.07958944141864777, + "learning_rate": 2.4522198480531815e-05, + "loss": 0.017, + "step": 69180 + }, + { + "epoch": 0.511442594837527, + "grad_norm": 0.039841070771217346, + "learning_rate": 2.451848884140551e-05, + "loss": 0.0186, + "step": 69190 + }, + { + "epoch": 0.5115165134088289, + "grad_norm": 0.07965556532144547, + "learning_rate": 2.4514779202279203e-05, + "loss": 0.0172, + "step": 69200 + }, + { + "epoch": 0.5115904319801307, + "grad_norm": 0.0786585807800293, + "learning_rate": 2.4511069563152896e-05, + "loss": 
0.0184, + "step": 69210 + }, + { + "epoch": 0.5116643505514326, + "grad_norm": 0.06293124705553055, + "learning_rate": 2.4507359924026592e-05, + "loss": 0.0168, + "step": 69220 + }, + { + "epoch": 0.5117382691227343, + "grad_norm": 0.12131396681070328, + "learning_rate": 2.4503650284900288e-05, + "loss": 0.0193, + "step": 69230 + }, + { + "epoch": 0.5118121876940362, + "grad_norm": 0.07918912917375565, + "learning_rate": 2.449994064577398e-05, + "loss": 0.0201, + "step": 69240 + }, + { + "epoch": 0.5118861062653381, + "grad_norm": 0.08990711718797684, + "learning_rate": 2.4496231006647673e-05, + "loss": 0.0225, + "step": 69250 + }, + { + "epoch": 0.5119600248366399, + "grad_norm": 0.043771080672740936, + "learning_rate": 2.449252136752137e-05, + "loss": 0.0178, + "step": 69260 + }, + { + "epoch": 0.5120339434079418, + "grad_norm": 0.12659858167171478, + "learning_rate": 2.4488811728395065e-05, + "loss": 0.0198, + "step": 69270 + }, + { + "epoch": 0.5121078619792436, + "grad_norm": 0.07652303576469421, + "learning_rate": 2.4485102089268757e-05, + "loss": 0.0173, + "step": 69280 + }, + { + "epoch": 0.5121817805505455, + "grad_norm": 0.07906925678253174, + "learning_rate": 2.448139245014245e-05, + "loss": 0.0191, + "step": 69290 + }, + { + "epoch": 0.5122556991218473, + "grad_norm": 0.06692638993263245, + "learning_rate": 2.4477682811016146e-05, + "loss": 0.02, + "step": 69300 + }, + { + "epoch": 0.5123296176931492, + "grad_norm": 0.07715941965579987, + "learning_rate": 2.447397317188984e-05, + "loss": 0.0184, + "step": 69310 + }, + { + "epoch": 0.5124035362644511, + "grad_norm": 0.06227978691458702, + "learning_rate": 2.4470263532763535e-05, + "loss": 0.0158, + "step": 69320 + }, + { + "epoch": 0.5124774548357529, + "grad_norm": 0.0803263857960701, + "learning_rate": 2.4466553893637227e-05, + "loss": 0.0166, + "step": 69330 + }, + { + "epoch": 0.5125513734070548, + "grad_norm": 0.07377856224775314, + "learning_rate": 2.4462844254510923e-05, + "loss": 0.0162, + "step": 69340 + }, + { + "epoch": 0.5126252919783566, + "grad_norm": 0.08652761578559875, + "learning_rate": 2.4459134615384616e-05, + "loss": 0.0204, + "step": 69350 + }, + { + "epoch": 0.5126992105496585, + "grad_norm": 0.056324318051338196, + "learning_rate": 2.445542497625831e-05, + "loss": 0.0158, + "step": 69360 + }, + { + "epoch": 0.5127731291209604, + "grad_norm": 0.07542183995246887, + "learning_rate": 2.4451715337132004e-05, + "loss": 0.0153, + "step": 69370 + }, + { + "epoch": 0.5128470476922622, + "grad_norm": 0.06313826143741608, + "learning_rate": 2.44480056980057e-05, + "loss": 0.0172, + "step": 69380 + }, + { + "epoch": 0.5129209662635641, + "grad_norm": 0.10730694979429245, + "learning_rate": 2.4444296058879393e-05, + "loss": 0.0202, + "step": 69390 + }, + { + "epoch": 0.5129948848348659, + "grad_norm": 0.0748748779296875, + "learning_rate": 2.4440586419753085e-05, + "loss": 0.0175, + "step": 69400 + }, + { + "epoch": 0.5130688034061678, + "grad_norm": 0.07341676205396652, + "learning_rate": 2.443687678062678e-05, + "loss": 0.017, + "step": 69410 + }, + { + "epoch": 0.5131427219774696, + "grad_norm": 0.14897748827934265, + "learning_rate": 2.4433167141500477e-05, + "loss": 0.0171, + "step": 69420 + }, + { + "epoch": 0.5132166405487715, + "grad_norm": 0.08443755656480789, + "learning_rate": 2.442945750237417e-05, + "loss": 0.0195, + "step": 69430 + }, + { + "epoch": 0.5132905591200734, + "grad_norm": 0.0830007791519165, + "learning_rate": 2.4425747863247862e-05, + "loss": 0.0188, + "step": 69440 + }, + { + "epoch": 
0.5133644776913752, + "grad_norm": 0.08024938404560089, + "learning_rate": 2.442203822412156e-05, + "loss": 0.0189, + "step": 69450 + }, + { + "epoch": 0.5134383962626771, + "grad_norm": 0.07932229340076447, + "learning_rate": 2.4418328584995254e-05, + "loss": 0.0164, + "step": 69460 + }, + { + "epoch": 0.5135123148339789, + "grad_norm": 0.1015755757689476, + "learning_rate": 2.4414618945868947e-05, + "loss": 0.0162, + "step": 69470 + }, + { + "epoch": 0.5135862334052808, + "grad_norm": 0.08583158254623413, + "learning_rate": 2.441090930674264e-05, + "loss": 0.0154, + "step": 69480 + }, + { + "epoch": 0.5136601519765825, + "grad_norm": 0.08639636635780334, + "learning_rate": 2.4407199667616336e-05, + "loss": 0.018, + "step": 69490 + }, + { + "epoch": 0.5137340705478844, + "grad_norm": 0.07380050420761108, + "learning_rate": 2.440349002849003e-05, + "loss": 0.0203, + "step": 69500 + }, + { + "epoch": 0.5138079891191863, + "grad_norm": 0.0639515370130539, + "learning_rate": 2.4399780389363724e-05, + "loss": 0.017, + "step": 69510 + }, + { + "epoch": 0.5138819076904881, + "grad_norm": 0.08474968373775482, + "learning_rate": 2.4396070750237417e-05, + "loss": 0.0186, + "step": 69520 + }, + { + "epoch": 0.51395582626179, + "grad_norm": 0.09511318802833557, + "learning_rate": 2.4392361111111113e-05, + "loss": 0.0186, + "step": 69530 + }, + { + "epoch": 0.5140297448330918, + "grad_norm": 0.08560945838689804, + "learning_rate": 2.4388651471984805e-05, + "loss": 0.0186, + "step": 69540 + }, + { + "epoch": 0.5141036634043937, + "grad_norm": 0.08364006876945496, + "learning_rate": 2.43849418328585e-05, + "loss": 0.0194, + "step": 69550 + }, + { + "epoch": 0.5141775819756955, + "grad_norm": 0.07831569015979767, + "learning_rate": 2.4381232193732194e-05, + "loss": 0.0185, + "step": 69560 + }, + { + "epoch": 0.5142515005469974, + "grad_norm": 0.07500734180212021, + "learning_rate": 2.437752255460589e-05, + "loss": 0.0171, + "step": 69570 + }, + { + "epoch": 0.5143254191182993, + "grad_norm": 0.10483544319868088, + "learning_rate": 2.4373812915479582e-05, + "loss": 0.019, + "step": 69580 + }, + { + "epoch": 0.5143993376896011, + "grad_norm": 0.08549469709396362, + "learning_rate": 2.4370103276353278e-05, + "loss": 0.0191, + "step": 69590 + }, + { + "epoch": 0.514473256260903, + "grad_norm": 0.06504369527101517, + "learning_rate": 2.436639363722697e-05, + "loss": 0.0189, + "step": 69600 + }, + { + "epoch": 0.5145471748322048, + "grad_norm": 0.054876018315553665, + "learning_rate": 2.4362683998100667e-05, + "loss": 0.0172, + "step": 69610 + }, + { + "epoch": 0.5146210934035067, + "grad_norm": 0.0693962424993515, + "learning_rate": 2.435897435897436e-05, + "loss": 0.0176, + "step": 69620 + }, + { + "epoch": 0.5146950119748086, + "grad_norm": 0.11357058584690094, + "learning_rate": 2.4355264719848052e-05, + "loss": 0.0175, + "step": 69630 + }, + { + "epoch": 0.5147689305461104, + "grad_norm": 0.09563953429460526, + "learning_rate": 2.435155508072175e-05, + "loss": 0.0178, + "step": 69640 + }, + { + "epoch": 0.5148428491174123, + "grad_norm": 0.06931401789188385, + "learning_rate": 2.4347845441595444e-05, + "loss": 0.0156, + "step": 69650 + }, + { + "epoch": 0.5149167676887141, + "grad_norm": 0.10174936056137085, + "learning_rate": 2.4344135802469136e-05, + "loss": 0.0204, + "step": 69660 + }, + { + "epoch": 0.514990686260016, + "grad_norm": 0.0861804261803627, + "learning_rate": 2.434042616334283e-05, + "loss": 0.0177, + "step": 69670 + }, + { + "epoch": 0.5150646048313178, + "grad_norm": 0.07918040454387665, 
+ "learning_rate": 2.4336716524216525e-05, + "loss": 0.0176, + "step": 69680 + }, + { + "epoch": 0.5151385234026197, + "grad_norm": 0.07646022737026215, + "learning_rate": 2.433300688509022e-05, + "loss": 0.016, + "step": 69690 + }, + { + "epoch": 0.5152124419739216, + "grad_norm": 0.10380648821592331, + "learning_rate": 2.4329297245963914e-05, + "loss": 0.0185, + "step": 69700 + }, + { + "epoch": 0.5152863605452234, + "grad_norm": 0.09445256739854813, + "learning_rate": 2.4325587606837606e-05, + "loss": 0.0167, + "step": 69710 + }, + { + "epoch": 0.5153602791165253, + "grad_norm": 0.07841164618730545, + "learning_rate": 2.4321877967711302e-05, + "loss": 0.016, + "step": 69720 + }, + { + "epoch": 0.5154341976878271, + "grad_norm": 0.12285198271274567, + "learning_rate": 2.4318168328584998e-05, + "loss": 0.0175, + "step": 69730 + }, + { + "epoch": 0.515508116259129, + "grad_norm": 0.07969985902309418, + "learning_rate": 2.431445868945869e-05, + "loss": 0.0188, + "step": 69740 + }, + { + "epoch": 0.5155820348304307, + "grad_norm": 0.09048346430063248, + "learning_rate": 2.4310749050332383e-05, + "loss": 0.0178, + "step": 69750 + }, + { + "epoch": 0.5156559534017326, + "grad_norm": 0.08953306823968887, + "learning_rate": 2.430703941120608e-05, + "loss": 0.0165, + "step": 69760 + }, + { + "epoch": 0.5157298719730345, + "grad_norm": 0.09533467888832092, + "learning_rate": 2.4303329772079772e-05, + "loss": 0.0171, + "step": 69770 + }, + { + "epoch": 0.5158037905443363, + "grad_norm": 0.06188954412937164, + "learning_rate": 2.4299620132953468e-05, + "loss": 0.0179, + "step": 69780 + }, + { + "epoch": 0.5158777091156382, + "grad_norm": 0.07564985007047653, + "learning_rate": 2.4295910493827164e-05, + "loss": 0.0157, + "step": 69790 + }, + { + "epoch": 0.51595162768694, + "grad_norm": 0.18343132734298706, + "learning_rate": 2.4292200854700856e-05, + "loss": 0.0198, + "step": 69800 + }, + { + "epoch": 0.5160255462582419, + "grad_norm": 0.07260244339704514, + "learning_rate": 2.428849121557455e-05, + "loss": 0.0179, + "step": 69810 + }, + { + "epoch": 0.5160994648295437, + "grad_norm": 0.057750802487134933, + "learning_rate": 2.4284781576448245e-05, + "loss": 0.0151, + "step": 69820 + }, + { + "epoch": 0.5161733834008456, + "grad_norm": 0.07174156606197357, + "learning_rate": 2.428107193732194e-05, + "loss": 0.0177, + "step": 69830 + }, + { + "epoch": 0.5162473019721475, + "grad_norm": 0.09543014317750931, + "learning_rate": 2.4277362298195633e-05, + "loss": 0.0193, + "step": 69840 + }, + { + "epoch": 0.5163212205434493, + "grad_norm": 0.08153504878282547, + "learning_rate": 2.4273652659069326e-05, + "loss": 0.0205, + "step": 69850 + }, + { + "epoch": 0.5163951391147512, + "grad_norm": 0.07932714372873306, + "learning_rate": 2.426994301994302e-05, + "loss": 0.019, + "step": 69860 + }, + { + "epoch": 0.516469057686053, + "grad_norm": 0.06483565270900726, + "learning_rate": 2.4266233380816718e-05, + "loss": 0.0173, + "step": 69870 + }, + { + "epoch": 0.5165429762573549, + "grad_norm": 0.07810701429843903, + "learning_rate": 2.426252374169041e-05, + "loss": 0.0189, + "step": 69880 + }, + { + "epoch": 0.5166168948286568, + "grad_norm": 0.087563157081604, + "learning_rate": 2.4258814102564103e-05, + "loss": 0.0184, + "step": 69890 + }, + { + "epoch": 0.5166908133999586, + "grad_norm": 0.09393598139286041, + "learning_rate": 2.4255104463437796e-05, + "loss": 0.0173, + "step": 69900 + }, + { + "epoch": 0.5167647319712605, + "grad_norm": 0.09094571322202682, + "learning_rate": 2.425139482431149e-05, + 
"loss": 0.02, + "step": 69910 + }, + { + "epoch": 0.5168386505425623, + "grad_norm": 0.07238578796386719, + "learning_rate": 2.4247685185185188e-05, + "loss": 0.0161, + "step": 69920 + }, + { + "epoch": 0.5169125691138642, + "grad_norm": 0.07430820167064667, + "learning_rate": 2.424397554605888e-05, + "loss": 0.0166, + "step": 69930 + }, + { + "epoch": 0.516986487685166, + "grad_norm": 0.07901631295681, + "learning_rate": 2.4240265906932573e-05, + "loss": 0.0188, + "step": 69940 + }, + { + "epoch": 0.5170604062564679, + "grad_norm": 0.06880009174346924, + "learning_rate": 2.423655626780627e-05, + "loss": 0.0164, + "step": 69950 + }, + { + "epoch": 0.5171343248277698, + "grad_norm": 0.11061283200979233, + "learning_rate": 2.4232846628679965e-05, + "loss": 0.0181, + "step": 69960 + }, + { + "epoch": 0.5172082433990716, + "grad_norm": 0.08216848969459534, + "learning_rate": 2.4229136989553657e-05, + "loss": 0.0179, + "step": 69970 + }, + { + "epoch": 0.5172821619703735, + "grad_norm": 0.06348450481891632, + "learning_rate": 2.4225427350427353e-05, + "loss": 0.0191, + "step": 69980 + }, + { + "epoch": 0.5173560805416753, + "grad_norm": 0.07877887785434723, + "learning_rate": 2.4221717711301046e-05, + "loss": 0.0167, + "step": 69990 + }, + { + "epoch": 0.5174299991129772, + "grad_norm": 0.08459838479757309, + "learning_rate": 2.421800807217474e-05, + "loss": 0.0184, + "step": 70000 + }, + { + "epoch": 0.5174299991129772, + "eval_f1": 0.6190886640565278, + "eval_loss": 0.017734253779053688, + "eval_precision": 0.4918493016326983, + "eval_recall": 0.8351344665085773, + "eval_runtime": 2668.5338, + "eval_samples_per_second": 202.783, + "eval_steps_per_second": 3.169, + "step": 70000 + }, + { + "epoch": 0.517503917684279, + "grad_norm": 0.07879788428544998, + "learning_rate": 2.4214298433048434e-05, + "loss": 0.0189, + "step": 70010 + }, + { + "epoch": 0.5175778362555808, + "grad_norm": 0.07976265996694565, + "learning_rate": 2.421058879392213e-05, + "loss": 0.0182, + "step": 70020 + }, + { + "epoch": 0.5176517548268827, + "grad_norm": 0.10265471786260605, + "learning_rate": 2.4206879154795823e-05, + "loss": 0.0184, + "step": 70030 + }, + { + "epoch": 0.5177256733981845, + "grad_norm": 0.0674351379275322, + "learning_rate": 2.4203169515669515e-05, + "loss": 0.0178, + "step": 70040 + }, + { + "epoch": 0.5177995919694864, + "grad_norm": 0.08812592923641205, + "learning_rate": 2.419945987654321e-05, + "loss": 0.0191, + "step": 70050 + }, + { + "epoch": 0.5178735105407882, + "grad_norm": 0.06502197682857513, + "learning_rate": 2.4195750237416907e-05, + "loss": 0.0167, + "step": 70060 + }, + { + "epoch": 0.5179474291120901, + "grad_norm": 0.09050507843494415, + "learning_rate": 2.41920405982906e-05, + "loss": 0.0165, + "step": 70070 + }, + { + "epoch": 0.518021347683392, + "grad_norm": 0.07016700506210327, + "learning_rate": 2.4188330959164293e-05, + "loss": 0.016, + "step": 70080 + }, + { + "epoch": 0.5180952662546938, + "grad_norm": 0.07196346670389175, + "learning_rate": 2.4184621320037985e-05, + "loss": 0.0159, + "step": 70090 + }, + { + "epoch": 0.5181691848259957, + "grad_norm": 0.08884629607200623, + "learning_rate": 2.4180911680911684e-05, + "loss": 0.0166, + "step": 70100 + }, + { + "epoch": 0.5182431033972975, + "grad_norm": 0.05546234920620918, + "learning_rate": 2.4177202041785377e-05, + "loss": 0.0165, + "step": 70110 + }, + { + "epoch": 0.5183170219685994, + "grad_norm": 0.09314189106225967, + "learning_rate": 2.417349240265907e-05, + "loss": 0.0171, + "step": 70120 + }, + { + "epoch": 
0.5183909405399012, + "grad_norm": 0.0631951317191124, + "learning_rate": 2.4169782763532766e-05, + "loss": 0.019, + "step": 70130 + }, + { + "epoch": 0.5184648591112031, + "grad_norm": 0.07926510274410248, + "learning_rate": 2.4166073124406458e-05, + "loss": 0.0192, + "step": 70140 + }, + { + "epoch": 0.518538777682505, + "grad_norm": 0.08929207921028137, + "learning_rate": 2.4162363485280154e-05, + "loss": 0.019, + "step": 70150 + }, + { + "epoch": 0.5186126962538068, + "grad_norm": 0.07223989814519882, + "learning_rate": 2.4158653846153847e-05, + "loss": 0.0183, + "step": 70160 + }, + { + "epoch": 0.5186866148251087, + "grad_norm": 0.06583063304424286, + "learning_rate": 2.4154944207027543e-05, + "loss": 0.0149, + "step": 70170 + }, + { + "epoch": 0.5187605333964105, + "grad_norm": 0.09024758636951447, + "learning_rate": 2.4151234567901235e-05, + "loss": 0.0164, + "step": 70180 + }, + { + "epoch": 0.5188344519677124, + "grad_norm": 0.0851876512169838, + "learning_rate": 2.414752492877493e-05, + "loss": 0.0179, + "step": 70190 + }, + { + "epoch": 0.5189083705390142, + "grad_norm": 0.07299210131168365, + "learning_rate": 2.4143815289648624e-05, + "loss": 0.0185, + "step": 70200 + }, + { + "epoch": 0.5189822891103161, + "grad_norm": 0.1315525621175766, + "learning_rate": 2.414010565052232e-05, + "loss": 0.0195, + "step": 70210 + }, + { + "epoch": 0.519056207681618, + "grad_norm": 0.10280711948871613, + "learning_rate": 2.4136396011396012e-05, + "loss": 0.0213, + "step": 70220 + }, + { + "epoch": 0.5191301262529198, + "grad_norm": 0.08981981873512268, + "learning_rate": 2.4132686372269705e-05, + "loss": 0.0159, + "step": 70230 + }, + { + "epoch": 0.5192040448242217, + "grad_norm": 0.06911870837211609, + "learning_rate": 2.41289767331434e-05, + "loss": 0.0188, + "step": 70240 + }, + { + "epoch": 0.5192779633955235, + "grad_norm": 0.09323382377624512, + "learning_rate": 2.4125267094017097e-05, + "loss": 0.0185, + "step": 70250 + }, + { + "epoch": 0.5193518819668254, + "grad_norm": 0.08451409637928009, + "learning_rate": 2.412155745489079e-05, + "loss": 0.0166, + "step": 70260 + }, + { + "epoch": 0.5194258005381271, + "grad_norm": 0.08477005362510681, + "learning_rate": 2.4117847815764482e-05, + "loss": 0.0185, + "step": 70270 + }, + { + "epoch": 0.519499719109429, + "grad_norm": 0.06879612803459167, + "learning_rate": 2.4114138176638178e-05, + "loss": 0.0166, + "step": 70280 + }, + { + "epoch": 0.519573637680731, + "grad_norm": 0.08084993809461594, + "learning_rate": 2.4110428537511874e-05, + "loss": 0.0198, + "step": 70290 + }, + { + "epoch": 0.5196475562520327, + "grad_norm": 0.11922654509544373, + "learning_rate": 2.4106718898385567e-05, + "loss": 0.0187, + "step": 70300 + }, + { + "epoch": 0.5197214748233346, + "grad_norm": 0.09586163610219955, + "learning_rate": 2.410300925925926e-05, + "loss": 0.0159, + "step": 70310 + }, + { + "epoch": 0.5197953933946364, + "grad_norm": 0.08208409696817398, + "learning_rate": 2.4099299620132955e-05, + "loss": 0.0214, + "step": 70320 + }, + { + "epoch": 0.5198693119659383, + "grad_norm": 0.08232421427965164, + "learning_rate": 2.409558998100665e-05, + "loss": 0.0178, + "step": 70330 + }, + { + "epoch": 0.5199432305372402, + "grad_norm": 0.05796344578266144, + "learning_rate": 2.4091880341880344e-05, + "loss": 0.0171, + "step": 70340 + }, + { + "epoch": 0.520017149108542, + "grad_norm": 0.07854857295751572, + "learning_rate": 2.4088170702754036e-05, + "loss": 0.0181, + "step": 70350 + }, + { + "epoch": 0.5200910676798439, + "grad_norm": 
0.06449563801288605, + "learning_rate": 2.4084461063627732e-05, + "loss": 0.0164, + "step": 70360 + }, + { + "epoch": 0.5201649862511457, + "grad_norm": 0.09255155175924301, + "learning_rate": 2.4080751424501425e-05, + "loss": 0.0223, + "step": 70370 + }, + { + "epoch": 0.5202389048224476, + "grad_norm": 0.07693065702915192, + "learning_rate": 2.407704178537512e-05, + "loss": 0.016, + "step": 70380 + }, + { + "epoch": 0.5203128233937494, + "grad_norm": 0.09296828508377075, + "learning_rate": 2.4073332146248813e-05, + "loss": 0.0178, + "step": 70390 + }, + { + "epoch": 0.5203867419650513, + "grad_norm": 0.08111296594142914, + "learning_rate": 2.406962250712251e-05, + "loss": 0.0169, + "step": 70400 + }, + { + "epoch": 0.5204606605363532, + "grad_norm": 0.08747255057096481, + "learning_rate": 2.4065912867996202e-05, + "loss": 0.0194, + "step": 70410 + }, + { + "epoch": 0.520534579107655, + "grad_norm": 0.07338342070579529, + "learning_rate": 2.4062203228869898e-05, + "loss": 0.0184, + "step": 70420 + }, + { + "epoch": 0.5206084976789569, + "grad_norm": 0.06751928478479385, + "learning_rate": 2.405849358974359e-05, + "loss": 0.0165, + "step": 70430 + }, + { + "epoch": 0.5206824162502587, + "grad_norm": 0.08731786906719208, + "learning_rate": 2.4054783950617286e-05, + "loss": 0.016, + "step": 70440 + }, + { + "epoch": 0.5207563348215606, + "grad_norm": 0.07595943659543991, + "learning_rate": 2.405107431149098e-05, + "loss": 0.018, + "step": 70450 + }, + { + "epoch": 0.5208302533928624, + "grad_norm": 0.09234201163053513, + "learning_rate": 2.404736467236467e-05, + "loss": 0.0176, + "step": 70460 + }, + { + "epoch": 0.5209041719641643, + "grad_norm": 0.08772604912519455, + "learning_rate": 2.4043655033238367e-05, + "loss": 0.0177, + "step": 70470 + }, + { + "epoch": 0.5209780905354662, + "grad_norm": 0.08371371030807495, + "learning_rate": 2.4039945394112063e-05, + "loss": 0.0201, + "step": 70480 + }, + { + "epoch": 0.521052009106768, + "grad_norm": 0.06990265846252441, + "learning_rate": 2.4036235754985756e-05, + "loss": 0.0164, + "step": 70490 + }, + { + "epoch": 0.5211259276780699, + "grad_norm": 0.11240319162607193, + "learning_rate": 2.403252611585945e-05, + "loss": 0.0164, + "step": 70500 + }, + { + "epoch": 0.5211998462493717, + "grad_norm": 0.09507393836975098, + "learning_rate": 2.4028816476733145e-05, + "loss": 0.0175, + "step": 70510 + }, + { + "epoch": 0.5212737648206736, + "grad_norm": 0.056186776608228683, + "learning_rate": 2.402510683760684e-05, + "loss": 0.0162, + "step": 70520 + }, + { + "epoch": 0.5213476833919753, + "grad_norm": 0.07950075715780258, + "learning_rate": 2.4021397198480533e-05, + "loss": 0.0157, + "step": 70530 + }, + { + "epoch": 0.5214216019632772, + "grad_norm": 0.06671198457479477, + "learning_rate": 2.4017687559354226e-05, + "loss": 0.02, + "step": 70540 + }, + { + "epoch": 0.5214955205345791, + "grad_norm": 0.1159062460064888, + "learning_rate": 2.401397792022792e-05, + "loss": 0.0185, + "step": 70550 + }, + { + "epoch": 0.5215694391058809, + "grad_norm": 0.059728387743234634, + "learning_rate": 2.4010268281101618e-05, + "loss": 0.0181, + "step": 70560 + }, + { + "epoch": 0.5216433576771828, + "grad_norm": 0.07596202939748764, + "learning_rate": 2.400655864197531e-05, + "loss": 0.0177, + "step": 70570 + }, + { + "epoch": 0.5217172762484846, + "grad_norm": 0.10383056849241257, + "learning_rate": 2.4002849002849003e-05, + "loss": 0.0183, + "step": 70580 + }, + { + "epoch": 0.5217911948197865, + "grad_norm": 0.0796007513999939, + "learning_rate": 
2.39991393637227e-05, + "loss": 0.0167, + "step": 70590 + }, + { + "epoch": 0.5218651133910884, + "grad_norm": 0.06471708416938782, + "learning_rate": 2.399542972459639e-05, + "loss": 0.0171, + "step": 70600 + }, + { + "epoch": 0.5219390319623902, + "grad_norm": 0.06391924619674683, + "learning_rate": 2.3991720085470087e-05, + "loss": 0.0182, + "step": 70610 + }, + { + "epoch": 0.5220129505336921, + "grad_norm": 0.11240462213754654, + "learning_rate": 2.398801044634378e-05, + "loss": 0.0177, + "step": 70620 + }, + { + "epoch": 0.5220868691049939, + "grad_norm": 0.07067076861858368, + "learning_rate": 2.3984300807217476e-05, + "loss": 0.0177, + "step": 70630 + }, + { + "epoch": 0.5221607876762958, + "grad_norm": 0.07260678708553314, + "learning_rate": 2.398059116809117e-05, + "loss": 0.0191, + "step": 70640 + }, + { + "epoch": 0.5222347062475976, + "grad_norm": 0.0927368700504303, + "learning_rate": 2.3976881528964864e-05, + "loss": 0.019, + "step": 70650 + }, + { + "epoch": 0.5223086248188995, + "grad_norm": 0.07368585467338562, + "learning_rate": 2.3973171889838557e-05, + "loss": 0.0179, + "step": 70660 + }, + { + "epoch": 0.5223825433902014, + "grad_norm": 0.09810861945152283, + "learning_rate": 2.3969462250712253e-05, + "loss": 0.0196, + "step": 70670 + }, + { + "epoch": 0.5224564619615032, + "grad_norm": 0.08655349910259247, + "learning_rate": 2.3965752611585946e-05, + "loss": 0.0182, + "step": 70680 + }, + { + "epoch": 0.5225303805328051, + "grad_norm": 0.06796327233314514, + "learning_rate": 2.3962042972459638e-05, + "loss": 0.0173, + "step": 70690 + }, + { + "epoch": 0.5226042991041069, + "grad_norm": 0.08273261785507202, + "learning_rate": 2.3958333333333334e-05, + "loss": 0.0203, + "step": 70700 + }, + { + "epoch": 0.5226782176754088, + "grad_norm": 0.08774213492870331, + "learning_rate": 2.395462369420703e-05, + "loss": 0.0177, + "step": 70710 + }, + { + "epoch": 0.5227521362467106, + "grad_norm": 0.06766923516988754, + "learning_rate": 2.3950914055080723e-05, + "loss": 0.018, + "step": 70720 + }, + { + "epoch": 0.5228260548180125, + "grad_norm": 0.08683652430772781, + "learning_rate": 2.3947204415954415e-05, + "loss": 0.0184, + "step": 70730 + }, + { + "epoch": 0.5228999733893144, + "grad_norm": 0.06167571246623993, + "learning_rate": 2.394349477682811e-05, + "loss": 0.0166, + "step": 70740 + }, + { + "epoch": 0.5229738919606162, + "grad_norm": 0.10443181544542313, + "learning_rate": 2.3939785137701807e-05, + "loss": 0.0173, + "step": 70750 + }, + { + "epoch": 0.5230478105319181, + "grad_norm": 0.10969404131174088, + "learning_rate": 2.39360754985755e-05, + "loss": 0.0207, + "step": 70760 + }, + { + "epoch": 0.5231217291032199, + "grad_norm": 0.0757172703742981, + "learning_rate": 2.3932365859449192e-05, + "loss": 0.0187, + "step": 70770 + }, + { + "epoch": 0.5231956476745218, + "grad_norm": 0.0848625898361206, + "learning_rate": 2.3928656220322888e-05, + "loss": 0.0209, + "step": 70780 + }, + { + "epoch": 0.5232695662458235, + "grad_norm": 0.06197137385606766, + "learning_rate": 2.3924946581196584e-05, + "loss": 0.0179, + "step": 70790 + }, + { + "epoch": 0.5233434848171254, + "grad_norm": 0.06887900829315186, + "learning_rate": 2.3921236942070277e-05, + "loss": 0.0174, + "step": 70800 + }, + { + "epoch": 0.5234174033884273, + "grad_norm": 0.07631140947341919, + "learning_rate": 2.391752730294397e-05, + "loss": 0.0191, + "step": 70810 + }, + { + "epoch": 0.5234913219597291, + "grad_norm": 0.08222679793834686, + "learning_rate": 2.3913817663817665e-05, + "loss": 0.0185, + 
"step": 70820 + }, + { + "epoch": 0.523565240531031, + "grad_norm": 0.07283231616020203, + "learning_rate": 2.3910108024691358e-05, + "loss": 0.0171, + "step": 70830 + }, + { + "epoch": 0.5236391591023328, + "grad_norm": 0.07829372584819794, + "learning_rate": 2.3906398385565054e-05, + "loss": 0.0179, + "step": 70840 + }, + { + "epoch": 0.5237130776736347, + "grad_norm": 0.06830435246229172, + "learning_rate": 2.3902688746438746e-05, + "loss": 0.0199, + "step": 70850 + }, + { + "epoch": 0.5237869962449366, + "grad_norm": 0.10674279183149338, + "learning_rate": 2.3898979107312442e-05, + "loss": 0.0214, + "step": 70860 + }, + { + "epoch": 0.5238609148162384, + "grad_norm": 0.06788614392280579, + "learning_rate": 2.3895269468186135e-05, + "loss": 0.0171, + "step": 70870 + }, + { + "epoch": 0.5239348333875403, + "grad_norm": 0.07488367706537247, + "learning_rate": 2.389155982905983e-05, + "loss": 0.0183, + "step": 70880 + }, + { + "epoch": 0.5240087519588421, + "grad_norm": 0.07454147189855576, + "learning_rate": 2.3887850189933524e-05, + "loss": 0.0207, + "step": 70890 + }, + { + "epoch": 0.524082670530144, + "grad_norm": 0.06669653952121735, + "learning_rate": 2.388414055080722e-05, + "loss": 0.019, + "step": 70900 + }, + { + "epoch": 0.5241565891014458, + "grad_norm": 0.06841177493333817, + "learning_rate": 2.3880430911680912e-05, + "loss": 0.0172, + "step": 70910 + }, + { + "epoch": 0.5242305076727477, + "grad_norm": 0.08513778448104858, + "learning_rate": 2.3876721272554605e-05, + "loss": 0.0191, + "step": 70920 + }, + { + "epoch": 0.5243044262440496, + "grad_norm": 0.08537270873785019, + "learning_rate": 2.38730116334283e-05, + "loss": 0.0171, + "step": 70930 + }, + { + "epoch": 0.5243783448153514, + "grad_norm": 0.09533053636550903, + "learning_rate": 2.3869301994301997e-05, + "loss": 0.0199, + "step": 70940 + }, + { + "epoch": 0.5244522633866533, + "grad_norm": 0.07149159163236618, + "learning_rate": 2.386559235517569e-05, + "loss": 0.0168, + "step": 70950 + }, + { + "epoch": 0.5245261819579551, + "grad_norm": 0.09240260720252991, + "learning_rate": 2.3861882716049382e-05, + "loss": 0.0193, + "step": 70960 + }, + { + "epoch": 0.524600100529257, + "grad_norm": 0.07488304376602173, + "learning_rate": 2.3858173076923078e-05, + "loss": 0.0183, + "step": 70970 + }, + { + "epoch": 0.5246740191005588, + "grad_norm": 0.061253637075424194, + "learning_rate": 2.3854463437796774e-05, + "loss": 0.0179, + "step": 70980 + }, + { + "epoch": 0.5247479376718607, + "grad_norm": 0.08286695927381516, + "learning_rate": 2.3850753798670466e-05, + "loss": 0.0184, + "step": 70990 + }, + { + "epoch": 0.5248218562431626, + "grad_norm": 0.09659498184919357, + "learning_rate": 2.384704415954416e-05, + "loss": 0.0179, + "step": 71000 + }, + { + "epoch": 0.5248957748144644, + "grad_norm": 0.05578518658876419, + "learning_rate": 2.3843334520417855e-05, + "loss": 0.0157, + "step": 71010 + }, + { + "epoch": 0.5249696933857663, + "grad_norm": 0.07591883838176727, + "learning_rate": 2.383962488129155e-05, + "loss": 0.0174, + "step": 71020 + }, + { + "epoch": 0.5250436119570681, + "grad_norm": 0.08070017397403717, + "learning_rate": 2.3835915242165243e-05, + "loss": 0.0164, + "step": 71030 + }, + { + "epoch": 0.52511753052837, + "grad_norm": 0.05172639340162277, + "learning_rate": 2.3832205603038936e-05, + "loss": 0.0175, + "step": 71040 + }, + { + "epoch": 0.5251914490996717, + "grad_norm": 0.07358026504516602, + "learning_rate": 2.3828495963912632e-05, + "loss": 0.0172, + "step": 71050 + }, + { + "epoch": 
0.5252653676709736, + "grad_norm": 0.0738629549741745, + "learning_rate": 2.3824786324786324e-05, + "loss": 0.0167, + "step": 71060 + }, + { + "epoch": 0.5253392862422755, + "grad_norm": 0.06652035564184189, + "learning_rate": 2.382107668566002e-05, + "loss": 0.0158, + "step": 71070 + }, + { + "epoch": 0.5254132048135773, + "grad_norm": 0.09065684676170349, + "learning_rate": 2.3817367046533713e-05, + "loss": 0.0183, + "step": 71080 + }, + { + "epoch": 0.5254871233848792, + "grad_norm": 0.09338533133268356, + "learning_rate": 2.381365740740741e-05, + "loss": 0.0197, + "step": 71090 + }, + { + "epoch": 0.525561041956181, + "grad_norm": 0.07664430141448975, + "learning_rate": 2.38099477682811e-05, + "loss": 0.0191, + "step": 71100 + }, + { + "epoch": 0.5256349605274829, + "grad_norm": 0.07986918091773987, + "learning_rate": 2.3806238129154798e-05, + "loss": 0.0176, + "step": 71110 + }, + { + "epoch": 0.5257088790987848, + "grad_norm": 0.07862329483032227, + "learning_rate": 2.380252849002849e-05, + "loss": 0.0164, + "step": 71120 + }, + { + "epoch": 0.5257827976700866, + "grad_norm": 0.05761149525642395, + "learning_rate": 2.3798818850902186e-05, + "loss": 0.0172, + "step": 71130 + }, + { + "epoch": 0.5258567162413885, + "grad_norm": 0.06935084611177444, + "learning_rate": 2.379510921177588e-05, + "loss": 0.0172, + "step": 71140 + }, + { + "epoch": 0.5259306348126903, + "grad_norm": 0.05792957916855812, + "learning_rate": 2.379139957264957e-05, + "loss": 0.0174, + "step": 71150 + }, + { + "epoch": 0.5260045533839922, + "grad_norm": 0.07632586359977722, + "learning_rate": 2.3787689933523267e-05, + "loss": 0.0172, + "step": 71160 + }, + { + "epoch": 0.526078471955294, + "grad_norm": 0.077678382396698, + "learning_rate": 2.3783980294396963e-05, + "loss": 0.0167, + "step": 71170 + }, + { + "epoch": 0.5261523905265959, + "grad_norm": 0.09699144959449768, + "learning_rate": 2.3780270655270656e-05, + "loss": 0.018, + "step": 71180 + }, + { + "epoch": 0.5262263090978978, + "grad_norm": 0.06520868092775345, + "learning_rate": 2.377656101614435e-05, + "loss": 0.0148, + "step": 71190 + }, + { + "epoch": 0.5263002276691996, + "grad_norm": 0.07202000170946121, + "learning_rate": 2.3772851377018048e-05, + "loss": 0.0203, + "step": 71200 + }, + { + "epoch": 0.5263741462405015, + "grad_norm": 0.09399239718914032, + "learning_rate": 2.376914173789174e-05, + "loss": 0.0187, + "step": 71210 + }, + { + "epoch": 0.5264480648118033, + "grad_norm": 0.05629691854119301, + "learning_rate": 2.3765432098765433e-05, + "loss": 0.0174, + "step": 71220 + }, + { + "epoch": 0.5265219833831052, + "grad_norm": 0.05375561863183975, + "learning_rate": 2.3761722459639125e-05, + "loss": 0.0181, + "step": 71230 + }, + { + "epoch": 0.526595901954407, + "grad_norm": 0.07988496124744415, + "learning_rate": 2.375801282051282e-05, + "loss": 0.0199, + "step": 71240 + }, + { + "epoch": 0.5266698205257089, + "grad_norm": 0.07610704749822617, + "learning_rate": 2.3754303181386517e-05, + "loss": 0.0152, + "step": 71250 + }, + { + "epoch": 0.5267437390970108, + "grad_norm": 0.07854470610618591, + "learning_rate": 2.375059354226021e-05, + "loss": 0.0183, + "step": 71260 + }, + { + "epoch": 0.5268176576683126, + "grad_norm": 0.06778319180011749, + "learning_rate": 2.3746883903133903e-05, + "loss": 0.0194, + "step": 71270 + }, + { + "epoch": 0.5268915762396145, + "grad_norm": 0.06963464617729187, + "learning_rate": 2.37431742640076e-05, + "loss": 0.0188, + "step": 71280 + }, + { + "epoch": 0.5269654948109163, + "grad_norm": 0.07358632236719131, 
+ "learning_rate": 2.373946462488129e-05, + "loss": 0.0193, + "step": 71290 + }, + { + "epoch": 0.5270394133822182, + "grad_norm": 0.07299153506755829, + "learning_rate": 2.3735754985754987e-05, + "loss": 0.0163, + "step": 71300 + }, + { + "epoch": 0.52711333195352, + "grad_norm": 0.08467881381511688, + "learning_rate": 2.373204534662868e-05, + "loss": 0.0179, + "step": 71310 + }, + { + "epoch": 0.5271872505248218, + "grad_norm": 0.07622227072715759, + "learning_rate": 2.3728335707502376e-05, + "loss": 0.0208, + "step": 71320 + }, + { + "epoch": 0.5272611690961237, + "grad_norm": 0.10649682581424713, + "learning_rate": 2.3724626068376068e-05, + "loss": 0.0173, + "step": 71330 + }, + { + "epoch": 0.5273350876674255, + "grad_norm": 0.07591233402490616, + "learning_rate": 2.3720916429249764e-05, + "loss": 0.0188, + "step": 71340 + }, + { + "epoch": 0.5274090062387274, + "grad_norm": 0.12941525876522064, + "learning_rate": 2.371720679012346e-05, + "loss": 0.0203, + "step": 71350 + }, + { + "epoch": 0.5274829248100292, + "grad_norm": 0.1032840758562088, + "learning_rate": 2.3713497150997153e-05, + "loss": 0.0162, + "step": 71360 + }, + { + "epoch": 0.5275568433813311, + "grad_norm": 0.08252818137407303, + "learning_rate": 2.3709787511870845e-05, + "loss": 0.0187, + "step": 71370 + }, + { + "epoch": 0.527630761952633, + "grad_norm": 0.0837445855140686, + "learning_rate": 2.3706077872744538e-05, + "loss": 0.0194, + "step": 71380 + }, + { + "epoch": 0.5277046805239348, + "grad_norm": 0.05505933240056038, + "learning_rate": 2.3702368233618237e-05, + "loss": 0.0169, + "step": 71390 + }, + { + "epoch": 0.5277785990952367, + "grad_norm": 0.0914740040898323, + "learning_rate": 2.369865859449193e-05, + "loss": 0.0184, + "step": 71400 + }, + { + "epoch": 0.5278525176665385, + "grad_norm": 0.07132803648710251, + "learning_rate": 2.3694948955365622e-05, + "loss": 0.0189, + "step": 71410 + }, + { + "epoch": 0.5279264362378404, + "grad_norm": 0.0907021313905716, + "learning_rate": 2.3691239316239315e-05, + "loss": 0.0182, + "step": 71420 + }, + { + "epoch": 0.5280003548091422, + "grad_norm": 0.07915528118610382, + "learning_rate": 2.3687529677113014e-05, + "loss": 0.0167, + "step": 71430 + }, + { + "epoch": 0.5280742733804441, + "grad_norm": 0.08348540216684341, + "learning_rate": 2.3683820037986707e-05, + "loss": 0.0177, + "step": 71440 + }, + { + "epoch": 0.528148191951746, + "grad_norm": 0.07703004032373428, + "learning_rate": 2.36801103988604e-05, + "loss": 0.0207, + "step": 71450 + }, + { + "epoch": 0.5282221105230478, + "grad_norm": 0.09287678450345993, + "learning_rate": 2.3676400759734092e-05, + "loss": 0.0185, + "step": 71460 + }, + { + "epoch": 0.5282960290943497, + "grad_norm": 0.08298031985759735, + "learning_rate": 2.3672691120607788e-05, + "loss": 0.0172, + "step": 71470 + }, + { + "epoch": 0.5283699476656515, + "grad_norm": 0.08304513990879059, + "learning_rate": 2.3668981481481484e-05, + "loss": 0.0167, + "step": 71480 + }, + { + "epoch": 0.5284438662369534, + "grad_norm": 0.07882574200630188, + "learning_rate": 2.3665271842355177e-05, + "loss": 0.0181, + "step": 71490 + }, + { + "epoch": 0.5285177848082552, + "grad_norm": 0.10319788008928299, + "learning_rate": 2.3661562203228873e-05, + "loss": 0.019, + "step": 71500 + }, + { + "epoch": 0.5285917033795571, + "grad_norm": 0.0673266127705574, + "learning_rate": 2.3657852564102565e-05, + "loss": 0.0204, + "step": 71510 + }, + { + "epoch": 0.528665621950859, + "grad_norm": 0.08062201738357544, + "learning_rate": 2.3654142924976258e-05, + "loss": 
0.0164, + "step": 71520 + }, + { + "epoch": 0.5287395405221608, + "grad_norm": 0.10110322386026382, + "learning_rate": 2.3650433285849954e-05, + "loss": 0.0206, + "step": 71530 + }, + { + "epoch": 0.5288134590934627, + "grad_norm": 0.05925064533948898, + "learning_rate": 2.364672364672365e-05, + "loss": 0.018, + "step": 71540 + }, + { + "epoch": 0.5288873776647645, + "grad_norm": 0.0888727605342865, + "learning_rate": 2.3643014007597342e-05, + "loss": 0.0189, + "step": 71550 + }, + { + "epoch": 0.5289612962360664, + "grad_norm": 0.05731735751032829, + "learning_rate": 2.3639304368471035e-05, + "loss": 0.0154, + "step": 71560 + }, + { + "epoch": 0.5290352148073681, + "grad_norm": 0.06728977710008621, + "learning_rate": 2.363559472934473e-05, + "loss": 0.0167, + "step": 71570 + }, + { + "epoch": 0.52910913337867, + "grad_norm": 0.1000228300690651, + "learning_rate": 2.3631885090218427e-05, + "loss": 0.0181, + "step": 71580 + }, + { + "epoch": 0.529183051949972, + "grad_norm": 0.09549673646688461, + "learning_rate": 2.362817545109212e-05, + "loss": 0.0193, + "step": 71590 + }, + { + "epoch": 0.5292569705212737, + "grad_norm": 0.11185158789157867, + "learning_rate": 2.3624465811965812e-05, + "loss": 0.0201, + "step": 71600 + }, + { + "epoch": 0.5293308890925756, + "grad_norm": 0.06617502868175507, + "learning_rate": 2.3620756172839504e-05, + "loss": 0.0172, + "step": 71610 + }, + { + "epoch": 0.5294048076638774, + "grad_norm": 0.062105994671583176, + "learning_rate": 2.3617046533713204e-05, + "loss": 0.0166, + "step": 71620 + }, + { + "epoch": 0.5294787262351793, + "grad_norm": 0.09233871847391129, + "learning_rate": 2.3613336894586896e-05, + "loss": 0.0184, + "step": 71630 + }, + { + "epoch": 0.5295526448064812, + "grad_norm": 0.06355202198028564, + "learning_rate": 2.360962725546059e-05, + "loss": 0.0172, + "step": 71640 + }, + { + "epoch": 0.529626563377783, + "grad_norm": 0.0670185461640358, + "learning_rate": 2.3605917616334285e-05, + "loss": 0.0195, + "step": 71650 + }, + { + "epoch": 0.5297004819490849, + "grad_norm": 0.07617296278476715, + "learning_rate": 2.360220797720798e-05, + "loss": 0.0179, + "step": 71660 + }, + { + "epoch": 0.5297744005203867, + "grad_norm": 0.055757950991392136, + "learning_rate": 2.3598498338081673e-05, + "loss": 0.0173, + "step": 71670 + }, + { + "epoch": 0.5298483190916886, + "grad_norm": 0.0978802815079689, + "learning_rate": 2.3594788698955366e-05, + "loss": 0.0163, + "step": 71680 + }, + { + "epoch": 0.5299222376629904, + "grad_norm": 0.06614841520786285, + "learning_rate": 2.3591079059829062e-05, + "loss": 0.0178, + "step": 71690 + }, + { + "epoch": 0.5299961562342923, + "grad_norm": 0.10969319939613342, + "learning_rate": 2.3587369420702755e-05, + "loss": 0.0189, + "step": 71700 + }, + { + "epoch": 0.5300700748055942, + "grad_norm": 0.07790661603212357, + "learning_rate": 2.358365978157645e-05, + "loss": 0.0183, + "step": 71710 + }, + { + "epoch": 0.530143993376896, + "grad_norm": 0.04722243547439575, + "learning_rate": 2.3579950142450143e-05, + "loss": 0.016, + "step": 71720 + }, + { + "epoch": 0.5302179119481979, + "grad_norm": 0.09032213687896729, + "learning_rate": 2.357624050332384e-05, + "loss": 0.0155, + "step": 71730 + }, + { + "epoch": 0.5302918305194997, + "grad_norm": 0.07642046362161636, + "learning_rate": 2.357253086419753e-05, + "loss": 0.0162, + "step": 71740 + }, + { + "epoch": 0.5303657490908016, + "grad_norm": 0.1244816929101944, + "learning_rate": 2.3568821225071228e-05, + "loss": 0.0197, + "step": 71750 + }, + { + "epoch": 
0.5304396676621034, + "grad_norm": 0.0643230676651001, + "learning_rate": 2.356511158594492e-05, + "loss": 0.0167, + "step": 71760 + }, + { + "epoch": 0.5305135862334053, + "grad_norm": 0.07190030813217163, + "learning_rate": 2.3561401946818616e-05, + "loss": 0.0185, + "step": 71770 + }, + { + "epoch": 0.5305875048047072, + "grad_norm": 0.06764136254787445, + "learning_rate": 2.355769230769231e-05, + "loss": 0.0198, + "step": 71780 + }, + { + "epoch": 0.530661423376009, + "grad_norm": 0.10977832227945328, + "learning_rate": 2.3553982668566e-05, + "loss": 0.0203, + "step": 71790 + }, + { + "epoch": 0.5307353419473109, + "grad_norm": 0.09612040966749191, + "learning_rate": 2.3550273029439697e-05, + "loss": 0.0182, + "step": 71800 + }, + { + "epoch": 0.5308092605186127, + "grad_norm": 0.08732334524393082, + "learning_rate": 2.3546563390313393e-05, + "loss": 0.0179, + "step": 71810 + }, + { + "epoch": 0.5308831790899146, + "grad_norm": 0.0612306222319603, + "learning_rate": 2.3542853751187086e-05, + "loss": 0.0192, + "step": 71820 + }, + { + "epoch": 0.5309570976612165, + "grad_norm": 0.08263002336025238, + "learning_rate": 2.353914411206078e-05, + "loss": 0.0188, + "step": 71830 + }, + { + "epoch": 0.5310310162325182, + "grad_norm": 0.06137899309396744, + "learning_rate": 2.3535434472934474e-05, + "loss": 0.0191, + "step": 71840 + }, + { + "epoch": 0.5311049348038202, + "grad_norm": 0.07601217925548553, + "learning_rate": 2.353172483380817e-05, + "loss": 0.0164, + "step": 71850 + }, + { + "epoch": 0.5311788533751219, + "grad_norm": 0.07243067026138306, + "learning_rate": 2.3528015194681863e-05, + "loss": 0.0178, + "step": 71860 + }, + { + "epoch": 0.5312527719464238, + "grad_norm": 0.09085898101329803, + "learning_rate": 2.3524305555555556e-05, + "loss": 0.0178, + "step": 71870 + }, + { + "epoch": 0.5313266905177256, + "grad_norm": 0.09606260061264038, + "learning_rate": 2.352059591642925e-05, + "loss": 0.0207, + "step": 71880 + }, + { + "epoch": 0.5314006090890275, + "grad_norm": 0.05547720566391945, + "learning_rate": 2.3516886277302947e-05, + "loss": 0.0174, + "step": 71890 + }, + { + "epoch": 0.5314745276603294, + "grad_norm": 0.08246821910142899, + "learning_rate": 2.351317663817664e-05, + "loss": 0.0177, + "step": 71900 + }, + { + "epoch": 0.5315484462316312, + "grad_norm": 0.11368558555841446, + "learning_rate": 2.3509466999050333e-05, + "loss": 0.0159, + "step": 71910 + }, + { + "epoch": 0.5316223648029331, + "grad_norm": 0.09380337595939636, + "learning_rate": 2.350575735992403e-05, + "loss": 0.0168, + "step": 71920 + }, + { + "epoch": 0.5316962833742349, + "grad_norm": 0.05818706005811691, + "learning_rate": 2.350204772079772e-05, + "loss": 0.0184, + "step": 71930 + }, + { + "epoch": 0.5317702019455368, + "grad_norm": 0.08017941564321518, + "learning_rate": 2.3498338081671417e-05, + "loss": 0.0175, + "step": 71940 + }, + { + "epoch": 0.5318441205168386, + "grad_norm": 0.08289653807878494, + "learning_rate": 2.349462844254511e-05, + "loss": 0.0156, + "step": 71950 + }, + { + "epoch": 0.5319180390881405, + "grad_norm": 0.06446444988250732, + "learning_rate": 2.3490918803418806e-05, + "loss": 0.0161, + "step": 71960 + }, + { + "epoch": 0.5319919576594424, + "grad_norm": 0.07639432698488235, + "learning_rate": 2.3487209164292498e-05, + "loss": 0.0171, + "step": 71970 + }, + { + "epoch": 0.5320658762307442, + "grad_norm": 0.049774058163166046, + "learning_rate": 2.3483499525166194e-05, + "loss": 0.0187, + "step": 71980 + }, + { + "epoch": 0.5321397948020461, + "grad_norm": 
0.09060239046812057, + "learning_rate": 2.3479789886039887e-05, + "loss": 0.0178, + "step": 71990 + }, + { + "epoch": 0.5322137133733479, + "grad_norm": 0.08577951043844223, + "learning_rate": 2.3476080246913583e-05, + "loss": 0.0203, + "step": 72000 + }, + { + "epoch": 0.5322876319446498, + "grad_norm": 0.08357306569814682, + "learning_rate": 2.3472370607787275e-05, + "loss": 0.0189, + "step": 72010 + }, + { + "epoch": 0.5323615505159516, + "grad_norm": 0.0907038152217865, + "learning_rate": 2.3468660968660968e-05, + "loss": 0.0179, + "step": 72020 + }, + { + "epoch": 0.5324354690872535, + "grad_norm": 0.07584109157323837, + "learning_rate": 2.3464951329534664e-05, + "loss": 0.0181, + "step": 72030 + }, + { + "epoch": 0.5325093876585554, + "grad_norm": 0.06174978241324425, + "learning_rate": 2.346124169040836e-05, + "loss": 0.0189, + "step": 72040 + }, + { + "epoch": 0.5325833062298572, + "grad_norm": 0.07204340398311615, + "learning_rate": 2.3457532051282052e-05, + "loss": 0.0179, + "step": 72050 + }, + { + "epoch": 0.5326572248011591, + "grad_norm": 0.07583874464035034, + "learning_rate": 2.3453822412155745e-05, + "loss": 0.0156, + "step": 72060 + }, + { + "epoch": 0.5327311433724609, + "grad_norm": 0.07981467247009277, + "learning_rate": 2.345011277302944e-05, + "loss": 0.0189, + "step": 72070 + }, + { + "epoch": 0.5328050619437628, + "grad_norm": 0.0859747901558876, + "learning_rate": 2.3446403133903137e-05, + "loss": 0.0169, + "step": 72080 + }, + { + "epoch": 0.5328789805150647, + "grad_norm": 0.060712117701768875, + "learning_rate": 2.344269349477683e-05, + "loss": 0.0171, + "step": 72090 + }, + { + "epoch": 0.5329528990863664, + "grad_norm": 0.10640475898981094, + "learning_rate": 2.3438983855650522e-05, + "loss": 0.0203, + "step": 72100 + }, + { + "epoch": 0.5330268176576684, + "grad_norm": 0.07040079683065414, + "learning_rate": 2.3435274216524218e-05, + "loss": 0.0169, + "step": 72110 + }, + { + "epoch": 0.5331007362289701, + "grad_norm": 0.06555905193090439, + "learning_rate": 2.3431564577397914e-05, + "loss": 0.0183, + "step": 72120 + }, + { + "epoch": 0.533174654800272, + "grad_norm": 0.0838426873087883, + "learning_rate": 2.3427854938271607e-05, + "loss": 0.0167, + "step": 72130 + }, + { + "epoch": 0.5332485733715738, + "grad_norm": 0.061927784234285355, + "learning_rate": 2.34241452991453e-05, + "loss": 0.0163, + "step": 72140 + }, + { + "epoch": 0.5333224919428757, + "grad_norm": 0.08208876848220825, + "learning_rate": 2.3420435660018995e-05, + "loss": 0.0173, + "step": 72150 + }, + { + "epoch": 0.5333964105141776, + "grad_norm": 0.0723063200712204, + "learning_rate": 2.3416726020892688e-05, + "loss": 0.0197, + "step": 72160 + }, + { + "epoch": 0.5334703290854794, + "grad_norm": 0.07213089615106583, + "learning_rate": 2.3413016381766384e-05, + "loss": 0.0176, + "step": 72170 + }, + { + "epoch": 0.5335442476567813, + "grad_norm": 0.08938119560480118, + "learning_rate": 2.3409306742640076e-05, + "loss": 0.0189, + "step": 72180 + }, + { + "epoch": 0.5336181662280831, + "grad_norm": 0.0909256711602211, + "learning_rate": 2.3405597103513772e-05, + "loss": 0.0197, + "step": 72190 + }, + { + "epoch": 0.533692084799385, + "grad_norm": 0.06849884241819382, + "learning_rate": 2.3401887464387465e-05, + "loss": 0.0187, + "step": 72200 + }, + { + "epoch": 0.5337660033706868, + "grad_norm": 0.0745333582162857, + "learning_rate": 2.339817782526116e-05, + "loss": 0.0177, + "step": 72210 + }, + { + "epoch": 0.5338399219419887, + "grad_norm": 0.074846550822258, + "learning_rate": 
2.3394468186134853e-05, + "loss": 0.0169, + "step": 72220 + }, + { + "epoch": 0.5339138405132906, + "grad_norm": 0.060825783759355545, + "learning_rate": 2.339075854700855e-05, + "loss": 0.0164, + "step": 72230 + }, + { + "epoch": 0.5339877590845924, + "grad_norm": 0.07052914053201675, + "learning_rate": 2.3387048907882242e-05, + "loss": 0.0169, + "step": 72240 + }, + { + "epoch": 0.5340616776558943, + "grad_norm": 0.07752663642168045, + "learning_rate": 2.3383339268755935e-05, + "loss": 0.0192, + "step": 72250 + }, + { + "epoch": 0.5341355962271961, + "grad_norm": 0.08008281141519547, + "learning_rate": 2.337962962962963e-05, + "loss": 0.0165, + "step": 72260 + }, + { + "epoch": 0.534209514798498, + "grad_norm": 0.08343818038702011, + "learning_rate": 2.3375919990503326e-05, + "loss": 0.0201, + "step": 72270 + }, + { + "epoch": 0.5342834333697998, + "grad_norm": 0.08409687876701355, + "learning_rate": 2.337221035137702e-05, + "loss": 0.0169, + "step": 72280 + }, + { + "epoch": 0.5343573519411017, + "grad_norm": 0.08192925155162811, + "learning_rate": 2.336850071225071e-05, + "loss": 0.0179, + "step": 72290 + }, + { + "epoch": 0.5344312705124036, + "grad_norm": 0.08260039240121841, + "learning_rate": 2.3364791073124408e-05, + "loss": 0.019, + "step": 72300 + }, + { + "epoch": 0.5345051890837054, + "grad_norm": 0.06475205719470978, + "learning_rate": 2.3361081433998104e-05, + "loss": 0.0187, + "step": 72310 + }, + { + "epoch": 0.5345791076550073, + "grad_norm": 0.05737556517124176, + "learning_rate": 2.3357371794871796e-05, + "loss": 0.0166, + "step": 72320 + }, + { + "epoch": 0.5346530262263091, + "grad_norm": 0.0827866718173027, + "learning_rate": 2.335366215574549e-05, + "loss": 0.0195, + "step": 72330 + }, + { + "epoch": 0.534726944797611, + "grad_norm": 0.055962737649679184, + "learning_rate": 2.3349952516619185e-05, + "loss": 0.0184, + "step": 72340 + }, + { + "epoch": 0.5348008633689129, + "grad_norm": 0.08469745516777039, + "learning_rate": 2.334624287749288e-05, + "loss": 0.0192, + "step": 72350 + }, + { + "epoch": 0.5348747819402146, + "grad_norm": 0.06490087509155273, + "learning_rate": 2.3342533238366573e-05, + "loss": 0.0176, + "step": 72360 + }, + { + "epoch": 0.5349487005115166, + "grad_norm": 0.07816960662603378, + "learning_rate": 2.3338823599240266e-05, + "loss": 0.0179, + "step": 72370 + }, + { + "epoch": 0.5350226190828183, + "grad_norm": 0.05101162567734718, + "learning_rate": 2.3335113960113962e-05, + "loss": 0.0166, + "step": 72380 + }, + { + "epoch": 0.5350965376541202, + "grad_norm": 0.10295893996953964, + "learning_rate": 2.3331404320987654e-05, + "loss": 0.0182, + "step": 72390 + }, + { + "epoch": 0.535170456225422, + "grad_norm": 0.0677361711859703, + "learning_rate": 2.332769468186135e-05, + "loss": 0.02, + "step": 72400 + }, + { + "epoch": 0.5352443747967239, + "grad_norm": 0.08023238182067871, + "learning_rate": 2.3323985042735043e-05, + "loss": 0.0162, + "step": 72410 + }, + { + "epoch": 0.5353182933680258, + "grad_norm": 0.08963119983673096, + "learning_rate": 2.332027540360874e-05, + "loss": 0.018, + "step": 72420 + }, + { + "epoch": 0.5353922119393276, + "grad_norm": 0.08507431298494339, + "learning_rate": 2.331656576448243e-05, + "loss": 0.0187, + "step": 72430 + }, + { + "epoch": 0.5354661305106295, + "grad_norm": 0.12224476784467697, + "learning_rate": 2.3312856125356127e-05, + "loss": 0.02, + "step": 72440 + }, + { + "epoch": 0.5355400490819313, + "grad_norm": 0.0635685995221138, + "learning_rate": 2.330914648622982e-05, + "loss": 0.0165, + "step": 
72450 + }, + { + "epoch": 0.5356139676532332, + "grad_norm": 0.05425122380256653, + "learning_rate": 2.3305436847103516e-05, + "loss": 0.0177, + "step": 72460 + }, + { + "epoch": 0.535687886224535, + "grad_norm": 0.08225373178720474, + "learning_rate": 2.330172720797721e-05, + "loss": 0.017, + "step": 72470 + }, + { + "epoch": 0.5357618047958369, + "grad_norm": 0.0681522786617279, + "learning_rate": 2.32980175688509e-05, + "loss": 0.017, + "step": 72480 + }, + { + "epoch": 0.5358357233671388, + "grad_norm": 0.060225699096918106, + "learning_rate": 2.3294307929724597e-05, + "loss": 0.0161, + "step": 72490 + }, + { + "epoch": 0.5359096419384406, + "grad_norm": 0.0953868180513382, + "learning_rate": 2.3290598290598293e-05, + "loss": 0.0185, + "step": 72500 + }, + { + "epoch": 0.5359835605097425, + "grad_norm": 0.10979098081588745, + "learning_rate": 2.3286888651471986e-05, + "loss": 0.0207, + "step": 72510 + }, + { + "epoch": 0.5360574790810443, + "grad_norm": 0.10969498008489609, + "learning_rate": 2.3283179012345678e-05, + "loss": 0.0182, + "step": 72520 + }, + { + "epoch": 0.5361313976523462, + "grad_norm": 0.06657572090625763, + "learning_rate": 2.3279469373219374e-05, + "loss": 0.0179, + "step": 72530 + }, + { + "epoch": 0.536205316223648, + "grad_norm": 0.08106397092342377, + "learning_rate": 2.327575973409307e-05, + "loss": 0.0164, + "step": 72540 + }, + { + "epoch": 0.5362792347949499, + "grad_norm": 0.08693154901266098, + "learning_rate": 2.3272050094966763e-05, + "loss": 0.0187, + "step": 72550 + }, + { + "epoch": 0.5363531533662518, + "grad_norm": 0.12388740479946136, + "learning_rate": 2.3268340455840455e-05, + "loss": 0.0196, + "step": 72560 + }, + { + "epoch": 0.5364270719375536, + "grad_norm": 0.07695605605840683, + "learning_rate": 2.326463081671415e-05, + "loss": 0.0179, + "step": 72570 + }, + { + "epoch": 0.5365009905088555, + "grad_norm": 0.07933904975652695, + "learning_rate": 2.3260921177587847e-05, + "loss": 0.0168, + "step": 72580 + }, + { + "epoch": 0.5365749090801573, + "grad_norm": 0.07312914729118347, + "learning_rate": 2.325721153846154e-05, + "loss": 0.0173, + "step": 72590 + }, + { + "epoch": 0.5366488276514592, + "grad_norm": 0.06449300050735474, + "learning_rate": 2.3253501899335232e-05, + "loss": 0.0192, + "step": 72600 + }, + { + "epoch": 0.5367227462227611, + "grad_norm": 0.07357765734195709, + "learning_rate": 2.324979226020893e-05, + "loss": 0.0183, + "step": 72610 + }, + { + "epoch": 0.5367966647940628, + "grad_norm": 0.06910233199596405, + "learning_rate": 2.324608262108262e-05, + "loss": 0.0186, + "step": 72620 + }, + { + "epoch": 0.5368705833653648, + "grad_norm": 0.06882346421480179, + "learning_rate": 2.3242372981956317e-05, + "loss": 0.0181, + "step": 72630 + }, + { + "epoch": 0.5369445019366665, + "grad_norm": 0.08777708560228348, + "learning_rate": 2.323866334283001e-05, + "loss": 0.0164, + "step": 72640 + }, + { + "epoch": 0.5370184205079684, + "grad_norm": 0.07195254415273666, + "learning_rate": 2.3234953703703705e-05, + "loss": 0.0182, + "step": 72650 + }, + { + "epoch": 0.5370923390792702, + "grad_norm": 0.06659919023513794, + "learning_rate": 2.3231244064577398e-05, + "loss": 0.0167, + "step": 72660 + }, + { + "epoch": 0.5371662576505721, + "grad_norm": 0.07043921202421188, + "learning_rate": 2.3227534425451094e-05, + "loss": 0.0192, + "step": 72670 + }, + { + "epoch": 0.537240176221874, + "grad_norm": 0.061976149678230286, + "learning_rate": 2.3223824786324787e-05, + "loss": 0.0167, + "step": 72680 + }, + { + "epoch": 0.5373140947931758, + 
"grad_norm": 0.08181758224964142, + "learning_rate": 2.3220115147198483e-05, + "loss": 0.0163, + "step": 72690 + }, + { + "epoch": 0.5373880133644777, + "grad_norm": 0.07538007944822311, + "learning_rate": 2.3216405508072175e-05, + "loss": 0.0187, + "step": 72700 + }, + { + "epoch": 0.5374619319357795, + "grad_norm": 0.07163660228252411, + "learning_rate": 2.3212695868945868e-05, + "loss": 0.0183, + "step": 72710 + }, + { + "epoch": 0.5375358505070814, + "grad_norm": 0.08359681069850922, + "learning_rate": 2.3208986229819567e-05, + "loss": 0.0197, + "step": 72720 + }, + { + "epoch": 0.5376097690783832, + "grad_norm": 0.06933805346488953, + "learning_rate": 2.320527659069326e-05, + "loss": 0.0177, + "step": 72730 + }, + { + "epoch": 0.5376836876496851, + "grad_norm": 0.09602054208517075, + "learning_rate": 2.3201566951566952e-05, + "loss": 0.0184, + "step": 72740 + }, + { + "epoch": 0.537757606220987, + "grad_norm": 0.0886077880859375, + "learning_rate": 2.3197857312440645e-05, + "loss": 0.0196, + "step": 72750 + }, + { + "epoch": 0.5378315247922888, + "grad_norm": 0.08779918402433395, + "learning_rate": 2.319414767331434e-05, + "loss": 0.0186, + "step": 72760 + }, + { + "epoch": 0.5379054433635907, + "grad_norm": 0.06885647773742676, + "learning_rate": 2.3190438034188037e-05, + "loss": 0.0171, + "step": 72770 + }, + { + "epoch": 0.5379793619348925, + "grad_norm": 0.08270540088415146, + "learning_rate": 2.318672839506173e-05, + "loss": 0.0195, + "step": 72780 + }, + { + "epoch": 0.5380532805061944, + "grad_norm": 0.08925561606884003, + "learning_rate": 2.3183018755935422e-05, + "loss": 0.0164, + "step": 72790 + }, + { + "epoch": 0.5381271990774962, + "grad_norm": 0.08344494551420212, + "learning_rate": 2.3179309116809118e-05, + "loss": 0.0172, + "step": 72800 + }, + { + "epoch": 0.5382011176487981, + "grad_norm": 0.08896711468696594, + "learning_rate": 2.3175599477682814e-05, + "loss": 0.0166, + "step": 72810 + }, + { + "epoch": 0.5382750362201, + "grad_norm": 0.07909851521253586, + "learning_rate": 2.3171889838556506e-05, + "loss": 0.0154, + "step": 72820 + }, + { + "epoch": 0.5383489547914018, + "grad_norm": 0.06761901080608368, + "learning_rate": 2.31681801994302e-05, + "loss": 0.0176, + "step": 72830 + }, + { + "epoch": 0.5384228733627037, + "grad_norm": 0.09635329991579056, + "learning_rate": 2.3164470560303895e-05, + "loss": 0.017, + "step": 72840 + }, + { + "epoch": 0.5384967919340055, + "grad_norm": 0.08209284394979477, + "learning_rate": 2.3160760921177587e-05, + "loss": 0.0182, + "step": 72850 + }, + { + "epoch": 0.5385707105053074, + "grad_norm": 0.07711105048656464, + "learning_rate": 2.3157051282051283e-05, + "loss": 0.0195, + "step": 72860 + }, + { + "epoch": 0.5386446290766093, + "grad_norm": 0.07249467819929123, + "learning_rate": 2.315334164292498e-05, + "loss": 0.0167, + "step": 72870 + }, + { + "epoch": 0.538718547647911, + "grad_norm": 0.07699708640575409, + "learning_rate": 2.3149632003798672e-05, + "loss": 0.019, + "step": 72880 + }, + { + "epoch": 0.538792466219213, + "grad_norm": 0.08737131953239441, + "learning_rate": 2.3145922364672365e-05, + "loss": 0.0168, + "step": 72890 + }, + { + "epoch": 0.5388663847905147, + "grad_norm": 0.0935729444026947, + "learning_rate": 2.314221272554606e-05, + "loss": 0.0189, + "step": 72900 + }, + { + "epoch": 0.5389403033618166, + "grad_norm": 0.07913996279239655, + "learning_rate": 2.3138503086419757e-05, + "loss": 0.0166, + "step": 72910 + }, + { + "epoch": 0.5390142219331184, + "grad_norm": 0.07874778658151627, + "learning_rate": 
2.313479344729345e-05, + "loss": 0.0185, + "step": 72920 + }, + { + "epoch": 0.5390881405044203, + "grad_norm": 0.059319932013750076, + "learning_rate": 2.313108380816714e-05, + "loss": 0.0187, + "step": 72930 + }, + { + "epoch": 0.5391620590757222, + "grad_norm": 0.06820226460695267, + "learning_rate": 2.3127374169040834e-05, + "loss": 0.0194, + "step": 72940 + }, + { + "epoch": 0.539235977647024, + "grad_norm": 0.0838264748454094, + "learning_rate": 2.3123664529914534e-05, + "loss": 0.0191, + "step": 72950 + }, + { + "epoch": 0.5393098962183259, + "grad_norm": 0.08863027393817902, + "learning_rate": 2.3119954890788226e-05, + "loss": 0.0175, + "step": 72960 + }, + { + "epoch": 0.5393838147896277, + "grad_norm": 0.0833144411444664, + "learning_rate": 2.311624525166192e-05, + "loss": 0.0172, + "step": 72970 + }, + { + "epoch": 0.5394577333609296, + "grad_norm": 0.07805544883012772, + "learning_rate": 2.311253561253561e-05, + "loss": 0.0163, + "step": 72980 + }, + { + "epoch": 0.5395316519322314, + "grad_norm": 0.12435826659202576, + "learning_rate": 2.3108825973409307e-05, + "loss": 0.0194, + "step": 72990 + }, + { + "epoch": 0.5396055705035333, + "grad_norm": 0.06662425398826599, + "learning_rate": 2.3105116334283003e-05, + "loss": 0.0184, + "step": 73000 + }, + { + "epoch": 0.5396794890748352, + "grad_norm": 0.06451261788606644, + "learning_rate": 2.3101406695156696e-05, + "loss": 0.0186, + "step": 73010 + }, + { + "epoch": 0.539753407646137, + "grad_norm": 0.09885028004646301, + "learning_rate": 2.3097697056030392e-05, + "loss": 0.018, + "step": 73020 + }, + { + "epoch": 0.5398273262174389, + "grad_norm": 0.0644676461815834, + "learning_rate": 2.3093987416904084e-05, + "loss": 0.0179, + "step": 73030 + }, + { + "epoch": 0.5399012447887407, + "grad_norm": 0.07263324409723282, + "learning_rate": 2.309027777777778e-05, + "loss": 0.0189, + "step": 73040 + }, + { + "epoch": 0.5399751633600426, + "grad_norm": 0.07130710780620575, + "learning_rate": 2.3086568138651473e-05, + "loss": 0.018, + "step": 73050 + }, + { + "epoch": 0.5400490819313444, + "grad_norm": 0.1313527673482895, + "learning_rate": 2.308285849952517e-05, + "loss": 0.0164, + "step": 73060 + }, + { + "epoch": 0.5401230005026463, + "grad_norm": 0.11921053379774094, + "learning_rate": 2.307914886039886e-05, + "loss": 0.0199, + "step": 73070 + }, + { + "epoch": 0.5401969190739482, + "grad_norm": 0.071078822016716, + "learning_rate": 2.3075439221272554e-05, + "loss": 0.0165, + "step": 73080 + }, + { + "epoch": 0.54027083764525, + "grad_norm": 0.06688307225704193, + "learning_rate": 2.307172958214625e-05, + "loss": 0.0176, + "step": 73090 + }, + { + "epoch": 0.5403447562165519, + "grad_norm": 0.07644398510456085, + "learning_rate": 2.3068019943019946e-05, + "loss": 0.0185, + "step": 73100 + }, + { + "epoch": 0.5404186747878537, + "grad_norm": 0.08246079087257385, + "learning_rate": 2.306431030389364e-05, + "loss": 0.0169, + "step": 73110 + }, + { + "epoch": 0.5404925933591556, + "grad_norm": 0.08127640187740326, + "learning_rate": 2.306060066476733e-05, + "loss": 0.0193, + "step": 73120 + }, + { + "epoch": 0.5405665119304575, + "grad_norm": 0.058930739760398865, + "learning_rate": 2.3056891025641027e-05, + "loss": 0.0174, + "step": 73130 + }, + { + "epoch": 0.5406404305017593, + "grad_norm": 0.09932822734117508, + "learning_rate": 2.3053181386514723e-05, + "loss": 0.0159, + "step": 73140 + }, + { + "epoch": 0.5407143490730612, + "grad_norm": 0.07006815075874329, + "learning_rate": 2.3049471747388416e-05, + "loss": 0.0179, + "step": 
73150 + }, + { + "epoch": 0.5407882676443629, + "grad_norm": 0.06904808431863785, + "learning_rate": 2.3045762108262108e-05, + "loss": 0.0206, + "step": 73160 + }, + { + "epoch": 0.5408621862156648, + "grad_norm": 0.0655849501490593, + "learning_rate": 2.30420524691358e-05, + "loss": 0.0176, + "step": 73170 + }, + { + "epoch": 0.5409361047869666, + "grad_norm": 0.11681249737739563, + "learning_rate": 2.30383428300095e-05, + "loss": 0.0182, + "step": 73180 + }, + { + "epoch": 0.5410100233582685, + "grad_norm": 0.08647429943084717, + "learning_rate": 2.3034633190883193e-05, + "loss": 0.0184, + "step": 73190 + }, + { + "epoch": 0.5410839419295704, + "grad_norm": 0.11153209954500198, + "learning_rate": 2.3030923551756885e-05, + "loss": 0.0198, + "step": 73200 + }, + { + "epoch": 0.5411578605008722, + "grad_norm": 0.07431022077798843, + "learning_rate": 2.302721391263058e-05, + "loss": 0.0156, + "step": 73210 + }, + { + "epoch": 0.5412317790721741, + "grad_norm": 0.08188775926828384, + "learning_rate": 2.3023504273504274e-05, + "loss": 0.0171, + "step": 73220 + }, + { + "epoch": 0.5413056976434759, + "grad_norm": 0.07274893671274185, + "learning_rate": 2.301979463437797e-05, + "loss": 0.0163, + "step": 73230 + }, + { + "epoch": 0.5413796162147778, + "grad_norm": 0.1031305342912674, + "learning_rate": 2.3016084995251662e-05, + "loss": 0.0172, + "step": 73240 + }, + { + "epoch": 0.5414535347860796, + "grad_norm": 0.06862304359674454, + "learning_rate": 2.301237535612536e-05, + "loss": 0.0178, + "step": 73250 + }, + { + "epoch": 0.5415274533573815, + "grad_norm": 0.06975569576025009, + "learning_rate": 2.300866571699905e-05, + "loss": 0.0178, + "step": 73260 + }, + { + "epoch": 0.5416013719286834, + "grad_norm": 0.08459268510341644, + "learning_rate": 2.3004956077872747e-05, + "loss": 0.0165, + "step": 73270 + }, + { + "epoch": 0.5416752904999852, + "grad_norm": 0.07779370993375778, + "learning_rate": 2.300124643874644e-05, + "loss": 0.0167, + "step": 73280 + }, + { + "epoch": 0.5417492090712871, + "grad_norm": 0.05710046738386154, + "learning_rate": 2.2997536799620135e-05, + "loss": 0.0166, + "step": 73290 + }, + { + "epoch": 0.5418231276425889, + "grad_norm": 0.07504494488239288, + "learning_rate": 2.2993827160493828e-05, + "loss": 0.0149, + "step": 73300 + }, + { + "epoch": 0.5418970462138908, + "grad_norm": 0.07329048216342926, + "learning_rate": 2.299011752136752e-05, + "loss": 0.0172, + "step": 73310 + }, + { + "epoch": 0.5419709647851926, + "grad_norm": 0.06944195926189423, + "learning_rate": 2.2986407882241217e-05, + "loss": 0.0191, + "step": 73320 + }, + { + "epoch": 0.5420448833564945, + "grad_norm": 0.07709192484617233, + "learning_rate": 2.2982698243114913e-05, + "loss": 0.0163, + "step": 73330 + }, + { + "epoch": 0.5421188019277964, + "grad_norm": 0.09037928283214569, + "learning_rate": 2.2978988603988605e-05, + "loss": 0.0161, + "step": 73340 + }, + { + "epoch": 0.5421927204990982, + "grad_norm": 0.1045730784535408, + "learning_rate": 2.2975278964862298e-05, + "loss": 0.0183, + "step": 73350 + }, + { + "epoch": 0.5422666390704001, + "grad_norm": 0.08890021592378616, + "learning_rate": 2.2971569325735994e-05, + "loss": 0.0198, + "step": 73360 + }, + { + "epoch": 0.5423405576417019, + "grad_norm": 0.08284857124090195, + "learning_rate": 2.296785968660969e-05, + "loss": 0.0196, + "step": 73370 + }, + { + "epoch": 0.5424144762130038, + "grad_norm": 0.09294552356004715, + "learning_rate": 2.2964150047483382e-05, + "loss": 0.0199, + "step": 73380 + }, + { + "epoch": 0.5424883947843057, + 
"grad_norm": 0.07610155642032623, + "learning_rate": 2.2960440408357075e-05, + "loss": 0.0185, + "step": 73390 + }, + { + "epoch": 0.5425623133556075, + "grad_norm": 0.07990909367799759, + "learning_rate": 2.295673076923077e-05, + "loss": 0.0185, + "step": 73400 + }, + { + "epoch": 0.5426362319269094, + "grad_norm": 0.08958449959754944, + "learning_rate": 2.2953021130104467e-05, + "loss": 0.0158, + "step": 73410 + }, + { + "epoch": 0.5427101504982111, + "grad_norm": 0.07816076278686523, + "learning_rate": 2.294931149097816e-05, + "loss": 0.0187, + "step": 73420 + }, + { + "epoch": 0.542784069069513, + "grad_norm": 0.05900447815656662, + "learning_rate": 2.2945601851851852e-05, + "loss": 0.0175, + "step": 73430 + }, + { + "epoch": 0.5428579876408148, + "grad_norm": 0.0727897435426712, + "learning_rate": 2.2941892212725548e-05, + "loss": 0.0194, + "step": 73440 + }, + { + "epoch": 0.5429319062121167, + "grad_norm": 0.10008365660905838, + "learning_rate": 2.293818257359924e-05, + "loss": 0.0181, + "step": 73450 + }, + { + "epoch": 0.5430058247834186, + "grad_norm": 0.07405528426170349, + "learning_rate": 2.2934472934472936e-05, + "loss": 0.0165, + "step": 73460 + }, + { + "epoch": 0.5430797433547204, + "grad_norm": 0.0775710791349411, + "learning_rate": 2.293076329534663e-05, + "loss": 0.0171, + "step": 73470 + }, + { + "epoch": 0.5431536619260223, + "grad_norm": 0.07803450524806976, + "learning_rate": 2.2927053656220325e-05, + "loss": 0.0155, + "step": 73480 + }, + { + "epoch": 0.5432275804973241, + "grad_norm": 0.0809224396944046, + "learning_rate": 2.2923344017094018e-05, + "loss": 0.0186, + "step": 73490 + }, + { + "epoch": 0.543301499068626, + "grad_norm": 0.10041411966085434, + "learning_rate": 2.2919634377967714e-05, + "loss": 0.0174, + "step": 73500 + }, + { + "epoch": 0.5433754176399278, + "grad_norm": 0.07222852855920792, + "learning_rate": 2.2915924738841406e-05, + "loss": 0.0213, + "step": 73510 + }, + { + "epoch": 0.5434493362112297, + "grad_norm": 0.09778133034706116, + "learning_rate": 2.2912215099715102e-05, + "loss": 0.02, + "step": 73520 + }, + { + "epoch": 0.5435232547825316, + "grad_norm": 0.07030012458562851, + "learning_rate": 2.2908505460588795e-05, + "loss": 0.0199, + "step": 73530 + }, + { + "epoch": 0.5435971733538334, + "grad_norm": 0.06923951208591461, + "learning_rate": 2.2904795821462487e-05, + "loss": 0.0184, + "step": 73540 + }, + { + "epoch": 0.5436710919251353, + "grad_norm": 0.08172609657049179, + "learning_rate": 2.2901086182336183e-05, + "loss": 0.0168, + "step": 73550 + }, + { + "epoch": 0.5437450104964371, + "grad_norm": 0.07019809633493423, + "learning_rate": 2.289737654320988e-05, + "loss": 0.0181, + "step": 73560 + }, + { + "epoch": 0.543818929067739, + "grad_norm": 0.08588024228811264, + "learning_rate": 2.2893666904083572e-05, + "loss": 0.0191, + "step": 73570 + }, + { + "epoch": 0.5438928476390408, + "grad_norm": 0.07677468657493591, + "learning_rate": 2.2889957264957264e-05, + "loss": 0.0174, + "step": 73580 + }, + { + "epoch": 0.5439667662103427, + "grad_norm": 0.07189089059829712, + "learning_rate": 2.288624762583096e-05, + "loss": 0.0161, + "step": 73590 + }, + { + "epoch": 0.5440406847816446, + "grad_norm": 0.0762021616101265, + "learning_rate": 2.2882537986704656e-05, + "loss": 0.0164, + "step": 73600 + }, + { + "epoch": 0.5441146033529464, + "grad_norm": 0.08038612455129623, + "learning_rate": 2.287882834757835e-05, + "loss": 0.0207, + "step": 73610 + }, + { + "epoch": 0.5441885219242483, + "grad_norm": 0.07729607820510864, + 
"learning_rate": 2.287511870845204e-05, + "loss": 0.0187, + "step": 73620 + }, + { + "epoch": 0.5442624404955501, + "grad_norm": 0.0560038797557354, + "learning_rate": 2.2871409069325737e-05, + "loss": 0.0181, + "step": 73630 + }, + { + "epoch": 0.544336359066852, + "grad_norm": 0.07387551665306091, + "learning_rate": 2.2867699430199433e-05, + "loss": 0.0166, + "step": 73640 + }, + { + "epoch": 0.5444102776381539, + "grad_norm": 0.06997978687286377, + "learning_rate": 2.2863989791073126e-05, + "loss": 0.0182, + "step": 73650 + }, + { + "epoch": 0.5444841962094557, + "grad_norm": 0.08107759058475494, + "learning_rate": 2.286028015194682e-05, + "loss": 0.0179, + "step": 73660 + }, + { + "epoch": 0.5445581147807576, + "grad_norm": 0.07450973242521286, + "learning_rate": 2.2856570512820514e-05, + "loss": 0.0161, + "step": 73670 + }, + { + "epoch": 0.5446320333520593, + "grad_norm": 0.058085158467292786, + "learning_rate": 2.2852860873694207e-05, + "loss": 0.0188, + "step": 73680 + }, + { + "epoch": 0.5447059519233612, + "grad_norm": 0.099921815097332, + "learning_rate": 2.2849151234567903e-05, + "loss": 0.0175, + "step": 73690 + }, + { + "epoch": 0.544779870494663, + "grad_norm": 0.09356129914522171, + "learning_rate": 2.2845441595441596e-05, + "loss": 0.0195, + "step": 73700 + }, + { + "epoch": 0.5448537890659649, + "grad_norm": 0.07478101551532745, + "learning_rate": 2.284173195631529e-05, + "loss": 0.0169, + "step": 73710 + }, + { + "epoch": 0.5449277076372668, + "grad_norm": 0.06617410480976105, + "learning_rate": 2.2838022317188984e-05, + "loss": 0.0183, + "step": 73720 + }, + { + "epoch": 0.5450016262085686, + "grad_norm": 0.11164570599794388, + "learning_rate": 2.283431267806268e-05, + "loss": 0.0171, + "step": 73730 + }, + { + "epoch": 0.5450755447798705, + "grad_norm": 0.06951144337654114, + "learning_rate": 2.2830603038936373e-05, + "loss": 0.0176, + "step": 73740 + }, + { + "epoch": 0.5451494633511723, + "grad_norm": 0.09214255213737488, + "learning_rate": 2.282689339981007e-05, + "loss": 0.0182, + "step": 73750 + }, + { + "epoch": 0.5452233819224742, + "grad_norm": 0.07372793555259705, + "learning_rate": 2.282318376068376e-05, + "loss": 0.0175, + "step": 73760 + }, + { + "epoch": 0.545297300493776, + "grad_norm": 0.06709369271993637, + "learning_rate": 2.2819474121557454e-05, + "loss": 0.0156, + "step": 73770 + }, + { + "epoch": 0.5453712190650779, + "grad_norm": 0.08580457419157028, + "learning_rate": 2.281576448243115e-05, + "loss": 0.0197, + "step": 73780 + }, + { + "epoch": 0.5454451376363798, + "grad_norm": 0.08991753309965134, + "learning_rate": 2.2812054843304846e-05, + "loss": 0.0199, + "step": 73790 + }, + { + "epoch": 0.5455190562076816, + "grad_norm": 0.08562833070755005, + "learning_rate": 2.280834520417854e-05, + "loss": 0.0172, + "step": 73800 + }, + { + "epoch": 0.5455929747789835, + "grad_norm": 0.0725017562508583, + "learning_rate": 2.280463556505223e-05, + "loss": 0.0185, + "step": 73810 + }, + { + "epoch": 0.5456668933502853, + "grad_norm": 0.10185367614030838, + "learning_rate": 2.2800925925925927e-05, + "loss": 0.0185, + "step": 73820 + }, + { + "epoch": 0.5457408119215872, + "grad_norm": 0.07755794376134872, + "learning_rate": 2.2797216286799623e-05, + "loss": 0.0187, + "step": 73830 + }, + { + "epoch": 0.5458147304928891, + "grad_norm": 0.058865927159786224, + "learning_rate": 2.2793506647673315e-05, + "loss": 0.0156, + "step": 73840 + }, + { + "epoch": 0.5458886490641909, + "grad_norm": 0.08337011933326721, + "learning_rate": 2.2789797008547008e-05, + 
"loss": 0.0201, + "step": 73850 + }, + { + "epoch": 0.5459625676354928, + "grad_norm": 0.08722520619630814, + "learning_rate": 2.2786087369420704e-05, + "loss": 0.0177, + "step": 73860 + }, + { + "epoch": 0.5460364862067946, + "grad_norm": 0.07942160218954086, + "learning_rate": 2.27823777302944e-05, + "loss": 0.0162, + "step": 73870 + }, + { + "epoch": 0.5461104047780965, + "grad_norm": 0.0768059492111206, + "learning_rate": 2.2778668091168093e-05, + "loss": 0.0183, + "step": 73880 + }, + { + "epoch": 0.5461843233493983, + "grad_norm": 0.07833820581436157, + "learning_rate": 2.2774958452041785e-05, + "loss": 0.0207, + "step": 73890 + }, + { + "epoch": 0.5462582419207002, + "grad_norm": 0.07780931890010834, + "learning_rate": 2.277124881291548e-05, + "loss": 0.0161, + "step": 73900 + }, + { + "epoch": 0.5463321604920021, + "grad_norm": 0.07057272642850876, + "learning_rate": 2.2767539173789174e-05, + "loss": 0.0174, + "step": 73910 + }, + { + "epoch": 0.5464060790633039, + "grad_norm": 0.08263225853443146, + "learning_rate": 2.276382953466287e-05, + "loss": 0.0197, + "step": 73920 + }, + { + "epoch": 0.5464799976346058, + "grad_norm": 0.07716953754425049, + "learning_rate": 2.2760119895536562e-05, + "loss": 0.0182, + "step": 73930 + }, + { + "epoch": 0.5465539162059075, + "grad_norm": 0.06758254766464233, + "learning_rate": 2.2756410256410258e-05, + "loss": 0.0186, + "step": 73940 + }, + { + "epoch": 0.5466278347772094, + "grad_norm": 0.06011701002717018, + "learning_rate": 2.275270061728395e-05, + "loss": 0.0169, + "step": 73950 + }, + { + "epoch": 0.5467017533485112, + "grad_norm": 0.09424585103988647, + "learning_rate": 2.2748990978157647e-05, + "loss": 0.018, + "step": 73960 + }, + { + "epoch": 0.5467756719198131, + "grad_norm": 0.07628542184829712, + "learning_rate": 2.274528133903134e-05, + "loss": 0.0177, + "step": 73970 + }, + { + "epoch": 0.546849590491115, + "grad_norm": 0.08188623934984207, + "learning_rate": 2.2741571699905035e-05, + "loss": 0.0183, + "step": 73980 + }, + { + "epoch": 0.5469235090624168, + "grad_norm": 0.07078612595796585, + "learning_rate": 2.2737862060778728e-05, + "loss": 0.0213, + "step": 73990 + }, + { + "epoch": 0.5469974276337187, + "grad_norm": 0.08338020741939545, + "learning_rate": 2.273415242165242e-05, + "loss": 0.0193, + "step": 74000 + }, + { + "epoch": 0.5470713462050205, + "grad_norm": 0.0826638787984848, + "learning_rate": 2.2730442782526116e-05, + "loss": 0.0198, + "step": 74010 + }, + { + "epoch": 0.5471452647763224, + "grad_norm": 0.07347635924816132, + "learning_rate": 2.2726733143399812e-05, + "loss": 0.017, + "step": 74020 + }, + { + "epoch": 0.5472191833476242, + "grad_norm": 0.06849198788404465, + "learning_rate": 2.2723023504273505e-05, + "loss": 0.017, + "step": 74030 + }, + { + "epoch": 0.5472931019189261, + "grad_norm": 0.07767181843519211, + "learning_rate": 2.2719313865147197e-05, + "loss": 0.019, + "step": 74040 + }, + { + "epoch": 0.547367020490228, + "grad_norm": 0.08283500373363495, + "learning_rate": 2.2715604226020893e-05, + "loss": 0.0183, + "step": 74050 + }, + { + "epoch": 0.5474409390615298, + "grad_norm": 0.07500879466533661, + "learning_rate": 2.271189458689459e-05, + "loss": 0.0184, + "step": 74060 + }, + { + "epoch": 0.5475148576328317, + "grad_norm": 0.07654564082622528, + "learning_rate": 2.2708184947768282e-05, + "loss": 0.0202, + "step": 74070 + }, + { + "epoch": 0.5475887762041335, + "grad_norm": 0.07483880966901779, + "learning_rate": 2.2704475308641975e-05, + "loss": 0.0186, + "step": 74080 + }, + { + "epoch": 
0.5476626947754354, + "grad_norm": 0.05762667953968048, + "learning_rate": 2.270076566951567e-05, + "loss": 0.0163, + "step": 74090 + }, + { + "epoch": 0.5477366133467373, + "grad_norm": 0.08308656513690948, + "learning_rate": 2.2697056030389367e-05, + "loss": 0.0178, + "step": 74100 + }, + { + "epoch": 0.5478105319180391, + "grad_norm": 0.09272373467683792, + "learning_rate": 2.269334639126306e-05, + "loss": 0.0185, + "step": 74110 + }, + { + "epoch": 0.547884450489341, + "grad_norm": 0.09792761504650116, + "learning_rate": 2.268963675213675e-05, + "loss": 0.0191, + "step": 74120 + }, + { + "epoch": 0.5479583690606428, + "grad_norm": 0.08621697872877121, + "learning_rate": 2.2685927113010448e-05, + "loss": 0.0173, + "step": 74130 + }, + { + "epoch": 0.5480322876319447, + "grad_norm": 0.07154551148414612, + "learning_rate": 2.268221747388414e-05, + "loss": 0.0176, + "step": 74140 + }, + { + "epoch": 0.5481062062032465, + "grad_norm": 0.06855598092079163, + "learning_rate": 2.2678507834757836e-05, + "loss": 0.0227, + "step": 74150 + }, + { + "epoch": 0.5481801247745484, + "grad_norm": 0.07203419506549835, + "learning_rate": 2.267479819563153e-05, + "loss": 0.0174, + "step": 74160 + }, + { + "epoch": 0.5482540433458503, + "grad_norm": 0.07480933517217636, + "learning_rate": 2.2671088556505225e-05, + "loss": 0.0178, + "step": 74170 + }, + { + "epoch": 0.548327961917152, + "grad_norm": 0.08844535052776337, + "learning_rate": 2.2667378917378917e-05, + "loss": 0.0178, + "step": 74180 + }, + { + "epoch": 0.548401880488454, + "grad_norm": 0.08185859024524689, + "learning_rate": 2.2663669278252613e-05, + "loss": 0.0163, + "step": 74190 + }, + { + "epoch": 0.5484757990597557, + "grad_norm": 0.07496652007102966, + "learning_rate": 2.2659959639126306e-05, + "loss": 0.0185, + "step": 74200 + }, + { + "epoch": 0.5485497176310576, + "grad_norm": 0.06973458081483841, + "learning_rate": 2.2656250000000002e-05, + "loss": 0.0155, + "step": 74210 + }, + { + "epoch": 0.5486236362023594, + "grad_norm": 0.08845594525337219, + "learning_rate": 2.2652540360873694e-05, + "loss": 0.0169, + "step": 74220 + }, + { + "epoch": 0.5486975547736613, + "grad_norm": 0.051325857639312744, + "learning_rate": 2.2648830721747387e-05, + "loss": 0.0168, + "step": 74230 + }, + { + "epoch": 0.5487714733449632, + "grad_norm": 0.0844346433877945, + "learning_rate": 2.2645121082621086e-05, + "loss": 0.0174, + "step": 74240 + }, + { + "epoch": 0.548845391916265, + "grad_norm": 0.06688199937343597, + "learning_rate": 2.264141144349478e-05, + "loss": 0.0188, + "step": 74250 + }, + { + "epoch": 0.5489193104875669, + "grad_norm": 0.07292843610048294, + "learning_rate": 2.263770180436847e-05, + "loss": 0.0176, + "step": 74260 + }, + { + "epoch": 0.5489932290588687, + "grad_norm": 0.10604050010442734, + "learning_rate": 2.2633992165242164e-05, + "loss": 0.0182, + "step": 74270 + }, + { + "epoch": 0.5490671476301706, + "grad_norm": 0.07840543985366821, + "learning_rate": 2.2630282526115863e-05, + "loss": 0.0172, + "step": 74280 + }, + { + "epoch": 0.5491410662014724, + "grad_norm": 0.12083763629198074, + "learning_rate": 2.2626572886989556e-05, + "loss": 0.0185, + "step": 74290 + }, + { + "epoch": 0.5492149847727743, + "grad_norm": 0.05159150809049606, + "learning_rate": 2.262286324786325e-05, + "loss": 0.0174, + "step": 74300 + }, + { + "epoch": 0.5492889033440762, + "grad_norm": 0.10930929332971573, + "learning_rate": 2.261915360873694e-05, + "loss": 0.0199, + "step": 74310 + }, + { + "epoch": 0.549362821915378, + "grad_norm": 
0.060169853270053864, + "learning_rate": 2.2615443969610637e-05, + "loss": 0.0169, + "step": 74320 + }, + { + "epoch": 0.5494367404866799, + "grad_norm": 0.08198923617601395, + "learning_rate": 2.2611734330484333e-05, + "loss": 0.0182, + "step": 74330 + }, + { + "epoch": 0.5495106590579817, + "grad_norm": 0.0873372033238411, + "learning_rate": 2.2608024691358026e-05, + "loss": 0.019, + "step": 74340 + }, + { + "epoch": 0.5495845776292836, + "grad_norm": 0.08752260357141495, + "learning_rate": 2.2604315052231718e-05, + "loss": 0.0167, + "step": 74350 + }, + { + "epoch": 0.5496584962005855, + "grad_norm": 0.09364860504865646, + "learning_rate": 2.2600605413105414e-05, + "loss": 0.0172, + "step": 74360 + }, + { + "epoch": 0.5497324147718873, + "grad_norm": 0.061986666172742844, + "learning_rate": 2.2596895773979107e-05, + "loss": 0.0159, + "step": 74370 + }, + { + "epoch": 0.5498063333431892, + "grad_norm": 0.07798568159341812, + "learning_rate": 2.2593186134852803e-05, + "loss": 0.0157, + "step": 74380 + }, + { + "epoch": 0.549880251914491, + "grad_norm": 0.05340861156582832, + "learning_rate": 2.25894764957265e-05, + "loss": 0.0172, + "step": 74390 + }, + { + "epoch": 0.5499541704857929, + "grad_norm": 0.10631144791841507, + "learning_rate": 2.258576685660019e-05, + "loss": 0.0169, + "step": 74400 + }, + { + "epoch": 0.5500280890570947, + "grad_norm": 0.06307753175497055, + "learning_rate": 2.2582057217473884e-05, + "loss": 0.0183, + "step": 74410 + }, + { + "epoch": 0.5501020076283966, + "grad_norm": 0.10609875619411469, + "learning_rate": 2.257834757834758e-05, + "loss": 0.0212, + "step": 74420 + }, + { + "epoch": 0.5501759261996985, + "grad_norm": 0.09723684936761856, + "learning_rate": 2.2574637939221276e-05, + "loss": 0.0182, + "step": 74430 + }, + { + "epoch": 0.5502498447710003, + "grad_norm": 0.07827954739332199, + "learning_rate": 2.257092830009497e-05, + "loss": 0.0153, + "step": 74440 + }, + { + "epoch": 0.5503237633423022, + "grad_norm": 0.05574478209018707, + "learning_rate": 2.256721866096866e-05, + "loss": 0.0163, + "step": 74450 + }, + { + "epoch": 0.5503976819136039, + "grad_norm": 0.06514042615890503, + "learning_rate": 2.2563509021842354e-05, + "loss": 0.0178, + "step": 74460 + }, + { + "epoch": 0.5504716004849058, + "grad_norm": 0.06927794218063354, + "learning_rate": 2.2559799382716053e-05, + "loss": 0.0174, + "step": 74470 + }, + { + "epoch": 0.5505455190562076, + "grad_norm": 0.08658149838447571, + "learning_rate": 2.2556089743589746e-05, + "loss": 0.0178, + "step": 74480 + }, + { + "epoch": 0.5506194376275095, + "grad_norm": 0.059816401451826096, + "learning_rate": 2.2552380104463438e-05, + "loss": 0.0199, + "step": 74490 + }, + { + "epoch": 0.5506933561988114, + "grad_norm": 0.07856094092130661, + "learning_rate": 2.254867046533713e-05, + "loss": 0.0181, + "step": 74500 + }, + { + "epoch": 0.5507672747701132, + "grad_norm": 0.09727056324481964, + "learning_rate": 2.254496082621083e-05, + "loss": 0.0192, + "step": 74510 + }, + { + "epoch": 0.5508411933414151, + "grad_norm": 0.07374399900436401, + "learning_rate": 2.2541251187084523e-05, + "loss": 0.0182, + "step": 74520 + }, + { + "epoch": 0.5509151119127169, + "grad_norm": 0.08984479308128357, + "learning_rate": 2.2537541547958215e-05, + "loss": 0.0176, + "step": 74530 + }, + { + "epoch": 0.5509890304840188, + "grad_norm": 0.08314557373523712, + "learning_rate": 2.2533831908831908e-05, + "loss": 0.0157, + "step": 74540 + }, + { + "epoch": 0.5510629490553206, + "grad_norm": 0.09913858026266098, + "learning_rate": 
2.2530122269705604e-05, + "loss": 0.0178, + "step": 74550 + }, + { + "epoch": 0.5511368676266225, + "grad_norm": 0.06641694903373718, + "learning_rate": 2.25264126305793e-05, + "loss": 0.0175, + "step": 74560 + }, + { + "epoch": 0.5512107861979244, + "grad_norm": 0.07155690342187881, + "learning_rate": 2.2522702991452992e-05, + "loss": 0.0207, + "step": 74570 + }, + { + "epoch": 0.5512847047692262, + "grad_norm": 0.07747691869735718, + "learning_rate": 2.2518993352326688e-05, + "loss": 0.0179, + "step": 74580 + }, + { + "epoch": 0.5513586233405281, + "grad_norm": 0.06510493904352188, + "learning_rate": 2.251528371320038e-05, + "loss": 0.0166, + "step": 74590 + }, + { + "epoch": 0.5514325419118299, + "grad_norm": 0.09934507310390472, + "learning_rate": 2.2511574074074073e-05, + "loss": 0.0167, + "step": 74600 + }, + { + "epoch": 0.5515064604831318, + "grad_norm": 0.06525859236717224, + "learning_rate": 2.250786443494777e-05, + "loss": 0.0184, + "step": 74610 + }, + { + "epoch": 0.5515803790544337, + "grad_norm": 0.104197658598423, + "learning_rate": 2.2504154795821465e-05, + "loss": 0.0185, + "step": 74620 + }, + { + "epoch": 0.5516542976257355, + "grad_norm": 0.0662267878651619, + "learning_rate": 2.2500445156695158e-05, + "loss": 0.0181, + "step": 74630 + }, + { + "epoch": 0.5517282161970374, + "grad_norm": 0.07721276581287384, + "learning_rate": 2.249673551756885e-05, + "loss": 0.0181, + "step": 74640 + }, + { + "epoch": 0.5518021347683392, + "grad_norm": 0.09885270148515701, + "learning_rate": 2.2493025878442546e-05, + "loss": 0.0176, + "step": 74650 + }, + { + "epoch": 0.5518760533396411, + "grad_norm": 0.0868011862039566, + "learning_rate": 2.2489316239316242e-05, + "loss": 0.0174, + "step": 74660 + }, + { + "epoch": 0.5519499719109429, + "grad_norm": 0.07308496534824371, + "learning_rate": 2.2485606600189935e-05, + "loss": 0.0175, + "step": 74670 + }, + { + "epoch": 0.5520238904822448, + "grad_norm": 0.08916335552930832, + "learning_rate": 2.2481896961063628e-05, + "loss": 0.0166, + "step": 74680 + }, + { + "epoch": 0.5520978090535467, + "grad_norm": 0.08002009242773056, + "learning_rate": 2.247818732193732e-05, + "loss": 0.0164, + "step": 74690 + }, + { + "epoch": 0.5521717276248485, + "grad_norm": 0.08925087749958038, + "learning_rate": 2.247447768281102e-05, + "loss": 0.0172, + "step": 74700 + }, + { + "epoch": 0.5522456461961504, + "grad_norm": 0.061949167400598526, + "learning_rate": 2.2470768043684712e-05, + "loss": 0.0145, + "step": 74710 + }, + { + "epoch": 0.5523195647674521, + "grad_norm": 0.10795128345489502, + "learning_rate": 2.2467058404558405e-05, + "loss": 0.016, + "step": 74720 + }, + { + "epoch": 0.552393483338754, + "grad_norm": 0.15623247623443604, + "learning_rate": 2.24633487654321e-05, + "loss": 0.018, + "step": 74730 + }, + { + "epoch": 0.5524674019100558, + "grad_norm": 0.08144319802522659, + "learning_rate": 2.2459639126305797e-05, + "loss": 0.0195, + "step": 74740 + }, + { + "epoch": 0.5525413204813577, + "grad_norm": 0.09133787453174591, + "learning_rate": 2.245592948717949e-05, + "loss": 0.0188, + "step": 74750 + }, + { + "epoch": 0.5526152390526596, + "grad_norm": 0.06813662499189377, + "learning_rate": 2.2452219848053182e-05, + "loss": 0.0159, + "step": 74760 + }, + { + "epoch": 0.5526891576239614, + "grad_norm": 0.10573530197143555, + "learning_rate": 2.2448510208926878e-05, + "loss": 0.0195, + "step": 74770 + }, + { + "epoch": 0.5527630761952633, + "grad_norm": 0.09004482626914978, + "learning_rate": 2.244480056980057e-05, + "loss": 0.021, + "step": 
74780 + }, + { + "epoch": 0.5528369947665651, + "grad_norm": 0.06961522996425629, + "learning_rate": 2.2441090930674266e-05, + "loss": 0.0175, + "step": 74790 + }, + { + "epoch": 0.552910913337867, + "grad_norm": 0.10666133463382721, + "learning_rate": 2.243738129154796e-05, + "loss": 0.0193, + "step": 74800 + }, + { + "epoch": 0.5529848319091688, + "grad_norm": 0.07972019910812378, + "learning_rate": 2.2433671652421655e-05, + "loss": 0.0165, + "step": 74810 + }, + { + "epoch": 0.5530587504804707, + "grad_norm": 0.07946088165044785, + "learning_rate": 2.2429962013295347e-05, + "loss": 0.0168, + "step": 74820 + }, + { + "epoch": 0.5531326690517726, + "grad_norm": 0.07809576392173767, + "learning_rate": 2.242625237416904e-05, + "loss": 0.0184, + "step": 74830 + }, + { + "epoch": 0.5532065876230744, + "grad_norm": 0.12295723706483841, + "learning_rate": 2.2422542735042736e-05, + "loss": 0.0182, + "step": 74840 + }, + { + "epoch": 0.5532805061943763, + "grad_norm": 0.06109769642353058, + "learning_rate": 2.2418833095916432e-05, + "loss": 0.0174, + "step": 74850 + }, + { + "epoch": 0.5533544247656781, + "grad_norm": 0.07381372898817062, + "learning_rate": 2.2415123456790124e-05, + "loss": 0.0179, + "step": 74860 + }, + { + "epoch": 0.55342834333698, + "grad_norm": 0.0744849368929863, + "learning_rate": 2.2411413817663817e-05, + "loss": 0.0189, + "step": 74870 + }, + { + "epoch": 0.5535022619082819, + "grad_norm": 0.07806837558746338, + "learning_rate": 2.2407704178537513e-05, + "loss": 0.0184, + "step": 74880 + }, + { + "epoch": 0.5535761804795837, + "grad_norm": 0.0611063651740551, + "learning_rate": 2.240399453941121e-05, + "loss": 0.017, + "step": 74890 + }, + { + "epoch": 0.5536500990508856, + "grad_norm": 0.09074815362691879, + "learning_rate": 2.24002849002849e-05, + "loss": 0.0177, + "step": 74900 + }, + { + "epoch": 0.5537240176221874, + "grad_norm": 0.05981917679309845, + "learning_rate": 2.2396575261158594e-05, + "loss": 0.0173, + "step": 74910 + }, + { + "epoch": 0.5537979361934893, + "grad_norm": 0.06573251634836197, + "learning_rate": 2.239286562203229e-05, + "loss": 0.0155, + "step": 74920 + }, + { + "epoch": 0.5538718547647911, + "grad_norm": 0.0894307941198349, + "learning_rate": 2.2389155982905986e-05, + "loss": 0.0177, + "step": 74930 + }, + { + "epoch": 0.553945773336093, + "grad_norm": 0.07103675603866577, + "learning_rate": 2.238544634377968e-05, + "loss": 0.0178, + "step": 74940 + }, + { + "epoch": 0.5540196919073949, + "grad_norm": 0.06956657022237778, + "learning_rate": 2.238173670465337e-05, + "loss": 0.0203, + "step": 74950 + }, + { + "epoch": 0.5540936104786967, + "grad_norm": 0.08880143612623215, + "learning_rate": 2.2378027065527067e-05, + "loss": 0.0166, + "step": 74960 + }, + { + "epoch": 0.5541675290499986, + "grad_norm": 0.07951267808675766, + "learning_rate": 2.2374317426400763e-05, + "loss": 0.017, + "step": 74970 + }, + { + "epoch": 0.5542414476213003, + "grad_norm": 0.08745020627975464, + "learning_rate": 2.2370607787274456e-05, + "loss": 0.0166, + "step": 74980 + }, + { + "epoch": 0.5543153661926022, + "grad_norm": 0.08782682567834854, + "learning_rate": 2.236689814814815e-05, + "loss": 0.0185, + "step": 74990 + }, + { + "epoch": 0.554389284763904, + "grad_norm": 0.09408223628997803, + "learning_rate": 2.2363188509021844e-05, + "loss": 0.0176, + "step": 75000 + }, + { + "epoch": 0.5544632033352059, + "grad_norm": 0.0643104761838913, + "learning_rate": 2.2359478869895537e-05, + "loss": 0.0182, + "step": 75010 + }, + { + "epoch": 0.5545371219065078, + 
"grad_norm": 0.08297935128211975, + "learning_rate": 2.2355769230769233e-05, + "loss": 0.0201, + "step": 75020 + }, + { + "epoch": 0.5546110404778096, + "grad_norm": 0.08416923135519028, + "learning_rate": 2.2352059591642925e-05, + "loss": 0.016, + "step": 75030 + }, + { + "epoch": 0.5546849590491115, + "grad_norm": 0.09272485226392746, + "learning_rate": 2.234834995251662e-05, + "loss": 0.018, + "step": 75040 + }, + { + "epoch": 0.5547588776204133, + "grad_norm": 0.06990412622690201, + "learning_rate": 2.2344640313390314e-05, + "loss": 0.0199, + "step": 75050 + }, + { + "epoch": 0.5548327961917152, + "grad_norm": 0.08628682792186737, + "learning_rate": 2.2340930674264007e-05, + "loss": 0.0185, + "step": 75060 + }, + { + "epoch": 0.554906714763017, + "grad_norm": 0.06135695427656174, + "learning_rate": 2.2337221035137703e-05, + "loss": 0.0183, + "step": 75070 + }, + { + "epoch": 0.5549806333343189, + "grad_norm": 0.07100776582956314, + "learning_rate": 2.23335113960114e-05, + "loss": 0.018, + "step": 75080 + }, + { + "epoch": 0.5550545519056208, + "grad_norm": 0.08598551154136658, + "learning_rate": 2.232980175688509e-05, + "loss": 0.0197, + "step": 75090 + }, + { + "epoch": 0.5551284704769226, + "grad_norm": 0.09556099772453308, + "learning_rate": 2.2326092117758784e-05, + "loss": 0.0178, + "step": 75100 + }, + { + "epoch": 0.5552023890482245, + "grad_norm": 0.09031404554843903, + "learning_rate": 2.232238247863248e-05, + "loss": 0.0184, + "step": 75110 + }, + { + "epoch": 0.5552763076195263, + "grad_norm": 0.07659973204135895, + "learning_rate": 2.2318672839506176e-05, + "loss": 0.0177, + "step": 75120 + }, + { + "epoch": 0.5553502261908282, + "grad_norm": 0.07315723598003387, + "learning_rate": 2.2314963200379868e-05, + "loss": 0.0179, + "step": 75130 + }, + { + "epoch": 0.5554241447621301, + "grad_norm": 0.06226538494229317, + "learning_rate": 2.231125356125356e-05, + "loss": 0.0156, + "step": 75140 + }, + { + "epoch": 0.5554980633334319, + "grad_norm": 0.061756961047649384, + "learning_rate": 2.2307543922127257e-05, + "loss": 0.0176, + "step": 75150 + }, + { + "epoch": 0.5555719819047338, + "grad_norm": 0.09428950399160385, + "learning_rate": 2.2303834283000953e-05, + "loss": 0.0184, + "step": 75160 + }, + { + "epoch": 0.5556459004760356, + "grad_norm": 0.06500783562660217, + "learning_rate": 2.2300124643874645e-05, + "loss": 0.0169, + "step": 75170 + }, + { + "epoch": 0.5557198190473375, + "grad_norm": 0.0777098536491394, + "learning_rate": 2.2296415004748338e-05, + "loss": 0.0184, + "step": 75180 + }, + { + "epoch": 0.5557937376186393, + "grad_norm": 0.07891745120286942, + "learning_rate": 2.2292705365622034e-05, + "loss": 0.0178, + "step": 75190 + }, + { + "epoch": 0.5558676561899412, + "grad_norm": 0.05507243424654007, + "learning_rate": 2.228899572649573e-05, + "loss": 0.0179, + "step": 75200 + }, + { + "epoch": 0.5559415747612431, + "grad_norm": 0.08101353049278259, + "learning_rate": 2.2285286087369422e-05, + "loss": 0.0169, + "step": 75210 + }, + { + "epoch": 0.5560154933325449, + "grad_norm": 0.07776536792516708, + "learning_rate": 2.2281576448243115e-05, + "loss": 0.0169, + "step": 75220 + }, + { + "epoch": 0.5560894119038468, + "grad_norm": 0.07454312592744827, + "learning_rate": 2.227786680911681e-05, + "loss": 0.0162, + "step": 75230 + }, + { + "epoch": 0.5561633304751485, + "grad_norm": 0.07357978820800781, + "learning_rate": 2.2274157169990503e-05, + "loss": 0.0174, + "step": 75240 + }, + { + "epoch": 0.5562372490464504, + "grad_norm": 0.0766085535287857, + 
"learning_rate": 2.22704475308642e-05, + "loss": 0.0198, + "step": 75250 + }, + { + "epoch": 0.5563111676177522, + "grad_norm": 0.10694071650505066, + "learning_rate": 2.2266737891737892e-05, + "loss": 0.0182, + "step": 75260 + }, + { + "epoch": 0.5563850861890541, + "grad_norm": 0.08427776396274567, + "learning_rate": 2.2263028252611588e-05, + "loss": 0.0169, + "step": 75270 + }, + { + "epoch": 0.556459004760356, + "grad_norm": 0.06425360590219498, + "learning_rate": 2.225931861348528e-05, + "loss": 0.0182, + "step": 75280 + }, + { + "epoch": 0.5565329233316578, + "grad_norm": 0.09031922370195389, + "learning_rate": 2.2255608974358973e-05, + "loss": 0.0197, + "step": 75290 + }, + { + "epoch": 0.5566068419029597, + "grad_norm": 0.06680730730295181, + "learning_rate": 2.225189933523267e-05, + "loss": 0.0162, + "step": 75300 + }, + { + "epoch": 0.5566807604742615, + "grad_norm": 0.06603684276342392, + "learning_rate": 2.2248189696106365e-05, + "loss": 0.0195, + "step": 75310 + }, + { + "epoch": 0.5567546790455634, + "grad_norm": 0.06782803684473038, + "learning_rate": 2.2244480056980058e-05, + "loss": 0.0162, + "step": 75320 + }, + { + "epoch": 0.5568285976168652, + "grad_norm": 0.09163343161344528, + "learning_rate": 2.224077041785375e-05, + "loss": 0.0184, + "step": 75330 + }, + { + "epoch": 0.5569025161881671, + "grad_norm": 0.07321230322122574, + "learning_rate": 2.2237060778727446e-05, + "loss": 0.0193, + "step": 75340 + }, + { + "epoch": 0.556976434759469, + "grad_norm": 0.06885476410388947, + "learning_rate": 2.2233351139601142e-05, + "loss": 0.0158, + "step": 75350 + }, + { + "epoch": 0.5570503533307708, + "grad_norm": 0.08268599212169647, + "learning_rate": 2.2229641500474835e-05, + "loss": 0.0164, + "step": 75360 + }, + { + "epoch": 0.5571242719020727, + "grad_norm": 0.08073693513870239, + "learning_rate": 2.2225931861348527e-05, + "loss": 0.0167, + "step": 75370 + }, + { + "epoch": 0.5571981904733745, + "grad_norm": 0.05825946480035782, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0158, + "step": 75380 + }, + { + "epoch": 0.5572721090446764, + "grad_norm": 0.08236400038003922, + "learning_rate": 2.221851258309592e-05, + "loss": 0.0179, + "step": 75390 + }, + { + "epoch": 0.5573460276159783, + "grad_norm": 0.097753144800663, + "learning_rate": 2.2214802943969612e-05, + "loss": 0.0184, + "step": 75400 + }, + { + "epoch": 0.5574199461872801, + "grad_norm": 0.08877979218959808, + "learning_rate": 2.2211093304843304e-05, + "loss": 0.0178, + "step": 75410 + }, + { + "epoch": 0.557493864758582, + "grad_norm": 0.11679001152515411, + "learning_rate": 2.2207383665717e-05, + "loss": 0.0174, + "step": 75420 + }, + { + "epoch": 0.5575677833298838, + "grad_norm": 0.08042196929454803, + "learning_rate": 2.2203674026590696e-05, + "loss": 0.0163, + "step": 75430 + }, + { + "epoch": 0.5576417019011857, + "grad_norm": 0.06778399646282196, + "learning_rate": 2.219996438746439e-05, + "loss": 0.0183, + "step": 75440 + }, + { + "epoch": 0.5577156204724875, + "grad_norm": 0.07316727936267853, + "learning_rate": 2.219625474833808e-05, + "loss": 0.0176, + "step": 75450 + }, + { + "epoch": 0.5577895390437894, + "grad_norm": 0.0796537697315216, + "learning_rate": 2.2192545109211777e-05, + "loss": 0.0193, + "step": 75460 + }, + { + "epoch": 0.5578634576150913, + "grad_norm": 0.08047375082969666, + "learning_rate": 2.218883547008547e-05, + "loss": 0.0182, + "step": 75470 + }, + { + "epoch": 0.557937376186393, + "grad_norm": 0.06991118937730789, + "learning_rate": 2.2185125830959166e-05, + "loss": 
0.0163, + "step": 75480 + }, + { + "epoch": 0.558011294757695, + "grad_norm": 0.07676273584365845, + "learning_rate": 2.218141619183286e-05, + "loss": 0.0189, + "step": 75490 + }, + { + "epoch": 0.5580852133289967, + "grad_norm": 0.07694961130619049, + "learning_rate": 2.2177706552706555e-05, + "loss": 0.0196, + "step": 75500 + }, + { + "epoch": 0.5581591319002986, + "grad_norm": 0.08334438502788544, + "learning_rate": 2.2173996913580247e-05, + "loss": 0.017, + "step": 75510 + }, + { + "epoch": 0.5582330504716004, + "grad_norm": 0.07191971689462662, + "learning_rate": 2.217028727445394e-05, + "loss": 0.019, + "step": 75520 + }, + { + "epoch": 0.5583069690429023, + "grad_norm": 0.0889514610171318, + "learning_rate": 2.2166577635327636e-05, + "loss": 0.0144, + "step": 75530 + }, + { + "epoch": 0.5583808876142042, + "grad_norm": 0.05933712422847748, + "learning_rate": 2.216286799620133e-05, + "loss": 0.0169, + "step": 75540 + }, + { + "epoch": 0.558454806185506, + "grad_norm": 0.07951271533966064, + "learning_rate": 2.2159158357075024e-05, + "loss": 0.0173, + "step": 75550 + }, + { + "epoch": 0.5585287247568079, + "grad_norm": 0.07584798336029053, + "learning_rate": 2.2155448717948717e-05, + "loss": 0.016, + "step": 75560 + }, + { + "epoch": 0.5586026433281097, + "grad_norm": 0.07957648485898972, + "learning_rate": 2.2151739078822413e-05, + "loss": 0.0197, + "step": 75570 + }, + { + "epoch": 0.5586765618994116, + "grad_norm": 0.13206903636455536, + "learning_rate": 2.214802943969611e-05, + "loss": 0.0186, + "step": 75580 + }, + { + "epoch": 0.5587504804707135, + "grad_norm": 0.07318476587533951, + "learning_rate": 2.21443198005698e-05, + "loss": 0.0186, + "step": 75590 + }, + { + "epoch": 0.5588243990420153, + "grad_norm": 0.06643281131982803, + "learning_rate": 2.2140610161443494e-05, + "loss": 0.0165, + "step": 75600 + }, + { + "epoch": 0.5588983176133172, + "grad_norm": 0.07346879690885544, + "learning_rate": 2.213690052231719e-05, + "loss": 0.0159, + "step": 75610 + }, + { + "epoch": 0.558972236184619, + "grad_norm": 0.09815242886543274, + "learning_rate": 2.2133190883190886e-05, + "loss": 0.0178, + "step": 75620 + }, + { + "epoch": 0.5590461547559209, + "grad_norm": 0.08379372954368591, + "learning_rate": 2.212948124406458e-05, + "loss": 0.0171, + "step": 75630 + }, + { + "epoch": 0.5591200733272227, + "grad_norm": 0.07338432967662811, + "learning_rate": 2.212577160493827e-05, + "loss": 0.0153, + "step": 75640 + }, + { + "epoch": 0.5591939918985246, + "grad_norm": 0.06731506437063217, + "learning_rate": 2.2122061965811967e-05, + "loss": 0.0167, + "step": 75650 + }, + { + "epoch": 0.5592679104698265, + "grad_norm": 0.09762943536043167, + "learning_rate": 2.2118352326685663e-05, + "loss": 0.0181, + "step": 75660 + }, + { + "epoch": 0.5593418290411283, + "grad_norm": 0.06943827867507935, + "learning_rate": 2.2114642687559356e-05, + "loss": 0.0181, + "step": 75670 + }, + { + "epoch": 0.5594157476124302, + "grad_norm": 0.08721622824668884, + "learning_rate": 2.2110933048433048e-05, + "loss": 0.0172, + "step": 75680 + }, + { + "epoch": 0.559489666183732, + "grad_norm": 0.07268903404474258, + "learning_rate": 2.2107223409306744e-05, + "loss": 0.0168, + "step": 75690 + }, + { + "epoch": 0.5595635847550339, + "grad_norm": 0.07044602185487747, + "learning_rate": 2.2103513770180437e-05, + "loss": 0.0178, + "step": 75700 + }, + { + "epoch": 0.5596375033263357, + "grad_norm": 0.05945662781596184, + "learning_rate": 2.2099804131054133e-05, + "loss": 0.0162, + "step": 75710 + }, + { + "epoch": 
0.5597114218976376, + "grad_norm": 0.07119162380695343, + "learning_rate": 2.2096094491927825e-05, + "loss": 0.0183, + "step": 75720 + }, + { + "epoch": 0.5597853404689395, + "grad_norm": 0.0783572718501091, + "learning_rate": 2.209238485280152e-05, + "loss": 0.0205, + "step": 75730 + }, + { + "epoch": 0.5598592590402413, + "grad_norm": 0.1033354178071022, + "learning_rate": 2.2088675213675214e-05, + "loss": 0.017, + "step": 75740 + }, + { + "epoch": 0.5599331776115432, + "grad_norm": 0.07081745564937592, + "learning_rate": 2.2084965574548906e-05, + "loss": 0.0189, + "step": 75750 + }, + { + "epoch": 0.5600070961828449, + "grad_norm": 0.07376065105199814, + "learning_rate": 2.2081255935422602e-05, + "loss": 0.018, + "step": 75760 + }, + { + "epoch": 0.5600810147541468, + "grad_norm": 0.08154357969760895, + "learning_rate": 2.2077546296296298e-05, + "loss": 0.018, + "step": 75770 + }, + { + "epoch": 0.5601549333254486, + "grad_norm": 0.08173554390668869, + "learning_rate": 2.207383665716999e-05, + "loss": 0.0202, + "step": 75780 + }, + { + "epoch": 0.5602288518967505, + "grad_norm": 0.08825518935918808, + "learning_rate": 2.2070127018043683e-05, + "loss": 0.0179, + "step": 75790 + }, + { + "epoch": 0.5603027704680524, + "grad_norm": 0.09500667452812195, + "learning_rate": 2.2066417378917383e-05, + "loss": 0.0166, + "step": 75800 + }, + { + "epoch": 0.5603766890393542, + "grad_norm": 0.09798631817102432, + "learning_rate": 2.2062707739791075e-05, + "loss": 0.0181, + "step": 75810 + }, + { + "epoch": 0.5604506076106561, + "grad_norm": 0.06984465569257736, + "learning_rate": 2.2058998100664768e-05, + "loss": 0.0147, + "step": 75820 + }, + { + "epoch": 0.5605245261819579, + "grad_norm": 0.08530226349830627, + "learning_rate": 2.205528846153846e-05, + "loss": 0.0173, + "step": 75830 + }, + { + "epoch": 0.5605984447532598, + "grad_norm": 0.07154073566198349, + "learning_rate": 2.2051578822412156e-05, + "loss": 0.0176, + "step": 75840 + }, + { + "epoch": 0.5606723633245617, + "grad_norm": 0.08098631352186203, + "learning_rate": 2.2047869183285852e-05, + "loss": 0.0174, + "step": 75850 + }, + { + "epoch": 0.5607462818958635, + "grad_norm": 0.08754424750804901, + "learning_rate": 2.2044159544159545e-05, + "loss": 0.0174, + "step": 75860 + }, + { + "epoch": 0.5608202004671654, + "grad_norm": 0.07257146388292313, + "learning_rate": 2.2040449905033238e-05, + "loss": 0.0185, + "step": 75870 + }, + { + "epoch": 0.5608941190384672, + "grad_norm": 0.06962021440267563, + "learning_rate": 2.2036740265906934e-05, + "loss": 0.0179, + "step": 75880 + }, + { + "epoch": 0.5609680376097691, + "grad_norm": 0.08640135824680328, + "learning_rate": 2.203303062678063e-05, + "loss": 0.0177, + "step": 75890 + }, + { + "epoch": 0.5610419561810709, + "grad_norm": 0.10559427738189697, + "learning_rate": 2.2029320987654322e-05, + "loss": 0.0156, + "step": 75900 + }, + { + "epoch": 0.5611158747523728, + "grad_norm": 0.08029115200042725, + "learning_rate": 2.2025611348528015e-05, + "loss": 0.0171, + "step": 75910 + }, + { + "epoch": 0.5611897933236747, + "grad_norm": 0.09592238068580627, + "learning_rate": 2.202190170940171e-05, + "loss": 0.0208, + "step": 75920 + }, + { + "epoch": 0.5612637118949765, + "grad_norm": 0.07311482727527618, + "learning_rate": 2.2018192070275403e-05, + "loss": 0.0186, + "step": 75930 + }, + { + "epoch": 0.5613376304662784, + "grad_norm": 0.07350117713212967, + "learning_rate": 2.20144824311491e-05, + "loss": 0.0185, + "step": 75940 + }, + { + "epoch": 0.5614115490375802, + "grad_norm": 
0.11407254636287689, + "learning_rate": 2.2010772792022795e-05, + "loss": 0.0179, + "step": 75950 + }, + { + "epoch": 0.5614854676088821, + "grad_norm": 0.07127930968999863, + "learning_rate": 2.2007063152896488e-05, + "loss": 0.0192, + "step": 75960 + }, + { + "epoch": 0.5615593861801839, + "grad_norm": 0.09806405752897263, + "learning_rate": 2.200335351377018e-05, + "loss": 0.0182, + "step": 75970 + }, + { + "epoch": 0.5616333047514858, + "grad_norm": 0.0642218291759491, + "learning_rate": 2.1999643874643873e-05, + "loss": 0.0169, + "step": 75980 + }, + { + "epoch": 0.5617072233227877, + "grad_norm": 0.08178985863924026, + "learning_rate": 2.1995934235517572e-05, + "loss": 0.0169, + "step": 75990 + }, + { + "epoch": 0.5617811418940895, + "grad_norm": 0.12003156542778015, + "learning_rate": 2.1992224596391265e-05, + "loss": 0.0195, + "step": 76000 + }, + { + "epoch": 0.5618550604653914, + "grad_norm": 0.10624649375677109, + "learning_rate": 2.1988514957264957e-05, + "loss": 0.0168, + "step": 76010 + }, + { + "epoch": 0.5619289790366931, + "grad_norm": 0.060141682624816895, + "learning_rate": 2.198480531813865e-05, + "loss": 0.016, + "step": 76020 + }, + { + "epoch": 0.562002897607995, + "grad_norm": 0.07442566752433777, + "learning_rate": 2.198109567901235e-05, + "loss": 0.0186, + "step": 76030 + }, + { + "epoch": 0.5620768161792968, + "grad_norm": 0.09498272836208344, + "learning_rate": 2.1977386039886042e-05, + "loss": 0.0162, + "step": 76040 + }, + { + "epoch": 0.5621507347505987, + "grad_norm": 0.06662849336862564, + "learning_rate": 2.1973676400759734e-05, + "loss": 0.0174, + "step": 76050 + }, + { + "epoch": 0.5622246533219006, + "grad_norm": 0.09005844593048096, + "learning_rate": 2.1969966761633427e-05, + "loss": 0.0172, + "step": 76060 + }, + { + "epoch": 0.5622985718932024, + "grad_norm": 0.07276521623134613, + "learning_rate": 2.1966257122507123e-05, + "loss": 0.0161, + "step": 76070 + }, + { + "epoch": 0.5623724904645043, + "grad_norm": 0.07238933444023132, + "learning_rate": 2.196254748338082e-05, + "loss": 0.0183, + "step": 76080 + }, + { + "epoch": 0.5624464090358061, + "grad_norm": 0.061950720846652985, + "learning_rate": 2.195883784425451e-05, + "loss": 0.0198, + "step": 76090 + }, + { + "epoch": 0.562520327607108, + "grad_norm": 0.07546308636665344, + "learning_rate": 2.1955128205128208e-05, + "loss": 0.0157, + "step": 76100 + }, + { + "epoch": 0.5625942461784099, + "grad_norm": 0.0679684653878212, + "learning_rate": 2.19514185660019e-05, + "loss": 0.0166, + "step": 76110 + }, + { + "epoch": 0.5626681647497117, + "grad_norm": 0.07437802106142044, + "learning_rate": 2.1947708926875596e-05, + "loss": 0.0186, + "step": 76120 + }, + { + "epoch": 0.5627420833210136, + "grad_norm": 0.07765395939350128, + "learning_rate": 2.194399928774929e-05, + "loss": 0.0169, + "step": 76130 + }, + { + "epoch": 0.5628160018923154, + "grad_norm": 0.0975767970085144, + "learning_rate": 2.1940289648622985e-05, + "loss": 0.0169, + "step": 76140 + }, + { + "epoch": 0.5628899204636173, + "grad_norm": 0.06236174330115318, + "learning_rate": 2.1936580009496677e-05, + "loss": 0.0157, + "step": 76150 + }, + { + "epoch": 0.5629638390349191, + "grad_norm": 0.09108094871044159, + "learning_rate": 2.193287037037037e-05, + "loss": 0.0194, + "step": 76160 + }, + { + "epoch": 0.563037757606221, + "grad_norm": 0.10375560820102692, + "learning_rate": 2.1929160731244066e-05, + "loss": 0.0198, + "step": 76170 + }, + { + "epoch": 0.5631116761775229, + "grad_norm": 0.10493025183677673, + "learning_rate": 
2.1925451092117762e-05, + "loss": 0.0192, + "step": 76180 + }, + { + "epoch": 0.5631855947488247, + "grad_norm": 0.06288543343544006, + "learning_rate": 2.1921741452991454e-05, + "loss": 0.0177, + "step": 76190 + }, + { + "epoch": 0.5632595133201266, + "grad_norm": 0.0990319773554802, + "learning_rate": 2.1918031813865147e-05, + "loss": 0.0162, + "step": 76200 + }, + { + "epoch": 0.5633334318914284, + "grad_norm": 0.08652309328317642, + "learning_rate": 2.191432217473884e-05, + "loss": 0.0192, + "step": 76210 + }, + { + "epoch": 0.5634073504627303, + "grad_norm": 0.06287626922130585, + "learning_rate": 2.191061253561254e-05, + "loss": 0.0139, + "step": 76220 + }, + { + "epoch": 0.5634812690340321, + "grad_norm": 0.08388041704893112, + "learning_rate": 2.190690289648623e-05, + "loss": 0.0185, + "step": 76230 + }, + { + "epoch": 0.563555187605334, + "grad_norm": 0.0805421769618988, + "learning_rate": 2.1903193257359924e-05, + "loss": 0.0162, + "step": 76240 + }, + { + "epoch": 0.5636291061766359, + "grad_norm": 0.08735102415084839, + "learning_rate": 2.189948361823362e-05, + "loss": 0.0188, + "step": 76250 + }, + { + "epoch": 0.5637030247479377, + "grad_norm": 0.07975315302610397, + "learning_rate": 2.1895773979107316e-05, + "loss": 0.0184, + "step": 76260 + }, + { + "epoch": 0.5637769433192396, + "grad_norm": 0.09392821043729782, + "learning_rate": 2.189206433998101e-05, + "loss": 0.0159, + "step": 76270 + }, + { + "epoch": 0.5638508618905413, + "grad_norm": 0.08077412843704224, + "learning_rate": 2.18883547008547e-05, + "loss": 0.0177, + "step": 76280 + }, + { + "epoch": 0.5639247804618432, + "grad_norm": 0.07388119399547577, + "learning_rate": 2.1884645061728397e-05, + "loss": 0.0193, + "step": 76290 + }, + { + "epoch": 0.563998699033145, + "grad_norm": 0.09471497684717178, + "learning_rate": 2.188093542260209e-05, + "loss": 0.0187, + "step": 76300 + }, + { + "epoch": 0.5640726176044469, + "grad_norm": 0.07443129271268845, + "learning_rate": 2.1877225783475786e-05, + "loss": 0.0157, + "step": 76310 + }, + { + "epoch": 0.5641465361757488, + "grad_norm": 0.07455494999885559, + "learning_rate": 2.1873516144349478e-05, + "loss": 0.0164, + "step": 76320 + }, + { + "epoch": 0.5642204547470506, + "grad_norm": 0.0900912880897522, + "learning_rate": 2.1869806505223174e-05, + "loss": 0.0176, + "step": 76330 + }, + { + "epoch": 0.5642943733183525, + "grad_norm": 0.06420095264911652, + "learning_rate": 2.1866096866096867e-05, + "loss": 0.0173, + "step": 76340 + }, + { + "epoch": 0.5643682918896543, + "grad_norm": 0.09643217921257019, + "learning_rate": 2.1862387226970563e-05, + "loss": 0.0178, + "step": 76350 + }, + { + "epoch": 0.5644422104609562, + "grad_norm": 0.07020307332277298, + "learning_rate": 2.1858677587844255e-05, + "loss": 0.0192, + "step": 76360 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.09751828014850616, + "learning_rate": 2.185496794871795e-05, + "loss": 0.0157, + "step": 76370 + }, + { + "epoch": 0.5645900476035599, + "grad_norm": 0.053897056728601456, + "learning_rate": 2.1851258309591644e-05, + "loss": 0.019, + "step": 76380 + }, + { + "epoch": 0.5646639661748618, + "grad_norm": 0.07476173341274261, + "learning_rate": 2.1847548670465336e-05, + "loss": 0.0201, + "step": 76390 + }, + { + "epoch": 0.5647378847461636, + "grad_norm": 0.07188209146261215, + "learning_rate": 2.1843839031339032e-05, + "loss": 0.0177, + "step": 76400 + }, + { + "epoch": 0.5648118033174655, + "grad_norm": 0.05990707501769066, + "learning_rate": 2.184012939221273e-05, + "loss": 0.0166, + 
"step": 76410 + }, + { + "epoch": 0.5648857218887673, + "grad_norm": 0.07479406148195267, + "learning_rate": 2.183641975308642e-05, + "loss": 0.0174, + "step": 76420 + }, + { + "epoch": 0.5649596404600692, + "grad_norm": 0.05558224022388458, + "learning_rate": 2.1832710113960113e-05, + "loss": 0.0187, + "step": 76430 + }, + { + "epoch": 0.5650335590313711, + "grad_norm": 0.08270638436079025, + "learning_rate": 2.182900047483381e-05, + "loss": 0.0166, + "step": 76440 + }, + { + "epoch": 0.5651074776026729, + "grad_norm": 0.06895186752080917, + "learning_rate": 2.1825290835707505e-05, + "loss": 0.0178, + "step": 76450 + }, + { + "epoch": 0.5651813961739748, + "grad_norm": 0.05761091411113739, + "learning_rate": 2.1821581196581198e-05, + "loss": 0.0171, + "step": 76460 + }, + { + "epoch": 0.5652553147452766, + "grad_norm": 0.0714200958609581, + "learning_rate": 2.181787155745489e-05, + "loss": 0.0185, + "step": 76470 + }, + { + "epoch": 0.5653292333165785, + "grad_norm": 0.08116467297077179, + "learning_rate": 2.1814161918328587e-05, + "loss": 0.0171, + "step": 76480 + }, + { + "epoch": 0.5654031518878803, + "grad_norm": 0.09137962013483047, + "learning_rate": 2.1810452279202282e-05, + "loss": 0.0182, + "step": 76490 + }, + { + "epoch": 0.5654770704591822, + "grad_norm": 0.05624079331755638, + "learning_rate": 2.1806742640075975e-05, + "loss": 0.0168, + "step": 76500 + }, + { + "epoch": 0.5655509890304841, + "grad_norm": 0.08908656239509583, + "learning_rate": 2.1803033000949668e-05, + "loss": 0.0169, + "step": 76510 + }, + { + "epoch": 0.5656249076017859, + "grad_norm": 0.06865982711315155, + "learning_rate": 2.1799323361823364e-05, + "loss": 0.0185, + "step": 76520 + }, + { + "epoch": 0.5656988261730878, + "grad_norm": 0.08398845791816711, + "learning_rate": 2.1795613722697056e-05, + "loss": 0.0199, + "step": 76530 + }, + { + "epoch": 0.5657727447443895, + "grad_norm": 0.06882167607545853, + "learning_rate": 2.1791904083570752e-05, + "loss": 0.0188, + "step": 76540 + }, + { + "epoch": 0.5658466633156914, + "grad_norm": 0.11541710793972015, + "learning_rate": 2.1788194444444445e-05, + "loss": 0.0181, + "step": 76550 + }, + { + "epoch": 0.5659205818869932, + "grad_norm": 0.08537382632493973, + "learning_rate": 2.178448480531814e-05, + "loss": 0.0164, + "step": 76560 + }, + { + "epoch": 0.5659945004582951, + "grad_norm": 0.08711043000221252, + "learning_rate": 2.1780775166191833e-05, + "loss": 0.0166, + "step": 76570 + }, + { + "epoch": 0.566068419029597, + "grad_norm": 0.08303167670965195, + "learning_rate": 2.177706552706553e-05, + "loss": 0.0172, + "step": 76580 + }, + { + "epoch": 0.5661423376008988, + "grad_norm": 0.13246089220046997, + "learning_rate": 2.1773355887939222e-05, + "loss": 0.0194, + "step": 76590 + }, + { + "epoch": 0.5662162561722007, + "grad_norm": 0.07567055523395538, + "learning_rate": 2.1769646248812918e-05, + "loss": 0.0167, + "step": 76600 + }, + { + "epoch": 0.5662901747435025, + "grad_norm": 0.10139036178588867, + "learning_rate": 2.176593660968661e-05, + "loss": 0.017, + "step": 76610 + }, + { + "epoch": 0.5663640933148044, + "grad_norm": 0.057844486087560654, + "learning_rate": 2.1762226970560303e-05, + "loss": 0.0192, + "step": 76620 + }, + { + "epoch": 0.5664380118861063, + "grad_norm": 0.0815180242061615, + "learning_rate": 2.1758517331434e-05, + "loss": 0.0188, + "step": 76630 + }, + { + "epoch": 0.5665119304574081, + "grad_norm": 0.0749938040971756, + "learning_rate": 2.1754807692307695e-05, + "loss": 0.0179, + "step": 76640 + }, + { + "epoch": 
0.56658584902871, + "grad_norm": 0.0814565122127533, + "learning_rate": 2.1751098053181387e-05, + "loss": 0.0154, + "step": 76650 + }, + { + "epoch": 0.5666597676000118, + "grad_norm": 0.08643099665641785, + "learning_rate": 2.174738841405508e-05, + "loss": 0.0178, + "step": 76660 + }, + { + "epoch": 0.5667336861713137, + "grad_norm": 0.07690934091806412, + "learning_rate": 2.1743678774928776e-05, + "loss": 0.016, + "step": 76670 + }, + { + "epoch": 0.5668076047426155, + "grad_norm": 0.06089937686920166, + "learning_rate": 2.1739969135802472e-05, + "loss": 0.0147, + "step": 76680 + }, + { + "epoch": 0.5668815233139174, + "grad_norm": 0.10612459480762482, + "learning_rate": 2.1736259496676165e-05, + "loss": 0.013, + "step": 76690 + }, + { + "epoch": 0.5669554418852193, + "grad_norm": 0.07758487015962601, + "learning_rate": 2.1732549857549857e-05, + "loss": 0.0176, + "step": 76700 + }, + { + "epoch": 0.5670293604565211, + "grad_norm": 0.13286489248275757, + "learning_rate": 2.1728840218423553e-05, + "loss": 0.0217, + "step": 76710 + }, + { + "epoch": 0.567103279027823, + "grad_norm": 0.06863997131586075, + "learning_rate": 2.172513057929725e-05, + "loss": 0.018, + "step": 76720 + }, + { + "epoch": 0.5671771975991248, + "grad_norm": 0.0733184888958931, + "learning_rate": 2.172142094017094e-05, + "loss": 0.0164, + "step": 76730 + }, + { + "epoch": 0.5672511161704267, + "grad_norm": 0.0709184855222702, + "learning_rate": 2.1717711301044634e-05, + "loss": 0.0155, + "step": 76740 + }, + { + "epoch": 0.5673250347417285, + "grad_norm": 0.07036468386650085, + "learning_rate": 2.171400166191833e-05, + "loss": 0.0151, + "step": 76750 + }, + { + "epoch": 0.5673989533130304, + "grad_norm": 0.08693096786737442, + "learning_rate": 2.1710292022792023e-05, + "loss": 0.0183, + "step": 76760 + }, + { + "epoch": 0.5674728718843323, + "grad_norm": 0.09758399426937103, + "learning_rate": 2.170658238366572e-05, + "loss": 0.017, + "step": 76770 + }, + { + "epoch": 0.567546790455634, + "grad_norm": 0.09273570775985718, + "learning_rate": 2.170287274453941e-05, + "loss": 0.0171, + "step": 76780 + }, + { + "epoch": 0.567620709026936, + "grad_norm": 0.0994814783334732, + "learning_rate": 2.1699163105413107e-05, + "loss": 0.019, + "step": 76790 + }, + { + "epoch": 0.5676946275982377, + "grad_norm": 0.08835679292678833, + "learning_rate": 2.16954534662868e-05, + "loss": 0.0139, + "step": 76800 + }, + { + "epoch": 0.5677685461695396, + "grad_norm": 0.07272812724113464, + "learning_rate": 2.1691743827160496e-05, + "loss": 0.0183, + "step": 76810 + }, + { + "epoch": 0.5678424647408414, + "grad_norm": 0.05673498660326004, + "learning_rate": 2.168803418803419e-05, + "loss": 0.0187, + "step": 76820 + }, + { + "epoch": 0.5679163833121433, + "grad_norm": 0.05819728597998619, + "learning_rate": 2.1684324548907884e-05, + "loss": 0.0186, + "step": 76830 + }, + { + "epoch": 0.5679903018834452, + "grad_norm": 0.08914889395236969, + "learning_rate": 2.1680614909781577e-05, + "loss": 0.0167, + "step": 76840 + }, + { + "epoch": 0.568064220454747, + "grad_norm": 0.07220537960529327, + "learning_rate": 2.167690527065527e-05, + "loss": 0.0173, + "step": 76850 + }, + { + "epoch": 0.5681381390260489, + "grad_norm": 0.09979695081710815, + "learning_rate": 2.1673195631528966e-05, + "loss": 0.0172, + "step": 76860 + }, + { + "epoch": 0.5682120575973507, + "grad_norm": 0.08128748089075089, + "learning_rate": 2.166948599240266e-05, + "loss": 0.0169, + "step": 76870 + }, + { + "epoch": 0.5682859761686526, + "grad_norm": 0.07112763822078705, + 
"learning_rate": 2.1665776353276354e-05, + "loss": 0.0173, + "step": 76880 + }, + { + "epoch": 0.5683598947399545, + "grad_norm": 0.06230897083878517, + "learning_rate": 2.1662066714150047e-05, + "loss": 0.0191, + "step": 76890 + }, + { + "epoch": 0.5684338133112563, + "grad_norm": 0.07018118351697922, + "learning_rate": 2.1658357075023743e-05, + "loss": 0.0166, + "step": 76900 + }, + { + "epoch": 0.5685077318825582, + "grad_norm": 0.08045455813407898, + "learning_rate": 2.165464743589744e-05, + "loss": 0.0193, + "step": 76910 + }, + { + "epoch": 0.56858165045386, + "grad_norm": 0.063961461186409, + "learning_rate": 2.165093779677113e-05, + "loss": 0.0186, + "step": 76920 + }, + { + "epoch": 0.5686555690251619, + "grad_norm": 0.09592575579881668, + "learning_rate": 2.1647228157644824e-05, + "loss": 0.0161, + "step": 76930 + }, + { + "epoch": 0.5687294875964637, + "grad_norm": 0.07979429513216019, + "learning_rate": 2.164351851851852e-05, + "loss": 0.0163, + "step": 76940 + }, + { + "epoch": 0.5688034061677656, + "grad_norm": 0.10558890551328659, + "learning_rate": 2.1639808879392216e-05, + "loss": 0.0173, + "step": 76950 + }, + { + "epoch": 0.5688773247390675, + "grad_norm": 0.08926460146903992, + "learning_rate": 2.1636099240265908e-05, + "loss": 0.0194, + "step": 76960 + }, + { + "epoch": 0.5689512433103693, + "grad_norm": 0.05744050443172455, + "learning_rate": 2.16323896011396e-05, + "loss": 0.0183, + "step": 76970 + }, + { + "epoch": 0.5690251618816712, + "grad_norm": 0.11004228889942169, + "learning_rate": 2.1628679962013297e-05, + "loss": 0.0201, + "step": 76980 + }, + { + "epoch": 0.569099080452973, + "grad_norm": 0.08529970794916153, + "learning_rate": 2.162497032288699e-05, + "loss": 0.0175, + "step": 76990 + }, + { + "epoch": 0.5691729990242749, + "grad_norm": 0.07577558606863022, + "learning_rate": 2.1621260683760685e-05, + "loss": 0.0166, + "step": 77000 + }, + { + "epoch": 0.5692469175955767, + "grad_norm": 0.10471208393573761, + "learning_rate": 2.1617551044634378e-05, + "loss": 0.0194, + "step": 77010 + }, + { + "epoch": 0.5693208361668786, + "grad_norm": 0.0808945968747139, + "learning_rate": 2.1613841405508074e-05, + "loss": 0.0193, + "step": 77020 + }, + { + "epoch": 0.5693947547381805, + "grad_norm": 0.12748804688453674, + "learning_rate": 2.1610131766381766e-05, + "loss": 0.0192, + "step": 77030 + }, + { + "epoch": 0.5694686733094823, + "grad_norm": 0.09266991168260574, + "learning_rate": 2.1606422127255462e-05, + "loss": 0.0174, + "step": 77040 + }, + { + "epoch": 0.5695425918807842, + "grad_norm": 0.0642351359128952, + "learning_rate": 2.1602712488129155e-05, + "loss": 0.0188, + "step": 77050 + }, + { + "epoch": 0.569616510452086, + "grad_norm": 0.06646756082773209, + "learning_rate": 2.159900284900285e-05, + "loss": 0.018, + "step": 77060 + }, + { + "epoch": 0.5696904290233878, + "grad_norm": 0.07489508390426636, + "learning_rate": 2.1595293209876544e-05, + "loss": 0.0183, + "step": 77070 + }, + { + "epoch": 0.5697643475946896, + "grad_norm": 0.0715007334947586, + "learning_rate": 2.1591583570750236e-05, + "loss": 0.019, + "step": 77080 + }, + { + "epoch": 0.5698382661659915, + "grad_norm": 0.07386353611946106, + "learning_rate": 2.1587873931623932e-05, + "loss": 0.0194, + "step": 77090 + }, + { + "epoch": 0.5699121847372934, + "grad_norm": 0.09321268647909164, + "learning_rate": 2.1584164292497628e-05, + "loss": 0.0159, + "step": 77100 + }, + { + "epoch": 0.5699861033085952, + "grad_norm": 0.08199749141931534, + "learning_rate": 2.158045465337132e-05, + "loss": 
0.0185, + "step": 77110 + }, + { + "epoch": 0.5700600218798971, + "grad_norm": 0.10026410222053528, + "learning_rate": 2.1576745014245013e-05, + "loss": 0.0169, + "step": 77120 + }, + { + "epoch": 0.5701339404511989, + "grad_norm": 0.12471500039100647, + "learning_rate": 2.157303537511871e-05, + "loss": 0.0197, + "step": 77130 + }, + { + "epoch": 0.5702078590225008, + "grad_norm": 0.08403502404689789, + "learning_rate": 2.1569325735992405e-05, + "loss": 0.017, + "step": 77140 + }, + { + "epoch": 0.5702817775938027, + "grad_norm": 0.07287797331809998, + "learning_rate": 2.1565616096866098e-05, + "loss": 0.0175, + "step": 77150 + }, + { + "epoch": 0.5703556961651045, + "grad_norm": 0.09309650212526321, + "learning_rate": 2.156190645773979e-05, + "loss": 0.0168, + "step": 77160 + }, + { + "epoch": 0.5704296147364064, + "grad_norm": 0.07405105978250504, + "learning_rate": 2.1558196818613486e-05, + "loss": 0.0179, + "step": 77170 + }, + { + "epoch": 0.5705035333077082, + "grad_norm": 0.08167034387588501, + "learning_rate": 2.1554487179487182e-05, + "loss": 0.0167, + "step": 77180 + }, + { + "epoch": 0.5705774518790101, + "grad_norm": 0.1163032054901123, + "learning_rate": 2.1550777540360875e-05, + "loss": 0.0174, + "step": 77190 + }, + { + "epoch": 0.5706513704503119, + "grad_norm": 0.09522049129009247, + "learning_rate": 2.1547067901234567e-05, + "loss": 0.0179, + "step": 77200 + }, + { + "epoch": 0.5707252890216138, + "grad_norm": 0.07914343476295471, + "learning_rate": 2.1543358262108263e-05, + "loss": 0.021, + "step": 77210 + }, + { + "epoch": 0.5707992075929157, + "grad_norm": 0.08064424246549606, + "learning_rate": 2.1539648622981956e-05, + "loss": 0.0194, + "step": 77220 + }, + { + "epoch": 0.5708731261642175, + "grad_norm": 0.0969173014163971, + "learning_rate": 2.1535938983855652e-05, + "loss": 0.0168, + "step": 77230 + }, + { + "epoch": 0.5709470447355194, + "grad_norm": 0.08027642965316772, + "learning_rate": 2.1532229344729344e-05, + "loss": 0.0169, + "step": 77240 + }, + { + "epoch": 0.5710209633068212, + "grad_norm": 0.06907127797603607, + "learning_rate": 2.152851970560304e-05, + "loss": 0.0206, + "step": 77250 + }, + { + "epoch": 0.5710948818781231, + "grad_norm": 0.10103422403335571, + "learning_rate": 2.1524810066476733e-05, + "loss": 0.0183, + "step": 77260 + }, + { + "epoch": 0.5711688004494249, + "grad_norm": 0.07240621000528336, + "learning_rate": 2.152110042735043e-05, + "loss": 0.0168, + "step": 77270 + }, + { + "epoch": 0.5712427190207268, + "grad_norm": 0.0701412558555603, + "learning_rate": 2.151739078822412e-05, + "loss": 0.0172, + "step": 77280 + }, + { + "epoch": 0.5713166375920287, + "grad_norm": 0.05859806761145592, + "learning_rate": 2.1513681149097818e-05, + "loss": 0.0192, + "step": 77290 + }, + { + "epoch": 0.5713905561633305, + "grad_norm": 0.0658048465847969, + "learning_rate": 2.150997150997151e-05, + "loss": 0.0162, + "step": 77300 + }, + { + "epoch": 0.5714644747346324, + "grad_norm": 0.10993514955043793, + "learning_rate": 2.1506261870845203e-05, + "loss": 0.02, + "step": 77310 + }, + { + "epoch": 0.5715383933059341, + "grad_norm": 0.07890800386667252, + "learning_rate": 2.1502552231718902e-05, + "loss": 0.0183, + "step": 77320 + }, + { + "epoch": 0.571612311877236, + "grad_norm": 0.07548778504133224, + "learning_rate": 2.1498842592592595e-05, + "loss": 0.0176, + "step": 77330 + }, + { + "epoch": 0.5716862304485378, + "grad_norm": 0.11432471871376038, + "learning_rate": 2.1495132953466287e-05, + "loss": 0.0186, + "step": 77340 + }, + { + "epoch": 
0.5717601490198397, + "grad_norm": 0.07823914289474487, + "learning_rate": 2.149142331433998e-05, + "loss": 0.0177, + "step": 77350 + }, + { + "epoch": 0.5718340675911416, + "grad_norm": 0.08516088128089905, + "learning_rate": 2.148771367521368e-05, + "loss": 0.0161, + "step": 77360 + }, + { + "epoch": 0.5719079861624434, + "grad_norm": 0.06825272738933563, + "learning_rate": 2.1484004036087372e-05, + "loss": 0.0174, + "step": 77370 + }, + { + "epoch": 0.5719819047337453, + "grad_norm": 0.06376885622739792, + "learning_rate": 2.1480294396961064e-05, + "loss": 0.0174, + "step": 77380 + }, + { + "epoch": 0.5720558233050471, + "grad_norm": 0.08210685849189758, + "learning_rate": 2.1476584757834757e-05, + "loss": 0.0189, + "step": 77390 + }, + { + "epoch": 0.572129741876349, + "grad_norm": 0.07939564436674118, + "learning_rate": 2.1472875118708453e-05, + "loss": 0.0173, + "step": 77400 + }, + { + "epoch": 0.5722036604476509, + "grad_norm": 0.08778122067451477, + "learning_rate": 2.146916547958215e-05, + "loss": 0.0188, + "step": 77410 + }, + { + "epoch": 0.5722775790189527, + "grad_norm": 0.06430813670158386, + "learning_rate": 2.146545584045584e-05, + "loss": 0.0185, + "step": 77420 + }, + { + "epoch": 0.5723514975902546, + "grad_norm": 0.07405303418636322, + "learning_rate": 2.1461746201329534e-05, + "loss": 0.0183, + "step": 77430 + }, + { + "epoch": 0.5724254161615564, + "grad_norm": 0.08082497864961624, + "learning_rate": 2.145803656220323e-05, + "loss": 0.0179, + "step": 77440 + }, + { + "epoch": 0.5724993347328583, + "grad_norm": 0.09143561124801636, + "learning_rate": 2.1454326923076923e-05, + "loss": 0.0173, + "step": 77450 + }, + { + "epoch": 0.5725732533041601, + "grad_norm": 0.08747171610593796, + "learning_rate": 2.145061728395062e-05, + "loss": 0.0174, + "step": 77460 + }, + { + "epoch": 0.572647171875462, + "grad_norm": 0.09300205111503601, + "learning_rate": 2.1446907644824314e-05, + "loss": 0.0176, + "step": 77470 + }, + { + "epoch": 0.5727210904467639, + "grad_norm": 0.08413171768188477, + "learning_rate": 2.1443198005698007e-05, + "loss": 0.0173, + "step": 77480 + }, + { + "epoch": 0.5727950090180657, + "grad_norm": 0.06460338830947876, + "learning_rate": 2.14394883665717e-05, + "loss": 0.0159, + "step": 77490 + }, + { + "epoch": 0.5728689275893676, + "grad_norm": 0.08256684243679047, + "learning_rate": 2.1435778727445396e-05, + "loss": 0.0167, + "step": 77500 + }, + { + "epoch": 0.5729428461606694, + "grad_norm": 0.08620408177375793, + "learning_rate": 2.143206908831909e-05, + "loss": 0.0193, + "step": 77510 + }, + { + "epoch": 0.5730167647319713, + "grad_norm": 0.060469850897789, + "learning_rate": 2.1428359449192784e-05, + "loss": 0.0176, + "step": 77520 + }, + { + "epoch": 0.5730906833032731, + "grad_norm": 0.09427706897258759, + "learning_rate": 2.1424649810066477e-05, + "loss": 0.017, + "step": 77530 + }, + { + "epoch": 0.573164601874575, + "grad_norm": 0.07587603479623795, + "learning_rate": 2.142094017094017e-05, + "loss": 0.0202, + "step": 77540 + }, + { + "epoch": 0.5732385204458769, + "grad_norm": 0.05992022901773453, + "learning_rate": 2.141723053181387e-05, + "loss": 0.0151, + "step": 77550 + }, + { + "epoch": 0.5733124390171787, + "grad_norm": 0.1014205813407898, + "learning_rate": 2.141352089268756e-05, + "loss": 0.0187, + "step": 77560 + }, + { + "epoch": 0.5733863575884806, + "grad_norm": 0.08910097926855087, + "learning_rate": 2.1409811253561254e-05, + "loss": 0.0176, + "step": 77570 + }, + { + "epoch": 0.5734602761597823, + "grad_norm": 
0.08204387873411179, + "learning_rate": 2.1406101614434946e-05, + "loss": 0.0208, + "step": 77580 + }, + { + "epoch": 0.5735341947310842, + "grad_norm": 0.10908210277557373, + "learning_rate": 2.1402391975308646e-05, + "loss": 0.0183, + "step": 77590 + }, + { + "epoch": 0.5736081133023861, + "grad_norm": 0.04895222187042236, + "learning_rate": 2.139868233618234e-05, + "loss": 0.0172, + "step": 77600 + }, + { + "epoch": 0.5736820318736879, + "grad_norm": 0.07253724336624146, + "learning_rate": 2.139497269705603e-05, + "loss": 0.0179, + "step": 77610 + }, + { + "epoch": 0.5737559504449898, + "grad_norm": 0.07849972695112228, + "learning_rate": 2.1391263057929727e-05, + "loss": 0.0176, + "step": 77620 + }, + { + "epoch": 0.5738298690162916, + "grad_norm": 0.06313782930374146, + "learning_rate": 2.138755341880342e-05, + "loss": 0.0172, + "step": 77630 + }, + { + "epoch": 0.5739037875875935, + "grad_norm": 0.08818871527910233, + "learning_rate": 2.1383843779677115e-05, + "loss": 0.0154, + "step": 77640 + }, + { + "epoch": 0.5739777061588953, + "grad_norm": 0.07989152520895004, + "learning_rate": 2.1380134140550808e-05, + "loss": 0.0192, + "step": 77650 + }, + { + "epoch": 0.5740516247301972, + "grad_norm": 0.08319340646266937, + "learning_rate": 2.1376424501424504e-05, + "loss": 0.0177, + "step": 77660 + }, + { + "epoch": 0.5741255433014991, + "grad_norm": 0.08299017697572708, + "learning_rate": 2.1372714862298197e-05, + "loss": 0.0161, + "step": 77670 + }, + { + "epoch": 0.5741994618728009, + "grad_norm": 0.09096242487430573, + "learning_rate": 2.136900522317189e-05, + "loss": 0.0175, + "step": 77680 + }, + { + "epoch": 0.5742733804441028, + "grad_norm": 0.08916709572076797, + "learning_rate": 2.1365295584045585e-05, + "loss": 0.0195, + "step": 77690 + }, + { + "epoch": 0.5743472990154046, + "grad_norm": 0.1071515679359436, + "learning_rate": 2.136158594491928e-05, + "loss": 0.0195, + "step": 77700 + }, + { + "epoch": 0.5744212175867065, + "grad_norm": 0.09372571110725403, + "learning_rate": 2.1357876305792974e-05, + "loss": 0.0197, + "step": 77710 + }, + { + "epoch": 0.5744951361580083, + "grad_norm": 0.06699743866920471, + "learning_rate": 2.1354166666666666e-05, + "loss": 0.0136, + "step": 77720 + }, + { + "epoch": 0.5745690547293102, + "grad_norm": 0.10548382997512817, + "learning_rate": 2.1350457027540362e-05, + "loss": 0.0189, + "step": 77730 + }, + { + "epoch": 0.5746429733006121, + "grad_norm": 0.10325492918491364, + "learning_rate": 2.1346747388414058e-05, + "loss": 0.0184, + "step": 77740 + }, + { + "epoch": 0.5747168918719139, + "grad_norm": 0.07018305361270905, + "learning_rate": 2.134303774928775e-05, + "loss": 0.0177, + "step": 77750 + }, + { + "epoch": 0.5747908104432158, + "grad_norm": 0.06633090227842331, + "learning_rate": 2.1339328110161443e-05, + "loss": 0.0153, + "step": 77760 + }, + { + "epoch": 0.5748647290145176, + "grad_norm": 0.08085515350103378, + "learning_rate": 2.1335618471035136e-05, + "loss": 0.0182, + "step": 77770 + }, + { + "epoch": 0.5749386475858195, + "grad_norm": 0.07745050638914108, + "learning_rate": 2.1331908831908835e-05, + "loss": 0.0167, + "step": 77780 + }, + { + "epoch": 0.5750125661571213, + "grad_norm": 0.1009209081530571, + "learning_rate": 2.1328199192782528e-05, + "loss": 0.0181, + "step": 77790 + }, + { + "epoch": 0.5750864847284232, + "grad_norm": 0.0653989389538765, + "learning_rate": 2.132448955365622e-05, + "loss": 0.0182, + "step": 77800 + }, + { + "epoch": 0.5751604032997251, + "grad_norm": 0.07263030856847763, + "learning_rate": 
2.1320779914529916e-05, + "loss": 0.0167, + "step": 77810 + }, + { + "epoch": 0.5752343218710269, + "grad_norm": 0.0782294049859047, + "learning_rate": 2.1317070275403612e-05, + "loss": 0.0187, + "step": 77820 + }, + { + "epoch": 0.5753082404423288, + "grad_norm": 0.07321447879076004, + "learning_rate": 2.1313360636277305e-05, + "loss": 0.0184, + "step": 77830 + }, + { + "epoch": 0.5753821590136305, + "grad_norm": 0.10173796117305756, + "learning_rate": 2.1309650997150997e-05, + "loss": 0.0187, + "step": 77840 + }, + { + "epoch": 0.5754560775849324, + "grad_norm": 0.09293046593666077, + "learning_rate": 2.1305941358024693e-05, + "loss": 0.0192, + "step": 77850 + }, + { + "epoch": 0.5755299961562343, + "grad_norm": 0.09862498193979263, + "learning_rate": 2.1302231718898386e-05, + "loss": 0.0188, + "step": 77860 + }, + { + "epoch": 0.5756039147275361, + "grad_norm": 0.06980642676353455, + "learning_rate": 2.1298522079772082e-05, + "loss": 0.0164, + "step": 77870 + }, + { + "epoch": 0.575677833298838, + "grad_norm": 0.08679135888814926, + "learning_rate": 2.1294812440645775e-05, + "loss": 0.0188, + "step": 77880 + }, + { + "epoch": 0.5757517518701398, + "grad_norm": 0.0878649652004242, + "learning_rate": 2.129110280151947e-05, + "loss": 0.0197, + "step": 77890 + }, + { + "epoch": 0.5758256704414417, + "grad_norm": 0.07993719726800919, + "learning_rate": 2.1287393162393163e-05, + "loss": 0.018, + "step": 77900 + }, + { + "epoch": 0.5758995890127435, + "grad_norm": 0.09472255408763885, + "learning_rate": 2.1283683523266856e-05, + "loss": 0.0194, + "step": 77910 + }, + { + "epoch": 0.5759735075840454, + "grad_norm": 0.09329091012477875, + "learning_rate": 2.127997388414055e-05, + "loss": 0.0172, + "step": 77920 + }, + { + "epoch": 0.5760474261553473, + "grad_norm": 0.07562713325023651, + "learning_rate": 2.1276264245014248e-05, + "loss": 0.0165, + "step": 77930 + }, + { + "epoch": 0.5761213447266491, + "grad_norm": 0.08897285908460617, + "learning_rate": 2.127255460588794e-05, + "loss": 0.0154, + "step": 77940 + }, + { + "epoch": 0.576195263297951, + "grad_norm": 0.06445154547691345, + "learning_rate": 2.1268844966761633e-05, + "loss": 0.0176, + "step": 77950 + }, + { + "epoch": 0.5762691818692528, + "grad_norm": 0.06542029231786728, + "learning_rate": 2.126513532763533e-05, + "loss": 0.0171, + "step": 77960 + }, + { + "epoch": 0.5763431004405547, + "grad_norm": 0.06990020722150803, + "learning_rate": 2.1261425688509025e-05, + "loss": 0.0178, + "step": 77970 + }, + { + "epoch": 0.5764170190118565, + "grad_norm": 0.09482400119304657, + "learning_rate": 2.1257716049382717e-05, + "loss": 0.0186, + "step": 77980 + }, + { + "epoch": 0.5764909375831584, + "grad_norm": 0.07218655198812485, + "learning_rate": 2.125400641025641e-05, + "loss": 0.0192, + "step": 77990 + }, + { + "epoch": 0.5765648561544603, + "grad_norm": 0.11474903672933578, + "learning_rate": 2.1250296771130106e-05, + "loss": 0.0167, + "step": 78000 + }, + { + "epoch": 0.5766387747257621, + "grad_norm": 0.10138155519962311, + "learning_rate": 2.1246587132003802e-05, + "loss": 0.0191, + "step": 78010 + }, + { + "epoch": 0.576712693297064, + "grad_norm": 0.0677545964717865, + "learning_rate": 2.1242877492877494e-05, + "loss": 0.0166, + "step": 78020 + }, + { + "epoch": 0.5767866118683658, + "grad_norm": 0.07511387765407562, + "learning_rate": 2.1239167853751187e-05, + "loss": 0.0184, + "step": 78030 + }, + { + "epoch": 0.5768605304396677, + "grad_norm": 0.08694437146186829, + "learning_rate": 2.1235458214624883e-05, + "loss": 0.0174, + 
"step": 78040 + }, + { + "epoch": 0.5769344490109695, + "grad_norm": 0.07573015242815018, + "learning_rate": 2.123174857549858e-05, + "loss": 0.0181, + "step": 78050 + }, + { + "epoch": 0.5770083675822714, + "grad_norm": 0.09312745183706284, + "learning_rate": 2.122803893637227e-05, + "loss": 0.0187, + "step": 78060 + }, + { + "epoch": 0.5770822861535733, + "grad_norm": 0.07375273108482361, + "learning_rate": 2.1224329297245964e-05, + "loss": 0.0171, + "step": 78070 + }, + { + "epoch": 0.577156204724875, + "grad_norm": 0.1048007383942604, + "learning_rate": 2.122061965811966e-05, + "loss": 0.019, + "step": 78080 + }, + { + "epoch": 0.577230123296177, + "grad_norm": 0.06581725925207138, + "learning_rate": 2.1216910018993353e-05, + "loss": 0.019, + "step": 78090 + }, + { + "epoch": 0.5773040418674787, + "grad_norm": 0.07203702628612518, + "learning_rate": 2.121320037986705e-05, + "loss": 0.0202, + "step": 78100 + }, + { + "epoch": 0.5773779604387806, + "grad_norm": 0.1035374104976654, + "learning_rate": 2.120949074074074e-05, + "loss": 0.0187, + "step": 78110 + }, + { + "epoch": 0.5774518790100825, + "grad_norm": 0.079229436814785, + "learning_rate": 2.1205781101614437e-05, + "loss": 0.0155, + "step": 78120 + }, + { + "epoch": 0.5775257975813843, + "grad_norm": 0.08558651059865952, + "learning_rate": 2.120207146248813e-05, + "loss": 0.0186, + "step": 78130 + }, + { + "epoch": 0.5775997161526862, + "grad_norm": 0.07611927390098572, + "learning_rate": 2.1198361823361822e-05, + "loss": 0.0197, + "step": 78140 + }, + { + "epoch": 0.577673634723988, + "grad_norm": 0.08735854178667068, + "learning_rate": 2.1194652184235518e-05, + "loss": 0.0173, + "step": 78150 + }, + { + "epoch": 0.5777475532952899, + "grad_norm": 0.07931400835514069, + "learning_rate": 2.1190942545109214e-05, + "loss": 0.0201, + "step": 78160 + }, + { + "epoch": 0.5778214718665917, + "grad_norm": 0.09490150213241577, + "learning_rate": 2.1187232905982907e-05, + "loss": 0.018, + "step": 78170 + }, + { + "epoch": 0.5778953904378936, + "grad_norm": 0.08102288097143173, + "learning_rate": 2.11835232668566e-05, + "loss": 0.0186, + "step": 78180 + }, + { + "epoch": 0.5779693090091955, + "grad_norm": 0.08374160528182983, + "learning_rate": 2.1179813627730295e-05, + "loss": 0.0185, + "step": 78190 + }, + { + "epoch": 0.5780432275804973, + "grad_norm": 0.07512281835079193, + "learning_rate": 2.117610398860399e-05, + "loss": 0.0157, + "step": 78200 + }, + { + "epoch": 0.5781171461517992, + "grad_norm": 0.1545741707086563, + "learning_rate": 2.1172394349477684e-05, + "loss": 0.0194, + "step": 78210 + }, + { + "epoch": 0.578191064723101, + "grad_norm": 0.05544229596853256, + "learning_rate": 2.1168684710351376e-05, + "loss": 0.0166, + "step": 78220 + }, + { + "epoch": 0.5782649832944029, + "grad_norm": 0.0926881954073906, + "learning_rate": 2.1164975071225072e-05, + "loss": 0.0168, + "step": 78230 + }, + { + "epoch": 0.5783389018657047, + "grad_norm": 0.057462915778160095, + "learning_rate": 2.116126543209877e-05, + "loss": 0.0169, + "step": 78240 + }, + { + "epoch": 0.5784128204370066, + "grad_norm": 0.08461698144674301, + "learning_rate": 2.115755579297246e-05, + "loss": 0.0179, + "step": 78250 + }, + { + "epoch": 0.5784867390083085, + "grad_norm": 0.09049886465072632, + "learning_rate": 2.1153846153846154e-05, + "loss": 0.0177, + "step": 78260 + }, + { + "epoch": 0.5785606575796103, + "grad_norm": 0.0894893929362297, + "learning_rate": 2.115013651471985e-05, + "loss": 0.0159, + "step": 78270 + }, + { + "epoch": 0.5786345761509122, + 
"grad_norm": 0.0849931612610817, + "learning_rate": 2.1146426875593545e-05, + "loss": 0.0165, + "step": 78280 + }, + { + "epoch": 0.578708494722214, + "grad_norm": 0.09327192604541779, + "learning_rate": 2.1142717236467238e-05, + "loss": 0.0189, + "step": 78290 + }, + { + "epoch": 0.5787824132935159, + "grad_norm": 0.0816856399178505, + "learning_rate": 2.113900759734093e-05, + "loss": 0.0193, + "step": 78300 + }, + { + "epoch": 0.5788563318648177, + "grad_norm": 0.06551526486873627, + "learning_rate": 2.1135297958214627e-05, + "loss": 0.0186, + "step": 78310 + }, + { + "epoch": 0.5789302504361196, + "grad_norm": 0.0740165114402771, + "learning_rate": 2.113158831908832e-05, + "loss": 0.0167, + "step": 78320 + }, + { + "epoch": 0.5790041690074215, + "grad_norm": 0.08765026926994324, + "learning_rate": 2.1127878679962015e-05, + "loss": 0.0201, + "step": 78330 + }, + { + "epoch": 0.5790780875787233, + "grad_norm": 0.05869891867041588, + "learning_rate": 2.1124169040835708e-05, + "loss": 0.0178, + "step": 78340 + }, + { + "epoch": 0.5791520061500252, + "grad_norm": 0.08589767664670944, + "learning_rate": 2.1120459401709404e-05, + "loss": 0.0178, + "step": 78350 + }, + { + "epoch": 0.579225924721327, + "grad_norm": 0.07567378878593445, + "learning_rate": 2.1116749762583096e-05, + "loss": 0.0186, + "step": 78360 + }, + { + "epoch": 0.5792998432926288, + "grad_norm": 0.11730443686246872, + "learning_rate": 2.111304012345679e-05, + "loss": 0.0164, + "step": 78370 + }, + { + "epoch": 0.5793737618639307, + "grad_norm": 0.09426385164260864, + "learning_rate": 2.1109330484330485e-05, + "loss": 0.0175, + "step": 78380 + }, + { + "epoch": 0.5794476804352325, + "grad_norm": 0.09733286499977112, + "learning_rate": 2.110562084520418e-05, + "loss": 0.0179, + "step": 78390 + }, + { + "epoch": 0.5795215990065344, + "grad_norm": 0.08981124311685562, + "learning_rate": 2.1101911206077873e-05, + "loss": 0.0168, + "step": 78400 + }, + { + "epoch": 0.5795955175778362, + "grad_norm": 0.08097285032272339, + "learning_rate": 2.1098201566951566e-05, + "loss": 0.0196, + "step": 78410 + }, + { + "epoch": 0.5796694361491381, + "grad_norm": 0.08661559224128723, + "learning_rate": 2.1094491927825262e-05, + "loss": 0.0188, + "step": 78420 + }, + { + "epoch": 0.5797433547204399, + "grad_norm": 0.1040518581867218, + "learning_rate": 2.1090782288698958e-05, + "loss": 0.0173, + "step": 78430 + }, + { + "epoch": 0.5798172732917418, + "grad_norm": 0.08766549080610275, + "learning_rate": 2.108707264957265e-05, + "loss": 0.0201, + "step": 78440 + }, + { + "epoch": 0.5798911918630437, + "grad_norm": 0.07151542603969574, + "learning_rate": 2.1083363010446343e-05, + "loss": 0.0174, + "step": 78450 + }, + { + "epoch": 0.5799651104343455, + "grad_norm": 0.07667539268732071, + "learning_rate": 2.107965337132004e-05, + "loss": 0.0158, + "step": 78460 + }, + { + "epoch": 0.5800390290056474, + "grad_norm": 0.0796733871102333, + "learning_rate": 2.1075943732193735e-05, + "loss": 0.0187, + "step": 78470 + }, + { + "epoch": 0.5801129475769492, + "grad_norm": 0.10920443385839462, + "learning_rate": 2.1072234093067428e-05, + "loss": 0.0211, + "step": 78480 + }, + { + "epoch": 0.5801868661482511, + "grad_norm": 0.08107868582010269, + "learning_rate": 2.106852445394112e-05, + "loss": 0.0196, + "step": 78490 + }, + { + "epoch": 0.5802607847195529, + "grad_norm": 0.08896715193986893, + "learning_rate": 2.1064814814814816e-05, + "loss": 0.0205, + "step": 78500 + }, + { + "epoch": 0.5803347032908548, + "grad_norm": 0.07335825264453888, + 
"learning_rate": 2.1061105175688512e-05, + "loss": 0.0194, + "step": 78510 + }, + { + "epoch": 0.5804086218621567, + "grad_norm": 0.08255856484174728, + "learning_rate": 2.1057395536562205e-05, + "loss": 0.0144, + "step": 78520 + }, + { + "epoch": 0.5804825404334585, + "grad_norm": 0.059211425483226776, + "learning_rate": 2.1053685897435897e-05, + "loss": 0.0178, + "step": 78530 + }, + { + "epoch": 0.5805564590047604, + "grad_norm": 0.08221136778593063, + "learning_rate": 2.1049976258309593e-05, + "loss": 0.0216, + "step": 78540 + }, + { + "epoch": 0.5806303775760622, + "grad_norm": 0.08132661134004593, + "learning_rate": 2.1046266619183286e-05, + "loss": 0.0177, + "step": 78550 + }, + { + "epoch": 0.5807042961473641, + "grad_norm": 0.05896592140197754, + "learning_rate": 2.1042556980056982e-05, + "loss": 0.0169, + "step": 78560 + }, + { + "epoch": 0.5807782147186659, + "grad_norm": 0.059992872178554535, + "learning_rate": 2.1038847340930674e-05, + "loss": 0.0152, + "step": 78570 + }, + { + "epoch": 0.5808521332899678, + "grad_norm": 0.04415227100253105, + "learning_rate": 2.103513770180437e-05, + "loss": 0.0171, + "step": 78580 + }, + { + "epoch": 0.5809260518612697, + "grad_norm": 0.058747969567775726, + "learning_rate": 2.1031428062678063e-05, + "loss": 0.0175, + "step": 78590 + }, + { + "epoch": 0.5809999704325715, + "grad_norm": 0.061170656234025955, + "learning_rate": 2.1027718423551755e-05, + "loss": 0.019, + "step": 78600 + }, + { + "epoch": 0.5810738890038734, + "grad_norm": 0.1113303005695343, + "learning_rate": 2.102400878442545e-05, + "loss": 0.0186, + "step": 78610 + }, + { + "epoch": 0.5811478075751751, + "grad_norm": 0.09334623068571091, + "learning_rate": 2.1020299145299147e-05, + "loss": 0.0176, + "step": 78620 + }, + { + "epoch": 0.581221726146477, + "grad_norm": 0.05836963653564453, + "learning_rate": 2.101658950617284e-05, + "loss": 0.0153, + "step": 78630 + }, + { + "epoch": 0.581295644717779, + "grad_norm": 0.0612514466047287, + "learning_rate": 2.1012879867046533e-05, + "loss": 0.0183, + "step": 78640 + }, + { + "epoch": 0.5813695632890807, + "grad_norm": 0.06752336025238037, + "learning_rate": 2.100917022792023e-05, + "loss": 0.0141, + "step": 78650 + }, + { + "epoch": 0.5814434818603826, + "grad_norm": 0.05256432294845581, + "learning_rate": 2.1005460588793924e-05, + "loss": 0.0163, + "step": 78660 + }, + { + "epoch": 0.5815174004316844, + "grad_norm": 0.06495155394077301, + "learning_rate": 2.1001750949667617e-05, + "loss": 0.0178, + "step": 78670 + }, + { + "epoch": 0.5815913190029863, + "grad_norm": 0.0711987167596817, + "learning_rate": 2.099804131054131e-05, + "loss": 0.0173, + "step": 78680 + }, + { + "epoch": 0.5816652375742881, + "grad_norm": 0.09095991402864456, + "learning_rate": 2.0994331671415006e-05, + "loss": 0.0195, + "step": 78690 + }, + { + "epoch": 0.58173915614559, + "grad_norm": 0.1315537840127945, + "learning_rate": 2.09906220322887e-05, + "loss": 0.0176, + "step": 78700 + }, + { + "epoch": 0.5818130747168919, + "grad_norm": 0.06469413638114929, + "learning_rate": 2.0986912393162394e-05, + "loss": 0.0167, + "step": 78710 + }, + { + "epoch": 0.5818869932881937, + "grad_norm": 0.08639881759881973, + "learning_rate": 2.0983202754036087e-05, + "loss": 0.0193, + "step": 78720 + }, + { + "epoch": 0.5819609118594956, + "grad_norm": 0.07865394651889801, + "learning_rate": 2.0979493114909783e-05, + "loss": 0.0173, + "step": 78730 + }, + { + "epoch": 0.5820348304307974, + "grad_norm": 0.05969080328941345, + "learning_rate": 2.097578347578348e-05, + 
"loss": 0.0159, + "step": 78740 + }, + { + "epoch": 0.5821087490020993, + "grad_norm": 0.08935252577066422, + "learning_rate": 2.097207383665717e-05, + "loss": 0.0172, + "step": 78750 + }, + { + "epoch": 0.5821826675734011, + "grad_norm": 0.07992496341466904, + "learning_rate": 2.0968364197530864e-05, + "loss": 0.0174, + "step": 78760 + }, + { + "epoch": 0.582256586144703, + "grad_norm": 0.07541343569755554, + "learning_rate": 2.096465455840456e-05, + "loss": 0.0172, + "step": 78770 + }, + { + "epoch": 0.5823305047160049, + "grad_norm": 0.09297898411750793, + "learning_rate": 2.0960944919278252e-05, + "loss": 0.0159, + "step": 78780 + }, + { + "epoch": 0.5824044232873067, + "grad_norm": 0.12045703828334808, + "learning_rate": 2.095723528015195e-05, + "loss": 0.0186, + "step": 78790 + }, + { + "epoch": 0.5824783418586086, + "grad_norm": 0.08663397282361984, + "learning_rate": 2.095352564102564e-05, + "loss": 0.018, + "step": 78800 + }, + { + "epoch": 0.5825522604299104, + "grad_norm": 0.06644534319639206, + "learning_rate": 2.0949816001899337e-05, + "loss": 0.0191, + "step": 78810 + }, + { + "epoch": 0.5826261790012123, + "grad_norm": 0.08833138644695282, + "learning_rate": 2.094610636277303e-05, + "loss": 0.017, + "step": 78820 + }, + { + "epoch": 0.5827000975725141, + "grad_norm": 0.07349563390016556, + "learning_rate": 2.0942396723646722e-05, + "loss": 0.0185, + "step": 78830 + }, + { + "epoch": 0.582774016143816, + "grad_norm": 0.08034585416316986, + "learning_rate": 2.093868708452042e-05, + "loss": 0.0193, + "step": 78840 + }, + { + "epoch": 0.5828479347151179, + "grad_norm": 0.07185887545347214, + "learning_rate": 2.0934977445394114e-05, + "loss": 0.016, + "step": 78850 + }, + { + "epoch": 0.5829218532864197, + "grad_norm": 0.06577017903327942, + "learning_rate": 2.0931267806267807e-05, + "loss": 0.0172, + "step": 78860 + }, + { + "epoch": 0.5829957718577216, + "grad_norm": 0.05835145711898804, + "learning_rate": 2.09275581671415e-05, + "loss": 0.0181, + "step": 78870 + }, + { + "epoch": 0.5830696904290233, + "grad_norm": 0.06854193657636642, + "learning_rate": 2.09238485280152e-05, + "loss": 0.0172, + "step": 78880 + }, + { + "epoch": 0.5831436090003252, + "grad_norm": 0.07310711592435837, + "learning_rate": 2.092013888888889e-05, + "loss": 0.0192, + "step": 78890 + }, + { + "epoch": 0.5832175275716271, + "grad_norm": 0.08630604296922684, + "learning_rate": 2.0916429249762584e-05, + "loss": 0.0162, + "step": 78900 + }, + { + "epoch": 0.5832914461429289, + "grad_norm": 0.12226032465696335, + "learning_rate": 2.0912719610636276e-05, + "loss": 0.0167, + "step": 78910 + }, + { + "epoch": 0.5833653647142308, + "grad_norm": 0.07864919304847717, + "learning_rate": 2.0909009971509972e-05, + "loss": 0.0161, + "step": 78920 + }, + { + "epoch": 0.5834392832855326, + "grad_norm": 0.09677169471979141, + "learning_rate": 2.0905300332383668e-05, + "loss": 0.0179, + "step": 78930 + }, + { + "epoch": 0.5835132018568345, + "grad_norm": 0.09349837899208069, + "learning_rate": 2.090159069325736e-05, + "loss": 0.0197, + "step": 78940 + }, + { + "epoch": 0.5835871204281363, + "grad_norm": 0.08077096939086914, + "learning_rate": 2.0897881054131053e-05, + "loss": 0.0159, + "step": 78950 + }, + { + "epoch": 0.5836610389994382, + "grad_norm": 0.06563491374254227, + "learning_rate": 2.089417141500475e-05, + "loss": 0.0172, + "step": 78960 + }, + { + "epoch": 0.5837349575707401, + "grad_norm": 0.060831811279058456, + "learning_rate": 2.0890461775878445e-05, + "loss": 0.0158, + "step": 78970 + }, + { + "epoch": 
0.5838088761420419, + "grad_norm": 0.08679822087287903, + "learning_rate": 2.0886752136752138e-05, + "loss": 0.0161, + "step": 78980 + }, + { + "epoch": 0.5838827947133438, + "grad_norm": 0.07423678040504456, + "learning_rate": 2.0883042497625834e-05, + "loss": 0.0184, + "step": 78990 + }, + { + "epoch": 0.5839567132846456, + "grad_norm": 0.1013701856136322, + "learning_rate": 2.0879332858499526e-05, + "loss": 0.0201, + "step": 79000 + }, + { + "epoch": 0.5840306318559475, + "grad_norm": 0.0821235179901123, + "learning_rate": 2.087562321937322e-05, + "loss": 0.0161, + "step": 79010 + }, + { + "epoch": 0.5841045504272493, + "grad_norm": 0.04412822797894478, + "learning_rate": 2.0871913580246915e-05, + "loss": 0.0161, + "step": 79020 + }, + { + "epoch": 0.5841784689985512, + "grad_norm": 0.07753975689411163, + "learning_rate": 2.086820394112061e-05, + "loss": 0.0179, + "step": 79030 + }, + { + "epoch": 0.5842523875698531, + "grad_norm": 0.08966545760631561, + "learning_rate": 2.0864494301994303e-05, + "loss": 0.0193, + "step": 79040 + }, + { + "epoch": 0.5843263061411549, + "grad_norm": 0.07493555545806885, + "learning_rate": 2.0860784662867996e-05, + "loss": 0.0174, + "step": 79050 + }, + { + "epoch": 0.5844002247124568, + "grad_norm": 0.09181392192840576, + "learning_rate": 2.085707502374169e-05, + "loss": 0.0205, + "step": 79060 + }, + { + "epoch": 0.5844741432837586, + "grad_norm": 0.07202092558145523, + "learning_rate": 2.0853365384615388e-05, + "loss": 0.0198, + "step": 79070 + }, + { + "epoch": 0.5845480618550605, + "grad_norm": 0.07976742833852768, + "learning_rate": 2.084965574548908e-05, + "loss": 0.0172, + "step": 79080 + }, + { + "epoch": 0.5846219804263623, + "grad_norm": 0.08119291812181473, + "learning_rate": 2.0845946106362773e-05, + "loss": 0.0198, + "step": 79090 + }, + { + "epoch": 0.5846958989976642, + "grad_norm": 0.08092097193002701, + "learning_rate": 2.0842236467236466e-05, + "loss": 0.0155, + "step": 79100 + }, + { + "epoch": 0.5847698175689661, + "grad_norm": 0.06528455018997192, + "learning_rate": 2.0838526828110165e-05, + "loss": 0.0172, + "step": 79110 + }, + { + "epoch": 0.5848437361402679, + "grad_norm": 0.07378751039505005, + "learning_rate": 2.0834817188983858e-05, + "loss": 0.0172, + "step": 79120 + }, + { + "epoch": 0.5849176547115698, + "grad_norm": 0.11115199327468872, + "learning_rate": 2.083110754985755e-05, + "loss": 0.0184, + "step": 79130 + }, + { + "epoch": 0.5849915732828715, + "grad_norm": 0.10814355313777924, + "learning_rate": 2.0827397910731243e-05, + "loss": 0.0193, + "step": 79140 + }, + { + "epoch": 0.5850654918541734, + "grad_norm": 0.07689186185598373, + "learning_rate": 2.082368827160494e-05, + "loss": 0.0157, + "step": 79150 + }, + { + "epoch": 0.5851394104254753, + "grad_norm": 0.09198521077632904, + "learning_rate": 2.0819978632478635e-05, + "loss": 0.0177, + "step": 79160 + }, + { + "epoch": 0.5852133289967771, + "grad_norm": 0.04869784787297249, + "learning_rate": 2.0816268993352327e-05, + "loss": 0.0174, + "step": 79170 + }, + { + "epoch": 0.585287247568079, + "grad_norm": 0.09584268927574158, + "learning_rate": 2.0812559354226023e-05, + "loss": 0.0184, + "step": 79180 + }, + { + "epoch": 0.5853611661393808, + "grad_norm": 0.10546091198921204, + "learning_rate": 2.0808849715099716e-05, + "loss": 0.018, + "step": 79190 + }, + { + "epoch": 0.5854350847106827, + "grad_norm": 0.07282552123069763, + "learning_rate": 2.0805140075973412e-05, + "loss": 0.0187, + "step": 79200 + }, + { + "epoch": 0.5855090032819845, + "grad_norm": 
0.07551319152116776, + "learning_rate": 2.0801430436847104e-05, + "loss": 0.0177, + "step": 79210 + }, + { + "epoch": 0.5855829218532864, + "grad_norm": 0.08180715143680573, + "learning_rate": 2.07977207977208e-05, + "loss": 0.0187, + "step": 79220 + }, + { + "epoch": 0.5856568404245883, + "grad_norm": 0.07415753602981567, + "learning_rate": 2.0794011158594493e-05, + "loss": 0.0193, + "step": 79230 + }, + { + "epoch": 0.5857307589958901, + "grad_norm": 0.07355595380067825, + "learning_rate": 2.0790301519468186e-05, + "loss": 0.0172, + "step": 79240 + }, + { + "epoch": 0.585804677567192, + "grad_norm": 0.08238102495670319, + "learning_rate": 2.078659188034188e-05, + "loss": 0.019, + "step": 79250 + }, + { + "epoch": 0.5858785961384938, + "grad_norm": 0.07160931080579758, + "learning_rate": 2.0782882241215577e-05, + "loss": 0.0183, + "step": 79260 + }, + { + "epoch": 0.5859525147097957, + "grad_norm": 0.07847066223621368, + "learning_rate": 2.077917260208927e-05, + "loss": 0.0187, + "step": 79270 + }, + { + "epoch": 0.5860264332810975, + "grad_norm": 0.05874091759324074, + "learning_rate": 2.0775462962962963e-05, + "loss": 0.0168, + "step": 79280 + }, + { + "epoch": 0.5861003518523994, + "grad_norm": 0.08555573225021362, + "learning_rate": 2.0771753323836655e-05, + "loss": 0.0179, + "step": 79290 + }, + { + "epoch": 0.5861742704237013, + "grad_norm": 0.07176050543785095, + "learning_rate": 2.0768043684710355e-05, + "loss": 0.0163, + "step": 79300 + }, + { + "epoch": 0.5862481889950031, + "grad_norm": 0.07672320306301117, + "learning_rate": 2.0764334045584047e-05, + "loss": 0.0186, + "step": 79310 + }, + { + "epoch": 0.586322107566305, + "grad_norm": 0.10934194177389145, + "learning_rate": 2.076062440645774e-05, + "loss": 0.0188, + "step": 79320 + }, + { + "epoch": 0.5863960261376068, + "grad_norm": 0.10554488748311996, + "learning_rate": 2.0756914767331436e-05, + "loss": 0.0155, + "step": 79330 + }, + { + "epoch": 0.5864699447089087, + "grad_norm": 0.06040872260928154, + "learning_rate": 2.075320512820513e-05, + "loss": 0.0143, + "step": 79340 + }, + { + "epoch": 0.5865438632802105, + "grad_norm": 0.10152992606163025, + "learning_rate": 2.0749495489078824e-05, + "loss": 0.0206, + "step": 79350 + }, + { + "epoch": 0.5866177818515124, + "grad_norm": 0.07255376875400543, + "learning_rate": 2.0745785849952517e-05, + "loss": 0.0198, + "step": 79360 + }, + { + "epoch": 0.5866917004228143, + "grad_norm": 0.09351497143507004, + "learning_rate": 2.0742076210826213e-05, + "loss": 0.0184, + "step": 79370 + }, + { + "epoch": 0.5867656189941161, + "grad_norm": 0.1424107849597931, + "learning_rate": 2.0738366571699905e-05, + "loss": 0.0208, + "step": 79380 + }, + { + "epoch": 0.586839537565418, + "grad_norm": 0.09308390319347382, + "learning_rate": 2.07346569325736e-05, + "loss": 0.0185, + "step": 79390 + }, + { + "epoch": 0.5869134561367197, + "grad_norm": 0.06494449079036713, + "learning_rate": 2.0730947293447294e-05, + "loss": 0.0172, + "step": 79400 + }, + { + "epoch": 0.5869873747080216, + "grad_norm": 0.08240706473588943, + "learning_rate": 2.072723765432099e-05, + "loss": 0.0174, + "step": 79410 + }, + { + "epoch": 0.5870612932793235, + "grad_norm": 0.08911719918251038, + "learning_rate": 2.0723528015194682e-05, + "loss": 0.0185, + "step": 79420 + }, + { + "epoch": 0.5871352118506253, + "grad_norm": 0.08278301358222961, + "learning_rate": 2.071981837606838e-05, + "loss": 0.0186, + "step": 79430 + }, + { + "epoch": 0.5872091304219272, + "grad_norm": 0.08317048102617264, + "learning_rate": 
2.071610873694207e-05, + "loss": 0.0195, + "step": 79440 + }, + { + "epoch": 0.587283048993229, + "grad_norm": 0.08313658088445663, + "learning_rate": 2.0712399097815767e-05, + "loss": 0.0187, + "step": 79450 + }, + { + "epoch": 0.5873569675645309, + "grad_norm": 0.09306693822145462, + "learning_rate": 2.070868945868946e-05, + "loss": 0.0167, + "step": 79460 + }, + { + "epoch": 0.5874308861358327, + "grad_norm": 0.08138968795537949, + "learning_rate": 2.0704979819563152e-05, + "loss": 0.0149, + "step": 79470 + }, + { + "epoch": 0.5875048047071346, + "grad_norm": 0.07695699483156204, + "learning_rate": 2.0701270180436848e-05, + "loss": 0.0171, + "step": 79480 + }, + { + "epoch": 0.5875787232784365, + "grad_norm": 0.08786536753177643, + "learning_rate": 2.0697560541310544e-05, + "loss": 0.0175, + "step": 79490 + }, + { + "epoch": 0.5876526418497383, + "grad_norm": 0.06110033392906189, + "learning_rate": 2.0693850902184237e-05, + "loss": 0.0175, + "step": 79500 + }, + { + "epoch": 0.5877265604210402, + "grad_norm": 0.1047593280673027, + "learning_rate": 2.069014126305793e-05, + "loss": 0.0206, + "step": 79510 + }, + { + "epoch": 0.587800478992342, + "grad_norm": 0.07371040433645248, + "learning_rate": 2.0686431623931625e-05, + "loss": 0.0173, + "step": 79520 + }, + { + "epoch": 0.5878743975636439, + "grad_norm": 0.06224002316594124, + "learning_rate": 2.068272198480532e-05, + "loss": 0.0186, + "step": 79530 + }, + { + "epoch": 0.5879483161349457, + "grad_norm": 0.08960548043251038, + "learning_rate": 2.0679012345679014e-05, + "loss": 0.0195, + "step": 79540 + }, + { + "epoch": 0.5880222347062476, + "grad_norm": 0.06475990265607834, + "learning_rate": 2.0675302706552706e-05, + "loss": 0.0176, + "step": 79550 + }, + { + "epoch": 0.5880961532775495, + "grad_norm": 0.07483542710542679, + "learning_rate": 2.0671593067426402e-05, + "loss": 0.0153, + "step": 79560 + }, + { + "epoch": 0.5881700718488513, + "grad_norm": 0.08100137859582901, + "learning_rate": 2.0667883428300098e-05, + "loss": 0.0184, + "step": 79570 + }, + { + "epoch": 0.5882439904201532, + "grad_norm": 0.08143702149391174, + "learning_rate": 2.066417378917379e-05, + "loss": 0.016, + "step": 79580 + }, + { + "epoch": 0.588317908991455, + "grad_norm": 0.09400355070829391, + "learning_rate": 2.0660464150047483e-05, + "loss": 0.0179, + "step": 79590 + }, + { + "epoch": 0.5883918275627569, + "grad_norm": 0.07985514402389526, + "learning_rate": 2.065675451092118e-05, + "loss": 0.0223, + "step": 79600 + }, + { + "epoch": 0.5884657461340588, + "grad_norm": 0.07633242011070251, + "learning_rate": 2.0653044871794872e-05, + "loss": 0.0156, + "step": 79610 + }, + { + "epoch": 0.5885396647053606, + "grad_norm": 0.07383780181407928, + "learning_rate": 2.0649335232668568e-05, + "loss": 0.0162, + "step": 79620 + }, + { + "epoch": 0.5886135832766625, + "grad_norm": 0.0776839628815651, + "learning_rate": 2.064562559354226e-05, + "loss": 0.0174, + "step": 79630 + }, + { + "epoch": 0.5886875018479643, + "grad_norm": 0.07054539769887924, + "learning_rate": 2.0641915954415956e-05, + "loss": 0.0166, + "step": 79640 + }, + { + "epoch": 0.5887614204192662, + "grad_norm": 0.08717437088489532, + "learning_rate": 2.063820631528965e-05, + "loss": 0.0163, + "step": 79650 + }, + { + "epoch": 0.588835338990568, + "grad_norm": 0.09364652633666992, + "learning_rate": 2.0634496676163345e-05, + "loss": 0.0183, + "step": 79660 + }, + { + "epoch": 0.5889092575618698, + "grad_norm": 0.07449059188365936, + "learning_rate": 2.0630787037037038e-05, + "loss": 0.0162, + 
"step": 79670 + }, + { + "epoch": 0.5889831761331717, + "grad_norm": 0.05628354474902153, + "learning_rate": 2.0627077397910734e-05, + "loss": 0.0179, + "step": 79680 + }, + { + "epoch": 0.5890570947044735, + "grad_norm": 0.09294017404317856, + "learning_rate": 2.0623367758784426e-05, + "loss": 0.016, + "step": 79690 + }, + { + "epoch": 0.5891310132757754, + "grad_norm": 0.08671052753925323, + "learning_rate": 2.061965811965812e-05, + "loss": 0.0192, + "step": 79700 + }, + { + "epoch": 0.5892049318470772, + "grad_norm": 0.08170660585165024, + "learning_rate": 2.0615948480531815e-05, + "loss": 0.02, + "step": 79710 + }, + { + "epoch": 0.5892788504183791, + "grad_norm": 0.07939363270998001, + "learning_rate": 2.061223884140551e-05, + "loss": 0.0174, + "step": 79720 + }, + { + "epoch": 0.5893527689896809, + "grad_norm": 0.08600367605686188, + "learning_rate": 2.0608529202279203e-05, + "loss": 0.0163, + "step": 79730 + }, + { + "epoch": 0.5894266875609828, + "grad_norm": 0.08043445646762848, + "learning_rate": 2.0604819563152896e-05, + "loss": 0.0176, + "step": 79740 + }, + { + "epoch": 0.5895006061322847, + "grad_norm": 0.08205698430538177, + "learning_rate": 2.0601109924026592e-05, + "loss": 0.0171, + "step": 79750 + }, + { + "epoch": 0.5895745247035865, + "grad_norm": 0.07406076043844223, + "learning_rate": 2.0597400284900288e-05, + "loss": 0.0193, + "step": 79760 + }, + { + "epoch": 0.5896484432748884, + "grad_norm": 0.07343707233667374, + "learning_rate": 2.059369064577398e-05, + "loss": 0.0166, + "step": 79770 + }, + { + "epoch": 0.5897223618461902, + "grad_norm": 0.07175429910421371, + "learning_rate": 2.0589981006647673e-05, + "loss": 0.0171, + "step": 79780 + }, + { + "epoch": 0.5897962804174921, + "grad_norm": 0.08020445704460144, + "learning_rate": 2.058627136752137e-05, + "loss": 0.0154, + "step": 79790 + }, + { + "epoch": 0.5898701989887939, + "grad_norm": 0.089165598154068, + "learning_rate": 2.0582561728395065e-05, + "loss": 0.0191, + "step": 79800 + }, + { + "epoch": 0.5899441175600958, + "grad_norm": 0.0934486910700798, + "learning_rate": 2.0578852089268757e-05, + "loss": 0.0185, + "step": 79810 + }, + { + "epoch": 0.5900180361313977, + "grad_norm": 0.06639142334461212, + "learning_rate": 2.057514245014245e-05, + "loss": 0.0187, + "step": 79820 + }, + { + "epoch": 0.5900919547026995, + "grad_norm": 0.07277216017246246, + "learning_rate": 2.0571432811016146e-05, + "loss": 0.0169, + "step": 79830 + }, + { + "epoch": 0.5901658732740014, + "grad_norm": 0.08275559544563293, + "learning_rate": 2.056772317188984e-05, + "loss": 0.0205, + "step": 79840 + }, + { + "epoch": 0.5902397918453032, + "grad_norm": 0.07960959523916245, + "learning_rate": 2.0564013532763534e-05, + "loss": 0.0186, + "step": 79850 + }, + { + "epoch": 0.5903137104166051, + "grad_norm": 0.08382745087146759, + "learning_rate": 2.0560303893637227e-05, + "loss": 0.0172, + "step": 79860 + }, + { + "epoch": 0.590387628987907, + "grad_norm": 0.07545735687017441, + "learning_rate": 2.0556594254510923e-05, + "loss": 0.0185, + "step": 79870 + }, + { + "epoch": 0.5904615475592088, + "grad_norm": 0.07180724292993546, + "learning_rate": 2.0552884615384616e-05, + "loss": 0.016, + "step": 79880 + }, + { + "epoch": 0.5905354661305107, + "grad_norm": 0.07791922241449356, + "learning_rate": 2.054917497625831e-05, + "loss": 0.0187, + "step": 79890 + }, + { + "epoch": 0.5906093847018125, + "grad_norm": 0.06921916455030441, + "learning_rate": 2.0545465337132004e-05, + "loss": 0.0195, + "step": 79900 + }, + { + "epoch": 
0.5906833032731144, + "grad_norm": 0.06241946294903755, + "learning_rate": 2.05417556980057e-05, + "loss": 0.0163, + "step": 79910 + }, + { + "epoch": 0.5907572218444161, + "grad_norm": 0.08319985866546631, + "learning_rate": 2.0538046058879393e-05, + "loss": 0.0181, + "step": 79920 + }, + { + "epoch": 0.590831140415718, + "grad_norm": 0.05606095865368843, + "learning_rate": 2.0534336419753085e-05, + "loss": 0.0183, + "step": 79930 + }, + { + "epoch": 0.59090505898702, + "grad_norm": 0.08617392182350159, + "learning_rate": 2.053062678062678e-05, + "loss": 0.0159, + "step": 79940 + }, + { + "epoch": 0.5909789775583217, + "grad_norm": 0.09617938101291656, + "learning_rate": 2.0526917141500477e-05, + "loss": 0.017, + "step": 79950 + }, + { + "epoch": 0.5910528961296236, + "grad_norm": 0.08484801650047302, + "learning_rate": 2.052320750237417e-05, + "loss": 0.0187, + "step": 79960 + }, + { + "epoch": 0.5911268147009254, + "grad_norm": 0.06967838108539581, + "learning_rate": 2.0519497863247862e-05, + "loss": 0.0174, + "step": 79970 + }, + { + "epoch": 0.5912007332722273, + "grad_norm": 0.10206121951341629, + "learning_rate": 2.051578822412156e-05, + "loss": 0.0184, + "step": 79980 + }, + { + "epoch": 0.5912746518435291, + "grad_norm": 0.09198049455881119, + "learning_rate": 2.0512078584995254e-05, + "loss": 0.019, + "step": 79990 + }, + { + "epoch": 0.591348570414831, + "grad_norm": 0.06630239635705948, + "learning_rate": 2.0508368945868947e-05, + "loss": 0.0178, + "step": 80000 + }, + { + "epoch": 0.591348570414831, + "eval_f1": 0.6219793357433911, + "eval_loss": 0.01734846830368042, + "eval_precision": 0.49293097739254205, + "eval_recall": 0.8425598416692311, + "eval_runtime": 2919.4187, + "eval_samples_per_second": 185.357, + "eval_steps_per_second": 2.896, + "step": 80000 + }, + { + "epoch": 0.5914224889861329, + "grad_norm": 0.083350270986557, + "learning_rate": 2.050465930674264e-05, + "loss": 0.0153, + "step": 80010 + }, + { + "epoch": 0.5914964075574347, + "grad_norm": 0.08730822801589966, + "learning_rate": 2.0500949667616335e-05, + "loss": 0.0183, + "step": 80020 + }, + { + "epoch": 0.5915703261287366, + "grad_norm": 0.063474640250206, + "learning_rate": 2.049724002849003e-05, + "loss": 0.0183, + "step": 80030 + }, + { + "epoch": 0.5916442447000384, + "grad_norm": 0.07405190169811249, + "learning_rate": 2.0493530389363724e-05, + "loss": 0.0178, + "step": 80040 + }, + { + "epoch": 0.5917181632713403, + "grad_norm": 0.09950005263090134, + "learning_rate": 2.0489820750237417e-05, + "loss": 0.021, + "step": 80050 + }, + { + "epoch": 0.5917920818426421, + "grad_norm": 0.08904198557138443, + "learning_rate": 2.0486111111111113e-05, + "loss": 0.0186, + "step": 80060 + }, + { + "epoch": 0.591866000413944, + "grad_norm": 0.08764451742172241, + "learning_rate": 2.0482401471984805e-05, + "loss": 0.0177, + "step": 80070 + }, + { + "epoch": 0.5919399189852459, + "grad_norm": 0.07702360302209854, + "learning_rate": 2.04786918328585e-05, + "loss": 0.0196, + "step": 80080 + }, + { + "epoch": 0.5920138375565477, + "grad_norm": 0.07286150008440018, + "learning_rate": 2.0474982193732194e-05, + "loss": 0.0173, + "step": 80090 + }, + { + "epoch": 0.5920877561278496, + "grad_norm": 0.1036733090877533, + "learning_rate": 2.047127255460589e-05, + "loss": 0.0195, + "step": 80100 + }, + { + "epoch": 0.5921616746991514, + "grad_norm": 0.06630636751651764, + "learning_rate": 2.0467562915479582e-05, + "loss": 0.0171, + "step": 80110 + }, + { + "epoch": 0.5922355932704533, + "grad_norm": 0.09572155773639679, + 
"learning_rate": 2.0463853276353278e-05, + "loss": 0.02, + "step": 80120 + }, + { + "epoch": 0.5923095118417552, + "grad_norm": 0.10689663887023926, + "learning_rate": 2.046014363722697e-05, + "loss": 0.0186, + "step": 80130 + }, + { + "epoch": 0.592383430413057, + "grad_norm": 0.09138118475675583, + "learning_rate": 2.0456433998100667e-05, + "loss": 0.0176, + "step": 80140 + }, + { + "epoch": 0.5924573489843589, + "grad_norm": 0.07343235611915588, + "learning_rate": 2.045272435897436e-05, + "loss": 0.0176, + "step": 80150 + }, + { + "epoch": 0.5925312675556607, + "grad_norm": 0.07991829514503479, + "learning_rate": 2.0449014719848052e-05, + "loss": 0.0166, + "step": 80160 + }, + { + "epoch": 0.5926051861269626, + "grad_norm": 0.06455441564321518, + "learning_rate": 2.0445305080721748e-05, + "loss": 0.0174, + "step": 80170 + }, + { + "epoch": 0.5926791046982643, + "grad_norm": 0.09613152593374252, + "learning_rate": 2.0441595441595444e-05, + "loss": 0.0211, + "step": 80180 + }, + { + "epoch": 0.5927530232695662, + "grad_norm": 0.06977544724941254, + "learning_rate": 2.0437885802469136e-05, + "loss": 0.018, + "step": 80190 + }, + { + "epoch": 0.5928269418408681, + "grad_norm": 0.0790705755352974, + "learning_rate": 2.043417616334283e-05, + "loss": 0.0169, + "step": 80200 + }, + { + "epoch": 0.5929008604121699, + "grad_norm": 0.07312140613794327, + "learning_rate": 2.0430466524216525e-05, + "loss": 0.0161, + "step": 80210 + }, + { + "epoch": 0.5929747789834718, + "grad_norm": 0.07595586776733398, + "learning_rate": 2.042675688509022e-05, + "loss": 0.018, + "step": 80220 + }, + { + "epoch": 0.5930486975547736, + "grad_norm": 0.08415449410676956, + "learning_rate": 2.0423047245963913e-05, + "loss": 0.0158, + "step": 80230 + }, + { + "epoch": 0.5931226161260755, + "grad_norm": 0.07658253610134125, + "learning_rate": 2.0419337606837606e-05, + "loss": 0.0165, + "step": 80240 + }, + { + "epoch": 0.5931965346973773, + "grad_norm": 0.09636301547288895, + "learning_rate": 2.0415627967711302e-05, + "loss": 0.0183, + "step": 80250 + }, + { + "epoch": 0.5932704532686792, + "grad_norm": 0.07571303844451904, + "learning_rate": 2.0411918328584998e-05, + "loss": 0.0159, + "step": 80260 + }, + { + "epoch": 0.5933443718399811, + "grad_norm": 0.07676825672388077, + "learning_rate": 2.040820868945869e-05, + "loss": 0.0189, + "step": 80270 + }, + { + "epoch": 0.5934182904112829, + "grad_norm": 0.08915860950946808, + "learning_rate": 2.0404499050332383e-05, + "loss": 0.0175, + "step": 80280 + }, + { + "epoch": 0.5934922089825848, + "grad_norm": 0.06634003669023514, + "learning_rate": 2.040078941120608e-05, + "loss": 0.0185, + "step": 80290 + }, + { + "epoch": 0.5935661275538866, + "grad_norm": 0.08090776950120926, + "learning_rate": 2.039707977207977e-05, + "loss": 0.0201, + "step": 80300 + }, + { + "epoch": 0.5936400461251885, + "grad_norm": 0.08503460884094238, + "learning_rate": 2.0393370132953468e-05, + "loss": 0.0184, + "step": 80310 + }, + { + "epoch": 0.5937139646964903, + "grad_norm": 0.10272512584924698, + "learning_rate": 2.038966049382716e-05, + "loss": 0.017, + "step": 80320 + }, + { + "epoch": 0.5937878832677922, + "grad_norm": 0.09803182631731033, + "learning_rate": 2.0385950854700856e-05, + "loss": 0.0172, + "step": 80330 + }, + { + "epoch": 0.5938618018390941, + "grad_norm": 0.05736514553427696, + "learning_rate": 2.038224121557455e-05, + "loss": 0.0165, + "step": 80340 + }, + { + "epoch": 0.5939357204103959, + "grad_norm": 0.09937681257724762, + "learning_rate": 2.0378531576448245e-05, + "loss": 
0.0199, + "step": 80350 + }, + { + "epoch": 0.5940096389816978, + "grad_norm": 0.0747910812497139, + "learning_rate": 2.0374821937321937e-05, + "loss": 0.0165, + "step": 80360 + }, + { + "epoch": 0.5940835575529996, + "grad_norm": 0.07158013433218002, + "learning_rate": 2.0371112298195633e-05, + "loss": 0.0184, + "step": 80370 + }, + { + "epoch": 0.5941574761243015, + "grad_norm": 0.06482820957899094, + "learning_rate": 2.0367402659069326e-05, + "loss": 0.0164, + "step": 80380 + }, + { + "epoch": 0.5942313946956034, + "grad_norm": 0.09437862038612366, + "learning_rate": 2.036369301994302e-05, + "loss": 0.0161, + "step": 80390 + }, + { + "epoch": 0.5943053132669052, + "grad_norm": 0.0751989334821701, + "learning_rate": 2.0359983380816718e-05, + "loss": 0.0178, + "step": 80400 + }, + { + "epoch": 0.5943792318382071, + "grad_norm": 0.057128626853227615, + "learning_rate": 2.035627374169041e-05, + "loss": 0.0192, + "step": 80410 + }, + { + "epoch": 0.5944531504095089, + "grad_norm": 0.062380481511354446, + "learning_rate": 2.0352564102564103e-05, + "loss": 0.0175, + "step": 80420 + }, + { + "epoch": 0.5945270689808108, + "grad_norm": 0.09042806178331375, + "learning_rate": 2.0348854463437796e-05, + "loss": 0.0186, + "step": 80430 + }, + { + "epoch": 0.5946009875521125, + "grad_norm": 0.06878332048654556, + "learning_rate": 2.034514482431149e-05, + "loss": 0.0184, + "step": 80440 + }, + { + "epoch": 0.5946749061234144, + "grad_norm": 0.09018473327159882, + "learning_rate": 2.0341435185185187e-05, + "loss": 0.0169, + "step": 80450 + }, + { + "epoch": 0.5947488246947163, + "grad_norm": 0.07117787003517151, + "learning_rate": 2.033772554605888e-05, + "loss": 0.02, + "step": 80460 + }, + { + "epoch": 0.5948227432660181, + "grad_norm": 0.09593405574560165, + "learning_rate": 2.0334015906932573e-05, + "loss": 0.0152, + "step": 80470 + }, + { + "epoch": 0.59489666183732, + "grad_norm": 0.10332407057285309, + "learning_rate": 2.033030626780627e-05, + "loss": 0.0185, + "step": 80480 + }, + { + "epoch": 0.5949705804086218, + "grad_norm": 0.09714365005493164, + "learning_rate": 2.0326596628679965e-05, + "loss": 0.0182, + "step": 80490 + }, + { + "epoch": 0.5950444989799237, + "grad_norm": 0.10157936811447144, + "learning_rate": 2.0322886989553657e-05, + "loss": 0.0179, + "step": 80500 + }, + { + "epoch": 0.5951184175512255, + "grad_norm": 0.07549656182527542, + "learning_rate": 2.031917735042735e-05, + "loss": 0.017, + "step": 80510 + }, + { + "epoch": 0.5951923361225274, + "grad_norm": 0.07607118040323257, + "learning_rate": 2.0315467711301046e-05, + "loss": 0.0184, + "step": 80520 + }, + { + "epoch": 0.5952662546938293, + "grad_norm": 0.08963571488857269, + "learning_rate": 2.0311758072174738e-05, + "loss": 0.0168, + "step": 80530 + }, + { + "epoch": 0.5953401732651311, + "grad_norm": 0.07405596226453781, + "learning_rate": 2.0308048433048434e-05, + "loss": 0.0168, + "step": 80540 + }, + { + "epoch": 0.595414091836433, + "grad_norm": 0.09124046564102173, + "learning_rate": 2.030433879392213e-05, + "loss": 0.0193, + "step": 80550 + }, + { + "epoch": 0.5954880104077348, + "grad_norm": 0.07387691736221313, + "learning_rate": 2.0300629154795823e-05, + "loss": 0.0182, + "step": 80560 + }, + { + "epoch": 0.5955619289790367, + "grad_norm": 0.07532805949449539, + "learning_rate": 2.0296919515669515e-05, + "loss": 0.0161, + "step": 80570 + }, + { + "epoch": 0.5956358475503385, + "grad_norm": 0.12049363553524017, + "learning_rate": 2.029320987654321e-05, + "loss": 0.0182, + "step": 80580 + }, + { + "epoch": 
0.5957097661216404, + "grad_norm": 0.13337494432926178, + "learning_rate": 2.0289500237416907e-05, + "loss": 0.019, + "step": 80590 + }, + { + "epoch": 0.5957836846929423, + "grad_norm": 0.09054873883724213, + "learning_rate": 2.02857905982906e-05, + "loss": 0.019, + "step": 80600 + }, + { + "epoch": 0.5958576032642441, + "grad_norm": 0.0526256337761879, + "learning_rate": 2.0282080959164292e-05, + "loss": 0.0175, + "step": 80610 + }, + { + "epoch": 0.595931521835546, + "grad_norm": 0.06578093022108078, + "learning_rate": 2.0278371320037985e-05, + "loss": 0.0162, + "step": 80620 + }, + { + "epoch": 0.5960054404068478, + "grad_norm": 0.07064341008663177, + "learning_rate": 2.0274661680911684e-05, + "loss": 0.0185, + "step": 80630 + }, + { + "epoch": 0.5960793589781497, + "grad_norm": 0.07011621445417404, + "learning_rate": 2.0270952041785377e-05, + "loss": 0.0173, + "step": 80640 + }, + { + "epoch": 0.5961532775494516, + "grad_norm": 0.0939406305551529, + "learning_rate": 2.026724240265907e-05, + "loss": 0.0189, + "step": 80650 + }, + { + "epoch": 0.5962271961207534, + "grad_norm": 0.08310779184103012, + "learning_rate": 2.0263532763532762e-05, + "loss": 0.0153, + "step": 80660 + }, + { + "epoch": 0.5963011146920553, + "grad_norm": 0.07875712215900421, + "learning_rate": 2.0259823124406458e-05, + "loss": 0.0187, + "step": 80670 + }, + { + "epoch": 0.5963750332633571, + "grad_norm": 0.08235958963632584, + "learning_rate": 2.0256113485280154e-05, + "loss": 0.0164, + "step": 80680 + }, + { + "epoch": 0.596448951834659, + "grad_norm": 0.08048681169748306, + "learning_rate": 2.0252403846153847e-05, + "loss": 0.0174, + "step": 80690 + }, + { + "epoch": 0.5965228704059607, + "grad_norm": 0.07104280591011047, + "learning_rate": 2.0248694207027543e-05, + "loss": 0.0165, + "step": 80700 + }, + { + "epoch": 0.5965967889772626, + "grad_norm": 0.09004362672567368, + "learning_rate": 2.0244984567901235e-05, + "loss": 0.016, + "step": 80710 + }, + { + "epoch": 0.5966707075485645, + "grad_norm": 0.0726613849401474, + "learning_rate": 2.024127492877493e-05, + "loss": 0.0187, + "step": 80720 + }, + { + "epoch": 0.5967446261198663, + "grad_norm": 0.07129136472940445, + "learning_rate": 2.0237565289648624e-05, + "loss": 0.017, + "step": 80730 + }, + { + "epoch": 0.5968185446911682, + "grad_norm": 0.07636060565710068, + "learning_rate": 2.023385565052232e-05, + "loss": 0.0168, + "step": 80740 + }, + { + "epoch": 0.59689246326247, + "grad_norm": 0.06925743818283081, + "learning_rate": 2.0230146011396012e-05, + "loss": 0.0158, + "step": 80750 + }, + { + "epoch": 0.5969663818337719, + "grad_norm": 0.07321954518556595, + "learning_rate": 2.0226436372269705e-05, + "loss": 0.0158, + "step": 80760 + }, + { + "epoch": 0.5970403004050737, + "grad_norm": 0.06690855324268341, + "learning_rate": 2.02227267331434e-05, + "loss": 0.0157, + "step": 80770 + }, + { + "epoch": 0.5971142189763756, + "grad_norm": 0.08284644782543182, + "learning_rate": 2.0219017094017097e-05, + "loss": 0.0181, + "step": 80780 + }, + { + "epoch": 0.5971881375476775, + "grad_norm": 0.09947702288627625, + "learning_rate": 2.021530745489079e-05, + "loss": 0.0176, + "step": 80790 + }, + { + "epoch": 0.5972620561189793, + "grad_norm": 0.08260909467935562, + "learning_rate": 2.0211597815764482e-05, + "loss": 0.0158, + "step": 80800 + }, + { + "epoch": 0.5973359746902812, + "grad_norm": 0.06769290566444397, + "learning_rate": 2.0207888176638178e-05, + "loss": 0.0172, + "step": 80810 + }, + { + "epoch": 0.597409893261583, + "grad_norm": 0.08128570765256882, 
+ "learning_rate": 2.0204178537511874e-05, + "loss": 0.0188, + "step": 80820 + }, + { + "epoch": 0.5974838118328849, + "grad_norm": 0.0641016960144043, + "learning_rate": 2.0200468898385566e-05, + "loss": 0.0168, + "step": 80830 + }, + { + "epoch": 0.5975577304041867, + "grad_norm": 0.06617254763841629, + "learning_rate": 2.019675925925926e-05, + "loss": 0.0169, + "step": 80840 + }, + { + "epoch": 0.5976316489754886, + "grad_norm": 0.09616333246231079, + "learning_rate": 2.0193049620132955e-05, + "loss": 0.0212, + "step": 80850 + }, + { + "epoch": 0.5977055675467905, + "grad_norm": 0.0783003717660904, + "learning_rate": 2.018933998100665e-05, + "loss": 0.0169, + "step": 80860 + }, + { + "epoch": 0.5977794861180923, + "grad_norm": 0.07168202847242355, + "learning_rate": 2.0185630341880344e-05, + "loss": 0.0166, + "step": 80870 + }, + { + "epoch": 0.5978534046893942, + "grad_norm": 0.11167970299720764, + "learning_rate": 2.0181920702754036e-05, + "loss": 0.0179, + "step": 80880 + }, + { + "epoch": 0.597927323260696, + "grad_norm": 0.09261804819107056, + "learning_rate": 2.0178211063627732e-05, + "loss": 0.0178, + "step": 80890 + }, + { + "epoch": 0.5980012418319979, + "grad_norm": 0.0917130634188652, + "learning_rate": 2.0174501424501425e-05, + "loss": 0.0193, + "step": 80900 + }, + { + "epoch": 0.5980751604032998, + "grad_norm": 0.07518907636404037, + "learning_rate": 2.017079178537512e-05, + "loss": 0.0171, + "step": 80910 + }, + { + "epoch": 0.5981490789746016, + "grad_norm": 0.07371152192354202, + "learning_rate": 2.0167082146248813e-05, + "loss": 0.0183, + "step": 80920 + }, + { + "epoch": 0.5982229975459035, + "grad_norm": 0.07565153390169144, + "learning_rate": 2.016337250712251e-05, + "loss": 0.0193, + "step": 80930 + }, + { + "epoch": 0.5982969161172053, + "grad_norm": 0.07934678345918655, + "learning_rate": 2.0159662867996202e-05, + "loss": 0.0195, + "step": 80940 + }, + { + "epoch": 0.5983708346885072, + "grad_norm": 0.08630355447530746, + "learning_rate": 2.0155953228869898e-05, + "loss": 0.0166, + "step": 80950 + }, + { + "epoch": 0.598444753259809, + "grad_norm": 0.08241811394691467, + "learning_rate": 2.015224358974359e-05, + "loss": 0.0174, + "step": 80960 + }, + { + "epoch": 0.5985186718311108, + "grad_norm": 0.07798541337251663, + "learning_rate": 2.0148533950617286e-05, + "loss": 0.0199, + "step": 80970 + }, + { + "epoch": 0.5985925904024127, + "grad_norm": 0.11176567524671555, + "learning_rate": 2.014482431149098e-05, + "loss": 0.0189, + "step": 80980 + }, + { + "epoch": 0.5986665089737145, + "grad_norm": 0.11033938825130463, + "learning_rate": 2.014111467236467e-05, + "loss": 0.02, + "step": 80990 + }, + { + "epoch": 0.5987404275450164, + "grad_norm": 0.07284007966518402, + "learning_rate": 2.0137405033238367e-05, + "loss": 0.0167, + "step": 81000 + }, + { + "epoch": 0.5988143461163182, + "grad_norm": 0.05665259063243866, + "learning_rate": 2.0133695394112063e-05, + "loss": 0.0176, + "step": 81010 + }, + { + "epoch": 0.5988882646876201, + "grad_norm": 0.09139882773160934, + "learning_rate": 2.0129985754985756e-05, + "loss": 0.0169, + "step": 81020 + }, + { + "epoch": 0.5989621832589219, + "grad_norm": 0.08543331921100616, + "learning_rate": 2.012627611585945e-05, + "loss": 0.0168, + "step": 81030 + }, + { + "epoch": 0.5990361018302238, + "grad_norm": 0.0778161883354187, + "learning_rate": 2.0122566476733144e-05, + "loss": 0.0171, + "step": 81040 + }, + { + "epoch": 0.5991100204015257, + "grad_norm": 0.07842028886079788, + "learning_rate": 2.011885683760684e-05, + "loss": 
0.0155, + "step": 81050 + }, + { + "epoch": 0.5991839389728275, + "grad_norm": 0.06531845778226852, + "learning_rate": 2.0115147198480533e-05, + "loss": 0.0187, + "step": 81060 + }, + { + "epoch": 0.5992578575441294, + "grad_norm": 0.07770530134439468, + "learning_rate": 2.0111437559354226e-05, + "loss": 0.0165, + "step": 81070 + }, + { + "epoch": 0.5993317761154312, + "grad_norm": 0.0701184794306755, + "learning_rate": 2.010772792022792e-05, + "loss": 0.0158, + "step": 81080 + }, + { + "epoch": 0.5994056946867331, + "grad_norm": 0.07867050170898438, + "learning_rate": 2.0104018281101618e-05, + "loss": 0.0166, + "step": 81090 + }, + { + "epoch": 0.5994796132580349, + "grad_norm": 0.07159710675477982, + "learning_rate": 2.010030864197531e-05, + "loss": 0.0184, + "step": 81100 + }, + { + "epoch": 0.5995535318293368, + "grad_norm": 0.08176423609256744, + "learning_rate": 2.0096599002849003e-05, + "loss": 0.0153, + "step": 81110 + }, + { + "epoch": 0.5996274504006387, + "grad_norm": 0.05352728068828583, + "learning_rate": 2.00928893637227e-05, + "loss": 0.0189, + "step": 81120 + }, + { + "epoch": 0.5997013689719405, + "grad_norm": 0.08773167431354523, + "learning_rate": 2.008917972459639e-05, + "loss": 0.0181, + "step": 81130 + }, + { + "epoch": 0.5997752875432424, + "grad_norm": 0.12341810762882233, + "learning_rate": 2.0085470085470087e-05, + "loss": 0.0191, + "step": 81140 + }, + { + "epoch": 0.5998492061145442, + "grad_norm": 0.08232131600379944, + "learning_rate": 2.008176044634378e-05, + "loss": 0.0169, + "step": 81150 + }, + { + "epoch": 0.5999231246858461, + "grad_norm": 0.1366821676492691, + "learning_rate": 2.0078050807217476e-05, + "loss": 0.0171, + "step": 81160 + }, + { + "epoch": 0.599997043257148, + "grad_norm": 0.07837974280118942, + "learning_rate": 2.007434116809117e-05, + "loss": 0.0184, + "step": 81170 + }, + { + "epoch": 0.6000709618284498, + "grad_norm": 0.08063607662916183, + "learning_rate": 2.0070631528964864e-05, + "loss": 0.0156, + "step": 81180 + }, + { + "epoch": 0.6001448803997517, + "grad_norm": 0.07672372460365295, + "learning_rate": 2.0066921889838557e-05, + "loss": 0.0197, + "step": 81190 + }, + { + "epoch": 0.6002187989710535, + "grad_norm": 0.1100495308637619, + "learning_rate": 2.0063212250712253e-05, + "loss": 0.0216, + "step": 81200 + }, + { + "epoch": 0.6002927175423554, + "grad_norm": 0.09978067874908447, + "learning_rate": 2.0059502611585945e-05, + "loss": 0.0186, + "step": 81210 + }, + { + "epoch": 0.6003666361136571, + "grad_norm": 0.07662851363420486, + "learning_rate": 2.0055792972459638e-05, + "loss": 0.0163, + "step": 81220 + }, + { + "epoch": 0.600440554684959, + "grad_norm": 0.08676115423440933, + "learning_rate": 2.0052083333333334e-05, + "loss": 0.0169, + "step": 81230 + }, + { + "epoch": 0.600514473256261, + "grad_norm": 0.07154218852519989, + "learning_rate": 2.004837369420703e-05, + "loss": 0.0177, + "step": 81240 + }, + { + "epoch": 0.6005883918275627, + "grad_norm": 0.0777452364563942, + "learning_rate": 2.0044664055080723e-05, + "loss": 0.0192, + "step": 81250 + }, + { + "epoch": 0.6006623103988646, + "grad_norm": 0.07932627201080322, + "learning_rate": 2.0040954415954415e-05, + "loss": 0.0174, + "step": 81260 + }, + { + "epoch": 0.6007362289701664, + "grad_norm": 0.07561071217060089, + "learning_rate": 2.003724477682811e-05, + "loss": 0.0159, + "step": 81270 + }, + { + "epoch": 0.6008101475414683, + "grad_norm": 0.07187164574861526, + "learning_rate": 2.0033535137701807e-05, + "loss": 0.0173, + "step": 81280 + }, + { + "epoch": 
0.6008840661127701, + "grad_norm": 0.08690813928842545, + "learning_rate": 2.00298254985755e-05, + "loss": 0.0184, + "step": 81290 + }, + { + "epoch": 0.600957984684072, + "grad_norm": 0.06971681118011475, + "learning_rate": 2.0026115859449192e-05, + "loss": 0.0172, + "step": 81300 + }, + { + "epoch": 0.6010319032553739, + "grad_norm": 0.06447634100914001, + "learning_rate": 2.0022406220322888e-05, + "loss": 0.0185, + "step": 81310 + }, + { + "epoch": 0.6011058218266757, + "grad_norm": 0.09539581835269928, + "learning_rate": 2.0018696581196584e-05, + "loss": 0.018, + "step": 81320 + }, + { + "epoch": 0.6011797403979776, + "grad_norm": 0.05732736736536026, + "learning_rate": 2.0014986942070277e-05, + "loss": 0.0163, + "step": 81330 + }, + { + "epoch": 0.6012536589692794, + "grad_norm": 0.059371933341026306, + "learning_rate": 2.001127730294397e-05, + "loss": 0.0172, + "step": 81340 + }, + { + "epoch": 0.6013275775405813, + "grad_norm": 0.20023475587368011, + "learning_rate": 2.0007567663817665e-05, + "loss": 0.0197, + "step": 81350 + }, + { + "epoch": 0.6014014961118832, + "grad_norm": 0.09249341487884521, + "learning_rate": 2.0003858024691358e-05, + "loss": 0.0175, + "step": 81360 + }, + { + "epoch": 0.601475414683185, + "grad_norm": 0.07810152322053909, + "learning_rate": 2.0000148385565054e-05, + "loss": 0.018, + "step": 81370 + }, + { + "epoch": 0.6015493332544869, + "grad_norm": 0.08528342097997665, + "learning_rate": 1.9996438746438746e-05, + "loss": 0.0163, + "step": 81380 + }, + { + "epoch": 0.6016232518257887, + "grad_norm": 0.05830449238419533, + "learning_rate": 1.9992729107312442e-05, + "loss": 0.0185, + "step": 81390 + }, + { + "epoch": 0.6016971703970906, + "grad_norm": 0.06526520848274231, + "learning_rate": 1.9989019468186135e-05, + "loss": 0.0175, + "step": 81400 + }, + { + "epoch": 0.6017710889683924, + "grad_norm": 0.06398095190525055, + "learning_rate": 1.998530982905983e-05, + "loss": 0.016, + "step": 81410 + }, + { + "epoch": 0.6018450075396943, + "grad_norm": 0.05629360303282738, + "learning_rate": 1.9981600189933523e-05, + "loss": 0.015, + "step": 81420 + }, + { + "epoch": 0.6019189261109962, + "grad_norm": 0.0880153700709343, + "learning_rate": 1.997789055080722e-05, + "loss": 0.0182, + "step": 81430 + }, + { + "epoch": 0.601992844682298, + "grad_norm": 0.10312674194574356, + "learning_rate": 1.9974180911680912e-05, + "loss": 0.0163, + "step": 81440 + }, + { + "epoch": 0.6020667632535999, + "grad_norm": 0.06881557404994965, + "learning_rate": 1.9970471272554605e-05, + "loss": 0.0178, + "step": 81450 + }, + { + "epoch": 0.6021406818249017, + "grad_norm": 0.07419392466545105, + "learning_rate": 1.99667616334283e-05, + "loss": 0.0149, + "step": 81460 + }, + { + "epoch": 0.6022146003962036, + "grad_norm": 0.08452188968658447, + "learning_rate": 1.9963051994301997e-05, + "loss": 0.0162, + "step": 81470 + }, + { + "epoch": 0.6022885189675053, + "grad_norm": 0.08976190537214279, + "learning_rate": 1.995934235517569e-05, + "loss": 0.0179, + "step": 81480 + }, + { + "epoch": 0.6023624375388072, + "grad_norm": 0.05598026141524315, + "learning_rate": 1.995563271604938e-05, + "loss": 0.0152, + "step": 81490 + }, + { + "epoch": 0.6024363561101092, + "grad_norm": 0.08023150265216827, + "learning_rate": 1.9951923076923078e-05, + "loss": 0.0202, + "step": 81500 + }, + { + "epoch": 0.6025102746814109, + "grad_norm": 0.08681504428386688, + "learning_rate": 1.9948213437796774e-05, + "loss": 0.0181, + "step": 81510 + }, + { + "epoch": 0.6025841932527128, + "grad_norm": 
0.1389748752117157, + "learning_rate": 1.9944503798670466e-05, + "loss": 0.0192, + "step": 81520 + }, + { + "epoch": 0.6026581118240146, + "grad_norm": 0.11852090805768967, + "learning_rate": 1.994079415954416e-05, + "loss": 0.0175, + "step": 81530 + }, + { + "epoch": 0.6027320303953165, + "grad_norm": 0.07455288618803024, + "learning_rate": 1.9937084520417855e-05, + "loss": 0.0178, + "step": 81540 + }, + { + "epoch": 0.6028059489666183, + "grad_norm": 0.075650155544281, + "learning_rate": 1.993337488129155e-05, + "loss": 0.0186, + "step": 81550 + }, + { + "epoch": 0.6028798675379202, + "grad_norm": 0.0989387258887291, + "learning_rate": 1.9929665242165243e-05, + "loss": 0.0185, + "step": 81560 + }, + { + "epoch": 0.6029537861092221, + "grad_norm": 0.07722626626491547, + "learning_rate": 1.9925955603038936e-05, + "loss": 0.0168, + "step": 81570 + }, + { + "epoch": 0.6030277046805239, + "grad_norm": 0.05953269824385643, + "learning_rate": 1.9922245963912632e-05, + "loss": 0.0174, + "step": 81580 + }, + { + "epoch": 0.6031016232518258, + "grad_norm": 0.08750326186418533, + "learning_rate": 1.9918536324786324e-05, + "loss": 0.0176, + "step": 81590 + }, + { + "epoch": 0.6031755418231276, + "grad_norm": 0.0637708455324173, + "learning_rate": 1.991482668566002e-05, + "loss": 0.0162, + "step": 81600 + }, + { + "epoch": 0.6032494603944295, + "grad_norm": 0.07842204719781876, + "learning_rate": 1.9911117046533713e-05, + "loss": 0.0145, + "step": 81610 + }, + { + "epoch": 0.6033233789657314, + "grad_norm": 0.07083271443843842, + "learning_rate": 1.990740740740741e-05, + "loss": 0.0179, + "step": 81620 + }, + { + "epoch": 0.6033972975370332, + "grad_norm": 0.04399878531694412, + "learning_rate": 1.99036977682811e-05, + "loss": 0.0173, + "step": 81630 + }, + { + "epoch": 0.6034712161083351, + "grad_norm": 0.08544055372476578, + "learning_rate": 1.9899988129154797e-05, + "loss": 0.0177, + "step": 81640 + }, + { + "epoch": 0.6035451346796369, + "grad_norm": 0.0625818744301796, + "learning_rate": 1.989627849002849e-05, + "loss": 0.0171, + "step": 81650 + }, + { + "epoch": 0.6036190532509388, + "grad_norm": 0.11756463348865509, + "learning_rate": 1.9892568850902186e-05, + "loss": 0.0199, + "step": 81660 + }, + { + "epoch": 0.6036929718222406, + "grad_norm": 0.059727754443883896, + "learning_rate": 1.988885921177588e-05, + "loss": 0.0185, + "step": 81670 + }, + { + "epoch": 0.6037668903935425, + "grad_norm": 0.06694986671209335, + "learning_rate": 1.988514957264957e-05, + "loss": 0.0164, + "step": 81680 + }, + { + "epoch": 0.6038408089648444, + "grad_norm": 0.08270717412233353, + "learning_rate": 1.9881439933523267e-05, + "loss": 0.0175, + "step": 81690 + }, + { + "epoch": 0.6039147275361462, + "grad_norm": 0.11556769162416458, + "learning_rate": 1.9877730294396963e-05, + "loss": 0.018, + "step": 81700 + }, + { + "epoch": 0.6039886461074481, + "grad_norm": 0.06787298619747162, + "learning_rate": 1.9874020655270656e-05, + "loss": 0.0186, + "step": 81710 + }, + { + "epoch": 0.6040625646787499, + "grad_norm": 0.09240677207708359, + "learning_rate": 1.9870311016144348e-05, + "loss": 0.0176, + "step": 81720 + }, + { + "epoch": 0.6041364832500518, + "grad_norm": 0.0830911323428154, + "learning_rate": 1.9866601377018044e-05, + "loss": 0.0161, + "step": 81730 + }, + { + "epoch": 0.6042104018213535, + "grad_norm": 0.08775690943002701, + "learning_rate": 1.986289173789174e-05, + "loss": 0.0197, + "step": 81740 + }, + { + "epoch": 0.6042843203926554, + "grad_norm": 0.09807612001895905, + "learning_rate": 
1.9859182098765433e-05, + "loss": 0.0191, + "step": 81750 + }, + { + "epoch": 0.6043582389639574, + "grad_norm": 0.08120745420455933, + "learning_rate": 1.9855472459639125e-05, + "loss": 0.0173, + "step": 81760 + }, + { + "epoch": 0.6044321575352591, + "grad_norm": 0.08104580640792847, + "learning_rate": 1.985176282051282e-05, + "loss": 0.0227, + "step": 81770 + }, + { + "epoch": 0.604506076106561, + "grad_norm": 0.09069859981536865, + "learning_rate": 1.9848053181386517e-05, + "loss": 0.0202, + "step": 81780 + }, + { + "epoch": 0.6045799946778628, + "grad_norm": 0.06171071156859398, + "learning_rate": 1.984434354226021e-05, + "loss": 0.0161, + "step": 81790 + }, + { + "epoch": 0.6046539132491647, + "grad_norm": 0.09475076198577881, + "learning_rate": 1.9840633903133902e-05, + "loss": 0.0175, + "step": 81800 + }, + { + "epoch": 0.6047278318204665, + "grad_norm": 0.07772573083639145, + "learning_rate": 1.98369242640076e-05, + "loss": 0.0176, + "step": 81810 + }, + { + "epoch": 0.6048017503917684, + "grad_norm": 0.0816749632358551, + "learning_rate": 1.983321462488129e-05, + "loss": 0.0195, + "step": 81820 + }, + { + "epoch": 0.6048756689630703, + "grad_norm": 0.06907261908054352, + "learning_rate": 1.9829504985754987e-05, + "loss": 0.0179, + "step": 81830 + }, + { + "epoch": 0.6049495875343721, + "grad_norm": 0.07266615331172943, + "learning_rate": 1.982579534662868e-05, + "loss": 0.0176, + "step": 81840 + }, + { + "epoch": 0.605023506105674, + "grad_norm": 0.1005522757768631, + "learning_rate": 1.9822085707502376e-05, + "loss": 0.0167, + "step": 81850 + }, + { + "epoch": 0.6050974246769758, + "grad_norm": 0.07866062223911285, + "learning_rate": 1.9818376068376068e-05, + "loss": 0.017, + "step": 81860 + }, + { + "epoch": 0.6051713432482777, + "grad_norm": 0.08237247169017792, + "learning_rate": 1.9814666429249764e-05, + "loss": 0.0163, + "step": 81870 + }, + { + "epoch": 0.6052452618195796, + "grad_norm": 0.06381018459796906, + "learning_rate": 1.9810956790123457e-05, + "loss": 0.0175, + "step": 81880 + }, + { + "epoch": 0.6053191803908814, + "grad_norm": 0.09133058041334152, + "learning_rate": 1.9807247150997153e-05, + "loss": 0.0166, + "step": 81890 + }, + { + "epoch": 0.6053930989621833, + "grad_norm": 0.09503191709518433, + "learning_rate": 1.9803537511870845e-05, + "loss": 0.0172, + "step": 81900 + }, + { + "epoch": 0.6054670175334851, + "grad_norm": 0.09173522889614105, + "learning_rate": 1.9799827872744538e-05, + "loss": 0.0185, + "step": 81910 + }, + { + "epoch": 0.605540936104787, + "grad_norm": 0.10468512773513794, + "learning_rate": 1.9796118233618237e-05, + "loss": 0.0172, + "step": 81920 + }, + { + "epoch": 0.6056148546760888, + "grad_norm": 0.06426660716533661, + "learning_rate": 1.979240859449193e-05, + "loss": 0.0174, + "step": 81930 + }, + { + "epoch": 0.6056887732473907, + "grad_norm": 0.1315983235836029, + "learning_rate": 1.9788698955365622e-05, + "loss": 0.0179, + "step": 81940 + }, + { + "epoch": 0.6057626918186926, + "grad_norm": 0.09482461959123611, + "learning_rate": 1.9784989316239315e-05, + "loss": 0.0173, + "step": 81950 + }, + { + "epoch": 0.6058366103899944, + "grad_norm": 0.07064099609851837, + "learning_rate": 1.9781279677113014e-05, + "loss": 0.0167, + "step": 81960 + }, + { + "epoch": 0.6059105289612963, + "grad_norm": 0.17217977344989777, + "learning_rate": 1.9777570037986707e-05, + "loss": 0.0179, + "step": 81970 + }, + { + "epoch": 0.6059844475325981, + "grad_norm": 0.0605187863111496, + "learning_rate": 1.97738603988604e-05, + "loss": 0.0205, + "step": 
81980 + }, + { + "epoch": 0.6060583661039, + "grad_norm": 0.1200895830988884, + "learning_rate": 1.9770150759734092e-05, + "loss": 0.0159, + "step": 81990 + }, + { + "epoch": 0.6061322846752017, + "grad_norm": 0.08941507339477539, + "learning_rate": 1.9766441120607788e-05, + "loss": 0.0158, + "step": 82000 + }, + { + "epoch": 0.6062062032465036, + "grad_norm": 0.0896550789475441, + "learning_rate": 1.9762731481481484e-05, + "loss": 0.0172, + "step": 82010 + }, + { + "epoch": 0.6062801218178056, + "grad_norm": 0.06692972034215927, + "learning_rate": 1.9759021842355176e-05, + "loss": 0.0176, + "step": 82020 + }, + { + "epoch": 0.6063540403891073, + "grad_norm": 0.05622977018356323, + "learning_rate": 1.975531220322887e-05, + "loss": 0.0176, + "step": 82030 + }, + { + "epoch": 0.6064279589604092, + "grad_norm": 0.10331122577190399, + "learning_rate": 1.9751602564102565e-05, + "loss": 0.0181, + "step": 82040 + }, + { + "epoch": 0.606501877531711, + "grad_norm": 0.09501251578330994, + "learning_rate": 1.9747892924976258e-05, + "loss": 0.0185, + "step": 82050 + }, + { + "epoch": 0.6065757961030129, + "grad_norm": 0.059741511940956116, + "learning_rate": 1.9744183285849954e-05, + "loss": 0.0148, + "step": 82060 + }, + { + "epoch": 0.6066497146743147, + "grad_norm": 0.06147574260830879, + "learning_rate": 1.974047364672365e-05, + "loss": 0.0169, + "step": 82070 + }, + { + "epoch": 0.6067236332456166, + "grad_norm": 0.08298580348491669, + "learning_rate": 1.9736764007597342e-05, + "loss": 0.0203, + "step": 82080 + }, + { + "epoch": 0.6067975518169185, + "grad_norm": 0.07622501999139786, + "learning_rate": 1.9733054368471035e-05, + "loss": 0.0177, + "step": 82090 + }, + { + "epoch": 0.6068714703882203, + "grad_norm": 0.07855395972728729, + "learning_rate": 1.972934472934473e-05, + "loss": 0.0165, + "step": 82100 + }, + { + "epoch": 0.6069453889595222, + "grad_norm": 0.07775694131851196, + "learning_rate": 1.9725635090218427e-05, + "loss": 0.0181, + "step": 82110 + }, + { + "epoch": 0.607019307530824, + "grad_norm": 0.07085943222045898, + "learning_rate": 1.972192545109212e-05, + "loss": 0.018, + "step": 82120 + }, + { + "epoch": 0.6070932261021259, + "grad_norm": 0.08171843737363815, + "learning_rate": 1.9718215811965812e-05, + "loss": 0.0186, + "step": 82130 + }, + { + "epoch": 0.6071671446734278, + "grad_norm": 0.09936392307281494, + "learning_rate": 1.9714506172839504e-05, + "loss": 0.0185, + "step": 82140 + }, + { + "epoch": 0.6072410632447296, + "grad_norm": 0.07809259742498398, + "learning_rate": 1.9710796533713204e-05, + "loss": 0.0189, + "step": 82150 + }, + { + "epoch": 0.6073149818160315, + "grad_norm": 0.10185842961072922, + "learning_rate": 1.9707086894586896e-05, + "loss": 0.0184, + "step": 82160 + }, + { + "epoch": 0.6073889003873333, + "grad_norm": 0.08506341278553009, + "learning_rate": 1.970337725546059e-05, + "loss": 0.0211, + "step": 82170 + }, + { + "epoch": 0.6074628189586352, + "grad_norm": 0.0753314197063446, + "learning_rate": 1.969966761633428e-05, + "loss": 0.0185, + "step": 82180 + }, + { + "epoch": 0.607536737529937, + "grad_norm": 0.08406726270914078, + "learning_rate": 1.969595797720798e-05, + "loss": 0.0173, + "step": 82190 + }, + { + "epoch": 0.6076106561012389, + "grad_norm": 0.07717745751142502, + "learning_rate": 1.9692248338081673e-05, + "loss": 0.0162, + "step": 82200 + }, + { + "epoch": 0.6076845746725408, + "grad_norm": 0.07151953876018524, + "learning_rate": 1.9688538698955366e-05, + "loss": 0.0198, + "step": 82210 + }, + { + "epoch": 0.6077584932438426, + 
"grad_norm": 0.07603351771831512, + "learning_rate": 1.9684829059829062e-05, + "loss": 0.0162, + "step": 82220 + }, + { + "epoch": 0.6078324118151445, + "grad_norm": 0.0788840651512146, + "learning_rate": 1.9681119420702754e-05, + "loss": 0.016, + "step": 82230 + }, + { + "epoch": 0.6079063303864463, + "grad_norm": 0.08803824335336685, + "learning_rate": 1.967740978157645e-05, + "loss": 0.0178, + "step": 82240 + }, + { + "epoch": 0.6079802489577482, + "grad_norm": 0.09685607254505157, + "learning_rate": 1.9673700142450143e-05, + "loss": 0.0173, + "step": 82250 + }, + { + "epoch": 0.60805416752905, + "grad_norm": 0.067377969622612, + "learning_rate": 1.966999050332384e-05, + "loss": 0.0194, + "step": 82260 + }, + { + "epoch": 0.6081280861003519, + "grad_norm": 0.09533512592315674, + "learning_rate": 1.966628086419753e-05, + "loss": 0.0164, + "step": 82270 + }, + { + "epoch": 0.6082020046716538, + "grad_norm": 0.08822904527187347, + "learning_rate": 1.9662571225071228e-05, + "loss": 0.0172, + "step": 82280 + }, + { + "epoch": 0.6082759232429555, + "grad_norm": 0.06911545246839523, + "learning_rate": 1.965886158594492e-05, + "loss": 0.0183, + "step": 82290 + }, + { + "epoch": 0.6083498418142574, + "grad_norm": 0.08410735428333282, + "learning_rate": 1.9655151946818616e-05, + "loss": 0.0181, + "step": 82300 + }, + { + "epoch": 0.6084237603855592, + "grad_norm": 0.07128669321537018, + "learning_rate": 1.965144230769231e-05, + "loss": 0.016, + "step": 82310 + }, + { + "epoch": 0.6084976789568611, + "grad_norm": 0.07649806886911392, + "learning_rate": 1.9647732668566e-05, + "loss": 0.0189, + "step": 82320 + }, + { + "epoch": 0.6085715975281629, + "grad_norm": 0.07032273709774017, + "learning_rate": 1.9644023029439697e-05, + "loss": 0.018, + "step": 82330 + }, + { + "epoch": 0.6086455160994648, + "grad_norm": 0.09526366740465164, + "learning_rate": 1.9640313390313393e-05, + "loss": 0.0182, + "step": 82340 + }, + { + "epoch": 0.6087194346707667, + "grad_norm": 0.0940188616514206, + "learning_rate": 1.9636603751187086e-05, + "loss": 0.0171, + "step": 82350 + }, + { + "epoch": 0.6087933532420685, + "grad_norm": 0.07495230436325073, + "learning_rate": 1.963289411206078e-05, + "loss": 0.0169, + "step": 82360 + }, + { + "epoch": 0.6088672718133704, + "grad_norm": 0.09661861509084702, + "learning_rate": 1.962918447293447e-05, + "loss": 0.0172, + "step": 82370 + }, + { + "epoch": 0.6089411903846722, + "grad_norm": 0.08764498680830002, + "learning_rate": 1.962547483380817e-05, + "loss": 0.0155, + "step": 82380 + }, + { + "epoch": 0.6090151089559741, + "grad_norm": 0.06950998306274414, + "learning_rate": 1.9621765194681863e-05, + "loss": 0.0166, + "step": 82390 + }, + { + "epoch": 0.609089027527276, + "grad_norm": 0.040111687034368515, + "learning_rate": 1.9618055555555555e-05, + "loss": 0.0175, + "step": 82400 + }, + { + "epoch": 0.6091629460985778, + "grad_norm": 0.07911848276853561, + "learning_rate": 1.961434591642925e-05, + "loss": 0.0195, + "step": 82410 + }, + { + "epoch": 0.6092368646698797, + "grad_norm": 0.07476948946714401, + "learning_rate": 1.9610636277302947e-05, + "loss": 0.0178, + "step": 82420 + }, + { + "epoch": 0.6093107832411815, + "grad_norm": 0.1279289871454239, + "learning_rate": 1.960692663817664e-05, + "loss": 0.0174, + "step": 82430 + }, + { + "epoch": 0.6093847018124834, + "grad_norm": 0.0703182965517044, + "learning_rate": 1.9603216999050333e-05, + "loss": 0.0177, + "step": 82440 + }, + { + "epoch": 0.6094586203837852, + "grad_norm": 0.0812310054898262, + "learning_rate": 
1.959950735992403e-05, + "loss": 0.018, + "step": 82450 + }, + { + "epoch": 0.6095325389550871, + "grad_norm": 0.09021375328302383, + "learning_rate": 1.959579772079772e-05, + "loss": 0.0157, + "step": 82460 + }, + { + "epoch": 0.609606457526389, + "grad_norm": 0.09041917324066162, + "learning_rate": 1.9592088081671417e-05, + "loss": 0.0159, + "step": 82470 + }, + { + "epoch": 0.6096803760976908, + "grad_norm": 0.08076930791139603, + "learning_rate": 1.958837844254511e-05, + "loss": 0.0164, + "step": 82480 + }, + { + "epoch": 0.6097542946689927, + "grad_norm": 0.08362049609422684, + "learning_rate": 1.9584668803418806e-05, + "loss": 0.0196, + "step": 82490 + }, + { + "epoch": 0.6098282132402945, + "grad_norm": 0.09628446400165558, + "learning_rate": 1.9580959164292498e-05, + "loss": 0.0177, + "step": 82500 + }, + { + "epoch": 0.6099021318115964, + "grad_norm": 0.07797731459140778, + "learning_rate": 1.9577249525166194e-05, + "loss": 0.0202, + "step": 82510 + }, + { + "epoch": 0.6099760503828981, + "grad_norm": 0.08127598464488983, + "learning_rate": 1.9573539886039887e-05, + "loss": 0.0198, + "step": 82520 + }, + { + "epoch": 0.6100499689542, + "grad_norm": 0.060320500284433365, + "learning_rate": 1.9569830246913583e-05, + "loss": 0.0173, + "step": 82530 + }, + { + "epoch": 0.610123887525502, + "grad_norm": 0.05279785394668579, + "learning_rate": 1.9566120607787275e-05, + "loss": 0.0152, + "step": 82540 + }, + { + "epoch": 0.6101978060968037, + "grad_norm": 0.06562897562980652, + "learning_rate": 1.9562410968660968e-05, + "loss": 0.0186, + "step": 82550 + }, + { + "epoch": 0.6102717246681056, + "grad_norm": 0.08916452527046204, + "learning_rate": 1.9558701329534664e-05, + "loss": 0.0177, + "step": 82560 + }, + { + "epoch": 0.6103456432394074, + "grad_norm": 0.05119860917329788, + "learning_rate": 1.955499169040836e-05, + "loss": 0.017, + "step": 82570 + }, + { + "epoch": 0.6104195618107093, + "grad_norm": 0.07321562618017197, + "learning_rate": 1.9551282051282052e-05, + "loss": 0.0183, + "step": 82580 + }, + { + "epoch": 0.6104934803820111, + "grad_norm": 0.06944682449102402, + "learning_rate": 1.9547572412155745e-05, + "loss": 0.0194, + "step": 82590 + }, + { + "epoch": 0.610567398953313, + "grad_norm": 0.09687232226133347, + "learning_rate": 1.954386277302944e-05, + "loss": 0.0169, + "step": 82600 + }, + { + "epoch": 0.6106413175246149, + "grad_norm": 0.07312414050102234, + "learning_rate": 1.9540153133903137e-05, + "loss": 0.018, + "step": 82610 + }, + { + "epoch": 0.6107152360959167, + "grad_norm": 0.07167432457208633, + "learning_rate": 1.953644349477683e-05, + "loss": 0.0202, + "step": 82620 + }, + { + "epoch": 0.6107891546672186, + "grad_norm": 0.07102657854557037, + "learning_rate": 1.9532733855650522e-05, + "loss": 0.0179, + "step": 82630 + }, + { + "epoch": 0.6108630732385204, + "grad_norm": 0.07418309897184372, + "learning_rate": 1.9529024216524218e-05, + "loss": 0.0152, + "step": 82640 + }, + { + "epoch": 0.6109369918098223, + "grad_norm": 0.0991952046751976, + "learning_rate": 1.9525314577397914e-05, + "loss": 0.0188, + "step": 82650 + }, + { + "epoch": 0.6110109103811242, + "grad_norm": 0.0658475011587143, + "learning_rate": 1.9521604938271607e-05, + "loss": 0.0181, + "step": 82660 + }, + { + "epoch": 0.611084828952426, + "grad_norm": 0.08346283435821533, + "learning_rate": 1.95178952991453e-05, + "loss": 0.0188, + "step": 82670 + }, + { + "epoch": 0.6111587475237279, + "grad_norm": 0.08464224636554718, + "learning_rate": 1.9514185660018995e-05, + "loss": 0.0164, + "step": 
82680 + }, + { + "epoch": 0.6112326660950297, + "grad_norm": 0.06308498233556747, + "learning_rate": 1.9510476020892688e-05, + "loss": 0.0167, + "step": 82690 + }, + { + "epoch": 0.6113065846663316, + "grad_norm": 0.06456910073757172, + "learning_rate": 1.9506766381766384e-05, + "loss": 0.0171, + "step": 82700 + }, + { + "epoch": 0.6113805032376334, + "grad_norm": 0.06762517243623734, + "learning_rate": 1.9503056742640076e-05, + "loss": 0.018, + "step": 82710 + }, + { + "epoch": 0.6114544218089353, + "grad_norm": 0.09544017910957336, + "learning_rate": 1.9499347103513772e-05, + "loss": 0.0149, + "step": 82720 + }, + { + "epoch": 0.6115283403802372, + "grad_norm": 0.07939791679382324, + "learning_rate": 1.9495637464387465e-05, + "loss": 0.0182, + "step": 82730 + }, + { + "epoch": 0.611602258951539, + "grad_norm": 0.07914235442876816, + "learning_rate": 1.949192782526116e-05, + "loss": 0.018, + "step": 82740 + }, + { + "epoch": 0.6116761775228409, + "grad_norm": 0.09582500159740448, + "learning_rate": 1.9488218186134853e-05, + "loss": 0.0168, + "step": 82750 + }, + { + "epoch": 0.6117500960941427, + "grad_norm": 0.07404599338769913, + "learning_rate": 1.948450854700855e-05, + "loss": 0.0198, + "step": 82760 + }, + { + "epoch": 0.6118240146654446, + "grad_norm": 0.07443395256996155, + "learning_rate": 1.9480798907882242e-05, + "loss": 0.0166, + "step": 82770 + }, + { + "epoch": 0.6118979332367463, + "grad_norm": 0.07942095398902893, + "learning_rate": 1.9477089268755934e-05, + "loss": 0.0174, + "step": 82780 + }, + { + "epoch": 0.6119718518080483, + "grad_norm": 0.07103811949491501, + "learning_rate": 1.947337962962963e-05, + "loss": 0.0171, + "step": 82790 + }, + { + "epoch": 0.6120457703793502, + "grad_norm": 0.09235479682683945, + "learning_rate": 1.9469669990503326e-05, + "loss": 0.0196, + "step": 82800 + }, + { + "epoch": 0.6121196889506519, + "grad_norm": 0.08370334655046463, + "learning_rate": 1.946596035137702e-05, + "loss": 0.0175, + "step": 82810 + }, + { + "epoch": 0.6121936075219538, + "grad_norm": 0.09434487670660019, + "learning_rate": 1.946225071225071e-05, + "loss": 0.0174, + "step": 82820 + }, + { + "epoch": 0.6122675260932556, + "grad_norm": 0.0804063156247139, + "learning_rate": 1.9458541073124407e-05, + "loss": 0.0175, + "step": 82830 + }, + { + "epoch": 0.6123414446645575, + "grad_norm": 0.08051107078790665, + "learning_rate": 1.9454831433998103e-05, + "loss": 0.0167, + "step": 82840 + }, + { + "epoch": 0.6124153632358593, + "grad_norm": 0.09527001529932022, + "learning_rate": 1.9451121794871796e-05, + "loss": 0.018, + "step": 82850 + }, + { + "epoch": 0.6124892818071612, + "grad_norm": 0.09901162981987, + "learning_rate": 1.944741215574549e-05, + "loss": 0.0182, + "step": 82860 + }, + { + "epoch": 0.6125632003784631, + "grad_norm": 0.11223585158586502, + "learning_rate": 1.9443702516619185e-05, + "loss": 0.0213, + "step": 82870 + }, + { + "epoch": 0.6126371189497649, + "grad_norm": 0.10098253935575485, + "learning_rate": 1.943999287749288e-05, + "loss": 0.0188, + "step": 82880 + }, + { + "epoch": 0.6127110375210668, + "grad_norm": 0.126252681016922, + "learning_rate": 1.9436283238366573e-05, + "loss": 0.0171, + "step": 82890 + }, + { + "epoch": 0.6127849560923686, + "grad_norm": 0.08134482800960541, + "learning_rate": 1.9432573599240266e-05, + "loss": 0.0178, + "step": 82900 + }, + { + "epoch": 0.6128588746636705, + "grad_norm": 0.06653578579425812, + "learning_rate": 1.942886396011396e-05, + "loss": 0.0183, + "step": 82910 + }, + { + "epoch": 0.6129327932349724, + 
"grad_norm": 0.05563788861036301, + "learning_rate": 1.9425154320987654e-05, + "loss": 0.0182, + "step": 82920 + }, + { + "epoch": 0.6130067118062742, + "grad_norm": 0.07811429351568222, + "learning_rate": 1.942144468186135e-05, + "loss": 0.0154, + "step": 82930 + }, + { + "epoch": 0.6130806303775761, + "grad_norm": 0.0799199566245079, + "learning_rate": 1.9417735042735043e-05, + "loss": 0.02, + "step": 82940 + }, + { + "epoch": 0.6131545489488779, + "grad_norm": 0.08699478954076767, + "learning_rate": 1.941402540360874e-05, + "loss": 0.0184, + "step": 82950 + }, + { + "epoch": 0.6132284675201798, + "grad_norm": 0.08317042887210846, + "learning_rate": 1.941031576448243e-05, + "loss": 0.0163, + "step": 82960 + }, + { + "epoch": 0.6133023860914816, + "grad_norm": 0.08908155560493469, + "learning_rate": 1.9406606125356127e-05, + "loss": 0.017, + "step": 82970 + }, + { + "epoch": 0.6133763046627835, + "grad_norm": 0.0903390496969223, + "learning_rate": 1.940289648622982e-05, + "loss": 0.0178, + "step": 82980 + }, + { + "epoch": 0.6134502232340854, + "grad_norm": 0.0837148055434227, + "learning_rate": 1.9399186847103516e-05, + "loss": 0.0186, + "step": 82990 + }, + { + "epoch": 0.6135241418053872, + "grad_norm": 0.07399854809045792, + "learning_rate": 1.939547720797721e-05, + "loss": 0.0176, + "step": 83000 + }, + { + "epoch": 0.6135980603766891, + "grad_norm": 0.05894295498728752, + "learning_rate": 1.93917675688509e-05, + "loss": 0.0183, + "step": 83010 + }, + { + "epoch": 0.6136719789479909, + "grad_norm": 0.09080424159765244, + "learning_rate": 1.9388057929724597e-05, + "loss": 0.0159, + "step": 83020 + }, + { + "epoch": 0.6137458975192928, + "grad_norm": 0.06074220687150955, + "learning_rate": 1.9384348290598293e-05, + "loss": 0.0194, + "step": 83030 + }, + { + "epoch": 0.6138198160905945, + "grad_norm": 0.07715459167957306, + "learning_rate": 1.9380638651471986e-05, + "loss": 0.0177, + "step": 83040 + }, + { + "epoch": 0.6138937346618965, + "grad_norm": 0.1121053695678711, + "learning_rate": 1.9376929012345678e-05, + "loss": 0.0184, + "step": 83050 + }, + { + "epoch": 0.6139676532331984, + "grad_norm": 0.07647407799959183, + "learning_rate": 1.9373219373219374e-05, + "loss": 0.0174, + "step": 83060 + }, + { + "epoch": 0.6140415718045001, + "grad_norm": 0.08576580882072449, + "learning_rate": 1.936950973409307e-05, + "loss": 0.0185, + "step": 83070 + }, + { + "epoch": 0.614115490375802, + "grad_norm": 0.06949552893638611, + "learning_rate": 1.9365800094966763e-05, + "loss": 0.0175, + "step": 83080 + }, + { + "epoch": 0.6141894089471038, + "grad_norm": 0.05394889414310455, + "learning_rate": 1.9362090455840455e-05, + "loss": 0.016, + "step": 83090 + }, + { + "epoch": 0.6142633275184057, + "grad_norm": 0.1143985167145729, + "learning_rate": 1.935838081671415e-05, + "loss": 0.0194, + "step": 83100 + }, + { + "epoch": 0.6143372460897075, + "grad_norm": 0.07003039121627808, + "learning_rate": 1.9354671177587847e-05, + "loss": 0.0182, + "step": 83110 + }, + { + "epoch": 0.6144111646610094, + "grad_norm": 0.0878969207406044, + "learning_rate": 1.935096153846154e-05, + "loss": 0.0195, + "step": 83120 + }, + { + "epoch": 0.6144850832323113, + "grad_norm": 0.062459979206323624, + "learning_rate": 1.9347251899335232e-05, + "loss": 0.0188, + "step": 83130 + }, + { + "epoch": 0.6145590018036131, + "grad_norm": 0.0716971904039383, + "learning_rate": 1.9343542260208928e-05, + "loss": 0.0168, + "step": 83140 + }, + { + "epoch": 0.614632920374915, + "grad_norm": 0.07524064928293228, + "learning_rate": 
1.933983262108262e-05, + "loss": 0.0187, + "step": 83150 + }, + { + "epoch": 0.6147068389462168, + "grad_norm": 0.08315975219011307, + "learning_rate": 1.9336122981956317e-05, + "loss": 0.0182, + "step": 83160 + }, + { + "epoch": 0.6147807575175187, + "grad_norm": 0.07940053939819336, + "learning_rate": 1.933241334283001e-05, + "loss": 0.0162, + "step": 83170 + }, + { + "epoch": 0.6148546760888206, + "grad_norm": 0.06539808958768845, + "learning_rate": 1.9328703703703705e-05, + "loss": 0.0167, + "step": 83180 + }, + { + "epoch": 0.6149285946601224, + "grad_norm": 0.09697232395410538, + "learning_rate": 1.9324994064577398e-05, + "loss": 0.0189, + "step": 83190 + }, + { + "epoch": 0.6150025132314243, + "grad_norm": 0.09977428615093231, + "learning_rate": 1.9321284425451094e-05, + "loss": 0.0189, + "step": 83200 + }, + { + "epoch": 0.6150764318027261, + "grad_norm": 0.0542060025036335, + "learning_rate": 1.9317574786324786e-05, + "loss": 0.017, + "step": 83210 + }, + { + "epoch": 0.615150350374028, + "grad_norm": 0.0710158422589302, + "learning_rate": 1.9313865147198482e-05, + "loss": 0.0178, + "step": 83220 + }, + { + "epoch": 0.6152242689453298, + "grad_norm": 0.07952819764614105, + "learning_rate": 1.9310155508072175e-05, + "loss": 0.0179, + "step": 83230 + }, + { + "epoch": 0.6152981875166317, + "grad_norm": 0.0634431466460228, + "learning_rate": 1.9306445868945868e-05, + "loss": 0.0177, + "step": 83240 + }, + { + "epoch": 0.6153721060879336, + "grad_norm": 0.07674331963062286, + "learning_rate": 1.9302736229819564e-05, + "loss": 0.0183, + "step": 83250 + }, + { + "epoch": 0.6154460246592354, + "grad_norm": 0.10375528037548065, + "learning_rate": 1.929902659069326e-05, + "loss": 0.0164, + "step": 83260 + }, + { + "epoch": 0.6155199432305373, + "grad_norm": 0.06967686861753464, + "learning_rate": 1.9295316951566952e-05, + "loss": 0.02, + "step": 83270 + }, + { + "epoch": 0.6155938618018391, + "grad_norm": 0.08993130922317505, + "learning_rate": 1.9291607312440645e-05, + "loss": 0.0188, + "step": 83280 + }, + { + "epoch": 0.615667780373141, + "grad_norm": 0.06792949140071869, + "learning_rate": 1.928789767331434e-05, + "loss": 0.0176, + "step": 83290 + }, + { + "epoch": 0.6157416989444428, + "grad_norm": 0.07722378522157669, + "learning_rate": 1.9284188034188037e-05, + "loss": 0.0169, + "step": 83300 + }, + { + "epoch": 0.6158156175157447, + "grad_norm": 0.0698406994342804, + "learning_rate": 1.928047839506173e-05, + "loss": 0.0188, + "step": 83310 + }, + { + "epoch": 0.6158895360870466, + "grad_norm": 0.07145354151725769, + "learning_rate": 1.9276768755935422e-05, + "loss": 0.017, + "step": 83320 + }, + { + "epoch": 0.6159634546583483, + "grad_norm": 0.06953489035367966, + "learning_rate": 1.9273059116809118e-05, + "loss": 0.0156, + "step": 83330 + }, + { + "epoch": 0.6160373732296502, + "grad_norm": 0.09008972346782684, + "learning_rate": 1.9269349477682814e-05, + "loss": 0.0169, + "step": 83340 + }, + { + "epoch": 0.616111291800952, + "grad_norm": 0.07456735521554947, + "learning_rate": 1.9265639838556506e-05, + "loss": 0.0152, + "step": 83350 + }, + { + "epoch": 0.6161852103722539, + "grad_norm": 0.07110676914453506, + "learning_rate": 1.92619301994302e-05, + "loss": 0.0196, + "step": 83360 + }, + { + "epoch": 0.6162591289435558, + "grad_norm": 0.06726035475730896, + "learning_rate": 1.9258220560303895e-05, + "loss": 0.0178, + "step": 83370 + }, + { + "epoch": 0.6163330475148576, + "grad_norm": 0.07207488268613815, + "learning_rate": 1.9254510921177587e-05, + "loss": 0.0184, + "step": 
83380 + }, + { + "epoch": 0.6164069660861595, + "grad_norm": 0.06204327568411827, + "learning_rate": 1.9250801282051283e-05, + "loss": 0.0159, + "step": 83390 + }, + { + "epoch": 0.6164808846574613, + "grad_norm": 0.13019336760044098, + "learning_rate": 1.9247091642924976e-05, + "loss": 0.018, + "step": 83400 + }, + { + "epoch": 0.6165548032287632, + "grad_norm": 0.08293792605400085, + "learning_rate": 1.9243382003798672e-05, + "loss": 0.0192, + "step": 83410 + }, + { + "epoch": 0.616628721800065, + "grad_norm": 0.05020857974886894, + "learning_rate": 1.9239672364672364e-05, + "loss": 0.0156, + "step": 83420 + }, + { + "epoch": 0.6167026403713669, + "grad_norm": 0.06632091850042343, + "learning_rate": 1.923596272554606e-05, + "loss": 0.0174, + "step": 83430 + }, + { + "epoch": 0.6167765589426688, + "grad_norm": 0.11432168632745743, + "learning_rate": 1.9232253086419756e-05, + "loss": 0.0182, + "step": 83440 + }, + { + "epoch": 0.6168504775139706, + "grad_norm": 0.08506050705909729, + "learning_rate": 1.922854344729345e-05, + "loss": 0.0181, + "step": 83450 + }, + { + "epoch": 0.6169243960852725, + "grad_norm": 0.08166781812906265, + "learning_rate": 1.922483380816714e-05, + "loss": 0.0191, + "step": 83460 + }, + { + "epoch": 0.6169983146565743, + "grad_norm": 0.06657709181308746, + "learning_rate": 1.9221124169040834e-05, + "loss": 0.0181, + "step": 83470 + }, + { + "epoch": 0.6170722332278762, + "grad_norm": 0.06887469440698624, + "learning_rate": 1.9217414529914534e-05, + "loss": 0.0178, + "step": 83480 + }, + { + "epoch": 0.617146151799178, + "grad_norm": 0.07855060696601868, + "learning_rate": 1.9213704890788226e-05, + "loss": 0.017, + "step": 83490 + }, + { + "epoch": 0.6172200703704799, + "grad_norm": 0.0891873836517334, + "learning_rate": 1.920999525166192e-05, + "loss": 0.0179, + "step": 83500 + }, + { + "epoch": 0.6172939889417818, + "grad_norm": 0.1001485213637352, + "learning_rate": 1.920628561253561e-05, + "loss": 0.0173, + "step": 83510 + }, + { + "epoch": 0.6173679075130836, + "grad_norm": 0.0711141899228096, + "learning_rate": 1.9202575973409307e-05, + "loss": 0.0182, + "step": 83520 + }, + { + "epoch": 0.6174418260843855, + "grad_norm": 0.07405520975589752, + "learning_rate": 1.9198866334283003e-05, + "loss": 0.0177, + "step": 83530 + }, + { + "epoch": 0.6175157446556873, + "grad_norm": 0.06708526611328125, + "learning_rate": 1.9195156695156696e-05, + "loss": 0.0194, + "step": 83540 + }, + { + "epoch": 0.6175896632269892, + "grad_norm": 0.09064970910549164, + "learning_rate": 1.919144705603039e-05, + "loss": 0.0197, + "step": 83550 + }, + { + "epoch": 0.617663581798291, + "grad_norm": 0.07799121737480164, + "learning_rate": 1.9187737416904084e-05, + "loss": 0.0188, + "step": 83560 + }, + { + "epoch": 0.6177375003695929, + "grad_norm": 0.049574632197618484, + "learning_rate": 1.918402777777778e-05, + "loss": 0.0173, + "step": 83570 + }, + { + "epoch": 0.6178114189408948, + "grad_norm": 0.06921504437923431, + "learning_rate": 1.9180318138651473e-05, + "loss": 0.0188, + "step": 83580 + }, + { + "epoch": 0.6178853375121965, + "grad_norm": 0.06067037582397461, + "learning_rate": 1.917660849952517e-05, + "loss": 0.0186, + "step": 83590 + }, + { + "epoch": 0.6179592560834984, + "grad_norm": 0.09824355691671371, + "learning_rate": 1.917289886039886e-05, + "loss": 0.0183, + "step": 83600 + }, + { + "epoch": 0.6180331746548002, + "grad_norm": 0.07331540435552597, + "learning_rate": 1.9169189221272554e-05, + "loss": 0.0155, + "step": 83610 + }, + { + "epoch": 0.6181070932261021, + 
"grad_norm": 0.09476131200790405, + "learning_rate": 1.916547958214625e-05, + "loss": 0.0176, + "step": 83620 + }, + { + "epoch": 0.618181011797404, + "grad_norm": 0.07724323868751526, + "learning_rate": 1.9161769943019946e-05, + "loss": 0.015, + "step": 83630 + }, + { + "epoch": 0.6182549303687058, + "grad_norm": 0.08021007478237152, + "learning_rate": 1.915806030389364e-05, + "loss": 0.0177, + "step": 83640 + }, + { + "epoch": 0.6183288489400077, + "grad_norm": 0.07071489095687866, + "learning_rate": 1.915435066476733e-05, + "loss": 0.0186, + "step": 83650 + }, + { + "epoch": 0.6184027675113095, + "grad_norm": 0.10465199500322342, + "learning_rate": 1.9150641025641027e-05, + "loss": 0.0179, + "step": 83660 + }, + { + "epoch": 0.6184766860826114, + "grad_norm": 0.09959176927804947, + "learning_rate": 1.9146931386514723e-05, + "loss": 0.0177, + "step": 83670 + }, + { + "epoch": 0.6185506046539132, + "grad_norm": 0.0952177420258522, + "learning_rate": 1.9143221747388416e-05, + "loss": 0.0193, + "step": 83680 + }, + { + "epoch": 0.6186245232252151, + "grad_norm": 0.08493653684854507, + "learning_rate": 1.9139512108262108e-05, + "loss": 0.0165, + "step": 83690 + }, + { + "epoch": 0.618698441796517, + "grad_norm": 0.07825871556997299, + "learning_rate": 1.91358024691358e-05, + "loss": 0.0164, + "step": 83700 + }, + { + "epoch": 0.6187723603678188, + "grad_norm": 0.09615447372198105, + "learning_rate": 1.91320928300095e-05, + "loss": 0.0186, + "step": 83710 + }, + { + "epoch": 0.6188462789391207, + "grad_norm": 0.06349712610244751, + "learning_rate": 1.9128383190883193e-05, + "loss": 0.0145, + "step": 83720 + }, + { + "epoch": 0.6189201975104225, + "grad_norm": 0.09628452360630035, + "learning_rate": 1.9124673551756885e-05, + "loss": 0.0161, + "step": 83730 + }, + { + "epoch": 0.6189941160817244, + "grad_norm": 0.07538719475269318, + "learning_rate": 1.9120963912630578e-05, + "loss": 0.0181, + "step": 83740 + }, + { + "epoch": 0.6190680346530262, + "grad_norm": 0.0743245929479599, + "learning_rate": 1.9117254273504274e-05, + "loss": 0.0189, + "step": 83750 + }, + { + "epoch": 0.6191419532243281, + "grad_norm": 0.08122391253709793, + "learning_rate": 1.911354463437797e-05, + "loss": 0.0199, + "step": 83760 + }, + { + "epoch": 0.61921587179563, + "grad_norm": 0.06733874976634979, + "learning_rate": 1.9109834995251662e-05, + "loss": 0.0168, + "step": 83770 + }, + { + "epoch": 0.6192897903669318, + "grad_norm": 0.06824064254760742, + "learning_rate": 1.910612535612536e-05, + "loss": 0.0174, + "step": 83780 + }, + { + "epoch": 0.6193637089382337, + "grad_norm": 0.06514638662338257, + "learning_rate": 1.910241571699905e-05, + "loss": 0.0176, + "step": 83790 + }, + { + "epoch": 0.6194376275095355, + "grad_norm": 0.08393329381942749, + "learning_rate": 1.9098706077872747e-05, + "loss": 0.0172, + "step": 83800 + }, + { + "epoch": 0.6195115460808374, + "grad_norm": 0.0601193904876709, + "learning_rate": 1.909499643874644e-05, + "loss": 0.0168, + "step": 83810 + }, + { + "epoch": 0.6195854646521392, + "grad_norm": 0.09605354815721512, + "learning_rate": 1.9091286799620135e-05, + "loss": 0.019, + "step": 83820 + }, + { + "epoch": 0.619659383223441, + "grad_norm": 0.08356858789920807, + "learning_rate": 1.9087577160493828e-05, + "loss": 0.0187, + "step": 83830 + }, + { + "epoch": 0.619733301794743, + "grad_norm": 0.09216849505901337, + "learning_rate": 1.908386752136752e-05, + "loss": 0.018, + "step": 83840 + }, + { + "epoch": 0.6198072203660447, + "grad_norm": 0.07293610274791718, + "learning_rate": 
1.9080157882241217e-05, + "loss": 0.0164, + "step": 83850 + }, + { + "epoch": 0.6198811389373466, + "grad_norm": 0.09722217172384262, + "learning_rate": 1.9076448243114913e-05, + "loss": 0.0199, + "step": 83860 + }, + { + "epoch": 0.6199550575086484, + "grad_norm": 0.060842424631118774, + "learning_rate": 1.9072738603988605e-05, + "loss": 0.0167, + "step": 83870 + }, + { + "epoch": 0.6200289760799503, + "grad_norm": 0.08243682980537415, + "learning_rate": 1.9069028964862298e-05, + "loss": 0.0173, + "step": 83880 + }, + { + "epoch": 0.6201028946512522, + "grad_norm": 0.09843390434980392, + "learning_rate": 1.9065319325735994e-05, + "loss": 0.0192, + "step": 83890 + }, + { + "epoch": 0.620176813222554, + "grad_norm": 0.0666450783610344, + "learning_rate": 1.906160968660969e-05, + "loss": 0.0184, + "step": 83900 + }, + { + "epoch": 0.6202507317938559, + "grad_norm": 0.07498643547296524, + "learning_rate": 1.9057900047483382e-05, + "loss": 0.0173, + "step": 83910 + }, + { + "epoch": 0.6203246503651577, + "grad_norm": 0.0821881964802742, + "learning_rate": 1.9054190408357075e-05, + "loss": 0.0172, + "step": 83920 + }, + { + "epoch": 0.6203985689364596, + "grad_norm": 0.06200087442994118, + "learning_rate": 1.905048076923077e-05, + "loss": 0.0167, + "step": 83930 + }, + { + "epoch": 0.6204724875077614, + "grad_norm": 0.08189740777015686, + "learning_rate": 1.9046771130104467e-05, + "loss": 0.0199, + "step": 83940 + }, + { + "epoch": 0.6205464060790633, + "grad_norm": 0.08127987384796143, + "learning_rate": 1.904306149097816e-05, + "loss": 0.0188, + "step": 83950 + }, + { + "epoch": 0.6206203246503652, + "grad_norm": 0.08441877365112305, + "learning_rate": 1.9039351851851852e-05, + "loss": 0.0167, + "step": 83960 + }, + { + "epoch": 0.620694243221667, + "grad_norm": 0.04807879403233528, + "learning_rate": 1.9035642212725548e-05, + "loss": 0.0168, + "step": 83970 + }, + { + "epoch": 0.6207681617929689, + "grad_norm": 0.07149063050746918, + "learning_rate": 1.903193257359924e-05, + "loss": 0.0181, + "step": 83980 + }, + { + "epoch": 0.6208420803642707, + "grad_norm": 0.08318059891462326, + "learning_rate": 1.9028222934472936e-05, + "loss": 0.0182, + "step": 83990 + }, + { + "epoch": 0.6209159989355726, + "grad_norm": 0.08298517763614655, + "learning_rate": 1.902451329534663e-05, + "loss": 0.0185, + "step": 84000 + }, + { + "epoch": 0.6209899175068744, + "grad_norm": 0.06689772754907608, + "learning_rate": 1.9020803656220325e-05, + "loss": 0.0152, + "step": 84010 + }, + { + "epoch": 0.6210638360781763, + "grad_norm": 0.1090644896030426, + "learning_rate": 1.9017094017094017e-05, + "loss": 0.0203, + "step": 84020 + }, + { + "epoch": 0.6211377546494782, + "grad_norm": 0.09569858759641647, + "learning_rate": 1.9013384377967713e-05, + "loss": 0.0185, + "step": 84030 + }, + { + "epoch": 0.62121167322078, + "grad_norm": 0.07898285239934921, + "learning_rate": 1.9009674738841406e-05, + "loss": 0.0158, + "step": 84040 + }, + { + "epoch": 0.6212855917920819, + "grad_norm": 0.08022938668727875, + "learning_rate": 1.9005965099715102e-05, + "loss": 0.0172, + "step": 84050 + }, + { + "epoch": 0.6213595103633837, + "grad_norm": 0.07045149058103561, + "learning_rate": 1.9002255460588795e-05, + "loss": 0.0182, + "step": 84060 + }, + { + "epoch": 0.6214334289346856, + "grad_norm": 0.09920522570610046, + "learning_rate": 1.8998545821462487e-05, + "loss": 0.017, + "step": 84070 + }, + { + "epoch": 0.6215073475059874, + "grad_norm": 0.10555408149957657, + "learning_rate": 1.8994836182336183e-05, + "loss": 0.0209, + 
"step": 84080 + }, + { + "epoch": 0.6215812660772893, + "grad_norm": 0.06650474667549133, + "learning_rate": 1.899112654320988e-05, + "loss": 0.0176, + "step": 84090 + }, + { + "epoch": 0.6216551846485912, + "grad_norm": 0.07660243660211563, + "learning_rate": 1.898741690408357e-05, + "loss": 0.0179, + "step": 84100 + }, + { + "epoch": 0.6217291032198929, + "grad_norm": 0.07393896579742432, + "learning_rate": 1.8983707264957264e-05, + "loss": 0.0171, + "step": 84110 + }, + { + "epoch": 0.6218030217911948, + "grad_norm": 0.09908831864595413, + "learning_rate": 1.897999762583096e-05, + "loss": 0.0198, + "step": 84120 + }, + { + "epoch": 0.6218769403624966, + "grad_norm": 0.07033517956733704, + "learning_rate": 1.8976287986704656e-05, + "loss": 0.0192, + "step": 84130 + }, + { + "epoch": 0.6219508589337985, + "grad_norm": 0.0860503613948822, + "learning_rate": 1.897257834757835e-05, + "loss": 0.0177, + "step": 84140 + }, + { + "epoch": 0.6220247775051004, + "grad_norm": 0.10883115977048874, + "learning_rate": 1.896886870845204e-05, + "loss": 0.0158, + "step": 84150 + }, + { + "epoch": 0.6220986960764022, + "grad_norm": 0.07544694095849991, + "learning_rate": 1.8965159069325737e-05, + "loss": 0.0195, + "step": 84160 + }, + { + "epoch": 0.6221726146477041, + "grad_norm": 0.06770365685224533, + "learning_rate": 1.8961449430199433e-05, + "loss": 0.017, + "step": 84170 + }, + { + "epoch": 0.6222465332190059, + "grad_norm": 0.09239034354686737, + "learning_rate": 1.8957739791073126e-05, + "loss": 0.0165, + "step": 84180 + }, + { + "epoch": 0.6223204517903078, + "grad_norm": 0.0541006475687027, + "learning_rate": 1.895403015194682e-05, + "loss": 0.0172, + "step": 84190 + }, + { + "epoch": 0.6223943703616096, + "grad_norm": 0.08181953430175781, + "learning_rate": 1.8950320512820514e-05, + "loss": 0.0184, + "step": 84200 + }, + { + "epoch": 0.6224682889329115, + "grad_norm": 0.085248202085495, + "learning_rate": 1.8946610873694207e-05, + "loss": 0.0183, + "step": 84210 + }, + { + "epoch": 0.6225422075042134, + "grad_norm": 0.07147057354450226, + "learning_rate": 1.8942901234567903e-05, + "loss": 0.0165, + "step": 84220 + }, + { + "epoch": 0.6226161260755152, + "grad_norm": 0.08153613656759262, + "learning_rate": 1.8939191595441596e-05, + "loss": 0.0174, + "step": 84230 + }, + { + "epoch": 0.6226900446468171, + "grad_norm": 0.07802841067314148, + "learning_rate": 1.893548195631529e-05, + "loss": 0.018, + "step": 84240 + }, + { + "epoch": 0.6227639632181189, + "grad_norm": 0.08233258128166199, + "learning_rate": 1.8931772317188984e-05, + "loss": 0.0158, + "step": 84250 + }, + { + "epoch": 0.6228378817894208, + "grad_norm": 0.08503064513206482, + "learning_rate": 1.892806267806268e-05, + "loss": 0.0135, + "step": 84260 + }, + { + "epoch": 0.6229118003607226, + "grad_norm": 0.07782771438360214, + "learning_rate": 1.8924353038936373e-05, + "loss": 0.019, + "step": 84270 + }, + { + "epoch": 0.6229857189320245, + "grad_norm": 0.07316979765892029, + "learning_rate": 1.892064339981007e-05, + "loss": 0.0154, + "step": 84280 + }, + { + "epoch": 0.6230596375033264, + "grad_norm": 0.06318678706884384, + "learning_rate": 1.891693376068376e-05, + "loss": 0.0154, + "step": 84290 + }, + { + "epoch": 0.6231335560746282, + "grad_norm": 0.09239519387483597, + "learning_rate": 1.8913224121557454e-05, + "loss": 0.016, + "step": 84300 + }, + { + "epoch": 0.6232074746459301, + "grad_norm": 0.0915965810418129, + "learning_rate": 1.890951448243115e-05, + "loss": 0.016, + "step": 84310 + }, + { + "epoch": 0.6232813932172319, + 
"grad_norm": 0.07295511662960052, + "learning_rate": 1.8905804843304846e-05, + "loss": 0.0182, + "step": 84320 + }, + { + "epoch": 0.6233553117885338, + "grad_norm": 0.09266003966331482, + "learning_rate": 1.8902095204178538e-05, + "loss": 0.0161, + "step": 84330 + }, + { + "epoch": 0.6234292303598356, + "grad_norm": 0.08012060821056366, + "learning_rate": 1.889838556505223e-05, + "loss": 0.0175, + "step": 84340 + }, + { + "epoch": 0.6235031489311375, + "grad_norm": 0.10466712713241577, + "learning_rate": 1.8894675925925927e-05, + "loss": 0.0177, + "step": 84350 + }, + { + "epoch": 0.6235770675024394, + "grad_norm": 0.08725877106189728, + "learning_rate": 1.8890966286799623e-05, + "loss": 0.0171, + "step": 84360 + }, + { + "epoch": 0.6236509860737411, + "grad_norm": 0.13205556571483612, + "learning_rate": 1.8887256647673315e-05, + "loss": 0.0152, + "step": 84370 + }, + { + "epoch": 0.623724904645043, + "grad_norm": 0.09634332358837128, + "learning_rate": 1.8883547008547008e-05, + "loss": 0.0183, + "step": 84380 + }, + { + "epoch": 0.6237988232163448, + "grad_norm": 0.08334216475486755, + "learning_rate": 1.8879837369420704e-05, + "loss": 0.0159, + "step": 84390 + }, + { + "epoch": 0.6238727417876467, + "grad_norm": 0.06158098205924034, + "learning_rate": 1.88761277302944e-05, + "loss": 0.0161, + "step": 84400 + }, + { + "epoch": 0.6239466603589486, + "grad_norm": 0.08288898319005966, + "learning_rate": 1.8872418091168092e-05, + "loss": 0.0183, + "step": 84410 + }, + { + "epoch": 0.6240205789302504, + "grad_norm": 0.10982749611139297, + "learning_rate": 1.8868708452041785e-05, + "loss": 0.019, + "step": 84420 + }, + { + "epoch": 0.6240944975015523, + "grad_norm": 0.07673147320747375, + "learning_rate": 1.886499881291548e-05, + "loss": 0.0165, + "step": 84430 + }, + { + "epoch": 0.6241684160728541, + "grad_norm": 0.0847049355506897, + "learning_rate": 1.8861289173789174e-05, + "loss": 0.0153, + "step": 84440 + }, + { + "epoch": 0.624242334644156, + "grad_norm": 0.13826969265937805, + "learning_rate": 1.885757953466287e-05, + "loss": 0.0198, + "step": 84450 + }, + { + "epoch": 0.6243162532154578, + "grad_norm": 0.08034597337245941, + "learning_rate": 1.8853869895536562e-05, + "loss": 0.0162, + "step": 84460 + }, + { + "epoch": 0.6243901717867597, + "grad_norm": 0.06714905798435211, + "learning_rate": 1.8850160256410258e-05, + "loss": 0.0172, + "step": 84470 + }, + { + "epoch": 0.6244640903580616, + "grad_norm": 0.09450197964906693, + "learning_rate": 1.884645061728395e-05, + "loss": 0.0189, + "step": 84480 + }, + { + "epoch": 0.6245380089293634, + "grad_norm": 0.07406625896692276, + "learning_rate": 1.8842740978157647e-05, + "loss": 0.0159, + "step": 84490 + }, + { + "epoch": 0.6246119275006653, + "grad_norm": 0.07484360784292221, + "learning_rate": 1.883903133903134e-05, + "loss": 0.019, + "step": 84500 + }, + { + "epoch": 0.6246858460719671, + "grad_norm": 0.0986456349492073, + "learning_rate": 1.8835321699905035e-05, + "loss": 0.0177, + "step": 84510 + }, + { + "epoch": 0.624759764643269, + "grad_norm": 0.06627759337425232, + "learning_rate": 1.8831612060778728e-05, + "loss": 0.0192, + "step": 84520 + }, + { + "epoch": 0.6248336832145708, + "grad_norm": 0.07380781322717667, + "learning_rate": 1.882790242165242e-05, + "loss": 0.0183, + "step": 84530 + }, + { + "epoch": 0.6249076017858727, + "grad_norm": 0.0757727399468422, + "learning_rate": 1.8824192782526116e-05, + "loss": 0.0166, + "step": 84540 + }, + { + "epoch": 0.6249815203571746, + "grad_norm": 0.06838218867778778, + 
"learning_rate": 1.8820483143399812e-05, + "loss": 0.017, + "step": 84550 + }, + { + "epoch": 0.6250554389284764, + "grad_norm": 0.06704081594944, + "learning_rate": 1.8816773504273505e-05, + "loss": 0.0149, + "step": 84560 + }, + { + "epoch": 0.6251293574997783, + "grad_norm": 0.072157122194767, + "learning_rate": 1.8813063865147197e-05, + "loss": 0.0156, + "step": 84570 + }, + { + "epoch": 0.6252032760710801, + "grad_norm": 0.091456338763237, + "learning_rate": 1.8809354226020893e-05, + "loss": 0.0188, + "step": 84580 + }, + { + "epoch": 0.625277194642382, + "grad_norm": 0.047500334680080414, + "learning_rate": 1.880564458689459e-05, + "loss": 0.0158, + "step": 84590 + }, + { + "epoch": 0.6253511132136838, + "grad_norm": 0.07546786218881607, + "learning_rate": 1.8801934947768282e-05, + "loss": 0.0165, + "step": 84600 + }, + { + "epoch": 0.6254250317849857, + "grad_norm": 0.07692951709032059, + "learning_rate": 1.8798225308641975e-05, + "loss": 0.0175, + "step": 84610 + }, + { + "epoch": 0.6254989503562876, + "grad_norm": 0.0885094627737999, + "learning_rate": 1.879451566951567e-05, + "loss": 0.0162, + "step": 84620 + }, + { + "epoch": 0.6255728689275893, + "grad_norm": 0.0944632887840271, + "learning_rate": 1.8790806030389366e-05, + "loss": 0.0188, + "step": 84630 + }, + { + "epoch": 0.6256467874988912, + "grad_norm": 0.08889977633953094, + "learning_rate": 1.878709639126306e-05, + "loss": 0.0188, + "step": 84640 + }, + { + "epoch": 0.625720706070193, + "grad_norm": 0.07446218281984329, + "learning_rate": 1.878338675213675e-05, + "loss": 0.0199, + "step": 84650 + }, + { + "epoch": 0.6257946246414949, + "grad_norm": 0.07708046585321426, + "learning_rate": 1.8779677113010448e-05, + "loss": 0.0165, + "step": 84660 + }, + { + "epoch": 0.6258685432127968, + "grad_norm": 0.10142193734645844, + "learning_rate": 1.877596747388414e-05, + "loss": 0.0179, + "step": 84670 + }, + { + "epoch": 0.6259424617840986, + "grad_norm": 0.08232563734054565, + "learning_rate": 1.8772257834757836e-05, + "loss": 0.0172, + "step": 84680 + }, + { + "epoch": 0.6260163803554005, + "grad_norm": 0.09110886603593826, + "learning_rate": 1.876854819563153e-05, + "loss": 0.0168, + "step": 84690 + }, + { + "epoch": 0.6260902989267023, + "grad_norm": 0.0754542201757431, + "learning_rate": 1.8764838556505225e-05, + "loss": 0.0178, + "step": 84700 + }, + { + "epoch": 0.6261642174980042, + "grad_norm": 0.07504022866487503, + "learning_rate": 1.8761128917378917e-05, + "loss": 0.0182, + "step": 84710 + }, + { + "epoch": 0.626238136069306, + "grad_norm": 0.053055763244628906, + "learning_rate": 1.8757419278252613e-05, + "loss": 0.0164, + "step": 84720 + }, + { + "epoch": 0.6263120546406079, + "grad_norm": 0.10610322654247284, + "learning_rate": 1.8753709639126306e-05, + "loss": 0.0169, + "step": 84730 + }, + { + "epoch": 0.6263859732119098, + "grad_norm": 0.08564948290586472, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.0163, + "step": 84740 + }, + { + "epoch": 0.6264598917832116, + "grad_norm": 0.062062621116638184, + "learning_rate": 1.8746290360873694e-05, + "loss": 0.0181, + "step": 84750 + }, + { + "epoch": 0.6265338103545135, + "grad_norm": 0.08934297412633896, + "learning_rate": 1.8742580721747387e-05, + "loss": 0.0151, + "step": 84760 + }, + { + "epoch": 0.6266077289258153, + "grad_norm": 0.057570990175008774, + "learning_rate": 1.8738871082621083e-05, + "loss": 0.0185, + "step": 84770 + }, + { + "epoch": 0.6266816474971172, + "grad_norm": 0.07890599966049194, + "learning_rate": 1.873516144349478e-05, + "loss": 
0.0182, + "step": 84780 + }, + { + "epoch": 0.626755566068419, + "grad_norm": 0.060687899589538574, + "learning_rate": 1.873145180436847e-05, + "loss": 0.0192, + "step": 84790 + }, + { + "epoch": 0.6268294846397209, + "grad_norm": 0.07368257641792297, + "learning_rate": 1.8727742165242164e-05, + "loss": 0.0166, + "step": 84800 + }, + { + "epoch": 0.6269034032110228, + "grad_norm": 0.08597444742918015, + "learning_rate": 1.8724032526115863e-05, + "loss": 0.0184, + "step": 84810 + }, + { + "epoch": 0.6269773217823246, + "grad_norm": 0.06118566542863846, + "learning_rate": 1.8720322886989556e-05, + "loss": 0.0176, + "step": 84820 + }, + { + "epoch": 0.6270512403536265, + "grad_norm": 0.10047478973865509, + "learning_rate": 1.871661324786325e-05, + "loss": 0.0178, + "step": 84830 + }, + { + "epoch": 0.6271251589249283, + "grad_norm": 0.07671331614255905, + "learning_rate": 1.871290360873694e-05, + "loss": 0.017, + "step": 84840 + }, + { + "epoch": 0.6271990774962302, + "grad_norm": 0.07404090464115143, + "learning_rate": 1.8709193969610637e-05, + "loss": 0.018, + "step": 84850 + }, + { + "epoch": 0.627272996067532, + "grad_norm": 0.06930069625377655, + "learning_rate": 1.8705484330484333e-05, + "loss": 0.0187, + "step": 84860 + }, + { + "epoch": 0.6273469146388339, + "grad_norm": 0.08201423287391663, + "learning_rate": 1.8701774691358026e-05, + "loss": 0.0194, + "step": 84870 + }, + { + "epoch": 0.6274208332101358, + "grad_norm": 0.06555015593767166, + "learning_rate": 1.8698065052231718e-05, + "loss": 0.0186, + "step": 84880 + }, + { + "epoch": 0.6274947517814375, + "grad_norm": 0.07549230754375458, + "learning_rate": 1.8694355413105414e-05, + "loss": 0.0175, + "step": 84890 + }, + { + "epoch": 0.6275686703527394, + "grad_norm": 0.09163384139537811, + "learning_rate": 1.8690645773979107e-05, + "loss": 0.0181, + "step": 84900 + }, + { + "epoch": 0.6276425889240412, + "grad_norm": 0.08533893525600433, + "learning_rate": 1.8686936134852803e-05, + "loss": 0.0154, + "step": 84910 + }, + { + "epoch": 0.6277165074953431, + "grad_norm": 0.07470065355300903, + "learning_rate": 1.8683226495726495e-05, + "loss": 0.0154, + "step": 84920 + }, + { + "epoch": 0.627790426066645, + "grad_norm": 0.07613536715507507, + "learning_rate": 1.867951685660019e-05, + "loss": 0.0166, + "step": 84930 + }, + { + "epoch": 0.6278643446379468, + "grad_norm": 0.11712261289358139, + "learning_rate": 1.8675807217473884e-05, + "loss": 0.019, + "step": 84940 + }, + { + "epoch": 0.6279382632092487, + "grad_norm": 0.10472145676612854, + "learning_rate": 1.867209757834758e-05, + "loss": 0.0202, + "step": 84950 + }, + { + "epoch": 0.6280121817805505, + "grad_norm": 0.05405691638588905, + "learning_rate": 1.8668387939221272e-05, + "loss": 0.0165, + "step": 84960 + }, + { + "epoch": 0.6280861003518524, + "grad_norm": 0.06644877791404724, + "learning_rate": 1.866467830009497e-05, + "loss": 0.0184, + "step": 84970 + }, + { + "epoch": 0.6281600189231542, + "grad_norm": 0.06299781054258347, + "learning_rate": 1.866096866096866e-05, + "loss": 0.0167, + "step": 84980 + }, + { + "epoch": 0.6282339374944561, + "grad_norm": 0.06534602493047714, + "learning_rate": 1.8657259021842353e-05, + "loss": 0.016, + "step": 84990 + }, + { + "epoch": 0.628307856065758, + "grad_norm": 0.09402451664209366, + "learning_rate": 1.8653549382716053e-05, + "loss": 0.019, + "step": 85000 + }, + { + "epoch": 0.6283817746370598, + "grad_norm": 0.11132051050662994, + "learning_rate": 1.8649839743589745e-05, + "loss": 0.0182, + "step": 85010 + }, + { + "epoch": 
0.6284556932083617, + "grad_norm": 0.09433764219284058, + "learning_rate": 1.8646130104463438e-05, + "loss": 0.0187, + "step": 85020 + }, + { + "epoch": 0.6285296117796635, + "grad_norm": 0.0806804820895195, + "learning_rate": 1.864242046533713e-05, + "loss": 0.0201, + "step": 85030 + }, + { + "epoch": 0.6286035303509654, + "grad_norm": 0.06340008974075317, + "learning_rate": 1.863871082621083e-05, + "loss": 0.0145, + "step": 85040 + }, + { + "epoch": 0.6286774489222672, + "grad_norm": 0.07197962701320648, + "learning_rate": 1.8635001187084523e-05, + "loss": 0.0148, + "step": 85050 + }, + { + "epoch": 0.6287513674935691, + "grad_norm": 0.07551277428865433, + "learning_rate": 1.8631291547958215e-05, + "loss": 0.0167, + "step": 85060 + }, + { + "epoch": 0.628825286064871, + "grad_norm": 0.06895064562559128, + "learning_rate": 1.8627581908831908e-05, + "loss": 0.0154, + "step": 85070 + }, + { + "epoch": 0.6288992046361728, + "grad_norm": 0.07740975171327591, + "learning_rate": 1.8623872269705604e-05, + "loss": 0.0175, + "step": 85080 + }, + { + "epoch": 0.6289731232074747, + "grad_norm": 0.07699029892683029, + "learning_rate": 1.86201626305793e-05, + "loss": 0.017, + "step": 85090 + }, + { + "epoch": 0.6290470417787765, + "grad_norm": 0.07050425559282303, + "learning_rate": 1.8616452991452992e-05, + "loss": 0.0204, + "step": 85100 + }, + { + "epoch": 0.6291209603500784, + "grad_norm": 0.09506210684776306, + "learning_rate": 1.8612743352326685e-05, + "loss": 0.0194, + "step": 85110 + }, + { + "epoch": 0.6291948789213803, + "grad_norm": 0.06495136767625809, + "learning_rate": 1.860903371320038e-05, + "loss": 0.0184, + "step": 85120 + }, + { + "epoch": 0.629268797492682, + "grad_norm": 0.08276129513978958, + "learning_rate": 1.8605324074074073e-05, + "loss": 0.0167, + "step": 85130 + }, + { + "epoch": 0.629342716063984, + "grad_norm": 0.1007828414440155, + "learning_rate": 1.860161443494777e-05, + "loss": 0.0164, + "step": 85140 + }, + { + "epoch": 0.6294166346352857, + "grad_norm": 0.10978119820356369, + "learning_rate": 1.8597904795821465e-05, + "loss": 0.0192, + "step": 85150 + }, + { + "epoch": 0.6294905532065876, + "grad_norm": 0.06298724561929703, + "learning_rate": 1.8594195156695158e-05, + "loss": 0.0156, + "step": 85160 + }, + { + "epoch": 0.6295644717778894, + "grad_norm": 0.08732854574918747, + "learning_rate": 1.859048551756885e-05, + "loss": 0.0163, + "step": 85170 + }, + { + "epoch": 0.6296383903491913, + "grad_norm": 0.07033935934305191, + "learning_rate": 1.8586775878442546e-05, + "loss": 0.0162, + "step": 85180 + }, + { + "epoch": 0.6297123089204932, + "grad_norm": 0.08429215103387833, + "learning_rate": 1.8583066239316242e-05, + "loss": 0.0168, + "step": 85190 + }, + { + "epoch": 0.629786227491795, + "grad_norm": 0.08191212266683578, + "learning_rate": 1.8579356600189935e-05, + "loss": 0.0182, + "step": 85200 + }, + { + "epoch": 0.6298601460630969, + "grad_norm": 0.07270575314760208, + "learning_rate": 1.8575646961063627e-05, + "loss": 0.016, + "step": 85210 + }, + { + "epoch": 0.6299340646343987, + "grad_norm": 0.09649472683668137, + "learning_rate": 1.857193732193732e-05, + "loss": 0.0185, + "step": 85220 + }, + { + "epoch": 0.6300079832057006, + "grad_norm": 0.06528923660516739, + "learning_rate": 1.856822768281102e-05, + "loss": 0.0189, + "step": 85230 + }, + { + "epoch": 0.6300819017770024, + "grad_norm": 0.05460357666015625, + "learning_rate": 1.8564518043684712e-05, + "loss": 0.0165, + "step": 85240 + }, + { + "epoch": 0.6301558203483043, + "grad_norm": 
0.07081805169582367, + "learning_rate": 1.8560808404558405e-05, + "loss": 0.0174, + "step": 85250 + }, + { + "epoch": 0.6302297389196062, + "grad_norm": 0.0982808768749237, + "learning_rate": 1.8557098765432097e-05, + "loss": 0.0172, + "step": 85260 + }, + { + "epoch": 0.630303657490908, + "grad_norm": 0.11132100224494934, + "learning_rate": 1.8553389126305797e-05, + "loss": 0.018, + "step": 85270 + }, + { + "epoch": 0.6303775760622099, + "grad_norm": 0.0746561735868454, + "learning_rate": 1.854967948717949e-05, + "loss": 0.0169, + "step": 85280 + }, + { + "epoch": 0.6304514946335117, + "grad_norm": 0.09312082082033157, + "learning_rate": 1.854596984805318e-05, + "loss": 0.018, + "step": 85290 + }, + { + "epoch": 0.6305254132048136, + "grad_norm": 0.0683741420507431, + "learning_rate": 1.8542260208926878e-05, + "loss": 0.0159, + "step": 85300 + }, + { + "epoch": 0.6305993317761154, + "grad_norm": 0.09346149861812592, + "learning_rate": 1.853855056980057e-05, + "loss": 0.0169, + "step": 85310 + }, + { + "epoch": 0.6306732503474173, + "grad_norm": 0.07769722491502762, + "learning_rate": 1.8534840930674266e-05, + "loss": 0.0177, + "step": 85320 + }, + { + "epoch": 0.6307471689187192, + "grad_norm": 0.08260294049978256, + "learning_rate": 1.853113129154796e-05, + "loss": 0.0175, + "step": 85330 + }, + { + "epoch": 0.630821087490021, + "grad_norm": 0.08505997806787491, + "learning_rate": 1.8527421652421655e-05, + "loss": 0.0173, + "step": 85340 + }, + { + "epoch": 0.6308950060613229, + "grad_norm": 0.08676422387361526, + "learning_rate": 1.8523712013295347e-05, + "loss": 0.0191, + "step": 85350 + }, + { + "epoch": 0.6309689246326247, + "grad_norm": 0.09579332917928696, + "learning_rate": 1.852000237416904e-05, + "loss": 0.019, + "step": 85360 + }, + { + "epoch": 0.6310428432039266, + "grad_norm": 0.07286658883094788, + "learning_rate": 1.8516292735042736e-05, + "loss": 0.0182, + "step": 85370 + }, + { + "epoch": 0.6311167617752285, + "grad_norm": 0.08508472889661789, + "learning_rate": 1.8512583095916432e-05, + "loss": 0.0175, + "step": 85380 + }, + { + "epoch": 0.6311906803465303, + "grad_norm": 0.06830597668886185, + "learning_rate": 1.8508873456790124e-05, + "loss": 0.0162, + "step": 85390 + }, + { + "epoch": 0.6312645989178322, + "grad_norm": 0.07824182510375977, + "learning_rate": 1.8505163817663817e-05, + "loss": 0.0173, + "step": 85400 + }, + { + "epoch": 0.6313385174891339, + "grad_norm": 0.08850301057100296, + "learning_rate": 1.8501454178537513e-05, + "loss": 0.0159, + "step": 85410 + }, + { + "epoch": 0.6314124360604358, + "grad_norm": 0.07097941637039185, + "learning_rate": 1.849774453941121e-05, + "loss": 0.0197, + "step": 85420 + }, + { + "epoch": 0.6314863546317376, + "grad_norm": 0.06867203861474991, + "learning_rate": 1.84940349002849e-05, + "loss": 0.0186, + "step": 85430 + }, + { + "epoch": 0.6315602732030395, + "grad_norm": 0.06668906658887863, + "learning_rate": 1.8490325261158594e-05, + "loss": 0.0167, + "step": 85440 + }, + { + "epoch": 0.6316341917743414, + "grad_norm": 0.0815042108297348, + "learning_rate": 1.848661562203229e-05, + "loss": 0.0165, + "step": 85450 + }, + { + "epoch": 0.6317081103456432, + "grad_norm": 0.07367333024740219, + "learning_rate": 1.8482905982905986e-05, + "loss": 0.0193, + "step": 85460 + }, + { + "epoch": 0.6317820289169451, + "grad_norm": 0.05753447115421295, + "learning_rate": 1.847919634377968e-05, + "loss": 0.0156, + "step": 85470 + }, + { + "epoch": 0.6318559474882469, + "grad_norm": 0.08272279798984528, + "learning_rate": 
1.847548670465337e-05, + "loss": 0.0187, + "step": 85480 + }, + { + "epoch": 0.6319298660595488, + "grad_norm": 0.09389559179544449, + "learning_rate": 1.8471777065527067e-05, + "loss": 0.0175, + "step": 85490 + }, + { + "epoch": 0.6320037846308506, + "grad_norm": 0.09469499439001083, + "learning_rate": 1.8468067426400763e-05, + "loss": 0.0184, + "step": 85500 + }, + { + "epoch": 0.6320777032021525, + "grad_norm": 0.07357271015644073, + "learning_rate": 1.8464357787274456e-05, + "loss": 0.0162, + "step": 85510 + }, + { + "epoch": 0.6321516217734544, + "grad_norm": 0.08749333024024963, + "learning_rate": 1.8460648148148148e-05, + "loss": 0.0192, + "step": 85520 + }, + { + "epoch": 0.6322255403447562, + "grad_norm": 0.05760166421532631, + "learning_rate": 1.8456938509021844e-05, + "loss": 0.0146, + "step": 85530 + }, + { + "epoch": 0.6322994589160581, + "grad_norm": 0.0925261601805687, + "learning_rate": 1.8453228869895537e-05, + "loss": 0.0186, + "step": 85540 + }, + { + "epoch": 0.6323733774873599, + "grad_norm": 0.08290409296751022, + "learning_rate": 1.8449519230769233e-05, + "loss": 0.0163, + "step": 85550 + }, + { + "epoch": 0.6324472960586618, + "grad_norm": 0.08422588557004929, + "learning_rate": 1.8445809591642925e-05, + "loss": 0.0196, + "step": 85560 + }, + { + "epoch": 0.6325212146299636, + "grad_norm": 0.08946088701486588, + "learning_rate": 1.844209995251662e-05, + "loss": 0.0159, + "step": 85570 + }, + { + "epoch": 0.6325951332012655, + "grad_norm": 0.11140304058790207, + "learning_rate": 1.8438390313390314e-05, + "loss": 0.0181, + "step": 85580 + }, + { + "epoch": 0.6326690517725674, + "grad_norm": 0.09750638157129288, + "learning_rate": 1.8434680674264006e-05, + "loss": 0.0178, + "step": 85590 + }, + { + "epoch": 0.6327429703438692, + "grad_norm": 0.061476483941078186, + "learning_rate": 1.8430971035137702e-05, + "loss": 0.0188, + "step": 85600 + }, + { + "epoch": 0.6328168889151711, + "grad_norm": 0.08562640100717545, + "learning_rate": 1.84272613960114e-05, + "loss": 0.0182, + "step": 85610 + }, + { + "epoch": 0.6328908074864729, + "grad_norm": 0.06430555135011673, + "learning_rate": 1.842355175688509e-05, + "loss": 0.0162, + "step": 85620 + }, + { + "epoch": 0.6329647260577748, + "grad_norm": 0.06210716813802719, + "learning_rate": 1.8419842117758784e-05, + "loss": 0.0161, + "step": 85630 + }, + { + "epoch": 0.6330386446290767, + "grad_norm": 0.06714742630720139, + "learning_rate": 1.841613247863248e-05, + "loss": 0.0186, + "step": 85640 + }, + { + "epoch": 0.6331125632003785, + "grad_norm": 0.05862513184547424, + "learning_rate": 1.8412422839506175e-05, + "loss": 0.0151, + "step": 85650 + }, + { + "epoch": 0.6331864817716804, + "grad_norm": 0.08143387734889984, + "learning_rate": 1.8408713200379868e-05, + "loss": 0.0188, + "step": 85660 + }, + { + "epoch": 0.6332604003429821, + "grad_norm": 0.06120765581727028, + "learning_rate": 1.840500356125356e-05, + "loss": 0.021, + "step": 85670 + }, + { + "epoch": 0.633334318914284, + "grad_norm": 0.07627062499523163, + "learning_rate": 1.8401293922127257e-05, + "loss": 0.0156, + "step": 85680 + }, + { + "epoch": 0.6334082374855858, + "grad_norm": 0.0795658677816391, + "learning_rate": 1.8397584283000953e-05, + "loss": 0.0184, + "step": 85690 + }, + { + "epoch": 0.6334821560568877, + "grad_norm": 0.09216859191656113, + "learning_rate": 1.8393874643874645e-05, + "loss": 0.019, + "step": 85700 + }, + { + "epoch": 0.6335560746281896, + "grad_norm": 0.07667369395494461, + "learning_rate": 1.8390165004748338e-05, + "loss": 0.0187, + 
"step": 85710 + }, + { + "epoch": 0.6336299931994914, + "grad_norm": 0.09150813519954681, + "learning_rate": 1.8386455365622034e-05, + "loss": 0.0202, + "step": 85720 + }, + { + "epoch": 0.6337039117707933, + "grad_norm": 0.057827726006507874, + "learning_rate": 1.838274572649573e-05, + "loss": 0.0166, + "step": 85730 + }, + { + "epoch": 0.6337778303420951, + "grad_norm": 0.07610096037387848, + "learning_rate": 1.8379036087369422e-05, + "loss": 0.0172, + "step": 85740 + }, + { + "epoch": 0.633851748913397, + "grad_norm": 0.06838104873895645, + "learning_rate": 1.8375326448243115e-05, + "loss": 0.0189, + "step": 85750 + }, + { + "epoch": 0.6339256674846988, + "grad_norm": 0.0750965029001236, + "learning_rate": 1.837161680911681e-05, + "loss": 0.0194, + "step": 85760 + }, + { + "epoch": 0.6339995860560007, + "grad_norm": 0.07158775627613068, + "learning_rate": 1.8367907169990503e-05, + "loss": 0.0186, + "step": 85770 + }, + { + "epoch": 0.6340735046273026, + "grad_norm": 0.07273725420236588, + "learning_rate": 1.83641975308642e-05, + "loss": 0.0165, + "step": 85780 + }, + { + "epoch": 0.6341474231986044, + "grad_norm": 0.0997527688741684, + "learning_rate": 1.8360487891737892e-05, + "loss": 0.0177, + "step": 85790 + }, + { + "epoch": 0.6342213417699063, + "grad_norm": 0.08006743341684341, + "learning_rate": 1.8356778252611588e-05, + "loss": 0.0181, + "step": 85800 + }, + { + "epoch": 0.6342952603412081, + "grad_norm": 0.06414107233285904, + "learning_rate": 1.835306861348528e-05, + "loss": 0.0174, + "step": 85810 + }, + { + "epoch": 0.63436917891251, + "grad_norm": 0.0821545273065567, + "learning_rate": 1.8349358974358973e-05, + "loss": 0.0162, + "step": 85820 + }, + { + "epoch": 0.6344430974838118, + "grad_norm": 0.08555491268634796, + "learning_rate": 1.834564933523267e-05, + "loss": 0.0177, + "step": 85830 + }, + { + "epoch": 0.6345170160551137, + "grad_norm": 0.08455513417720795, + "learning_rate": 1.8341939696106365e-05, + "loss": 0.0184, + "step": 85840 + }, + { + "epoch": 0.6345909346264156, + "grad_norm": 0.08851484954357147, + "learning_rate": 1.8338230056980058e-05, + "loss": 0.0178, + "step": 85850 + }, + { + "epoch": 0.6346648531977174, + "grad_norm": 0.06817267835140228, + "learning_rate": 1.833452041785375e-05, + "loss": 0.0146, + "step": 85860 + }, + { + "epoch": 0.6347387717690193, + "grad_norm": 0.11727976053953171, + "learning_rate": 1.8330810778727446e-05, + "loss": 0.0169, + "step": 85870 + }, + { + "epoch": 0.6348126903403211, + "grad_norm": 0.11674083024263382, + "learning_rate": 1.8327101139601142e-05, + "loss": 0.0171, + "step": 85880 + }, + { + "epoch": 0.634886608911623, + "grad_norm": 0.09169033169746399, + "learning_rate": 1.8323391500474835e-05, + "loss": 0.0173, + "step": 85890 + }, + { + "epoch": 0.6349605274829249, + "grad_norm": 0.09100646525621414, + "learning_rate": 1.8319681861348527e-05, + "loss": 0.0202, + "step": 85900 + }, + { + "epoch": 0.6350344460542267, + "grad_norm": 0.09275765717029572, + "learning_rate": 1.8315972222222223e-05, + "loss": 0.0176, + "step": 85910 + }, + { + "epoch": 0.6351083646255286, + "grad_norm": 0.06938066333532333, + "learning_rate": 1.831226258309592e-05, + "loss": 0.0177, + "step": 85920 + }, + { + "epoch": 0.6351822831968303, + "grad_norm": 0.0831063911318779, + "learning_rate": 1.8308552943969612e-05, + "loss": 0.0179, + "step": 85930 + }, + { + "epoch": 0.6352562017681322, + "grad_norm": 0.06474802643060684, + "learning_rate": 1.8304843304843304e-05, + "loss": 0.0184, + "step": 85940 + }, + { + "epoch": 
0.635330120339434, + "grad_norm": 0.05899034067988396, + "learning_rate": 1.8301133665717e-05, + "loss": 0.0167, + "step": 85950 + }, + { + "epoch": 0.6354040389107359, + "grad_norm": 0.0838722288608551, + "learning_rate": 1.8297424026590696e-05, + "loss": 0.0194, + "step": 85960 + }, + { + "epoch": 0.6354779574820378, + "grad_norm": 0.09603888541460037, + "learning_rate": 1.829371438746439e-05, + "loss": 0.0193, + "step": 85970 + }, + { + "epoch": 0.6355518760533396, + "grad_norm": 0.07909788191318512, + "learning_rate": 1.829000474833808e-05, + "loss": 0.0138, + "step": 85980 + }, + { + "epoch": 0.6356257946246415, + "grad_norm": 0.08713407814502716, + "learning_rate": 1.8286295109211777e-05, + "loss": 0.0154, + "step": 85990 + }, + { + "epoch": 0.6356997131959433, + "grad_norm": 0.06677623838186264, + "learning_rate": 1.828258547008547e-05, + "loss": 0.0186, + "step": 86000 + }, + { + "epoch": 0.6357736317672452, + "grad_norm": 0.10572227835655212, + "learning_rate": 1.8278875830959166e-05, + "loss": 0.0179, + "step": 86010 + }, + { + "epoch": 0.635847550338547, + "grad_norm": 0.08408637344837189, + "learning_rate": 1.827516619183286e-05, + "loss": 0.019, + "step": 86020 + }, + { + "epoch": 0.6359214689098489, + "grad_norm": 0.05936083570122719, + "learning_rate": 1.8271456552706554e-05, + "loss": 0.0157, + "step": 86030 + }, + { + "epoch": 0.6359953874811508, + "grad_norm": 0.086244136095047, + "learning_rate": 1.8267746913580247e-05, + "loss": 0.0167, + "step": 86040 + }, + { + "epoch": 0.6360693060524526, + "grad_norm": 0.08920861035585403, + "learning_rate": 1.826403727445394e-05, + "loss": 0.0185, + "step": 86050 + }, + { + "epoch": 0.6361432246237545, + "grad_norm": 0.06074165552854538, + "learning_rate": 1.8260327635327636e-05, + "loss": 0.0173, + "step": 86060 + }, + { + "epoch": 0.6362171431950563, + "grad_norm": 0.07496129721403122, + "learning_rate": 1.825661799620133e-05, + "loss": 0.0182, + "step": 86070 + }, + { + "epoch": 0.6362910617663582, + "grad_norm": 0.08098746836185455, + "learning_rate": 1.8252908357075024e-05, + "loss": 0.0195, + "step": 86080 + }, + { + "epoch": 0.63636498033766, + "grad_norm": 0.06507303565740585, + "learning_rate": 1.8249198717948717e-05, + "loss": 0.0183, + "step": 86090 + }, + { + "epoch": 0.6364388989089619, + "grad_norm": 0.0749412328004837, + "learning_rate": 1.8245489078822413e-05, + "loss": 0.0191, + "step": 86100 + }, + { + "epoch": 0.6365128174802638, + "grad_norm": 0.0722000002861023, + "learning_rate": 1.824177943969611e-05, + "loss": 0.0181, + "step": 86110 + }, + { + "epoch": 0.6365867360515656, + "grad_norm": 0.05548422038555145, + "learning_rate": 1.82380698005698e-05, + "loss": 0.0152, + "step": 86120 + }, + { + "epoch": 0.6366606546228675, + "grad_norm": 0.11477868258953094, + "learning_rate": 1.8234360161443494e-05, + "loss": 0.0163, + "step": 86130 + }, + { + "epoch": 0.6367345731941693, + "grad_norm": 0.09127828478813171, + "learning_rate": 1.823065052231719e-05, + "loss": 0.0168, + "step": 86140 + }, + { + "epoch": 0.6368084917654712, + "grad_norm": 0.08679413050413132, + "learning_rate": 1.8226940883190886e-05, + "loss": 0.0177, + "step": 86150 + }, + { + "epoch": 0.6368824103367731, + "grad_norm": 0.07122139632701874, + "learning_rate": 1.822323124406458e-05, + "loss": 0.0172, + "step": 86160 + }, + { + "epoch": 0.6369563289080749, + "grad_norm": 0.11077108979225159, + "learning_rate": 1.821952160493827e-05, + "loss": 0.0192, + "step": 86170 + }, + { + "epoch": 0.6370302474793768, + "grad_norm": 0.07095395028591156, + 
"learning_rate": 1.8215811965811967e-05, + "loss": 0.0199, + "step": 86180 + }, + { + "epoch": 0.6371041660506785, + "grad_norm": 0.0770963728427887, + "learning_rate": 1.8212102326685663e-05, + "loss": 0.0155, + "step": 86190 + }, + { + "epoch": 0.6371780846219804, + "grad_norm": 0.09591083973646164, + "learning_rate": 1.8208392687559355e-05, + "loss": 0.0176, + "step": 86200 + }, + { + "epoch": 0.6372520031932822, + "grad_norm": 0.07716168463230133, + "learning_rate": 1.8204683048433048e-05, + "loss": 0.0184, + "step": 86210 + }, + { + "epoch": 0.6373259217645841, + "grad_norm": 0.08891519159078598, + "learning_rate": 1.8200973409306744e-05, + "loss": 0.0155, + "step": 86220 + }, + { + "epoch": 0.637399840335886, + "grad_norm": 0.08693523705005646, + "learning_rate": 1.8197263770180437e-05, + "loss": 0.0176, + "step": 86230 + }, + { + "epoch": 0.6374737589071878, + "grad_norm": 0.07175737619400024, + "learning_rate": 1.8193554131054133e-05, + "loss": 0.0186, + "step": 86240 + }, + { + "epoch": 0.6375476774784897, + "grad_norm": 0.07700374722480774, + "learning_rate": 1.8189844491927825e-05, + "loss": 0.0181, + "step": 86250 + }, + { + "epoch": 0.6376215960497915, + "grad_norm": 0.0934210941195488, + "learning_rate": 1.818613485280152e-05, + "loss": 0.0193, + "step": 86260 + }, + { + "epoch": 0.6376955146210934, + "grad_norm": 0.08946838229894638, + "learning_rate": 1.8182425213675214e-05, + "loss": 0.019, + "step": 86270 + }, + { + "epoch": 0.6377694331923952, + "grad_norm": 0.0774223729968071, + "learning_rate": 1.8178715574548906e-05, + "loss": 0.0204, + "step": 86280 + }, + { + "epoch": 0.6378433517636971, + "grad_norm": 0.06967286020517349, + "learning_rate": 1.8175005935422602e-05, + "loss": 0.0182, + "step": 86290 + }, + { + "epoch": 0.637917270334999, + "grad_norm": 0.06664400547742844, + "learning_rate": 1.8171296296296298e-05, + "loss": 0.0167, + "step": 86300 + }, + { + "epoch": 0.6379911889063008, + "grad_norm": 0.1289106160402298, + "learning_rate": 1.816758665716999e-05, + "loss": 0.0183, + "step": 86310 + }, + { + "epoch": 0.6380651074776027, + "grad_norm": 0.09550661593675613, + "learning_rate": 1.8163877018043683e-05, + "loss": 0.0163, + "step": 86320 + }, + { + "epoch": 0.6381390260489045, + "grad_norm": 0.06980642676353455, + "learning_rate": 1.816016737891738e-05, + "loss": 0.0174, + "step": 86330 + }, + { + "epoch": 0.6382129446202064, + "grad_norm": 0.07693205773830414, + "learning_rate": 1.8156457739791075e-05, + "loss": 0.0193, + "step": 86340 + }, + { + "epoch": 0.6382868631915082, + "grad_norm": 0.07100095599889755, + "learning_rate": 1.8152748100664768e-05, + "loss": 0.0168, + "step": 86350 + }, + { + "epoch": 0.6383607817628101, + "grad_norm": 0.07612592726945877, + "learning_rate": 1.814903846153846e-05, + "loss": 0.017, + "step": 86360 + }, + { + "epoch": 0.638434700334112, + "grad_norm": 0.08937795460224152, + "learning_rate": 1.8145328822412156e-05, + "loss": 0.0175, + "step": 86370 + }, + { + "epoch": 0.6385086189054138, + "grad_norm": 0.09523028135299683, + "learning_rate": 1.8141619183285852e-05, + "loss": 0.021, + "step": 86380 + }, + { + "epoch": 0.6385825374767157, + "grad_norm": 0.07586309313774109, + "learning_rate": 1.8137909544159545e-05, + "loss": 0.0167, + "step": 86390 + }, + { + "epoch": 0.6386564560480175, + "grad_norm": 0.08415968716144562, + "learning_rate": 1.8134199905033237e-05, + "loss": 0.0179, + "step": 86400 + }, + { + "epoch": 0.6387303746193194, + "grad_norm": 0.07729531824588776, + "learning_rate": 1.8130490265906933e-05, + 
"loss": 0.0175, + "step": 86410 + }, + { + "epoch": 0.6388042931906213, + "grad_norm": 0.07085543125867844, + "learning_rate": 1.812678062678063e-05, + "loss": 0.0182, + "step": 86420 + }, + { + "epoch": 0.638878211761923, + "grad_norm": 0.0866665244102478, + "learning_rate": 1.8123070987654322e-05, + "loss": 0.0184, + "step": 86430 + }, + { + "epoch": 0.638952130333225, + "grad_norm": 0.10179586708545685, + "learning_rate": 1.8119361348528015e-05, + "loss": 0.016, + "step": 86440 + }, + { + "epoch": 0.6390260489045267, + "grad_norm": 0.0781673863530159, + "learning_rate": 1.811565170940171e-05, + "loss": 0.0164, + "step": 86450 + }, + { + "epoch": 0.6390999674758286, + "grad_norm": 0.0726914331316948, + "learning_rate": 1.8111942070275403e-05, + "loss": 0.0171, + "step": 86460 + }, + { + "epoch": 0.6391738860471304, + "grad_norm": 0.06358665972948074, + "learning_rate": 1.81082324311491e-05, + "loss": 0.0164, + "step": 86470 + }, + { + "epoch": 0.6392478046184323, + "grad_norm": 0.06945040822029114, + "learning_rate": 1.810452279202279e-05, + "loss": 0.0159, + "step": 86480 + }, + { + "epoch": 0.6393217231897342, + "grad_norm": 0.06123916804790497, + "learning_rate": 1.8100813152896488e-05, + "loss": 0.0178, + "step": 86490 + }, + { + "epoch": 0.639395641761036, + "grad_norm": 0.08601492643356323, + "learning_rate": 1.809710351377018e-05, + "loss": 0.0183, + "step": 86500 + }, + { + "epoch": 0.6394695603323379, + "grad_norm": 0.0838085189461708, + "learning_rate": 1.8093393874643873e-05, + "loss": 0.0179, + "step": 86510 + }, + { + "epoch": 0.6395434789036397, + "grad_norm": 0.05589223653078079, + "learning_rate": 1.8089684235517572e-05, + "loss": 0.0164, + "step": 86520 + }, + { + "epoch": 0.6396173974749416, + "grad_norm": 0.08448652178049088, + "learning_rate": 1.8085974596391265e-05, + "loss": 0.0182, + "step": 86530 + }, + { + "epoch": 0.6396913160462434, + "grad_norm": 0.09148748219013214, + "learning_rate": 1.8082264957264957e-05, + "loss": 0.0176, + "step": 86540 + }, + { + "epoch": 0.6397652346175453, + "grad_norm": 0.057749610394239426, + "learning_rate": 1.807855531813865e-05, + "loss": 0.0175, + "step": 86550 + }, + { + "epoch": 0.6398391531888472, + "grad_norm": 0.08635307103395462, + "learning_rate": 1.807484567901235e-05, + "loss": 0.0195, + "step": 86560 + }, + { + "epoch": 0.639913071760149, + "grad_norm": 0.09366890043020248, + "learning_rate": 1.8071136039886042e-05, + "loss": 0.0183, + "step": 86570 + }, + { + "epoch": 0.6399869903314509, + "grad_norm": 0.0522395595908165, + "learning_rate": 1.8067426400759734e-05, + "loss": 0.0181, + "step": 86580 + }, + { + "epoch": 0.6400609089027527, + "grad_norm": 0.07926511019468307, + "learning_rate": 1.8063716761633427e-05, + "loss": 0.0169, + "step": 86590 + }, + { + "epoch": 0.6401348274740546, + "grad_norm": 0.08467059582471848, + "learning_rate": 1.8060007122507123e-05, + "loss": 0.0206, + "step": 86600 + }, + { + "epoch": 0.6402087460453564, + "grad_norm": 0.07435682415962219, + "learning_rate": 1.805629748338082e-05, + "loss": 0.0183, + "step": 86610 + }, + { + "epoch": 0.6402826646166583, + "grad_norm": 0.11792890727519989, + "learning_rate": 1.805258784425451e-05, + "loss": 0.0192, + "step": 86620 + }, + { + "epoch": 0.6403565831879602, + "grad_norm": 0.08234578371047974, + "learning_rate": 1.8048878205128204e-05, + "loss": 0.0174, + "step": 86630 + }, + { + "epoch": 0.640430501759262, + "grad_norm": 0.0733381137251854, + "learning_rate": 1.80451685660019e-05, + "loss": 0.0182, + "step": 86640 + }, + { + "epoch": 
0.6405044203305639, + "grad_norm": 0.10058154910802841, + "learning_rate": 1.8041458926875596e-05, + "loss": 0.0176, + "step": 86650 + }, + { + "epoch": 0.6405783389018657, + "grad_norm": 0.08493416011333466, + "learning_rate": 1.803774928774929e-05, + "loss": 0.0188, + "step": 86660 + }, + { + "epoch": 0.6406522574731676, + "grad_norm": 0.07821780443191528, + "learning_rate": 1.8034039648622985e-05, + "loss": 0.0211, + "step": 86670 + }, + { + "epoch": 0.6407261760444695, + "grad_norm": 0.09611696749925613, + "learning_rate": 1.8030330009496677e-05, + "loss": 0.0178, + "step": 86680 + }, + { + "epoch": 0.6408000946157713, + "grad_norm": 0.0972839891910553, + "learning_rate": 1.802662037037037e-05, + "loss": 0.0165, + "step": 86690 + }, + { + "epoch": 0.6408740131870732, + "grad_norm": 0.09013646095991135, + "learning_rate": 1.8022910731244066e-05, + "loss": 0.017, + "step": 86700 + }, + { + "epoch": 0.640947931758375, + "grad_norm": 0.07640646398067474, + "learning_rate": 1.801920109211776e-05, + "loss": 0.0192, + "step": 86710 + }, + { + "epoch": 0.6410218503296768, + "grad_norm": 0.08092772960662842, + "learning_rate": 1.8015491452991454e-05, + "loss": 0.0177, + "step": 86720 + }, + { + "epoch": 0.6410957689009786, + "grad_norm": 0.0772366002202034, + "learning_rate": 1.8011781813865147e-05, + "loss": 0.0151, + "step": 86730 + }, + { + "epoch": 0.6411696874722805, + "grad_norm": 0.09616388380527496, + "learning_rate": 1.800807217473884e-05, + "loss": 0.018, + "step": 86740 + }, + { + "epoch": 0.6412436060435824, + "grad_norm": 0.06029176712036133, + "learning_rate": 1.800436253561254e-05, + "loss": 0.0172, + "step": 86750 + }, + { + "epoch": 0.6413175246148842, + "grad_norm": 0.09864173829555511, + "learning_rate": 1.800065289648623e-05, + "loss": 0.018, + "step": 86760 + }, + { + "epoch": 0.6413914431861861, + "grad_norm": 0.06823432445526123, + "learning_rate": 1.7996943257359924e-05, + "loss": 0.0176, + "step": 86770 + }, + { + "epoch": 0.6414653617574879, + "grad_norm": 0.09102225303649902, + "learning_rate": 1.7993233618233616e-05, + "loss": 0.0172, + "step": 86780 + }, + { + "epoch": 0.6415392803287898, + "grad_norm": 0.07310714572668076, + "learning_rate": 1.7989523979107316e-05, + "loss": 0.0166, + "step": 86790 + }, + { + "epoch": 0.6416131989000916, + "grad_norm": 0.08708073198795319, + "learning_rate": 1.798581433998101e-05, + "loss": 0.0153, + "step": 86800 + }, + { + "epoch": 0.6416871174713935, + "grad_norm": 0.06822753697633743, + "learning_rate": 1.79821047008547e-05, + "loss": 0.0164, + "step": 86810 + }, + { + "epoch": 0.6417610360426954, + "grad_norm": 0.07988683879375458, + "learning_rate": 1.7978395061728397e-05, + "loss": 0.0164, + "step": 86820 + }, + { + "epoch": 0.6418349546139972, + "grad_norm": 0.10230796784162521, + "learning_rate": 1.797468542260209e-05, + "loss": 0.0174, + "step": 86830 + }, + { + "epoch": 0.6419088731852991, + "grad_norm": 0.07685470581054688, + "learning_rate": 1.7970975783475786e-05, + "loss": 0.0179, + "step": 86840 + }, + { + "epoch": 0.6419827917566009, + "grad_norm": 0.07872067391872406, + "learning_rate": 1.7967266144349478e-05, + "loss": 0.0182, + "step": 86850 + }, + { + "epoch": 0.6420567103279028, + "grad_norm": 0.07615887373685837, + "learning_rate": 1.7963556505223174e-05, + "loss": 0.0186, + "step": 86860 + }, + { + "epoch": 0.6421306288992046, + "grad_norm": 0.07310209423303604, + "learning_rate": 1.7959846866096867e-05, + "loss": 0.0178, + "step": 86870 + }, + { + "epoch": 0.6422045474705065, + "grad_norm": 
0.07288283854722977, + "learning_rate": 1.7956137226970563e-05, + "loss": 0.019, + "step": 86880 + }, + { + "epoch": 0.6422784660418084, + "grad_norm": 0.07359371334314346, + "learning_rate": 1.7952427587844255e-05, + "loss": 0.0158, + "step": 86890 + }, + { + "epoch": 0.6423523846131102, + "grad_norm": 0.08522066473960876, + "learning_rate": 1.794871794871795e-05, + "loss": 0.0184, + "step": 86900 + }, + { + "epoch": 0.6424263031844121, + "grad_norm": 0.08251197636127472, + "learning_rate": 1.7945008309591644e-05, + "loss": 0.0169, + "step": 86910 + }, + { + "epoch": 0.6425002217557139, + "grad_norm": 0.08390085399150848, + "learning_rate": 1.7941298670465336e-05, + "loss": 0.0164, + "step": 86920 + }, + { + "epoch": 0.6425741403270158, + "grad_norm": 0.11742323637008667, + "learning_rate": 1.7937589031339032e-05, + "loss": 0.0185, + "step": 86930 + }, + { + "epoch": 0.6426480588983177, + "grad_norm": 0.08146241307258606, + "learning_rate": 1.7933879392212728e-05, + "loss": 0.0194, + "step": 86940 + }, + { + "epoch": 0.6427219774696195, + "grad_norm": 0.07195167243480682, + "learning_rate": 1.793016975308642e-05, + "loss": 0.0186, + "step": 86950 + }, + { + "epoch": 0.6427958960409214, + "grad_norm": 0.056414175778627396, + "learning_rate": 1.7926460113960113e-05, + "loss": 0.0154, + "step": 86960 + }, + { + "epoch": 0.6428698146122231, + "grad_norm": 0.0632779523730278, + "learning_rate": 1.7922750474833806e-05, + "loss": 0.0162, + "step": 86970 + }, + { + "epoch": 0.642943733183525, + "grad_norm": 0.0912681519985199, + "learning_rate": 1.7919040835707505e-05, + "loss": 0.018, + "step": 86980 + }, + { + "epoch": 0.6430176517548268, + "grad_norm": 0.06888893991708755, + "learning_rate": 1.7915331196581198e-05, + "loss": 0.0171, + "step": 86990 + }, + { + "epoch": 0.6430915703261287, + "grad_norm": 0.0987488254904747, + "learning_rate": 1.791162155745489e-05, + "loss": 0.018, + "step": 87000 + }, + { + "epoch": 0.6431654888974306, + "grad_norm": 0.08291744440793991, + "learning_rate": 1.7907911918328586e-05, + "loss": 0.0179, + "step": 87010 + }, + { + "epoch": 0.6432394074687324, + "grad_norm": 0.12280869483947754, + "learning_rate": 1.7904202279202282e-05, + "loss": 0.0174, + "step": 87020 + }, + { + "epoch": 0.6433133260400343, + "grad_norm": 0.09106610715389252, + "learning_rate": 1.7900492640075975e-05, + "loss": 0.0163, + "step": 87030 + }, + { + "epoch": 0.6433872446113361, + "grad_norm": 0.051806751638650894, + "learning_rate": 1.7896783000949668e-05, + "loss": 0.0151, + "step": 87040 + }, + { + "epoch": 0.643461163182638, + "grad_norm": 0.08872636407613754, + "learning_rate": 1.7893073361823364e-05, + "loss": 0.0185, + "step": 87050 + }, + { + "epoch": 0.6435350817539398, + "grad_norm": 0.058952391147613525, + "learning_rate": 1.7889363722697056e-05, + "loss": 0.0169, + "step": 87060 + }, + { + "epoch": 0.6436090003252417, + "grad_norm": 0.09729276597499847, + "learning_rate": 1.7885654083570752e-05, + "loss": 0.018, + "step": 87070 + }, + { + "epoch": 0.6436829188965436, + "grad_norm": 0.07857722043991089, + "learning_rate": 1.7881944444444445e-05, + "loss": 0.0177, + "step": 87080 + }, + { + "epoch": 0.6437568374678454, + "grad_norm": 0.07660865783691406, + "learning_rate": 1.787823480531814e-05, + "loss": 0.0168, + "step": 87090 + }, + { + "epoch": 0.6438307560391473, + "grad_norm": 0.07207610458135605, + "learning_rate": 1.7874525166191833e-05, + "loss": 0.0165, + "step": 87100 + }, + { + "epoch": 0.6439046746104491, + "grad_norm": 0.07649290561676025, + "learning_rate": 
1.787081552706553e-05, + "loss": 0.0166, + "step": 87110 + }, + { + "epoch": 0.643978593181751, + "grad_norm": 0.10615783929824829, + "learning_rate": 1.7867105887939222e-05, + "loss": 0.0194, + "step": 87120 + }, + { + "epoch": 0.6440525117530529, + "grad_norm": 0.07937850058078766, + "learning_rate": 1.7863396248812918e-05, + "loss": 0.0167, + "step": 87130 + }, + { + "epoch": 0.6441264303243547, + "grad_norm": 0.07615212351083755, + "learning_rate": 1.785968660968661e-05, + "loss": 0.0183, + "step": 87140 + }, + { + "epoch": 0.6442003488956566, + "grad_norm": 0.08921714127063751, + "learning_rate": 1.7855976970560303e-05, + "loss": 0.0147, + "step": 87150 + }, + { + "epoch": 0.6442742674669584, + "grad_norm": 0.06732451915740967, + "learning_rate": 1.7852267331434e-05, + "loss": 0.0205, + "step": 87160 + }, + { + "epoch": 0.6443481860382603, + "grad_norm": 0.08228149265050888, + "learning_rate": 1.7848557692307695e-05, + "loss": 0.0193, + "step": 87170 + }, + { + "epoch": 0.6444221046095621, + "grad_norm": 0.05445276200771332, + "learning_rate": 1.7844848053181387e-05, + "loss": 0.0168, + "step": 87180 + }, + { + "epoch": 0.644496023180864, + "grad_norm": 0.09085311740636826, + "learning_rate": 1.784113841405508e-05, + "loss": 0.0186, + "step": 87190 + }, + { + "epoch": 0.6445699417521659, + "grad_norm": 0.07711207121610641, + "learning_rate": 1.7837428774928776e-05, + "loss": 0.0161, + "step": 87200 + }, + { + "epoch": 0.6446438603234677, + "grad_norm": 0.06373975425958633, + "learning_rate": 1.7833719135802472e-05, + "loss": 0.0167, + "step": 87210 + }, + { + "epoch": 0.6447177788947696, + "grad_norm": 0.07344771176576614, + "learning_rate": 1.7830009496676164e-05, + "loss": 0.0189, + "step": 87220 + }, + { + "epoch": 0.6447916974660713, + "grad_norm": 0.08050648123025894, + "learning_rate": 1.7826299857549857e-05, + "loss": 0.0185, + "step": 87230 + }, + { + "epoch": 0.6448656160373732, + "grad_norm": 0.07578188180923462, + "learning_rate": 1.7822590218423553e-05, + "loss": 0.017, + "step": 87240 + }, + { + "epoch": 0.644939534608675, + "grad_norm": 0.07016967982053757, + "learning_rate": 1.781888057929725e-05, + "loss": 0.0181, + "step": 87250 + }, + { + "epoch": 0.6450134531799769, + "grad_norm": 0.08886934816837311, + "learning_rate": 1.781517094017094e-05, + "loss": 0.0181, + "step": 87260 + }, + { + "epoch": 0.6450873717512788, + "grad_norm": 0.09196509420871735, + "learning_rate": 1.7811461301044634e-05, + "loss": 0.0196, + "step": 87270 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.059960197657346725, + "learning_rate": 1.780775166191833e-05, + "loss": 0.0166, + "step": 87280 + }, + { + "epoch": 0.6452352088938825, + "grad_norm": 0.06782221794128418, + "learning_rate": 1.7804042022792023e-05, + "loss": 0.0165, + "step": 87290 + }, + { + "epoch": 0.6453091274651843, + "grad_norm": 0.09103874862194061, + "learning_rate": 1.780033238366572e-05, + "loss": 0.0171, + "step": 87300 + }, + { + "epoch": 0.6453830460364862, + "grad_norm": 0.08995518088340759, + "learning_rate": 1.779662274453941e-05, + "loss": 0.0151, + "step": 87310 + }, + { + "epoch": 0.645456964607788, + "grad_norm": 0.09144158661365509, + "learning_rate": 1.7792913105413107e-05, + "loss": 0.0186, + "step": 87320 + }, + { + "epoch": 0.6455308831790899, + "grad_norm": 0.09462243318557739, + "learning_rate": 1.77892034662868e-05, + "loss": 0.0157, + "step": 87330 + }, + { + "epoch": 0.6456048017503918, + "grad_norm": 0.08410174399614334, + "learning_rate": 1.7785493827160496e-05, + "loss": 0.018, + "step": 
87340 + }, + { + "epoch": 0.6456787203216936, + "grad_norm": 0.06735754013061523, + "learning_rate": 1.778178418803419e-05, + "loss": 0.0177, + "step": 87350 + }, + { + "epoch": 0.6457526388929955, + "grad_norm": 0.08470659703016281, + "learning_rate": 1.7778074548907884e-05, + "loss": 0.0195, + "step": 87360 + }, + { + "epoch": 0.6458265574642973, + "grad_norm": 0.08102233707904816, + "learning_rate": 1.7774364909781577e-05, + "loss": 0.0164, + "step": 87370 + }, + { + "epoch": 0.6459004760355992, + "grad_norm": 0.08035475015640259, + "learning_rate": 1.777065527065527e-05, + "loss": 0.0173, + "step": 87380 + }, + { + "epoch": 0.6459743946069011, + "grad_norm": 0.07792873680591583, + "learning_rate": 1.7766945631528965e-05, + "loss": 0.0194, + "step": 87390 + }, + { + "epoch": 0.6460483131782029, + "grad_norm": 0.09555920958518982, + "learning_rate": 1.776323599240266e-05, + "loss": 0.0177, + "step": 87400 + }, + { + "epoch": 0.6461222317495048, + "grad_norm": 0.09256338328123093, + "learning_rate": 1.7759526353276354e-05, + "loss": 0.0192, + "step": 87410 + }, + { + "epoch": 0.6461961503208066, + "grad_norm": 0.07088413834571838, + "learning_rate": 1.7755816714150047e-05, + "loss": 0.0181, + "step": 87420 + }, + { + "epoch": 0.6462700688921085, + "grad_norm": 0.07728109508752823, + "learning_rate": 1.7752107075023743e-05, + "loss": 0.0158, + "step": 87430 + }, + { + "epoch": 0.6463439874634103, + "grad_norm": 0.0669608861207962, + "learning_rate": 1.774839743589744e-05, + "loss": 0.0168, + "step": 87440 + }, + { + "epoch": 0.6464179060347122, + "grad_norm": 0.09193000197410583, + "learning_rate": 1.774468779677113e-05, + "loss": 0.0168, + "step": 87450 + }, + { + "epoch": 0.6464918246060141, + "grad_norm": 0.085015207529068, + "learning_rate": 1.7740978157644824e-05, + "loss": 0.0187, + "step": 87460 + }, + { + "epoch": 0.6465657431773159, + "grad_norm": 0.08341581374406815, + "learning_rate": 1.773726851851852e-05, + "loss": 0.016, + "step": 87470 + }, + { + "epoch": 0.6466396617486178, + "grad_norm": 0.07875963300466537, + "learning_rate": 1.7733558879392216e-05, + "loss": 0.0188, + "step": 87480 + }, + { + "epoch": 0.6467135803199195, + "grad_norm": 0.05918658524751663, + "learning_rate": 1.7729849240265908e-05, + "loss": 0.0188, + "step": 87490 + }, + { + "epoch": 0.6467874988912214, + "grad_norm": 0.0708702951669693, + "learning_rate": 1.77261396011396e-05, + "loss": 0.0179, + "step": 87500 + }, + { + "epoch": 0.6468614174625232, + "grad_norm": 0.08807706832885742, + "learning_rate": 1.7722429962013297e-05, + "loss": 0.0177, + "step": 87510 + }, + { + "epoch": 0.6469353360338251, + "grad_norm": 0.08028735220432281, + "learning_rate": 1.771872032288699e-05, + "loss": 0.0168, + "step": 87520 + }, + { + "epoch": 0.647009254605127, + "grad_norm": 0.07263261079788208, + "learning_rate": 1.7715010683760685e-05, + "loss": 0.0173, + "step": 87530 + }, + { + "epoch": 0.6470831731764288, + "grad_norm": 0.06898120045661926, + "learning_rate": 1.7711301044634378e-05, + "loss": 0.0141, + "step": 87540 + }, + { + "epoch": 0.6471570917477307, + "grad_norm": 0.06692709028720856, + "learning_rate": 1.7707591405508074e-05, + "loss": 0.0154, + "step": 87550 + }, + { + "epoch": 0.6472310103190325, + "grad_norm": 0.11443676054477692, + "learning_rate": 1.7703881766381766e-05, + "loss": 0.0161, + "step": 87560 + }, + { + "epoch": 0.6473049288903344, + "grad_norm": 0.07005342096090317, + "learning_rate": 1.7700172127255462e-05, + "loss": 0.0169, + "step": 87570 + }, + { + "epoch": 0.6473788474616362, + 
"grad_norm": 0.08362290263175964, + "learning_rate": 1.7696462488129155e-05, + "loss": 0.0186, + "step": 87580 + }, + { + "epoch": 0.6474527660329381, + "grad_norm": 0.07407110184431076, + "learning_rate": 1.769275284900285e-05, + "loss": 0.017, + "step": 87590 + }, + { + "epoch": 0.64752668460424, + "grad_norm": 0.07652756571769714, + "learning_rate": 1.7689043209876543e-05, + "loss": 0.015, + "step": 87600 + }, + { + "epoch": 0.6476006031755418, + "grad_norm": 0.08048743009567261, + "learning_rate": 1.7685333570750236e-05, + "loss": 0.0169, + "step": 87610 + }, + { + "epoch": 0.6476745217468437, + "grad_norm": 0.10383118689060211, + "learning_rate": 1.7681623931623932e-05, + "loss": 0.0178, + "step": 87620 + }, + { + "epoch": 0.6477484403181455, + "grad_norm": 0.07576031982898712, + "learning_rate": 1.7677914292497628e-05, + "loss": 0.0185, + "step": 87630 + }, + { + "epoch": 0.6478223588894474, + "grad_norm": 0.06110012158751488, + "learning_rate": 1.767420465337132e-05, + "loss": 0.0161, + "step": 87640 + }, + { + "epoch": 0.6478962774607493, + "grad_norm": 0.07275137305259705, + "learning_rate": 1.7670495014245013e-05, + "loss": 0.0187, + "step": 87650 + }, + { + "epoch": 0.6479701960320511, + "grad_norm": 0.09735322743654251, + "learning_rate": 1.766678537511871e-05, + "loss": 0.016, + "step": 87660 + }, + { + "epoch": 0.648044114603353, + "grad_norm": 0.06891202181577682, + "learning_rate": 1.7663075735992405e-05, + "loss": 0.0165, + "step": 87670 + }, + { + "epoch": 0.6481180331746548, + "grad_norm": 0.08524206280708313, + "learning_rate": 1.7659366096866098e-05, + "loss": 0.0173, + "step": 87680 + }, + { + "epoch": 0.6481919517459567, + "grad_norm": 0.07716153562068939, + "learning_rate": 1.765565645773979e-05, + "loss": 0.0175, + "step": 87690 + }, + { + "epoch": 0.6482658703172585, + "grad_norm": 0.06978312879800797, + "learning_rate": 1.7651946818613486e-05, + "loss": 0.0168, + "step": 87700 + }, + { + "epoch": 0.6483397888885604, + "grad_norm": 0.10100270807743073, + "learning_rate": 1.7648237179487182e-05, + "loss": 0.018, + "step": 87710 + }, + { + "epoch": 0.6484137074598623, + "grad_norm": 0.11189740151166916, + "learning_rate": 1.7644527540360875e-05, + "loss": 0.018, + "step": 87720 + }, + { + "epoch": 0.648487626031164, + "grad_norm": 0.08508005738258362, + "learning_rate": 1.7640817901234567e-05, + "loss": 0.0183, + "step": 87730 + }, + { + "epoch": 0.648561544602466, + "grad_norm": 0.0837252214550972, + "learning_rate": 1.7637108262108263e-05, + "loss": 0.0183, + "step": 87740 + }, + { + "epoch": 0.6486354631737677, + "grad_norm": 0.07493939995765686, + "learning_rate": 1.7633398622981956e-05, + "loss": 0.0145, + "step": 87750 + }, + { + "epoch": 0.6487093817450696, + "grad_norm": 0.07387962937355042, + "learning_rate": 1.7629688983855652e-05, + "loss": 0.0149, + "step": 87760 + }, + { + "epoch": 0.6487833003163714, + "grad_norm": 0.07578963786363602, + "learning_rate": 1.7625979344729344e-05, + "loss": 0.0179, + "step": 87770 + }, + { + "epoch": 0.6488572188876733, + "grad_norm": 0.07460996508598328, + "learning_rate": 1.762226970560304e-05, + "loss": 0.0179, + "step": 87780 + }, + { + "epoch": 0.6489311374589752, + "grad_norm": 0.07213159650564194, + "learning_rate": 1.7618560066476733e-05, + "loss": 0.0181, + "step": 87790 + }, + { + "epoch": 0.649005056030277, + "grad_norm": 0.10759951174259186, + "learning_rate": 1.761485042735043e-05, + "loss": 0.0198, + "step": 87800 + }, + { + "epoch": 0.6490789746015789, + "grad_norm": 0.06241863965988159, + "learning_rate": 
1.761114078822412e-05, + "loss": 0.0156, + "step": 87810 + }, + { + "epoch": 0.6491528931728807, + "grad_norm": 0.06056777387857437, + "learning_rate": 1.7607431149097817e-05, + "loss": 0.0171, + "step": 87820 + }, + { + "epoch": 0.6492268117441826, + "grad_norm": 0.09477005153894424, + "learning_rate": 1.760372150997151e-05, + "loss": 0.0194, + "step": 87830 + }, + { + "epoch": 0.6493007303154844, + "grad_norm": 0.09164115786552429, + "learning_rate": 1.7600011870845203e-05, + "loss": 0.0161, + "step": 87840 + }, + { + "epoch": 0.6493746488867863, + "grad_norm": 0.08950028568506241, + "learning_rate": 1.75963022317189e-05, + "loss": 0.017, + "step": 87850 + }, + { + "epoch": 0.6494485674580882, + "grad_norm": 0.06165635585784912, + "learning_rate": 1.7592592592592595e-05, + "loss": 0.017, + "step": 87860 + }, + { + "epoch": 0.64952248602939, + "grad_norm": 0.12589341402053833, + "learning_rate": 1.7588882953466287e-05, + "loss": 0.0167, + "step": 87870 + }, + { + "epoch": 0.6495964046006919, + "grad_norm": 0.07935626804828644, + "learning_rate": 1.758517331433998e-05, + "loss": 0.0179, + "step": 87880 + }, + { + "epoch": 0.6496703231719937, + "grad_norm": 0.08484697341918945, + "learning_rate": 1.758146367521368e-05, + "loss": 0.0194, + "step": 87890 + }, + { + "epoch": 0.6497442417432956, + "grad_norm": 0.06872183829545975, + "learning_rate": 1.757775403608737e-05, + "loss": 0.0168, + "step": 87900 + }, + { + "epoch": 0.6498181603145975, + "grad_norm": 0.11150965839624405, + "learning_rate": 1.7574044396961064e-05, + "loss": 0.0188, + "step": 87910 + }, + { + "epoch": 0.6498920788858993, + "grad_norm": 0.07173824310302734, + "learning_rate": 1.7570334757834757e-05, + "loss": 0.0207, + "step": 87920 + }, + { + "epoch": 0.6499659974572012, + "grad_norm": 0.07741278409957886, + "learning_rate": 1.7566625118708453e-05, + "loss": 0.0181, + "step": 87930 + }, + { + "epoch": 0.650039916028503, + "grad_norm": 0.06695520877838135, + "learning_rate": 1.756291547958215e-05, + "loss": 0.0159, + "step": 87940 + }, + { + "epoch": 0.6501138345998049, + "grad_norm": 0.08548988401889801, + "learning_rate": 1.755920584045584e-05, + "loss": 0.0172, + "step": 87950 + }, + { + "epoch": 0.6501877531711067, + "grad_norm": 0.07158133387565613, + "learning_rate": 1.7555496201329534e-05, + "loss": 0.0161, + "step": 87960 + }, + { + "epoch": 0.6502616717424086, + "grad_norm": 0.10227443277835846, + "learning_rate": 1.755178656220323e-05, + "loss": 0.0188, + "step": 87970 + }, + { + "epoch": 0.6503355903137105, + "grad_norm": 0.06894666701555252, + "learning_rate": 1.7548076923076922e-05, + "loss": 0.0183, + "step": 87980 + }, + { + "epoch": 0.6504095088850123, + "grad_norm": 0.09842050820589066, + "learning_rate": 1.754436728395062e-05, + "loss": 0.0192, + "step": 87990 + }, + { + "epoch": 0.6504834274563142, + "grad_norm": 0.07729904353618622, + "learning_rate": 1.754065764482431e-05, + "loss": 0.0178, + "step": 88000 + }, + { + "epoch": 0.650557346027616, + "grad_norm": 0.07297061383724213, + "learning_rate": 1.7536948005698007e-05, + "loss": 0.0155, + "step": 88010 + }, + { + "epoch": 0.6506312645989178, + "grad_norm": 0.07612710446119308, + "learning_rate": 1.75332383665717e-05, + "loss": 0.0175, + "step": 88020 + }, + { + "epoch": 0.6507051831702196, + "grad_norm": 0.08314087241888046, + "learning_rate": 1.7529528727445396e-05, + "loss": 0.0187, + "step": 88030 + }, + { + "epoch": 0.6507791017415215, + "grad_norm": 0.10151941329240799, + "learning_rate": 1.752581908831909e-05, + "loss": 0.0167, + "step": 
88040 + }, + { + "epoch": 0.6508530203128234, + "grad_norm": 0.06678344309329987, + "learning_rate": 1.7522109449192784e-05, + "loss": 0.017, + "step": 88050 + }, + { + "epoch": 0.6509269388841252, + "grad_norm": 0.06826924532651901, + "learning_rate": 1.7518399810066477e-05, + "loss": 0.0167, + "step": 88060 + }, + { + "epoch": 0.6510008574554271, + "grad_norm": 0.06449755281209946, + "learning_rate": 1.751469017094017e-05, + "loss": 0.0188, + "step": 88070 + }, + { + "epoch": 0.6510747760267289, + "grad_norm": 0.06460060179233551, + "learning_rate": 1.751098053181387e-05, + "loss": 0.0171, + "step": 88080 + }, + { + "epoch": 0.6511486945980308, + "grad_norm": 0.09221690893173218, + "learning_rate": 1.750727089268756e-05, + "loss": 0.0193, + "step": 88090 + }, + { + "epoch": 0.6512226131693326, + "grad_norm": 0.11470767110586166, + "learning_rate": 1.7503561253561254e-05, + "loss": 0.0175, + "step": 88100 + }, + { + "epoch": 0.6512965317406345, + "grad_norm": 0.09464261680841446, + "learning_rate": 1.7499851614434946e-05, + "loss": 0.0165, + "step": 88110 + }, + { + "epoch": 0.6513704503119364, + "grad_norm": 0.09214161336421967, + "learning_rate": 1.7496141975308646e-05, + "loss": 0.0175, + "step": 88120 + }, + { + "epoch": 0.6514443688832382, + "grad_norm": 0.07291626185178757, + "learning_rate": 1.7492432336182338e-05, + "loss": 0.0187, + "step": 88130 + }, + { + "epoch": 0.6515182874545401, + "grad_norm": 0.1113857626914978, + "learning_rate": 1.748872269705603e-05, + "loss": 0.0203, + "step": 88140 + }, + { + "epoch": 0.6515922060258419, + "grad_norm": 0.07568159699440002, + "learning_rate": 1.7485013057929723e-05, + "loss": 0.0208, + "step": 88150 + }, + { + "epoch": 0.6516661245971438, + "grad_norm": 0.09512155503034592, + "learning_rate": 1.748130341880342e-05, + "loss": 0.0172, + "step": 88160 + }, + { + "epoch": 0.6517400431684457, + "grad_norm": 0.08339069038629532, + "learning_rate": 1.7477593779677115e-05, + "loss": 0.0192, + "step": 88170 + }, + { + "epoch": 0.6518139617397475, + "grad_norm": 0.09774073213338852, + "learning_rate": 1.7473884140550808e-05, + "loss": 0.0193, + "step": 88180 + }, + { + "epoch": 0.6518878803110494, + "grad_norm": 0.08510401099920273, + "learning_rate": 1.74701745014245e-05, + "loss": 0.0195, + "step": 88190 + }, + { + "epoch": 0.6519617988823512, + "grad_norm": 0.07104596495628357, + "learning_rate": 1.7466464862298196e-05, + "loss": 0.0165, + "step": 88200 + }, + { + "epoch": 0.6520357174536531, + "grad_norm": 0.06436054408550262, + "learning_rate": 1.746275522317189e-05, + "loss": 0.0178, + "step": 88210 + }, + { + "epoch": 0.6521096360249549, + "grad_norm": 0.05952128395438194, + "learning_rate": 1.7459045584045585e-05, + "loss": 0.0171, + "step": 88220 + }, + { + "epoch": 0.6521835545962568, + "grad_norm": 0.0875726044178009, + "learning_rate": 1.745533594491928e-05, + "loss": 0.0181, + "step": 88230 + }, + { + "epoch": 0.6522574731675587, + "grad_norm": 0.07102656364440918, + "learning_rate": 1.7451626305792974e-05, + "loss": 0.0184, + "step": 88240 + }, + { + "epoch": 0.6523313917388605, + "grad_norm": 0.07627592235803604, + "learning_rate": 1.7447916666666666e-05, + "loss": 0.0201, + "step": 88250 + }, + { + "epoch": 0.6524053103101624, + "grad_norm": 0.07031363993883133, + "learning_rate": 1.7444207027540362e-05, + "loss": 0.0169, + "step": 88260 + }, + { + "epoch": 0.6524792288814641, + "grad_norm": 0.06708788871765137, + "learning_rate": 1.7440497388414058e-05, + "loss": 0.0196, + "step": 88270 + }, + { + "epoch": 0.652553147452766, + 
"grad_norm": 0.08406656980514526, + "learning_rate": 1.743678774928775e-05, + "loss": 0.019, + "step": 88280 + }, + { + "epoch": 0.6526270660240678, + "grad_norm": 0.06256931275129318, + "learning_rate": 1.7433078110161443e-05, + "loss": 0.0157, + "step": 88290 + }, + { + "epoch": 0.6527009845953697, + "grad_norm": 0.06588439643383026, + "learning_rate": 1.7429368471035136e-05, + "loss": 0.017, + "step": 88300 + }, + { + "epoch": 0.6527749031666716, + "grad_norm": 0.09346655011177063, + "learning_rate": 1.7425658831908835e-05, + "loss": 0.0163, + "step": 88310 + }, + { + "epoch": 0.6528488217379734, + "grad_norm": 0.07473476231098175, + "learning_rate": 1.7421949192782528e-05, + "loss": 0.0168, + "step": 88320 + }, + { + "epoch": 0.6529227403092753, + "grad_norm": 0.06457147747278214, + "learning_rate": 1.741823955365622e-05, + "loss": 0.0191, + "step": 88330 + }, + { + "epoch": 0.6529966588805771, + "grad_norm": 0.06038924679160118, + "learning_rate": 1.7414529914529913e-05, + "loss": 0.016, + "step": 88340 + }, + { + "epoch": 0.653070577451879, + "grad_norm": 0.07191398739814758, + "learning_rate": 1.7410820275403612e-05, + "loss": 0.0176, + "step": 88350 + }, + { + "epoch": 0.6531444960231808, + "grad_norm": 0.0851953998208046, + "learning_rate": 1.7407110636277305e-05, + "loss": 0.0174, + "step": 88360 + }, + { + "epoch": 0.6532184145944827, + "grad_norm": 0.06611715257167816, + "learning_rate": 1.7403400997150997e-05, + "loss": 0.0159, + "step": 88370 + }, + { + "epoch": 0.6532923331657846, + "grad_norm": 0.07906629145145416, + "learning_rate": 1.7399691358024693e-05, + "loss": 0.017, + "step": 88380 + }, + { + "epoch": 0.6533662517370864, + "grad_norm": 0.07506294548511505, + "learning_rate": 1.7395981718898386e-05, + "loss": 0.017, + "step": 88390 + }, + { + "epoch": 0.6534401703083883, + "grad_norm": 0.07016431540250778, + "learning_rate": 1.7392272079772082e-05, + "loss": 0.0187, + "step": 88400 + }, + { + "epoch": 0.6535140888796901, + "grad_norm": 0.08150769025087357, + "learning_rate": 1.7388562440645774e-05, + "loss": 0.0178, + "step": 88410 + }, + { + "epoch": 0.653588007450992, + "grad_norm": 0.08036355674266815, + "learning_rate": 1.738485280151947e-05, + "loss": 0.0191, + "step": 88420 + }, + { + "epoch": 0.6536619260222939, + "grad_norm": 0.07620055973529816, + "learning_rate": 1.7381143162393163e-05, + "loss": 0.016, + "step": 88430 + }, + { + "epoch": 0.6537358445935957, + "grad_norm": 0.08634445816278458, + "learning_rate": 1.7377433523266856e-05, + "loss": 0.0166, + "step": 88440 + }, + { + "epoch": 0.6538097631648976, + "grad_norm": 0.09410831332206726, + "learning_rate": 1.737372388414055e-05, + "loss": 0.02, + "step": 88450 + }, + { + "epoch": 0.6538836817361994, + "grad_norm": 0.08769334107637405, + "learning_rate": 1.7370014245014248e-05, + "loss": 0.0154, + "step": 88460 + }, + { + "epoch": 0.6539576003075013, + "grad_norm": 0.05638042092323303, + "learning_rate": 1.736630460588794e-05, + "loss": 0.0162, + "step": 88470 + }, + { + "epoch": 0.6540315188788031, + "grad_norm": 0.08551321923732758, + "learning_rate": 1.7362594966761633e-05, + "loss": 0.0212, + "step": 88480 + }, + { + "epoch": 0.654105437450105, + "grad_norm": 0.06933954358100891, + "learning_rate": 1.735888532763533e-05, + "loss": 0.0204, + "step": 88490 + }, + { + "epoch": 0.6541793560214069, + "grad_norm": 0.07857784628868103, + "learning_rate": 1.7355175688509025e-05, + "loss": 0.021, + "step": 88500 + }, + { + "epoch": 0.6542532745927087, + "grad_norm": 0.06191938742995262, + "learning_rate": 
1.7351466049382717e-05, + "loss": 0.017, + "step": 88510 + }, + { + "epoch": 0.6543271931640106, + "grad_norm": 0.07220283150672913, + "learning_rate": 1.734775641025641e-05, + "loss": 0.0154, + "step": 88520 + }, + { + "epoch": 0.6544011117353123, + "grad_norm": 0.08090227842330933, + "learning_rate": 1.7344046771130106e-05, + "loss": 0.0175, + "step": 88530 + }, + { + "epoch": 0.6544750303066142, + "grad_norm": 0.06821764260530472, + "learning_rate": 1.7340337132003802e-05, + "loss": 0.0153, + "step": 88540 + }, + { + "epoch": 0.654548948877916, + "grad_norm": 0.07408381253480911, + "learning_rate": 1.7336627492877494e-05, + "loss": 0.0161, + "step": 88550 + }, + { + "epoch": 0.6546228674492179, + "grad_norm": 0.09872277826070786, + "learning_rate": 1.7332917853751187e-05, + "loss": 0.0143, + "step": 88560 + }, + { + "epoch": 0.6546967860205198, + "grad_norm": 0.09294930100440979, + "learning_rate": 1.7329208214624883e-05, + "loss": 0.0184, + "step": 88570 + }, + { + "epoch": 0.6547707045918216, + "grad_norm": 0.0852140411734581, + "learning_rate": 1.732549857549858e-05, + "loss": 0.0169, + "step": 88580 + }, + { + "epoch": 0.6548446231631235, + "grad_norm": 0.06658681482076645, + "learning_rate": 1.732178893637227e-05, + "loss": 0.0189, + "step": 88590 + }, + { + "epoch": 0.6549185417344253, + "grad_norm": 0.06592123210430145, + "learning_rate": 1.7318079297245964e-05, + "loss": 0.0124, + "step": 88600 + }, + { + "epoch": 0.6549924603057272, + "grad_norm": 0.06855089962482452, + "learning_rate": 1.731436965811966e-05, + "loss": 0.018, + "step": 88610 + }, + { + "epoch": 0.655066378877029, + "grad_norm": 0.0831160768866539, + "learning_rate": 1.7310660018993353e-05, + "loss": 0.0157, + "step": 88620 + }, + { + "epoch": 0.6551402974483309, + "grad_norm": 0.09856075793504715, + "learning_rate": 1.730695037986705e-05, + "loss": 0.0197, + "step": 88630 + }, + { + "epoch": 0.6552142160196328, + "grad_norm": 0.1038886234164238, + "learning_rate": 1.730324074074074e-05, + "loss": 0.0182, + "step": 88640 + }, + { + "epoch": 0.6552881345909346, + "grad_norm": 0.08120853453874588, + "learning_rate": 1.7299531101614437e-05, + "loss": 0.0183, + "step": 88650 + }, + { + "epoch": 0.6553620531622365, + "grad_norm": 0.09115610271692276, + "learning_rate": 1.729582146248813e-05, + "loss": 0.0169, + "step": 88660 + }, + { + "epoch": 0.6554359717335383, + "grad_norm": 0.10014880448579788, + "learning_rate": 1.7292111823361822e-05, + "loss": 0.0209, + "step": 88670 + }, + { + "epoch": 0.6555098903048402, + "grad_norm": 0.06310135871171951, + "learning_rate": 1.7288402184235518e-05, + "loss": 0.0195, + "step": 88680 + }, + { + "epoch": 0.6555838088761421, + "grad_norm": 0.08188183605670929, + "learning_rate": 1.7284692545109214e-05, + "loss": 0.0172, + "step": 88690 + }, + { + "epoch": 0.6556577274474439, + "grad_norm": 0.0723811686038971, + "learning_rate": 1.7280982905982907e-05, + "loss": 0.0152, + "step": 88700 + }, + { + "epoch": 0.6557316460187458, + "grad_norm": 0.07690294831991196, + "learning_rate": 1.72772732668566e-05, + "loss": 0.0162, + "step": 88710 + }, + { + "epoch": 0.6558055645900476, + "grad_norm": 0.07310546934604645, + "learning_rate": 1.7273563627730295e-05, + "loss": 0.0186, + "step": 88720 + }, + { + "epoch": 0.6558794831613495, + "grad_norm": 0.07603324204683304, + "learning_rate": 1.726985398860399e-05, + "loss": 0.016, + "step": 88730 + }, + { + "epoch": 0.6559534017326513, + "grad_norm": 0.07487497478723526, + "learning_rate": 1.7266144349477684e-05, + "loss": 0.0177, + "step": 
88740 + }, + { + "epoch": 0.6560273203039532, + "grad_norm": 0.06836133450269699, + "learning_rate": 1.7262434710351376e-05, + "loss": 0.0152, + "step": 88750 + }, + { + "epoch": 0.6561012388752551, + "grad_norm": 0.0726594552397728, + "learning_rate": 1.7258725071225072e-05, + "loss": 0.0186, + "step": 88760 + }, + { + "epoch": 0.6561751574465569, + "grad_norm": 0.0918281301856041, + "learning_rate": 1.725501543209877e-05, + "loss": 0.0181, + "step": 88770 + }, + { + "epoch": 0.6562490760178588, + "grad_norm": 0.0876714214682579, + "learning_rate": 1.725130579297246e-05, + "loss": 0.0188, + "step": 88780 + }, + { + "epoch": 0.6563229945891605, + "grad_norm": 0.08103278279304504, + "learning_rate": 1.7247596153846153e-05, + "loss": 0.0168, + "step": 88790 + }, + { + "epoch": 0.6563969131604624, + "grad_norm": 0.06613600999116898, + "learning_rate": 1.724388651471985e-05, + "loss": 0.0185, + "step": 88800 + }, + { + "epoch": 0.6564708317317642, + "grad_norm": 0.08758364617824554, + "learning_rate": 1.7240176875593545e-05, + "loss": 0.0154, + "step": 88810 + }, + { + "epoch": 0.6565447503030661, + "grad_norm": 0.09205630421638489, + "learning_rate": 1.7236467236467238e-05, + "loss": 0.0175, + "step": 88820 + }, + { + "epoch": 0.656618668874368, + "grad_norm": 0.06685183942317963, + "learning_rate": 1.723275759734093e-05, + "loss": 0.0167, + "step": 88830 + }, + { + "epoch": 0.6566925874456698, + "grad_norm": 0.07834669202566147, + "learning_rate": 1.7229047958214627e-05, + "loss": 0.0175, + "step": 88840 + }, + { + "epoch": 0.6567665060169717, + "grad_norm": 0.07539256662130356, + "learning_rate": 1.722533831908832e-05, + "loss": 0.0163, + "step": 88850 + }, + { + "epoch": 0.6568404245882735, + "grad_norm": 0.10026615858078003, + "learning_rate": 1.7221628679962015e-05, + "loss": 0.0177, + "step": 88860 + }, + { + "epoch": 0.6569143431595754, + "grad_norm": 0.06142430007457733, + "learning_rate": 1.7217919040835708e-05, + "loss": 0.0154, + "step": 88870 + }, + { + "epoch": 0.6569882617308773, + "grad_norm": 0.07194249331951141, + "learning_rate": 1.7214209401709404e-05, + "loss": 0.0156, + "step": 88880 + }, + { + "epoch": 0.6570621803021791, + "grad_norm": 0.09324514865875244, + "learning_rate": 1.7210499762583096e-05, + "loss": 0.0173, + "step": 88890 + }, + { + "epoch": 0.657136098873481, + "grad_norm": 0.07574540376663208, + "learning_rate": 1.720679012345679e-05, + "loss": 0.0194, + "step": 88900 + }, + { + "epoch": 0.6572100174447828, + "grad_norm": 0.08077821880578995, + "learning_rate": 1.7203080484330485e-05, + "loss": 0.0184, + "step": 88910 + }, + { + "epoch": 0.6572839360160847, + "grad_norm": 0.1079104095697403, + "learning_rate": 1.719937084520418e-05, + "loss": 0.0178, + "step": 88920 + }, + { + "epoch": 0.6573578545873865, + "grad_norm": 0.09008293598890305, + "learning_rate": 1.7195661206077873e-05, + "loss": 0.0169, + "step": 88930 + }, + { + "epoch": 0.6574317731586884, + "grad_norm": 0.04996965825557709, + "learning_rate": 1.7191951566951566e-05, + "loss": 0.0163, + "step": 88940 + }, + { + "epoch": 0.6575056917299903, + "grad_norm": 0.06016021594405174, + "learning_rate": 1.7188241927825262e-05, + "loss": 0.0193, + "step": 88950 + }, + { + "epoch": 0.6575796103012921, + "grad_norm": 0.08215005695819855, + "learning_rate": 1.7184532288698958e-05, + "loss": 0.0188, + "step": 88960 + }, + { + "epoch": 0.657653528872594, + "grad_norm": 0.08095736801624298, + "learning_rate": 1.718082264957265e-05, + "loss": 0.021, + "step": 88970 + }, + { + "epoch": 0.6577274474438958, + 
"grad_norm": 0.09990512579679489, + "learning_rate": 1.7177113010446343e-05, + "loss": 0.0174, + "step": 88980 + }, + { + "epoch": 0.6578013660151977, + "grad_norm": 0.09206452965736389, + "learning_rate": 1.717340337132004e-05, + "loss": 0.0178, + "step": 88990 + }, + { + "epoch": 0.6578752845864995, + "grad_norm": 0.06057227775454521, + "learning_rate": 1.7169693732193735e-05, + "loss": 0.0191, + "step": 89000 + }, + { + "epoch": 0.6579492031578014, + "grad_norm": 0.08504120260477066, + "learning_rate": 1.7165984093067427e-05, + "loss": 0.0171, + "step": 89010 + }, + { + "epoch": 0.6580231217291033, + "grad_norm": 0.06572899222373962, + "learning_rate": 1.716227445394112e-05, + "loss": 0.0182, + "step": 89020 + }, + { + "epoch": 0.6580970403004051, + "grad_norm": 0.08160002529621124, + "learning_rate": 1.7158564814814816e-05, + "loss": 0.0151, + "step": 89030 + }, + { + "epoch": 0.658170958871707, + "grad_norm": 0.09511744230985641, + "learning_rate": 1.7154855175688512e-05, + "loss": 0.0148, + "step": 89040 + }, + { + "epoch": 0.6582448774430087, + "grad_norm": 0.07572203129529953, + "learning_rate": 1.7151145536562205e-05, + "loss": 0.0176, + "step": 89050 + }, + { + "epoch": 0.6583187960143106, + "grad_norm": 0.05752973258495331, + "learning_rate": 1.7147435897435897e-05, + "loss": 0.0144, + "step": 89060 + }, + { + "epoch": 0.6583927145856124, + "grad_norm": 0.06448008865118027, + "learning_rate": 1.7143726258309593e-05, + "loss": 0.017, + "step": 89070 + }, + { + "epoch": 0.6584666331569143, + "grad_norm": 0.09872671216726303, + "learning_rate": 1.7140016619183286e-05, + "loss": 0.018, + "step": 89080 + }, + { + "epoch": 0.6585405517282162, + "grad_norm": 0.11234050989151001, + "learning_rate": 1.713630698005698e-05, + "loss": 0.0195, + "step": 89090 + }, + { + "epoch": 0.658614470299518, + "grad_norm": 0.06337179988622665, + "learning_rate": 1.7132597340930674e-05, + "loss": 0.0189, + "step": 89100 + }, + { + "epoch": 0.6586883888708199, + "grad_norm": 0.06142707169055939, + "learning_rate": 1.712888770180437e-05, + "loss": 0.0181, + "step": 89110 + }, + { + "epoch": 0.6587623074421217, + "grad_norm": 0.0669117197394371, + "learning_rate": 1.7125178062678063e-05, + "loss": 0.0163, + "step": 89120 + }, + { + "epoch": 0.6588362260134236, + "grad_norm": 0.16983623802661896, + "learning_rate": 1.7121468423551755e-05, + "loss": 0.0177, + "step": 89130 + }, + { + "epoch": 0.6589101445847255, + "grad_norm": 0.06876539438962936, + "learning_rate": 1.711775878442545e-05, + "loss": 0.0169, + "step": 89140 + }, + { + "epoch": 0.6589840631560273, + "grad_norm": 0.09889322519302368, + "learning_rate": 1.7114049145299147e-05, + "loss": 0.0187, + "step": 89150 + }, + { + "epoch": 0.6590579817273292, + "grad_norm": 0.07979824393987656, + "learning_rate": 1.711033950617284e-05, + "loss": 0.0179, + "step": 89160 + }, + { + "epoch": 0.659131900298631, + "grad_norm": 0.06535978615283966, + "learning_rate": 1.7106629867046532e-05, + "loss": 0.0188, + "step": 89170 + }, + { + "epoch": 0.6592058188699329, + "grad_norm": 0.07772579044103622, + "learning_rate": 1.710292022792023e-05, + "loss": 0.0176, + "step": 89180 + }, + { + "epoch": 0.6592797374412347, + "grad_norm": 0.09148101508617401, + "learning_rate": 1.7099210588793924e-05, + "loss": 0.0184, + "step": 89190 + }, + { + "epoch": 0.6593536560125366, + "grad_norm": 0.07457172125577927, + "learning_rate": 1.7095500949667617e-05, + "loss": 0.0173, + "step": 89200 + }, + { + "epoch": 0.6594275745838385, + "grad_norm": 0.07437124103307724, + 
"learning_rate": 1.709179131054131e-05, + "loss": 0.0177, + "step": 89210 + }, + { + "epoch": 0.6595014931551403, + "grad_norm": 0.05599772930145264, + "learning_rate": 1.7088081671415006e-05, + "loss": 0.0168, + "step": 89220 + }, + { + "epoch": 0.6595754117264422, + "grad_norm": 0.07221709191799164, + "learning_rate": 1.70843720322887e-05, + "loss": 0.0156, + "step": 89230 + }, + { + "epoch": 0.659649330297744, + "grad_norm": 0.08130336552858353, + "learning_rate": 1.7080662393162394e-05, + "loss": 0.0163, + "step": 89240 + }, + { + "epoch": 0.6597232488690459, + "grad_norm": 0.0671689435839653, + "learning_rate": 1.7076952754036087e-05, + "loss": 0.0154, + "step": 89250 + }, + { + "epoch": 0.6597971674403477, + "grad_norm": 0.09063195437192917, + "learning_rate": 1.7073243114909783e-05, + "loss": 0.0168, + "step": 89260 + }, + { + "epoch": 0.6598710860116496, + "grad_norm": 0.07398603856563568, + "learning_rate": 1.706953347578348e-05, + "loss": 0.016, + "step": 89270 + }, + { + "epoch": 0.6599450045829515, + "grad_norm": 0.07265040278434753, + "learning_rate": 1.706582383665717e-05, + "loss": 0.0187, + "step": 89280 + }, + { + "epoch": 0.6600189231542533, + "grad_norm": 0.12028414756059647, + "learning_rate": 1.7062114197530864e-05, + "loss": 0.0163, + "step": 89290 + }, + { + "epoch": 0.6600928417255552, + "grad_norm": 0.11012061685323715, + "learning_rate": 1.705840455840456e-05, + "loss": 0.018, + "step": 89300 + }, + { + "epoch": 0.660166760296857, + "grad_norm": 0.08662872016429901, + "learning_rate": 1.7054694919278252e-05, + "loss": 0.0194, + "step": 89310 + }, + { + "epoch": 0.6602406788681588, + "grad_norm": 0.0670333206653595, + "learning_rate": 1.7050985280151948e-05, + "loss": 0.0163, + "step": 89320 + }, + { + "epoch": 0.6603145974394606, + "grad_norm": 0.07254930585622787, + "learning_rate": 1.704727564102564e-05, + "loss": 0.0162, + "step": 89330 + }, + { + "epoch": 0.6603885160107625, + "grad_norm": 0.07330934703350067, + "learning_rate": 1.7043566001899337e-05, + "loss": 0.0189, + "step": 89340 + }, + { + "epoch": 0.6604624345820644, + "grad_norm": 0.06475830078125, + "learning_rate": 1.703985636277303e-05, + "loss": 0.0157, + "step": 89350 + }, + { + "epoch": 0.6605363531533662, + "grad_norm": 0.07823119312524796, + "learning_rate": 1.7036146723646722e-05, + "loss": 0.0161, + "step": 89360 + }, + { + "epoch": 0.6606102717246681, + "grad_norm": 0.06793460249900818, + "learning_rate": 1.7032437084520418e-05, + "loss": 0.0162, + "step": 89370 + }, + { + "epoch": 0.6606841902959699, + "grad_norm": 0.06623463332653046, + "learning_rate": 1.7028727445394114e-05, + "loss": 0.0159, + "step": 89380 + }, + { + "epoch": 0.6607581088672718, + "grad_norm": 0.08912117034196854, + "learning_rate": 1.7025017806267806e-05, + "loss": 0.0181, + "step": 89390 + }, + { + "epoch": 0.6608320274385737, + "grad_norm": 0.07295441627502441, + "learning_rate": 1.70213081671415e-05, + "loss": 0.0204, + "step": 89400 + }, + { + "epoch": 0.6609059460098755, + "grad_norm": 0.11057400703430176, + "learning_rate": 1.70175985280152e-05, + "loss": 0.0185, + "step": 89410 + }, + { + "epoch": 0.6609798645811774, + "grad_norm": 0.09350541979074478, + "learning_rate": 1.701388888888889e-05, + "loss": 0.0183, + "step": 89420 + }, + { + "epoch": 0.6610537831524792, + "grad_norm": 0.060925353318452835, + "learning_rate": 1.7010179249762584e-05, + "loss": 0.0172, + "step": 89430 + }, + { + "epoch": 0.6611277017237811, + "grad_norm": 0.09891802072525024, + "learning_rate": 1.7006469610636276e-05, + "loss": 
0.016, + "step": 89440 + }, + { + "epoch": 0.6612016202950829, + "grad_norm": 0.06763199716806412, + "learning_rate": 1.7002759971509972e-05, + "loss": 0.0161, + "step": 89450 + }, + { + "epoch": 0.6612755388663848, + "grad_norm": 0.09075762331485748, + "learning_rate": 1.6999050332383668e-05, + "loss": 0.0184, + "step": 89460 + }, + { + "epoch": 0.6613494574376867, + "grad_norm": 0.0649956688284874, + "learning_rate": 1.699534069325736e-05, + "loss": 0.0185, + "step": 89470 + }, + { + "epoch": 0.6614233760089885, + "grad_norm": 0.06586815416812897, + "learning_rate": 1.6991631054131053e-05, + "loss": 0.018, + "step": 89480 + }, + { + "epoch": 0.6614972945802904, + "grad_norm": 0.06681115925312042, + "learning_rate": 1.698792141500475e-05, + "loss": 0.0207, + "step": 89490 + }, + { + "epoch": 0.6615712131515922, + "grad_norm": 0.06798037886619568, + "learning_rate": 1.6984211775878445e-05, + "loss": 0.0166, + "step": 89500 + }, + { + "epoch": 0.6616451317228941, + "grad_norm": 0.08689411729574203, + "learning_rate": 1.6980502136752138e-05, + "loss": 0.0172, + "step": 89510 + }, + { + "epoch": 0.6617190502941959, + "grad_norm": 0.08138342946767807, + "learning_rate": 1.697679249762583e-05, + "loss": 0.0192, + "step": 89520 + }, + { + "epoch": 0.6617929688654978, + "grad_norm": 0.07708865404129028, + "learning_rate": 1.6973082858499526e-05, + "loss": 0.0166, + "step": 89530 + }, + { + "epoch": 0.6618668874367997, + "grad_norm": 0.07389518618583679, + "learning_rate": 1.696937321937322e-05, + "loss": 0.0167, + "step": 89540 + }, + { + "epoch": 0.6619408060081015, + "grad_norm": 0.08059065043926239, + "learning_rate": 1.6965663580246915e-05, + "loss": 0.0161, + "step": 89550 + }, + { + "epoch": 0.6620147245794034, + "grad_norm": 0.07389674335718155, + "learning_rate": 1.6961953941120607e-05, + "loss": 0.0156, + "step": 89560 + }, + { + "epoch": 0.6620886431507051, + "grad_norm": 0.09503727406263351, + "learning_rate": 1.6958244301994303e-05, + "loss": 0.0178, + "step": 89570 + }, + { + "epoch": 0.662162561722007, + "grad_norm": 0.08040928840637207, + "learning_rate": 1.6954534662867996e-05, + "loss": 0.0178, + "step": 89580 + }, + { + "epoch": 0.6622364802933088, + "grad_norm": 0.06714902818202972, + "learning_rate": 1.695082502374169e-05, + "loss": 0.0191, + "step": 89590 + }, + { + "epoch": 0.6623103988646107, + "grad_norm": 0.1026182547211647, + "learning_rate": 1.6947115384615388e-05, + "loss": 0.0182, + "step": 89600 + }, + { + "epoch": 0.6623843174359126, + "grad_norm": 0.06692285090684891, + "learning_rate": 1.694340574548908e-05, + "loss": 0.017, + "step": 89610 + }, + { + "epoch": 0.6624582360072144, + "grad_norm": 0.07225298136472702, + "learning_rate": 1.6939696106362773e-05, + "loss": 0.0176, + "step": 89620 + }, + { + "epoch": 0.6625321545785163, + "grad_norm": 0.08241157233715057, + "learning_rate": 1.6935986467236466e-05, + "loss": 0.0189, + "step": 89630 + }, + { + "epoch": 0.6626060731498181, + "grad_norm": 0.08315838873386383, + "learning_rate": 1.6932276828110165e-05, + "loss": 0.0169, + "step": 89640 + }, + { + "epoch": 0.66267999172112, + "grad_norm": 0.06351329386234283, + "learning_rate": 1.6928567188983858e-05, + "loss": 0.0166, + "step": 89650 + }, + { + "epoch": 0.6627539102924219, + "grad_norm": 0.10287221521139145, + "learning_rate": 1.692485754985755e-05, + "loss": 0.0217, + "step": 89660 + }, + { + "epoch": 0.6628278288637237, + "grad_norm": 0.07420855015516281, + "learning_rate": 1.6921147910731243e-05, + "loss": 0.0168, + "step": 89670 + }, + { + "epoch": 
0.6629017474350256, + "grad_norm": 0.08003699034452438, + "learning_rate": 1.691743827160494e-05, + "loss": 0.0211, + "step": 89680 + }, + { + "epoch": 0.6629756660063274, + "grad_norm": 0.07801926136016846, + "learning_rate": 1.6913728632478635e-05, + "loss": 0.0172, + "step": 89690 + }, + { + "epoch": 0.6630495845776293, + "grad_norm": 0.08778087049722672, + "learning_rate": 1.6910018993352327e-05, + "loss": 0.0198, + "step": 89700 + }, + { + "epoch": 0.6631235031489311, + "grad_norm": 0.06600422412157059, + "learning_rate": 1.690630935422602e-05, + "loss": 0.0176, + "step": 89710 + }, + { + "epoch": 0.663197421720233, + "grad_norm": 0.06339359283447266, + "learning_rate": 1.6902599715099716e-05, + "loss": 0.0189, + "step": 89720 + }, + { + "epoch": 0.6632713402915349, + "grad_norm": 0.05996141955256462, + "learning_rate": 1.6898890075973412e-05, + "loss": 0.0158, + "step": 89730 + }, + { + "epoch": 0.6633452588628367, + "grad_norm": 0.06987646967172623, + "learning_rate": 1.6895180436847104e-05, + "loss": 0.0165, + "step": 89740 + }, + { + "epoch": 0.6634191774341386, + "grad_norm": 0.06630247831344604, + "learning_rate": 1.68914707977208e-05, + "loss": 0.0195, + "step": 89750 + }, + { + "epoch": 0.6634930960054404, + "grad_norm": 0.09023629128932953, + "learning_rate": 1.6887761158594493e-05, + "loss": 0.0172, + "step": 89760 + }, + { + "epoch": 0.6635670145767423, + "grad_norm": 0.09470485895872116, + "learning_rate": 1.6884051519468185e-05, + "loss": 0.0195, + "step": 89770 + }, + { + "epoch": 0.6636409331480441, + "grad_norm": 0.09433825314044952, + "learning_rate": 1.688034188034188e-05, + "loss": 0.0189, + "step": 89780 + }, + { + "epoch": 0.663714851719346, + "grad_norm": 0.09324505925178528, + "learning_rate": 1.6876632241215577e-05, + "loss": 0.0194, + "step": 89790 + }, + { + "epoch": 0.6637887702906479, + "grad_norm": 0.11419341713190079, + "learning_rate": 1.687292260208927e-05, + "loss": 0.0184, + "step": 89800 + }, + { + "epoch": 0.6638626888619497, + "grad_norm": 0.09134076535701752, + "learning_rate": 1.6869212962962963e-05, + "loss": 0.0179, + "step": 89810 + }, + { + "epoch": 0.6639366074332516, + "grad_norm": 0.060227178037166595, + "learning_rate": 1.6865503323836655e-05, + "loss": 0.0164, + "step": 89820 + }, + { + "epoch": 0.6640105260045533, + "grad_norm": 0.09734909236431122, + "learning_rate": 1.6861793684710354e-05, + "loss": 0.0167, + "step": 89830 + }, + { + "epoch": 0.6640844445758552, + "grad_norm": 0.08146527409553528, + "learning_rate": 1.6858084045584047e-05, + "loss": 0.018, + "step": 89840 + }, + { + "epoch": 0.664158363147157, + "grad_norm": 0.0977560356259346, + "learning_rate": 1.685437440645774e-05, + "loss": 0.0179, + "step": 89850 + }, + { + "epoch": 0.6642322817184589, + "grad_norm": 0.08954362571239471, + "learning_rate": 1.6850664767331432e-05, + "loss": 0.0166, + "step": 89860 + }, + { + "epoch": 0.6643062002897608, + "grad_norm": 0.06876461952924728, + "learning_rate": 1.684695512820513e-05, + "loss": 0.0161, + "step": 89870 + }, + { + "epoch": 0.6643801188610626, + "grad_norm": 0.07776942104101181, + "learning_rate": 1.6843245489078824e-05, + "loss": 0.0167, + "step": 89880 + }, + { + "epoch": 0.6644540374323645, + "grad_norm": 0.08065933734178543, + "learning_rate": 1.6839535849952517e-05, + "loss": 0.0182, + "step": 89890 + }, + { + "epoch": 0.6645279560036663, + "grad_norm": 0.08351798355579376, + "learning_rate": 1.6835826210826213e-05, + "loss": 0.0187, + "step": 89900 + }, + { + "epoch": 0.6646018745749682, + "grad_norm": 
0.08597084134817123, + "learning_rate": 1.6832116571699905e-05, + "loss": 0.018, + "step": 89910 + }, + { + "epoch": 0.6646757931462701, + "grad_norm": 0.07811547815799713, + "learning_rate": 1.68284069325736e-05, + "loss": 0.0176, + "step": 89920 + }, + { + "epoch": 0.6647497117175719, + "grad_norm": 0.06780789792537689, + "learning_rate": 1.6824697293447294e-05, + "loss": 0.0158, + "step": 89930 + }, + { + "epoch": 0.6648236302888738, + "grad_norm": 0.06562267243862152, + "learning_rate": 1.682098765432099e-05, + "loss": 0.0176, + "step": 89940 + }, + { + "epoch": 0.6648975488601756, + "grad_norm": 0.0878724604845047, + "learning_rate": 1.6817278015194682e-05, + "loss": 0.0154, + "step": 89950 + }, + { + "epoch": 0.6649714674314775, + "grad_norm": 0.07386815547943115, + "learning_rate": 1.681356837606838e-05, + "loss": 0.0149, + "step": 89960 + }, + { + "epoch": 0.6650453860027793, + "grad_norm": 0.07271129637956619, + "learning_rate": 1.680985873694207e-05, + "loss": 0.0174, + "step": 89970 + }, + { + "epoch": 0.6651193045740812, + "grad_norm": 0.07908832281827927, + "learning_rate": 1.6806149097815767e-05, + "loss": 0.0176, + "step": 89980 + }, + { + "epoch": 0.6651932231453831, + "grad_norm": 0.08835571259260178, + "learning_rate": 1.680243945868946e-05, + "loss": 0.0179, + "step": 89990 + }, + { + "epoch": 0.6652671417166849, + "grad_norm": 0.08931245654821396, + "learning_rate": 1.6798729819563152e-05, + "loss": 0.0159, + "step": 90000 + }, + { + "epoch": 0.6652671417166849, + "eval_f1": 0.6309572429107293, + "eval_loss": 0.017208395525813103, + "eval_precision": 0.5059339765875664, + "eval_recall": 0.8380512764814848, + "eval_runtime": 2920.6198, + "eval_samples_per_second": 185.281, + "eval_steps_per_second": 2.895, + "step": 90000 + }, + { + "epoch": 0.6653410602879868, + "grad_norm": 0.05561404302716255, + "learning_rate": 1.6795020180436848e-05, + "loss": 0.0182, + "step": 90010 + }, + { + "epoch": 0.6654149788592886, + "grad_norm": 0.09877961128950119, + "learning_rate": 1.6791310541310544e-05, + "loss": 0.0183, + "step": 90020 + }, + { + "epoch": 0.6654888974305905, + "grad_norm": 0.06422388553619385, + "learning_rate": 1.6787600902184237e-05, + "loss": 0.018, + "step": 90030 + }, + { + "epoch": 0.6655628160018923, + "grad_norm": 0.05520254746079445, + "learning_rate": 1.678389126305793e-05, + "loss": 0.0171, + "step": 90040 + }, + { + "epoch": 0.6656367345731942, + "grad_norm": 0.05561777949333191, + "learning_rate": 1.6780181623931625e-05, + "loss": 0.0167, + "step": 90050 + }, + { + "epoch": 0.6657106531444961, + "grad_norm": 0.07283684611320496, + "learning_rate": 1.677647198480532e-05, + "loss": 0.0146, + "step": 90060 + }, + { + "epoch": 0.6657845717157979, + "grad_norm": 0.07278977334499359, + "learning_rate": 1.6772762345679014e-05, + "loss": 0.0159, + "step": 90070 + }, + { + "epoch": 0.6658584902870998, + "grad_norm": 0.08145174384117126, + "learning_rate": 1.6769052706552706e-05, + "loss": 0.0173, + "step": 90080 + }, + { + "epoch": 0.6659324088584015, + "grad_norm": 0.05579023063182831, + "learning_rate": 1.6765343067426402e-05, + "loss": 0.0176, + "step": 90090 + }, + { + "epoch": 0.6660063274297034, + "grad_norm": 0.07451515644788742, + "learning_rate": 1.6761633428300098e-05, + "loss": 0.0165, + "step": 90100 + }, + { + "epoch": 0.6660802460010052, + "grad_norm": 0.08948320895433426, + "learning_rate": 1.675792378917379e-05, + "loss": 0.0189, + "step": 90110 + }, + { + "epoch": 0.6661541645723071, + "grad_norm": 0.07860399782657623, + "learning_rate": 
1.6754214150047483e-05, + "loss": 0.0193, + "step": 90120 + }, + { + "epoch": 0.666228083143609, + "grad_norm": 0.0718916580080986, + "learning_rate": 1.675050451092118e-05, + "loss": 0.0175, + "step": 90130 + }, + { + "epoch": 0.6663020017149108, + "grad_norm": 0.10223414748907089, + "learning_rate": 1.6746794871794872e-05, + "loss": 0.0196, + "step": 90140 + }, + { + "epoch": 0.6663759202862127, + "grad_norm": 0.07415630668401718, + "learning_rate": 1.6743085232668568e-05, + "loss": 0.0173, + "step": 90150 + }, + { + "epoch": 0.6664498388575145, + "grad_norm": 0.10686575621366501, + "learning_rate": 1.673937559354226e-05, + "loss": 0.0184, + "step": 90160 + }, + { + "epoch": 0.6665237574288164, + "grad_norm": 0.062122609466314316, + "learning_rate": 1.6735665954415956e-05, + "loss": 0.0208, + "step": 90170 + }, + { + "epoch": 0.6665976760001183, + "grad_norm": 0.1056041568517685, + "learning_rate": 1.673195631528965e-05, + "loss": 0.0154, + "step": 90180 + }, + { + "epoch": 0.6666715945714201, + "grad_norm": 0.06856260448694229, + "learning_rate": 1.6728246676163345e-05, + "loss": 0.0182, + "step": 90190 + }, + { + "epoch": 0.666745513142722, + "grad_norm": 0.09874051809310913, + "learning_rate": 1.6724537037037037e-05, + "loss": 0.0192, + "step": 90200 + }, + { + "epoch": 0.6668194317140238, + "grad_norm": 0.09938324987888336, + "learning_rate": 1.6720827397910733e-05, + "loss": 0.0186, + "step": 90210 + }, + { + "epoch": 0.6668933502853257, + "grad_norm": 0.0918031856417656, + "learning_rate": 1.6717117758784426e-05, + "loss": 0.0164, + "step": 90220 + }, + { + "epoch": 0.6669672688566275, + "grad_norm": 0.06843070685863495, + "learning_rate": 1.671340811965812e-05, + "loss": 0.0187, + "step": 90230 + }, + { + "epoch": 0.6670411874279294, + "grad_norm": 0.07426135241985321, + "learning_rate": 1.6709698480531815e-05, + "loss": 0.019, + "step": 90240 + }, + { + "epoch": 0.6671151059992313, + "grad_norm": 0.07184404879808426, + "learning_rate": 1.670598884140551e-05, + "loss": 0.0192, + "step": 90250 + }, + { + "epoch": 0.6671890245705331, + "grad_norm": 0.108861044049263, + "learning_rate": 1.6702279202279203e-05, + "loss": 0.0181, + "step": 90260 + }, + { + "epoch": 0.667262943141835, + "grad_norm": 0.0755942165851593, + "learning_rate": 1.6698569563152896e-05, + "loss": 0.0168, + "step": 90270 + }, + { + "epoch": 0.6673368617131368, + "grad_norm": 0.06772216409444809, + "learning_rate": 1.669485992402659e-05, + "loss": 0.0179, + "step": 90280 + }, + { + "epoch": 0.6674107802844387, + "grad_norm": 0.07250899821519852, + "learning_rate": 1.6691150284900288e-05, + "loss": 0.0152, + "step": 90290 + }, + { + "epoch": 0.6674846988557405, + "grad_norm": 0.07351907342672348, + "learning_rate": 1.668744064577398e-05, + "loss": 0.0174, + "step": 90300 + }, + { + "epoch": 0.6675586174270424, + "grad_norm": 0.07086682319641113, + "learning_rate": 1.6683731006647673e-05, + "loss": 0.0184, + "step": 90310 + }, + { + "epoch": 0.6676325359983443, + "grad_norm": 0.0855761393904686, + "learning_rate": 1.668002136752137e-05, + "loss": 0.0165, + "step": 90320 + }, + { + "epoch": 0.6677064545696461, + "grad_norm": 0.0903254970908165, + "learning_rate": 1.6676311728395065e-05, + "loss": 0.0187, + "step": 90330 + }, + { + "epoch": 0.667780373140948, + "grad_norm": 0.10437069088220596, + "learning_rate": 1.6672602089268757e-05, + "loss": 0.0174, + "step": 90340 + }, + { + "epoch": 0.6678542917122497, + "grad_norm": 0.07991361618041992, + "learning_rate": 1.666889245014245e-05, + "loss": 0.0177, + "step": 
90350 + }, + { + "epoch": 0.6679282102835516, + "grad_norm": 0.06454506516456604, + "learning_rate": 1.6665182811016146e-05, + "loss": 0.0136, + "step": 90360 + }, + { + "epoch": 0.6680021288548534, + "grad_norm": 0.09109412133693695, + "learning_rate": 1.666147317188984e-05, + "loss": 0.0169, + "step": 90370 + }, + { + "epoch": 0.6680760474261553, + "grad_norm": 0.06350691616535187, + "learning_rate": 1.6657763532763534e-05, + "loss": 0.015, + "step": 90380 + }, + { + "epoch": 0.6681499659974572, + "grad_norm": 0.10788552463054657, + "learning_rate": 1.6654053893637227e-05, + "loss": 0.0197, + "step": 90390 + }, + { + "epoch": 0.668223884568759, + "grad_norm": 0.054137811064720154, + "learning_rate": 1.6650344254510923e-05, + "loss": 0.0161, + "step": 90400 + }, + { + "epoch": 0.6682978031400609, + "grad_norm": 0.06528239697217941, + "learning_rate": 1.6646634615384616e-05, + "loss": 0.0167, + "step": 90410 + }, + { + "epoch": 0.6683717217113627, + "grad_norm": 0.06540937721729279, + "learning_rate": 1.664292497625831e-05, + "loss": 0.0164, + "step": 90420 + }, + { + "epoch": 0.6684456402826646, + "grad_norm": 0.07134325057268143, + "learning_rate": 1.6639215337132004e-05, + "loss": 0.0174, + "step": 90430 + }, + { + "epoch": 0.6685195588539665, + "grad_norm": 0.08838541060686111, + "learning_rate": 1.66355056980057e-05, + "loss": 0.0186, + "step": 90440 + }, + { + "epoch": 0.6685934774252683, + "grad_norm": 0.08562029153108597, + "learning_rate": 1.6631796058879393e-05, + "loss": 0.0225, + "step": 90450 + }, + { + "epoch": 0.6686673959965702, + "grad_norm": 0.10071739554405212, + "learning_rate": 1.6628086419753085e-05, + "loss": 0.0167, + "step": 90460 + }, + { + "epoch": 0.668741314567872, + "grad_norm": 0.08474022895097733, + "learning_rate": 1.662437678062678e-05, + "loss": 0.0204, + "step": 90470 + }, + { + "epoch": 0.6688152331391739, + "grad_norm": 0.08979560434818268, + "learning_rate": 1.6620667141500477e-05, + "loss": 0.0154, + "step": 90480 + }, + { + "epoch": 0.6688891517104757, + "grad_norm": 0.09872204810380936, + "learning_rate": 1.661695750237417e-05, + "loss": 0.0184, + "step": 90490 + }, + { + "epoch": 0.6689630702817776, + "grad_norm": 0.07118275761604309, + "learning_rate": 1.6613247863247862e-05, + "loss": 0.0177, + "step": 90500 + }, + { + "epoch": 0.6690369888530795, + "grad_norm": 0.08609623461961746, + "learning_rate": 1.6609538224121558e-05, + "loss": 0.0182, + "step": 90510 + }, + { + "epoch": 0.6691109074243813, + "grad_norm": 0.08788052946329117, + "learning_rate": 1.6605828584995254e-05, + "loss": 0.016, + "step": 90520 + }, + { + "epoch": 0.6691848259956832, + "grad_norm": 0.09005333483219147, + "learning_rate": 1.6602118945868947e-05, + "loss": 0.0175, + "step": 90530 + }, + { + "epoch": 0.669258744566985, + "grad_norm": 0.0722627267241478, + "learning_rate": 1.659840930674264e-05, + "loss": 0.0189, + "step": 90540 + }, + { + "epoch": 0.6693326631382869, + "grad_norm": 0.07813578844070435, + "learning_rate": 1.6594699667616335e-05, + "loss": 0.0187, + "step": 90550 + }, + { + "epoch": 0.6694065817095887, + "grad_norm": 0.056393325328826904, + "learning_rate": 1.659099002849003e-05, + "loss": 0.017, + "step": 90560 + }, + { + "epoch": 0.6694805002808906, + "grad_norm": 0.06716451793909073, + "learning_rate": 1.6587280389363724e-05, + "loss": 0.0176, + "step": 90570 + }, + { + "epoch": 0.6695544188521925, + "grad_norm": 0.09170413762331009, + "learning_rate": 1.6583570750237416e-05, + "loss": 0.0167, + "step": 90580 + }, + { + "epoch": 0.6696283374234943, + 
"grad_norm": 0.09170734137296677, + "learning_rate": 1.6579861111111112e-05, + "loss": 0.0188, + "step": 90590 + }, + { + "epoch": 0.6697022559947962, + "grad_norm": 0.09092351794242859, + "learning_rate": 1.6576151471984805e-05, + "loss": 0.0199, + "step": 90600 + }, + { + "epoch": 0.669776174566098, + "grad_norm": 0.09902980178594589, + "learning_rate": 1.65724418328585e-05, + "loss": 0.018, + "step": 90610 + }, + { + "epoch": 0.6698500931373998, + "grad_norm": 0.09110864251852036, + "learning_rate": 1.6568732193732194e-05, + "loss": 0.0185, + "step": 90620 + }, + { + "epoch": 0.6699240117087016, + "grad_norm": 0.06778454780578613, + "learning_rate": 1.656502255460589e-05, + "loss": 0.0186, + "step": 90630 + }, + { + "epoch": 0.6699979302800035, + "grad_norm": 0.08745856583118439, + "learning_rate": 1.6561312915479582e-05, + "loss": 0.0171, + "step": 90640 + }, + { + "epoch": 0.6700718488513054, + "grad_norm": 0.06933335214853287, + "learning_rate": 1.6557603276353278e-05, + "loss": 0.0164, + "step": 90650 + }, + { + "epoch": 0.6701457674226072, + "grad_norm": 0.05806926637887955, + "learning_rate": 1.655389363722697e-05, + "loss": 0.0163, + "step": 90660 + }, + { + "epoch": 0.6702196859939091, + "grad_norm": 0.08060045540332794, + "learning_rate": 1.6550183998100667e-05, + "loss": 0.0158, + "step": 90670 + }, + { + "epoch": 0.6702936045652109, + "grad_norm": 0.0801333636045456, + "learning_rate": 1.654647435897436e-05, + "loss": 0.0202, + "step": 90680 + }, + { + "epoch": 0.6703675231365128, + "grad_norm": 0.10312236845493317, + "learning_rate": 1.6542764719848052e-05, + "loss": 0.0189, + "step": 90690 + }, + { + "epoch": 0.6704414417078147, + "grad_norm": 0.08690429478883743, + "learning_rate": 1.6539055080721748e-05, + "loss": 0.0176, + "step": 90700 + }, + { + "epoch": 0.6705153602791165, + "grad_norm": 0.15349721908569336, + "learning_rate": 1.6535345441595444e-05, + "loss": 0.0178, + "step": 90710 + }, + { + "epoch": 0.6705892788504184, + "grad_norm": 0.09240947663784027, + "learning_rate": 1.6531635802469136e-05, + "loss": 0.0172, + "step": 90720 + }, + { + "epoch": 0.6706631974217202, + "grad_norm": 0.08479215949773788, + "learning_rate": 1.652792616334283e-05, + "loss": 0.0167, + "step": 90730 + }, + { + "epoch": 0.6707371159930221, + "grad_norm": 0.08831705152988434, + "learning_rate": 1.6524216524216525e-05, + "loss": 0.0176, + "step": 90740 + }, + { + "epoch": 0.6708110345643239, + "grad_norm": 0.11717703193426132, + "learning_rate": 1.652050688509022e-05, + "loss": 0.0183, + "step": 90750 + }, + { + "epoch": 0.6708849531356258, + "grad_norm": 0.07897736132144928, + "learning_rate": 1.6516797245963913e-05, + "loss": 0.0168, + "step": 90760 + }, + { + "epoch": 0.6709588717069277, + "grad_norm": 0.07475557178258896, + "learning_rate": 1.6513087606837606e-05, + "loss": 0.0159, + "step": 90770 + }, + { + "epoch": 0.6710327902782295, + "grad_norm": 0.07179439067840576, + "learning_rate": 1.6509377967711302e-05, + "loss": 0.0165, + "step": 90780 + }, + { + "epoch": 0.6711067088495314, + "grad_norm": 0.08902493119239807, + "learning_rate": 1.6505668328584998e-05, + "loss": 0.0178, + "step": 90790 + }, + { + "epoch": 0.6711806274208332, + "grad_norm": 0.08185980468988419, + "learning_rate": 1.650195868945869e-05, + "loss": 0.0164, + "step": 90800 + }, + { + "epoch": 0.6712545459921351, + "grad_norm": 0.05548747256398201, + "learning_rate": 1.6498249050332383e-05, + "loss": 0.0185, + "step": 90810 + }, + { + "epoch": 0.6713284645634369, + "grad_norm": 0.06674123555421829, + 
"learning_rate": 1.649453941120608e-05, + "loss": 0.0197, + "step": 90820 + }, + { + "epoch": 0.6714023831347388, + "grad_norm": 0.07334823161363602, + "learning_rate": 1.649082977207977e-05, + "loss": 0.0177, + "step": 90830 + }, + { + "epoch": 0.6714763017060407, + "grad_norm": 0.08820454031229019, + "learning_rate": 1.6487120132953468e-05, + "loss": 0.0173, + "step": 90840 + }, + { + "epoch": 0.6715502202773425, + "grad_norm": 0.07256550341844559, + "learning_rate": 1.648341049382716e-05, + "loss": 0.0159, + "step": 90850 + }, + { + "epoch": 0.6716241388486444, + "grad_norm": 0.07709689438343048, + "learning_rate": 1.6479700854700856e-05, + "loss": 0.0189, + "step": 90860 + }, + { + "epoch": 0.6716980574199461, + "grad_norm": 0.06899577379226685, + "learning_rate": 1.647599121557455e-05, + "loss": 0.0208, + "step": 90870 + }, + { + "epoch": 0.671771975991248, + "grad_norm": 0.08083607256412506, + "learning_rate": 1.6472281576448245e-05, + "loss": 0.0167, + "step": 90880 + }, + { + "epoch": 0.67184589456255, + "grad_norm": 0.08641605824232101, + "learning_rate": 1.6468571937321937e-05, + "loss": 0.0148, + "step": 90890 + }, + { + "epoch": 0.6719198131338517, + "grad_norm": 0.07134467363357544, + "learning_rate": 1.6464862298195633e-05, + "loss": 0.0169, + "step": 90900 + }, + { + "epoch": 0.6719937317051536, + "grad_norm": 0.09091978520154953, + "learning_rate": 1.6461152659069326e-05, + "loss": 0.0185, + "step": 90910 + }, + { + "epoch": 0.6720676502764554, + "grad_norm": 0.07118461281061172, + "learning_rate": 1.645744301994302e-05, + "loss": 0.016, + "step": 90920 + }, + { + "epoch": 0.6721415688477573, + "grad_norm": 0.08685034513473511, + "learning_rate": 1.6453733380816714e-05, + "loss": 0.0192, + "step": 90930 + }, + { + "epoch": 0.6722154874190591, + "grad_norm": 0.0918172299861908, + "learning_rate": 1.645002374169041e-05, + "loss": 0.0165, + "step": 90940 + }, + { + "epoch": 0.672289405990361, + "grad_norm": 0.09770748764276505, + "learning_rate": 1.6446314102564103e-05, + "loss": 0.0172, + "step": 90950 + }, + { + "epoch": 0.6723633245616629, + "grad_norm": 0.11164256185293198, + "learning_rate": 1.6442604463437795e-05, + "loss": 0.0168, + "step": 90960 + }, + { + "epoch": 0.6724372431329647, + "grad_norm": 0.06833399832248688, + "learning_rate": 1.643889482431149e-05, + "loss": 0.0161, + "step": 90970 + }, + { + "epoch": 0.6725111617042666, + "grad_norm": 0.0737537294626236, + "learning_rate": 1.6435185185185187e-05, + "loss": 0.0168, + "step": 90980 + }, + { + "epoch": 0.6725850802755684, + "grad_norm": 0.08121098577976227, + "learning_rate": 1.643147554605888e-05, + "loss": 0.0177, + "step": 90990 + }, + { + "epoch": 0.6726589988468703, + "grad_norm": 0.06961645931005478, + "learning_rate": 1.6427765906932573e-05, + "loss": 0.016, + "step": 91000 + }, + { + "epoch": 0.6727329174181721, + "grad_norm": 0.08413616567850113, + "learning_rate": 1.642405626780627e-05, + "loss": 0.0165, + "step": 91010 + }, + { + "epoch": 0.672806835989474, + "grad_norm": 0.09649928659200668, + "learning_rate": 1.6420346628679964e-05, + "loss": 0.0163, + "step": 91020 + }, + { + "epoch": 0.6728807545607759, + "grad_norm": 0.10375601798295975, + "learning_rate": 1.6416636989553657e-05, + "loss": 0.0187, + "step": 91030 + }, + { + "epoch": 0.6729546731320777, + "grad_norm": 0.07088519632816315, + "learning_rate": 1.641292735042735e-05, + "loss": 0.0178, + "step": 91040 + }, + { + "epoch": 0.6730285917033796, + "grad_norm": 0.06176723167300224, + "learning_rate": 1.6409217711301046e-05, + "loss": 
0.015, + "step": 91050 + }, + { + "epoch": 0.6731025102746814, + "grad_norm": 0.09454366564750671, + "learning_rate": 1.6405508072174738e-05, + "loss": 0.0176, + "step": 91060 + }, + { + "epoch": 0.6731764288459833, + "grad_norm": 0.0832739919424057, + "learning_rate": 1.6401798433048434e-05, + "loss": 0.0179, + "step": 91070 + }, + { + "epoch": 0.6732503474172851, + "grad_norm": 0.09681088477373123, + "learning_rate": 1.6398088793922127e-05, + "loss": 0.0189, + "step": 91080 + }, + { + "epoch": 0.673324265988587, + "grad_norm": 0.08091330528259277, + "learning_rate": 1.6394379154795823e-05, + "loss": 0.016, + "step": 91090 + }, + { + "epoch": 0.6733981845598889, + "grad_norm": 0.08992233872413635, + "learning_rate": 1.6390669515669515e-05, + "loss": 0.0201, + "step": 91100 + }, + { + "epoch": 0.6734721031311907, + "grad_norm": 0.07696627080440521, + "learning_rate": 1.638695987654321e-05, + "loss": 0.0157, + "step": 91110 + }, + { + "epoch": 0.6735460217024926, + "grad_norm": 0.06534789502620697, + "learning_rate": 1.6383250237416907e-05, + "loss": 0.0186, + "step": 91120 + }, + { + "epoch": 0.6736199402737943, + "grad_norm": 0.07140354067087173, + "learning_rate": 1.63795405982906e-05, + "loss": 0.0189, + "step": 91130 + }, + { + "epoch": 0.6736938588450962, + "grad_norm": 0.07498431950807571, + "learning_rate": 1.6375830959164292e-05, + "loss": 0.0175, + "step": 91140 + }, + { + "epoch": 0.6737677774163982, + "grad_norm": 0.08109982311725616, + "learning_rate": 1.6372121320037985e-05, + "loss": 0.0174, + "step": 91150 + }, + { + "epoch": 0.6738416959876999, + "grad_norm": 0.06725709140300751, + "learning_rate": 1.6368411680911684e-05, + "loss": 0.0175, + "step": 91160 + }, + { + "epoch": 0.6739156145590018, + "grad_norm": 0.07145275175571442, + "learning_rate": 1.6364702041785377e-05, + "loss": 0.0175, + "step": 91170 + }, + { + "epoch": 0.6739895331303036, + "grad_norm": 0.07161834836006165, + "learning_rate": 1.636099240265907e-05, + "loss": 0.0154, + "step": 91180 + }, + { + "epoch": 0.6740634517016055, + "grad_norm": 0.07694154977798462, + "learning_rate": 1.6357282763532762e-05, + "loss": 0.0155, + "step": 91190 + }, + { + "epoch": 0.6741373702729073, + "grad_norm": 0.06621759384870529, + "learning_rate": 1.6353573124406458e-05, + "loss": 0.021, + "step": 91200 + }, + { + "epoch": 0.6742112888442092, + "grad_norm": 0.0970006138086319, + "learning_rate": 1.6349863485280154e-05, + "loss": 0.017, + "step": 91210 + }, + { + "epoch": 0.6742852074155111, + "grad_norm": 0.0957852154970169, + "learning_rate": 1.6346153846153847e-05, + "loss": 0.0187, + "step": 91220 + }, + { + "epoch": 0.6743591259868129, + "grad_norm": 0.08307493478059769, + "learning_rate": 1.634244420702754e-05, + "loss": 0.0161, + "step": 91230 + }, + { + "epoch": 0.6744330445581148, + "grad_norm": 0.09416956454515457, + "learning_rate": 1.6338734567901235e-05, + "loss": 0.0173, + "step": 91240 + }, + { + "epoch": 0.6745069631294166, + "grad_norm": 0.07699141651391983, + "learning_rate": 1.633502492877493e-05, + "loss": 0.0159, + "step": 91250 + }, + { + "epoch": 0.6745808817007185, + "grad_norm": 0.06724029034376144, + "learning_rate": 1.6331315289648624e-05, + "loss": 0.0156, + "step": 91260 + }, + { + "epoch": 0.6746548002720203, + "grad_norm": 0.10115319490432739, + "learning_rate": 1.632760565052232e-05, + "loss": 0.0174, + "step": 91270 + }, + { + "epoch": 0.6747287188433222, + "grad_norm": 0.0618862546980381, + "learning_rate": 1.6323896011396012e-05, + "loss": 0.0172, + "step": 91280 + }, + { + "epoch": 
0.6748026374146241, + "grad_norm": 0.101004958152771, + "learning_rate": 1.6320186372269705e-05, + "loss": 0.0189, + "step": 91290 + }, + { + "epoch": 0.6748765559859259, + "grad_norm": 0.07224094867706299, + "learning_rate": 1.63164767331434e-05, + "loss": 0.0175, + "step": 91300 + }, + { + "epoch": 0.6749504745572278, + "grad_norm": 0.07197592407464981, + "learning_rate": 1.6312767094017097e-05, + "loss": 0.0171, + "step": 91310 + }, + { + "epoch": 0.6750243931285296, + "grad_norm": 0.06368234008550644, + "learning_rate": 1.630905745489079e-05, + "loss": 0.0166, + "step": 91320 + }, + { + "epoch": 0.6750983116998315, + "grad_norm": 0.0943496897816658, + "learning_rate": 1.6305347815764482e-05, + "loss": 0.017, + "step": 91330 + }, + { + "epoch": 0.6751722302711333, + "grad_norm": 0.07670585066080093, + "learning_rate": 1.6301638176638178e-05, + "loss": 0.0196, + "step": 91340 + }, + { + "epoch": 0.6752461488424352, + "grad_norm": 0.09279531240463257, + "learning_rate": 1.6297928537511874e-05, + "loss": 0.0177, + "step": 91350 + }, + { + "epoch": 0.6753200674137371, + "grad_norm": 0.06604079902172089, + "learning_rate": 1.6294218898385566e-05, + "loss": 0.0148, + "step": 91360 + }, + { + "epoch": 0.6753939859850389, + "grad_norm": 0.081661157310009, + "learning_rate": 1.629050925925926e-05, + "loss": 0.0165, + "step": 91370 + }, + { + "epoch": 0.6754679045563408, + "grad_norm": 0.09706810116767883, + "learning_rate": 1.628679962013295e-05, + "loss": 0.0156, + "step": 91380 + }, + { + "epoch": 0.6755418231276425, + "grad_norm": 0.07296976447105408, + "learning_rate": 1.628308998100665e-05, + "loss": 0.0163, + "step": 91390 + }, + { + "epoch": 0.6756157416989444, + "grad_norm": 0.07825100421905518, + "learning_rate": 1.6279380341880343e-05, + "loss": 0.0184, + "step": 91400 + }, + { + "epoch": 0.6756896602702464, + "grad_norm": 0.07666713744401932, + "learning_rate": 1.6275670702754036e-05, + "loss": 0.0177, + "step": 91410 + }, + { + "epoch": 0.6757635788415481, + "grad_norm": 0.11629067361354828, + "learning_rate": 1.6271961063627732e-05, + "loss": 0.0185, + "step": 91420 + }, + { + "epoch": 0.67583749741285, + "grad_norm": 0.07979996502399445, + "learning_rate": 1.6268251424501425e-05, + "loss": 0.0181, + "step": 91430 + }, + { + "epoch": 0.6759114159841518, + "grad_norm": 0.08092159777879715, + "learning_rate": 1.626454178537512e-05, + "loss": 0.0169, + "step": 91440 + }, + { + "epoch": 0.6759853345554537, + "grad_norm": 0.09439874440431595, + "learning_rate": 1.6260832146248813e-05, + "loss": 0.0178, + "step": 91450 + }, + { + "epoch": 0.6760592531267555, + "grad_norm": 0.11879925429821014, + "learning_rate": 1.625712250712251e-05, + "loss": 0.0167, + "step": 91460 + }, + { + "epoch": 0.6761331716980574, + "grad_norm": 0.06729891896247864, + "learning_rate": 1.62534128679962e-05, + "loss": 0.0175, + "step": 91470 + }, + { + "epoch": 0.6762070902693593, + "grad_norm": 0.08897735923528671, + "learning_rate": 1.6249703228869898e-05, + "loss": 0.0183, + "step": 91480 + }, + { + "epoch": 0.6762810088406611, + "grad_norm": 0.08789429813623428, + "learning_rate": 1.624599358974359e-05, + "loss": 0.0173, + "step": 91490 + }, + { + "epoch": 0.676354927411963, + "grad_norm": 0.09152679145336151, + "learning_rate": 1.6242283950617286e-05, + "loss": 0.0167, + "step": 91500 + }, + { + "epoch": 0.6764288459832648, + "grad_norm": 0.07893484830856323, + "learning_rate": 1.623857431149098e-05, + "loss": 0.016, + "step": 91510 + }, + { + "epoch": 0.6765027645545667, + "grad_norm": 0.081581711769104, + 
"learning_rate": 1.623486467236467e-05, + "loss": 0.016, + "step": 91520 + }, + { + "epoch": 0.6765766831258685, + "grad_norm": 0.060920149087905884, + "learning_rate": 1.6231155033238367e-05, + "loss": 0.0164, + "step": 91530 + }, + { + "epoch": 0.6766506016971704, + "grad_norm": 0.0741836279630661, + "learning_rate": 1.6227445394112063e-05, + "loss": 0.0172, + "step": 91540 + }, + { + "epoch": 0.6767245202684723, + "grad_norm": 0.05645517259836197, + "learning_rate": 1.6223735754985756e-05, + "loss": 0.0191, + "step": 91550 + }, + { + "epoch": 0.6767984388397741, + "grad_norm": 0.0640142410993576, + "learning_rate": 1.622002611585945e-05, + "loss": 0.0183, + "step": 91560 + }, + { + "epoch": 0.676872357411076, + "grad_norm": 0.10702574998140335, + "learning_rate": 1.6216316476733144e-05, + "loss": 0.0157, + "step": 91570 + }, + { + "epoch": 0.6769462759823778, + "grad_norm": 0.0870826467871666, + "learning_rate": 1.621260683760684e-05, + "loss": 0.018, + "step": 91580 + }, + { + "epoch": 0.6770201945536797, + "grad_norm": 0.10151351988315582, + "learning_rate": 1.6208897198480533e-05, + "loss": 0.0185, + "step": 91590 + }, + { + "epoch": 0.6770941131249815, + "grad_norm": 0.0760154202580452, + "learning_rate": 1.6205187559354226e-05, + "loss": 0.0168, + "step": 91600 + }, + { + "epoch": 0.6771680316962834, + "grad_norm": 0.0814782902598381, + "learning_rate": 1.620147792022792e-05, + "loss": 0.0188, + "step": 91610 + }, + { + "epoch": 0.6772419502675853, + "grad_norm": 0.07616132497787476, + "learning_rate": 1.6197768281101617e-05, + "loss": 0.0184, + "step": 91620 + }, + { + "epoch": 0.6773158688388871, + "grad_norm": 0.07322046905755997, + "learning_rate": 1.619405864197531e-05, + "loss": 0.0176, + "step": 91630 + }, + { + "epoch": 0.677389787410189, + "grad_norm": 0.06372839957475662, + "learning_rate": 1.6190349002849003e-05, + "loss": 0.0169, + "step": 91640 + }, + { + "epoch": 0.6774637059814907, + "grad_norm": 0.07176537066698074, + "learning_rate": 1.61866393637227e-05, + "loss": 0.0158, + "step": 91650 + }, + { + "epoch": 0.6775376245527926, + "grad_norm": 0.0934654176235199, + "learning_rate": 1.618292972459639e-05, + "loss": 0.0175, + "step": 91660 + }, + { + "epoch": 0.6776115431240946, + "grad_norm": 0.08245817571878433, + "learning_rate": 1.6179220085470087e-05, + "loss": 0.0174, + "step": 91670 + }, + { + "epoch": 0.6776854616953963, + "grad_norm": 0.08012942224740982, + "learning_rate": 1.617551044634378e-05, + "loss": 0.0169, + "step": 91680 + }, + { + "epoch": 0.6777593802666982, + "grad_norm": 0.08259119093418121, + "learning_rate": 1.6171800807217476e-05, + "loss": 0.0192, + "step": 91690 + }, + { + "epoch": 0.677833298838, + "grad_norm": 0.07471147179603577, + "learning_rate": 1.6168091168091168e-05, + "loss": 0.0193, + "step": 91700 + }, + { + "epoch": 0.6779072174093019, + "grad_norm": 0.0738404244184494, + "learning_rate": 1.6164381528964864e-05, + "loss": 0.0162, + "step": 91710 + }, + { + "epoch": 0.6779811359806037, + "grad_norm": 0.07564840465784073, + "learning_rate": 1.6160671889838557e-05, + "loss": 0.017, + "step": 91720 + }, + { + "epoch": 0.6780550545519056, + "grad_norm": 0.11400596797466278, + "learning_rate": 1.6156962250712253e-05, + "loss": 0.0164, + "step": 91730 + }, + { + "epoch": 0.6781289731232075, + "grad_norm": 0.09355101734399796, + "learning_rate": 1.6153252611585945e-05, + "loss": 0.0194, + "step": 91740 + }, + { + "epoch": 0.6782028916945093, + "grad_norm": 0.08981883525848389, + "learning_rate": 1.6149542972459638e-05, + "loss": 0.0178, 
+ "step": 91750 + }, + { + "epoch": 0.6782768102658112, + "grad_norm": 0.08405116200447083, + "learning_rate": 1.6145833333333334e-05, + "loss": 0.0173, + "step": 91760 + }, + { + "epoch": 0.678350728837113, + "grad_norm": 0.11737560480833054, + "learning_rate": 1.614212369420703e-05, + "loss": 0.0195, + "step": 91770 + }, + { + "epoch": 0.6784246474084149, + "grad_norm": 0.08872861415147781, + "learning_rate": 1.6138414055080722e-05, + "loss": 0.0184, + "step": 91780 + }, + { + "epoch": 0.6784985659797167, + "grad_norm": 0.05598621815443039, + "learning_rate": 1.6134704415954415e-05, + "loss": 0.016, + "step": 91790 + }, + { + "epoch": 0.6785724845510186, + "grad_norm": 0.0666181743144989, + "learning_rate": 1.613099477682811e-05, + "loss": 0.0166, + "step": 91800 + }, + { + "epoch": 0.6786464031223205, + "grad_norm": 0.09435441344976425, + "learning_rate": 1.6127285137701807e-05, + "loss": 0.0173, + "step": 91810 + }, + { + "epoch": 0.6787203216936223, + "grad_norm": 0.07333523780107498, + "learning_rate": 1.61235754985755e-05, + "loss": 0.0169, + "step": 91820 + }, + { + "epoch": 0.6787942402649242, + "grad_norm": 0.06826969981193542, + "learning_rate": 1.6119865859449192e-05, + "loss": 0.0169, + "step": 91830 + }, + { + "epoch": 0.678868158836226, + "grad_norm": 0.09807952493429184, + "learning_rate": 1.6116156220322888e-05, + "loss": 0.0176, + "step": 91840 + }, + { + "epoch": 0.6789420774075279, + "grad_norm": 0.08572237193584442, + "learning_rate": 1.6112446581196584e-05, + "loss": 0.0191, + "step": 91850 + }, + { + "epoch": 0.6790159959788297, + "grad_norm": 0.08711138367652893, + "learning_rate": 1.6108736942070277e-05, + "loss": 0.0152, + "step": 91860 + }, + { + "epoch": 0.6790899145501316, + "grad_norm": 0.07314954698085785, + "learning_rate": 1.610502730294397e-05, + "loss": 0.0161, + "step": 91870 + }, + { + "epoch": 0.6791638331214335, + "grad_norm": 0.06449910253286362, + "learning_rate": 1.6101317663817665e-05, + "loss": 0.0171, + "step": 91880 + }, + { + "epoch": 0.6792377516927353, + "grad_norm": 0.06305267661809921, + "learning_rate": 1.6097608024691358e-05, + "loss": 0.0179, + "step": 91890 + }, + { + "epoch": 0.6793116702640372, + "grad_norm": 0.062146369367837906, + "learning_rate": 1.6093898385565054e-05, + "loss": 0.0168, + "step": 91900 + }, + { + "epoch": 0.679385588835339, + "grad_norm": 0.08478434383869171, + "learning_rate": 1.6090188746438746e-05, + "loss": 0.0177, + "step": 91910 + }, + { + "epoch": 0.6794595074066409, + "grad_norm": 0.07532593607902527, + "learning_rate": 1.6086479107312442e-05, + "loss": 0.0183, + "step": 91920 + }, + { + "epoch": 0.6795334259779428, + "grad_norm": 0.08118089288473129, + "learning_rate": 1.6082769468186135e-05, + "loss": 0.0188, + "step": 91930 + }, + { + "epoch": 0.6796073445492445, + "grad_norm": 0.09082608669996262, + "learning_rate": 1.607905982905983e-05, + "loss": 0.0203, + "step": 91940 + }, + { + "epoch": 0.6796812631205464, + "grad_norm": 0.06602920591831207, + "learning_rate": 1.6075350189933523e-05, + "loss": 0.0187, + "step": 91950 + }, + { + "epoch": 0.6797551816918482, + "grad_norm": 0.08158482611179352, + "learning_rate": 1.607164055080722e-05, + "loss": 0.0187, + "step": 91960 + }, + { + "epoch": 0.6798291002631501, + "grad_norm": 0.05968308076262474, + "learning_rate": 1.6067930911680912e-05, + "loss": 0.0159, + "step": 91970 + }, + { + "epoch": 0.6799030188344519, + "grad_norm": 0.08257415890693665, + "learning_rate": 1.6064221272554605e-05, + "loss": 0.0176, + "step": 91980 + }, + { + "epoch": 
0.6799769374057538, + "grad_norm": 0.10404511541128159, + "learning_rate": 1.60605116334283e-05, + "loss": 0.017, + "step": 91990 + }, + { + "epoch": 0.6800508559770557, + "grad_norm": 0.09237322211265564, + "learning_rate": 1.6056801994301996e-05, + "loss": 0.0194, + "step": 92000 + }, + { + "epoch": 0.6801247745483575, + "grad_norm": 0.06430641561746597, + "learning_rate": 1.605309235517569e-05, + "loss": 0.0186, + "step": 92010 + }, + { + "epoch": 0.6801986931196594, + "grad_norm": 0.056055862456560135, + "learning_rate": 1.604938271604938e-05, + "loss": 0.0173, + "step": 92020 + }, + { + "epoch": 0.6802726116909612, + "grad_norm": 0.08425690978765488, + "learning_rate": 1.6045673076923078e-05, + "loss": 0.0183, + "step": 92030 + }, + { + "epoch": 0.6803465302622631, + "grad_norm": 0.0908007025718689, + "learning_rate": 1.6041963437796774e-05, + "loss": 0.0177, + "step": 92040 + }, + { + "epoch": 0.6804204488335649, + "grad_norm": 0.06910260766744614, + "learning_rate": 1.6038253798670466e-05, + "loss": 0.0207, + "step": 92050 + }, + { + "epoch": 0.6804943674048668, + "grad_norm": 0.08825428038835526, + "learning_rate": 1.603454415954416e-05, + "loss": 0.0185, + "step": 92060 + }, + { + "epoch": 0.6805682859761687, + "grad_norm": 0.07975617051124573, + "learning_rate": 1.6030834520417855e-05, + "loss": 0.0193, + "step": 92070 + }, + { + "epoch": 0.6806422045474705, + "grad_norm": 0.08019765466451645, + "learning_rate": 1.602712488129155e-05, + "loss": 0.0178, + "step": 92080 + }, + { + "epoch": 0.6807161231187724, + "grad_norm": 0.07777857780456543, + "learning_rate": 1.6023415242165243e-05, + "loss": 0.0172, + "step": 92090 + }, + { + "epoch": 0.6807900416900742, + "grad_norm": 0.09756932407617569, + "learning_rate": 1.6019705603038936e-05, + "loss": 0.018, + "step": 92100 + }, + { + "epoch": 0.6808639602613761, + "grad_norm": 0.08036884665489197, + "learning_rate": 1.6015995963912632e-05, + "loss": 0.019, + "step": 92110 + }, + { + "epoch": 0.6809378788326779, + "grad_norm": 0.08309336006641388, + "learning_rate": 1.6012286324786324e-05, + "loss": 0.0171, + "step": 92120 + }, + { + "epoch": 0.6810117974039798, + "grad_norm": 0.08888134360313416, + "learning_rate": 1.600857668566002e-05, + "loss": 0.02, + "step": 92130 + }, + { + "epoch": 0.6810857159752817, + "grad_norm": 0.07211754471063614, + "learning_rate": 1.6004867046533713e-05, + "loss": 0.0192, + "step": 92140 + }, + { + "epoch": 0.6811596345465835, + "grad_norm": 0.0735514760017395, + "learning_rate": 1.600115740740741e-05, + "loss": 0.0181, + "step": 92150 + }, + { + "epoch": 0.6812335531178854, + "grad_norm": 0.06698044389486313, + "learning_rate": 1.59974477682811e-05, + "loss": 0.0166, + "step": 92160 + }, + { + "epoch": 0.6813074716891871, + "grad_norm": 0.09301523864269257, + "learning_rate": 1.5993738129154797e-05, + "loss": 0.0176, + "step": 92170 + }, + { + "epoch": 0.681381390260489, + "grad_norm": 0.09156746417284012, + "learning_rate": 1.599002849002849e-05, + "loss": 0.0188, + "step": 92180 + }, + { + "epoch": 0.681455308831791, + "grad_norm": 0.06802339851856232, + "learning_rate": 1.5986318850902186e-05, + "loss": 0.0178, + "step": 92190 + }, + { + "epoch": 0.6815292274030927, + "grad_norm": 0.1307523250579834, + "learning_rate": 1.598260921177588e-05, + "loss": 0.0166, + "step": 92200 + }, + { + "epoch": 0.6816031459743946, + "grad_norm": 0.08905123174190521, + "learning_rate": 1.597889957264957e-05, + "loss": 0.0174, + "step": 92210 + }, + { + "epoch": 0.6816770645456964, + "grad_norm": 0.0922866016626358, + 
"learning_rate": 1.5975189933523267e-05, + "loss": 0.0183, + "step": 92220 + }, + { + "epoch": 0.6817509831169983, + "grad_norm": 0.0921129509806633, + "learning_rate": 1.5971480294396963e-05, + "loss": 0.0164, + "step": 92230 + }, + { + "epoch": 0.6818249016883001, + "grad_norm": 0.06170952692627907, + "learning_rate": 1.5967770655270656e-05, + "loss": 0.0169, + "step": 92240 + }, + { + "epoch": 0.681898820259602, + "grad_norm": 0.08036050945520401, + "learning_rate": 1.5964061016144348e-05, + "loss": 0.0174, + "step": 92250 + }, + { + "epoch": 0.6819727388309039, + "grad_norm": 0.10198846459388733, + "learning_rate": 1.5960351377018044e-05, + "loss": 0.0185, + "step": 92260 + }, + { + "epoch": 0.6820466574022057, + "grad_norm": 0.05803276598453522, + "learning_rate": 1.595664173789174e-05, + "loss": 0.0162, + "step": 92270 + }, + { + "epoch": 0.6821205759735076, + "grad_norm": 0.06204792112112045, + "learning_rate": 1.5952932098765433e-05, + "loss": 0.0175, + "step": 92280 + }, + { + "epoch": 0.6821944945448094, + "grad_norm": 0.07870964705944061, + "learning_rate": 1.5949222459639125e-05, + "loss": 0.0189, + "step": 92290 + }, + { + "epoch": 0.6822684131161113, + "grad_norm": 0.08516126126050949, + "learning_rate": 1.594551282051282e-05, + "loss": 0.0178, + "step": 92300 + }, + { + "epoch": 0.6823423316874131, + "grad_norm": 0.07499513030052185, + "learning_rate": 1.5941803181386517e-05, + "loss": 0.0165, + "step": 92310 + }, + { + "epoch": 0.682416250258715, + "grad_norm": 0.07255727052688599, + "learning_rate": 1.593809354226021e-05, + "loss": 0.0186, + "step": 92320 + }, + { + "epoch": 0.6824901688300169, + "grad_norm": 0.056045882403850555, + "learning_rate": 1.5934383903133902e-05, + "loss": 0.0164, + "step": 92330 + }, + { + "epoch": 0.6825640874013187, + "grad_norm": 0.056728675961494446, + "learning_rate": 1.59306742640076e-05, + "loss": 0.0159, + "step": 92340 + }, + { + "epoch": 0.6826380059726206, + "grad_norm": 0.08712863177061081, + "learning_rate": 1.592696462488129e-05, + "loss": 0.0168, + "step": 92350 + }, + { + "epoch": 0.6827119245439224, + "grad_norm": 0.08192974328994751, + "learning_rate": 1.5923254985754987e-05, + "loss": 0.0179, + "step": 92360 + }, + { + "epoch": 0.6827858431152243, + "grad_norm": 0.07430005073547363, + "learning_rate": 1.591954534662868e-05, + "loss": 0.018, + "step": 92370 + }, + { + "epoch": 0.6828597616865261, + "grad_norm": 0.0707472711801529, + "learning_rate": 1.5915835707502375e-05, + "loss": 0.0191, + "step": 92380 + }, + { + "epoch": 0.682933680257828, + "grad_norm": 0.08295336365699768, + "learning_rate": 1.5912126068376068e-05, + "loss": 0.0181, + "step": 92390 + }, + { + "epoch": 0.6830075988291299, + "grad_norm": 0.0547831691801548, + "learning_rate": 1.5908416429249764e-05, + "loss": 0.0167, + "step": 92400 + }, + { + "epoch": 0.6830815174004317, + "grad_norm": 0.0616627112030983, + "learning_rate": 1.5904706790123457e-05, + "loss": 0.0166, + "step": 92410 + }, + { + "epoch": 0.6831554359717336, + "grad_norm": 0.07605061680078506, + "learning_rate": 1.5900997150997153e-05, + "loss": 0.0174, + "step": 92420 + }, + { + "epoch": 0.6832293545430353, + "grad_norm": 0.07115055620670319, + "learning_rate": 1.5897287511870845e-05, + "loss": 0.0147, + "step": 92430 + }, + { + "epoch": 0.6833032731143373, + "grad_norm": 0.10412179678678513, + "learning_rate": 1.5893577872744538e-05, + "loss": 0.0189, + "step": 92440 + }, + { + "epoch": 0.6833771916856392, + "grad_norm": 0.067973293364048, + "learning_rate": 1.5889868233618234e-05, + "loss": 
0.0169, + "step": 92450 + }, + { + "epoch": 0.6834511102569409, + "grad_norm": 0.07125255465507507, + "learning_rate": 1.588615859449193e-05, + "loss": 0.0196, + "step": 92460 + }, + { + "epoch": 0.6835250288282428, + "grad_norm": 0.06105915084481239, + "learning_rate": 1.5882448955365622e-05, + "loss": 0.0171, + "step": 92470 + }, + { + "epoch": 0.6835989473995446, + "grad_norm": 0.07705443352460861, + "learning_rate": 1.5878739316239315e-05, + "loss": 0.0161, + "step": 92480 + }, + { + "epoch": 0.6836728659708465, + "grad_norm": 0.07152712345123291, + "learning_rate": 1.5875029677113014e-05, + "loss": 0.0178, + "step": 92490 + }, + { + "epoch": 0.6837467845421483, + "grad_norm": 0.06911912560462952, + "learning_rate": 1.5871320037986707e-05, + "loss": 0.02, + "step": 92500 + }, + { + "epoch": 0.6838207031134502, + "grad_norm": 0.06743113696575165, + "learning_rate": 1.58676103988604e-05, + "loss": 0.0148, + "step": 92510 + }, + { + "epoch": 0.6838946216847521, + "grad_norm": 0.0807114690542221, + "learning_rate": 1.5863900759734092e-05, + "loss": 0.0175, + "step": 92520 + }, + { + "epoch": 0.6839685402560539, + "grad_norm": 0.10657806694507599, + "learning_rate": 1.5860191120607788e-05, + "loss": 0.0179, + "step": 92530 + }, + { + "epoch": 0.6840424588273558, + "grad_norm": 0.08332397043704987, + "learning_rate": 1.5856481481481484e-05, + "loss": 0.0176, + "step": 92540 + }, + { + "epoch": 0.6841163773986576, + "grad_norm": 0.06702392548322678, + "learning_rate": 1.5852771842355176e-05, + "loss": 0.0158, + "step": 92550 + }, + { + "epoch": 0.6841902959699595, + "grad_norm": 0.07311967760324478, + "learning_rate": 1.584906220322887e-05, + "loss": 0.0177, + "step": 92560 + }, + { + "epoch": 0.6842642145412613, + "grad_norm": 0.06532112509012222, + "learning_rate": 1.5845352564102565e-05, + "loss": 0.0166, + "step": 92570 + }, + { + "epoch": 0.6843381331125632, + "grad_norm": 0.097222238779068, + "learning_rate": 1.5841642924976257e-05, + "loss": 0.0193, + "step": 92580 + }, + { + "epoch": 0.6844120516838651, + "grad_norm": 0.08110792189836502, + "learning_rate": 1.5837933285849953e-05, + "loss": 0.0163, + "step": 92590 + }, + { + "epoch": 0.6844859702551669, + "grad_norm": 0.06945564597845078, + "learning_rate": 1.5834223646723646e-05, + "loss": 0.0168, + "step": 92600 + }, + { + "epoch": 0.6845598888264688, + "grad_norm": 0.10886241495609283, + "learning_rate": 1.5830514007597342e-05, + "loss": 0.0201, + "step": 92610 + }, + { + "epoch": 0.6846338073977706, + "grad_norm": 0.06769303977489471, + "learning_rate": 1.5826804368471035e-05, + "loss": 0.016, + "step": 92620 + }, + { + "epoch": 0.6847077259690725, + "grad_norm": 0.05759311467409134, + "learning_rate": 1.582309472934473e-05, + "loss": 0.016, + "step": 92630 + }, + { + "epoch": 0.6847816445403744, + "grad_norm": 0.07953110337257385, + "learning_rate": 1.5819385090218427e-05, + "loss": 0.0184, + "step": 92640 + }, + { + "epoch": 0.6848555631116762, + "grad_norm": 0.07198061048984528, + "learning_rate": 1.581567545109212e-05, + "loss": 0.0177, + "step": 92650 + }, + { + "epoch": 0.6849294816829781, + "grad_norm": 0.054443176835775375, + "learning_rate": 1.581196581196581e-05, + "loss": 0.0163, + "step": 92660 + }, + { + "epoch": 0.6850034002542799, + "grad_norm": 0.1029653251171112, + "learning_rate": 1.5808256172839504e-05, + "loss": 0.0183, + "step": 92670 + }, + { + "epoch": 0.6850773188255818, + "grad_norm": 0.07635175436735153, + "learning_rate": 1.5804546533713204e-05, + "loss": 0.0149, + "step": 92680 + }, + { + "epoch": 
0.6851512373968836, + "grad_norm": 0.07192494720220566, + "learning_rate": 1.5800836894586896e-05, + "loss": 0.0137, + "step": 92690 + }, + { + "epoch": 0.6852251559681855, + "grad_norm": 0.08006956428289413, + "learning_rate": 1.579712725546059e-05, + "loss": 0.017, + "step": 92700 + }, + { + "epoch": 0.6852990745394874, + "grad_norm": 0.07870300859212875, + "learning_rate": 1.579341761633428e-05, + "loss": 0.0166, + "step": 92710 + }, + { + "epoch": 0.6853729931107891, + "grad_norm": 0.057502858340740204, + "learning_rate": 1.578970797720798e-05, + "loss": 0.0156, + "step": 92720 + }, + { + "epoch": 0.685446911682091, + "grad_norm": 0.06075780466198921, + "learning_rate": 1.5785998338081673e-05, + "loss": 0.0175, + "step": 92730 + }, + { + "epoch": 0.6855208302533928, + "grad_norm": 0.08338859677314758, + "learning_rate": 1.5782288698955366e-05, + "loss": 0.02, + "step": 92740 + }, + { + "epoch": 0.6855947488246947, + "grad_norm": 0.08002070337533951, + "learning_rate": 1.577857905982906e-05, + "loss": 0.0183, + "step": 92750 + }, + { + "epoch": 0.6856686673959965, + "grad_norm": 0.05922900512814522, + "learning_rate": 1.5774869420702754e-05, + "loss": 0.0167, + "step": 92760 + }, + { + "epoch": 0.6857425859672984, + "grad_norm": 0.054509907960891724, + "learning_rate": 1.577115978157645e-05, + "loss": 0.0164, + "step": 92770 + }, + { + "epoch": 0.6858165045386003, + "grad_norm": 0.07607532292604446, + "learning_rate": 1.5767450142450143e-05, + "loss": 0.015, + "step": 92780 + }, + { + "epoch": 0.6858904231099021, + "grad_norm": 0.0719480961561203, + "learning_rate": 1.5763740503323836e-05, + "loss": 0.0206, + "step": 92790 + }, + { + "epoch": 0.685964341681204, + "grad_norm": 0.0783941000699997, + "learning_rate": 1.576003086419753e-05, + "loss": 0.0189, + "step": 92800 + }, + { + "epoch": 0.6860382602525058, + "grad_norm": 0.0678236335515976, + "learning_rate": 1.5756321225071227e-05, + "loss": 0.0161, + "step": 92810 + }, + { + "epoch": 0.6861121788238077, + "grad_norm": 0.0772695392370224, + "learning_rate": 1.575261158594492e-05, + "loss": 0.0175, + "step": 92820 + }, + { + "epoch": 0.6861860973951095, + "grad_norm": 0.09057267755270004, + "learning_rate": 1.5748901946818616e-05, + "loss": 0.0151, + "step": 92830 + }, + { + "epoch": 0.6862600159664114, + "grad_norm": 0.08303174376487732, + "learning_rate": 1.574519230769231e-05, + "loss": 0.0185, + "step": 92840 + }, + { + "epoch": 0.6863339345377133, + "grad_norm": 0.08072768896818161, + "learning_rate": 1.5741482668566e-05, + "loss": 0.0182, + "step": 92850 + }, + { + "epoch": 0.6864078531090151, + "grad_norm": 0.08461995422840118, + "learning_rate": 1.5737773029439697e-05, + "loss": 0.0184, + "step": 92860 + }, + { + "epoch": 0.686481771680317, + "grad_norm": 0.07058220356702805, + "learning_rate": 1.5734063390313393e-05, + "loss": 0.0145, + "step": 92870 + }, + { + "epoch": 0.6865556902516188, + "grad_norm": 0.07958796620368958, + "learning_rate": 1.5730353751187086e-05, + "loss": 0.0169, + "step": 92880 + }, + { + "epoch": 0.6866296088229207, + "grad_norm": 0.1059829518198967, + "learning_rate": 1.5726644112060778e-05, + "loss": 0.0179, + "step": 92890 + }, + { + "epoch": 0.6867035273942226, + "grad_norm": 0.06931605190038681, + "learning_rate": 1.572293447293447e-05, + "loss": 0.0166, + "step": 92900 + }, + { + "epoch": 0.6867774459655244, + "grad_norm": 0.06818471103906631, + "learning_rate": 1.571922483380817e-05, + "loss": 0.0158, + "step": 92910 + }, + { + "epoch": 0.6868513645368263, + "grad_norm": 0.05438080057501793, + 
"learning_rate": 1.5715515194681863e-05, + "loss": 0.0157, + "step": 92920 + }, + { + "epoch": 0.6869252831081281, + "grad_norm": 0.07555864006280899, + "learning_rate": 1.5711805555555555e-05, + "loss": 0.0171, + "step": 92930 + }, + { + "epoch": 0.68699920167943, + "grad_norm": 0.06466842442750931, + "learning_rate": 1.5708095916429248e-05, + "loss": 0.018, + "step": 92940 + }, + { + "epoch": 0.6870731202507318, + "grad_norm": 0.06618928164243698, + "learning_rate": 1.5704386277302947e-05, + "loss": 0.0167, + "step": 92950 + }, + { + "epoch": 0.6871470388220337, + "grad_norm": 0.09534517675638199, + "learning_rate": 1.570067663817664e-05, + "loss": 0.0164, + "step": 92960 + }, + { + "epoch": 0.6872209573933356, + "grad_norm": 0.08508911728858948, + "learning_rate": 1.5696966999050332e-05, + "loss": 0.0178, + "step": 92970 + }, + { + "epoch": 0.6872948759646373, + "grad_norm": 0.09765244275331497, + "learning_rate": 1.569325735992403e-05, + "loss": 0.0188, + "step": 92980 + }, + { + "epoch": 0.6873687945359392, + "grad_norm": 0.09178492426872253, + "learning_rate": 1.568954772079772e-05, + "loss": 0.0173, + "step": 92990 + }, + { + "epoch": 0.687442713107241, + "grad_norm": 0.08355975896120071, + "learning_rate": 1.5685838081671417e-05, + "loss": 0.0174, + "step": 93000 + }, + { + "epoch": 0.6875166316785429, + "grad_norm": 0.07787944376468658, + "learning_rate": 1.568212844254511e-05, + "loss": 0.0153, + "step": 93010 + }, + { + "epoch": 0.6875905502498447, + "grad_norm": 0.06931786239147186, + "learning_rate": 1.5678418803418806e-05, + "loss": 0.0188, + "step": 93020 + }, + { + "epoch": 0.6876644688211466, + "grad_norm": 0.07735756039619446, + "learning_rate": 1.5674709164292498e-05, + "loss": 0.0196, + "step": 93030 + }, + { + "epoch": 0.6877383873924485, + "grad_norm": 0.07819321006536484, + "learning_rate": 1.5670999525166194e-05, + "loss": 0.0171, + "step": 93040 + }, + { + "epoch": 0.6878123059637503, + "grad_norm": 0.09739043563604355, + "learning_rate": 1.5667289886039887e-05, + "loss": 0.0177, + "step": 93050 + }, + { + "epoch": 0.6878862245350522, + "grad_norm": 0.06608149409294128, + "learning_rate": 1.5663580246913583e-05, + "loss": 0.016, + "step": 93060 + }, + { + "epoch": 0.687960143106354, + "grad_norm": 0.08745332807302475, + "learning_rate": 1.5659870607787275e-05, + "loss": 0.0169, + "step": 93070 + }, + { + "epoch": 0.6880340616776559, + "grad_norm": 0.07087000459432602, + "learning_rate": 1.5656160968660968e-05, + "loss": 0.016, + "step": 93080 + }, + { + "epoch": 0.6881079802489577, + "grad_norm": 0.08703728020191193, + "learning_rate": 1.5652451329534664e-05, + "loss": 0.0154, + "step": 93090 + }, + { + "epoch": 0.6881818988202596, + "grad_norm": 0.09080562740564346, + "learning_rate": 1.564874169040836e-05, + "loss": 0.0189, + "step": 93100 + }, + { + "epoch": 0.6882558173915615, + "grad_norm": 0.09614677727222443, + "learning_rate": 1.5645032051282052e-05, + "loss": 0.0192, + "step": 93110 + }, + { + "epoch": 0.6883297359628633, + "grad_norm": 0.06185782700777054, + "learning_rate": 1.5641322412155745e-05, + "loss": 0.017, + "step": 93120 + }, + { + "epoch": 0.6884036545341652, + "grad_norm": 0.0741971805691719, + "learning_rate": 1.563761277302944e-05, + "loss": 0.0175, + "step": 93130 + }, + { + "epoch": 0.688477573105467, + "grad_norm": 0.0744548961520195, + "learning_rate": 1.5633903133903137e-05, + "loss": 0.0176, + "step": 93140 + }, + { + "epoch": 0.6885514916767689, + "grad_norm": 0.07029058784246445, + "learning_rate": 1.563019349477683e-05, + "loss": 
0.0177, + "step": 93150 + }, + { + "epoch": 0.6886254102480708, + "grad_norm": 0.05117655545473099, + "learning_rate": 1.5626483855650522e-05, + "loss": 0.0166, + "step": 93160 + }, + { + "epoch": 0.6886993288193726, + "grad_norm": 0.07720599323511124, + "learning_rate": 1.5622774216524218e-05, + "loss": 0.02, + "step": 93170 + }, + { + "epoch": 0.6887732473906745, + "grad_norm": 0.0722067803144455, + "learning_rate": 1.5619064577397914e-05, + "loss": 0.0179, + "step": 93180 + }, + { + "epoch": 0.6888471659619763, + "grad_norm": 0.06938211619853973, + "learning_rate": 1.5615354938271606e-05, + "loss": 0.0162, + "step": 93190 + }, + { + "epoch": 0.6889210845332782, + "grad_norm": 0.07468469440937042, + "learning_rate": 1.56116452991453e-05, + "loss": 0.0177, + "step": 93200 + }, + { + "epoch": 0.68899500310458, + "grad_norm": 0.05706126615405083, + "learning_rate": 1.5607935660018995e-05, + "loss": 0.0172, + "step": 93210 + }, + { + "epoch": 0.6890689216758819, + "grad_norm": 0.0980379581451416, + "learning_rate": 1.5604226020892688e-05, + "loss": 0.019, + "step": 93220 + }, + { + "epoch": 0.6891428402471838, + "grad_norm": 0.09685836732387543, + "learning_rate": 1.5600516381766384e-05, + "loss": 0.0173, + "step": 93230 + }, + { + "epoch": 0.6892167588184855, + "grad_norm": 0.0751485526561737, + "learning_rate": 1.5596806742640076e-05, + "loss": 0.0171, + "step": 93240 + }, + { + "epoch": 0.6892906773897874, + "grad_norm": 0.10697372257709503, + "learning_rate": 1.5593097103513772e-05, + "loss": 0.0173, + "step": 93250 + }, + { + "epoch": 0.6893645959610892, + "grad_norm": 0.09985894709825516, + "learning_rate": 1.5589387464387465e-05, + "loss": 0.0184, + "step": 93260 + }, + { + "epoch": 0.6894385145323911, + "grad_norm": 0.07273690402507782, + "learning_rate": 1.558567782526116e-05, + "loss": 0.0197, + "step": 93270 + }, + { + "epoch": 0.6895124331036929, + "grad_norm": 0.059736963361501694, + "learning_rate": 1.5581968186134853e-05, + "loss": 0.018, + "step": 93280 + }, + { + "epoch": 0.6895863516749948, + "grad_norm": 0.05851113796234131, + "learning_rate": 1.557825854700855e-05, + "loss": 0.0174, + "step": 93290 + }, + { + "epoch": 0.6896602702462967, + "grad_norm": 0.06368900835514069, + "learning_rate": 1.5574548907882242e-05, + "loss": 0.017, + "step": 93300 + }, + { + "epoch": 0.6897341888175985, + "grad_norm": 0.077921062707901, + "learning_rate": 1.5570839268755934e-05, + "loss": 0.0187, + "step": 93310 + }, + { + "epoch": 0.6898081073889004, + "grad_norm": 0.07463159412145615, + "learning_rate": 1.556712962962963e-05, + "loss": 0.0193, + "step": 93320 + }, + { + "epoch": 0.6898820259602022, + "grad_norm": 0.09554771333932877, + "learning_rate": 1.5563419990503326e-05, + "loss": 0.0191, + "step": 93330 + }, + { + "epoch": 0.6899559445315041, + "grad_norm": 0.09546215832233429, + "learning_rate": 1.555971035137702e-05, + "loss": 0.0188, + "step": 93340 + }, + { + "epoch": 0.6900298631028059, + "grad_norm": 0.08112092316150665, + "learning_rate": 1.555600071225071e-05, + "loss": 0.018, + "step": 93350 + }, + { + "epoch": 0.6901037816741078, + "grad_norm": 0.07517974823713303, + "learning_rate": 1.5552291073124407e-05, + "loss": 0.0143, + "step": 93360 + }, + { + "epoch": 0.6901777002454097, + "grad_norm": 0.08749409019947052, + "learning_rate": 1.5548581433998103e-05, + "loss": 0.0191, + "step": 93370 + }, + { + "epoch": 0.6902516188167115, + "grad_norm": 0.05953217297792435, + "learning_rate": 1.5544871794871796e-05, + "loss": 0.0176, + "step": 93380 + }, + { + "epoch": 
0.6903255373880134, + "grad_norm": 0.07596410810947418, + "learning_rate": 1.554116215574549e-05, + "loss": 0.0164, + "step": 93390 + }, + { + "epoch": 0.6903994559593152, + "grad_norm": 0.07445093989372253, + "learning_rate": 1.5537452516619184e-05, + "loss": 0.0173, + "step": 93400 + }, + { + "epoch": 0.6904733745306171, + "grad_norm": 0.08547616004943848, + "learning_rate": 1.553374287749288e-05, + "loss": 0.0167, + "step": 93410 + }, + { + "epoch": 0.690547293101919, + "grad_norm": 0.0664495974779129, + "learning_rate": 1.5530033238366573e-05, + "loss": 0.0194, + "step": 93420 + }, + { + "epoch": 0.6906212116732208, + "grad_norm": 0.07082750648260117, + "learning_rate": 1.5526323599240266e-05, + "loss": 0.0164, + "step": 93430 + }, + { + "epoch": 0.6906951302445227, + "grad_norm": 0.06069180741906166, + "learning_rate": 1.552261396011396e-05, + "loss": 0.0158, + "step": 93440 + }, + { + "epoch": 0.6907690488158245, + "grad_norm": 0.10092563927173615, + "learning_rate": 1.5518904320987654e-05, + "loss": 0.0187, + "step": 93450 + }, + { + "epoch": 0.6908429673871264, + "grad_norm": 0.04582209512591362, + "learning_rate": 1.551519468186135e-05, + "loss": 0.0161, + "step": 93460 + }, + { + "epoch": 0.6909168859584282, + "grad_norm": 0.0675414577126503, + "learning_rate": 1.5511485042735043e-05, + "loss": 0.0178, + "step": 93470 + }, + { + "epoch": 0.69099080452973, + "grad_norm": 0.09676419943571091, + "learning_rate": 1.550777540360874e-05, + "loss": 0.0178, + "step": 93480 + }, + { + "epoch": 0.691064723101032, + "grad_norm": 0.10699211061000824, + "learning_rate": 1.550406576448243e-05, + "loss": 0.0191, + "step": 93490 + }, + { + "epoch": 0.6911386416723337, + "grad_norm": 0.06596608459949493, + "learning_rate": 1.5500356125356127e-05, + "loss": 0.0165, + "step": 93500 + }, + { + "epoch": 0.6912125602436356, + "grad_norm": 0.07985042035579681, + "learning_rate": 1.549664648622982e-05, + "loss": 0.0182, + "step": 93510 + }, + { + "epoch": 0.6912864788149374, + "grad_norm": 0.0778883844614029, + "learning_rate": 1.5492936847103516e-05, + "loss": 0.0154, + "step": 93520 + }, + { + "epoch": 0.6913603973862393, + "grad_norm": 0.07879475504159927, + "learning_rate": 1.548922720797721e-05, + "loss": 0.0172, + "step": 93530 + }, + { + "epoch": 0.6914343159575411, + "grad_norm": 0.07020098716020584, + "learning_rate": 1.54855175688509e-05, + "loss": 0.0182, + "step": 93540 + }, + { + "epoch": 0.691508234528843, + "grad_norm": 0.0860968828201294, + "learning_rate": 1.5481807929724597e-05, + "loss": 0.0186, + "step": 93550 + }, + { + "epoch": 0.6915821531001449, + "grad_norm": 0.06118820235133171, + "learning_rate": 1.5478098290598293e-05, + "loss": 0.0167, + "step": 93560 + }, + { + "epoch": 0.6916560716714467, + "grad_norm": 0.09667540341615677, + "learning_rate": 1.5474388651471985e-05, + "loss": 0.0173, + "step": 93570 + }, + { + "epoch": 0.6917299902427486, + "grad_norm": 0.05607501044869423, + "learning_rate": 1.5470679012345678e-05, + "loss": 0.0154, + "step": 93580 + }, + { + "epoch": 0.6918039088140504, + "grad_norm": 0.05989857390522957, + "learning_rate": 1.5466969373219374e-05, + "loss": 0.0174, + "step": 93590 + }, + { + "epoch": 0.6918778273853523, + "grad_norm": 0.21198701858520508, + "learning_rate": 1.546325973409307e-05, + "loss": 0.0159, + "step": 93600 + }, + { + "epoch": 0.6919517459566541, + "grad_norm": 0.05558236688375473, + "learning_rate": 1.5459550094966763e-05, + "loss": 0.0144, + "step": 93610 + }, + { + "epoch": 0.692025664527956, + "grad_norm": 0.08093948662281036, 
+ "learning_rate": 1.5455840455840455e-05, + "loss": 0.0157, + "step": 93620 + }, + { + "epoch": 0.6920995830992579, + "grad_norm": 0.09775768965482712, + "learning_rate": 1.545213081671415e-05, + "loss": 0.02, + "step": 93630 + }, + { + "epoch": 0.6921735016705597, + "grad_norm": 0.07269429415464401, + "learning_rate": 1.5448421177587847e-05, + "loss": 0.0176, + "step": 93640 + }, + { + "epoch": 0.6922474202418616, + "grad_norm": 0.06962566077709198, + "learning_rate": 1.544471153846154e-05, + "loss": 0.016, + "step": 93650 + }, + { + "epoch": 0.6923213388131634, + "grad_norm": 0.0953865498304367, + "learning_rate": 1.5441001899335232e-05, + "loss": 0.0181, + "step": 93660 + }, + { + "epoch": 0.6923952573844653, + "grad_norm": 0.0926433652639389, + "learning_rate": 1.5437292260208928e-05, + "loss": 0.0184, + "step": 93670 + }, + { + "epoch": 0.6924691759557672, + "grad_norm": 0.04853018373250961, + "learning_rate": 1.543358262108262e-05, + "loss": 0.014, + "step": 93680 + }, + { + "epoch": 0.692543094527069, + "grad_norm": 0.07597115635871887, + "learning_rate": 1.5429872981956317e-05, + "loss": 0.0163, + "step": 93690 + }, + { + "epoch": 0.6926170130983709, + "grad_norm": 0.10934596508741379, + "learning_rate": 1.542616334283001e-05, + "loss": 0.017, + "step": 93700 + }, + { + "epoch": 0.6926909316696727, + "grad_norm": 0.06172508746385574, + "learning_rate": 1.5422453703703705e-05, + "loss": 0.0138, + "step": 93710 + }, + { + "epoch": 0.6927648502409746, + "grad_norm": 0.08862422406673431, + "learning_rate": 1.5418744064577398e-05, + "loss": 0.0208, + "step": 93720 + }, + { + "epoch": 0.6928387688122764, + "grad_norm": 0.09241653978824615, + "learning_rate": 1.5415034425451094e-05, + "loss": 0.0205, + "step": 93730 + }, + { + "epoch": 0.6929126873835783, + "grad_norm": 0.05826287344098091, + "learning_rate": 1.5411324786324786e-05, + "loss": 0.0154, + "step": 93740 + }, + { + "epoch": 0.6929866059548802, + "grad_norm": 0.06904604285955429, + "learning_rate": 1.5407615147198482e-05, + "loss": 0.0168, + "step": 93750 + }, + { + "epoch": 0.6930605245261819, + "grad_norm": 0.07445508241653442, + "learning_rate": 1.5403905508072175e-05, + "loss": 0.0162, + "step": 93760 + }, + { + "epoch": 0.6931344430974838, + "grad_norm": 0.07311367243528366, + "learning_rate": 1.5400195868945868e-05, + "loss": 0.0177, + "step": 93770 + }, + { + "epoch": 0.6932083616687856, + "grad_norm": 0.09041839092969894, + "learning_rate": 1.5396486229819563e-05, + "loss": 0.0197, + "step": 93780 + }, + { + "epoch": 0.6932822802400875, + "grad_norm": 0.07511784881353378, + "learning_rate": 1.539277659069326e-05, + "loss": 0.0164, + "step": 93790 + }, + { + "epoch": 0.6933561988113893, + "grad_norm": 0.0799839198589325, + "learning_rate": 1.5389066951566952e-05, + "loss": 0.0195, + "step": 93800 + }, + { + "epoch": 0.6934301173826912, + "grad_norm": 0.09590111672878265, + "learning_rate": 1.5385357312440645e-05, + "loss": 0.0188, + "step": 93810 + }, + { + "epoch": 0.6935040359539931, + "grad_norm": 0.08044479787349701, + "learning_rate": 1.538164767331434e-05, + "loss": 0.0175, + "step": 93820 + }, + { + "epoch": 0.6935779545252949, + "grad_norm": 0.08378466963768005, + "learning_rate": 1.5377938034188037e-05, + "loss": 0.0184, + "step": 93830 + }, + { + "epoch": 0.6936518730965968, + "grad_norm": 0.08196622878313065, + "learning_rate": 1.537422839506173e-05, + "loss": 0.0189, + "step": 93840 + }, + { + "epoch": 0.6937257916678986, + "grad_norm": 0.05602654069662094, + "learning_rate": 1.537051875593542e-05, + "loss": 
0.0169, + "step": 93850 + }, + { + "epoch": 0.6937997102392005, + "grad_norm": 0.07777991145849228, + "learning_rate": 1.5366809116809118e-05, + "loss": 0.0187, + "step": 93860 + }, + { + "epoch": 0.6938736288105023, + "grad_norm": 0.12807385623455048, + "learning_rate": 1.5363099477682814e-05, + "loss": 0.0204, + "step": 93870 + }, + { + "epoch": 0.6939475473818042, + "grad_norm": 0.08109208196401596, + "learning_rate": 1.5359389838556506e-05, + "loss": 0.0173, + "step": 93880 + }, + { + "epoch": 0.6940214659531061, + "grad_norm": 0.06914292275905609, + "learning_rate": 1.53556801994302e-05, + "loss": 0.0159, + "step": 93890 + }, + { + "epoch": 0.6940953845244079, + "grad_norm": 0.08813401311635971, + "learning_rate": 1.5351970560303895e-05, + "loss": 0.0179, + "step": 93900 + }, + { + "epoch": 0.6941693030957098, + "grad_norm": 0.07378672063350677, + "learning_rate": 1.5348260921177587e-05, + "loss": 0.017, + "step": 93910 + }, + { + "epoch": 0.6942432216670116, + "grad_norm": 0.08109115809202194, + "learning_rate": 1.5344551282051283e-05, + "loss": 0.018, + "step": 93920 + }, + { + "epoch": 0.6943171402383135, + "grad_norm": 0.08002543449401855, + "learning_rate": 1.5340841642924976e-05, + "loss": 0.0181, + "step": 93930 + }, + { + "epoch": 0.6943910588096154, + "grad_norm": 0.07397976517677307, + "learning_rate": 1.5337132003798672e-05, + "loss": 0.0171, + "step": 93940 + }, + { + "epoch": 0.6944649773809172, + "grad_norm": 0.05872524902224541, + "learning_rate": 1.5333422364672364e-05, + "loss": 0.0155, + "step": 93950 + }, + { + "epoch": 0.6945388959522191, + "grad_norm": 0.14280560612678528, + "learning_rate": 1.532971272554606e-05, + "loss": 0.0196, + "step": 93960 + }, + { + "epoch": 0.6946128145235209, + "grad_norm": 0.06711677461862564, + "learning_rate": 1.5326003086419753e-05, + "loss": 0.0163, + "step": 93970 + }, + { + "epoch": 0.6946867330948228, + "grad_norm": 0.09471774846315384, + "learning_rate": 1.532229344729345e-05, + "loss": 0.0169, + "step": 93980 + }, + { + "epoch": 0.6947606516661246, + "grad_norm": 0.06166717782616615, + "learning_rate": 1.531858380816714e-05, + "loss": 0.0169, + "step": 93990 + }, + { + "epoch": 0.6948345702374265, + "grad_norm": 0.06437874585390091, + "learning_rate": 1.5314874169040834e-05, + "loss": 0.0183, + "step": 94000 + }, + { + "epoch": 0.6949084888087284, + "grad_norm": 0.07692473381757736, + "learning_rate": 1.5311164529914533e-05, + "loss": 0.0169, + "step": 94010 + }, + { + "epoch": 0.6949824073800301, + "grad_norm": 0.11510708928108215, + "learning_rate": 1.5307454890788226e-05, + "loss": 0.0153, + "step": 94020 + }, + { + "epoch": 0.695056325951332, + "grad_norm": 0.10411643981933594, + "learning_rate": 1.530374525166192e-05, + "loss": 0.0216, + "step": 94030 + }, + { + "epoch": 0.6951302445226338, + "grad_norm": 0.0748017430305481, + "learning_rate": 1.530003561253561e-05, + "loss": 0.0182, + "step": 94040 + }, + { + "epoch": 0.6952041630939357, + "grad_norm": 0.07279026508331299, + "learning_rate": 1.5296325973409307e-05, + "loss": 0.0195, + "step": 94050 + }, + { + "epoch": 0.6952780816652375, + "grad_norm": 0.07129613310098648, + "learning_rate": 1.5292616334283003e-05, + "loss": 0.0163, + "step": 94060 + }, + { + "epoch": 0.6953520002365394, + "grad_norm": 0.05188415199518204, + "learning_rate": 1.5288906695156696e-05, + "loss": 0.0156, + "step": 94070 + }, + { + "epoch": 0.6954259188078413, + "grad_norm": 0.056653719395399094, + "learning_rate": 1.5285197056030388e-05, + "loss": 0.0162, + "step": 94080 + }, + { + "epoch": 
0.6954998373791431, + "grad_norm": 0.06849221140146255, + "learning_rate": 1.5281487416904084e-05, + "loss": 0.02, + "step": 94090 + }, + { + "epoch": 0.695573755950445, + "grad_norm": 0.08342500030994415, + "learning_rate": 1.527777777777778e-05, + "loss": 0.0185, + "step": 94100 + }, + { + "epoch": 0.6956476745217468, + "grad_norm": 0.07203114032745361, + "learning_rate": 1.5274068138651473e-05, + "loss": 0.0166, + "step": 94110 + }, + { + "epoch": 0.6957215930930487, + "grad_norm": 0.06841064244508743, + "learning_rate": 1.5270358499525165e-05, + "loss": 0.0178, + "step": 94120 + }, + { + "epoch": 0.6957955116643505, + "grad_norm": 0.07839217036962509, + "learning_rate": 1.526664886039886e-05, + "loss": 0.0157, + "step": 94130 + }, + { + "epoch": 0.6958694302356524, + "grad_norm": 0.0921502634882927, + "learning_rate": 1.5262939221272554e-05, + "loss": 0.0173, + "step": 94140 + }, + { + "epoch": 0.6959433488069543, + "grad_norm": 0.07723719626665115, + "learning_rate": 1.525922958214625e-05, + "loss": 0.019, + "step": 94150 + }, + { + "epoch": 0.6960172673782561, + "grad_norm": 0.07200010120868683, + "learning_rate": 1.5255519943019944e-05, + "loss": 0.0166, + "step": 94160 + }, + { + "epoch": 0.696091185949558, + "grad_norm": 0.06452837586402893, + "learning_rate": 1.5251810303893638e-05, + "loss": 0.0162, + "step": 94170 + }, + { + "epoch": 0.6961651045208598, + "grad_norm": 0.06620864570140839, + "learning_rate": 1.5248100664767331e-05, + "loss": 0.0177, + "step": 94180 + }, + { + "epoch": 0.6962390230921617, + "grad_norm": 0.12665487825870514, + "learning_rate": 1.5244391025641027e-05, + "loss": 0.0195, + "step": 94190 + }, + { + "epoch": 0.6963129416634636, + "grad_norm": 0.08196282386779785, + "learning_rate": 1.5240681386514721e-05, + "loss": 0.0181, + "step": 94200 + }, + { + "epoch": 0.6963868602347654, + "grad_norm": 0.0876227468252182, + "learning_rate": 1.5236971747388416e-05, + "loss": 0.0189, + "step": 94210 + }, + { + "epoch": 0.6964607788060673, + "grad_norm": 0.07745127379894257, + "learning_rate": 1.5233262108262108e-05, + "loss": 0.0168, + "step": 94220 + }, + { + "epoch": 0.6965346973773691, + "grad_norm": 0.05537101626396179, + "learning_rate": 1.5229552469135802e-05, + "loss": 0.0172, + "step": 94230 + }, + { + "epoch": 0.696608615948671, + "grad_norm": 0.09687453508377075, + "learning_rate": 1.5225842830009498e-05, + "loss": 0.0176, + "step": 94240 + }, + { + "epoch": 0.6966825345199728, + "grad_norm": 0.08245055377483368, + "learning_rate": 1.5222133190883193e-05, + "loss": 0.0171, + "step": 94250 + }, + { + "epoch": 0.6967564530912747, + "grad_norm": 0.07196832448244095, + "learning_rate": 1.5218423551756885e-05, + "loss": 0.0171, + "step": 94260 + }, + { + "epoch": 0.6968303716625766, + "grad_norm": 0.08605038374662399, + "learning_rate": 1.521471391263058e-05, + "loss": 0.0174, + "step": 94270 + }, + { + "epoch": 0.6969042902338783, + "grad_norm": 0.15191549062728882, + "learning_rate": 1.5211004273504274e-05, + "loss": 0.0201, + "step": 94280 + }, + { + "epoch": 0.6969782088051802, + "grad_norm": 0.08578886091709137, + "learning_rate": 1.520729463437797e-05, + "loss": 0.0175, + "step": 94290 + }, + { + "epoch": 0.697052127376482, + "grad_norm": 0.0731656476855278, + "learning_rate": 1.5203584995251662e-05, + "loss": 0.0163, + "step": 94300 + }, + { + "epoch": 0.6971260459477839, + "grad_norm": 0.09269023686647415, + "learning_rate": 1.5199875356125357e-05, + "loss": 0.0186, + "step": 94310 + }, + { + "epoch": 0.6971999645190857, + "grad_norm": 
0.08424928039312363, + "learning_rate": 1.519616571699905e-05, + "loss": 0.0179, + "step": 94320 + }, + { + "epoch": 0.6972738830903876, + "grad_norm": 0.06722105294466019, + "learning_rate": 1.5192456077872747e-05, + "loss": 0.0177, + "step": 94330 + }, + { + "epoch": 0.6973478016616895, + "grad_norm": 0.06906666606664658, + "learning_rate": 1.518874643874644e-05, + "loss": 0.0169, + "step": 94340 + }, + { + "epoch": 0.6974217202329913, + "grad_norm": 0.07479233294725418, + "learning_rate": 1.5185036799620134e-05, + "loss": 0.017, + "step": 94350 + }, + { + "epoch": 0.6974956388042932, + "grad_norm": 0.08008567243814468, + "learning_rate": 1.5181327160493828e-05, + "loss": 0.0194, + "step": 94360 + }, + { + "epoch": 0.697569557375595, + "grad_norm": 0.08511345088481903, + "learning_rate": 1.517761752136752e-05, + "loss": 0.0173, + "step": 94370 + }, + { + "epoch": 0.6976434759468969, + "grad_norm": 0.07214191555976868, + "learning_rate": 1.5173907882241216e-05, + "loss": 0.0153, + "step": 94380 + }, + { + "epoch": 0.6977173945181987, + "grad_norm": 0.09548451006412506, + "learning_rate": 1.517019824311491e-05, + "loss": 0.0199, + "step": 94390 + }, + { + "epoch": 0.6977913130895006, + "grad_norm": 0.11812444776296616, + "learning_rate": 1.5166488603988605e-05, + "loss": 0.0155, + "step": 94400 + }, + { + "epoch": 0.6978652316608025, + "grad_norm": 0.06574191153049469, + "learning_rate": 1.5162778964862298e-05, + "loss": 0.0165, + "step": 94410 + }, + { + "epoch": 0.6979391502321043, + "grad_norm": 0.07519920915365219, + "learning_rate": 1.5159069325735995e-05, + "loss": 0.0178, + "step": 94420 + }, + { + "epoch": 0.6980130688034062, + "grad_norm": 0.0758148655295372, + "learning_rate": 1.5155359686609688e-05, + "loss": 0.0179, + "step": 94430 + }, + { + "epoch": 0.698086987374708, + "grad_norm": 0.09478045254945755, + "learning_rate": 1.5151650047483382e-05, + "loss": 0.0193, + "step": 94440 + }, + { + "epoch": 0.6981609059460099, + "grad_norm": 0.07561103254556656, + "learning_rate": 1.5147940408357075e-05, + "loss": 0.015, + "step": 94450 + }, + { + "epoch": 0.6982348245173118, + "grad_norm": 0.08076457679271698, + "learning_rate": 1.5144230769230769e-05, + "loss": 0.0161, + "step": 94460 + }, + { + "epoch": 0.6983087430886136, + "grad_norm": 0.07646799832582474, + "learning_rate": 1.5140521130104465e-05, + "loss": 0.0185, + "step": 94470 + }, + { + "epoch": 0.6983826616599155, + "grad_norm": 0.07224825024604797, + "learning_rate": 1.513681149097816e-05, + "loss": 0.0182, + "step": 94480 + }, + { + "epoch": 0.6984565802312173, + "grad_norm": 0.09400869160890579, + "learning_rate": 1.5133101851851852e-05, + "loss": 0.0185, + "step": 94490 + }, + { + "epoch": 0.6985304988025192, + "grad_norm": 0.08666172623634338, + "learning_rate": 1.5129392212725546e-05, + "loss": 0.0158, + "step": 94500 + }, + { + "epoch": 0.698604417373821, + "grad_norm": 0.0649208202958107, + "learning_rate": 1.512568257359924e-05, + "loss": 0.0184, + "step": 94510 + }, + { + "epoch": 0.6986783359451229, + "grad_norm": 0.07728701084852219, + "learning_rate": 1.5121972934472936e-05, + "loss": 0.0188, + "step": 94520 + }, + { + "epoch": 0.6987522545164248, + "grad_norm": 0.09042263776063919, + "learning_rate": 1.5118263295346629e-05, + "loss": 0.0165, + "step": 94530 + }, + { + "epoch": 0.6988261730877265, + "grad_norm": 0.0655745267868042, + "learning_rate": 1.5114553656220323e-05, + "loss": 0.0181, + "step": 94540 + }, + { + "epoch": 0.6989000916590284, + "grad_norm": 0.0579184852540493, + "learning_rate": 
1.5110844017094017e-05, + "loss": 0.0143, + "step": 94550 + }, + { + "epoch": 0.6989740102303302, + "grad_norm": 0.07687050104141235, + "learning_rate": 1.5107134377967713e-05, + "loss": 0.0193, + "step": 94560 + }, + { + "epoch": 0.6990479288016321, + "grad_norm": 0.06710931658744812, + "learning_rate": 1.5103424738841408e-05, + "loss": 0.0162, + "step": 94570 + }, + { + "epoch": 0.6991218473729339, + "grad_norm": 0.07855264842510223, + "learning_rate": 1.50997150997151e-05, + "loss": 0.0186, + "step": 94580 + }, + { + "epoch": 0.6991957659442358, + "grad_norm": 0.08692429959774017, + "learning_rate": 1.5096005460588794e-05, + "loss": 0.0177, + "step": 94590 + }, + { + "epoch": 0.6992696845155377, + "grad_norm": 0.09486404061317444, + "learning_rate": 1.5092295821462487e-05, + "loss": 0.0145, + "step": 94600 + }, + { + "epoch": 0.6993436030868395, + "grad_norm": 0.07637245953083038, + "learning_rate": 1.5088586182336185e-05, + "loss": 0.0168, + "step": 94610 + }, + { + "epoch": 0.6994175216581414, + "grad_norm": 0.08055321127176285, + "learning_rate": 1.5084876543209877e-05, + "loss": 0.0167, + "step": 94620 + }, + { + "epoch": 0.6994914402294432, + "grad_norm": 0.06514108926057816, + "learning_rate": 1.5081166904083572e-05, + "loss": 0.0174, + "step": 94630 + }, + { + "epoch": 0.6995653588007451, + "grad_norm": 0.08226980268955231, + "learning_rate": 1.5077457264957264e-05, + "loss": 0.0176, + "step": 94640 + }, + { + "epoch": 0.699639277372047, + "grad_norm": 0.0900607705116272, + "learning_rate": 1.5073747625830962e-05, + "loss": 0.018, + "step": 94650 + }, + { + "epoch": 0.6997131959433488, + "grad_norm": 0.059451647102832794, + "learning_rate": 1.5070037986704654e-05, + "loss": 0.0184, + "step": 94660 + }, + { + "epoch": 0.6997871145146507, + "grad_norm": 0.0902848169207573, + "learning_rate": 1.5066328347578349e-05, + "loss": 0.0193, + "step": 94670 + }, + { + "epoch": 0.6998610330859525, + "grad_norm": 0.06665327399969101, + "learning_rate": 1.5062618708452041e-05, + "loss": 0.02, + "step": 94680 + }, + { + "epoch": 0.6999349516572544, + "grad_norm": 0.08193585276603699, + "learning_rate": 1.5058909069325736e-05, + "loss": 0.0169, + "step": 94690 + }, + { + "epoch": 0.7000088702285562, + "grad_norm": 0.09058967977762222, + "learning_rate": 1.5055199430199431e-05, + "loss": 0.0156, + "step": 94700 + }, + { + "epoch": 0.7000827887998581, + "grad_norm": 0.05407675355672836, + "learning_rate": 1.5051489791073126e-05, + "loss": 0.0171, + "step": 94710 + }, + { + "epoch": 0.70015670737116, + "grad_norm": 0.08992829918861389, + "learning_rate": 1.504778015194682e-05, + "loss": 0.0173, + "step": 94720 + }, + { + "epoch": 0.7002306259424618, + "grad_norm": 0.08389969915151596, + "learning_rate": 1.5044070512820513e-05, + "loss": 0.0191, + "step": 94730 + }, + { + "epoch": 0.7003045445137637, + "grad_norm": 0.08148416876792908, + "learning_rate": 1.5040360873694207e-05, + "loss": 0.0202, + "step": 94740 + }, + { + "epoch": 0.7003784630850655, + "grad_norm": 0.06622910499572754, + "learning_rate": 1.5036651234567903e-05, + "loss": 0.0184, + "step": 94750 + }, + { + "epoch": 0.7004523816563674, + "grad_norm": 0.29766276478767395, + "learning_rate": 1.5032941595441597e-05, + "loss": 0.0181, + "step": 94760 + }, + { + "epoch": 0.7005263002276692, + "grad_norm": 0.08713862299919128, + "learning_rate": 1.502923195631529e-05, + "loss": 0.0188, + "step": 94770 + }, + { + "epoch": 0.700600218798971, + "grad_norm": 0.08461463451385498, + "learning_rate": 1.5025522317188984e-05, + "loss": 0.0183, + 
"step": 94780 + }, + { + "epoch": 0.700674137370273, + "grad_norm": 0.06767699867486954, + "learning_rate": 1.502181267806268e-05, + "loss": 0.0147, + "step": 94790 + }, + { + "epoch": 0.7007480559415747, + "grad_norm": 0.09660223871469498, + "learning_rate": 1.5018103038936374e-05, + "loss": 0.016, + "step": 94800 + }, + { + "epoch": 0.7008219745128766, + "grad_norm": 0.08179505914449692, + "learning_rate": 1.5014393399810067e-05, + "loss": 0.0173, + "step": 94810 + }, + { + "epoch": 0.7008958930841784, + "grad_norm": 0.1166461706161499, + "learning_rate": 1.5010683760683761e-05, + "loss": 0.0196, + "step": 94820 + }, + { + "epoch": 0.7009698116554803, + "grad_norm": 0.14224278926849365, + "learning_rate": 1.5006974121557454e-05, + "loss": 0.019, + "step": 94830 + }, + { + "epoch": 0.7010437302267821, + "grad_norm": 0.05852314084768295, + "learning_rate": 1.5003264482431151e-05, + "loss": 0.0145, + "step": 94840 + }, + { + "epoch": 0.701117648798084, + "grad_norm": 0.06998267769813538, + "learning_rate": 1.4999554843304844e-05, + "loss": 0.019, + "step": 94850 + }, + { + "epoch": 0.7011915673693859, + "grad_norm": 0.07684922963380814, + "learning_rate": 1.4995845204178538e-05, + "loss": 0.0169, + "step": 94860 + }, + { + "epoch": 0.7012654859406877, + "grad_norm": 0.08372074365615845, + "learning_rate": 1.4992135565052232e-05, + "loss": 0.0173, + "step": 94870 + }, + { + "epoch": 0.7013394045119896, + "grad_norm": 0.07502642273902893, + "learning_rate": 1.4988425925925928e-05, + "loss": 0.0167, + "step": 94880 + }, + { + "epoch": 0.7014133230832914, + "grad_norm": 0.07391461730003357, + "learning_rate": 1.4984716286799621e-05, + "loss": 0.0163, + "step": 94890 + }, + { + "epoch": 0.7014872416545933, + "grad_norm": 0.07972786575555801, + "learning_rate": 1.4981006647673315e-05, + "loss": 0.0189, + "step": 94900 + }, + { + "epoch": 0.7015611602258952, + "grad_norm": 0.05219242349267006, + "learning_rate": 1.497729700854701e-05, + "loss": 0.0152, + "step": 94910 + }, + { + "epoch": 0.701635078797197, + "grad_norm": 0.07841724902391434, + "learning_rate": 1.4973587369420702e-05, + "loss": 0.016, + "step": 94920 + }, + { + "epoch": 0.7017089973684989, + "grad_norm": 0.097462959587574, + "learning_rate": 1.4969877730294398e-05, + "loss": 0.016, + "step": 94930 + }, + { + "epoch": 0.7017829159398007, + "grad_norm": 0.07454212754964828, + "learning_rate": 1.4966168091168092e-05, + "loss": 0.0145, + "step": 94940 + }, + { + "epoch": 0.7018568345111026, + "grad_norm": 0.07699607312679291, + "learning_rate": 1.4962458452041787e-05, + "loss": 0.0201, + "step": 94950 + }, + { + "epoch": 0.7019307530824044, + "grad_norm": 0.07101486623287201, + "learning_rate": 1.495874881291548e-05, + "loss": 0.0164, + "step": 94960 + }, + { + "epoch": 0.7020046716537063, + "grad_norm": 0.07824986428022385, + "learning_rate": 1.4955039173789173e-05, + "loss": 0.0177, + "step": 94970 + }, + { + "epoch": 0.7020785902250082, + "grad_norm": 0.08556605130434036, + "learning_rate": 1.495132953466287e-05, + "loss": 0.0178, + "step": 94980 + }, + { + "epoch": 0.70215250879631, + "grad_norm": 0.10653487592935562, + "learning_rate": 1.4947619895536564e-05, + "loss": 0.0197, + "step": 94990 + }, + { + "epoch": 0.7022264273676119, + "grad_norm": 0.08999094367027283, + "learning_rate": 1.4943910256410256e-05, + "loss": 0.0155, + "step": 95000 + }, + { + "epoch": 0.7023003459389137, + "grad_norm": 0.08729143440723419, + "learning_rate": 1.494020061728395e-05, + "loss": 0.0179, + "step": 95010 + }, + { + "epoch": 0.7023742645102156, 
+ "grad_norm": 0.07924710214138031, + "learning_rate": 1.4936490978157647e-05, + "loss": 0.0206, + "step": 95020 + }, + { + "epoch": 0.7024481830815174, + "grad_norm": 0.07349123060703278, + "learning_rate": 1.493278133903134e-05, + "loss": 0.0185, + "step": 95030 + }, + { + "epoch": 0.7025221016528193, + "grad_norm": 0.08930125087499619, + "learning_rate": 1.4929071699905033e-05, + "loss": 0.0153, + "step": 95040 + }, + { + "epoch": 0.7025960202241212, + "grad_norm": 0.06959878653287888, + "learning_rate": 1.4925362060778728e-05, + "loss": 0.0183, + "step": 95050 + }, + { + "epoch": 0.7026699387954229, + "grad_norm": 0.06968756020069122, + "learning_rate": 1.4921652421652422e-05, + "loss": 0.0179, + "step": 95060 + }, + { + "epoch": 0.7027438573667248, + "grad_norm": 0.07830172032117844, + "learning_rate": 1.4917942782526118e-05, + "loss": 0.0153, + "step": 95070 + }, + { + "epoch": 0.7028177759380266, + "grad_norm": 0.11358001828193665, + "learning_rate": 1.491423314339981e-05, + "loss": 0.0166, + "step": 95080 + }, + { + "epoch": 0.7028916945093285, + "grad_norm": 0.0876796543598175, + "learning_rate": 1.4910523504273505e-05, + "loss": 0.0185, + "step": 95090 + }, + { + "epoch": 0.7029656130806303, + "grad_norm": 0.0739654153585434, + "learning_rate": 1.4906813865147199e-05, + "loss": 0.0168, + "step": 95100 + }, + { + "epoch": 0.7030395316519322, + "grad_norm": 0.06776399165391922, + "learning_rate": 1.4903104226020895e-05, + "loss": 0.0155, + "step": 95110 + }, + { + "epoch": 0.7031134502232341, + "grad_norm": 0.08771048486232758, + "learning_rate": 1.4899394586894588e-05, + "loss": 0.0163, + "step": 95120 + }, + { + "epoch": 0.7031873687945359, + "grad_norm": 0.07175265997648239, + "learning_rate": 1.4895684947768282e-05, + "loss": 0.0205, + "step": 95130 + }, + { + "epoch": 0.7032612873658378, + "grad_norm": 0.08065624535083771, + "learning_rate": 1.4891975308641976e-05, + "loss": 0.019, + "step": 95140 + }, + { + "epoch": 0.7033352059371396, + "grad_norm": 0.08298975974321365, + "learning_rate": 1.4888265669515669e-05, + "loss": 0.016, + "step": 95150 + }, + { + "epoch": 0.7034091245084415, + "grad_norm": 0.0918532982468605, + "learning_rate": 1.4884556030389366e-05, + "loss": 0.0179, + "step": 95160 + }, + { + "epoch": 0.7034830430797434, + "grad_norm": 0.09895238280296326, + "learning_rate": 1.4880846391263059e-05, + "loss": 0.0182, + "step": 95170 + }, + { + "epoch": 0.7035569616510452, + "grad_norm": 0.06671979278326035, + "learning_rate": 1.4877136752136753e-05, + "loss": 0.0182, + "step": 95180 + }, + { + "epoch": 0.7036308802223471, + "grad_norm": 0.08994031697511673, + "learning_rate": 1.4873427113010446e-05, + "loss": 0.0178, + "step": 95190 + }, + { + "epoch": 0.7037047987936489, + "grad_norm": 0.11199507862329483, + "learning_rate": 1.486971747388414e-05, + "loss": 0.0201, + "step": 95200 + }, + { + "epoch": 0.7037787173649508, + "grad_norm": 0.09525155276060104, + "learning_rate": 1.4866007834757836e-05, + "loss": 0.0172, + "step": 95210 + }, + { + "epoch": 0.7038526359362526, + "grad_norm": 0.06155973672866821, + "learning_rate": 1.486229819563153e-05, + "loss": 0.0168, + "step": 95220 + }, + { + "epoch": 0.7039265545075545, + "grad_norm": 0.07256151735782623, + "learning_rate": 1.4858588556505223e-05, + "loss": 0.0168, + "step": 95230 + }, + { + "epoch": 0.7040004730788564, + "grad_norm": 0.06860183924436569, + "learning_rate": 1.4854878917378917e-05, + "loss": 0.0194, + "step": 95240 + }, + { + "epoch": 0.7040743916501582, + "grad_norm": 0.085331492125988, + 
"learning_rate": 1.4851169278252613e-05, + "loss": 0.0177, + "step": 95250 + }, + { + "epoch": 0.7041483102214601, + "grad_norm": 0.08582129329442978, + "learning_rate": 1.4847459639126307e-05, + "loss": 0.0152, + "step": 95260 + }, + { + "epoch": 0.7042222287927619, + "grad_norm": 0.07819168269634247, + "learning_rate": 1.484375e-05, + "loss": 0.0161, + "step": 95270 + }, + { + "epoch": 0.7042961473640638, + "grad_norm": 0.07004784792661667, + "learning_rate": 1.4840040360873694e-05, + "loss": 0.0194, + "step": 95280 + }, + { + "epoch": 0.7043700659353656, + "grad_norm": 0.09493345767259598, + "learning_rate": 1.4836330721747389e-05, + "loss": 0.0189, + "step": 95290 + }, + { + "epoch": 0.7044439845066675, + "grad_norm": 0.06889653950929642, + "learning_rate": 1.4832621082621084e-05, + "loss": 0.0178, + "step": 95300 + }, + { + "epoch": 0.7045179030779694, + "grad_norm": 0.0887618362903595, + "learning_rate": 1.4828911443494777e-05, + "loss": 0.0185, + "step": 95310 + }, + { + "epoch": 0.7045918216492711, + "grad_norm": 0.071207694709301, + "learning_rate": 1.4825201804368471e-05, + "loss": 0.0178, + "step": 95320 + }, + { + "epoch": 0.704665740220573, + "grad_norm": 0.09956490993499756, + "learning_rate": 1.4821492165242166e-05, + "loss": 0.0156, + "step": 95330 + }, + { + "epoch": 0.7047396587918748, + "grad_norm": 0.09358590841293335, + "learning_rate": 1.4817782526115862e-05, + "loss": 0.0174, + "step": 95340 + }, + { + "epoch": 0.7048135773631767, + "grad_norm": 0.07501141726970673, + "learning_rate": 1.4814072886989556e-05, + "loss": 0.0161, + "step": 95350 + }, + { + "epoch": 0.7048874959344785, + "grad_norm": 0.04576000198721886, + "learning_rate": 1.4810363247863248e-05, + "loss": 0.0171, + "step": 95360 + }, + { + "epoch": 0.7049614145057804, + "grad_norm": 0.06188805401325226, + "learning_rate": 1.4806653608736943e-05, + "loss": 0.0149, + "step": 95370 + }, + { + "epoch": 0.7050353330770823, + "grad_norm": 0.08435463160276413, + "learning_rate": 1.4802943969610635e-05, + "loss": 0.0181, + "step": 95380 + }, + { + "epoch": 0.7051092516483841, + "grad_norm": 0.10611864924430847, + "learning_rate": 1.4799234330484333e-05, + "loss": 0.0191, + "step": 95390 + }, + { + "epoch": 0.705183170219686, + "grad_norm": 0.08419112861156464, + "learning_rate": 1.4795524691358026e-05, + "loss": 0.0175, + "step": 95400 + }, + { + "epoch": 0.7052570887909878, + "grad_norm": 0.09311195462942123, + "learning_rate": 1.479181505223172e-05, + "loss": 0.0174, + "step": 95410 + }, + { + "epoch": 0.7053310073622897, + "grad_norm": 0.0805216059088707, + "learning_rate": 1.4788105413105412e-05, + "loss": 0.018, + "step": 95420 + }, + { + "epoch": 0.7054049259335916, + "grad_norm": 0.07350783795118332, + "learning_rate": 1.4784395773979107e-05, + "loss": 0.0182, + "step": 95430 + }, + { + "epoch": 0.7054788445048934, + "grad_norm": 0.07063649594783783, + "learning_rate": 1.4780686134852803e-05, + "loss": 0.0176, + "step": 95440 + }, + { + "epoch": 0.7055527630761953, + "grad_norm": 0.06820762902498245, + "learning_rate": 1.4776976495726497e-05, + "loss": 0.0176, + "step": 95450 + }, + { + "epoch": 0.7056266816474971, + "grad_norm": 0.07071997225284576, + "learning_rate": 1.477326685660019e-05, + "loss": 0.0172, + "step": 95460 + }, + { + "epoch": 0.705700600218799, + "grad_norm": 0.0830804631114006, + "learning_rate": 1.4769557217473884e-05, + "loss": 0.0161, + "step": 95470 + }, + { + "epoch": 0.7057745187901008, + "grad_norm": 0.09889290481805801, + "learning_rate": 1.476584757834758e-05, + "loss": 
0.0187, + "step": 95480 + }, + { + "epoch": 0.7058484373614027, + "grad_norm": 0.08065763860940933, + "learning_rate": 1.4762137939221274e-05, + "loss": 0.0172, + "step": 95490 + }, + { + "epoch": 0.7059223559327046, + "grad_norm": 0.06085206940770149, + "learning_rate": 1.4758428300094968e-05, + "loss": 0.0151, + "step": 95500 + }, + { + "epoch": 0.7059962745040064, + "grad_norm": 0.07296063005924225, + "learning_rate": 1.475471866096866e-05, + "loss": 0.0191, + "step": 95510 + }, + { + "epoch": 0.7060701930753083, + "grad_norm": 0.06595694273710251, + "learning_rate": 1.4751009021842355e-05, + "loss": 0.0188, + "step": 95520 + }, + { + "epoch": 0.7061441116466101, + "grad_norm": 0.09198103100061417, + "learning_rate": 1.4747299382716051e-05, + "loss": 0.0151, + "step": 95530 + }, + { + "epoch": 0.706218030217912, + "grad_norm": 0.054292045533657074, + "learning_rate": 1.4743589743589745e-05, + "loss": 0.0175, + "step": 95540 + }, + { + "epoch": 0.7062919487892138, + "grad_norm": 0.08294742554426193, + "learning_rate": 1.4739880104463438e-05, + "loss": 0.0178, + "step": 95550 + }, + { + "epoch": 0.7063658673605157, + "grad_norm": 0.07073783874511719, + "learning_rate": 1.4736170465337132e-05, + "loss": 0.0181, + "step": 95560 + }, + { + "epoch": 0.7064397859318176, + "grad_norm": 0.07501115649938583, + "learning_rate": 1.4732460826210828e-05, + "loss": 0.0155, + "step": 95570 + }, + { + "epoch": 0.7065137045031193, + "grad_norm": 0.07367327064275742, + "learning_rate": 1.4728751187084522e-05, + "loss": 0.0175, + "step": 95580 + }, + { + "epoch": 0.7065876230744212, + "grad_norm": 0.08878722041845322, + "learning_rate": 1.4725041547958215e-05, + "loss": 0.0184, + "step": 95590 + }, + { + "epoch": 0.706661541645723, + "grad_norm": 0.07486210018396378, + "learning_rate": 1.472133190883191e-05, + "loss": 0.0172, + "step": 95600 + }, + { + "epoch": 0.7067354602170249, + "grad_norm": 0.0776633620262146, + "learning_rate": 1.4717622269705602e-05, + "loss": 0.0181, + "step": 95610 + }, + { + "epoch": 0.7068093787883267, + "grad_norm": 0.08359479904174805, + "learning_rate": 1.47139126305793e-05, + "loss": 0.0156, + "step": 95620 + }, + { + "epoch": 0.7068832973596286, + "grad_norm": 0.10812906175851822, + "learning_rate": 1.4710202991452992e-05, + "loss": 0.0197, + "step": 95630 + }, + { + "epoch": 0.7069572159309305, + "grad_norm": 0.07726120203733444, + "learning_rate": 1.4706493352326686e-05, + "loss": 0.0178, + "step": 95640 + }, + { + "epoch": 0.7070311345022323, + "grad_norm": 0.0754457488656044, + "learning_rate": 1.470278371320038e-05, + "loss": 0.0184, + "step": 95650 + }, + { + "epoch": 0.7071050530735342, + "grad_norm": 0.11202505975961685, + "learning_rate": 1.4699074074074073e-05, + "loss": 0.0209, + "step": 95660 + }, + { + "epoch": 0.707178971644836, + "grad_norm": 0.0659521147608757, + "learning_rate": 1.469536443494777e-05, + "loss": 0.0176, + "step": 95670 + }, + { + "epoch": 0.7072528902161379, + "grad_norm": 0.05878068879246712, + "learning_rate": 1.4691654795821463e-05, + "loss": 0.0138, + "step": 95680 + }, + { + "epoch": 0.7073268087874398, + "grad_norm": 0.09369215369224548, + "learning_rate": 1.4687945156695158e-05, + "loss": 0.0211, + "step": 95690 + }, + { + "epoch": 0.7074007273587416, + "grad_norm": 0.09781701862812042, + "learning_rate": 1.468423551756885e-05, + "loss": 0.0199, + "step": 95700 + }, + { + "epoch": 0.7074746459300435, + "grad_norm": 0.067824587225914, + "learning_rate": 1.4680525878442546e-05, + "loss": 0.0175, + "step": 95710 + }, + { + "epoch": 
0.7075485645013453, + "grad_norm": 0.06842079013586044, + "learning_rate": 1.467681623931624e-05, + "loss": 0.0183, + "step": 95720 + }, + { + "epoch": 0.7076224830726472, + "grad_norm": 0.09361150115728378, + "learning_rate": 1.4673106600189935e-05, + "loss": 0.0174, + "step": 95730 + }, + { + "epoch": 0.707696401643949, + "grad_norm": 0.0677383691072464, + "learning_rate": 1.4669396961063627e-05, + "loss": 0.0177, + "step": 95740 + }, + { + "epoch": 0.7077703202152509, + "grad_norm": 0.05748044326901436, + "learning_rate": 1.4665687321937322e-05, + "loss": 0.018, + "step": 95750 + }, + { + "epoch": 0.7078442387865528, + "grad_norm": 0.09779369831085205, + "learning_rate": 1.4661977682811018e-05, + "loss": 0.0185, + "step": 95760 + }, + { + "epoch": 0.7079181573578546, + "grad_norm": 0.08810590952634811, + "learning_rate": 1.4658268043684712e-05, + "loss": 0.0192, + "step": 95770 + }, + { + "epoch": 0.7079920759291565, + "grad_norm": 0.11388625204563141, + "learning_rate": 1.4654558404558405e-05, + "loss": 0.0197, + "step": 95780 + }, + { + "epoch": 0.7080659945004583, + "grad_norm": 0.10341662913560867, + "learning_rate": 1.4650848765432099e-05, + "loss": 0.021, + "step": 95790 + }, + { + "epoch": 0.7081399130717602, + "grad_norm": 0.1025422066450119, + "learning_rate": 1.4647139126305795e-05, + "loss": 0.0179, + "step": 95800 + }, + { + "epoch": 0.708213831643062, + "grad_norm": 0.07308562099933624, + "learning_rate": 1.4643429487179489e-05, + "loss": 0.0166, + "step": 95810 + }, + { + "epoch": 0.7082877502143639, + "grad_norm": 0.09482390433549881, + "learning_rate": 1.4639719848053182e-05, + "loss": 0.0141, + "step": 95820 + }, + { + "epoch": 0.7083616687856658, + "grad_norm": 0.07176923751831055, + "learning_rate": 1.4636010208926876e-05, + "loss": 0.0182, + "step": 95830 + }, + { + "epoch": 0.7084355873569675, + "grad_norm": 0.08835030347108841, + "learning_rate": 1.463230056980057e-05, + "loss": 0.0179, + "step": 95840 + }, + { + "epoch": 0.7085095059282694, + "grad_norm": 0.07899758964776993, + "learning_rate": 1.4628590930674266e-05, + "loss": 0.017, + "step": 95850 + }, + { + "epoch": 0.7085834244995712, + "grad_norm": 0.07400030642747879, + "learning_rate": 1.4624881291547959e-05, + "loss": 0.0173, + "step": 95860 + }, + { + "epoch": 0.7086573430708731, + "grad_norm": 0.07127097994089127, + "learning_rate": 1.4621171652421653e-05, + "loss": 0.018, + "step": 95870 + }, + { + "epoch": 0.7087312616421749, + "grad_norm": 0.13907699286937714, + "learning_rate": 1.4617462013295347e-05, + "loss": 0.0206, + "step": 95880 + }, + { + "epoch": 0.7088051802134768, + "grad_norm": 0.07832183688879013, + "learning_rate": 1.461375237416904e-05, + "loss": 0.0166, + "step": 95890 + }, + { + "epoch": 0.7088790987847787, + "grad_norm": 0.06786512583494186, + "learning_rate": 1.4610042735042736e-05, + "loss": 0.0158, + "step": 95900 + }, + { + "epoch": 0.7089530173560805, + "grad_norm": 0.08504045754671097, + "learning_rate": 1.460633309591643e-05, + "loss": 0.0173, + "step": 95910 + }, + { + "epoch": 0.7090269359273824, + "grad_norm": 0.06429767608642578, + "learning_rate": 1.4602623456790124e-05, + "loss": 0.0162, + "step": 95920 + }, + { + "epoch": 0.7091008544986842, + "grad_norm": 0.12872451543807983, + "learning_rate": 1.4598913817663817e-05, + "loss": 0.0183, + "step": 95930 + }, + { + "epoch": 0.7091747730699861, + "grad_norm": 0.07814126461744308, + "learning_rate": 1.4595204178537515e-05, + "loss": 0.0165, + "step": 95940 + }, + { + "epoch": 0.709248691641288, + "grad_norm": 
0.07006963342428207, + "learning_rate": 1.4591494539411207e-05, + "loss": 0.0152, + "step": 95950 + }, + { + "epoch": 0.7093226102125898, + "grad_norm": 0.061831723898649216, + "learning_rate": 1.4587784900284901e-05, + "loss": 0.0163, + "step": 95960 + }, + { + "epoch": 0.7093965287838917, + "grad_norm": 0.1168350800871849, + "learning_rate": 1.4584075261158594e-05, + "loss": 0.0185, + "step": 95970 + }, + { + "epoch": 0.7094704473551935, + "grad_norm": 0.07854729890823364, + "learning_rate": 1.4580365622032288e-05, + "loss": 0.0172, + "step": 95980 + }, + { + "epoch": 0.7095443659264954, + "grad_norm": 0.07101375609636307, + "learning_rate": 1.4576655982905984e-05, + "loss": 0.0188, + "step": 95990 + }, + { + "epoch": 0.7096182844977972, + "grad_norm": 0.07454732060432434, + "learning_rate": 1.4572946343779679e-05, + "loss": 0.0178, + "step": 96000 + }, + { + "epoch": 0.7096922030690991, + "grad_norm": 0.055436041206121445, + "learning_rate": 1.4569236704653371e-05, + "loss": 0.017, + "step": 96010 + }, + { + "epoch": 0.709766121640401, + "grad_norm": 0.08497800678014755, + "learning_rate": 1.4565527065527065e-05, + "loss": 0.0165, + "step": 96020 + }, + { + "epoch": 0.7098400402117028, + "grad_norm": 0.08286574482917786, + "learning_rate": 1.4561817426400761e-05, + "loss": 0.0162, + "step": 96030 + }, + { + "epoch": 0.7099139587830047, + "grad_norm": 0.07201328873634338, + "learning_rate": 1.4558107787274456e-05, + "loss": 0.0163, + "step": 96040 + }, + { + "epoch": 0.7099878773543065, + "grad_norm": 0.07630084455013275, + "learning_rate": 1.4554398148148148e-05, + "loss": 0.0162, + "step": 96050 + }, + { + "epoch": 0.7100617959256084, + "grad_norm": 0.06601142138242722, + "learning_rate": 1.4550688509021842e-05, + "loss": 0.0179, + "step": 96060 + }, + { + "epoch": 0.7101357144969102, + "grad_norm": 0.07810642570257187, + "learning_rate": 1.4546978869895537e-05, + "loss": 0.0198, + "step": 96070 + }, + { + "epoch": 0.710209633068212, + "grad_norm": 0.09582499414682388, + "learning_rate": 1.4543269230769233e-05, + "loss": 0.0167, + "step": 96080 + }, + { + "epoch": 0.710283551639514, + "grad_norm": 0.09497883915901184, + "learning_rate": 1.4539559591642927e-05, + "loss": 0.0189, + "step": 96090 + }, + { + "epoch": 0.7103574702108157, + "grad_norm": 0.06800592690706253, + "learning_rate": 1.453584995251662e-05, + "loss": 0.0179, + "step": 96100 + }, + { + "epoch": 0.7104313887821176, + "grad_norm": 0.07267574220895767, + "learning_rate": 1.4532140313390314e-05, + "loss": 0.0178, + "step": 96110 + }, + { + "epoch": 0.7105053073534194, + "grad_norm": 0.0896543338894844, + "learning_rate": 1.4528430674264006e-05, + "loss": 0.0189, + "step": 96120 + }, + { + "epoch": 0.7105792259247213, + "grad_norm": 0.07230860739946365, + "learning_rate": 1.4524721035137704e-05, + "loss": 0.0149, + "step": 96130 + }, + { + "epoch": 0.7106531444960231, + "grad_norm": 0.07519300282001495, + "learning_rate": 1.4521011396011397e-05, + "loss": 0.0166, + "step": 96140 + }, + { + "epoch": 0.710727063067325, + "grad_norm": 0.07765395939350128, + "learning_rate": 1.4517301756885091e-05, + "loss": 0.0166, + "step": 96150 + }, + { + "epoch": 0.7108009816386269, + "grad_norm": 0.07616063952445984, + "learning_rate": 1.4513592117758783e-05, + "loss": 0.0157, + "step": 96160 + }, + { + "epoch": 0.7108749002099287, + "grad_norm": 0.052234821021556854, + "learning_rate": 1.4509882478632481e-05, + "loss": 0.0188, + "step": 96170 + }, + { + "epoch": 0.7109488187812306, + "grad_norm": 0.07081520557403564, + "learning_rate": 
1.4506172839506174e-05, + "loss": 0.0181, + "step": 96180 + }, + { + "epoch": 0.7110227373525324, + "grad_norm": 0.08544421195983887, + "learning_rate": 1.4502463200379868e-05, + "loss": 0.0196, + "step": 96190 + }, + { + "epoch": 0.7110966559238343, + "grad_norm": 0.08664252609014511, + "learning_rate": 1.449875356125356e-05, + "loss": 0.0155, + "step": 96200 + }, + { + "epoch": 0.7111705744951362, + "grad_norm": 0.08272361755371094, + "learning_rate": 1.4495043922127255e-05, + "loss": 0.0176, + "step": 96210 + }, + { + "epoch": 0.711244493066438, + "grad_norm": 0.08338142931461334, + "learning_rate": 1.449133428300095e-05, + "loss": 0.0179, + "step": 96220 + }, + { + "epoch": 0.7113184116377399, + "grad_norm": 0.08550018072128296, + "learning_rate": 1.4487624643874645e-05, + "loss": 0.0198, + "step": 96230 + }, + { + "epoch": 0.7113923302090417, + "grad_norm": 0.06720562279224396, + "learning_rate": 1.4483915004748338e-05, + "loss": 0.0176, + "step": 96240 + }, + { + "epoch": 0.7114662487803436, + "grad_norm": 0.06509919464588165, + "learning_rate": 1.4480205365622032e-05, + "loss": 0.0177, + "step": 96250 + }, + { + "epoch": 0.7115401673516454, + "grad_norm": 0.11599702388048172, + "learning_rate": 1.4476495726495728e-05, + "loss": 0.0174, + "step": 96260 + }, + { + "epoch": 0.7116140859229473, + "grad_norm": 0.05585482716560364, + "learning_rate": 1.4472786087369422e-05, + "loss": 0.0179, + "step": 96270 + }, + { + "epoch": 0.7116880044942492, + "grad_norm": 0.0665430873632431, + "learning_rate": 1.4469076448243116e-05, + "loss": 0.0175, + "step": 96280 + }, + { + "epoch": 0.711761923065551, + "grad_norm": 0.06822658330202103, + "learning_rate": 1.4465366809116809e-05, + "loss": 0.0168, + "step": 96290 + }, + { + "epoch": 0.7118358416368529, + "grad_norm": 0.10566496849060059, + "learning_rate": 1.4461657169990503e-05, + "loss": 0.0155, + "step": 96300 + }, + { + "epoch": 0.7119097602081547, + "grad_norm": 0.09547118842601776, + "learning_rate": 1.44579475308642e-05, + "loss": 0.0189, + "step": 96310 + }, + { + "epoch": 0.7119836787794566, + "grad_norm": 0.09716902673244476, + "learning_rate": 1.4454237891737894e-05, + "loss": 0.0185, + "step": 96320 + }, + { + "epoch": 0.7120575973507584, + "grad_norm": 0.07762455195188522, + "learning_rate": 1.4450528252611586e-05, + "loss": 0.0188, + "step": 96330 + }, + { + "epoch": 0.7121315159220603, + "grad_norm": 0.07455060631036758, + "learning_rate": 1.444681861348528e-05, + "loss": 0.0161, + "step": 96340 + }, + { + "epoch": 0.7122054344933622, + "grad_norm": 0.08196079730987549, + "learning_rate": 1.4443108974358973e-05, + "loss": 0.0183, + "step": 96350 + }, + { + "epoch": 0.712279353064664, + "grad_norm": 0.08345690369606018, + "learning_rate": 1.443939933523267e-05, + "loss": 0.0198, + "step": 96360 + }, + { + "epoch": 0.7123532716359658, + "grad_norm": 0.10257066786289215, + "learning_rate": 1.4435689696106363e-05, + "loss": 0.0179, + "step": 96370 + }, + { + "epoch": 0.7124271902072676, + "grad_norm": 0.06419511884450912, + "learning_rate": 1.4431980056980057e-05, + "loss": 0.0149, + "step": 96380 + }, + { + "epoch": 0.7125011087785695, + "grad_norm": 0.06590508669614792, + "learning_rate": 1.442827041785375e-05, + "loss": 0.0184, + "step": 96390 + }, + { + "epoch": 0.7125750273498714, + "grad_norm": 0.09475826472043991, + "learning_rate": 1.4424560778727448e-05, + "loss": 0.0171, + "step": 96400 + }, + { + "epoch": 0.7126489459211732, + "grad_norm": 0.08667121827602386, + "learning_rate": 1.442085113960114e-05, + "loss": 0.0184, + 
"step": 96410 + }, + { + "epoch": 0.7127228644924751, + "grad_norm": 0.08919225633144379, + "learning_rate": 1.4417141500474835e-05, + "loss": 0.0192, + "step": 96420 + }, + { + "epoch": 0.7127967830637769, + "grad_norm": 0.07371910661458969, + "learning_rate": 1.4413431861348529e-05, + "loss": 0.0187, + "step": 96430 + }, + { + "epoch": 0.7128707016350788, + "grad_norm": 0.13033123314380646, + "learning_rate": 1.4409722222222221e-05, + "loss": 0.0187, + "step": 96440 + }, + { + "epoch": 0.7129446202063806, + "grad_norm": 0.06924816966056824, + "learning_rate": 1.4406012583095917e-05, + "loss": 0.0157, + "step": 96450 + }, + { + "epoch": 0.7130185387776825, + "grad_norm": 0.07701105624437332, + "learning_rate": 1.4402302943969612e-05, + "loss": 0.0158, + "step": 96460 + }, + { + "epoch": 0.7130924573489844, + "grad_norm": 0.07914204150438309, + "learning_rate": 1.4398593304843306e-05, + "loss": 0.019, + "step": 96470 + }, + { + "epoch": 0.7131663759202862, + "grad_norm": 0.07155069708824158, + "learning_rate": 1.4394883665716999e-05, + "loss": 0.0181, + "step": 96480 + }, + { + "epoch": 0.7132402944915881, + "grad_norm": 0.06861402094364166, + "learning_rate": 1.4391174026590694e-05, + "loss": 0.0167, + "step": 96490 + }, + { + "epoch": 0.7133142130628899, + "grad_norm": 0.09447457641363144, + "learning_rate": 1.4387464387464389e-05, + "loss": 0.0182, + "step": 96500 + }, + { + "epoch": 0.7133881316341918, + "grad_norm": 0.06595803052186966, + "learning_rate": 1.4383754748338083e-05, + "loss": 0.0154, + "step": 96510 + }, + { + "epoch": 0.7134620502054936, + "grad_norm": 0.09120450913906097, + "learning_rate": 1.4380045109211776e-05, + "loss": 0.0184, + "step": 96520 + }, + { + "epoch": 0.7135359687767955, + "grad_norm": 0.07662360370159149, + "learning_rate": 1.437633547008547e-05, + "loss": 0.0185, + "step": 96530 + }, + { + "epoch": 0.7136098873480974, + "grad_norm": 0.09823779761791229, + "learning_rate": 1.4372625830959166e-05, + "loss": 0.0204, + "step": 96540 + }, + { + "epoch": 0.7136838059193992, + "grad_norm": 0.06132419407367706, + "learning_rate": 1.436891619183286e-05, + "loss": 0.0172, + "step": 96550 + }, + { + "epoch": 0.7137577244907011, + "grad_norm": 0.09777303785085678, + "learning_rate": 1.4365206552706553e-05, + "loss": 0.017, + "step": 96560 + }, + { + "epoch": 0.7138316430620029, + "grad_norm": 0.07330077141523361, + "learning_rate": 1.4361496913580247e-05, + "loss": 0.0168, + "step": 96570 + }, + { + "epoch": 0.7139055616333048, + "grad_norm": 0.07575954496860504, + "learning_rate": 1.4357787274453941e-05, + "loss": 0.0191, + "step": 96580 + }, + { + "epoch": 0.7139794802046066, + "grad_norm": 0.0686848908662796, + "learning_rate": 1.4354077635327637e-05, + "loss": 0.0161, + "step": 96590 + }, + { + "epoch": 0.7140533987759085, + "grad_norm": 0.0778437927365303, + "learning_rate": 1.435036799620133e-05, + "loss": 0.0158, + "step": 96600 + }, + { + "epoch": 0.7141273173472104, + "grad_norm": 0.08872485160827637, + "learning_rate": 1.4346658357075024e-05, + "loss": 0.0198, + "step": 96610 + }, + { + "epoch": 0.7142012359185121, + "grad_norm": 0.0839143618941307, + "learning_rate": 1.4342948717948718e-05, + "loss": 0.0168, + "step": 96620 + }, + { + "epoch": 0.714275154489814, + "grad_norm": 0.08605623990297318, + "learning_rate": 1.4339239078822414e-05, + "loss": 0.0174, + "step": 96630 + }, + { + "epoch": 0.7143490730611158, + "grad_norm": 0.07410331070423126, + "learning_rate": 1.4335529439696107e-05, + "loss": 0.0156, + "step": 96640 + }, + { + "epoch": 
0.7144229916324177, + "grad_norm": 0.09648150205612183, + "learning_rate": 1.4331819800569801e-05, + "loss": 0.0164, + "step": 96650 + }, + { + "epoch": 0.7144969102037196, + "grad_norm": 0.09856808930635452, + "learning_rate": 1.4328110161443495e-05, + "loss": 0.019, + "step": 96660 + }, + { + "epoch": 0.7145708287750214, + "grad_norm": 0.08042283356189728, + "learning_rate": 1.4324400522317188e-05, + "loss": 0.0173, + "step": 96670 + }, + { + "epoch": 0.7146447473463233, + "grad_norm": 0.06512616574764252, + "learning_rate": 1.4320690883190884e-05, + "loss": 0.0179, + "step": 96680 + }, + { + "epoch": 0.7147186659176251, + "grad_norm": 0.0875067338347435, + "learning_rate": 1.4316981244064578e-05, + "loss": 0.0182, + "step": 96690 + }, + { + "epoch": 0.714792584488927, + "grad_norm": 0.10024929791688919, + "learning_rate": 1.4313271604938273e-05, + "loss": 0.0189, + "step": 96700 + }, + { + "epoch": 0.7148665030602288, + "grad_norm": 0.09748955070972443, + "learning_rate": 1.4309561965811965e-05, + "loss": 0.0184, + "step": 96710 + }, + { + "epoch": 0.7149404216315307, + "grad_norm": 0.07124093919992447, + "learning_rate": 1.4305852326685663e-05, + "loss": 0.0193, + "step": 96720 + }, + { + "epoch": 0.7150143402028326, + "grad_norm": 0.07453083992004395, + "learning_rate": 1.4302142687559355e-05, + "loss": 0.0187, + "step": 96730 + }, + { + "epoch": 0.7150882587741344, + "grad_norm": 0.08479517698287964, + "learning_rate": 1.429843304843305e-05, + "loss": 0.0182, + "step": 96740 + }, + { + "epoch": 0.7151621773454363, + "grad_norm": 0.08077602833509445, + "learning_rate": 1.4294723409306742e-05, + "loss": 0.0165, + "step": 96750 + }, + { + "epoch": 0.7152360959167381, + "grad_norm": 0.10075265169143677, + "learning_rate": 1.4291013770180436e-05, + "loss": 0.0179, + "step": 96760 + }, + { + "epoch": 0.71531001448804, + "grad_norm": 0.06620938330888748, + "learning_rate": 1.4287304131054132e-05, + "loss": 0.0173, + "step": 96770 + }, + { + "epoch": 0.7153839330593418, + "grad_norm": 0.08356441557407379, + "learning_rate": 1.4283594491927827e-05, + "loss": 0.0168, + "step": 96780 + }, + { + "epoch": 0.7154578516306437, + "grad_norm": 0.06950430572032928, + "learning_rate": 1.427988485280152e-05, + "loss": 0.0164, + "step": 96790 + }, + { + "epoch": 0.7155317702019456, + "grad_norm": 0.10832403600215912, + "learning_rate": 1.4276175213675214e-05, + "loss": 0.0193, + "step": 96800 + }, + { + "epoch": 0.7156056887732474, + "grad_norm": 0.0809079110622406, + "learning_rate": 1.4272465574548908e-05, + "loss": 0.0152, + "step": 96810 + }, + { + "epoch": 0.7156796073445493, + "grad_norm": 0.05348746106028557, + "learning_rate": 1.4268755935422604e-05, + "loss": 0.0186, + "step": 96820 + }, + { + "epoch": 0.7157535259158511, + "grad_norm": 0.1107306033372879, + "learning_rate": 1.4265046296296296e-05, + "loss": 0.0185, + "step": 96830 + }, + { + "epoch": 0.715827444487153, + "grad_norm": 0.06968053430318832, + "learning_rate": 1.426133665716999e-05, + "loss": 0.0187, + "step": 96840 + }, + { + "epoch": 0.7159013630584548, + "grad_norm": 0.07479491084814072, + "learning_rate": 1.4257627018043685e-05, + "loss": 0.0154, + "step": 96850 + }, + { + "epoch": 0.7159752816297567, + "grad_norm": 0.0738830491900444, + "learning_rate": 1.4253917378917381e-05, + "loss": 0.0167, + "step": 96860 + }, + { + "epoch": 0.7160492002010586, + "grad_norm": 0.0806659460067749, + "learning_rate": 1.4250207739791075e-05, + "loss": 0.017, + "step": 96870 + }, + { + "epoch": 0.7161231187723603, + "grad_norm": 
0.0961422547698021, + "learning_rate": 1.4246498100664768e-05, + "loss": 0.0184, + "step": 96880 + }, + { + "epoch": 0.7161970373436622, + "grad_norm": 0.07728654146194458, + "learning_rate": 1.4242788461538462e-05, + "loss": 0.0191, + "step": 96890 + }, + { + "epoch": 0.716270955914964, + "grad_norm": 0.08907140791416168, + "learning_rate": 1.4239078822412155e-05, + "loss": 0.0166, + "step": 96900 + }, + { + "epoch": 0.7163448744862659, + "grad_norm": 0.08190717548131943, + "learning_rate": 1.4235369183285852e-05, + "loss": 0.0178, + "step": 96910 + }, + { + "epoch": 0.7164187930575678, + "grad_norm": 0.07787206023931503, + "learning_rate": 1.4231659544159545e-05, + "loss": 0.0171, + "step": 96920 + }, + { + "epoch": 0.7164927116288696, + "grad_norm": 0.06619174778461456, + "learning_rate": 1.4227949905033239e-05, + "loss": 0.0153, + "step": 96930 + }, + { + "epoch": 0.7165666302001715, + "grad_norm": 0.0749562531709671, + "learning_rate": 1.4224240265906932e-05, + "loss": 0.0188, + "step": 96940 + }, + { + "epoch": 0.7166405487714733, + "grad_norm": 0.09903047978878021, + "learning_rate": 1.422053062678063e-05, + "loss": 0.0176, + "step": 96950 + }, + { + "epoch": 0.7167144673427752, + "grad_norm": 0.09285420179367065, + "learning_rate": 1.4216820987654322e-05, + "loss": 0.0189, + "step": 96960 + }, + { + "epoch": 0.716788385914077, + "grad_norm": 0.0756186917424202, + "learning_rate": 1.4213111348528016e-05, + "loss": 0.0204, + "step": 96970 + }, + { + "epoch": 0.7168623044853789, + "grad_norm": 0.08044464141130447, + "learning_rate": 1.4209401709401709e-05, + "loss": 0.0161, + "step": 96980 + }, + { + "epoch": 0.7169362230566808, + "grad_norm": 0.0663600042462349, + "learning_rate": 1.4205692070275403e-05, + "loss": 0.0149, + "step": 96990 + }, + { + "epoch": 0.7170101416279826, + "grad_norm": 0.08343540877103806, + "learning_rate": 1.4201982431149099e-05, + "loss": 0.0169, + "step": 97000 + }, + { + "epoch": 0.7170840601992845, + "grad_norm": 0.08704724907875061, + "learning_rate": 1.4198272792022793e-05, + "loss": 0.0169, + "step": 97010 + }, + { + "epoch": 0.7171579787705863, + "grad_norm": 0.10420592129230499, + "learning_rate": 1.4194563152896488e-05, + "loss": 0.015, + "step": 97020 + }, + { + "epoch": 0.7172318973418882, + "grad_norm": 0.08785660564899445, + "learning_rate": 1.419085351377018e-05, + "loss": 0.0155, + "step": 97030 + }, + { + "epoch": 0.71730581591319, + "grad_norm": 0.084081269800663, + "learning_rate": 1.4187143874643874e-05, + "loss": 0.0153, + "step": 97040 + }, + { + "epoch": 0.7173797344844919, + "grad_norm": 0.07698749750852585, + "learning_rate": 1.418343423551757e-05, + "loss": 0.0182, + "step": 97050 + }, + { + "epoch": 0.7174536530557938, + "grad_norm": 0.06612184643745422, + "learning_rate": 1.4179724596391265e-05, + "loss": 0.0192, + "step": 97060 + }, + { + "epoch": 0.7175275716270956, + "grad_norm": 0.06976378709077835, + "learning_rate": 1.4176014957264957e-05, + "loss": 0.016, + "step": 97070 + }, + { + "epoch": 0.7176014901983975, + "grad_norm": 0.07119813561439514, + "learning_rate": 1.4172305318138652e-05, + "loss": 0.0179, + "step": 97080 + }, + { + "epoch": 0.7176754087696993, + "grad_norm": 0.0721423551440239, + "learning_rate": 1.4168595679012347e-05, + "loss": 0.0175, + "step": 97090 + }, + { + "epoch": 0.7177493273410012, + "grad_norm": 0.06006765365600586, + "learning_rate": 1.4164886039886042e-05, + "loss": 0.0173, + "step": 97100 + }, + { + "epoch": 0.717823245912303, + "grad_norm": 0.07218063622713089, + "learning_rate": 
1.4161176400759734e-05, + "loss": 0.0179, + "step": 97110 + }, + { + "epoch": 0.7178971644836049, + "grad_norm": 0.06732066720724106, + "learning_rate": 1.4157466761633429e-05, + "loss": 0.0174, + "step": 97120 + }, + { + "epoch": 0.7179710830549068, + "grad_norm": 0.05910402163863182, + "learning_rate": 1.4153757122507121e-05, + "loss": 0.0173, + "step": 97130 + }, + { + "epoch": 0.7180450016262085, + "grad_norm": 0.12241929024457932, + "learning_rate": 1.4150047483380819e-05, + "loss": 0.0175, + "step": 97140 + }, + { + "epoch": 0.7181189201975104, + "grad_norm": 0.06028750538825989, + "learning_rate": 1.4146337844254511e-05, + "loss": 0.0197, + "step": 97150 + }, + { + "epoch": 0.7181928387688122, + "grad_norm": 0.061230018734931946, + "learning_rate": 1.4142628205128206e-05, + "loss": 0.015, + "step": 97160 + }, + { + "epoch": 0.7182667573401141, + "grad_norm": 0.09459489583969116, + "learning_rate": 1.41389185660019e-05, + "loss": 0.0184, + "step": 97170 + }, + { + "epoch": 0.718340675911416, + "grad_norm": 0.06572078168392181, + "learning_rate": 1.4135208926875596e-05, + "loss": 0.016, + "step": 97180 + }, + { + "epoch": 0.7184145944827178, + "grad_norm": 0.09846540540456772, + "learning_rate": 1.4131499287749289e-05, + "loss": 0.017, + "step": 97190 + }, + { + "epoch": 0.7184885130540197, + "grad_norm": 0.06211007758975029, + "learning_rate": 1.4127789648622983e-05, + "loss": 0.0171, + "step": 97200 + }, + { + "epoch": 0.7185624316253215, + "grad_norm": 0.0772789940237999, + "learning_rate": 1.4124080009496677e-05, + "loss": 0.0152, + "step": 97210 + }, + { + "epoch": 0.7186363501966234, + "grad_norm": 0.08457238227128983, + "learning_rate": 1.412037037037037e-05, + "loss": 0.0158, + "step": 97220 + }, + { + "epoch": 0.7187102687679252, + "grad_norm": 0.06557153910398483, + "learning_rate": 1.4116660731244066e-05, + "loss": 0.017, + "step": 97230 + }, + { + "epoch": 0.7187841873392271, + "grad_norm": 0.0879523754119873, + "learning_rate": 1.411295109211776e-05, + "loss": 0.0179, + "step": 97240 + }, + { + "epoch": 0.718858105910529, + "grad_norm": 0.058569859713315964, + "learning_rate": 1.4109241452991454e-05, + "loss": 0.016, + "step": 97250 + }, + { + "epoch": 0.7189320244818308, + "grad_norm": 0.09156788885593414, + "learning_rate": 1.4105531813865147e-05, + "loss": 0.0185, + "step": 97260 + }, + { + "epoch": 0.7190059430531327, + "grad_norm": 0.0831763818860054, + "learning_rate": 1.4101822174738841e-05, + "loss": 0.0174, + "step": 97270 + }, + { + "epoch": 0.7190798616244345, + "grad_norm": 0.08405617624521255, + "learning_rate": 1.4098112535612537e-05, + "loss": 0.018, + "step": 97280 + }, + { + "epoch": 0.7191537801957364, + "grad_norm": 0.06453635543584824, + "learning_rate": 1.4094402896486231e-05, + "loss": 0.0183, + "step": 97290 + }, + { + "epoch": 0.7192276987670382, + "grad_norm": 0.08332324028015137, + "learning_rate": 1.4090693257359924e-05, + "loss": 0.0166, + "step": 97300 + }, + { + "epoch": 0.7193016173383401, + "grad_norm": 0.07621225714683533, + "learning_rate": 1.4086983618233618e-05, + "loss": 0.0166, + "step": 97310 + }, + { + "epoch": 0.719375535909642, + "grad_norm": 0.0854366272687912, + "learning_rate": 1.4083273979107314e-05, + "loss": 0.0169, + "step": 97320 + }, + { + "epoch": 0.7194494544809438, + "grad_norm": 0.08591848611831665, + "learning_rate": 1.4079564339981008e-05, + "loss": 0.0178, + "step": 97330 + }, + { + "epoch": 0.7195233730522457, + "grad_norm": 0.10342957079410553, + "learning_rate": 1.4075854700854701e-05, + "loss": 0.0176, + 
"step": 97340 + }, + { + "epoch": 0.7195972916235475, + "grad_norm": 0.08080706000328064, + "learning_rate": 1.4072145061728395e-05, + "loss": 0.0174, + "step": 97350 + }, + { + "epoch": 0.7196712101948494, + "grad_norm": 0.06645863503217697, + "learning_rate": 1.406843542260209e-05, + "loss": 0.0161, + "step": 97360 + }, + { + "epoch": 0.7197451287661512, + "grad_norm": 0.0648173987865448, + "learning_rate": 1.4064725783475785e-05, + "loss": 0.0199, + "step": 97370 + }, + { + "epoch": 0.719819047337453, + "grad_norm": 0.10382978618144989, + "learning_rate": 1.4061016144349478e-05, + "loss": 0.0173, + "step": 97380 + }, + { + "epoch": 0.719892965908755, + "grad_norm": 0.07804905623197556, + "learning_rate": 1.4057306505223172e-05, + "loss": 0.0194, + "step": 97390 + }, + { + "epoch": 0.7199668844800567, + "grad_norm": 0.06643490493297577, + "learning_rate": 1.4053596866096867e-05, + "loss": 0.0168, + "step": 97400 + }, + { + "epoch": 0.7200408030513586, + "grad_norm": 0.0706179067492485, + "learning_rate": 1.4049887226970563e-05, + "loss": 0.0181, + "step": 97410 + }, + { + "epoch": 0.7201147216226604, + "grad_norm": 0.056075792759656906, + "learning_rate": 1.4046177587844255e-05, + "loss": 0.0179, + "step": 97420 + }, + { + "epoch": 0.7201886401939623, + "grad_norm": 0.0818905308842659, + "learning_rate": 1.404246794871795e-05, + "loss": 0.0167, + "step": 97430 + }, + { + "epoch": 0.7202625587652642, + "grad_norm": 0.06715161353349686, + "learning_rate": 1.4038758309591644e-05, + "loss": 0.0185, + "step": 97440 + }, + { + "epoch": 0.720336477336566, + "grad_norm": 0.0666624903678894, + "learning_rate": 1.4035048670465336e-05, + "loss": 0.0188, + "step": 97450 + }, + { + "epoch": 0.7204103959078679, + "grad_norm": 0.05913471058011055, + "learning_rate": 1.4031339031339034e-05, + "loss": 0.0175, + "step": 97460 + }, + { + "epoch": 0.7204843144791697, + "grad_norm": 0.057986270636320114, + "learning_rate": 1.4027629392212726e-05, + "loss": 0.0173, + "step": 97470 + }, + { + "epoch": 0.7205582330504716, + "grad_norm": 0.09989604353904724, + "learning_rate": 1.402391975308642e-05, + "loss": 0.0155, + "step": 97480 + }, + { + "epoch": 0.7206321516217734, + "grad_norm": 0.10672522336244583, + "learning_rate": 1.4020210113960113e-05, + "loss": 0.0167, + "step": 97490 + }, + { + "epoch": 0.7207060701930753, + "grad_norm": 0.07857657223939896, + "learning_rate": 1.4016500474833808e-05, + "loss": 0.0185, + "step": 97500 + }, + { + "epoch": 0.7207799887643772, + "grad_norm": 0.08305259793996811, + "learning_rate": 1.4012790835707504e-05, + "loss": 0.0181, + "step": 97510 + }, + { + "epoch": 0.720853907335679, + "grad_norm": 0.06638690084218979, + "learning_rate": 1.4009081196581198e-05, + "loss": 0.0154, + "step": 97520 + }, + { + "epoch": 0.7209278259069809, + "grad_norm": 0.07285010069608688, + "learning_rate": 1.400537155745489e-05, + "loss": 0.0167, + "step": 97530 + }, + { + "epoch": 0.7210017444782827, + "grad_norm": 0.07371450960636139, + "learning_rate": 1.4001661918328585e-05, + "loss": 0.016, + "step": 97540 + }, + { + "epoch": 0.7210756630495846, + "grad_norm": 0.07550600171089172, + "learning_rate": 1.399795227920228e-05, + "loss": 0.0158, + "step": 97550 + }, + { + "epoch": 0.7211495816208864, + "grad_norm": 0.09500684589147568, + "learning_rate": 1.3994242640075975e-05, + "loss": 0.0177, + "step": 97560 + }, + { + "epoch": 0.7212235001921883, + "grad_norm": 0.061078622937202454, + "learning_rate": 1.3990533000949667e-05, + "loss": 0.0185, + "step": 97570 + }, + { + "epoch": 
0.7212974187634902, + "grad_norm": 0.07436969131231308, + "learning_rate": 1.3986823361823362e-05, + "loss": 0.0152, + "step": 97580 + }, + { + "epoch": 0.721371337334792, + "grad_norm": 0.08949553966522217, + "learning_rate": 1.3983113722697056e-05, + "loss": 0.0186, + "step": 97590 + }, + { + "epoch": 0.7214452559060939, + "grad_norm": 0.07141807675361633, + "learning_rate": 1.3979404083570752e-05, + "loss": 0.0146, + "step": 97600 + }, + { + "epoch": 0.7215191744773957, + "grad_norm": 0.095971018075943, + "learning_rate": 1.3975694444444445e-05, + "loss": 0.0189, + "step": 97610 + }, + { + "epoch": 0.7215930930486976, + "grad_norm": 0.0835772380232811, + "learning_rate": 1.3971984805318139e-05, + "loss": 0.0163, + "step": 97620 + }, + { + "epoch": 0.7216670116199994, + "grad_norm": 0.11169163882732391, + "learning_rate": 1.3968275166191833e-05, + "loss": 0.0192, + "step": 97630 + }, + { + "epoch": 0.7217409301913013, + "grad_norm": 0.10236402601003647, + "learning_rate": 1.3964565527065529e-05, + "loss": 0.0194, + "step": 97640 + }, + { + "epoch": 0.7218148487626032, + "grad_norm": 0.07134838402271271, + "learning_rate": 1.3960855887939223e-05, + "loss": 0.0176, + "step": 97650 + }, + { + "epoch": 0.721888767333905, + "grad_norm": 0.07170452922582626, + "learning_rate": 1.3957146248812916e-05, + "loss": 0.0161, + "step": 97660 + }, + { + "epoch": 0.7219626859052068, + "grad_norm": 0.09556709975004196, + "learning_rate": 1.395343660968661e-05, + "loss": 0.0182, + "step": 97670 + }, + { + "epoch": 0.7220366044765086, + "grad_norm": 0.08802442252635956, + "learning_rate": 1.3949726970560303e-05, + "loss": 0.0172, + "step": 97680 + }, + { + "epoch": 0.7221105230478105, + "grad_norm": 0.07534075528383255, + "learning_rate": 1.3946017331434e-05, + "loss": 0.0179, + "step": 97690 + }, + { + "epoch": 0.7221844416191124, + "grad_norm": 0.07446454465389252, + "learning_rate": 1.3942307692307693e-05, + "loss": 0.0162, + "step": 97700 + }, + { + "epoch": 0.7222583601904142, + "grad_norm": 0.07789715379476547, + "learning_rate": 1.3938598053181387e-05, + "loss": 0.0183, + "step": 97710 + }, + { + "epoch": 0.7223322787617161, + "grad_norm": 0.08578293025493622, + "learning_rate": 1.393488841405508e-05, + "loss": 0.0174, + "step": 97720 + }, + { + "epoch": 0.7224061973330179, + "grad_norm": 0.08506274223327637, + "learning_rate": 1.3931178774928774e-05, + "loss": 0.0184, + "step": 97730 + }, + { + "epoch": 0.7224801159043198, + "grad_norm": 0.058572497218847275, + "learning_rate": 1.392746913580247e-05, + "loss": 0.0171, + "step": 97740 + }, + { + "epoch": 0.7225540344756216, + "grad_norm": 0.052253492176532745, + "learning_rate": 1.3923759496676164e-05, + "loss": 0.0178, + "step": 97750 + }, + { + "epoch": 0.7226279530469235, + "grad_norm": 0.09432969242334366, + "learning_rate": 1.3920049857549857e-05, + "loss": 0.0155, + "step": 97760 + }, + { + "epoch": 0.7227018716182254, + "grad_norm": 0.0906846895813942, + "learning_rate": 1.3916340218423551e-05, + "loss": 0.0194, + "step": 97770 + }, + { + "epoch": 0.7227757901895272, + "grad_norm": 0.05747510865330696, + "learning_rate": 1.3912630579297247e-05, + "loss": 0.018, + "step": 97780 + }, + { + "epoch": 0.7228497087608291, + "grad_norm": 0.05860042944550514, + "learning_rate": 1.3908920940170941e-05, + "loss": 0.0154, + "step": 97790 + }, + { + "epoch": 0.7229236273321309, + "grad_norm": 0.06754317134618759, + "learning_rate": 1.3905211301044636e-05, + "loss": 0.0205, + "step": 97800 + }, + { + "epoch": 0.7229975459034328, + "grad_norm": 
0.07940924167633057, + "learning_rate": 1.3901501661918328e-05, + "loss": 0.0194, + "step": 97810 + }, + { + "epoch": 0.7230714644747346, + "grad_norm": 0.0894060730934143, + "learning_rate": 1.3897792022792023e-05, + "loss": 0.0174, + "step": 97820 + }, + { + "epoch": 0.7231453830460365, + "grad_norm": 0.10713989287614822, + "learning_rate": 1.3894082383665719e-05, + "loss": 0.0174, + "step": 97830 + }, + { + "epoch": 0.7232193016173384, + "grad_norm": 0.09438996016979218, + "learning_rate": 1.3890372744539413e-05, + "loss": 0.015, + "step": 97840 + }, + { + "epoch": 0.7232932201886402, + "grad_norm": 0.09098189324140549, + "learning_rate": 1.3886663105413105e-05, + "loss": 0.0186, + "step": 97850 + }, + { + "epoch": 0.7233671387599421, + "grad_norm": 0.06245177239179611, + "learning_rate": 1.38829534662868e-05, + "loss": 0.0162, + "step": 97860 + }, + { + "epoch": 0.7234410573312439, + "grad_norm": 0.06434362381696701, + "learning_rate": 1.3879243827160496e-05, + "loss": 0.0173, + "step": 97870 + }, + { + "epoch": 0.7235149759025458, + "grad_norm": 0.10157263278961182, + "learning_rate": 1.387553418803419e-05, + "loss": 0.0206, + "step": 97880 + }, + { + "epoch": 0.7235888944738476, + "grad_norm": 0.07744581252336502, + "learning_rate": 1.3871824548907883e-05, + "loss": 0.0176, + "step": 97890 + }, + { + "epoch": 0.7236628130451495, + "grad_norm": 0.07699330151081085, + "learning_rate": 1.3868114909781577e-05, + "loss": 0.015, + "step": 97900 + }, + { + "epoch": 0.7237367316164514, + "grad_norm": 0.09393465518951416, + "learning_rate": 1.386440527065527e-05, + "loss": 0.0145, + "step": 97910 + }, + { + "epoch": 0.7238106501877531, + "grad_norm": 0.1100480780005455, + "learning_rate": 1.3860695631528967e-05, + "loss": 0.0178, + "step": 97920 + }, + { + "epoch": 0.723884568759055, + "grad_norm": 0.08318780362606049, + "learning_rate": 1.385698599240266e-05, + "loss": 0.0164, + "step": 97930 + }, + { + "epoch": 0.7239584873303568, + "grad_norm": 0.09401154518127441, + "learning_rate": 1.3853276353276354e-05, + "loss": 0.0172, + "step": 97940 + }, + { + "epoch": 0.7240324059016587, + "grad_norm": 0.05973022058606148, + "learning_rate": 1.3849566714150048e-05, + "loss": 0.0158, + "step": 97950 + }, + { + "epoch": 0.7241063244729606, + "grad_norm": 0.09328263252973557, + "learning_rate": 1.3845857075023744e-05, + "loss": 0.0177, + "step": 97960 + }, + { + "epoch": 0.7241802430442624, + "grad_norm": 0.08581507951021194, + "learning_rate": 1.3842147435897437e-05, + "loss": 0.0179, + "step": 97970 + }, + { + "epoch": 0.7242541616155643, + "grad_norm": 0.08177319914102554, + "learning_rate": 1.3838437796771131e-05, + "loss": 0.0188, + "step": 97980 + }, + { + "epoch": 0.7243280801868661, + "grad_norm": 0.07062500715255737, + "learning_rate": 1.3834728157644825e-05, + "loss": 0.0152, + "step": 97990 + }, + { + "epoch": 0.724401998758168, + "grad_norm": 0.06682254374027252, + "learning_rate": 1.3831018518518518e-05, + "loss": 0.0189, + "step": 98000 + }, + { + "epoch": 0.7244759173294698, + "grad_norm": 0.09271026402711868, + "learning_rate": 1.3827308879392214e-05, + "loss": 0.0164, + "step": 98010 + }, + { + "epoch": 0.7245498359007717, + "grad_norm": 0.0880376547574997, + "learning_rate": 1.3823599240265908e-05, + "loss": 0.0169, + "step": 98020 + }, + { + "epoch": 0.7246237544720736, + "grad_norm": 0.06539382040500641, + "learning_rate": 1.3819889601139602e-05, + "loss": 0.0167, + "step": 98030 + }, + { + "epoch": 0.7246976730433754, + "grad_norm": 0.0701875388622284, + "learning_rate": 
1.3816179962013295e-05, + "loss": 0.0183, + "step": 98040 + }, + { + "epoch": 0.7247715916146773, + "grad_norm": 0.1255258321762085, + "learning_rate": 1.381247032288699e-05, + "loss": 0.0179, + "step": 98050 + }, + { + "epoch": 0.7248455101859791, + "grad_norm": 0.09884500503540039, + "learning_rate": 1.3808760683760685e-05, + "loss": 0.0185, + "step": 98060 + }, + { + "epoch": 0.724919428757281, + "grad_norm": 0.061767254024744034, + "learning_rate": 1.380505104463438e-05, + "loss": 0.0167, + "step": 98070 + }, + { + "epoch": 0.7249933473285828, + "grad_norm": 0.07481275498867035, + "learning_rate": 1.3801341405508072e-05, + "loss": 0.0157, + "step": 98080 + }, + { + "epoch": 0.7250672658998847, + "grad_norm": 0.07772032916545868, + "learning_rate": 1.3797631766381766e-05, + "loss": 0.0175, + "step": 98090 + }, + { + "epoch": 0.7251411844711866, + "grad_norm": 0.08376848697662354, + "learning_rate": 1.3793922127255462e-05, + "loss": 0.017, + "step": 98100 + }, + { + "epoch": 0.7252151030424884, + "grad_norm": 0.07278087735176086, + "learning_rate": 1.3790212488129157e-05, + "loss": 0.0165, + "step": 98110 + }, + { + "epoch": 0.7252890216137903, + "grad_norm": 0.0768652856349945, + "learning_rate": 1.3786502849002849e-05, + "loss": 0.0156, + "step": 98120 + }, + { + "epoch": 0.7253629401850921, + "grad_norm": 0.08204212039709091, + "learning_rate": 1.3782793209876543e-05, + "loss": 0.0193, + "step": 98130 + }, + { + "epoch": 0.725436858756394, + "grad_norm": 0.07756412774324417, + "learning_rate": 1.3779083570750238e-05, + "loss": 0.0173, + "step": 98140 + }, + { + "epoch": 0.7255107773276958, + "grad_norm": 0.09154937416315079, + "learning_rate": 1.3775373931623934e-05, + "loss": 0.0184, + "step": 98150 + }, + { + "epoch": 0.7255846958989977, + "grad_norm": 0.06443267315626144, + "learning_rate": 1.3771664292497626e-05, + "loss": 0.0194, + "step": 98160 + }, + { + "epoch": 0.7256586144702996, + "grad_norm": 0.07392708957195282, + "learning_rate": 1.376795465337132e-05, + "loss": 0.014, + "step": 98170 + }, + { + "epoch": 0.7257325330416013, + "grad_norm": 0.06653968244791031, + "learning_rate": 1.3764245014245015e-05, + "loss": 0.0175, + "step": 98180 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 0.0918004959821701, + "learning_rate": 1.376053537511871e-05, + "loss": 0.0177, + "step": 98190 + }, + { + "epoch": 0.725880370184205, + "grad_norm": 0.08654023706912994, + "learning_rate": 1.3756825735992403e-05, + "loss": 0.0187, + "step": 98200 + }, + { + "epoch": 0.7259542887555069, + "grad_norm": 0.08185164630413055, + "learning_rate": 1.3753116096866098e-05, + "loss": 0.018, + "step": 98210 + }, + { + "epoch": 0.7260282073268088, + "grad_norm": 0.06029609963297844, + "learning_rate": 1.3749406457739792e-05, + "loss": 0.018, + "step": 98220 + }, + { + "epoch": 0.7261021258981106, + "grad_norm": 0.058451976627111435, + "learning_rate": 1.3745696818613484e-05, + "loss": 0.0168, + "step": 98230 + }, + { + "epoch": 0.7261760444694125, + "grad_norm": 0.09028119593858719, + "learning_rate": 1.3741987179487182e-05, + "loss": 0.0163, + "step": 98240 + }, + { + "epoch": 0.7262499630407143, + "grad_norm": 0.072898730635643, + "learning_rate": 1.3738277540360875e-05, + "loss": 0.014, + "step": 98250 + }, + { + "epoch": 0.7263238816120162, + "grad_norm": 0.0665789544582367, + "learning_rate": 1.3734567901234569e-05, + "loss": 0.0192, + "step": 98260 + }, + { + "epoch": 0.726397800183318, + "grad_norm": 0.0644986480474472, + "learning_rate": 1.3730858262108262e-05, + "loss": 0.0168, + "step": 
98270 + }, + { + "epoch": 0.7264717187546199, + "grad_norm": 0.076494500041008, + "learning_rate": 1.3727148622981956e-05, + "loss": 0.0156, + "step": 98280 + }, + { + "epoch": 0.7265456373259218, + "grad_norm": 0.07643983513116837, + "learning_rate": 1.3723438983855652e-05, + "loss": 0.0173, + "step": 98290 + }, + { + "epoch": 0.7266195558972236, + "grad_norm": 0.0882069543004036, + "learning_rate": 1.3719729344729346e-05, + "loss": 0.0176, + "step": 98300 + }, + { + "epoch": 0.7266934744685255, + "grad_norm": 0.0776149332523346, + "learning_rate": 1.3716019705603039e-05, + "loss": 0.017, + "step": 98310 + }, + { + "epoch": 0.7267673930398273, + "grad_norm": 0.09404109418392181, + "learning_rate": 1.3712310066476733e-05, + "loss": 0.0178, + "step": 98320 + }, + { + "epoch": 0.7268413116111292, + "grad_norm": 0.08796057850122452, + "learning_rate": 1.3708600427350429e-05, + "loss": 0.0177, + "step": 98330 + }, + { + "epoch": 0.726915230182431, + "grad_norm": 0.11046618968248367, + "learning_rate": 1.3704890788224123e-05, + "loss": 0.0177, + "step": 98340 + }, + { + "epoch": 0.7269891487537329, + "grad_norm": 0.06461908668279648, + "learning_rate": 1.3701181149097816e-05, + "loss": 0.0168, + "step": 98350 + }, + { + "epoch": 0.7270630673250348, + "grad_norm": 0.1039423793554306, + "learning_rate": 1.369747150997151e-05, + "loss": 0.0182, + "step": 98360 + }, + { + "epoch": 0.7271369858963366, + "grad_norm": 0.07331787049770355, + "learning_rate": 1.3693761870845204e-05, + "loss": 0.0163, + "step": 98370 + }, + { + "epoch": 0.7272109044676385, + "grad_norm": 0.08373088389635086, + "learning_rate": 1.36900522317189e-05, + "loss": 0.02, + "step": 98380 + }, + { + "epoch": 0.7272848230389403, + "grad_norm": 0.08613729476928711, + "learning_rate": 1.3686342592592594e-05, + "loss": 0.018, + "step": 98390 + }, + { + "epoch": 0.7273587416102422, + "grad_norm": 0.07876535505056381, + "learning_rate": 1.3682632953466287e-05, + "loss": 0.0194, + "step": 98400 + }, + { + "epoch": 0.7274326601815441, + "grad_norm": 0.06077132746577263, + "learning_rate": 1.3678923314339981e-05, + "loss": 0.0164, + "step": 98410 + }, + { + "epoch": 0.7275065787528459, + "grad_norm": 0.1049044132232666, + "learning_rate": 1.3675213675213677e-05, + "loss": 0.0188, + "step": 98420 + }, + { + "epoch": 0.7275804973241478, + "grad_norm": 0.08487299084663391, + "learning_rate": 1.3671504036087372e-05, + "loss": 0.0187, + "step": 98430 + }, + { + "epoch": 0.7276544158954495, + "grad_norm": 0.07457312941551208, + "learning_rate": 1.3667794396961064e-05, + "loss": 0.0193, + "step": 98440 + }, + { + "epoch": 0.7277283344667514, + "grad_norm": 0.06628565490245819, + "learning_rate": 1.3664084757834758e-05, + "loss": 0.0202, + "step": 98450 + }, + { + "epoch": 0.7278022530380532, + "grad_norm": 0.08865582942962646, + "learning_rate": 1.3660375118708451e-05, + "loss": 0.0167, + "step": 98460 + }, + { + "epoch": 0.7278761716093551, + "grad_norm": 0.08476138114929199, + "learning_rate": 1.3656665479582149e-05, + "loss": 0.018, + "step": 98470 + }, + { + "epoch": 0.727950090180657, + "grad_norm": 0.09250761568546295, + "learning_rate": 1.3652955840455841e-05, + "loss": 0.0165, + "step": 98480 + }, + { + "epoch": 0.7280240087519588, + "grad_norm": 0.10515966266393661, + "learning_rate": 1.3649246201329536e-05, + "loss": 0.0159, + "step": 98490 + }, + { + "epoch": 0.7280979273232607, + "grad_norm": 0.06855718046426773, + "learning_rate": 1.3645536562203228e-05, + "loss": 0.017, + "step": 98500 + }, + { + "epoch": 0.7281718458945625, + 
"grad_norm": 0.06158420443534851, + "learning_rate": 1.3641826923076922e-05, + "loss": 0.016, + "step": 98510 + }, + { + "epoch": 0.7282457644658644, + "grad_norm": 0.08707701414823532, + "learning_rate": 1.3638117283950618e-05, + "loss": 0.016, + "step": 98520 + }, + { + "epoch": 0.7283196830371662, + "grad_norm": 0.09738589823246002, + "learning_rate": 1.3634407644824313e-05, + "loss": 0.0178, + "step": 98530 + }, + { + "epoch": 0.7283936016084681, + "grad_norm": 0.0779692679643631, + "learning_rate": 1.3630698005698005e-05, + "loss": 0.0176, + "step": 98540 + }, + { + "epoch": 0.72846752017977, + "grad_norm": 0.08245556056499481, + "learning_rate": 1.36269883665717e-05, + "loss": 0.0166, + "step": 98550 + }, + { + "epoch": 0.7285414387510718, + "grad_norm": 0.06367962062358856, + "learning_rate": 1.3623278727445395e-05, + "loss": 0.0165, + "step": 98560 + }, + { + "epoch": 0.7286153573223737, + "grad_norm": 0.10788046568632126, + "learning_rate": 1.361956908831909e-05, + "loss": 0.0184, + "step": 98570 + }, + { + "epoch": 0.7286892758936755, + "grad_norm": 0.06675861030817032, + "learning_rate": 1.3615859449192784e-05, + "loss": 0.0149, + "step": 98580 + }, + { + "epoch": 0.7287631944649774, + "grad_norm": 0.055531568825244904, + "learning_rate": 1.3612149810066477e-05, + "loss": 0.0156, + "step": 98590 + }, + { + "epoch": 0.7288371130362792, + "grad_norm": 0.060155775398015976, + "learning_rate": 1.360844017094017e-05, + "loss": 0.0151, + "step": 98600 + }, + { + "epoch": 0.7289110316075811, + "grad_norm": 0.1119222342967987, + "learning_rate": 1.3604730531813867e-05, + "loss": 0.0177, + "step": 98610 + }, + { + "epoch": 0.728984950178883, + "grad_norm": 0.10965950042009354, + "learning_rate": 1.3601020892687561e-05, + "loss": 0.018, + "step": 98620 + }, + { + "epoch": 0.7290588687501848, + "grad_norm": 0.08716337382793427, + "learning_rate": 1.3597311253561254e-05, + "loss": 0.0162, + "step": 98630 + }, + { + "epoch": 0.7291327873214867, + "grad_norm": 0.11401829868555069, + "learning_rate": 1.3593601614434948e-05, + "loss": 0.0173, + "step": 98640 + }, + { + "epoch": 0.7292067058927885, + "grad_norm": 0.07196416705846786, + "learning_rate": 1.3589891975308644e-05, + "loss": 0.018, + "step": 98650 + }, + { + "epoch": 0.7292806244640904, + "grad_norm": 0.0806848555803299, + "learning_rate": 1.3586182336182338e-05, + "loss": 0.0196, + "step": 98660 + }, + { + "epoch": 0.7293545430353923, + "grad_norm": 0.08177975565195084, + "learning_rate": 1.358247269705603e-05, + "loss": 0.0179, + "step": 98670 + }, + { + "epoch": 0.7294284616066941, + "grad_norm": 0.07048134505748749, + "learning_rate": 1.3578763057929725e-05, + "loss": 0.0192, + "step": 98680 + }, + { + "epoch": 0.729502380177996, + "grad_norm": 0.07914569973945618, + "learning_rate": 1.3575053418803418e-05, + "loss": 0.0176, + "step": 98690 + }, + { + "epoch": 0.7295762987492977, + "grad_norm": 0.07468180358409882, + "learning_rate": 1.3571343779677115e-05, + "loss": 0.0159, + "step": 98700 + }, + { + "epoch": 0.7296502173205996, + "grad_norm": 0.07458420842885971, + "learning_rate": 1.3567634140550808e-05, + "loss": 0.021, + "step": 98710 + }, + { + "epoch": 0.7297241358919014, + "grad_norm": 0.09042967855930328, + "learning_rate": 1.3563924501424502e-05, + "loss": 0.0197, + "step": 98720 + }, + { + "epoch": 0.7297980544632033, + "grad_norm": 0.07578109204769135, + "learning_rate": 1.3560214862298196e-05, + "loss": 0.0172, + "step": 98730 + }, + { + "epoch": 0.7298719730345052, + "grad_norm": 0.06371735781431198, + 
"learning_rate": 1.3556505223171889e-05, + "loss": 0.0188, + "step": 98740 + }, + { + "epoch": 0.729945891605807, + "grad_norm": 0.057217177003622055, + "learning_rate": 1.3552795584045585e-05, + "loss": 0.0168, + "step": 98750 + }, + { + "epoch": 0.7300198101771089, + "grad_norm": 0.08760707825422287, + "learning_rate": 1.354908594491928e-05, + "loss": 0.0171, + "step": 98760 + }, + { + "epoch": 0.7300937287484107, + "grad_norm": 0.07055719196796417, + "learning_rate": 1.3545376305792973e-05, + "loss": 0.0174, + "step": 98770 + }, + { + "epoch": 0.7301676473197126, + "grad_norm": 0.07746641337871552, + "learning_rate": 1.3541666666666666e-05, + "loss": 0.018, + "step": 98780 + }, + { + "epoch": 0.7302415658910144, + "grad_norm": 0.048529159277677536, + "learning_rate": 1.3537957027540362e-05, + "loss": 0.0154, + "step": 98790 + }, + { + "epoch": 0.7303154844623163, + "grad_norm": 0.08571402728557587, + "learning_rate": 1.3534247388414056e-05, + "loss": 0.0155, + "step": 98800 + }, + { + "epoch": 0.7303894030336182, + "grad_norm": 0.09416328370571136, + "learning_rate": 1.353053774928775e-05, + "loss": 0.0179, + "step": 98810 + }, + { + "epoch": 0.73046332160492, + "grad_norm": 0.08363144099712372, + "learning_rate": 1.3526828110161443e-05, + "loss": 0.0166, + "step": 98820 + }, + { + "epoch": 0.7305372401762219, + "grad_norm": 0.06076916307210922, + "learning_rate": 1.3523118471035137e-05, + "loss": 0.0145, + "step": 98830 + }, + { + "epoch": 0.7306111587475237, + "grad_norm": 0.0909329429268837, + "learning_rate": 1.3519408831908833e-05, + "loss": 0.0175, + "step": 98840 + }, + { + "epoch": 0.7306850773188256, + "grad_norm": 0.08312118798494339, + "learning_rate": 1.3515699192782528e-05, + "loss": 0.019, + "step": 98850 + }, + { + "epoch": 0.7307589958901274, + "grad_norm": 0.08315754681825638, + "learning_rate": 1.351198955365622e-05, + "loss": 0.0174, + "step": 98860 + }, + { + "epoch": 0.7308329144614293, + "grad_norm": 0.06642234325408936, + "learning_rate": 1.3508279914529915e-05, + "loss": 0.0174, + "step": 98870 + }, + { + "epoch": 0.7309068330327312, + "grad_norm": 0.06108556315302849, + "learning_rate": 1.350457027540361e-05, + "loss": 0.0163, + "step": 98880 + }, + { + "epoch": 0.730980751604033, + "grad_norm": 0.09150240570306778, + "learning_rate": 1.3500860636277305e-05, + "loss": 0.0209, + "step": 98890 + }, + { + "epoch": 0.7310546701753349, + "grad_norm": 0.09559241682291031, + "learning_rate": 1.3497150997150997e-05, + "loss": 0.0168, + "step": 98900 + }, + { + "epoch": 0.7311285887466367, + "grad_norm": 0.08340034633874893, + "learning_rate": 1.3493441358024692e-05, + "loss": 0.0185, + "step": 98910 + }, + { + "epoch": 0.7312025073179386, + "grad_norm": 0.10789379477500916, + "learning_rate": 1.3489731718898386e-05, + "loss": 0.0169, + "step": 98920 + }, + { + "epoch": 0.7312764258892405, + "grad_norm": 0.09083528071641922, + "learning_rate": 1.3486022079772082e-05, + "loss": 0.0181, + "step": 98930 + }, + { + "epoch": 0.7313503444605423, + "grad_norm": 0.06849498301744461, + "learning_rate": 1.3482312440645774e-05, + "loss": 0.0168, + "step": 98940 + }, + { + "epoch": 0.7314242630318442, + "grad_norm": 0.07019881159067154, + "learning_rate": 1.3478602801519469e-05, + "loss": 0.0165, + "step": 98950 + }, + { + "epoch": 0.731498181603146, + "grad_norm": 0.10241382569074631, + "learning_rate": 1.3474893162393163e-05, + "loss": 0.0187, + "step": 98960 + }, + { + "epoch": 0.7315721001744478, + "grad_norm": 0.08488575369119644, + "learning_rate": 1.3471183523266856e-05, + 
"loss": 0.0178, + "step": 98970 + }, + { + "epoch": 0.7316460187457496, + "grad_norm": 0.09588972479104996, + "learning_rate": 1.3467473884140552e-05, + "loss": 0.016, + "step": 98980 + }, + { + "epoch": 0.7317199373170515, + "grad_norm": 0.07969026267528534, + "learning_rate": 1.3463764245014246e-05, + "loss": 0.0173, + "step": 98990 + }, + { + "epoch": 0.7317938558883534, + "grad_norm": 0.09308630973100662, + "learning_rate": 1.346005460588794e-05, + "loss": 0.0186, + "step": 99000 + }, + { + "epoch": 0.7318677744596552, + "grad_norm": 0.0691651776432991, + "learning_rate": 1.3456344966761633e-05, + "loss": 0.0167, + "step": 99010 + }, + { + "epoch": 0.7319416930309571, + "grad_norm": 0.05044808238744736, + "learning_rate": 1.345263532763533e-05, + "loss": 0.0172, + "step": 99020 + }, + { + "epoch": 0.7320156116022589, + "grad_norm": 0.12543408572673798, + "learning_rate": 1.3448925688509023e-05, + "loss": 0.0197, + "step": 99030 + }, + { + "epoch": 0.7320895301735608, + "grad_norm": 0.08851069211959839, + "learning_rate": 1.3445216049382717e-05, + "loss": 0.0156, + "step": 99040 + }, + { + "epoch": 0.7321634487448626, + "grad_norm": 0.08638571947813034, + "learning_rate": 1.344150641025641e-05, + "loss": 0.015, + "step": 99050 + }, + { + "epoch": 0.7322373673161645, + "grad_norm": 0.11562138050794601, + "learning_rate": 1.3437796771130104e-05, + "loss": 0.0179, + "step": 99060 + }, + { + "epoch": 0.7323112858874664, + "grad_norm": 0.08109956234693527, + "learning_rate": 1.34340871320038e-05, + "loss": 0.0172, + "step": 99070 + }, + { + "epoch": 0.7323852044587682, + "grad_norm": 0.08304541558027267, + "learning_rate": 1.3430377492877494e-05, + "loss": 0.0208, + "step": 99080 + }, + { + "epoch": 0.7324591230300701, + "grad_norm": 0.06958150863647461, + "learning_rate": 1.3426667853751187e-05, + "loss": 0.0168, + "step": 99090 + }, + { + "epoch": 0.7325330416013719, + "grad_norm": 0.08339308202266693, + "learning_rate": 1.3422958214624881e-05, + "loss": 0.0203, + "step": 99100 + }, + { + "epoch": 0.7326069601726738, + "grad_norm": 0.05242053046822548, + "learning_rate": 1.3419248575498577e-05, + "loss": 0.0178, + "step": 99110 + }, + { + "epoch": 0.7326808787439756, + "grad_norm": 0.09275031089782715, + "learning_rate": 1.3415538936372271e-05, + "loss": 0.016, + "step": 99120 + }, + { + "epoch": 0.7327547973152775, + "grad_norm": 0.09298167377710342, + "learning_rate": 1.3411829297245964e-05, + "loss": 0.0184, + "step": 99130 + }, + { + "epoch": 0.7328287158865794, + "grad_norm": 0.06392785161733627, + "learning_rate": 1.3408119658119658e-05, + "loss": 0.0142, + "step": 99140 + }, + { + "epoch": 0.7329026344578812, + "grad_norm": 0.06492038071155548, + "learning_rate": 1.3404410018993352e-05, + "loss": 0.0169, + "step": 99150 + }, + { + "epoch": 0.7329765530291831, + "grad_norm": 0.08821947872638702, + "learning_rate": 1.3400700379867048e-05, + "loss": 0.0192, + "step": 99160 + }, + { + "epoch": 0.7330504716004849, + "grad_norm": 0.08305369317531586, + "learning_rate": 1.3396990740740743e-05, + "loss": 0.0174, + "step": 99170 + }, + { + "epoch": 0.7331243901717868, + "grad_norm": 0.0802772119641304, + "learning_rate": 1.3393281101614435e-05, + "loss": 0.0179, + "step": 99180 + }, + { + "epoch": 0.7331983087430887, + "grad_norm": 0.08738873898983002, + "learning_rate": 1.338957146248813e-05, + "loss": 0.0174, + "step": 99190 + }, + { + "epoch": 0.7332722273143905, + "grad_norm": 0.0812428891658783, + "learning_rate": 1.3385861823361822e-05, + "loss": 0.0165, + "step": 99200 + }, + { + 
"epoch": 0.7333461458856924, + "grad_norm": 0.06330117583274841, + "learning_rate": 1.338215218423552e-05, + "loss": 0.0173, + "step": 99210 + }, + { + "epoch": 0.7334200644569941, + "grad_norm": 0.07262025028467178, + "learning_rate": 1.3378442545109212e-05, + "loss": 0.0166, + "step": 99220 + }, + { + "epoch": 0.733493983028296, + "grad_norm": 0.09085766226053238, + "learning_rate": 1.3374732905982907e-05, + "loss": 0.0184, + "step": 99230 + }, + { + "epoch": 0.7335679015995978, + "grad_norm": 0.08859771490097046, + "learning_rate": 1.33710232668566e-05, + "loss": 0.0175, + "step": 99240 + }, + { + "epoch": 0.7336418201708997, + "grad_norm": 0.07603614032268524, + "learning_rate": 1.3367313627730297e-05, + "loss": 0.0168, + "step": 99250 + }, + { + "epoch": 0.7337157387422016, + "grad_norm": 0.07759816944599152, + "learning_rate": 1.336360398860399e-05, + "loss": 0.0171, + "step": 99260 + }, + { + "epoch": 0.7337896573135034, + "grad_norm": 0.09745617210865021, + "learning_rate": 1.3359894349477684e-05, + "loss": 0.0171, + "step": 99270 + }, + { + "epoch": 0.7338635758848053, + "grad_norm": 0.07508081197738647, + "learning_rate": 1.3356184710351376e-05, + "loss": 0.0171, + "step": 99280 + }, + { + "epoch": 0.7339374944561071, + "grad_norm": 0.09933502972126007, + "learning_rate": 1.335247507122507e-05, + "loss": 0.0193, + "step": 99290 + }, + { + "epoch": 0.734011413027409, + "grad_norm": 0.06211693957448006, + "learning_rate": 1.3348765432098767e-05, + "loss": 0.017, + "step": 99300 + }, + { + "epoch": 0.7340853315987108, + "grad_norm": 0.08026242256164551, + "learning_rate": 1.334505579297246e-05, + "loss": 0.0178, + "step": 99310 + }, + { + "epoch": 0.7341592501700127, + "grad_norm": 0.07862497121095657, + "learning_rate": 1.3341346153846155e-05, + "loss": 0.0167, + "step": 99320 + }, + { + "epoch": 0.7342331687413146, + "grad_norm": 0.06331951171159744, + "learning_rate": 1.3337636514719848e-05, + "loss": 0.0162, + "step": 99330 + }, + { + "epoch": 0.7343070873126164, + "grad_norm": 0.08189788460731506, + "learning_rate": 1.3333926875593544e-05, + "loss": 0.0187, + "step": 99340 + }, + { + "epoch": 0.7343810058839183, + "grad_norm": 0.09491676092147827, + "learning_rate": 1.3330217236467238e-05, + "loss": 0.0185, + "step": 99350 + }, + { + "epoch": 0.7344549244552201, + "grad_norm": 0.05899200215935707, + "learning_rate": 1.3326507597340932e-05, + "loss": 0.0162, + "step": 99360 + }, + { + "epoch": 0.734528843026522, + "grad_norm": 0.11451012641191483, + "learning_rate": 1.3322797958214625e-05, + "loss": 0.0179, + "step": 99370 + }, + { + "epoch": 0.7346027615978238, + "grad_norm": 0.0756615698337555, + "learning_rate": 1.3319088319088319e-05, + "loss": 0.0179, + "step": 99380 + }, + { + "epoch": 0.7346766801691257, + "grad_norm": 0.07908909767866135, + "learning_rate": 1.3315378679962015e-05, + "loss": 0.0188, + "step": 99390 + }, + { + "epoch": 0.7347505987404276, + "grad_norm": 0.08498425781726837, + "learning_rate": 1.331166904083571e-05, + "loss": 0.0159, + "step": 99400 + }, + { + "epoch": 0.7348245173117294, + "grad_norm": 0.09858408570289612, + "learning_rate": 1.3307959401709402e-05, + "loss": 0.0165, + "step": 99410 + }, + { + "epoch": 0.7348984358830313, + "grad_norm": 0.09515240043401718, + "learning_rate": 1.3304249762583096e-05, + "loss": 0.019, + "step": 99420 + }, + { + "epoch": 0.7349723544543331, + "grad_norm": 0.1012689471244812, + "learning_rate": 1.3300540123456789e-05, + "loss": 0.0175, + "step": 99430 + }, + { + "epoch": 0.735046273025635, + "grad_norm": 
0.09500102698802948, + "learning_rate": 1.3296830484330486e-05, + "loss": 0.0188, + "step": 99440 + }, + { + "epoch": 0.7351201915969369, + "grad_norm": 0.0788261741399765, + "learning_rate": 1.3293120845204179e-05, + "loss": 0.0201, + "step": 99450 + }, + { + "epoch": 0.7351941101682387, + "grad_norm": 0.07192644476890564, + "learning_rate": 1.3289411206077873e-05, + "loss": 0.017, + "step": 99460 + }, + { + "epoch": 0.7352680287395406, + "grad_norm": 0.06370582431554794, + "learning_rate": 1.3285701566951567e-05, + "loss": 0.0174, + "step": 99470 + }, + { + "epoch": 0.7353419473108423, + "grad_norm": 0.11295067518949509, + "learning_rate": 1.3281991927825263e-05, + "loss": 0.0164, + "step": 99480 + }, + { + "epoch": 0.7354158658821442, + "grad_norm": 0.07671871036291122, + "learning_rate": 1.3278282288698956e-05, + "loss": 0.0155, + "step": 99490 + }, + { + "epoch": 0.735489784453446, + "grad_norm": 0.06635639071464539, + "learning_rate": 1.327457264957265e-05, + "loss": 0.0168, + "step": 99500 + }, + { + "epoch": 0.7355637030247479, + "grad_norm": 0.08532058447599411, + "learning_rate": 1.3270863010446345e-05, + "loss": 0.0204, + "step": 99510 + }, + { + "epoch": 0.7356376215960498, + "grad_norm": 0.07075171172618866, + "learning_rate": 1.3267153371320037e-05, + "loss": 0.0164, + "step": 99520 + }, + { + "epoch": 0.7357115401673516, + "grad_norm": 0.10158608853816986, + "learning_rate": 1.3263443732193733e-05, + "loss": 0.018, + "step": 99530 + }, + { + "epoch": 0.7357854587386535, + "grad_norm": 0.0684775710105896, + "learning_rate": 1.3259734093067427e-05, + "loss": 0.0168, + "step": 99540 + }, + { + "epoch": 0.7358593773099553, + "grad_norm": 0.07273388653993607, + "learning_rate": 1.3256024453941122e-05, + "loss": 0.0182, + "step": 99550 + }, + { + "epoch": 0.7359332958812572, + "grad_norm": 0.09297880530357361, + "learning_rate": 1.3252314814814814e-05, + "loss": 0.0163, + "step": 99560 + }, + { + "epoch": 0.736007214452559, + "grad_norm": 0.06453926861286163, + "learning_rate": 1.324860517568851e-05, + "loss": 0.0182, + "step": 99570 + }, + { + "epoch": 0.7360811330238609, + "grad_norm": 0.0784500390291214, + "learning_rate": 1.3244895536562204e-05, + "loss": 0.0169, + "step": 99580 + }, + { + "epoch": 0.7361550515951628, + "grad_norm": 0.07597056031227112, + "learning_rate": 1.3241185897435899e-05, + "loss": 0.0179, + "step": 99590 + }, + { + "epoch": 0.7362289701664646, + "grad_norm": 0.06927984207868576, + "learning_rate": 1.3237476258309591e-05, + "loss": 0.0178, + "step": 99600 + }, + { + "epoch": 0.7363028887377665, + "grad_norm": 0.10358776897192001, + "learning_rate": 1.3233766619183286e-05, + "loss": 0.0181, + "step": 99610 + }, + { + "epoch": 0.7363768073090683, + "grad_norm": 0.06848066300153732, + "learning_rate": 1.3230056980056982e-05, + "loss": 0.019, + "step": 99620 + }, + { + "epoch": 0.7364507258803702, + "grad_norm": 0.07205010205507278, + "learning_rate": 1.3226347340930676e-05, + "loss": 0.0172, + "step": 99630 + }, + { + "epoch": 0.736524644451672, + "grad_norm": 0.08415688574314117, + "learning_rate": 1.3222637701804368e-05, + "loss": 0.0179, + "step": 99640 + }, + { + "epoch": 0.7365985630229739, + "grad_norm": 0.0837029442191124, + "learning_rate": 1.3218928062678063e-05, + "loss": 0.0182, + "step": 99650 + }, + { + "epoch": 0.7366724815942758, + "grad_norm": 0.08125575631856918, + "learning_rate": 1.3215218423551757e-05, + "loss": 0.0168, + "step": 99660 + }, + { + "epoch": 0.7367464001655776, + "grad_norm": 0.0863199532032013, + "learning_rate": 
1.3211508784425453e-05, + "loss": 0.0177, + "step": 99670 + }, + { + "epoch": 0.7368203187368795, + "grad_norm": 0.1306217908859253, + "learning_rate": 1.3207799145299146e-05, + "loss": 0.0191, + "step": 99680 + }, + { + "epoch": 0.7368942373081813, + "grad_norm": 0.08671586215496063, + "learning_rate": 1.320408950617284e-05, + "loss": 0.0181, + "step": 99690 + }, + { + "epoch": 0.7369681558794832, + "grad_norm": 0.08784538507461548, + "learning_rate": 1.3200379867046534e-05, + "loss": 0.0181, + "step": 99700 + }, + { + "epoch": 0.7370420744507851, + "grad_norm": 0.10323961079120636, + "learning_rate": 1.319667022792023e-05, + "loss": 0.02, + "step": 99710 + }, + { + "epoch": 0.7371159930220869, + "grad_norm": 0.08128951489925385, + "learning_rate": 1.3192960588793923e-05, + "loss": 0.0168, + "step": 99720 + }, + { + "epoch": 0.7371899115933888, + "grad_norm": 0.07869691401720047, + "learning_rate": 1.3189250949667617e-05, + "loss": 0.017, + "step": 99730 + }, + { + "epoch": 0.7372638301646905, + "grad_norm": 0.07044506818056107, + "learning_rate": 1.3185541310541311e-05, + "loss": 0.0201, + "step": 99740 + }, + { + "epoch": 0.7373377487359924, + "grad_norm": 0.10342875868082047, + "learning_rate": 1.3181831671415004e-05, + "loss": 0.017, + "step": 99750 + }, + { + "epoch": 0.7374116673072942, + "grad_norm": 0.08335622400045395, + "learning_rate": 1.3178122032288701e-05, + "loss": 0.017, + "step": 99760 + }, + { + "epoch": 0.7374855858785961, + "grad_norm": 0.07070112973451614, + "learning_rate": 1.3174412393162394e-05, + "loss": 0.0158, + "step": 99770 + }, + { + "epoch": 0.737559504449898, + "grad_norm": 0.08443205803632736, + "learning_rate": 1.3170702754036088e-05, + "loss": 0.0182, + "step": 99780 + }, + { + "epoch": 0.7376334230211998, + "grad_norm": 0.08464431017637253, + "learning_rate": 1.316699311490978e-05, + "loss": 0.0165, + "step": 99790 + }, + { + "epoch": 0.7377073415925017, + "grad_norm": 0.07187016308307648, + "learning_rate": 1.3163283475783478e-05, + "loss": 0.0186, + "step": 99800 + }, + { + "epoch": 0.7377812601638035, + "grad_norm": 0.08470672369003296, + "learning_rate": 1.3159573836657171e-05, + "loss": 0.0166, + "step": 99810 + }, + { + "epoch": 0.7378551787351054, + "grad_norm": 0.09578060358762741, + "learning_rate": 1.3155864197530865e-05, + "loss": 0.0177, + "step": 99820 + }, + { + "epoch": 0.7379290973064072, + "grad_norm": 0.08780781179666519, + "learning_rate": 1.3152154558404558e-05, + "loss": 0.0172, + "step": 99830 + }, + { + "epoch": 0.7380030158777091, + "grad_norm": 0.07516979426145554, + "learning_rate": 1.3148444919278252e-05, + "loss": 0.0157, + "step": 99840 + }, + { + "epoch": 0.738076934449011, + "grad_norm": 0.0591609925031662, + "learning_rate": 1.3144735280151948e-05, + "loss": 0.0179, + "step": 99850 + }, + { + "epoch": 0.7381508530203128, + "grad_norm": 0.08503682911396027, + "learning_rate": 1.3141025641025642e-05, + "loss": 0.0189, + "step": 99860 + }, + { + "epoch": 0.7382247715916147, + "grad_norm": 0.048752620816230774, + "learning_rate": 1.3137316001899335e-05, + "loss": 0.0167, + "step": 99870 + }, + { + "epoch": 0.7382986901629165, + "grad_norm": 0.06892601400613785, + "learning_rate": 1.313360636277303e-05, + "loss": 0.0167, + "step": 99880 + }, + { + "epoch": 0.7383726087342184, + "grad_norm": 0.08108551800251007, + "learning_rate": 1.3129896723646724e-05, + "loss": 0.0164, + "step": 99890 + }, + { + "epoch": 0.7384465273055202, + "grad_norm": 0.10203902423381805, + "learning_rate": 1.312618708452042e-05, + "loss": 0.0192, + 
"step": 99900 + }, + { + "epoch": 0.7385204458768221, + "grad_norm": 0.07102199643850327, + "learning_rate": 1.3122477445394112e-05, + "loss": 0.0171, + "step": 99910 + }, + { + "epoch": 0.738594364448124, + "grad_norm": 0.08365597575902939, + "learning_rate": 1.3118767806267806e-05, + "loss": 0.0159, + "step": 99920 + }, + { + "epoch": 0.7386682830194258, + "grad_norm": 0.06642188131809235, + "learning_rate": 1.31150581671415e-05, + "loss": 0.0175, + "step": 99930 + }, + { + "epoch": 0.7387422015907277, + "grad_norm": 0.08410719037055969, + "learning_rate": 1.3111348528015197e-05, + "loss": 0.0158, + "step": 99940 + }, + { + "epoch": 0.7388161201620295, + "grad_norm": 0.08759714663028717, + "learning_rate": 1.3107638888888891e-05, + "loss": 0.0183, + "step": 99950 + }, + { + "epoch": 0.7388900387333314, + "grad_norm": 0.09986711293458939, + "learning_rate": 1.3103929249762583e-05, + "loss": 0.019, + "step": 99960 + }, + { + "epoch": 0.7389639573046333, + "grad_norm": 0.07965222001075745, + "learning_rate": 1.3100219610636278e-05, + "loss": 0.0177, + "step": 99970 + }, + { + "epoch": 0.7390378758759351, + "grad_norm": 0.05860085412859917, + "learning_rate": 1.309650997150997e-05, + "loss": 0.0179, + "step": 99980 + }, + { + "epoch": 0.739111794447237, + "grad_norm": 0.08647920936346054, + "learning_rate": 1.3092800332383668e-05, + "loss": 0.0159, + "step": 99990 + }, + { + "epoch": 0.7391857130185387, + "grad_norm": 0.06778937578201294, + "learning_rate": 1.308909069325736e-05, + "loss": 0.0175, + "step": 100000 + }, + { + "epoch": 0.7391857130185387, + "eval_f1": 0.6265636264638181, + "eval_loss": 0.01698913984000683, + "eval_precision": 0.4966591483671539, + "eval_recall": 0.8484924183456124, + "eval_runtime": 2919.6426, + "eval_samples_per_second": 185.343, + "eval_steps_per_second": 2.896, + "step": 100000 + }, + { + "epoch": 0.7392596315898406, + "grad_norm": 0.08258048444986343, + "learning_rate": 1.3085381054131055e-05, + "loss": 0.0159, + "step": 100010 + }, + { + "epoch": 0.7393335501611424, + "grad_norm": 0.07373189926147461, + "learning_rate": 1.3081671415004747e-05, + "loss": 0.0157, + "step": 100020 + }, + { + "epoch": 0.7394074687324443, + "grad_norm": 0.06241403892636299, + "learning_rate": 1.3077961775878445e-05, + "loss": 0.0193, + "step": 100030 + }, + { + "epoch": 0.7394813873037462, + "grad_norm": 0.06616196036338806, + "learning_rate": 1.3074252136752138e-05, + "loss": 0.0156, + "step": 100040 + }, + { + "epoch": 0.739555305875048, + "grad_norm": 0.06679438799619675, + "learning_rate": 1.3070542497625832e-05, + "loss": 0.0157, + "step": 100050 + }, + { + "epoch": 0.7396292244463499, + "grad_norm": 0.08504631370306015, + "learning_rate": 1.3066832858499525e-05, + "loss": 0.0157, + "step": 100060 + }, + { + "epoch": 0.7397031430176517, + "grad_norm": 0.0888257697224617, + "learning_rate": 1.3063123219373219e-05, + "loss": 0.0167, + "step": 100070 + }, + { + "epoch": 0.7397770615889536, + "grad_norm": 0.10319739580154419, + "learning_rate": 1.3059413580246915e-05, + "loss": 0.0158, + "step": 100080 + }, + { + "epoch": 0.7398509801602554, + "grad_norm": 0.0712389275431633, + "learning_rate": 1.3055703941120609e-05, + "loss": 0.0158, + "step": 100090 + }, + { + "epoch": 0.7399248987315573, + "grad_norm": 0.07851889729499817, + "learning_rate": 1.3051994301994303e-05, + "loss": 0.0205, + "step": 100100 + }, + { + "epoch": 0.7399988173028592, + "grad_norm": 0.06646507233381271, + "learning_rate": 1.3048284662867996e-05, + "loss": 0.0185, + "step": 100110 + }, + { + "epoch": 
0.740072735874161, + "grad_norm": 0.0772419422864914, + "learning_rate": 1.304457502374169e-05, + "loss": 0.019, + "step": 100120 + }, + { + "epoch": 0.7401466544454629, + "grad_norm": 0.11786741018295288, + "learning_rate": 1.3040865384615386e-05, + "loss": 0.0211, + "step": 100130 + }, + { + "epoch": 0.7402205730167647, + "grad_norm": 0.05052179470658302, + "learning_rate": 1.303715574548908e-05, + "loss": 0.0171, + "step": 100140 + }, + { + "epoch": 0.7402944915880666, + "grad_norm": 0.07826834917068481, + "learning_rate": 1.3033446106362773e-05, + "loss": 0.0204, + "step": 100150 + }, + { + "epoch": 0.7403684101593685, + "grad_norm": 0.06459248065948486, + "learning_rate": 1.3029736467236467e-05, + "loss": 0.017, + "step": 100160 + }, + { + "epoch": 0.7404423287306703, + "grad_norm": 0.06260719150304794, + "learning_rate": 1.3026026828110163e-05, + "loss": 0.0186, + "step": 100170 + }, + { + "epoch": 0.7405162473019722, + "grad_norm": 0.09153701364994049, + "learning_rate": 1.3022317188983857e-05, + "loss": 0.0171, + "step": 100180 + }, + { + "epoch": 0.740590165873274, + "grad_norm": 0.07650409638881683, + "learning_rate": 1.301860754985755e-05, + "loss": 0.018, + "step": 100190 + }, + { + "epoch": 0.7406640844445759, + "grad_norm": 0.05814934894442558, + "learning_rate": 1.3014897910731244e-05, + "loss": 0.0153, + "step": 100200 + }, + { + "epoch": 0.7407380030158777, + "grad_norm": 0.07334347069263458, + "learning_rate": 1.3011188271604937e-05, + "loss": 0.0174, + "step": 100210 + }, + { + "epoch": 0.7408119215871796, + "grad_norm": 0.06784732639789581, + "learning_rate": 1.3007478632478635e-05, + "loss": 0.0182, + "step": 100220 + }, + { + "epoch": 0.7408858401584815, + "grad_norm": 0.07556786388158798, + "learning_rate": 1.3003768993352327e-05, + "loss": 0.0197, + "step": 100230 + }, + { + "epoch": 0.7409597587297833, + "grad_norm": 0.08706970512866974, + "learning_rate": 1.3000059354226021e-05, + "loss": 0.0175, + "step": 100240 + }, + { + "epoch": 0.7410336773010852, + "grad_norm": 0.07846567779779434, + "learning_rate": 1.2996349715099716e-05, + "loss": 0.0171, + "step": 100250 + }, + { + "epoch": 0.741107595872387, + "grad_norm": 0.06241794303059578, + "learning_rate": 1.2992640075973412e-05, + "loss": 0.0164, + "step": 100260 + }, + { + "epoch": 0.7411815144436888, + "grad_norm": 0.07802298665046692, + "learning_rate": 1.2988930436847104e-05, + "loss": 0.0199, + "step": 100270 + }, + { + "epoch": 0.7412554330149906, + "grad_norm": 0.07757619023323059, + "learning_rate": 1.2985220797720799e-05, + "loss": 0.0179, + "step": 100280 + }, + { + "epoch": 0.7413293515862925, + "grad_norm": 0.07125821709632874, + "learning_rate": 1.2981511158594493e-05, + "loss": 0.0178, + "step": 100290 + }, + { + "epoch": 0.7414032701575944, + "grad_norm": 0.08631688356399536, + "learning_rate": 1.2977801519468185e-05, + "loss": 0.0166, + "step": 100300 + }, + { + "epoch": 0.7414771887288962, + "grad_norm": 0.08890466392040253, + "learning_rate": 1.2974091880341881e-05, + "loss": 0.0161, + "step": 100310 + }, + { + "epoch": 0.7415511073001981, + "grad_norm": 0.060113392770290375, + "learning_rate": 1.2970382241215576e-05, + "loss": 0.0162, + "step": 100320 + }, + { + "epoch": 0.7416250258714999, + "grad_norm": 0.06005857139825821, + "learning_rate": 1.296667260208927e-05, + "loss": 0.015, + "step": 100330 + }, + { + "epoch": 0.7416989444428018, + "grad_norm": 0.06900335103273392, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.0201, + "step": 100340 + }, + { + "epoch": 0.7417728630141036, + 
"grad_norm": 0.05656469240784645, + "learning_rate": 1.2959253323836657e-05, + "loss": 0.0178, + "step": 100350 + }, + { + "epoch": 0.7418467815854055, + "grad_norm": 0.085307277739048, + "learning_rate": 1.2955543684710353e-05, + "loss": 0.0161, + "step": 100360 + }, + { + "epoch": 0.7419207001567074, + "grad_norm": 0.09140070527791977, + "learning_rate": 1.2951834045584047e-05, + "loss": 0.0176, + "step": 100370 + }, + { + "epoch": 0.7419946187280092, + "grad_norm": 0.08289497345685959, + "learning_rate": 1.294812440645774e-05, + "loss": 0.0167, + "step": 100380 + }, + { + "epoch": 0.7420685372993111, + "grad_norm": 0.08116666972637177, + "learning_rate": 1.2944414767331434e-05, + "loss": 0.0156, + "step": 100390 + }, + { + "epoch": 0.7421424558706129, + "grad_norm": 0.10772228986024857, + "learning_rate": 1.294070512820513e-05, + "loss": 0.0209, + "step": 100400 + }, + { + "epoch": 0.7422163744419148, + "grad_norm": 0.07488065958023071, + "learning_rate": 1.2936995489078824e-05, + "loss": 0.0176, + "step": 100410 + }, + { + "epoch": 0.7422902930132167, + "grad_norm": 0.07480562478303909, + "learning_rate": 1.2933285849952517e-05, + "loss": 0.0173, + "step": 100420 + }, + { + "epoch": 0.7423642115845185, + "grad_norm": 0.07681198418140411, + "learning_rate": 1.2929576210826211e-05, + "loss": 0.0159, + "step": 100430 + }, + { + "epoch": 0.7424381301558204, + "grad_norm": 0.10482607036828995, + "learning_rate": 1.2925866571699905e-05, + "loss": 0.018, + "step": 100440 + }, + { + "epoch": 0.7425120487271222, + "grad_norm": 0.08250238001346588, + "learning_rate": 1.2922156932573601e-05, + "loss": 0.0183, + "step": 100450 + }, + { + "epoch": 0.7425859672984241, + "grad_norm": 0.08101457357406616, + "learning_rate": 1.2918447293447294e-05, + "loss": 0.0164, + "step": 100460 + }, + { + "epoch": 0.7426598858697259, + "grad_norm": 0.09645617753267288, + "learning_rate": 1.2914737654320988e-05, + "loss": 0.0172, + "step": 100470 + }, + { + "epoch": 0.7427338044410278, + "grad_norm": 0.10801961272954941, + "learning_rate": 1.2911028015194682e-05, + "loss": 0.0162, + "step": 100480 + }, + { + "epoch": 0.7428077230123297, + "grad_norm": 0.07592090964317322, + "learning_rate": 1.2907318376068378e-05, + "loss": 0.0194, + "step": 100490 + }, + { + "epoch": 0.7428816415836315, + "grad_norm": 0.09863361716270447, + "learning_rate": 1.290360873694207e-05, + "loss": 0.0183, + "step": 100500 + }, + { + "epoch": 0.7429555601549334, + "grad_norm": 0.0681617334485054, + "learning_rate": 1.2899899097815765e-05, + "loss": 0.0155, + "step": 100510 + }, + { + "epoch": 0.7430294787262351, + "grad_norm": 0.09108418226242065, + "learning_rate": 1.289618945868946e-05, + "loss": 0.0206, + "step": 100520 + }, + { + "epoch": 0.743103397297537, + "grad_norm": 0.07221322506666183, + "learning_rate": 1.2892479819563152e-05, + "loss": 0.0161, + "step": 100530 + }, + { + "epoch": 0.7431773158688388, + "grad_norm": 0.07454651594161987, + "learning_rate": 1.288877018043685e-05, + "loss": 0.02, + "step": 100540 + }, + { + "epoch": 0.7432512344401407, + "grad_norm": 0.07050295919179916, + "learning_rate": 1.2885060541310542e-05, + "loss": 0.0193, + "step": 100550 + }, + { + "epoch": 0.7433251530114426, + "grad_norm": 0.07239052653312683, + "learning_rate": 1.2881350902184236e-05, + "loss": 0.0175, + "step": 100560 + }, + { + "epoch": 0.7433990715827444, + "grad_norm": 0.06946774572134018, + "learning_rate": 1.2877641263057929e-05, + "loss": 0.0185, + "step": 100570 + }, + { + "epoch": 0.7434729901540463, + "grad_norm": 
0.0834672674536705, + "learning_rate": 1.2873931623931623e-05, + "loss": 0.0154, + "step": 100580 + }, + { + "epoch": 0.7435469087253481, + "grad_norm": 0.09061615914106369, + "learning_rate": 1.287022198480532e-05, + "loss": 0.0172, + "step": 100590 + }, + { + "epoch": 0.74362082729665, + "grad_norm": 0.08729087561368942, + "learning_rate": 1.2866512345679014e-05, + "loss": 0.0173, + "step": 100600 + }, + { + "epoch": 0.7436947458679518, + "grad_norm": 0.05605660006403923, + "learning_rate": 1.2862802706552706e-05, + "loss": 0.0181, + "step": 100610 + }, + { + "epoch": 0.7437686644392537, + "grad_norm": 0.08221442997455597, + "learning_rate": 1.28590930674264e-05, + "loss": 0.0188, + "step": 100620 + }, + { + "epoch": 0.7438425830105556, + "grad_norm": 0.05651714280247688, + "learning_rate": 1.2855383428300096e-05, + "loss": 0.0158, + "step": 100630 + }, + { + "epoch": 0.7439165015818574, + "grad_norm": 0.0809439942240715, + "learning_rate": 1.285167378917379e-05, + "loss": 0.0193, + "step": 100640 + }, + { + "epoch": 0.7439904201531593, + "grad_norm": 0.09423944354057312, + "learning_rate": 1.2847964150047483e-05, + "loss": 0.0174, + "step": 100650 + }, + { + "epoch": 0.7440643387244611, + "grad_norm": 0.10862912237644196, + "learning_rate": 1.2844254510921177e-05, + "loss": 0.0175, + "step": 100660 + }, + { + "epoch": 0.744138257295763, + "grad_norm": 0.09572023153305054, + "learning_rate": 1.2840544871794872e-05, + "loss": 0.0191, + "step": 100670 + }, + { + "epoch": 0.7442121758670649, + "grad_norm": 0.09042080491781235, + "learning_rate": 1.2836835232668568e-05, + "loss": 0.0157, + "step": 100680 + }, + { + "epoch": 0.7442860944383667, + "grad_norm": 0.09502577781677246, + "learning_rate": 1.2833125593542262e-05, + "loss": 0.0188, + "step": 100690 + }, + { + "epoch": 0.7443600130096686, + "grad_norm": 0.07590213418006897, + "learning_rate": 1.2829415954415955e-05, + "loss": 0.0172, + "step": 100700 + }, + { + "epoch": 0.7444339315809704, + "grad_norm": 0.08804917335510254, + "learning_rate": 1.2825706315289649e-05, + "loss": 0.0179, + "step": 100710 + }, + { + "epoch": 0.7445078501522723, + "grad_norm": 0.07385943084955215, + "learning_rate": 1.2821996676163345e-05, + "loss": 0.019, + "step": 100720 + }, + { + "epoch": 0.7445817687235741, + "grad_norm": 0.10739407688379288, + "learning_rate": 1.2818287037037039e-05, + "loss": 0.0181, + "step": 100730 + }, + { + "epoch": 0.744655687294876, + "grad_norm": 0.09441185742616653, + "learning_rate": 1.2814577397910732e-05, + "loss": 0.0185, + "step": 100740 + }, + { + "epoch": 0.7447296058661779, + "grad_norm": 0.10174049437046051, + "learning_rate": 1.2810867758784426e-05, + "loss": 0.0158, + "step": 100750 + }, + { + "epoch": 0.7448035244374797, + "grad_norm": 0.058671485632658005, + "learning_rate": 1.2807158119658119e-05, + "loss": 0.0169, + "step": 100760 + }, + { + "epoch": 0.7448774430087816, + "grad_norm": 0.06477764248847961, + "learning_rate": 1.2803448480531816e-05, + "loss": 0.0171, + "step": 100770 + }, + { + "epoch": 0.7449513615800833, + "grad_norm": 0.09175854921340942, + "learning_rate": 1.2799738841405509e-05, + "loss": 0.0144, + "step": 100780 + }, + { + "epoch": 0.7450252801513852, + "grad_norm": 0.07994644343852997, + "learning_rate": 1.2796029202279203e-05, + "loss": 0.0139, + "step": 100790 + }, + { + "epoch": 0.745099198722687, + "grad_norm": 0.07431617379188538, + "learning_rate": 1.2792319563152896e-05, + "loss": 0.016, + "step": 100800 + }, + { + "epoch": 0.7451731172939889, + "grad_norm": 0.08921124786138535, + 
"learning_rate": 1.278860992402659e-05, + "loss": 0.0181, + "step": 100810 + }, + { + "epoch": 0.7452470358652908, + "grad_norm": 0.06759527325630188, + "learning_rate": 1.2784900284900286e-05, + "loss": 0.0164, + "step": 100820 + }, + { + "epoch": 0.7453209544365926, + "grad_norm": 0.07384713739156723, + "learning_rate": 1.278119064577398e-05, + "loss": 0.0165, + "step": 100830 + }, + { + "epoch": 0.7453948730078945, + "grad_norm": 0.11012984067201614, + "learning_rate": 1.2777481006647673e-05, + "loss": 0.0173, + "step": 100840 + }, + { + "epoch": 0.7454687915791963, + "grad_norm": 0.06738018244504929, + "learning_rate": 1.2773771367521367e-05, + "loss": 0.0177, + "step": 100850 + }, + { + "epoch": 0.7455427101504982, + "grad_norm": 0.07870987057685852, + "learning_rate": 1.2770061728395063e-05, + "loss": 0.0152, + "step": 100860 + }, + { + "epoch": 0.7456166287218, + "grad_norm": 0.08337525278329849, + "learning_rate": 1.2766352089268757e-05, + "loss": 0.0175, + "step": 100870 + }, + { + "epoch": 0.7456905472931019, + "grad_norm": 0.08534202724695206, + "learning_rate": 1.2762642450142451e-05, + "loss": 0.0175, + "step": 100880 + }, + { + "epoch": 0.7457644658644038, + "grad_norm": 0.14618049561977386, + "learning_rate": 1.2758932811016144e-05, + "loss": 0.0185, + "step": 100890 + }, + { + "epoch": 0.7458383844357056, + "grad_norm": 0.08101604878902435, + "learning_rate": 1.2755223171889838e-05, + "loss": 0.0168, + "step": 100900 + }, + { + "epoch": 0.7459123030070075, + "grad_norm": 0.0876898542046547, + "learning_rate": 1.2751513532763534e-05, + "loss": 0.0188, + "step": 100910 + }, + { + "epoch": 0.7459862215783093, + "grad_norm": 0.07244478166103363, + "learning_rate": 1.2747803893637229e-05, + "loss": 0.0189, + "step": 100920 + }, + { + "epoch": 0.7460601401496112, + "grad_norm": 0.08289396017789841, + "learning_rate": 1.2744094254510921e-05, + "loss": 0.0172, + "step": 100930 + }, + { + "epoch": 0.7461340587209131, + "grad_norm": 0.06628874689340591, + "learning_rate": 1.2740384615384615e-05, + "loss": 0.015, + "step": 100940 + }, + { + "epoch": 0.7462079772922149, + "grad_norm": 0.06265491247177124, + "learning_rate": 1.2736674976258311e-05, + "loss": 0.0161, + "step": 100950 + }, + { + "epoch": 0.7462818958635168, + "grad_norm": 0.0762435793876648, + "learning_rate": 1.2732965337132006e-05, + "loss": 0.0186, + "step": 100960 + }, + { + "epoch": 0.7463558144348186, + "grad_norm": 0.08350300043821335, + "learning_rate": 1.2729255698005698e-05, + "loss": 0.0179, + "step": 100970 + }, + { + "epoch": 0.7464297330061205, + "grad_norm": 0.08295262604951859, + "learning_rate": 1.2725546058879393e-05, + "loss": 0.0168, + "step": 100980 + }, + { + "epoch": 0.7465036515774223, + "grad_norm": 0.07857391238212585, + "learning_rate": 1.2721836419753085e-05, + "loss": 0.0144, + "step": 100990 + }, + { + "epoch": 0.7465775701487242, + "grad_norm": 0.08510608971118927, + "learning_rate": 1.2718126780626783e-05, + "loss": 0.0174, + "step": 101000 + }, + { + "epoch": 0.7466514887200261, + "grad_norm": 0.08745384216308594, + "learning_rate": 1.2714417141500475e-05, + "loss": 0.0175, + "step": 101010 + }, + { + "epoch": 0.7467254072913279, + "grad_norm": 0.06416953355073929, + "learning_rate": 1.271070750237417e-05, + "loss": 0.0158, + "step": 101020 + }, + { + "epoch": 0.7467993258626298, + "grad_norm": 0.0958615094423294, + "learning_rate": 1.2706997863247864e-05, + "loss": 0.0193, + "step": 101030 + }, + { + "epoch": 0.7468732444339315, + "grad_norm": 0.09908900409936905, + "learning_rate": 
1.2703288224121556e-05, + "loss": 0.0156, + "step": 101040 + }, + { + "epoch": 0.7469471630052334, + "grad_norm": 0.07092351466417313, + "learning_rate": 1.2699578584995252e-05, + "loss": 0.0149, + "step": 101050 + }, + { + "epoch": 0.7470210815765352, + "grad_norm": 0.060773033648729324, + "learning_rate": 1.2695868945868947e-05, + "loss": 0.0149, + "step": 101060 + }, + { + "epoch": 0.7470950001478371, + "grad_norm": 0.06796196848154068, + "learning_rate": 1.2692159306742641e-05, + "loss": 0.0173, + "step": 101070 + }, + { + "epoch": 0.747168918719139, + "grad_norm": 0.0906725749373436, + "learning_rate": 1.2688449667616334e-05, + "loss": 0.0184, + "step": 101080 + }, + { + "epoch": 0.7472428372904408, + "grad_norm": 0.07799607515335083, + "learning_rate": 1.268474002849003e-05, + "loss": 0.0191, + "step": 101090 + }, + { + "epoch": 0.7473167558617427, + "grad_norm": 0.071399986743927, + "learning_rate": 1.2681030389363724e-05, + "loss": 0.0177, + "step": 101100 + }, + { + "epoch": 0.7473906744330445, + "grad_norm": 0.0783345103263855, + "learning_rate": 1.2677320750237418e-05, + "loss": 0.0178, + "step": 101110 + }, + { + "epoch": 0.7474645930043464, + "grad_norm": 0.0984099954366684, + "learning_rate": 1.267361111111111e-05, + "loss": 0.0174, + "step": 101120 + }, + { + "epoch": 0.7475385115756482, + "grad_norm": 0.09083379060029984, + "learning_rate": 1.2669901471984805e-05, + "loss": 0.0182, + "step": 101130 + }, + { + "epoch": 0.7476124301469501, + "grad_norm": 0.08555403351783752, + "learning_rate": 1.2666191832858501e-05, + "loss": 0.0156, + "step": 101140 + }, + { + "epoch": 0.747686348718252, + "grad_norm": 0.07865635305643082, + "learning_rate": 1.2662482193732195e-05, + "loss": 0.0184, + "step": 101150 + }, + { + "epoch": 0.7477602672895538, + "grad_norm": 0.07970672100782394, + "learning_rate": 1.2658772554605888e-05, + "loss": 0.0159, + "step": 101160 + }, + { + "epoch": 0.7478341858608557, + "grad_norm": 0.0820077583193779, + "learning_rate": 1.2655062915479582e-05, + "loss": 0.018, + "step": 101170 + }, + { + "epoch": 0.7479081044321575, + "grad_norm": 0.0661480724811554, + "learning_rate": 1.2651353276353278e-05, + "loss": 0.0187, + "step": 101180 + }, + { + "epoch": 0.7479820230034594, + "grad_norm": 0.056968096643686295, + "learning_rate": 1.2647643637226972e-05, + "loss": 0.0184, + "step": 101190 + }, + { + "epoch": 0.7480559415747613, + "grad_norm": 0.06653908640146255, + "learning_rate": 1.2643933998100665e-05, + "loss": 0.0176, + "step": 101200 + }, + { + "epoch": 0.7481298601460631, + "grad_norm": 0.0841420367360115, + "learning_rate": 1.2640224358974359e-05, + "loss": 0.0148, + "step": 101210 + }, + { + "epoch": 0.748203778717365, + "grad_norm": 0.05905050411820412, + "learning_rate": 1.2636514719848053e-05, + "loss": 0.0157, + "step": 101220 + }, + { + "epoch": 0.7482776972886668, + "grad_norm": 0.06777974218130112, + "learning_rate": 1.263280508072175e-05, + "loss": 0.0181, + "step": 101230 + }, + { + "epoch": 0.7483516158599687, + "grad_norm": 0.09229224175214767, + "learning_rate": 1.2629095441595442e-05, + "loss": 0.0185, + "step": 101240 + }, + { + "epoch": 0.7484255344312705, + "grad_norm": 0.07637009769678116, + "learning_rate": 1.2625385802469136e-05, + "loss": 0.0179, + "step": 101250 + }, + { + "epoch": 0.7484994530025724, + "grad_norm": 0.08407588303089142, + "learning_rate": 1.262167616334283e-05, + "loss": 0.0174, + "step": 101260 + }, + { + "epoch": 0.7485733715738743, + "grad_norm": 0.09340599179267883, + "learning_rate": 1.2617966524216523e-05, + 
"loss": 0.0173, + "step": 101270 + }, + { + "epoch": 0.7486472901451761, + "grad_norm": 0.08749319612979889, + "learning_rate": 1.2614256885090219e-05, + "loss": 0.0181, + "step": 101280 + }, + { + "epoch": 0.748721208716478, + "grad_norm": 0.08961571007966995, + "learning_rate": 1.2610547245963913e-05, + "loss": 0.017, + "step": 101290 + }, + { + "epoch": 0.7487951272877797, + "grad_norm": 0.10773412883281708, + "learning_rate": 1.2606837606837608e-05, + "loss": 0.0193, + "step": 101300 + }, + { + "epoch": 0.7488690458590817, + "grad_norm": 0.10216166824102402, + "learning_rate": 1.26031279677113e-05, + "loss": 0.02, + "step": 101310 + }, + { + "epoch": 0.7489429644303834, + "grad_norm": 0.0787181630730629, + "learning_rate": 1.2599418328584998e-05, + "loss": 0.0164, + "step": 101320 + }, + { + "epoch": 0.7490168830016853, + "grad_norm": 0.0944897010922432, + "learning_rate": 1.259570868945869e-05, + "loss": 0.0176, + "step": 101330 + }, + { + "epoch": 0.7490908015729872, + "grad_norm": 0.07871638238430023, + "learning_rate": 1.2591999050332385e-05, + "loss": 0.0169, + "step": 101340 + }, + { + "epoch": 0.749164720144289, + "grad_norm": 0.07367895543575287, + "learning_rate": 1.2588289411206077e-05, + "loss": 0.0175, + "step": 101350 + }, + { + "epoch": 0.7492386387155909, + "grad_norm": 0.07219689339399338, + "learning_rate": 1.2584579772079772e-05, + "loss": 0.0181, + "step": 101360 + }, + { + "epoch": 0.7493125572868927, + "grad_norm": 0.09437573701143265, + "learning_rate": 1.2580870132953467e-05, + "loss": 0.0167, + "step": 101370 + }, + { + "epoch": 0.7493864758581946, + "grad_norm": 0.08827673643827438, + "learning_rate": 1.2577160493827162e-05, + "loss": 0.0156, + "step": 101380 + }, + { + "epoch": 0.7494603944294964, + "grad_norm": 0.07567749172449112, + "learning_rate": 1.2573450854700854e-05, + "loss": 0.0154, + "step": 101390 + }, + { + "epoch": 0.7495343130007983, + "grad_norm": 0.08592404425144196, + "learning_rate": 1.2569741215574549e-05, + "loss": 0.0181, + "step": 101400 + }, + { + "epoch": 0.7496082315721002, + "grad_norm": 0.07100368291139603, + "learning_rate": 1.2566031576448245e-05, + "loss": 0.0152, + "step": 101410 + }, + { + "epoch": 0.749682150143402, + "grad_norm": 0.08753882348537445, + "learning_rate": 1.2562321937321939e-05, + "loss": 0.0162, + "step": 101420 + }, + { + "epoch": 0.7497560687147039, + "grad_norm": 0.05322737991809845, + "learning_rate": 1.2558612298195631e-05, + "loss": 0.0197, + "step": 101430 + }, + { + "epoch": 0.7498299872860057, + "grad_norm": 0.10462437570095062, + "learning_rate": 1.2554902659069326e-05, + "loss": 0.0217, + "step": 101440 + }, + { + "epoch": 0.7499039058573076, + "grad_norm": 0.08601612597703934, + "learning_rate": 1.255119301994302e-05, + "loss": 0.0179, + "step": 101450 + }, + { + "epoch": 0.7499778244286095, + "grad_norm": 0.06775778532028198, + "learning_rate": 1.2547483380816716e-05, + "loss": 0.0176, + "step": 101460 + }, + { + "epoch": 0.7500517429999113, + "grad_norm": 0.09714211523532867, + "learning_rate": 1.254377374169041e-05, + "loss": 0.0204, + "step": 101470 + }, + { + "epoch": 0.7501256615712132, + "grad_norm": 0.06501183658838272, + "learning_rate": 1.2540064102564103e-05, + "loss": 0.0163, + "step": 101480 + }, + { + "epoch": 0.750199580142515, + "grad_norm": 0.0687502771615982, + "learning_rate": 1.2536354463437797e-05, + "loss": 0.018, + "step": 101490 + }, + { + "epoch": 0.7502734987138169, + "grad_norm": 0.07812726497650146, + "learning_rate": 1.253264482431149e-05, + "loss": 0.0177, + "step": 
101500 + }, + { + "epoch": 0.7503474172851187, + "grad_norm": 0.06303098797798157, + "learning_rate": 1.2528935185185187e-05, + "loss": 0.0166, + "step": 101510 + }, + { + "epoch": 0.7504213358564206, + "grad_norm": 0.07705267518758774, + "learning_rate": 1.252522554605888e-05, + "loss": 0.0142, + "step": 101520 + }, + { + "epoch": 0.7504952544277225, + "grad_norm": 0.0688970610499382, + "learning_rate": 1.2521515906932574e-05, + "loss": 0.0163, + "step": 101530 + }, + { + "epoch": 0.7505691729990243, + "grad_norm": 0.08850441873073578, + "learning_rate": 1.2517806267806267e-05, + "loss": 0.0185, + "step": 101540 + }, + { + "epoch": 0.7506430915703262, + "grad_norm": 0.06363435089588165, + "learning_rate": 1.2514096628679964e-05, + "loss": 0.0183, + "step": 101550 + }, + { + "epoch": 0.750717010141628, + "grad_norm": 0.07842385768890381, + "learning_rate": 1.2510386989553657e-05, + "loss": 0.0165, + "step": 101560 + }, + { + "epoch": 0.7507909287129299, + "grad_norm": 0.06949975341558456, + "learning_rate": 1.2506677350427351e-05, + "loss": 0.0168, + "step": 101570 + }, + { + "epoch": 0.7508648472842316, + "grad_norm": 0.07649943977594376, + "learning_rate": 1.2502967711301044e-05, + "loss": 0.0154, + "step": 101580 + }, + { + "epoch": 0.7509387658555335, + "grad_norm": 0.05525654926896095, + "learning_rate": 1.249925807217474e-05, + "loss": 0.0165, + "step": 101590 + }, + { + "epoch": 0.7510126844268354, + "grad_norm": 0.08242253214120865, + "learning_rate": 1.2495548433048432e-05, + "loss": 0.0195, + "step": 101600 + }, + { + "epoch": 0.7510866029981372, + "grad_norm": 0.06378057599067688, + "learning_rate": 1.2491838793922128e-05, + "loss": 0.0162, + "step": 101610 + }, + { + "epoch": 0.7511605215694391, + "grad_norm": 0.07558952271938324, + "learning_rate": 1.2488129154795823e-05, + "loss": 0.0168, + "step": 101620 + }, + { + "epoch": 0.7512344401407409, + "grad_norm": 0.06266353279352188, + "learning_rate": 1.2484419515669517e-05, + "loss": 0.016, + "step": 101630 + }, + { + "epoch": 0.7513083587120428, + "grad_norm": 0.08001867681741714, + "learning_rate": 1.2480709876543211e-05, + "loss": 0.0162, + "step": 101640 + }, + { + "epoch": 0.7513822772833446, + "grad_norm": 0.0811537504196167, + "learning_rate": 1.2477000237416904e-05, + "loss": 0.015, + "step": 101650 + }, + { + "epoch": 0.7514561958546465, + "grad_norm": 0.08687859773635864, + "learning_rate": 1.24732905982906e-05, + "loss": 0.0182, + "step": 101660 + }, + { + "epoch": 0.7515301144259484, + "grad_norm": 0.07284379005432129, + "learning_rate": 1.2469580959164292e-05, + "loss": 0.0167, + "step": 101670 + }, + { + "epoch": 0.7516040329972502, + "grad_norm": 0.0731201022863388, + "learning_rate": 1.2465871320037988e-05, + "loss": 0.018, + "step": 101680 + }, + { + "epoch": 0.7516779515685521, + "grad_norm": 0.06715166568756104, + "learning_rate": 1.246216168091168e-05, + "loss": 0.0169, + "step": 101690 + }, + { + "epoch": 0.7517518701398539, + "grad_norm": 0.07490894943475723, + "learning_rate": 1.2458452041785377e-05, + "loss": 0.0155, + "step": 101700 + }, + { + "epoch": 0.7518257887111558, + "grad_norm": 0.10497954487800598, + "learning_rate": 1.245474240265907e-05, + "loss": 0.0185, + "step": 101710 + }, + { + "epoch": 0.7518997072824577, + "grad_norm": 0.07911212742328644, + "learning_rate": 1.2451032763532765e-05, + "loss": 0.0145, + "step": 101720 + }, + { + "epoch": 0.7519736258537595, + "grad_norm": 0.07326927036046982, + "learning_rate": 1.2447323124406458e-05, + "loss": 0.0161, + "step": 101730 + }, + { + 
"epoch": 0.7520475444250614, + "grad_norm": 0.08427941054105759, + "learning_rate": 1.2443613485280152e-05, + "loss": 0.0176, + "step": 101740 + }, + { + "epoch": 0.7521214629963632, + "grad_norm": 0.0915616899728775, + "learning_rate": 1.2439903846153846e-05, + "loss": 0.0176, + "step": 101750 + }, + { + "epoch": 0.7521953815676651, + "grad_norm": 0.07714962959289551, + "learning_rate": 1.243619420702754e-05, + "loss": 0.0171, + "step": 101760 + }, + { + "epoch": 0.7522693001389669, + "grad_norm": 0.0776534304022789, + "learning_rate": 1.2432484567901235e-05, + "loss": 0.0153, + "step": 101770 + }, + { + "epoch": 0.7523432187102688, + "grad_norm": 0.0937986969947815, + "learning_rate": 1.242877492877493e-05, + "loss": 0.0193, + "step": 101780 + }, + { + "epoch": 0.7524171372815707, + "grad_norm": 0.07981719821691513, + "learning_rate": 1.2425065289648624e-05, + "loss": 0.0176, + "step": 101790 + }, + { + "epoch": 0.7524910558528725, + "grad_norm": 0.05857951566576958, + "learning_rate": 1.2421355650522318e-05, + "loss": 0.0183, + "step": 101800 + }, + { + "epoch": 0.7525649744241744, + "grad_norm": 0.0774340108036995, + "learning_rate": 1.2417646011396012e-05, + "loss": 0.016, + "step": 101810 + }, + { + "epoch": 0.7526388929954761, + "grad_norm": 0.08497736603021622, + "learning_rate": 1.2413936372269706e-05, + "loss": 0.0178, + "step": 101820 + }, + { + "epoch": 0.752712811566778, + "grad_norm": 0.07533351331949234, + "learning_rate": 1.24102267331434e-05, + "loss": 0.016, + "step": 101830 + }, + { + "epoch": 0.7527867301380798, + "grad_norm": 0.08452022075653076, + "learning_rate": 1.2406517094017095e-05, + "loss": 0.0173, + "step": 101840 + }, + { + "epoch": 0.7528606487093817, + "grad_norm": 0.07964122295379639, + "learning_rate": 1.240280745489079e-05, + "loss": 0.018, + "step": 101850 + }, + { + "epoch": 0.7529345672806836, + "grad_norm": 0.0665905624628067, + "learning_rate": 1.2399097815764483e-05, + "loss": 0.0191, + "step": 101860 + }, + { + "epoch": 0.7530084858519854, + "grad_norm": 0.08771977573633194, + "learning_rate": 1.2395388176638178e-05, + "loss": 0.0178, + "step": 101870 + }, + { + "epoch": 0.7530824044232873, + "grad_norm": 0.06156764179468155, + "learning_rate": 1.2391678537511872e-05, + "loss": 0.0183, + "step": 101880 + }, + { + "epoch": 0.7531563229945891, + "grad_norm": 0.06278455257415771, + "learning_rate": 1.2387968898385566e-05, + "loss": 0.0191, + "step": 101890 + }, + { + "epoch": 0.753230241565891, + "grad_norm": 0.07921794056892395, + "learning_rate": 1.2384259259259259e-05, + "loss": 0.0166, + "step": 101900 + }, + { + "epoch": 0.7533041601371928, + "grad_norm": 0.10754285752773285, + "learning_rate": 1.2380549620132955e-05, + "loss": 0.0186, + "step": 101910 + }, + { + "epoch": 0.7533780787084947, + "grad_norm": 0.10148213803768158, + "learning_rate": 1.2376839981006647e-05, + "loss": 0.0184, + "step": 101920 + }, + { + "epoch": 0.7534519972797966, + "grad_norm": 0.0633421391248703, + "learning_rate": 1.2373130341880343e-05, + "loss": 0.0178, + "step": 101930 + }, + { + "epoch": 0.7535259158510984, + "grad_norm": 0.07311506569385529, + "learning_rate": 1.2369420702754036e-05, + "loss": 0.0157, + "step": 101940 + }, + { + "epoch": 0.7535998344224003, + "grad_norm": 0.07114532589912415, + "learning_rate": 1.2365711063627732e-05, + "loss": 0.0182, + "step": 101950 + }, + { + "epoch": 0.7536737529937021, + "grad_norm": 0.06623285263776779, + "learning_rate": 1.2362001424501425e-05, + "loss": 0.0152, + "step": 101960 + }, + { + "epoch": 0.753747671565004, 
+ "grad_norm": 0.07522322982549667, + "learning_rate": 1.2358291785375119e-05, + "loss": 0.019, + "step": 101970 + }, + { + "epoch": 0.7538215901363059, + "grad_norm": 0.11243265122175217, + "learning_rate": 1.2354582146248813e-05, + "loss": 0.0175, + "step": 101980 + }, + { + "epoch": 0.7538955087076077, + "grad_norm": 0.08355090767145157, + "learning_rate": 1.2350872507122507e-05, + "loss": 0.0159, + "step": 101990 + }, + { + "epoch": 0.7539694272789096, + "grad_norm": 0.07274141907691956, + "learning_rate": 1.2347162867996202e-05, + "loss": 0.0183, + "step": 102000 + }, + { + "epoch": 0.7540433458502114, + "grad_norm": 0.09165412187576294, + "learning_rate": 1.2343453228869896e-05, + "loss": 0.0168, + "step": 102010 + }, + { + "epoch": 0.7541172644215133, + "grad_norm": 0.09555403888225555, + "learning_rate": 1.233974358974359e-05, + "loss": 0.0175, + "step": 102020 + }, + { + "epoch": 0.7541911829928151, + "grad_norm": 0.09564153850078583, + "learning_rate": 1.2336033950617284e-05, + "loss": 0.0199, + "step": 102030 + }, + { + "epoch": 0.754265101564117, + "grad_norm": 0.06774450838565826, + "learning_rate": 1.2332324311490979e-05, + "loss": 0.0149, + "step": 102040 + }, + { + "epoch": 0.7543390201354189, + "grad_norm": 0.053174346685409546, + "learning_rate": 1.2328614672364673e-05, + "loss": 0.0143, + "step": 102050 + }, + { + "epoch": 0.7544129387067207, + "grad_norm": 0.09616374224424362, + "learning_rate": 1.2324905033238367e-05, + "loss": 0.016, + "step": 102060 + }, + { + "epoch": 0.7544868572780226, + "grad_norm": 0.11800222098827362, + "learning_rate": 1.2321195394112062e-05, + "loss": 0.0173, + "step": 102070 + }, + { + "epoch": 0.7545607758493243, + "grad_norm": 0.0716506615281105, + "learning_rate": 1.2317485754985756e-05, + "loss": 0.0178, + "step": 102080 + }, + { + "epoch": 0.7546346944206263, + "grad_norm": 0.06857968121767044, + "learning_rate": 1.231377611585945e-05, + "loss": 0.0168, + "step": 102090 + }, + { + "epoch": 0.754708612991928, + "grad_norm": 0.08074000477790833, + "learning_rate": 1.2310066476733144e-05, + "loss": 0.0166, + "step": 102100 + }, + { + "epoch": 0.7547825315632299, + "grad_norm": 0.07367126643657684, + "learning_rate": 1.2306356837606839e-05, + "loss": 0.0167, + "step": 102110 + }, + { + "epoch": 0.7548564501345318, + "grad_norm": 0.06470063328742981, + "learning_rate": 1.2302647198480533e-05, + "loss": 0.0166, + "step": 102120 + }, + { + "epoch": 0.7549303687058336, + "grad_norm": 0.06659591943025589, + "learning_rate": 1.2298937559354225e-05, + "loss": 0.0172, + "step": 102130 + }, + { + "epoch": 0.7550042872771355, + "grad_norm": 0.08944061398506165, + "learning_rate": 1.2295227920227921e-05, + "loss": 0.0153, + "step": 102140 + }, + { + "epoch": 0.7550782058484373, + "grad_norm": 0.0934378057718277, + "learning_rate": 1.2291518281101614e-05, + "loss": 0.0163, + "step": 102150 + }, + { + "epoch": 0.7551521244197392, + "grad_norm": 0.0620492585003376, + "learning_rate": 1.228780864197531e-05, + "loss": 0.0179, + "step": 102160 + }, + { + "epoch": 0.7552260429910411, + "grad_norm": 0.0643744096159935, + "learning_rate": 1.2284099002849003e-05, + "loss": 0.019, + "step": 102170 + }, + { + "epoch": 0.7552999615623429, + "grad_norm": 0.08223942667245865, + "learning_rate": 1.2280389363722699e-05, + "loss": 0.0157, + "step": 102180 + }, + { + "epoch": 0.7553738801336448, + "grad_norm": 0.07553199678659439, + "learning_rate": 1.2276679724596391e-05, + "loss": 0.0184, + "step": 102190 + }, + { + "epoch": 0.7554477987049466, + "grad_norm": 
0.06578723341226578, + "learning_rate": 1.2272970085470085e-05, + "loss": 0.0151, + "step": 102200 + }, + { + "epoch": 0.7555217172762485, + "grad_norm": 0.08422625809907913, + "learning_rate": 1.226926044634378e-05, + "loss": 0.0184, + "step": 102210 + }, + { + "epoch": 0.7555956358475503, + "grad_norm": 0.07038059830665588, + "learning_rate": 1.2265550807217474e-05, + "loss": 0.0167, + "step": 102220 + }, + { + "epoch": 0.7556695544188522, + "grad_norm": 0.08100122958421707, + "learning_rate": 1.226184116809117e-05, + "loss": 0.0196, + "step": 102230 + }, + { + "epoch": 0.7557434729901541, + "grad_norm": 0.07784950733184814, + "learning_rate": 1.2258131528964862e-05, + "loss": 0.0182, + "step": 102240 + }, + { + "epoch": 0.7558173915614559, + "grad_norm": 0.08980873972177505, + "learning_rate": 1.2254421889838558e-05, + "loss": 0.0178, + "step": 102250 + }, + { + "epoch": 0.7558913101327578, + "grad_norm": 0.07078026235103607, + "learning_rate": 1.2250712250712251e-05, + "loss": 0.0167, + "step": 102260 + }, + { + "epoch": 0.7559652287040596, + "grad_norm": 0.0744447410106659, + "learning_rate": 1.2247002611585945e-05, + "loss": 0.0156, + "step": 102270 + }, + { + "epoch": 0.7560391472753615, + "grad_norm": 0.07403499633073807, + "learning_rate": 1.224329297245964e-05, + "loss": 0.0172, + "step": 102280 + }, + { + "epoch": 0.7561130658466633, + "grad_norm": 0.07184556126594543, + "learning_rate": 1.2239583333333334e-05, + "loss": 0.0173, + "step": 102290 + }, + { + "epoch": 0.7561869844179652, + "grad_norm": 0.09416437149047852, + "learning_rate": 1.2235873694207028e-05, + "loss": 0.0187, + "step": 102300 + }, + { + "epoch": 0.7562609029892671, + "grad_norm": 0.0748690515756607, + "learning_rate": 1.2232164055080722e-05, + "loss": 0.0177, + "step": 102310 + }, + { + "epoch": 0.7563348215605689, + "grad_norm": 0.07672201097011566, + "learning_rate": 1.2228454415954417e-05, + "loss": 0.0155, + "step": 102320 + }, + { + "epoch": 0.7564087401318708, + "grad_norm": 0.09029083698987961, + "learning_rate": 1.2224744776828111e-05, + "loss": 0.0192, + "step": 102330 + }, + { + "epoch": 0.7564826587031726, + "grad_norm": 0.08144398778676987, + "learning_rate": 1.2221035137701805e-05, + "loss": 0.0184, + "step": 102340 + }, + { + "epoch": 0.7565565772744745, + "grad_norm": 0.08030198514461517, + "learning_rate": 1.22173254985755e-05, + "loss": 0.0168, + "step": 102350 + }, + { + "epoch": 0.7566304958457762, + "grad_norm": 0.11785607784986496, + "learning_rate": 1.2213615859449192e-05, + "loss": 0.0193, + "step": 102360 + }, + { + "epoch": 0.7567044144170781, + "grad_norm": 0.07621311396360397, + "learning_rate": 1.2209906220322888e-05, + "loss": 0.0162, + "step": 102370 + }, + { + "epoch": 0.75677833298838, + "grad_norm": 0.050615742802619934, + "learning_rate": 1.2206196581196582e-05, + "loss": 0.0161, + "step": 102380 + }, + { + "epoch": 0.7568522515596818, + "grad_norm": 0.08207380771636963, + "learning_rate": 1.2202486942070277e-05, + "loss": 0.0187, + "step": 102390 + }, + { + "epoch": 0.7569261701309837, + "grad_norm": 0.06875210255384445, + "learning_rate": 1.219877730294397e-05, + "loss": 0.0197, + "step": 102400 + }, + { + "epoch": 0.7570000887022855, + "grad_norm": 0.08055390417575836, + "learning_rate": 1.2195067663817665e-05, + "loss": 0.0185, + "step": 102410 + }, + { + "epoch": 0.7570740072735874, + "grad_norm": 0.08164180815219879, + "learning_rate": 1.219135802469136e-05, + "loss": 0.0162, + "step": 102420 + }, + { + "epoch": 0.7571479258448893, + "grad_norm": 0.07057615369558334, + 
"learning_rate": 1.2187648385565052e-05, + "loss": 0.0178, + "step": 102430 + }, + { + "epoch": 0.7572218444161911, + "grad_norm": 0.13389045000076294, + "learning_rate": 1.2183938746438748e-05, + "loss": 0.0169, + "step": 102440 + }, + { + "epoch": 0.757295762987493, + "grad_norm": 0.0784306675195694, + "learning_rate": 1.218022910731244e-05, + "loss": 0.016, + "step": 102450 + }, + { + "epoch": 0.7573696815587948, + "grad_norm": 0.05959833040833473, + "learning_rate": 1.2176519468186136e-05, + "loss": 0.0161, + "step": 102460 + }, + { + "epoch": 0.7574436001300967, + "grad_norm": 0.10358747839927673, + "learning_rate": 1.2172809829059829e-05, + "loss": 0.0198, + "step": 102470 + }, + { + "epoch": 0.7575175187013985, + "grad_norm": 0.05675870180130005, + "learning_rate": 1.2169100189933525e-05, + "loss": 0.0175, + "step": 102480 + }, + { + "epoch": 0.7575914372727004, + "grad_norm": 0.07311985641717911, + "learning_rate": 1.2165390550807218e-05, + "loss": 0.0175, + "step": 102490 + }, + { + "epoch": 0.7576653558440023, + "grad_norm": 0.09267310053110123, + "learning_rate": 1.2161680911680912e-05, + "loss": 0.0161, + "step": 102500 + }, + { + "epoch": 0.7577392744153041, + "grad_norm": 0.10184731334447861, + "learning_rate": 1.2157971272554606e-05, + "loss": 0.0155, + "step": 102510 + }, + { + "epoch": 0.757813192986606, + "grad_norm": 0.08719828724861145, + "learning_rate": 1.21542616334283e-05, + "loss": 0.0171, + "step": 102520 + }, + { + "epoch": 0.7578871115579078, + "grad_norm": 0.07820761948823929, + "learning_rate": 1.2150551994301995e-05, + "loss": 0.0155, + "step": 102530 + }, + { + "epoch": 0.7579610301292097, + "grad_norm": 0.07845431566238403, + "learning_rate": 1.2146842355175689e-05, + "loss": 0.0176, + "step": 102540 + }, + { + "epoch": 0.7580349487005115, + "grad_norm": 0.08127642422914505, + "learning_rate": 1.2143132716049383e-05, + "loss": 0.0203, + "step": 102550 + }, + { + "epoch": 0.7581088672718134, + "grad_norm": 0.07473145425319672, + "learning_rate": 1.2139423076923077e-05, + "loss": 0.0174, + "step": 102560 + }, + { + "epoch": 0.7581827858431153, + "grad_norm": 0.11189044266939163, + "learning_rate": 1.2135713437796772e-05, + "loss": 0.0153, + "step": 102570 + }, + { + "epoch": 0.7582567044144171, + "grad_norm": 0.07234430313110352, + "learning_rate": 1.2132003798670466e-05, + "loss": 0.018, + "step": 102580 + }, + { + "epoch": 0.758330622985719, + "grad_norm": 0.09144700318574905, + "learning_rate": 1.212829415954416e-05, + "loss": 0.0176, + "step": 102590 + }, + { + "epoch": 0.7584045415570208, + "grad_norm": 0.06276418268680573, + "learning_rate": 1.2124584520417855e-05, + "loss": 0.0172, + "step": 102600 + }, + { + "epoch": 0.7584784601283227, + "grad_norm": 0.09209229052066803, + "learning_rate": 1.2120874881291549e-05, + "loss": 0.0157, + "step": 102610 + }, + { + "epoch": 0.7585523786996244, + "grad_norm": 0.05703670531511307, + "learning_rate": 1.2117165242165243e-05, + "loss": 0.0161, + "step": 102620 + }, + { + "epoch": 0.7586262972709263, + "grad_norm": 0.07058379054069519, + "learning_rate": 1.2113455603038937e-05, + "loss": 0.0159, + "step": 102630 + }, + { + "epoch": 0.7587002158422282, + "grad_norm": 0.08190514892339706, + "learning_rate": 1.2109745963912632e-05, + "loss": 0.0161, + "step": 102640 + }, + { + "epoch": 0.75877413441353, + "grad_norm": 0.08964069187641144, + "learning_rate": 1.2106036324786326e-05, + "loss": 0.017, + "step": 102650 + }, + { + "epoch": 0.7588480529848319, + "grad_norm": 0.07375913113355637, + "learning_rate": 
1.2102326685660019e-05, + "loss": 0.0185, + "step": 102660 + }, + { + "epoch": 0.7589219715561337, + "grad_norm": 0.07200281322002411, + "learning_rate": 1.2098617046533714e-05, + "loss": 0.0178, + "step": 102670 + }, + { + "epoch": 0.7589958901274356, + "grad_norm": 0.08306648582220078, + "learning_rate": 1.2094907407407407e-05, + "loss": 0.0172, + "step": 102680 + }, + { + "epoch": 0.7590698086987375, + "grad_norm": 0.07849260419607162, + "learning_rate": 1.2091197768281103e-05, + "loss": 0.0166, + "step": 102690 + }, + { + "epoch": 0.7591437272700393, + "grad_norm": 0.06569449603557587, + "learning_rate": 1.2087488129154796e-05, + "loss": 0.017, + "step": 102700 + }, + { + "epoch": 0.7592176458413412, + "grad_norm": 0.09783091396093369, + "learning_rate": 1.2083778490028492e-05, + "loss": 0.0186, + "step": 102710 + }, + { + "epoch": 0.759291564412643, + "grad_norm": 0.08046203851699829, + "learning_rate": 1.2080068850902184e-05, + "loss": 0.0184, + "step": 102720 + }, + { + "epoch": 0.7593654829839449, + "grad_norm": 0.0971355214715004, + "learning_rate": 1.2076359211775878e-05, + "loss": 0.0201, + "step": 102730 + }, + { + "epoch": 0.7594394015552467, + "grad_norm": 0.09459102153778076, + "learning_rate": 1.2072649572649573e-05, + "loss": 0.0183, + "step": 102740 + }, + { + "epoch": 0.7595133201265486, + "grad_norm": 0.04941713437438011, + "learning_rate": 1.2068939933523267e-05, + "loss": 0.0199, + "step": 102750 + }, + { + "epoch": 0.7595872386978505, + "grad_norm": 0.06698715686798096, + "learning_rate": 1.2065230294396961e-05, + "loss": 0.0162, + "step": 102760 + }, + { + "epoch": 0.7596611572691523, + "grad_norm": 0.09899577498435974, + "learning_rate": 1.2061520655270656e-05, + "loss": 0.0184, + "step": 102770 + }, + { + "epoch": 0.7597350758404542, + "grad_norm": 0.09042756259441376, + "learning_rate": 1.205781101614435e-05, + "loss": 0.0165, + "step": 102780 + }, + { + "epoch": 0.759808994411756, + "grad_norm": 0.10201259702444077, + "learning_rate": 1.2054101377018044e-05, + "loss": 0.0183, + "step": 102790 + }, + { + "epoch": 0.7598829129830579, + "grad_norm": 0.08615417033433914, + "learning_rate": 1.2050391737891738e-05, + "loss": 0.0179, + "step": 102800 + }, + { + "epoch": 0.7599568315543597, + "grad_norm": 0.0700421929359436, + "learning_rate": 1.2046682098765433e-05, + "loss": 0.0158, + "step": 102810 + }, + { + "epoch": 0.7600307501256616, + "grad_norm": 0.07872634381055832, + "learning_rate": 1.2042972459639127e-05, + "loss": 0.0164, + "step": 102820 + }, + { + "epoch": 0.7601046686969635, + "grad_norm": 0.07595248520374298, + "learning_rate": 1.2039262820512821e-05, + "loss": 0.0171, + "step": 102830 + }, + { + "epoch": 0.7601785872682653, + "grad_norm": 0.04081696644425392, + "learning_rate": 1.2035553181386515e-05, + "loss": 0.0148, + "step": 102840 + }, + { + "epoch": 0.7602525058395672, + "grad_norm": 0.057837892323732376, + "learning_rate": 1.203184354226021e-05, + "loss": 0.0164, + "step": 102850 + }, + { + "epoch": 0.760326424410869, + "grad_norm": 0.09097220748662949, + "learning_rate": 1.2028133903133904e-05, + "loss": 0.0177, + "step": 102860 + }, + { + "epoch": 0.7604003429821709, + "grad_norm": 0.08579658716917038, + "learning_rate": 1.2024424264007598e-05, + "loss": 0.0164, + "step": 102870 + }, + { + "epoch": 0.7604742615534726, + "grad_norm": 0.0934729054570198, + "learning_rate": 1.2020714624881293e-05, + "loss": 0.0215, + "step": 102880 + }, + { + "epoch": 0.7605481801247745, + "grad_norm": 0.06498159468173981, + "learning_rate": 
1.2017004985754985e-05, + "loss": 0.0175, + "step": 102890 + }, + { + "epoch": 0.7606220986960764, + "grad_norm": 0.11769692599773407, + "learning_rate": 1.2013295346628681e-05, + "loss": 0.0169, + "step": 102900 + }, + { + "epoch": 0.7606960172673782, + "grad_norm": 0.09046773612499237, + "learning_rate": 1.2009585707502374e-05, + "loss": 0.0183, + "step": 102910 + }, + { + "epoch": 0.7607699358386801, + "grad_norm": 0.08058417588472366, + "learning_rate": 1.200587606837607e-05, + "loss": 0.018, + "step": 102920 + }, + { + "epoch": 0.7608438544099819, + "grad_norm": 0.07865067571401596, + "learning_rate": 1.2002166429249762e-05, + "loss": 0.0147, + "step": 102930 + }, + { + "epoch": 0.7609177729812838, + "grad_norm": 0.05952581763267517, + "learning_rate": 1.1998456790123458e-05, + "loss": 0.016, + "step": 102940 + }, + { + "epoch": 0.7609916915525857, + "grad_norm": 0.12254879623651505, + "learning_rate": 1.199474715099715e-05, + "loss": 0.0172, + "step": 102950 + }, + { + "epoch": 0.7610656101238875, + "grad_norm": 0.09980471432209015, + "learning_rate": 1.1991037511870845e-05, + "loss": 0.0161, + "step": 102960 + }, + { + "epoch": 0.7611395286951894, + "grad_norm": 0.07723431289196014, + "learning_rate": 1.198732787274454e-05, + "loss": 0.0165, + "step": 102970 + }, + { + "epoch": 0.7612134472664912, + "grad_norm": 0.09354310482740402, + "learning_rate": 1.1983618233618234e-05, + "loss": 0.0199, + "step": 102980 + }, + { + "epoch": 0.7612873658377931, + "grad_norm": 0.08254075050354004, + "learning_rate": 1.197990859449193e-05, + "loss": 0.0158, + "step": 102990 + }, + { + "epoch": 0.7613612844090949, + "grad_norm": 0.05928228050470352, + "learning_rate": 1.1976198955365622e-05, + "loss": 0.0159, + "step": 103000 + }, + { + "epoch": 0.7614352029803968, + "grad_norm": 0.08125203102827072, + "learning_rate": 1.1972489316239318e-05, + "loss": 0.0186, + "step": 103010 + }, + { + "epoch": 0.7615091215516987, + "grad_norm": 0.07555700093507767, + "learning_rate": 1.196877967711301e-05, + "loss": 0.0194, + "step": 103020 + }, + { + "epoch": 0.7615830401230005, + "grad_norm": 0.07443048059940338, + "learning_rate": 1.1965070037986707e-05, + "loss": 0.0168, + "step": 103030 + }, + { + "epoch": 0.7616569586943024, + "grad_norm": 0.06883411109447479, + "learning_rate": 1.19613603988604e-05, + "loss": 0.0168, + "step": 103040 + }, + { + "epoch": 0.7617308772656042, + "grad_norm": 0.0769045278429985, + "learning_rate": 1.1957650759734093e-05, + "loss": 0.0164, + "step": 103050 + }, + { + "epoch": 0.7618047958369061, + "grad_norm": 0.07747851312160492, + "learning_rate": 1.1953941120607788e-05, + "loss": 0.0161, + "step": 103060 + }, + { + "epoch": 0.7618787144082079, + "grad_norm": 0.08282840996980667, + "learning_rate": 1.1950231481481482e-05, + "loss": 0.0167, + "step": 103070 + }, + { + "epoch": 0.7619526329795098, + "grad_norm": 0.08814528584480286, + "learning_rate": 1.1946521842355176e-05, + "loss": 0.0168, + "step": 103080 + }, + { + "epoch": 0.7620265515508117, + "grad_norm": 0.08881138265132904, + "learning_rate": 1.194281220322887e-05, + "loss": 0.017, + "step": 103090 + }, + { + "epoch": 0.7621004701221135, + "grad_norm": 0.09396179765462875, + "learning_rate": 1.1939102564102565e-05, + "loss": 0.0198, + "step": 103100 + }, + { + "epoch": 0.7621743886934154, + "grad_norm": 0.0843079537153244, + "learning_rate": 1.1935392924976259e-05, + "loss": 0.0178, + "step": 103110 + }, + { + "epoch": 0.7622483072647172, + "grad_norm": 0.07248686254024506, + "learning_rate": 1.1931683285849952e-05, + 
"loss": 0.0192, + "step": 103120 + }, + { + "epoch": 0.762322225836019, + "grad_norm": 0.062101028859615326, + "learning_rate": 1.1927973646723648e-05, + "loss": 0.0152, + "step": 103130 + }, + { + "epoch": 0.7623961444073208, + "grad_norm": 0.06474128365516663, + "learning_rate": 1.192426400759734e-05, + "loss": 0.0165, + "step": 103140 + }, + { + "epoch": 0.7624700629786227, + "grad_norm": 0.06901828944683075, + "learning_rate": 1.1920554368471036e-05, + "loss": 0.018, + "step": 103150 + }, + { + "epoch": 0.7625439815499246, + "grad_norm": 0.07394035160541534, + "learning_rate": 1.191684472934473e-05, + "loss": 0.0138, + "step": 103160 + }, + { + "epoch": 0.7626179001212264, + "grad_norm": 0.0887165293097496, + "learning_rate": 1.1913135090218425e-05, + "loss": 0.0164, + "step": 103170 + }, + { + "epoch": 0.7626918186925283, + "grad_norm": 0.09024112671613693, + "learning_rate": 1.1909425451092119e-05, + "loss": 0.0211, + "step": 103180 + }, + { + "epoch": 0.7627657372638301, + "grad_norm": 0.09429723024368286, + "learning_rate": 1.1905715811965812e-05, + "loss": 0.0197, + "step": 103190 + }, + { + "epoch": 0.762839655835132, + "grad_norm": 0.056649837642908096, + "learning_rate": 1.1902006172839508e-05, + "loss": 0.018, + "step": 103200 + }, + { + "epoch": 0.7629135744064339, + "grad_norm": 0.09306730329990387, + "learning_rate": 1.18982965337132e-05, + "loss": 0.0166, + "step": 103210 + }, + { + "epoch": 0.7629874929777357, + "grad_norm": 0.06880921125411987, + "learning_rate": 1.1894586894586896e-05, + "loss": 0.0164, + "step": 103220 + }, + { + "epoch": 0.7630614115490376, + "grad_norm": 0.1179075613617897, + "learning_rate": 1.1890877255460589e-05, + "loss": 0.0166, + "step": 103230 + }, + { + "epoch": 0.7631353301203394, + "grad_norm": 0.07119959592819214, + "learning_rate": 1.1887167616334285e-05, + "loss": 0.016, + "step": 103240 + }, + { + "epoch": 0.7632092486916413, + "grad_norm": 0.06132645159959793, + "learning_rate": 1.1883457977207977e-05, + "loss": 0.0175, + "step": 103250 + }, + { + "epoch": 0.7632831672629431, + "grad_norm": 0.11850385367870331, + "learning_rate": 1.1879748338081673e-05, + "loss": 0.0186, + "step": 103260 + }, + { + "epoch": 0.763357085834245, + "grad_norm": 0.0714501440525055, + "learning_rate": 1.1876038698955366e-05, + "loss": 0.017, + "step": 103270 + }, + { + "epoch": 0.7634310044055469, + "grad_norm": 0.05191829428076744, + "learning_rate": 1.187232905982906e-05, + "loss": 0.0154, + "step": 103280 + }, + { + "epoch": 0.7635049229768487, + "grad_norm": 0.08176074177026749, + "learning_rate": 1.1868619420702754e-05, + "loss": 0.0172, + "step": 103290 + }, + { + "epoch": 0.7635788415481506, + "grad_norm": 0.11602763831615448, + "learning_rate": 1.1864909781576449e-05, + "loss": 0.0182, + "step": 103300 + }, + { + "epoch": 0.7636527601194524, + "grad_norm": 0.06472666561603546, + "learning_rate": 1.1861200142450143e-05, + "loss": 0.0175, + "step": 103310 + }, + { + "epoch": 0.7637266786907543, + "grad_norm": 0.06399382650852203, + "learning_rate": 1.1857490503323837e-05, + "loss": 0.0167, + "step": 103320 + }, + { + "epoch": 0.7638005972620561, + "grad_norm": 0.06925990432500839, + "learning_rate": 1.1853780864197531e-05, + "loss": 0.0169, + "step": 103330 + }, + { + "epoch": 0.763874515833358, + "grad_norm": 0.09379049390554428, + "learning_rate": 1.1850071225071226e-05, + "loss": 0.0177, + "step": 103340 + }, + { + "epoch": 0.7639484344046599, + "grad_norm": 0.062449511140584946, + "learning_rate": 1.184636158594492e-05, + "loss": 0.016, + "step": 
103350 + }, + { + "epoch": 0.7640223529759617, + "grad_norm": 0.07464705407619476, + "learning_rate": 1.1842651946818614e-05, + "loss": 0.0163, + "step": 103360 + }, + { + "epoch": 0.7640962715472636, + "grad_norm": 0.06858009099960327, + "learning_rate": 1.1838942307692309e-05, + "loss": 0.0158, + "step": 103370 + }, + { + "epoch": 0.7641701901185654, + "grad_norm": 0.08118981122970581, + "learning_rate": 1.1835232668566003e-05, + "loss": 0.018, + "step": 103380 + }, + { + "epoch": 0.7642441086898673, + "grad_norm": 0.07128416001796722, + "learning_rate": 1.1831523029439697e-05, + "loss": 0.0152, + "step": 103390 + }, + { + "epoch": 0.764318027261169, + "grad_norm": 0.0697440430521965, + "learning_rate": 1.1827813390313391e-05, + "loss": 0.0148, + "step": 103400 + }, + { + "epoch": 0.7643919458324709, + "grad_norm": 0.05445777252316475, + "learning_rate": 1.1824103751187086e-05, + "loss": 0.0154, + "step": 103410 + }, + { + "epoch": 0.7644658644037728, + "grad_norm": 0.07951023429632187, + "learning_rate": 1.1820394112060778e-05, + "loss": 0.0181, + "step": 103420 + }, + { + "epoch": 0.7645397829750746, + "grad_norm": 0.06549987196922302, + "learning_rate": 1.1816684472934474e-05, + "loss": 0.0159, + "step": 103430 + }, + { + "epoch": 0.7646137015463765, + "grad_norm": 0.0658888965845108, + "learning_rate": 1.1812974833808167e-05, + "loss": 0.0173, + "step": 103440 + }, + { + "epoch": 0.7646876201176783, + "grad_norm": 0.07832251489162445, + "learning_rate": 1.1809265194681863e-05, + "loss": 0.0167, + "step": 103450 + }, + { + "epoch": 0.7647615386889802, + "grad_norm": 0.08702033013105392, + "learning_rate": 1.1805555555555555e-05, + "loss": 0.0185, + "step": 103460 + }, + { + "epoch": 0.7648354572602821, + "grad_norm": 0.0707923099398613, + "learning_rate": 1.1801845916429251e-05, + "loss": 0.017, + "step": 103470 + }, + { + "epoch": 0.7649093758315839, + "grad_norm": 0.08582896739244461, + "learning_rate": 1.1798136277302944e-05, + "loss": 0.02, + "step": 103480 + }, + { + "epoch": 0.7649832944028858, + "grad_norm": 0.09733167290687561, + "learning_rate": 1.179442663817664e-05, + "loss": 0.0179, + "step": 103490 + }, + { + "epoch": 0.7650572129741876, + "grad_norm": 0.0878724455833435, + "learning_rate": 1.1790716999050332e-05, + "loss": 0.0176, + "step": 103500 + }, + { + "epoch": 0.7651311315454895, + "grad_norm": 0.09843181073665619, + "learning_rate": 1.1787007359924027e-05, + "loss": 0.0159, + "step": 103510 + }, + { + "epoch": 0.7652050501167913, + "grad_norm": 0.07592066377401352, + "learning_rate": 1.1783297720797721e-05, + "loss": 0.017, + "step": 103520 + }, + { + "epoch": 0.7652789686880932, + "grad_norm": 0.06643280386924744, + "learning_rate": 1.1779588081671415e-05, + "loss": 0.0183, + "step": 103530 + }, + { + "epoch": 0.7653528872593951, + "grad_norm": 0.07768280804157257, + "learning_rate": 1.177587844254511e-05, + "loss": 0.0173, + "step": 103540 + }, + { + "epoch": 0.7654268058306969, + "grad_norm": 0.06870684027671814, + "learning_rate": 1.1772168803418804e-05, + "loss": 0.0175, + "step": 103550 + }, + { + "epoch": 0.7655007244019988, + "grad_norm": 0.0949084684252739, + "learning_rate": 1.1768459164292498e-05, + "loss": 0.0165, + "step": 103560 + }, + { + "epoch": 0.7655746429733006, + "grad_norm": 0.11342213302850723, + "learning_rate": 1.1764749525166192e-05, + "loss": 0.019, + "step": 103570 + }, + { + "epoch": 0.7656485615446025, + "grad_norm": 0.09305454790592194, + "learning_rate": 1.1761039886039887e-05, + "loss": 0.0189, + "step": 103580 + }, + { + "epoch": 
0.7657224801159043, + "grad_norm": 0.05589432269334793, + "learning_rate": 1.175733024691358e-05, + "loss": 0.0168, + "step": 103590 + }, + { + "epoch": 0.7657963986872062, + "grad_norm": 0.05523810163140297, + "learning_rate": 1.1753620607787275e-05, + "loss": 0.0189, + "step": 103600 + }, + { + "epoch": 0.7658703172585081, + "grad_norm": 0.08939807116985321, + "learning_rate": 1.174991096866097e-05, + "loss": 0.0182, + "step": 103610 + }, + { + "epoch": 0.7659442358298099, + "grad_norm": 0.057942889630794525, + "learning_rate": 1.1746201329534664e-05, + "loss": 0.0155, + "step": 103620 + }, + { + "epoch": 0.7660181544011118, + "grad_norm": 0.07429220527410507, + "learning_rate": 1.1742491690408358e-05, + "loss": 0.018, + "step": 103630 + }, + { + "epoch": 0.7660920729724136, + "grad_norm": 0.06361553817987442, + "learning_rate": 1.1738782051282052e-05, + "loss": 0.0157, + "step": 103640 + }, + { + "epoch": 0.7661659915437155, + "grad_norm": 0.10519684106111526, + "learning_rate": 1.1735072412155745e-05, + "loss": 0.0166, + "step": 103650 + }, + { + "epoch": 0.7662399101150172, + "grad_norm": 0.09359843283891678, + "learning_rate": 1.173136277302944e-05, + "loss": 0.0171, + "step": 103660 + }, + { + "epoch": 0.7663138286863191, + "grad_norm": 0.08676464110612869, + "learning_rate": 1.1727653133903133e-05, + "loss": 0.0188, + "step": 103670 + }, + { + "epoch": 0.766387747257621, + "grad_norm": 0.07525712251663208, + "learning_rate": 1.172394349477683e-05, + "loss": 0.0175, + "step": 103680 + }, + { + "epoch": 0.7664616658289228, + "grad_norm": 0.05233407020568848, + "learning_rate": 1.1720233855650522e-05, + "loss": 0.0173, + "step": 103690 + }, + { + "epoch": 0.7665355844002247, + "grad_norm": 0.07339996844530106, + "learning_rate": 1.1716524216524218e-05, + "loss": 0.0176, + "step": 103700 + }, + { + "epoch": 0.7666095029715265, + "grad_norm": 0.07441697269678116, + "learning_rate": 1.171281457739791e-05, + "loss": 0.0182, + "step": 103710 + }, + { + "epoch": 0.7666834215428284, + "grad_norm": 0.06306453794240952, + "learning_rate": 1.1709104938271606e-05, + "loss": 0.0176, + "step": 103720 + }, + { + "epoch": 0.7667573401141303, + "grad_norm": 0.06008174642920494, + "learning_rate": 1.1705395299145299e-05, + "loss": 0.0188, + "step": 103730 + }, + { + "epoch": 0.7668312586854321, + "grad_norm": 0.05986854434013367, + "learning_rate": 1.1701685660018993e-05, + "loss": 0.0175, + "step": 103740 + }, + { + "epoch": 0.766905177256734, + "grad_norm": 0.0892731100320816, + "learning_rate": 1.169797602089269e-05, + "loss": 0.0175, + "step": 103750 + }, + { + "epoch": 0.7669790958280358, + "grad_norm": 0.060458648949861526, + "learning_rate": 1.1694266381766382e-05, + "loss": 0.016, + "step": 103760 + }, + { + "epoch": 0.7670530143993377, + "grad_norm": 0.059939801692962646, + "learning_rate": 1.1690556742640078e-05, + "loss": 0.0172, + "step": 103770 + }, + { + "epoch": 0.7671269329706395, + "grad_norm": 0.08211683481931686, + "learning_rate": 1.168684710351377e-05, + "loss": 0.0149, + "step": 103780 + }, + { + "epoch": 0.7672008515419414, + "grad_norm": 0.08790618181228638, + "learning_rate": 1.1683137464387466e-05, + "loss": 0.0171, + "step": 103790 + }, + { + "epoch": 0.7672747701132433, + "grad_norm": 0.07280895859003067, + "learning_rate": 1.1679427825261159e-05, + "loss": 0.0156, + "step": 103800 + }, + { + "epoch": 0.7673486886845451, + "grad_norm": 0.07520013302564621, + "learning_rate": 1.1675718186134853e-05, + "loss": 0.0179, + "step": 103810 + }, + { + "epoch": 0.767422607255847, + 
"grad_norm": 0.0693606436252594, + "learning_rate": 1.1672008547008547e-05, + "loss": 0.0215, + "step": 103820 + }, + { + "epoch": 0.7674965258271488, + "grad_norm": 0.08086995780467987, + "learning_rate": 1.1668298907882242e-05, + "loss": 0.018, + "step": 103830 + }, + { + "epoch": 0.7675704443984507, + "grad_norm": 0.09201215952634811, + "learning_rate": 1.1664589268755936e-05, + "loss": 0.0164, + "step": 103840 + }, + { + "epoch": 0.7676443629697525, + "grad_norm": 0.09462850540876389, + "learning_rate": 1.166087962962963e-05, + "loss": 0.0188, + "step": 103850 + }, + { + "epoch": 0.7677182815410544, + "grad_norm": 0.05690096318721771, + "learning_rate": 1.1657169990503324e-05, + "loss": 0.0169, + "step": 103860 + }, + { + "epoch": 0.7677922001123563, + "grad_norm": 0.09154248982667923, + "learning_rate": 1.1653460351377019e-05, + "loss": 0.0184, + "step": 103870 + }, + { + "epoch": 0.7678661186836581, + "grad_norm": 0.09101495146751404, + "learning_rate": 1.1649750712250711e-05, + "loss": 0.019, + "step": 103880 + }, + { + "epoch": 0.76794003725496, + "grad_norm": 0.1021990031003952, + "learning_rate": 1.1646041073124407e-05, + "loss": 0.0186, + "step": 103890 + }, + { + "epoch": 0.7680139558262618, + "grad_norm": 0.08415243029594421, + "learning_rate": 1.16423314339981e-05, + "loss": 0.0177, + "step": 103900 + }, + { + "epoch": 0.7680878743975637, + "grad_norm": 0.07058227062225342, + "learning_rate": 1.1638621794871796e-05, + "loss": 0.0174, + "step": 103910 + }, + { + "epoch": 0.7681617929688656, + "grad_norm": 0.09863097965717316, + "learning_rate": 1.163491215574549e-05, + "loss": 0.0185, + "step": 103920 + }, + { + "epoch": 0.7682357115401673, + "grad_norm": 0.06945838034152985, + "learning_rate": 1.1631202516619184e-05, + "loss": 0.0173, + "step": 103930 + }, + { + "epoch": 0.7683096301114692, + "grad_norm": 0.09523185342550278, + "learning_rate": 1.1627492877492879e-05, + "loss": 0.0195, + "step": 103940 + }, + { + "epoch": 0.768383548682771, + "grad_norm": 0.08189336955547333, + "learning_rate": 1.1623783238366573e-05, + "loss": 0.0149, + "step": 103950 + }, + { + "epoch": 0.7684574672540729, + "grad_norm": 0.07451499253511429, + "learning_rate": 1.1620073599240267e-05, + "loss": 0.0172, + "step": 103960 + }, + { + "epoch": 0.7685313858253747, + "grad_norm": 0.1129722073674202, + "learning_rate": 1.161636396011396e-05, + "loss": 0.0171, + "step": 103970 + }, + { + "epoch": 0.7686053043966766, + "grad_norm": 0.08089788258075714, + "learning_rate": 1.1612654320987656e-05, + "loss": 0.0165, + "step": 103980 + }, + { + "epoch": 0.7686792229679785, + "grad_norm": 0.05218294635415077, + "learning_rate": 1.1608944681861348e-05, + "loss": 0.0157, + "step": 103990 + }, + { + "epoch": 0.7687531415392803, + "grad_norm": 0.08422937244176865, + "learning_rate": 1.1605235042735044e-05, + "loss": 0.0165, + "step": 104000 + }, + { + "epoch": 0.7688270601105822, + "grad_norm": 0.09455084800720215, + "learning_rate": 1.1601525403608737e-05, + "loss": 0.0162, + "step": 104010 + }, + { + "epoch": 0.768900978681884, + "grad_norm": 0.1238221526145935, + "learning_rate": 1.1597815764482433e-05, + "loss": 0.0193, + "step": 104020 + }, + { + "epoch": 0.7689748972531859, + "grad_norm": 0.06627760827541351, + "learning_rate": 1.1594106125356125e-05, + "loss": 0.0166, + "step": 104030 + }, + { + "epoch": 0.7690488158244877, + "grad_norm": 0.06207331269979477, + "learning_rate": 1.159039648622982e-05, + "loss": 0.018, + "step": 104040 + }, + { + "epoch": 0.7691227343957896, + "grad_norm": 
0.09232545644044876, + "learning_rate": 1.1586686847103514e-05, + "loss": 0.0161, + "step": 104050 + }, + { + "epoch": 0.7691966529670915, + "grad_norm": 0.06648951768875122, + "learning_rate": 1.1582977207977208e-05, + "loss": 0.019, + "step": 104060 + }, + { + "epoch": 0.7692705715383933, + "grad_norm": 0.06869245320558548, + "learning_rate": 1.1579267568850903e-05, + "loss": 0.0171, + "step": 104070 + }, + { + "epoch": 0.7693444901096952, + "grad_norm": 0.09643948078155518, + "learning_rate": 1.1575557929724597e-05, + "loss": 0.0186, + "step": 104080 + }, + { + "epoch": 0.769418408680997, + "grad_norm": 0.09167290478944778, + "learning_rate": 1.1571848290598291e-05, + "loss": 0.0213, + "step": 104090 + }, + { + "epoch": 0.7694923272522989, + "grad_norm": 0.06981848180294037, + "learning_rate": 1.1568138651471985e-05, + "loss": 0.0175, + "step": 104100 + }, + { + "epoch": 0.7695662458236007, + "grad_norm": 0.062315262854099274, + "learning_rate": 1.156442901234568e-05, + "loss": 0.018, + "step": 104110 + }, + { + "epoch": 0.7696401643949026, + "grad_norm": 0.06809121370315552, + "learning_rate": 1.1560719373219374e-05, + "loss": 0.0164, + "step": 104120 + }, + { + "epoch": 0.7697140829662045, + "grad_norm": 0.13859310746192932, + "learning_rate": 1.1557009734093068e-05, + "loss": 0.0196, + "step": 104130 + }, + { + "epoch": 0.7697880015375063, + "grad_norm": 0.12869401276111603, + "learning_rate": 1.1553300094966762e-05, + "loss": 0.0178, + "step": 104140 + }, + { + "epoch": 0.7698619201088082, + "grad_norm": 0.07242847234010696, + "learning_rate": 1.1549590455840457e-05, + "loss": 0.0178, + "step": 104150 + }, + { + "epoch": 0.76993583868011, + "grad_norm": 0.11106471717357635, + "learning_rate": 1.1545880816714151e-05, + "loss": 0.0165, + "step": 104160 + }, + { + "epoch": 0.7700097572514119, + "grad_norm": 0.08795207738876343, + "learning_rate": 1.1542171177587845e-05, + "loss": 0.0173, + "step": 104170 + }, + { + "epoch": 0.7700836758227138, + "grad_norm": 0.05803506821393967, + "learning_rate": 1.153846153846154e-05, + "loss": 0.0172, + "step": 104180 + }, + { + "epoch": 0.7701575943940155, + "grad_norm": 0.08186095952987671, + "learning_rate": 1.1534751899335234e-05, + "loss": 0.0172, + "step": 104190 + }, + { + "epoch": 0.7702315129653174, + "grad_norm": 0.09746869653463364, + "learning_rate": 1.1531042260208926e-05, + "loss": 0.0194, + "step": 104200 + }, + { + "epoch": 0.7703054315366192, + "grad_norm": 0.07588176429271698, + "learning_rate": 1.1527332621082622e-05, + "loss": 0.0173, + "step": 104210 + }, + { + "epoch": 0.7703793501079211, + "grad_norm": 0.10306576639413834, + "learning_rate": 1.1523622981956315e-05, + "loss": 0.017, + "step": 104220 + }, + { + "epoch": 0.7704532686792229, + "grad_norm": 0.10408895462751389, + "learning_rate": 1.1519913342830011e-05, + "loss": 0.0187, + "step": 104230 + }, + { + "epoch": 0.7705271872505248, + "grad_norm": 0.05544160306453705, + "learning_rate": 1.1516203703703703e-05, + "loss": 0.0169, + "step": 104240 + }, + { + "epoch": 0.7706011058218267, + "grad_norm": 0.06006123498082161, + "learning_rate": 1.15124940645774e-05, + "loss": 0.0166, + "step": 104250 + }, + { + "epoch": 0.7706750243931285, + "grad_norm": 0.07598260790109634, + "learning_rate": 1.1508784425451092e-05, + "loss": 0.0161, + "step": 104260 + }, + { + "epoch": 0.7707489429644304, + "grad_norm": 0.07916712760925293, + "learning_rate": 1.1505074786324786e-05, + "loss": 0.0177, + "step": 104270 + }, + { + "epoch": 0.7708228615357322, + "grad_norm": 0.06070020794868469, 
+ "learning_rate": 1.150136514719848e-05, + "loss": 0.0183, + "step": 104280 + }, + { + "epoch": 0.7708967801070341, + "grad_norm": 0.07659520953893661, + "learning_rate": 1.1497655508072175e-05, + "loss": 0.019, + "step": 104290 + }, + { + "epoch": 0.7709706986783359, + "grad_norm": 0.1022806465625763, + "learning_rate": 1.1493945868945869e-05, + "loss": 0.019, + "step": 104300 + }, + { + "epoch": 0.7710446172496378, + "grad_norm": 0.07769617438316345, + "learning_rate": 1.1490236229819563e-05, + "loss": 0.0154, + "step": 104310 + }, + { + "epoch": 0.7711185358209397, + "grad_norm": 0.08433184027671814, + "learning_rate": 1.1486526590693258e-05, + "loss": 0.0173, + "step": 104320 + }, + { + "epoch": 0.7711924543922415, + "grad_norm": 0.08329308032989502, + "learning_rate": 1.1482816951566952e-05, + "loss": 0.0169, + "step": 104330 + }, + { + "epoch": 0.7712663729635434, + "grad_norm": 0.08364029228687286, + "learning_rate": 1.1479107312440646e-05, + "loss": 0.0163, + "step": 104340 + }, + { + "epoch": 0.7713402915348452, + "grad_norm": 0.10364638268947601, + "learning_rate": 1.147539767331434e-05, + "loss": 0.0169, + "step": 104350 + }, + { + "epoch": 0.7714142101061471, + "grad_norm": 0.08816733956336975, + "learning_rate": 1.1471688034188035e-05, + "loss": 0.0154, + "step": 104360 + }, + { + "epoch": 0.7714881286774489, + "grad_norm": 0.07040822505950928, + "learning_rate": 1.1467978395061729e-05, + "loss": 0.0154, + "step": 104370 + }, + { + "epoch": 0.7715620472487508, + "grad_norm": 0.06682794541120529, + "learning_rate": 1.1464268755935423e-05, + "loss": 0.0163, + "step": 104380 + }, + { + "epoch": 0.7716359658200527, + "grad_norm": 0.12376798689365387, + "learning_rate": 1.1460559116809118e-05, + "loss": 0.0168, + "step": 104390 + }, + { + "epoch": 0.7717098843913545, + "grad_norm": 0.09192962199449539, + "learning_rate": 1.1456849477682812e-05, + "loss": 0.017, + "step": 104400 + }, + { + "epoch": 0.7717838029626564, + "grad_norm": 0.09359622001647949, + "learning_rate": 1.1453139838556506e-05, + "loss": 0.0187, + "step": 104410 + }, + { + "epoch": 0.7718577215339582, + "grad_norm": 0.08074385672807693, + "learning_rate": 1.14494301994302e-05, + "loss": 0.0169, + "step": 104420 + }, + { + "epoch": 0.77193164010526, + "grad_norm": 0.0634748712182045, + "learning_rate": 1.1445720560303893e-05, + "loss": 0.0175, + "step": 104430 + }, + { + "epoch": 0.772005558676562, + "grad_norm": 0.07420934736728668, + "learning_rate": 1.1442010921177589e-05, + "loss": 0.0211, + "step": 104440 + }, + { + "epoch": 0.7720794772478637, + "grad_norm": 0.07205003499984741, + "learning_rate": 1.1438301282051282e-05, + "loss": 0.0177, + "step": 104450 + }, + { + "epoch": 0.7721533958191656, + "grad_norm": 0.09320352226495743, + "learning_rate": 1.1434591642924977e-05, + "loss": 0.0169, + "step": 104460 + }, + { + "epoch": 0.7722273143904674, + "grad_norm": 0.06319659948348999, + "learning_rate": 1.143088200379867e-05, + "loss": 0.0156, + "step": 104470 + }, + { + "epoch": 0.7723012329617693, + "grad_norm": 0.05548546463251114, + "learning_rate": 1.1427172364672366e-05, + "loss": 0.0166, + "step": 104480 + }, + { + "epoch": 0.7723751515330711, + "grad_norm": 0.08645886182785034, + "learning_rate": 1.1423462725546059e-05, + "loss": 0.0167, + "step": 104490 + }, + { + "epoch": 0.772449070104373, + "grad_norm": 0.10371587425470352, + "learning_rate": 1.1419753086419753e-05, + "loss": 0.0183, + "step": 104500 + }, + { + "epoch": 0.7725229886756749, + "grad_norm": 0.07877830415964127, + "learning_rate": 
1.1416043447293447e-05, + "loss": 0.0179, + "step": 104510 + }, + { + "epoch": 0.7725969072469767, + "grad_norm": 0.07617280632257462, + "learning_rate": 1.1412333808167141e-05, + "loss": 0.0182, + "step": 104520 + }, + { + "epoch": 0.7726708258182786, + "grad_norm": 0.06728853285312653, + "learning_rate": 1.1408624169040837e-05, + "loss": 0.0191, + "step": 104530 + }, + { + "epoch": 0.7727447443895804, + "grad_norm": 0.07350888848304749, + "learning_rate": 1.140491452991453e-05, + "loss": 0.0171, + "step": 104540 + }, + { + "epoch": 0.7728186629608823, + "grad_norm": 0.07082013785839081, + "learning_rate": 1.1401204890788226e-05, + "loss": 0.0214, + "step": 104550 + }, + { + "epoch": 0.7728925815321841, + "grad_norm": 0.09921420365571976, + "learning_rate": 1.1397495251661919e-05, + "loss": 0.0184, + "step": 104560 + }, + { + "epoch": 0.772966500103486, + "grad_norm": 0.058607906103134155, + "learning_rate": 1.1393785612535614e-05, + "loss": 0.0177, + "step": 104570 + }, + { + "epoch": 0.7730404186747879, + "grad_norm": 0.06936301290988922, + "learning_rate": 1.1390075973409307e-05, + "loss": 0.018, + "step": 104580 + }, + { + "epoch": 0.7731143372460897, + "grad_norm": 0.07501678913831711, + "learning_rate": 1.1386366334283001e-05, + "loss": 0.0168, + "step": 104590 + }, + { + "epoch": 0.7731882558173916, + "grad_norm": 0.08128910511732101, + "learning_rate": 1.1382656695156696e-05, + "loss": 0.0176, + "step": 104600 + }, + { + "epoch": 0.7732621743886934, + "grad_norm": 0.09496302157640457, + "learning_rate": 1.137894705603039e-05, + "loss": 0.0149, + "step": 104610 + }, + { + "epoch": 0.7733360929599953, + "grad_norm": 0.0731099471449852, + "learning_rate": 1.1375237416904084e-05, + "loss": 0.0165, + "step": 104620 + }, + { + "epoch": 0.7734100115312971, + "grad_norm": 0.09244808554649353, + "learning_rate": 1.1371527777777778e-05, + "loss": 0.018, + "step": 104630 + }, + { + "epoch": 0.773483930102599, + "grad_norm": 0.10306263715028763, + "learning_rate": 1.1367818138651473e-05, + "loss": 0.0182, + "step": 104640 + }, + { + "epoch": 0.7735578486739009, + "grad_norm": 0.06824192404747009, + "learning_rate": 1.1364108499525167e-05, + "loss": 0.0176, + "step": 104650 + }, + { + "epoch": 0.7736317672452027, + "grad_norm": 0.07538828998804092, + "learning_rate": 1.136039886039886e-05, + "loss": 0.0166, + "step": 104660 + }, + { + "epoch": 0.7737056858165046, + "grad_norm": 0.06246098130941391, + "learning_rate": 1.1356689221272556e-05, + "loss": 0.0171, + "step": 104670 + }, + { + "epoch": 0.7737796043878064, + "grad_norm": 0.09654530882835388, + "learning_rate": 1.135297958214625e-05, + "loss": 0.0166, + "step": 104680 + }, + { + "epoch": 0.7738535229591083, + "grad_norm": 0.06581386178731918, + "learning_rate": 1.1349269943019944e-05, + "loss": 0.017, + "step": 104690 + }, + { + "epoch": 0.7739274415304102, + "grad_norm": 0.06344723701477051, + "learning_rate": 1.1345560303893638e-05, + "loss": 0.0165, + "step": 104700 + }, + { + "epoch": 0.7740013601017119, + "grad_norm": 0.07966010272502899, + "learning_rate": 1.1341850664767333e-05, + "loss": 0.0164, + "step": 104710 + }, + { + "epoch": 0.7740752786730138, + "grad_norm": 0.07721570134162903, + "learning_rate": 1.1338141025641027e-05, + "loss": 0.0158, + "step": 104720 + }, + { + "epoch": 0.7741491972443156, + "grad_norm": 0.07869109511375427, + "learning_rate": 1.133443138651472e-05, + "loss": 0.0159, + "step": 104730 + }, + { + "epoch": 0.7742231158156175, + "grad_norm": 0.07940968871116638, + "learning_rate": 
1.1330721747388415e-05, + "loss": 0.0171, + "step": 104740 + }, + { + "epoch": 0.7742970343869193, + "grad_norm": 0.05805268883705139, + "learning_rate": 1.1327012108262108e-05, + "loss": 0.017, + "step": 104750 + }, + { + "epoch": 0.7743709529582212, + "grad_norm": 0.09983490407466888, + "learning_rate": 1.1323302469135804e-05, + "loss": 0.0156, + "step": 104760 + }, + { + "epoch": 0.7744448715295231, + "grad_norm": 0.09040165692567825, + "learning_rate": 1.1319592830009497e-05, + "loss": 0.0178, + "step": 104770 + }, + { + "epoch": 0.7745187901008249, + "grad_norm": 0.08260339498519897, + "learning_rate": 1.1315883190883193e-05, + "loss": 0.017, + "step": 104780 + }, + { + "epoch": 0.7745927086721268, + "grad_norm": 0.09427991509437561, + "learning_rate": 1.1312173551756885e-05, + "loss": 0.0208, + "step": 104790 + }, + { + "epoch": 0.7746666272434286, + "grad_norm": 0.07443516701459885, + "learning_rate": 1.1308463912630581e-05, + "loss": 0.0196, + "step": 104800 + }, + { + "epoch": 0.7747405458147305, + "grad_norm": 0.06800392270088196, + "learning_rate": 1.1304754273504274e-05, + "loss": 0.0178, + "step": 104810 + }, + { + "epoch": 0.7748144643860323, + "grad_norm": 0.060037799179553986, + "learning_rate": 1.1301044634377968e-05, + "loss": 0.0176, + "step": 104820 + }, + { + "epoch": 0.7748883829573342, + "grad_norm": 0.0839676484465599, + "learning_rate": 1.1297334995251662e-05, + "loss": 0.0161, + "step": 104830 + }, + { + "epoch": 0.7749623015286361, + "grad_norm": 0.05880623683333397, + "learning_rate": 1.1293625356125356e-05, + "loss": 0.0164, + "step": 104840 + }, + { + "epoch": 0.7750362200999379, + "grad_norm": 0.0863754004240036, + "learning_rate": 1.128991571699905e-05, + "loss": 0.0153, + "step": 104850 + }, + { + "epoch": 0.7751101386712398, + "grad_norm": 0.08357366174459457, + "learning_rate": 1.1286206077872745e-05, + "loss": 0.0176, + "step": 104860 + }, + { + "epoch": 0.7751840572425416, + "grad_norm": 0.0565825030207634, + "learning_rate": 1.128249643874644e-05, + "loss": 0.0157, + "step": 104870 + }, + { + "epoch": 0.7752579758138435, + "grad_norm": 0.061214566230773926, + "learning_rate": 1.1278786799620134e-05, + "loss": 0.0171, + "step": 104880 + }, + { + "epoch": 0.7753318943851453, + "grad_norm": 0.10586986690759659, + "learning_rate": 1.1275077160493828e-05, + "loss": 0.0171, + "step": 104890 + }, + { + "epoch": 0.7754058129564472, + "grad_norm": 0.08357368409633636, + "learning_rate": 1.1271367521367522e-05, + "loss": 0.016, + "step": 104900 + }, + { + "epoch": 0.7754797315277491, + "grad_norm": 0.09002059698104858, + "learning_rate": 1.1267657882241216e-05, + "loss": 0.0178, + "step": 104910 + }, + { + "epoch": 0.7755536500990509, + "grad_norm": 0.06980524212121964, + "learning_rate": 1.126394824311491e-05, + "loss": 0.018, + "step": 104920 + }, + { + "epoch": 0.7756275686703528, + "grad_norm": 0.07796944677829742, + "learning_rate": 1.1260238603988605e-05, + "loss": 0.0176, + "step": 104930 + }, + { + "epoch": 0.7757014872416546, + "grad_norm": 0.053704190999269485, + "learning_rate": 1.12565289648623e-05, + "loss": 0.0148, + "step": 104940 + }, + { + "epoch": 0.7757754058129565, + "grad_norm": 0.06982135772705078, + "learning_rate": 1.1252819325735993e-05, + "loss": 0.0183, + "step": 104950 + }, + { + "epoch": 0.7758493243842584, + "grad_norm": 0.07299453765153885, + "learning_rate": 1.1249109686609686e-05, + "loss": 0.0167, + "step": 104960 + }, + { + "epoch": 0.7759232429555601, + "grad_norm": 0.10618864744901657, + "learning_rate": 
1.1245400047483382e-05, + "loss": 0.0183, + "step": 104970 + }, + { + "epoch": 0.775997161526862, + "grad_norm": 0.08143940567970276, + "learning_rate": 1.1241690408357075e-05, + "loss": 0.0186, + "step": 104980 + }, + { + "epoch": 0.7760710800981638, + "grad_norm": 0.06453964114189148, + "learning_rate": 1.123798076923077e-05, + "loss": 0.0172, + "step": 104990 + }, + { + "epoch": 0.7761449986694657, + "grad_norm": 0.08334316313266754, + "learning_rate": 1.1234271130104463e-05, + "loss": 0.0174, + "step": 105000 + }, + { + "epoch": 0.7762189172407675, + "grad_norm": 0.06593699008226395, + "learning_rate": 1.1230561490978159e-05, + "loss": 0.0171, + "step": 105010 + }, + { + "epoch": 0.7762928358120694, + "grad_norm": 0.0909162312746048, + "learning_rate": 1.1226851851851852e-05, + "loss": 0.0169, + "step": 105020 + }, + { + "epoch": 0.7763667543833713, + "grad_norm": 0.07157652080059052, + "learning_rate": 1.1223142212725548e-05, + "loss": 0.0168, + "step": 105030 + }, + { + "epoch": 0.7764406729546731, + "grad_norm": 0.08083537966012955, + "learning_rate": 1.121943257359924e-05, + "loss": 0.0161, + "step": 105040 + }, + { + "epoch": 0.776514591525975, + "grad_norm": 0.0855388268828392, + "learning_rate": 1.1215722934472935e-05, + "loss": 0.0166, + "step": 105050 + }, + { + "epoch": 0.7765885100972768, + "grad_norm": 0.06755349785089493, + "learning_rate": 1.1212013295346629e-05, + "loss": 0.0164, + "step": 105060 + }, + { + "epoch": 0.7766624286685787, + "grad_norm": 0.07067691534757614, + "learning_rate": 1.1208303656220323e-05, + "loss": 0.014, + "step": 105070 + }, + { + "epoch": 0.7767363472398805, + "grad_norm": 0.11115356534719467, + "learning_rate": 1.1204594017094017e-05, + "loss": 0.0155, + "step": 105080 + }, + { + "epoch": 0.7768102658111824, + "grad_norm": 0.05139143392443657, + "learning_rate": 1.1200884377967712e-05, + "loss": 0.0155, + "step": 105090 + }, + { + "epoch": 0.7768841843824843, + "grad_norm": 0.08463030308485031, + "learning_rate": 1.1197174738841406e-05, + "loss": 0.0169, + "step": 105100 + }, + { + "epoch": 0.7769581029537861, + "grad_norm": 0.06260386109352112, + "learning_rate": 1.11934650997151e-05, + "loss": 0.0169, + "step": 105110 + }, + { + "epoch": 0.777032021525088, + "grad_norm": 0.06074802950024605, + "learning_rate": 1.1189755460588794e-05, + "loss": 0.0175, + "step": 105120 + }, + { + "epoch": 0.7771059400963898, + "grad_norm": 0.06441828608512878, + "learning_rate": 1.1186045821462489e-05, + "loss": 0.0202, + "step": 105130 + }, + { + "epoch": 0.7771798586676917, + "grad_norm": 0.07338294386863708, + "learning_rate": 1.1182336182336183e-05, + "loss": 0.0157, + "step": 105140 + }, + { + "epoch": 0.7772537772389935, + "grad_norm": 0.07631634920835495, + "learning_rate": 1.1178626543209877e-05, + "loss": 0.0182, + "step": 105150 + }, + { + "epoch": 0.7773276958102954, + "grad_norm": 0.08145643770694733, + "learning_rate": 1.1174916904083572e-05, + "loss": 0.0182, + "step": 105160 + }, + { + "epoch": 0.7774016143815973, + "grad_norm": 0.07141675055027008, + "learning_rate": 1.1171207264957266e-05, + "loss": 0.0182, + "step": 105170 + }, + { + "epoch": 0.7774755329528991, + "grad_norm": 0.14300119876861572, + "learning_rate": 1.116749762583096e-05, + "loss": 0.0201, + "step": 105180 + }, + { + "epoch": 0.777549451524201, + "grad_norm": 0.07788577675819397, + "learning_rate": 1.1163787986704653e-05, + "loss": 0.0173, + "step": 105190 + }, + { + "epoch": 0.7776233700955028, + "grad_norm": 0.056850410997867584, + "learning_rate": 1.1160078347578349e-05, 
+ "loss": 0.0169, + "step": 105200 + }, + { + "epoch": 0.7776972886668047, + "grad_norm": 0.08336438238620758, + "learning_rate": 1.1156368708452041e-05, + "loss": 0.0174, + "step": 105210 + }, + { + "epoch": 0.7777712072381066, + "grad_norm": 0.10761765390634537, + "learning_rate": 1.1152659069325737e-05, + "loss": 0.0186, + "step": 105220 + }, + { + "epoch": 0.7778451258094083, + "grad_norm": 0.07789470255374908, + "learning_rate": 1.114894943019943e-05, + "loss": 0.0158, + "step": 105230 + }, + { + "epoch": 0.7779190443807102, + "grad_norm": 0.0754820704460144, + "learning_rate": 1.1145239791073126e-05, + "loss": 0.0189, + "step": 105240 + }, + { + "epoch": 0.777992962952012, + "grad_norm": 0.06808315217494965, + "learning_rate": 1.1141530151946818e-05, + "loss": 0.0175, + "step": 105250 + }, + { + "epoch": 0.7780668815233139, + "grad_norm": 0.06405656784772873, + "learning_rate": 1.1137820512820514e-05, + "loss": 0.0186, + "step": 105260 + }, + { + "epoch": 0.7781408000946157, + "grad_norm": 0.06838709115982056, + "learning_rate": 1.1134110873694207e-05, + "loss": 0.0167, + "step": 105270 + }, + { + "epoch": 0.7782147186659176, + "grad_norm": 0.09492330998182297, + "learning_rate": 1.1130401234567901e-05, + "loss": 0.0184, + "step": 105280 + }, + { + "epoch": 0.7782886372372195, + "grad_norm": 0.05190137028694153, + "learning_rate": 1.1126691595441597e-05, + "loss": 0.0162, + "step": 105290 + }, + { + "epoch": 0.7783625558085213, + "grad_norm": 0.106864333152771, + "learning_rate": 1.112298195631529e-05, + "loss": 0.0168, + "step": 105300 + }, + { + "epoch": 0.7784364743798232, + "grad_norm": 0.0951375812292099, + "learning_rate": 1.1119272317188986e-05, + "loss": 0.0157, + "step": 105310 + }, + { + "epoch": 0.778510392951125, + "grad_norm": 0.06838471442461014, + "learning_rate": 1.1115562678062678e-05, + "loss": 0.0172, + "step": 105320 + }, + { + "epoch": 0.7785843115224269, + "grad_norm": 0.09433237463235855, + "learning_rate": 1.1111853038936374e-05, + "loss": 0.0182, + "step": 105330 + }, + { + "epoch": 0.7786582300937287, + "grad_norm": 0.0777023509144783, + "learning_rate": 1.1108143399810067e-05, + "loss": 0.017, + "step": 105340 + }, + { + "epoch": 0.7787321486650306, + "grad_norm": 0.10564722865819931, + "learning_rate": 1.1104433760683761e-05, + "loss": 0.0186, + "step": 105350 + }, + { + "epoch": 0.7788060672363325, + "grad_norm": 0.09359659254550934, + "learning_rate": 1.1100724121557455e-05, + "loss": 0.0193, + "step": 105360 + }, + { + "epoch": 0.7788799858076343, + "grad_norm": 0.06303531676530838, + "learning_rate": 1.109701448243115e-05, + "loss": 0.0158, + "step": 105370 + }, + { + "epoch": 0.7789539043789362, + "grad_norm": 0.07465647161006927, + "learning_rate": 1.1093304843304844e-05, + "loss": 0.0165, + "step": 105380 + }, + { + "epoch": 0.779027822950238, + "grad_norm": 0.08835309743881226, + "learning_rate": 1.1089595204178538e-05, + "loss": 0.0161, + "step": 105390 + }, + { + "epoch": 0.7791017415215399, + "grad_norm": 0.0837300568819046, + "learning_rate": 1.1085885565052232e-05, + "loss": 0.0167, + "step": 105400 + }, + { + "epoch": 0.7791756600928417, + "grad_norm": 0.06378339231014252, + "learning_rate": 1.1082175925925927e-05, + "loss": 0.0173, + "step": 105410 + }, + { + "epoch": 0.7792495786641436, + "grad_norm": 0.08560236543416977, + "learning_rate": 1.107846628679962e-05, + "loss": 0.0198, + "step": 105420 + }, + { + "epoch": 0.7793234972354455, + "grad_norm": 0.055065080523490906, + "learning_rate": 1.1074756647673315e-05, + "loss": 0.016, + 
"step": 105430 + }, + { + "epoch": 0.7793974158067473, + "grad_norm": 0.08921148627996445, + "learning_rate": 1.1071047008547008e-05, + "loss": 0.0145, + "step": 105440 + }, + { + "epoch": 0.7794713343780492, + "grad_norm": 0.07413645088672638, + "learning_rate": 1.1067337369420704e-05, + "loss": 0.0187, + "step": 105450 + }, + { + "epoch": 0.779545252949351, + "grad_norm": 0.06614266335964203, + "learning_rate": 1.1063627730294398e-05, + "loss": 0.0184, + "step": 105460 + }, + { + "epoch": 0.7796191715206529, + "grad_norm": 0.08686352521181107, + "learning_rate": 1.1059918091168092e-05, + "loss": 0.0158, + "step": 105470 + }, + { + "epoch": 0.7796930900919548, + "grad_norm": 0.10010645538568497, + "learning_rate": 1.1056208452041787e-05, + "loss": 0.0181, + "step": 105480 + }, + { + "epoch": 0.7797670086632565, + "grad_norm": 0.09116895496845245, + "learning_rate": 1.105249881291548e-05, + "loss": 0.0166, + "step": 105490 + }, + { + "epoch": 0.7798409272345584, + "grad_norm": 0.06380877643823624, + "learning_rate": 1.1048789173789175e-05, + "loss": 0.0168, + "step": 105500 + }, + { + "epoch": 0.7799148458058602, + "grad_norm": 0.09583408385515213, + "learning_rate": 1.1045079534662868e-05, + "loss": 0.0185, + "step": 105510 + }, + { + "epoch": 0.7799887643771621, + "grad_norm": 0.05529385805130005, + "learning_rate": 1.1041369895536564e-05, + "loss": 0.015, + "step": 105520 + }, + { + "epoch": 0.7800626829484639, + "grad_norm": 0.06574016809463501, + "learning_rate": 1.1037660256410256e-05, + "loss": 0.017, + "step": 105530 + }, + { + "epoch": 0.7801366015197658, + "grad_norm": 0.08715861290693283, + "learning_rate": 1.1033950617283952e-05, + "loss": 0.0175, + "step": 105540 + }, + { + "epoch": 0.7802105200910677, + "grad_norm": 0.08664470165967941, + "learning_rate": 1.1030240978157645e-05, + "loss": 0.0163, + "step": 105550 + }, + { + "epoch": 0.7802844386623695, + "grad_norm": 0.08922585099935532, + "learning_rate": 1.102653133903134e-05, + "loss": 0.0143, + "step": 105560 + }, + { + "epoch": 0.7803583572336714, + "grad_norm": 0.07631973177194595, + "learning_rate": 1.1022821699905033e-05, + "loss": 0.0177, + "step": 105570 + }, + { + "epoch": 0.7804322758049732, + "grad_norm": 0.08425720036029816, + "learning_rate": 1.1019112060778728e-05, + "loss": 0.0182, + "step": 105580 + }, + { + "epoch": 0.7805061943762751, + "grad_norm": 0.06511543691158295, + "learning_rate": 1.1015402421652422e-05, + "loss": 0.0151, + "step": 105590 + }, + { + "epoch": 0.7805801129475769, + "grad_norm": 0.0878303050994873, + "learning_rate": 1.1011692782526116e-05, + "loss": 0.0177, + "step": 105600 + }, + { + "epoch": 0.7806540315188788, + "grad_norm": 0.09406014531850815, + "learning_rate": 1.100798314339981e-05, + "loss": 0.0181, + "step": 105610 + }, + { + "epoch": 0.7807279500901807, + "grad_norm": 0.07079024612903595, + "learning_rate": 1.1004273504273505e-05, + "loss": 0.0137, + "step": 105620 + }, + { + "epoch": 0.7808018686614825, + "grad_norm": 0.08367685228586197, + "learning_rate": 1.1000563865147199e-05, + "loss": 0.0167, + "step": 105630 + }, + { + "epoch": 0.7808757872327844, + "grad_norm": 0.06964623928070068, + "learning_rate": 1.0996854226020893e-05, + "loss": 0.0154, + "step": 105640 + }, + { + "epoch": 0.7809497058040862, + "grad_norm": 0.0795077458024025, + "learning_rate": 1.0993144586894587e-05, + "loss": 0.0165, + "step": 105650 + }, + { + "epoch": 0.7810236243753881, + "grad_norm": 0.0842127576470375, + "learning_rate": 1.0989434947768282e-05, + "loss": 0.018, + "step": 105660 + }, + 
{ + "epoch": 0.7810975429466899, + "grad_norm": 0.0904654860496521, + "learning_rate": 1.0985725308641976e-05, + "loss": 0.0182, + "step": 105670 + }, + { + "epoch": 0.7811714615179918, + "grad_norm": 0.06659997254610062, + "learning_rate": 1.098201566951567e-05, + "loss": 0.0179, + "step": 105680 + }, + { + "epoch": 0.7812453800892937, + "grad_norm": 0.19042523205280304, + "learning_rate": 1.0978306030389365e-05, + "loss": 0.0189, + "step": 105690 + }, + { + "epoch": 0.7813192986605955, + "grad_norm": 0.09224945306777954, + "learning_rate": 1.0974596391263059e-05, + "loss": 0.0168, + "step": 105700 + }, + { + "epoch": 0.7813932172318974, + "grad_norm": 0.07589827477931976, + "learning_rate": 1.0970886752136753e-05, + "loss": 0.018, + "step": 105710 + }, + { + "epoch": 0.7814671358031992, + "grad_norm": 0.08175301551818848, + "learning_rate": 1.0967177113010447e-05, + "loss": 0.0178, + "step": 105720 + }, + { + "epoch": 0.781541054374501, + "grad_norm": 0.06693083047866821, + "learning_rate": 1.0963467473884142e-05, + "loss": 0.0153, + "step": 105730 + }, + { + "epoch": 0.781614972945803, + "grad_norm": 0.07367260009050369, + "learning_rate": 1.0959757834757834e-05, + "loss": 0.0151, + "step": 105740 + }, + { + "epoch": 0.7816888915171047, + "grad_norm": 0.08174421638250351, + "learning_rate": 1.095604819563153e-05, + "loss": 0.0183, + "step": 105750 + }, + { + "epoch": 0.7817628100884066, + "grad_norm": 0.06560919433832169, + "learning_rate": 1.0952338556505223e-05, + "loss": 0.0165, + "step": 105760 + }, + { + "epoch": 0.7818367286597084, + "grad_norm": 0.07756762206554413, + "learning_rate": 1.0948628917378919e-05, + "loss": 0.0195, + "step": 105770 + }, + { + "epoch": 0.7819106472310103, + "grad_norm": 0.1017007902264595, + "learning_rate": 1.0944919278252611e-05, + "loss": 0.0179, + "step": 105780 + }, + { + "epoch": 0.7819845658023121, + "grad_norm": 0.07805784791707993, + "learning_rate": 1.0941209639126307e-05, + "loss": 0.0167, + "step": 105790 + }, + { + "epoch": 0.782058484373614, + "grad_norm": 0.08423495292663574, + "learning_rate": 1.09375e-05, + "loss": 0.0193, + "step": 105800 + }, + { + "epoch": 0.7821324029449159, + "grad_norm": 0.05597223341464996, + "learning_rate": 1.0933790360873694e-05, + "loss": 0.0154, + "step": 105810 + }, + { + "epoch": 0.7822063215162177, + "grad_norm": 0.06490608304738998, + "learning_rate": 1.0930080721747388e-05, + "loss": 0.0191, + "step": 105820 + }, + { + "epoch": 0.7822802400875196, + "grad_norm": 0.0634688213467598, + "learning_rate": 1.0926371082621083e-05, + "loss": 0.0168, + "step": 105830 + }, + { + "epoch": 0.7823541586588214, + "grad_norm": 0.07929351180791855, + "learning_rate": 1.0922661443494777e-05, + "loss": 0.0173, + "step": 105840 + }, + { + "epoch": 0.7824280772301233, + "grad_norm": 0.04854271933436394, + "learning_rate": 1.0918951804368471e-05, + "loss": 0.0149, + "step": 105850 + }, + { + "epoch": 0.7825019958014251, + "grad_norm": 0.07452704757452011, + "learning_rate": 1.0915242165242166e-05, + "loss": 0.0154, + "step": 105860 + }, + { + "epoch": 0.782575914372727, + "grad_norm": 0.06969328224658966, + "learning_rate": 1.091153252611586e-05, + "loss": 0.017, + "step": 105870 + }, + { + "epoch": 0.7826498329440289, + "grad_norm": 0.07368597388267517, + "learning_rate": 1.0907822886989554e-05, + "loss": 0.0154, + "step": 105880 + }, + { + "epoch": 0.7827237515153307, + "grad_norm": 0.06966342031955719, + "learning_rate": 1.0904113247863248e-05, + "loss": 0.0188, + "step": 105890 + }, + { + "epoch": 0.7827976700866326, + 
"grad_norm": 0.06735853850841522, + "learning_rate": 1.0900403608736943e-05, + "loss": 0.0172, + "step": 105900 + }, + { + "epoch": 0.7828715886579344, + "grad_norm": 0.08098297566175461, + "learning_rate": 1.0896693969610637e-05, + "loss": 0.0145, + "step": 105910 + }, + { + "epoch": 0.7829455072292363, + "grad_norm": 0.09270311146974564, + "learning_rate": 1.0892984330484331e-05, + "loss": 0.017, + "step": 105920 + }, + { + "epoch": 0.7830194258005382, + "grad_norm": 0.09456925839185715, + "learning_rate": 1.0889274691358025e-05, + "loss": 0.0157, + "step": 105930 + }, + { + "epoch": 0.78309334437184, + "grad_norm": 0.0878613069653511, + "learning_rate": 1.088556505223172e-05, + "loss": 0.0197, + "step": 105940 + }, + { + "epoch": 0.7831672629431419, + "grad_norm": 0.06533868610858917, + "learning_rate": 1.0881855413105414e-05, + "loss": 0.0162, + "step": 105950 + }, + { + "epoch": 0.7832411815144437, + "grad_norm": 0.11173345893621445, + "learning_rate": 1.0878145773979108e-05, + "loss": 0.021, + "step": 105960 + }, + { + "epoch": 0.7833151000857456, + "grad_norm": 0.10665670782327652, + "learning_rate": 1.08744361348528e-05, + "loss": 0.0177, + "step": 105970 + }, + { + "epoch": 0.7833890186570474, + "grad_norm": 0.09183719754219055, + "learning_rate": 1.0870726495726497e-05, + "loss": 0.0196, + "step": 105980 + }, + { + "epoch": 0.7834629372283493, + "grad_norm": 0.0717807337641716, + "learning_rate": 1.086701685660019e-05, + "loss": 0.0184, + "step": 105990 + }, + { + "epoch": 0.7835368557996512, + "grad_norm": 0.09864635765552521, + "learning_rate": 1.0863307217473885e-05, + "loss": 0.0184, + "step": 106000 + }, + { + "epoch": 0.783610774370953, + "grad_norm": 0.08241227269172668, + "learning_rate": 1.0859597578347578e-05, + "loss": 0.0175, + "step": 106010 + }, + { + "epoch": 0.7836846929422548, + "grad_norm": 0.10284945368766785, + "learning_rate": 1.0855887939221274e-05, + "loss": 0.0162, + "step": 106020 + }, + { + "epoch": 0.7837586115135566, + "grad_norm": 0.07604247331619263, + "learning_rate": 1.0852178300094966e-05, + "loss": 0.0181, + "step": 106030 + }, + { + "epoch": 0.7838325300848585, + "grad_norm": 0.06977668404579163, + "learning_rate": 1.084846866096866e-05, + "loss": 0.0169, + "step": 106040 + }, + { + "epoch": 0.7839064486561603, + "grad_norm": 0.08929763734340668, + "learning_rate": 1.0844759021842357e-05, + "loss": 0.0212, + "step": 106050 + }, + { + "epoch": 0.7839803672274622, + "grad_norm": 0.08664583414793015, + "learning_rate": 1.084104938271605e-05, + "loss": 0.0178, + "step": 106060 + }, + { + "epoch": 0.7840542857987641, + "grad_norm": 0.086372509598732, + "learning_rate": 1.0837339743589745e-05, + "loss": 0.0169, + "step": 106070 + }, + { + "epoch": 0.7841282043700659, + "grad_norm": 0.09958035498857498, + "learning_rate": 1.0833630104463438e-05, + "loss": 0.0172, + "step": 106080 + }, + { + "epoch": 0.7842021229413678, + "grad_norm": 0.08798729628324509, + "learning_rate": 1.0829920465337134e-05, + "loss": 0.0161, + "step": 106090 + }, + { + "epoch": 0.7842760415126696, + "grad_norm": 0.07195137441158295, + "learning_rate": 1.0826210826210826e-05, + "loss": 0.0154, + "step": 106100 + }, + { + "epoch": 0.7843499600839715, + "grad_norm": 0.07254501432180405, + "learning_rate": 1.082250118708452e-05, + "loss": 0.0176, + "step": 106110 + }, + { + "epoch": 0.7844238786552733, + "grad_norm": 0.10619547963142395, + "learning_rate": 1.0818791547958215e-05, + "loss": 0.0156, + "step": 106120 + }, + { + "epoch": 0.7844977972265752, + "grad_norm": 
0.0825144425034523, + "learning_rate": 1.081508190883191e-05, + "loss": 0.0175, + "step": 106130 + }, + { + "epoch": 0.7845717157978771, + "grad_norm": 0.06593617796897888, + "learning_rate": 1.0811372269705603e-05, + "loss": 0.0173, + "step": 106140 + }, + { + "epoch": 0.7846456343691789, + "grad_norm": 0.07409842312335968, + "learning_rate": 1.0807662630579298e-05, + "loss": 0.0187, + "step": 106150 + }, + { + "epoch": 0.7847195529404808, + "grad_norm": 0.06352999806404114, + "learning_rate": 1.0803952991452992e-05, + "loss": 0.0169, + "step": 106160 + }, + { + "epoch": 0.7847934715117826, + "grad_norm": 0.08738528192043304, + "learning_rate": 1.0800243352326686e-05, + "loss": 0.0171, + "step": 106170 + }, + { + "epoch": 0.7848673900830845, + "grad_norm": 0.10091836750507355, + "learning_rate": 1.079653371320038e-05, + "loss": 0.0178, + "step": 106180 + }, + { + "epoch": 0.7849413086543864, + "grad_norm": 0.07956136763095856, + "learning_rate": 1.0792824074074075e-05, + "loss": 0.0183, + "step": 106190 + }, + { + "epoch": 0.7850152272256882, + "grad_norm": 0.07380006462335587, + "learning_rate": 1.0789114434947767e-05, + "loss": 0.0143, + "step": 106200 + }, + { + "epoch": 0.7850891457969901, + "grad_norm": 0.07521267235279083, + "learning_rate": 1.0785404795821463e-05, + "loss": 0.0147, + "step": 106210 + }, + { + "epoch": 0.7851630643682919, + "grad_norm": 0.07009510695934296, + "learning_rate": 1.0781695156695158e-05, + "loss": 0.0172, + "step": 106220 + }, + { + "epoch": 0.7852369829395938, + "grad_norm": 0.06240467727184296, + "learning_rate": 1.0777985517568852e-05, + "loss": 0.0164, + "step": 106230 + }, + { + "epoch": 0.7853109015108956, + "grad_norm": 0.07551458477973938, + "learning_rate": 1.0774275878442546e-05, + "loss": 0.0156, + "step": 106240 + }, + { + "epoch": 0.7853848200821975, + "grad_norm": 0.06955337524414062, + "learning_rate": 1.077056623931624e-05, + "loss": 0.0179, + "step": 106250 + }, + { + "epoch": 0.7854587386534994, + "grad_norm": 0.07557106763124466, + "learning_rate": 1.0766856600189935e-05, + "loss": 0.0153, + "step": 106260 + }, + { + "epoch": 0.7855326572248011, + "grad_norm": 0.10826913267374039, + "learning_rate": 1.0763146961063627e-05, + "loss": 0.0179, + "step": 106270 + }, + { + "epoch": 0.785606575796103, + "grad_norm": 0.09861350804567337, + "learning_rate": 1.0759437321937323e-05, + "loss": 0.0162, + "step": 106280 + }, + { + "epoch": 0.7856804943674048, + "grad_norm": 0.08968888968229294, + "learning_rate": 1.0755727682811016e-05, + "loss": 0.0172, + "step": 106290 + }, + { + "epoch": 0.7857544129387067, + "grad_norm": 0.07313573360443115, + "learning_rate": 1.0752018043684712e-05, + "loss": 0.0173, + "step": 106300 + }, + { + "epoch": 0.7858283315100085, + "grad_norm": 0.06940294802188873, + "learning_rate": 1.0748308404558404e-05, + "loss": 0.0166, + "step": 106310 + }, + { + "epoch": 0.7859022500813104, + "grad_norm": 0.10839875042438507, + "learning_rate": 1.07445987654321e-05, + "loss": 0.0177, + "step": 106320 + }, + { + "epoch": 0.7859761686526123, + "grad_norm": 0.06451968848705292, + "learning_rate": 1.0740889126305793e-05, + "loss": 0.0162, + "step": 106330 + }, + { + "epoch": 0.7860500872239141, + "grad_norm": 0.08700212091207504, + "learning_rate": 1.0737179487179487e-05, + "loss": 0.0173, + "step": 106340 + }, + { + "epoch": 0.786124005795216, + "grad_norm": 0.06500065326690674, + "learning_rate": 1.0733469848053182e-05, + "loss": 0.0173, + "step": 106350 + }, + { + "epoch": 0.7861979243665178, + "grad_norm": 0.07031098753213882, 
+ "learning_rate": 1.0729760208926876e-05, + "loss": 0.0161, + "step": 106360 + }, + { + "epoch": 0.7862718429378197, + "grad_norm": 0.05699130520224571, + "learning_rate": 1.072605056980057e-05, + "loss": 0.017, + "step": 106370 + }, + { + "epoch": 0.7863457615091215, + "grad_norm": 0.05871216580271721, + "learning_rate": 1.0722340930674264e-05, + "loss": 0.016, + "step": 106380 + }, + { + "epoch": 0.7864196800804234, + "grad_norm": 0.0568096786737442, + "learning_rate": 1.0718631291547959e-05, + "loss": 0.0147, + "step": 106390 + }, + { + "epoch": 0.7864935986517253, + "grad_norm": 0.08190196007490158, + "learning_rate": 1.0714921652421653e-05, + "loss": 0.0177, + "step": 106400 + }, + { + "epoch": 0.7865675172230271, + "grad_norm": 0.07658049464225769, + "learning_rate": 1.0711212013295347e-05, + "loss": 0.0161, + "step": 106410 + }, + { + "epoch": 0.786641435794329, + "grad_norm": 0.0688110813498497, + "learning_rate": 1.0707502374169041e-05, + "loss": 0.0172, + "step": 106420 + }, + { + "epoch": 0.7867153543656308, + "grad_norm": 0.07113152742385864, + "learning_rate": 1.0703792735042736e-05, + "loss": 0.018, + "step": 106430 + }, + { + "epoch": 0.7867892729369327, + "grad_norm": 0.10106844455003738, + "learning_rate": 1.070008309591643e-05, + "loss": 0.017, + "step": 106440 + }, + { + "epoch": 0.7868631915082346, + "grad_norm": 0.07266218960285187, + "learning_rate": 1.0696373456790124e-05, + "loss": 0.0178, + "step": 106450 + }, + { + "epoch": 0.7869371100795364, + "grad_norm": 0.07605311274528503, + "learning_rate": 1.0692663817663819e-05, + "loss": 0.018, + "step": 106460 + }, + { + "epoch": 0.7870110286508383, + "grad_norm": 0.08731962740421295, + "learning_rate": 1.0688954178537513e-05, + "loss": 0.0148, + "step": 106470 + }, + { + "epoch": 0.7870849472221401, + "grad_norm": 0.08087794482707977, + "learning_rate": 1.0685244539411207e-05, + "loss": 0.0183, + "step": 106480 + }, + { + "epoch": 0.787158865793442, + "grad_norm": 0.08370697498321533, + "learning_rate": 1.0681534900284901e-05, + "loss": 0.0197, + "step": 106490 + }, + { + "epoch": 0.7872327843647438, + "grad_norm": 0.08696702122688293, + "learning_rate": 1.0677825261158594e-05, + "loss": 0.0147, + "step": 106500 + }, + { + "epoch": 0.7873067029360457, + "grad_norm": 0.08174639195203781, + "learning_rate": 1.067411562203229e-05, + "loss": 0.0163, + "step": 106510 + }, + { + "epoch": 0.7873806215073476, + "grad_norm": 0.06434164196252823, + "learning_rate": 1.0670405982905982e-05, + "loss": 0.0155, + "step": 106520 + }, + { + "epoch": 0.7874545400786493, + "grad_norm": 0.07333972305059433, + "learning_rate": 1.0666696343779678e-05, + "loss": 0.0173, + "step": 106530 + }, + { + "epoch": 0.7875284586499512, + "grad_norm": 0.08010855317115784, + "learning_rate": 1.0662986704653371e-05, + "loss": 0.0189, + "step": 106540 + }, + { + "epoch": 0.787602377221253, + "grad_norm": 0.09208738803863525, + "learning_rate": 1.0659277065527067e-05, + "loss": 0.0172, + "step": 106550 + }, + { + "epoch": 0.7876762957925549, + "grad_norm": 0.05726606398820877, + "learning_rate": 1.065556742640076e-05, + "loss": 0.017, + "step": 106560 + }, + { + "epoch": 0.7877502143638567, + "grad_norm": 0.08467677235603333, + "learning_rate": 1.0651857787274454e-05, + "loss": 0.0171, + "step": 106570 + }, + { + "epoch": 0.7878241329351586, + "grad_norm": 0.08000260591506958, + "learning_rate": 1.0648148148148148e-05, + "loss": 0.0179, + "step": 106580 + }, + { + "epoch": 0.7878980515064605, + "grad_norm": 0.09210962057113647, + "learning_rate": 
1.0644438509021842e-05, + "loss": 0.0194, + "step": 106590 + }, + { + "epoch": 0.7879719700777623, + "grad_norm": 0.07953054457902908, + "learning_rate": 1.0640728869895537e-05, + "loss": 0.0167, + "step": 106600 + }, + { + "epoch": 0.7880458886490642, + "grad_norm": 0.06847818195819855, + "learning_rate": 1.0637019230769231e-05, + "loss": 0.0178, + "step": 106610 + }, + { + "epoch": 0.788119807220366, + "grad_norm": 0.07081107795238495, + "learning_rate": 1.0633309591642925e-05, + "loss": 0.0181, + "step": 106620 + }, + { + "epoch": 0.7881937257916679, + "grad_norm": 0.08538413792848587, + "learning_rate": 1.062959995251662e-05, + "loss": 0.0161, + "step": 106630 + }, + { + "epoch": 0.7882676443629697, + "grad_norm": 0.09170427918434143, + "learning_rate": 1.0625890313390314e-05, + "loss": 0.0197, + "step": 106640 + }, + { + "epoch": 0.7883415629342716, + "grad_norm": 0.07497674226760864, + "learning_rate": 1.0622180674264008e-05, + "loss": 0.0164, + "step": 106650 + }, + { + "epoch": 0.7884154815055735, + "grad_norm": 0.0653548613190651, + "learning_rate": 1.0618471035137702e-05, + "loss": 0.0155, + "step": 106660 + }, + { + "epoch": 0.7884894000768753, + "grad_norm": 0.05626005306839943, + "learning_rate": 1.0614761396011397e-05, + "loss": 0.0156, + "step": 106670 + }, + { + "epoch": 0.7885633186481772, + "grad_norm": 0.06508783251047134, + "learning_rate": 1.061105175688509e-05, + "loss": 0.0149, + "step": 106680 + }, + { + "epoch": 0.788637237219479, + "grad_norm": 0.07062501460313797, + "learning_rate": 1.0607342117758785e-05, + "loss": 0.0192, + "step": 106690 + }, + { + "epoch": 0.7887111557907809, + "grad_norm": 0.0726802796125412, + "learning_rate": 1.060363247863248e-05, + "loss": 0.0168, + "step": 106700 + }, + { + "epoch": 0.7887850743620828, + "grad_norm": 0.08327441662549973, + "learning_rate": 1.0599922839506174e-05, + "loss": 0.0175, + "step": 106710 + }, + { + "epoch": 0.7888589929333846, + "grad_norm": 0.062011830508708954, + "learning_rate": 1.0596213200379868e-05, + "loss": 0.0185, + "step": 106720 + }, + { + "epoch": 0.7889329115046865, + "grad_norm": 0.0819663405418396, + "learning_rate": 1.059250356125356e-05, + "loss": 0.0181, + "step": 106730 + }, + { + "epoch": 0.7890068300759883, + "grad_norm": 0.08031091094017029, + "learning_rate": 1.0588793922127256e-05, + "loss": 0.017, + "step": 106740 + }, + { + "epoch": 0.7890807486472902, + "grad_norm": 0.061876364052295685, + "learning_rate": 1.0585084283000949e-05, + "loss": 0.0153, + "step": 106750 + }, + { + "epoch": 0.789154667218592, + "grad_norm": 0.08079424500465393, + "learning_rate": 1.0581374643874645e-05, + "loss": 0.0162, + "step": 106760 + }, + { + "epoch": 0.7892285857898939, + "grad_norm": 0.05742736905813217, + "learning_rate": 1.0577665004748338e-05, + "loss": 0.015, + "step": 106770 + }, + { + "epoch": 0.7893025043611958, + "grad_norm": 0.06490683555603027, + "learning_rate": 1.0573955365622034e-05, + "loss": 0.0178, + "step": 106780 + }, + { + "epoch": 0.7893764229324975, + "grad_norm": 0.10366056859493256, + "learning_rate": 1.0570245726495726e-05, + "loss": 0.0172, + "step": 106790 + }, + { + "epoch": 0.7894503415037994, + "grad_norm": 0.06522786617279053, + "learning_rate": 1.056653608736942e-05, + "loss": 0.0167, + "step": 106800 + }, + { + "epoch": 0.7895242600751012, + "grad_norm": 0.06334855407476425, + "learning_rate": 1.0562826448243115e-05, + "loss": 0.019, + "step": 106810 + }, + { + "epoch": 0.7895981786464031, + "grad_norm": 0.10202807933092117, + "learning_rate": 1.0559116809116809e-05, 
+ "loss": 0.0185, + "step": 106820 + }, + { + "epoch": 0.7896720972177049, + "grad_norm": 0.06709656119346619, + "learning_rate": 1.0555407169990505e-05, + "loss": 0.0175, + "step": 106830 + }, + { + "epoch": 0.7897460157890068, + "grad_norm": 0.07228951901197433, + "learning_rate": 1.0551697530864197e-05, + "loss": 0.0155, + "step": 106840 + }, + { + "epoch": 0.7898199343603087, + "grad_norm": 0.05604925751686096, + "learning_rate": 1.0547987891737893e-05, + "loss": 0.0167, + "step": 106850 + }, + { + "epoch": 0.7898938529316105, + "grad_norm": 0.07428887486457825, + "learning_rate": 1.0544278252611586e-05, + "loss": 0.0177, + "step": 106860 + }, + { + "epoch": 0.7899677715029124, + "grad_norm": 0.07909681648015976, + "learning_rate": 1.0540568613485282e-05, + "loss": 0.0161, + "step": 106870 + }, + { + "epoch": 0.7900416900742142, + "grad_norm": 0.08342929184436798, + "learning_rate": 1.0536858974358975e-05, + "loss": 0.0191, + "step": 106880 + }, + { + "epoch": 0.7901156086455161, + "grad_norm": 0.0777612179517746, + "learning_rate": 1.0533149335232669e-05, + "loss": 0.0173, + "step": 106890 + }, + { + "epoch": 0.7901895272168179, + "grad_norm": 0.053565897047519684, + "learning_rate": 1.0529439696106363e-05, + "loss": 0.0161, + "step": 106900 + }, + { + "epoch": 0.7902634457881198, + "grad_norm": 0.06764553487300873, + "learning_rate": 1.0525730056980057e-05, + "loss": 0.017, + "step": 106910 + }, + { + "epoch": 0.7903373643594217, + "grad_norm": 0.1511474996805191, + "learning_rate": 1.0522020417853752e-05, + "loss": 0.0182, + "step": 106920 + }, + { + "epoch": 0.7904112829307235, + "grad_norm": 0.05807117000222206, + "learning_rate": 1.0518310778727446e-05, + "loss": 0.0184, + "step": 106930 + }, + { + "epoch": 0.7904852015020254, + "grad_norm": 0.07401245087385178, + "learning_rate": 1.051460113960114e-05, + "loss": 0.0148, + "step": 106940 + }, + { + "epoch": 0.7905591200733272, + "grad_norm": 0.0792866125702858, + "learning_rate": 1.0510891500474834e-05, + "loss": 0.014, + "step": 106950 + }, + { + "epoch": 0.7906330386446291, + "grad_norm": 0.0900711938738823, + "learning_rate": 1.0507181861348527e-05, + "loss": 0.0202, + "step": 106960 + }, + { + "epoch": 0.790706957215931, + "grad_norm": 0.06799784302711487, + "learning_rate": 1.0503472222222223e-05, + "loss": 0.0179, + "step": 106970 + }, + { + "epoch": 0.7907808757872328, + "grad_norm": 0.06278761476278305, + "learning_rate": 1.0499762583095917e-05, + "loss": 0.0189, + "step": 106980 + }, + { + "epoch": 0.7908547943585347, + "grad_norm": 0.08859692513942719, + "learning_rate": 1.0496052943969612e-05, + "loss": 0.0147, + "step": 106990 + }, + { + "epoch": 0.7909287129298365, + "grad_norm": 0.05939367040991783, + "learning_rate": 1.0492343304843306e-05, + "loss": 0.0163, + "step": 107000 + }, + { + "epoch": 0.7910026315011384, + "grad_norm": 0.11033795028924942, + "learning_rate": 1.0488633665717e-05, + "loss": 0.0208, + "step": 107010 + }, + { + "epoch": 0.7910765500724402, + "grad_norm": 0.08448134362697601, + "learning_rate": 1.0484924026590694e-05, + "loss": 0.0193, + "step": 107020 + }, + { + "epoch": 0.791150468643742, + "grad_norm": 0.09254121780395508, + "learning_rate": 1.0481214387464387e-05, + "loss": 0.0177, + "step": 107030 + }, + { + "epoch": 0.791224387215044, + "grad_norm": 0.11402133107185364, + "learning_rate": 1.0477504748338083e-05, + "loss": 0.0174, + "step": 107040 + }, + { + "epoch": 0.7912983057863457, + "grad_norm": 0.07104026526212692, + "learning_rate": 1.0473795109211776e-05, + "loss": 0.0167, + 
"step": 107050 + }, + { + "epoch": 0.7913722243576476, + "grad_norm": 0.09040091931819916, + "learning_rate": 1.0470085470085471e-05, + "loss": 0.0171, + "step": 107060 + }, + { + "epoch": 0.7914461429289494, + "grad_norm": 0.0920732393860817, + "learning_rate": 1.0466375830959164e-05, + "loss": 0.0163, + "step": 107070 + }, + { + "epoch": 0.7915200615002513, + "grad_norm": 0.06260927021503448, + "learning_rate": 1.046266619183286e-05, + "loss": 0.0158, + "step": 107080 + }, + { + "epoch": 0.7915939800715531, + "grad_norm": 0.08942589163780212, + "learning_rate": 1.0458956552706553e-05, + "loss": 0.0191, + "step": 107090 + }, + { + "epoch": 0.791667898642855, + "grad_norm": 0.08758687973022461, + "learning_rate": 1.0455246913580249e-05, + "loss": 0.0175, + "step": 107100 + }, + { + "epoch": 0.7917418172141569, + "grad_norm": 0.08504850417375565, + "learning_rate": 1.0451537274453941e-05, + "loss": 0.0167, + "step": 107110 + }, + { + "epoch": 0.7918157357854587, + "grad_norm": 0.08797286450862885, + "learning_rate": 1.0447827635327635e-05, + "loss": 0.0173, + "step": 107120 + }, + { + "epoch": 0.7918896543567606, + "grad_norm": 0.07588206231594086, + "learning_rate": 1.044411799620133e-05, + "loss": 0.0149, + "step": 107130 + }, + { + "epoch": 0.7919635729280624, + "grad_norm": 0.08773746341466904, + "learning_rate": 1.0440408357075024e-05, + "loss": 0.0161, + "step": 107140 + }, + { + "epoch": 0.7920374914993643, + "grad_norm": 0.10242946445941925, + "learning_rate": 1.0436698717948718e-05, + "loss": 0.0156, + "step": 107150 + }, + { + "epoch": 0.7921114100706661, + "grad_norm": 0.06126590818166733, + "learning_rate": 1.0432989078822413e-05, + "loss": 0.0175, + "step": 107160 + }, + { + "epoch": 0.792185328641968, + "grad_norm": 0.08768381923437119, + "learning_rate": 1.0429279439696107e-05, + "loss": 0.017, + "step": 107170 + }, + { + "epoch": 0.7922592472132699, + "grad_norm": 0.09814704209566116, + "learning_rate": 1.0425569800569801e-05, + "loss": 0.0176, + "step": 107180 + }, + { + "epoch": 0.7923331657845717, + "grad_norm": 0.08230622857809067, + "learning_rate": 1.0421860161443495e-05, + "loss": 0.0192, + "step": 107190 + }, + { + "epoch": 0.7924070843558736, + "grad_norm": 0.08793134987354279, + "learning_rate": 1.041815052231719e-05, + "loss": 0.017, + "step": 107200 + }, + { + "epoch": 0.7924810029271754, + "grad_norm": 0.0937204509973526, + "learning_rate": 1.0414440883190884e-05, + "loss": 0.0168, + "step": 107210 + }, + { + "epoch": 0.7925549214984773, + "grad_norm": 0.06012919172644615, + "learning_rate": 1.0410731244064578e-05, + "loss": 0.0172, + "step": 107220 + }, + { + "epoch": 0.7926288400697792, + "grad_norm": 0.0912671908736229, + "learning_rate": 1.0407021604938272e-05, + "loss": 0.0194, + "step": 107230 + }, + { + "epoch": 0.792702758641081, + "grad_norm": 0.05338483676314354, + "learning_rate": 1.0403311965811967e-05, + "loss": 0.0171, + "step": 107240 + }, + { + "epoch": 0.7927766772123829, + "grad_norm": 0.0971188098192215, + "learning_rate": 1.0399602326685661e-05, + "loss": 0.0169, + "step": 107250 + }, + { + "epoch": 0.7928505957836847, + "grad_norm": 0.07669887691736221, + "learning_rate": 1.0395892687559355e-05, + "loss": 0.0175, + "step": 107260 + }, + { + "epoch": 0.7929245143549866, + "grad_norm": 0.07737741619348526, + "learning_rate": 1.039218304843305e-05, + "loss": 0.0167, + "step": 107270 + }, + { + "epoch": 0.7929984329262884, + "grad_norm": 0.09764139354228973, + "learning_rate": 1.0388473409306742e-05, + "loss": 0.017, + "step": 107280 + }, + { + 
"epoch": 0.7930723514975903, + "grad_norm": 0.09034200757741928, + "learning_rate": 1.0384763770180438e-05, + "loss": 0.0186, + "step": 107290 + }, + { + "epoch": 0.7931462700688922, + "grad_norm": 0.0868106558918953, + "learning_rate": 1.038105413105413e-05, + "loss": 0.0168, + "step": 107300 + }, + { + "epoch": 0.793220188640194, + "grad_norm": 0.08318565785884857, + "learning_rate": 1.0377344491927827e-05, + "loss": 0.017, + "step": 107310 + }, + { + "epoch": 0.7932941072114958, + "grad_norm": 0.09246040880680084, + "learning_rate": 1.037363485280152e-05, + "loss": 0.0181, + "step": 107320 + }, + { + "epoch": 0.7933680257827976, + "grad_norm": 0.08603844046592712, + "learning_rate": 1.0369925213675215e-05, + "loss": 0.0191, + "step": 107330 + }, + { + "epoch": 0.7934419443540995, + "grad_norm": 0.08298540115356445, + "learning_rate": 1.0366215574548908e-05, + "loss": 0.0189, + "step": 107340 + }, + { + "epoch": 0.7935158629254013, + "grad_norm": 0.08032820373773575, + "learning_rate": 1.0362505935422602e-05, + "loss": 0.0165, + "step": 107350 + }, + { + "epoch": 0.7935897814967032, + "grad_norm": 0.07046546041965485, + "learning_rate": 1.0358796296296296e-05, + "loss": 0.0169, + "step": 107360 + }, + { + "epoch": 0.7936637000680051, + "grad_norm": 0.10293319076299667, + "learning_rate": 1.035508665716999e-05, + "loss": 0.0164, + "step": 107370 + }, + { + "epoch": 0.7937376186393069, + "grad_norm": 0.08239208161830902, + "learning_rate": 1.0351377018043685e-05, + "loss": 0.0175, + "step": 107380 + }, + { + "epoch": 0.7938115372106088, + "grad_norm": 0.06712669134140015, + "learning_rate": 1.0347667378917379e-05, + "loss": 0.0163, + "step": 107390 + }, + { + "epoch": 0.7938854557819106, + "grad_norm": 0.09521905332803726, + "learning_rate": 1.0343957739791073e-05, + "loss": 0.0161, + "step": 107400 + }, + { + "epoch": 0.7939593743532125, + "grad_norm": 0.08448793739080429, + "learning_rate": 1.0340248100664768e-05, + "loss": 0.015, + "step": 107410 + }, + { + "epoch": 0.7940332929245143, + "grad_norm": 0.09332919865846634, + "learning_rate": 1.0336538461538462e-05, + "loss": 0.0191, + "step": 107420 + }, + { + "epoch": 0.7941072114958162, + "grad_norm": 0.0625322014093399, + "learning_rate": 1.0332828822412156e-05, + "loss": 0.0155, + "step": 107430 + }, + { + "epoch": 0.7941811300671181, + "grad_norm": 0.06839042901992798, + "learning_rate": 1.032911918328585e-05, + "loss": 0.0169, + "step": 107440 + }, + { + "epoch": 0.7942550486384199, + "grad_norm": 0.09588529914617538, + "learning_rate": 1.0325409544159545e-05, + "loss": 0.0171, + "step": 107450 + }, + { + "epoch": 0.7943289672097218, + "grad_norm": 0.09027482569217682, + "learning_rate": 1.0321699905033239e-05, + "loss": 0.0165, + "step": 107460 + }, + { + "epoch": 0.7944028857810236, + "grad_norm": 0.07311699539422989, + "learning_rate": 1.0317990265906933e-05, + "loss": 0.0162, + "step": 107470 + }, + { + "epoch": 0.7944768043523255, + "grad_norm": 0.07894931733608246, + "learning_rate": 1.0314280626780628e-05, + "loss": 0.0177, + "step": 107480 + }, + { + "epoch": 0.7945507229236274, + "grad_norm": 0.07096471637487411, + "learning_rate": 1.0310570987654322e-05, + "loss": 0.0174, + "step": 107490 + }, + { + "epoch": 0.7946246414949292, + "grad_norm": 0.07074417918920517, + "learning_rate": 1.0306861348528016e-05, + "loss": 0.0155, + "step": 107500 + }, + { + "epoch": 0.7946985600662311, + "grad_norm": 0.087070032954216, + "learning_rate": 1.0303151709401709e-05, + "loss": 0.0194, + "step": 107510 + }, + { + "epoch": 
0.7947724786375329, + "grad_norm": 0.0836467370390892, + "learning_rate": 1.0299442070275405e-05, + "loss": 0.0163, + "step": 107520 + }, + { + "epoch": 0.7948463972088348, + "grad_norm": 0.058517493307590485, + "learning_rate": 1.0295732431149097e-05, + "loss": 0.0152, + "step": 107530 + }, + { + "epoch": 0.7949203157801366, + "grad_norm": 0.09755102545022964, + "learning_rate": 1.0292022792022793e-05, + "loss": 0.0181, + "step": 107540 + }, + { + "epoch": 0.7949942343514385, + "grad_norm": 0.0626494511961937, + "learning_rate": 1.0288313152896486e-05, + "loss": 0.0166, + "step": 107550 + }, + { + "epoch": 0.7950681529227404, + "grad_norm": 0.06871600449085236, + "learning_rate": 1.0284603513770182e-05, + "loss": 0.0162, + "step": 107560 + }, + { + "epoch": 0.7951420714940421, + "grad_norm": 0.0932798832654953, + "learning_rate": 1.0280893874643874e-05, + "loss": 0.0178, + "step": 107570 + }, + { + "epoch": 0.795215990065344, + "grad_norm": 0.0795370489358902, + "learning_rate": 1.0277184235517569e-05, + "loss": 0.0184, + "step": 107580 + }, + { + "epoch": 0.7952899086366458, + "grad_norm": 0.09961181879043579, + "learning_rate": 1.0273474596391265e-05, + "loss": 0.0177, + "step": 107590 + }, + { + "epoch": 0.7953638272079477, + "grad_norm": 0.07366261631250381, + "learning_rate": 1.0269764957264957e-05, + "loss": 0.0159, + "step": 107600 + }, + { + "epoch": 0.7954377457792495, + "grad_norm": 0.08025870472192764, + "learning_rate": 1.0266055318138653e-05, + "loss": 0.0162, + "step": 107610 + }, + { + "epoch": 0.7955116643505514, + "grad_norm": 0.06702578067779541, + "learning_rate": 1.0262345679012346e-05, + "loss": 0.0155, + "step": 107620 + }, + { + "epoch": 0.7955855829218533, + "grad_norm": 0.0553353913128376, + "learning_rate": 1.0258636039886042e-05, + "loss": 0.0171, + "step": 107630 + }, + { + "epoch": 0.7956595014931551, + "grad_norm": 0.088180311024189, + "learning_rate": 1.0254926400759734e-05, + "loss": 0.0179, + "step": 107640 + }, + { + "epoch": 0.795733420064457, + "grad_norm": 0.05583290383219719, + "learning_rate": 1.0251216761633429e-05, + "loss": 0.0172, + "step": 107650 + }, + { + "epoch": 0.7958073386357588, + "grad_norm": 0.09140726178884506, + "learning_rate": 1.0247507122507123e-05, + "loss": 0.015, + "step": 107660 + }, + { + "epoch": 0.7958812572070607, + "grad_norm": 0.0885956883430481, + "learning_rate": 1.0243797483380817e-05, + "loss": 0.0161, + "step": 107670 + }, + { + "epoch": 0.7959551757783625, + "grad_norm": 0.06041378155350685, + "learning_rate": 1.0240087844254511e-05, + "loss": 0.0156, + "step": 107680 + }, + { + "epoch": 0.7960290943496644, + "grad_norm": 0.052953656762838364, + "learning_rate": 1.0236378205128206e-05, + "loss": 0.0145, + "step": 107690 + }, + { + "epoch": 0.7961030129209663, + "grad_norm": 0.05720691755414009, + "learning_rate": 1.02326685660019e-05, + "loss": 0.0176, + "step": 107700 + }, + { + "epoch": 0.7961769314922681, + "grad_norm": 0.07044905424118042, + "learning_rate": 1.0228958926875594e-05, + "loss": 0.0166, + "step": 107710 + }, + { + "epoch": 0.79625085006357, + "grad_norm": 0.07559455186128616, + "learning_rate": 1.0225249287749288e-05, + "loss": 0.0173, + "step": 107720 + }, + { + "epoch": 0.7963247686348718, + "grad_norm": 0.11719612777233124, + "learning_rate": 1.0221539648622983e-05, + "loss": 0.019, + "step": 107730 + }, + { + "epoch": 0.7963986872061737, + "grad_norm": 0.07006300985813141, + "learning_rate": 1.0217830009496675e-05, + "loss": 0.0191, + "step": 107740 + }, + { + "epoch": 0.7964726057774756, + 
"grad_norm": 0.09146808832883835, + "learning_rate": 1.0214120370370371e-05, + "loss": 0.0179, + "step": 107750 + }, + { + "epoch": 0.7965465243487774, + "grad_norm": 0.05993087589740753, + "learning_rate": 1.0210410731244066e-05, + "loss": 0.0163, + "step": 107760 + }, + { + "epoch": 0.7966204429200793, + "grad_norm": 0.09870940446853638, + "learning_rate": 1.020670109211776e-05, + "loss": 0.0148, + "step": 107770 + }, + { + "epoch": 0.7966943614913811, + "grad_norm": 0.07571858912706375, + "learning_rate": 1.0202991452991454e-05, + "loss": 0.0173, + "step": 107780 + }, + { + "epoch": 0.796768280062683, + "grad_norm": 0.0883622094988823, + "learning_rate": 1.0199281813865148e-05, + "loss": 0.0155, + "step": 107790 + }, + { + "epoch": 0.7968421986339848, + "grad_norm": 0.0919145941734314, + "learning_rate": 1.0195572174738843e-05, + "loss": 0.0178, + "step": 107800 + }, + { + "epoch": 0.7969161172052867, + "grad_norm": 0.0809524804353714, + "learning_rate": 1.0191862535612535e-05, + "loss": 0.0157, + "step": 107810 + }, + { + "epoch": 0.7969900357765886, + "grad_norm": 0.07559646666049957, + "learning_rate": 1.0188152896486231e-05, + "loss": 0.0161, + "step": 107820 + }, + { + "epoch": 0.7970639543478903, + "grad_norm": 0.09900683909654617, + "learning_rate": 1.0184443257359924e-05, + "loss": 0.0187, + "step": 107830 + }, + { + "epoch": 0.7971378729191922, + "grad_norm": 0.09665405005216599, + "learning_rate": 1.018073361823362e-05, + "loss": 0.02, + "step": 107840 + }, + { + "epoch": 0.797211791490494, + "grad_norm": 0.08485321700572968, + "learning_rate": 1.0177023979107312e-05, + "loss": 0.0168, + "step": 107850 + }, + { + "epoch": 0.7972857100617959, + "grad_norm": 0.09139339625835419, + "learning_rate": 1.0173314339981008e-05, + "loss": 0.0168, + "step": 107860 + }, + { + "epoch": 0.7973596286330977, + "grad_norm": 0.05562750995159149, + "learning_rate": 1.01696047008547e-05, + "loss": 0.0159, + "step": 107870 + }, + { + "epoch": 0.7974335472043996, + "grad_norm": 0.0958489254117012, + "learning_rate": 1.0165895061728395e-05, + "loss": 0.0154, + "step": 107880 + }, + { + "epoch": 0.7975074657757015, + "grad_norm": 0.10791540890932083, + "learning_rate": 1.016218542260209e-05, + "loss": 0.0159, + "step": 107890 + }, + { + "epoch": 0.7975813843470033, + "grad_norm": 0.07161233574151993, + "learning_rate": 1.0158475783475784e-05, + "loss": 0.0157, + "step": 107900 + }, + { + "epoch": 0.7976553029183052, + "grad_norm": 0.08552657812833786, + "learning_rate": 1.0154766144349478e-05, + "loss": 0.0169, + "step": 107910 + }, + { + "epoch": 0.797729221489607, + "grad_norm": 0.06651432812213898, + "learning_rate": 1.0151056505223172e-05, + "loss": 0.0178, + "step": 107920 + }, + { + "epoch": 0.7978031400609089, + "grad_norm": 0.0792415663599968, + "learning_rate": 1.0147346866096866e-05, + "loss": 0.0177, + "step": 107930 + }, + { + "epoch": 0.7978770586322108, + "grad_norm": 0.09755530953407288, + "learning_rate": 1.014363722697056e-05, + "loss": 0.0166, + "step": 107940 + }, + { + "epoch": 0.7979509772035126, + "grad_norm": 0.0528554804623127, + "learning_rate": 1.0139927587844255e-05, + "loss": 0.0157, + "step": 107950 + }, + { + "epoch": 0.7980248957748145, + "grad_norm": 0.0647616758942604, + "learning_rate": 1.013621794871795e-05, + "loss": 0.0142, + "step": 107960 + }, + { + "epoch": 0.7980988143461163, + "grad_norm": 0.0763440877199173, + "learning_rate": 1.0132508309591644e-05, + "loss": 0.0162, + "step": 107970 + }, + { + "epoch": 0.7981727329174182, + "grad_norm": 0.0879247710108757, 
+ "learning_rate": 1.0128798670465338e-05, + "loss": 0.0177, + "step": 107980 + }, + { + "epoch": 0.79824665148872, + "grad_norm": 0.05888223648071289, + "learning_rate": 1.0125089031339032e-05, + "loss": 0.0138, + "step": 107990 + }, + { + "epoch": 0.7983205700600219, + "grad_norm": 0.09061430394649506, + "learning_rate": 1.0121379392212726e-05, + "loss": 0.0173, + "step": 108000 + }, + { + "epoch": 0.7983944886313238, + "grad_norm": 0.07279080152511597, + "learning_rate": 1.011766975308642e-05, + "loss": 0.0169, + "step": 108010 + }, + { + "epoch": 0.7984684072026256, + "grad_norm": 0.08346417546272278, + "learning_rate": 1.0113960113960115e-05, + "loss": 0.0195, + "step": 108020 + }, + { + "epoch": 0.7985423257739275, + "grad_norm": 0.0859362855553627, + "learning_rate": 1.011025047483381e-05, + "loss": 0.0203, + "step": 108030 + }, + { + "epoch": 0.7986162443452293, + "grad_norm": 0.059124864637851715, + "learning_rate": 1.0106540835707502e-05, + "loss": 0.017, + "step": 108040 + }, + { + "epoch": 0.7986901629165312, + "grad_norm": 0.09157084673643112, + "learning_rate": 1.0102831196581198e-05, + "loss": 0.0182, + "step": 108050 + }, + { + "epoch": 0.798764081487833, + "grad_norm": 0.08674320578575134, + "learning_rate": 1.009912155745489e-05, + "loss": 0.0204, + "step": 108060 + }, + { + "epoch": 0.7988380000591349, + "grad_norm": 0.08345095813274384, + "learning_rate": 1.0095411918328586e-05, + "loss": 0.0163, + "step": 108070 + }, + { + "epoch": 0.7989119186304368, + "grad_norm": 0.08811159431934357, + "learning_rate": 1.0091702279202279e-05, + "loss": 0.0191, + "step": 108080 + }, + { + "epoch": 0.7989858372017385, + "grad_norm": 0.06645365804433823, + "learning_rate": 1.0087992640075975e-05, + "loss": 0.0157, + "step": 108090 + }, + { + "epoch": 0.7990597557730404, + "grad_norm": 0.1114099770784378, + "learning_rate": 1.0084283000949667e-05, + "loss": 0.0194, + "step": 108100 + }, + { + "epoch": 0.7991336743443422, + "grad_norm": 0.048499658703804016, + "learning_rate": 1.0080573361823362e-05, + "loss": 0.0156, + "step": 108110 + }, + { + "epoch": 0.7992075929156441, + "grad_norm": 0.08671610802412033, + "learning_rate": 1.0076863722697056e-05, + "loss": 0.0171, + "step": 108120 + }, + { + "epoch": 0.7992815114869459, + "grad_norm": 0.081625796854496, + "learning_rate": 1.007315408357075e-05, + "loss": 0.0163, + "step": 108130 + }, + { + "epoch": 0.7993554300582478, + "grad_norm": 0.09771347790956497, + "learning_rate": 1.0069444444444445e-05, + "loss": 0.0168, + "step": 108140 + }, + { + "epoch": 0.7994293486295497, + "grad_norm": 0.07416078448295593, + "learning_rate": 1.0065734805318139e-05, + "loss": 0.0164, + "step": 108150 + }, + { + "epoch": 0.7995032672008515, + "grad_norm": 0.08635777980089188, + "learning_rate": 1.0062025166191833e-05, + "loss": 0.0183, + "step": 108160 + }, + { + "epoch": 0.7995771857721534, + "grad_norm": 0.08301952481269836, + "learning_rate": 1.0058315527065527e-05, + "loss": 0.0197, + "step": 108170 + }, + { + "epoch": 0.7996511043434552, + "grad_norm": 0.09106559306383133, + "learning_rate": 1.0054605887939222e-05, + "loss": 0.0167, + "step": 108180 + }, + { + "epoch": 0.7997250229147571, + "grad_norm": 0.099912628531456, + "learning_rate": 1.0050896248812916e-05, + "loss": 0.0186, + "step": 108190 + }, + { + "epoch": 0.799798941486059, + "grad_norm": 0.06465443223714828, + "learning_rate": 1.004718660968661e-05, + "loss": 0.0184, + "step": 108200 + }, + { + "epoch": 0.7998728600573608, + "grad_norm": 0.05838299170136452, + "learning_rate": 
1.0043476970560304e-05, + "loss": 0.0188, + "step": 108210 + }, + { + "epoch": 0.7999467786286627, + "grad_norm": 0.11985216289758682, + "learning_rate": 1.0039767331433999e-05, + "loss": 0.0171, + "step": 108220 + }, + { + "epoch": 0.8000206971999645, + "grad_norm": 0.0747854933142662, + "learning_rate": 1.0036057692307693e-05, + "loss": 0.0184, + "step": 108230 + }, + { + "epoch": 0.8000946157712664, + "grad_norm": 0.09232344478368759, + "learning_rate": 1.0032348053181387e-05, + "loss": 0.0185, + "step": 108240 + }, + { + "epoch": 0.8001685343425682, + "grad_norm": 0.07320012152194977, + "learning_rate": 1.0028638414055082e-05, + "loss": 0.0145, + "step": 108250 + }, + { + "epoch": 0.8002424529138701, + "grad_norm": 0.04968751594424248, + "learning_rate": 1.0024928774928776e-05, + "loss": 0.0165, + "step": 108260 + }, + { + "epoch": 0.800316371485172, + "grad_norm": 0.06803929060697556, + "learning_rate": 1.0021219135802468e-05, + "loss": 0.0195, + "step": 108270 + }, + { + "epoch": 0.8003902900564738, + "grad_norm": 0.09313097596168518, + "learning_rate": 1.0017509496676164e-05, + "loss": 0.0186, + "step": 108280 + }, + { + "epoch": 0.8004642086277757, + "grad_norm": 0.08711668848991394, + "learning_rate": 1.0013799857549857e-05, + "loss": 0.0185, + "step": 108290 + }, + { + "epoch": 0.8005381271990775, + "grad_norm": 0.06377244740724564, + "learning_rate": 1.0010090218423553e-05, + "loss": 0.0162, + "step": 108300 + }, + { + "epoch": 0.8006120457703794, + "grad_norm": 0.07629870623350143, + "learning_rate": 1.0006380579297245e-05, + "loss": 0.017, + "step": 108310 + }, + { + "epoch": 0.8006859643416812, + "grad_norm": 0.10412617027759552, + "learning_rate": 1.0002670940170941e-05, + "loss": 0.0151, + "step": 108320 + }, + { + "epoch": 0.8007598829129831, + "grad_norm": 0.07674720883369446, + "learning_rate": 9.998961301044634e-06, + "loss": 0.0176, + "step": 108330 + }, + { + "epoch": 0.800833801484285, + "grad_norm": 0.07292506843805313, + "learning_rate": 9.995251661918328e-06, + "loss": 0.0151, + "step": 108340 + }, + { + "epoch": 0.8009077200555867, + "grad_norm": 0.07875438779592514, + "learning_rate": 9.991542022792024e-06, + "loss": 0.0178, + "step": 108350 + }, + { + "epoch": 0.8009816386268886, + "grad_norm": 0.07804681360721588, + "learning_rate": 9.987832383665717e-06, + "loss": 0.0194, + "step": 108360 + }, + { + "epoch": 0.8010555571981904, + "grad_norm": 0.07799090445041656, + "learning_rate": 9.984122744539413e-06, + "loss": 0.0159, + "step": 108370 + }, + { + "epoch": 0.8011294757694923, + "grad_norm": 0.061510853469371796, + "learning_rate": 9.980413105413105e-06, + "loss": 0.0161, + "step": 108380 + }, + { + "epoch": 0.8012033943407941, + "grad_norm": 0.10582411289215088, + "learning_rate": 9.976703466286801e-06, + "loss": 0.0192, + "step": 108390 + }, + { + "epoch": 0.801277312912096, + "grad_norm": 0.06720519065856934, + "learning_rate": 9.972993827160494e-06, + "loss": 0.0149, + "step": 108400 + }, + { + "epoch": 0.8013512314833979, + "grad_norm": 0.08093603700399399, + "learning_rate": 9.96928418803419e-06, + "loss": 0.0161, + "step": 108410 + }, + { + "epoch": 0.8014251500546997, + "grad_norm": 0.08576540648937225, + "learning_rate": 9.965574548907882e-06, + "loss": 0.0157, + "step": 108420 + }, + { + "epoch": 0.8014990686260016, + "grad_norm": 0.06301461905241013, + "learning_rate": 9.961864909781577e-06, + "loss": 0.0144, + "step": 108430 + }, + { + "epoch": 0.8015729871973034, + "grad_norm": 0.07179810851812363, + "learning_rate": 9.958155270655271e-06, + 
"loss": 0.0151, + "step": 108440 + }, + { + "epoch": 0.8016469057686053, + "grad_norm": 0.07506982237100601, + "learning_rate": 9.954445631528965e-06, + "loss": 0.0192, + "step": 108450 + }, + { + "epoch": 0.8017208243399072, + "grad_norm": 0.10046809166669846, + "learning_rate": 9.95073599240266e-06, + "loss": 0.0179, + "step": 108460 + }, + { + "epoch": 0.801794742911209, + "grad_norm": 0.06781169772148132, + "learning_rate": 9.947026353276354e-06, + "loss": 0.0186, + "step": 108470 + }, + { + "epoch": 0.8018686614825109, + "grad_norm": 0.06781306862831116, + "learning_rate": 9.943316714150048e-06, + "loss": 0.0163, + "step": 108480 + }, + { + "epoch": 0.8019425800538127, + "grad_norm": 0.05869549140334129, + "learning_rate": 9.939607075023742e-06, + "loss": 0.0175, + "step": 108490 + }, + { + "epoch": 0.8020164986251146, + "grad_norm": 0.06874193996191025, + "learning_rate": 9.935897435897435e-06, + "loss": 0.0176, + "step": 108500 + }, + { + "epoch": 0.8020904171964164, + "grad_norm": 0.0705341100692749, + "learning_rate": 9.932187796771131e-06, + "loss": 0.015, + "step": 108510 + }, + { + "epoch": 0.8021643357677183, + "grad_norm": 0.07271615415811539, + "learning_rate": 9.928478157644825e-06, + "loss": 0.0166, + "step": 108520 + }, + { + "epoch": 0.8022382543390202, + "grad_norm": 0.09714877605438232, + "learning_rate": 9.92476851851852e-06, + "loss": 0.0171, + "step": 108530 + }, + { + "epoch": 0.802312172910322, + "grad_norm": 0.08740437030792236, + "learning_rate": 9.921058879392214e-06, + "loss": 0.0156, + "step": 108540 + }, + { + "epoch": 0.8023860914816239, + "grad_norm": 0.07252812385559082, + "learning_rate": 9.917349240265908e-06, + "loss": 0.0176, + "step": 108550 + }, + { + "epoch": 0.8024600100529257, + "grad_norm": 0.05812780186533928, + "learning_rate": 9.913639601139602e-06, + "loss": 0.0194, + "step": 108560 + }, + { + "epoch": 0.8025339286242276, + "grad_norm": 0.08060342073440552, + "learning_rate": 9.909929962013295e-06, + "loss": 0.0175, + "step": 108570 + }, + { + "epoch": 0.8026078471955294, + "grad_norm": 0.09011170268058777, + "learning_rate": 9.90622032288699e-06, + "loss": 0.0176, + "step": 108580 + }, + { + "epoch": 0.8026817657668313, + "grad_norm": 0.08751154690980911, + "learning_rate": 9.902510683760683e-06, + "loss": 0.0171, + "step": 108590 + }, + { + "epoch": 0.8027556843381332, + "grad_norm": 0.08538112044334412, + "learning_rate": 9.89880104463438e-06, + "loss": 0.0148, + "step": 108600 + }, + { + "epoch": 0.802829602909435, + "grad_norm": 0.07628414034843445, + "learning_rate": 9.895091405508072e-06, + "loss": 0.0181, + "step": 108610 + }, + { + "epoch": 0.8029035214807368, + "grad_norm": 0.07035236805677414, + "learning_rate": 9.891381766381768e-06, + "loss": 0.0167, + "step": 108620 + }, + { + "epoch": 0.8029774400520386, + "grad_norm": 0.10552142560482025, + "learning_rate": 9.88767212725546e-06, + "loss": 0.0189, + "step": 108630 + }, + { + "epoch": 0.8030513586233405, + "grad_norm": 0.06383045017719269, + "learning_rate": 9.883962488129156e-06, + "loss": 0.016, + "step": 108640 + }, + { + "epoch": 0.8031252771946423, + "grad_norm": 0.06668663769960403, + "learning_rate": 9.880252849002849e-06, + "loss": 0.015, + "step": 108650 + }, + { + "epoch": 0.8031991957659442, + "grad_norm": 0.09065516293048859, + "learning_rate": 9.876543209876543e-06, + "loss": 0.018, + "step": 108660 + }, + { + "epoch": 0.8032731143372461, + "grad_norm": 0.08935684710741043, + "learning_rate": 9.872833570750238e-06, + "loss": 0.0195, + "step": 108670 + }, + { + 
"epoch": 0.8033470329085479, + "grad_norm": 0.09216105192899704, + "learning_rate": 9.869123931623932e-06, + "loss": 0.0184, + "step": 108680 + }, + { + "epoch": 0.8034209514798498, + "grad_norm": 0.09943180531263351, + "learning_rate": 9.865414292497626e-06, + "loss": 0.0207, + "step": 108690 + }, + { + "epoch": 0.8034948700511516, + "grad_norm": 0.08068699389696121, + "learning_rate": 9.86170465337132e-06, + "loss": 0.0188, + "step": 108700 + }, + { + "epoch": 0.8035687886224535, + "grad_norm": 0.07922651618719101, + "learning_rate": 9.857995014245015e-06, + "loss": 0.0186, + "step": 108710 + }, + { + "epoch": 0.8036427071937554, + "grad_norm": 0.12515637278556824, + "learning_rate": 9.854285375118709e-06, + "loss": 0.0176, + "step": 108720 + }, + { + "epoch": 0.8037166257650572, + "grad_norm": 0.07942978292703629, + "learning_rate": 9.850575735992403e-06, + "loss": 0.0175, + "step": 108730 + }, + { + "epoch": 0.8037905443363591, + "grad_norm": 0.09532030671834946, + "learning_rate": 9.846866096866097e-06, + "loss": 0.0192, + "step": 108740 + }, + { + "epoch": 0.8038644629076609, + "grad_norm": 0.07980860769748688, + "learning_rate": 9.843156457739792e-06, + "loss": 0.0184, + "step": 108750 + }, + { + "epoch": 0.8039383814789628, + "grad_norm": 0.0913219228386879, + "learning_rate": 9.839446818613486e-06, + "loss": 0.0164, + "step": 108760 + }, + { + "epoch": 0.8040123000502646, + "grad_norm": 0.07639884203672409, + "learning_rate": 9.83573717948718e-06, + "loss": 0.0179, + "step": 108770 + }, + { + "epoch": 0.8040862186215665, + "grad_norm": 0.06440051645040512, + "learning_rate": 9.832027540360875e-06, + "loss": 0.0161, + "step": 108780 + }, + { + "epoch": 0.8041601371928684, + "grad_norm": 0.08487991243600845, + "learning_rate": 9.828317901234569e-06, + "loss": 0.0165, + "step": 108790 + }, + { + "epoch": 0.8042340557641702, + "grad_norm": 0.06237079203128815, + "learning_rate": 9.824608262108261e-06, + "loss": 0.0184, + "step": 108800 + }, + { + "epoch": 0.8043079743354721, + "grad_norm": 0.08246572315692902, + "learning_rate": 9.820898622981957e-06, + "loss": 0.0164, + "step": 108810 + }, + { + "epoch": 0.8043818929067739, + "grad_norm": 0.07118767499923706, + "learning_rate": 9.81718898385565e-06, + "loss": 0.0173, + "step": 108820 + }, + { + "epoch": 0.8044558114780758, + "grad_norm": 0.09393595904111862, + "learning_rate": 9.813479344729346e-06, + "loss": 0.0166, + "step": 108830 + }, + { + "epoch": 0.8045297300493776, + "grad_norm": 0.098183773458004, + "learning_rate": 9.809769705603039e-06, + "loss": 0.0173, + "step": 108840 + }, + { + "epoch": 0.8046036486206795, + "grad_norm": 0.0713919922709465, + "learning_rate": 9.806060066476734e-06, + "loss": 0.0172, + "step": 108850 + }, + { + "epoch": 0.8046775671919814, + "grad_norm": 0.05390486493706703, + "learning_rate": 9.802350427350427e-06, + "loss": 0.0178, + "step": 108860 + }, + { + "epoch": 0.8047514857632831, + "grad_norm": 0.09047302603721619, + "learning_rate": 9.798640788224123e-06, + "loss": 0.0163, + "step": 108870 + }, + { + "epoch": 0.804825404334585, + "grad_norm": 0.08637501299381256, + "learning_rate": 9.794931149097816e-06, + "loss": 0.0174, + "step": 108880 + }, + { + "epoch": 0.8048993229058868, + "grad_norm": 0.07068019360303879, + "learning_rate": 9.79122150997151e-06, + "loss": 0.0157, + "step": 108890 + }, + { + "epoch": 0.8049732414771887, + "grad_norm": 0.06551461666822433, + "learning_rate": 9.787511870845204e-06, + "loss": 0.0178, + "step": 108900 + }, + { + "epoch": 0.8050471600484905, + "grad_norm": 
0.05838355794548988, + "learning_rate": 9.783802231718898e-06, + "loss": 0.0185, + "step": 108910 + }, + { + "epoch": 0.8051210786197924, + "grad_norm": 0.10512775182723999, + "learning_rate": 9.780092592592593e-06, + "loss": 0.0195, + "step": 108920 + }, + { + "epoch": 0.8051949971910943, + "grad_norm": 0.08655641227960587, + "learning_rate": 9.776382953466287e-06, + "loss": 0.016, + "step": 108930 + }, + { + "epoch": 0.8052689157623961, + "grad_norm": 0.10874267667531967, + "learning_rate": 9.772673314339981e-06, + "loss": 0.0167, + "step": 108940 + }, + { + "epoch": 0.805342834333698, + "grad_norm": 0.09654273092746735, + "learning_rate": 9.768963675213676e-06, + "loss": 0.0168, + "step": 108950 + }, + { + "epoch": 0.8054167529049998, + "grad_norm": 0.08570707589387894, + "learning_rate": 9.76525403608737e-06, + "loss": 0.0176, + "step": 108960 + }, + { + "epoch": 0.8054906714763017, + "grad_norm": 0.1013251543045044, + "learning_rate": 9.761544396961064e-06, + "loss": 0.0174, + "step": 108970 + }, + { + "epoch": 0.8055645900476036, + "grad_norm": 0.07357745617628098, + "learning_rate": 9.757834757834758e-06, + "loss": 0.0169, + "step": 108980 + }, + { + "epoch": 0.8056385086189054, + "grad_norm": 0.1125309094786644, + "learning_rate": 9.754125118708453e-06, + "loss": 0.0173, + "step": 108990 + }, + { + "epoch": 0.8057124271902073, + "grad_norm": 0.10419237613677979, + "learning_rate": 9.750415479582147e-06, + "loss": 0.0156, + "step": 109000 + }, + { + "epoch": 0.8057863457615091, + "grad_norm": 0.06879635900259018, + "learning_rate": 9.746705840455841e-06, + "loss": 0.0172, + "step": 109010 + }, + { + "epoch": 0.805860264332811, + "grad_norm": 0.07967539876699448, + "learning_rate": 9.742996201329535e-06, + "loss": 0.0163, + "step": 109020 + }, + { + "epoch": 0.8059341829041128, + "grad_norm": 0.06467405706644058, + "learning_rate": 9.739286562203228e-06, + "loss": 0.0171, + "step": 109030 + }, + { + "epoch": 0.8060081014754147, + "grad_norm": 0.08744856715202332, + "learning_rate": 9.735576923076924e-06, + "loss": 0.0172, + "step": 109040 + }, + { + "epoch": 0.8060820200467166, + "grad_norm": 0.06352761387825012, + "learning_rate": 9.731867283950617e-06, + "loss": 0.0176, + "step": 109050 + }, + { + "epoch": 0.8061559386180184, + "grad_norm": 0.09876241534948349, + "learning_rate": 9.728157644824313e-06, + "loss": 0.0194, + "step": 109060 + }, + { + "epoch": 0.8062298571893203, + "grad_norm": 0.09139581769704819, + "learning_rate": 9.724448005698005e-06, + "loss": 0.0181, + "step": 109070 + }, + { + "epoch": 0.8063037757606221, + "grad_norm": 0.06005903705954552, + "learning_rate": 9.720738366571701e-06, + "loss": 0.0177, + "step": 109080 + }, + { + "epoch": 0.806377694331924, + "grad_norm": 0.06951243430376053, + "learning_rate": 9.717028727445394e-06, + "loss": 0.0178, + "step": 109090 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.05802584066987038, + "learning_rate": 9.71331908831909e-06, + "loss": 0.0137, + "step": 109100 + }, + { + "epoch": 0.8065255314745277, + "grad_norm": 0.0676131471991539, + "learning_rate": 9.709609449192782e-06, + "loss": 0.0171, + "step": 109110 + }, + { + "epoch": 0.8065994500458296, + "grad_norm": 0.06124149635434151, + "learning_rate": 9.705899810066476e-06, + "loss": 0.0178, + "step": 109120 + }, + { + "epoch": 0.8066733686171313, + "grad_norm": 0.0544743649661541, + "learning_rate": 9.702190170940172e-06, + "loss": 0.0164, + "step": 109130 + }, + { + "epoch": 0.8067472871884332, + "grad_norm": 0.0752519741654396, + "learning_rate": 
9.698480531813865e-06, + "loss": 0.0178, + "step": 109140 + }, + { + "epoch": 0.806821205759735, + "grad_norm": 0.07265692949295044, + "learning_rate": 9.694770892687561e-06, + "loss": 0.0173, + "step": 109150 + }, + { + "epoch": 0.8068951243310369, + "grad_norm": 0.08553270995616913, + "learning_rate": 9.691061253561254e-06, + "loss": 0.0175, + "step": 109160 + }, + { + "epoch": 0.8069690429023387, + "grad_norm": 0.06974239647388458, + "learning_rate": 9.68735161443495e-06, + "loss": 0.0181, + "step": 109170 + }, + { + "epoch": 0.8070429614736406, + "grad_norm": 0.08455439656972885, + "learning_rate": 9.683641975308642e-06, + "loss": 0.0164, + "step": 109180 + }, + { + "epoch": 0.8071168800449425, + "grad_norm": 0.08338673412799835, + "learning_rate": 9.679932336182336e-06, + "loss": 0.0169, + "step": 109190 + }, + { + "epoch": 0.8071907986162443, + "grad_norm": 0.06884989887475967, + "learning_rate": 9.67622269705603e-06, + "loss": 0.0169, + "step": 109200 + }, + { + "epoch": 0.8072647171875462, + "grad_norm": 0.08037258684635162, + "learning_rate": 9.672513057929725e-06, + "loss": 0.0162, + "step": 109210 + }, + { + "epoch": 0.807338635758848, + "grad_norm": 0.07604662328958511, + "learning_rate": 9.66880341880342e-06, + "loss": 0.0189, + "step": 109220 + }, + { + "epoch": 0.8074125543301499, + "grad_norm": 0.07522002607584, + "learning_rate": 9.665093779677113e-06, + "loss": 0.015, + "step": 109230 + }, + { + "epoch": 0.8074864729014518, + "grad_norm": 0.06911370903253555, + "learning_rate": 9.661384140550808e-06, + "loss": 0.0169, + "step": 109240 + }, + { + "epoch": 0.8075603914727536, + "grad_norm": 0.05914295092225075, + "learning_rate": 9.657674501424502e-06, + "loss": 0.0178, + "step": 109250 + }, + { + "epoch": 0.8076343100440555, + "grad_norm": 0.06423388421535492, + "learning_rate": 9.653964862298195e-06, + "loss": 0.0172, + "step": 109260 + }, + { + "epoch": 0.8077082286153573, + "grad_norm": 0.08206699043512344, + "learning_rate": 9.65025522317189e-06, + "loss": 0.0134, + "step": 109270 + }, + { + "epoch": 0.8077821471866592, + "grad_norm": 0.0941147580742836, + "learning_rate": 9.646545584045585e-06, + "loss": 0.0153, + "step": 109280 + }, + { + "epoch": 0.807856065757961, + "grad_norm": 0.07409139722585678, + "learning_rate": 9.642835944919279e-06, + "loss": 0.0178, + "step": 109290 + }, + { + "epoch": 0.8079299843292629, + "grad_norm": 0.12559185922145844, + "learning_rate": 9.639126305792973e-06, + "loss": 0.0169, + "step": 109300 + }, + { + "epoch": 0.8080039029005648, + "grad_norm": 0.06743061542510986, + "learning_rate": 9.635416666666668e-06, + "loss": 0.0182, + "step": 109310 + }, + { + "epoch": 0.8080778214718666, + "grad_norm": 0.0892493724822998, + "learning_rate": 9.631707027540362e-06, + "loss": 0.0168, + "step": 109320 + }, + { + "epoch": 0.8081517400431685, + "grad_norm": 0.06646522134542465, + "learning_rate": 9.627997388414056e-06, + "loss": 0.0159, + "step": 109330 + }, + { + "epoch": 0.8082256586144703, + "grad_norm": 0.0646686926484108, + "learning_rate": 9.62428774928775e-06, + "loss": 0.0148, + "step": 109340 + }, + { + "epoch": 0.8082995771857722, + "grad_norm": 0.06950532644987106, + "learning_rate": 9.620578110161443e-06, + "loss": 0.0179, + "step": 109350 + }, + { + "epoch": 0.808373495757074, + "grad_norm": 0.08026842027902603, + "learning_rate": 9.616868471035139e-06, + "loss": 0.0153, + "step": 109360 + }, + { + "epoch": 0.8084474143283759, + "grad_norm": 0.0855989009141922, + "learning_rate": 9.613158831908832e-06, + "loss": 0.0197, + "step": 
109370 + }, + { + "epoch": 0.8085213328996778, + "grad_norm": 0.07575526088476181, + "learning_rate": 9.609449192782528e-06, + "loss": 0.0148, + "step": 109380 + }, + { + "epoch": 0.8085952514709795, + "grad_norm": 0.06633900105953217, + "learning_rate": 9.60573955365622e-06, + "loss": 0.0167, + "step": 109390 + }, + { + "epoch": 0.8086691700422814, + "grad_norm": 0.09347169101238251, + "learning_rate": 9.602029914529916e-06, + "loss": 0.0152, + "step": 109400 + }, + { + "epoch": 0.8087430886135832, + "grad_norm": 0.08023204654455185, + "learning_rate": 9.598320275403609e-06, + "loss": 0.0168, + "step": 109410 + }, + { + "epoch": 0.8088170071848851, + "grad_norm": 0.11619100719690323, + "learning_rate": 9.594610636277303e-06, + "loss": 0.018, + "step": 109420 + }, + { + "epoch": 0.8088909257561869, + "grad_norm": 0.06467882543802261, + "learning_rate": 9.590900997150997e-06, + "loss": 0.0178, + "step": 109430 + }, + { + "epoch": 0.8089648443274888, + "grad_norm": 0.0748329907655716, + "learning_rate": 9.587191358024692e-06, + "loss": 0.0187, + "step": 109440 + }, + { + "epoch": 0.8090387628987907, + "grad_norm": 0.08651736378669739, + "learning_rate": 9.583481718898386e-06, + "loss": 0.0166, + "step": 109450 + }, + { + "epoch": 0.8091126814700925, + "grad_norm": 0.06881321966648102, + "learning_rate": 9.57977207977208e-06, + "loss": 0.018, + "step": 109460 + }, + { + "epoch": 0.8091866000413944, + "grad_norm": 0.09105443954467773, + "learning_rate": 9.576062440645774e-06, + "loss": 0.0193, + "step": 109470 + }, + { + "epoch": 0.8092605186126962, + "grad_norm": 0.0879446268081665, + "learning_rate": 9.572352801519469e-06, + "loss": 0.0159, + "step": 109480 + }, + { + "epoch": 0.8093344371839981, + "grad_norm": 0.0992131382226944, + "learning_rate": 9.568643162393163e-06, + "loss": 0.0176, + "step": 109490 + }, + { + "epoch": 0.8094083557553, + "grad_norm": 0.0872669368982315, + "learning_rate": 9.564933523266857e-06, + "loss": 0.0148, + "step": 109500 + }, + { + "epoch": 0.8094822743266018, + "grad_norm": 0.08526704460382462, + "learning_rate": 9.561223884140551e-06, + "loss": 0.0177, + "step": 109510 + }, + { + "epoch": 0.8095561928979037, + "grad_norm": 0.10563500970602036, + "learning_rate": 9.557514245014246e-06, + "loss": 0.0162, + "step": 109520 + }, + { + "epoch": 0.8096301114692055, + "grad_norm": 0.09399376064538956, + "learning_rate": 9.55380460588794e-06, + "loss": 0.0179, + "step": 109530 + }, + { + "epoch": 0.8097040300405074, + "grad_norm": 0.07743006944656372, + "learning_rate": 9.550094966761634e-06, + "loss": 0.0166, + "step": 109540 + }, + { + "epoch": 0.8097779486118092, + "grad_norm": 0.08263615518808365, + "learning_rate": 9.546385327635329e-06, + "loss": 0.0171, + "step": 109550 + }, + { + "epoch": 0.8098518671831111, + "grad_norm": 0.10280684381723404, + "learning_rate": 9.542675688509023e-06, + "loss": 0.017, + "step": 109560 + }, + { + "epoch": 0.809925785754413, + "grad_norm": 0.07834198325872421, + "learning_rate": 9.538966049382717e-06, + "loss": 0.0165, + "step": 109570 + }, + { + "epoch": 0.8099997043257148, + "grad_norm": 0.08863791823387146, + "learning_rate": 9.53525641025641e-06, + "loss": 0.0159, + "step": 109580 + }, + { + "epoch": 0.8100736228970167, + "grad_norm": 0.07828589528799057, + "learning_rate": 9.531546771130106e-06, + "loss": 0.0191, + "step": 109590 + }, + { + "epoch": 0.8101475414683185, + "grad_norm": 0.07181723415851593, + "learning_rate": 9.527837132003798e-06, + "loss": 0.0187, + "step": 109600 + }, + { + "epoch": 0.8102214600396204, + 
"grad_norm": 0.06643036007881165, + "learning_rate": 9.524127492877494e-06, + "loss": 0.016, + "step": 109610 + }, + { + "epoch": 0.8102953786109222, + "grad_norm": 0.04769090190529823, + "learning_rate": 9.520417853751187e-06, + "loss": 0.0198, + "step": 109620 + }, + { + "epoch": 0.8103692971822241, + "grad_norm": 0.0632459968328476, + "learning_rate": 9.516708214624883e-06, + "loss": 0.017, + "step": 109630 + }, + { + "epoch": 0.810443215753526, + "grad_norm": 0.05598078668117523, + "learning_rate": 9.512998575498575e-06, + "loss": 0.0194, + "step": 109640 + }, + { + "epoch": 0.8105171343248277, + "grad_norm": 0.08820049464702606, + "learning_rate": 9.50928893637227e-06, + "loss": 0.0171, + "step": 109650 + }, + { + "epoch": 0.8105910528961296, + "grad_norm": 0.07200802862644196, + "learning_rate": 9.505579297245964e-06, + "loss": 0.0174, + "step": 109660 + }, + { + "epoch": 0.8106649714674314, + "grad_norm": 0.06750143319368362, + "learning_rate": 9.501869658119658e-06, + "loss": 0.0177, + "step": 109670 + }, + { + "epoch": 0.8107388900387333, + "grad_norm": 0.061461541801691055, + "learning_rate": 9.498160018993352e-06, + "loss": 0.0176, + "step": 109680 + }, + { + "epoch": 0.8108128086100352, + "grad_norm": 0.056660935282707214, + "learning_rate": 9.494450379867047e-06, + "loss": 0.0177, + "step": 109690 + }, + { + "epoch": 0.810886727181337, + "grad_norm": 0.06250043213367462, + "learning_rate": 9.490740740740741e-06, + "loss": 0.0176, + "step": 109700 + }, + { + "epoch": 0.8109606457526389, + "grad_norm": 0.06084416061639786, + "learning_rate": 9.487031101614435e-06, + "loss": 0.017, + "step": 109710 + }, + { + "epoch": 0.8110345643239407, + "grad_norm": 0.08368322998285294, + "learning_rate": 9.48332146248813e-06, + "loss": 0.0174, + "step": 109720 + }, + { + "epoch": 0.8111084828952426, + "grad_norm": 0.06921650469303131, + "learning_rate": 9.479611823361824e-06, + "loss": 0.0158, + "step": 109730 + }, + { + "epoch": 0.8111824014665444, + "grad_norm": 0.08464595675468445, + "learning_rate": 9.475902184235518e-06, + "loss": 0.0184, + "step": 109740 + }, + { + "epoch": 0.8112563200378463, + "grad_norm": 0.09766136109828949, + "learning_rate": 9.472192545109212e-06, + "loss": 0.0157, + "step": 109750 + }, + { + "epoch": 0.8113302386091482, + "grad_norm": 0.10288077592849731, + "learning_rate": 9.468482905982907e-06, + "loss": 0.0169, + "step": 109760 + }, + { + "epoch": 0.81140415718045, + "grad_norm": 0.07717662304639816, + "learning_rate": 9.4647732668566e-06, + "loss": 0.0152, + "step": 109770 + }, + { + "epoch": 0.8114780757517519, + "grad_norm": 0.07156575471162796, + "learning_rate": 9.461063627730295e-06, + "loss": 0.0162, + "step": 109780 + }, + { + "epoch": 0.8115519943230537, + "grad_norm": 0.08129802346229553, + "learning_rate": 9.45735398860399e-06, + "loss": 0.0185, + "step": 109790 + }, + { + "epoch": 0.8116259128943556, + "grad_norm": 0.08766285330057144, + "learning_rate": 9.453644349477684e-06, + "loss": 0.0172, + "step": 109800 + }, + { + "epoch": 0.8116998314656574, + "grad_norm": 0.07962168753147125, + "learning_rate": 9.449934710351376e-06, + "loss": 0.0189, + "step": 109810 + }, + { + "epoch": 0.8117737500369593, + "grad_norm": 0.04780622199177742, + "learning_rate": 9.446225071225072e-06, + "loss": 0.0173, + "step": 109820 + }, + { + "epoch": 0.8118476686082612, + "grad_norm": 0.05502776801586151, + "learning_rate": 9.442515432098765e-06, + "loss": 0.0139, + "step": 109830 + }, + { + "epoch": 0.811921587179563, + "grad_norm": 0.07169415056705475, + 
"learning_rate": 9.43880579297246e-06, + "loss": 0.0189, + "step": 109840 + }, + { + "epoch": 0.8119955057508649, + "grad_norm": 0.08810874819755554, + "learning_rate": 9.435096153846153e-06, + "loss": 0.0191, + "step": 109850 + }, + { + "epoch": 0.8120694243221667, + "grad_norm": 0.07159397751092911, + "learning_rate": 9.43138651471985e-06, + "loss": 0.0187, + "step": 109860 + }, + { + "epoch": 0.8121433428934686, + "grad_norm": 0.11870425194501877, + "learning_rate": 9.427676875593542e-06, + "loss": 0.0191, + "step": 109870 + }, + { + "epoch": 0.8122172614647704, + "grad_norm": 0.0735914409160614, + "learning_rate": 9.423967236467236e-06, + "loss": 0.0161, + "step": 109880 + }, + { + "epoch": 0.8122911800360723, + "grad_norm": 0.06938380748033524, + "learning_rate": 9.420257597340932e-06, + "loss": 0.0173, + "step": 109890 + }, + { + "epoch": 0.8123650986073742, + "grad_norm": 0.0808085948228836, + "learning_rate": 9.416547958214625e-06, + "loss": 0.0173, + "step": 109900 + }, + { + "epoch": 0.812439017178676, + "grad_norm": 0.0733335018157959, + "learning_rate": 9.41283831908832e-06, + "loss": 0.0181, + "step": 109910 + }, + { + "epoch": 0.8125129357499778, + "grad_norm": 0.09480886906385422, + "learning_rate": 9.409128679962013e-06, + "loss": 0.0188, + "step": 109920 + }, + { + "epoch": 0.8125868543212796, + "grad_norm": 0.05744516849517822, + "learning_rate": 9.40541904083571e-06, + "loss": 0.0171, + "step": 109930 + }, + { + "epoch": 0.8126607728925815, + "grad_norm": 0.08588145673274994, + "learning_rate": 9.401709401709402e-06, + "loss": 0.0162, + "step": 109940 + }, + { + "epoch": 0.8127346914638834, + "grad_norm": 0.08707545697689056, + "learning_rate": 9.397999762583098e-06, + "loss": 0.0172, + "step": 109950 + }, + { + "epoch": 0.8128086100351852, + "grad_norm": 0.06720160692930222, + "learning_rate": 9.39429012345679e-06, + "loss": 0.018, + "step": 109960 + }, + { + "epoch": 0.8128825286064871, + "grad_norm": 0.0780353993177414, + "learning_rate": 9.390580484330485e-06, + "loss": 0.0176, + "step": 109970 + }, + { + "epoch": 0.8129564471777889, + "grad_norm": 0.062965989112854, + "learning_rate": 9.386870845204179e-06, + "loss": 0.0172, + "step": 109980 + }, + { + "epoch": 0.8130303657490908, + "grad_norm": 0.10180020332336426, + "learning_rate": 9.383161206077873e-06, + "loss": 0.0166, + "step": 109990 + }, + { + "epoch": 0.8131042843203926, + "grad_norm": 0.089419424533844, + "learning_rate": 9.379451566951567e-06, + "loss": 0.02, + "step": 110000 + }, + { + "epoch": 0.8131042843203926, + "eval_f1": 0.6308062269462349, + "eval_loss": 0.016855139285326004, + "eval_precision": 0.5022165389113012, + "eval_recall": 0.8479083322773926, + "eval_runtime": 2929.1275, + "eval_samples_per_second": 184.742, + "eval_steps_per_second": 2.887, + "step": 110000 + }, + { + "epoch": 0.8131782028916945, + "grad_norm": 0.09093558043241501, + "learning_rate": 9.375741927825262e-06, + "loss": 0.0173, + "step": 110010 + }, + { + "epoch": 0.8132521214629964, + "grad_norm": 0.09518486261367798, + "learning_rate": 9.372032288698956e-06, + "loss": 0.0192, + "step": 110020 + }, + { + "epoch": 0.8133260400342982, + "grad_norm": 0.07745856046676636, + "learning_rate": 9.36832264957265e-06, + "loss": 0.0156, + "step": 110030 + }, + { + "epoch": 0.8133999586056001, + "grad_norm": 0.07020919770002365, + "learning_rate": 9.364613010446343e-06, + "loss": 0.0175, + "step": 110040 + }, + { + "epoch": 0.8134738771769019, + "grad_norm": 0.06830445677042007, + "learning_rate": 9.360903371320039e-06, + "loss": 
0.0163, + "step": 110050 + }, + { + "epoch": 0.8135477957482038, + "grad_norm": 0.09864474833011627, + "learning_rate": 9.357193732193733e-06, + "loss": 0.0172, + "step": 110060 + }, + { + "epoch": 0.8136217143195056, + "grad_norm": 0.08051852881908417, + "learning_rate": 9.353484093067427e-06, + "loss": 0.018, + "step": 110070 + }, + { + "epoch": 0.8136956328908075, + "grad_norm": 0.06373199075460434, + "learning_rate": 9.349774453941122e-06, + "loss": 0.0182, + "step": 110080 + }, + { + "epoch": 0.8137695514621094, + "grad_norm": 0.07208918035030365, + "learning_rate": 9.346064814814816e-06, + "loss": 0.0176, + "step": 110090 + }, + { + "epoch": 0.8138434700334112, + "grad_norm": 0.06866229325532913, + "learning_rate": 9.34235517568851e-06, + "loss": 0.0152, + "step": 110100 + }, + { + "epoch": 0.8139173886047131, + "grad_norm": 0.09499530494213104, + "learning_rate": 9.338645536562203e-06, + "loss": 0.0186, + "step": 110110 + }, + { + "epoch": 0.8139913071760149, + "grad_norm": 0.08893073350191116, + "learning_rate": 9.334935897435899e-06, + "loss": 0.0206, + "step": 110120 + }, + { + "epoch": 0.8140652257473168, + "grad_norm": 0.06883365660905838, + "learning_rate": 9.331226258309591e-06, + "loss": 0.0137, + "step": 110130 + }, + { + "epoch": 0.8141391443186186, + "grad_norm": 0.08844020217657089, + "learning_rate": 9.327516619183287e-06, + "loss": 0.0179, + "step": 110140 + }, + { + "epoch": 0.8142130628899205, + "grad_norm": 0.10410746932029724, + "learning_rate": 9.32380698005698e-06, + "loss": 0.019, + "step": 110150 + }, + { + "epoch": 0.8142869814612224, + "grad_norm": 0.07305875420570374, + "learning_rate": 9.320097340930676e-06, + "loss": 0.0154, + "step": 110160 + }, + { + "epoch": 0.8143609000325241, + "grad_norm": 0.09578864276409149, + "learning_rate": 9.316387701804368e-06, + "loss": 0.0176, + "step": 110170 + }, + { + "epoch": 0.814434818603826, + "grad_norm": 0.10740100592374802, + "learning_rate": 9.312678062678064e-06, + "loss": 0.02, + "step": 110180 + }, + { + "epoch": 0.8145087371751278, + "grad_norm": 0.1025090143084526, + "learning_rate": 9.308968423551757e-06, + "loss": 0.0191, + "step": 110190 + }, + { + "epoch": 0.8145826557464297, + "grad_norm": 0.07519301027059555, + "learning_rate": 9.305258784425451e-06, + "loss": 0.0189, + "step": 110200 + }, + { + "epoch": 0.8146565743177316, + "grad_norm": 0.08736584335565567, + "learning_rate": 9.301549145299145e-06, + "loss": 0.0174, + "step": 110210 + }, + { + "epoch": 0.8147304928890334, + "grad_norm": 0.08328347653150558, + "learning_rate": 9.29783950617284e-06, + "loss": 0.0165, + "step": 110220 + }, + { + "epoch": 0.8148044114603353, + "grad_norm": 0.0822676569223404, + "learning_rate": 9.294129867046534e-06, + "loss": 0.0175, + "step": 110230 + }, + { + "epoch": 0.8148783300316371, + "grad_norm": 0.08251149207353592, + "learning_rate": 9.290420227920228e-06, + "loss": 0.0178, + "step": 110240 + }, + { + "epoch": 0.814952248602939, + "grad_norm": 0.0925745889544487, + "learning_rate": 9.286710588793923e-06, + "loss": 0.0158, + "step": 110250 + }, + { + "epoch": 0.8150261671742408, + "grad_norm": 0.09380180388689041, + "learning_rate": 9.283000949667617e-06, + "loss": 0.0161, + "step": 110260 + }, + { + "epoch": 0.8151000857455427, + "grad_norm": 0.05887442082166672, + "learning_rate": 9.279291310541311e-06, + "loss": 0.0153, + "step": 110270 + }, + { + "epoch": 0.8151740043168446, + "grad_norm": 0.0812305212020874, + "learning_rate": 9.275581671415005e-06, + "loss": 0.018, + "step": 110280 + }, + { + "epoch": 
0.8152479228881464, + "grad_norm": 0.06080586463212967, + "learning_rate": 9.2718720322887e-06, + "loss": 0.018, + "step": 110290 + }, + { + "epoch": 0.8153218414594483, + "grad_norm": 0.0710943415760994, + "learning_rate": 9.268162393162394e-06, + "loss": 0.0171, + "step": 110300 + }, + { + "epoch": 0.8153957600307501, + "grad_norm": 0.06379541009664536, + "learning_rate": 9.264452754036088e-06, + "loss": 0.0157, + "step": 110310 + }, + { + "epoch": 0.815469678602052, + "grad_norm": 0.11508116126060486, + "learning_rate": 9.260743114909782e-06, + "loss": 0.0167, + "step": 110320 + }, + { + "epoch": 0.8155435971733538, + "grad_norm": 0.09485437721014023, + "learning_rate": 9.257033475783477e-06, + "loss": 0.0165, + "step": 110330 + }, + { + "epoch": 0.8156175157446557, + "grad_norm": 0.062455859035253525, + "learning_rate": 9.25332383665717e-06, + "loss": 0.0167, + "step": 110340 + }, + { + "epoch": 0.8156914343159576, + "grad_norm": 0.0841631069779396, + "learning_rate": 9.249614197530865e-06, + "loss": 0.0152, + "step": 110350 + }, + { + "epoch": 0.8157653528872594, + "grad_norm": 0.07254151254892349, + "learning_rate": 9.245904558404558e-06, + "loss": 0.0187, + "step": 110360 + }, + { + "epoch": 0.8158392714585613, + "grad_norm": 0.07702041417360306, + "learning_rate": 9.242194919278254e-06, + "loss": 0.0184, + "step": 110370 + }, + { + "epoch": 0.8159131900298631, + "grad_norm": 0.06007539853453636, + "learning_rate": 9.238485280151946e-06, + "loss": 0.0167, + "step": 110380 + }, + { + "epoch": 0.815987108601165, + "grad_norm": 0.07187853008508682, + "learning_rate": 9.234775641025642e-06, + "loss": 0.019, + "step": 110390 + }, + { + "epoch": 0.8160610271724668, + "grad_norm": 0.07426824420690536, + "learning_rate": 9.231066001899335e-06, + "loss": 0.0169, + "step": 110400 + }, + { + "epoch": 0.8161349457437687, + "grad_norm": 0.081462062895298, + "learning_rate": 9.227356362773031e-06, + "loss": 0.0179, + "step": 110410 + }, + { + "epoch": 0.8162088643150706, + "grad_norm": 0.08284270763397217, + "learning_rate": 9.223646723646723e-06, + "loss": 0.0152, + "step": 110420 + }, + { + "epoch": 0.8162827828863723, + "grad_norm": 0.09823929518461227, + "learning_rate": 9.219937084520418e-06, + "loss": 0.0178, + "step": 110430 + }, + { + "epoch": 0.8163567014576742, + "grad_norm": 0.09018296003341675, + "learning_rate": 9.216227445394112e-06, + "loss": 0.0168, + "step": 110440 + }, + { + "epoch": 0.816430620028976, + "grad_norm": 0.07779600471258163, + "learning_rate": 9.212517806267806e-06, + "loss": 0.0179, + "step": 110450 + }, + { + "epoch": 0.8165045386002779, + "grad_norm": 0.09467915445566177, + "learning_rate": 9.2088081671415e-06, + "loss": 0.0189, + "step": 110460 + }, + { + "epoch": 0.8165784571715798, + "grad_norm": 0.06862441450357437, + "learning_rate": 9.205098528015195e-06, + "loss": 0.0171, + "step": 110470 + }, + { + "epoch": 0.8166523757428816, + "grad_norm": 0.0598343126475811, + "learning_rate": 9.201388888888889e-06, + "loss": 0.0168, + "step": 110480 + }, + { + "epoch": 0.8167262943141835, + "grad_norm": 0.06925109028816223, + "learning_rate": 9.197679249762583e-06, + "loss": 0.0162, + "step": 110490 + }, + { + "epoch": 0.8168002128854853, + "grad_norm": 0.10528044402599335, + "learning_rate": 9.193969610636278e-06, + "loss": 0.0185, + "step": 110500 + }, + { + "epoch": 0.8168741314567872, + "grad_norm": 0.09137672930955887, + "learning_rate": 9.190259971509972e-06, + "loss": 0.019, + "step": 110510 + }, + { + "epoch": 0.816948050028089, + "grad_norm": 
0.07503055036067963, + "learning_rate": 9.186550332383666e-06, + "loss": 0.0162, + "step": 110520 + }, + { + "epoch": 0.8170219685993909, + "grad_norm": 0.08323784172534943, + "learning_rate": 9.18284069325736e-06, + "loss": 0.0194, + "step": 110530 + }, + { + "epoch": 0.8170958871706928, + "grad_norm": 0.07209733873605728, + "learning_rate": 9.179131054131055e-06, + "loss": 0.0181, + "step": 110540 + }, + { + "epoch": 0.8171698057419946, + "grad_norm": 0.07893183827400208, + "learning_rate": 9.175421415004749e-06, + "loss": 0.0166, + "step": 110550 + }, + { + "epoch": 0.8172437243132965, + "grad_norm": 0.05670410022139549, + "learning_rate": 9.171711775878443e-06, + "loss": 0.0162, + "step": 110560 + }, + { + "epoch": 0.8173176428845983, + "grad_norm": 0.09070245921611786, + "learning_rate": 9.168002136752136e-06, + "loss": 0.0173, + "step": 110570 + }, + { + "epoch": 0.8173915614559002, + "grad_norm": 0.10759375989437103, + "learning_rate": 9.164292497625832e-06, + "loss": 0.019, + "step": 110580 + }, + { + "epoch": 0.817465480027202, + "grad_norm": 0.06987611204385757, + "learning_rate": 9.160582858499524e-06, + "loss": 0.0165, + "step": 110590 + }, + { + "epoch": 0.8175393985985039, + "grad_norm": 0.06290466338396072, + "learning_rate": 9.15687321937322e-06, + "loss": 0.0174, + "step": 110600 + }, + { + "epoch": 0.8176133171698058, + "grad_norm": 0.05969390273094177, + "learning_rate": 9.153163580246913e-06, + "loss": 0.0181, + "step": 110610 + }, + { + "epoch": 0.8176872357411076, + "grad_norm": 0.06636475771665573, + "learning_rate": 9.149453941120609e-06, + "loss": 0.0191, + "step": 110620 + }, + { + "epoch": 0.8177611543124095, + "grad_norm": 0.0508534200489521, + "learning_rate": 9.145744301994302e-06, + "loss": 0.0185, + "step": 110630 + }, + { + "epoch": 0.8178350728837113, + "grad_norm": 0.08707664906978607, + "learning_rate": 9.142034662867997e-06, + "loss": 0.0174, + "step": 110640 + }, + { + "epoch": 0.8179089914550132, + "grad_norm": 0.0794239416718483, + "learning_rate": 9.138325023741692e-06, + "loss": 0.0181, + "step": 110650 + }, + { + "epoch": 0.817982910026315, + "grad_norm": 0.07165955752134323, + "learning_rate": 9.134615384615384e-06, + "loss": 0.0196, + "step": 110660 + }, + { + "epoch": 0.8180568285976169, + "grad_norm": 0.07645541429519653, + "learning_rate": 9.13090574548908e-06, + "loss": 0.0176, + "step": 110670 + }, + { + "epoch": 0.8181307471689188, + "grad_norm": 0.08750036358833313, + "learning_rate": 9.127196106362773e-06, + "loss": 0.0161, + "step": 110680 + }, + { + "epoch": 0.8182046657402205, + "grad_norm": 0.10095136612653732, + "learning_rate": 9.123486467236469e-06, + "loss": 0.0189, + "step": 110690 + }, + { + "epoch": 0.8182785843115225, + "grad_norm": 0.07623224705457687, + "learning_rate": 9.119776828110161e-06, + "loss": 0.0151, + "step": 110700 + }, + { + "epoch": 0.8183525028828242, + "grad_norm": 0.09414133429527283, + "learning_rate": 9.116067188983857e-06, + "loss": 0.0176, + "step": 110710 + }, + { + "epoch": 0.8184264214541261, + "grad_norm": 0.062059711664915085, + "learning_rate": 9.11235754985755e-06, + "loss": 0.0158, + "step": 110720 + }, + { + "epoch": 0.818500340025428, + "grad_norm": 0.11147674918174744, + "learning_rate": 9.108647910731244e-06, + "loss": 0.0185, + "step": 110730 + }, + { + "epoch": 0.8185742585967298, + "grad_norm": 0.0802476778626442, + "learning_rate": 9.104938271604939e-06, + "loss": 0.0165, + "step": 110740 + }, + { + "epoch": 0.8186481771680317, + "grad_norm": 0.056705329567193985, + "learning_rate": 
9.101228632478633e-06, + "loss": 0.0166, + "step": 110750 + }, + { + "epoch": 0.8187220957393335, + "grad_norm": 0.10482749342918396, + "learning_rate": 9.097518993352327e-06, + "loss": 0.0169, + "step": 110760 + }, + { + "epoch": 0.8187960143106354, + "grad_norm": 0.07661043107509613, + "learning_rate": 9.093809354226021e-06, + "loss": 0.0171, + "step": 110770 + }, + { + "epoch": 0.8188699328819372, + "grad_norm": 0.04403127729892731, + "learning_rate": 9.090099715099716e-06, + "loss": 0.0158, + "step": 110780 + }, + { + "epoch": 0.8189438514532391, + "grad_norm": 0.10330238193273544, + "learning_rate": 9.08639007597341e-06, + "loss": 0.0179, + "step": 110790 + }, + { + "epoch": 0.819017770024541, + "grad_norm": 0.07194700092077255, + "learning_rate": 9.082680436847102e-06, + "loss": 0.0166, + "step": 110800 + }, + { + "epoch": 0.8190916885958428, + "grad_norm": 0.07015722990036011, + "learning_rate": 9.078970797720798e-06, + "loss": 0.0165, + "step": 110810 + }, + { + "epoch": 0.8191656071671447, + "grad_norm": 0.09940315037965775, + "learning_rate": 9.075261158594493e-06, + "loss": 0.0186, + "step": 110820 + }, + { + "epoch": 0.8192395257384465, + "grad_norm": 0.08396787196397781, + "learning_rate": 9.071551519468187e-06, + "loss": 0.0155, + "step": 110830 + }, + { + "epoch": 0.8193134443097484, + "grad_norm": 0.09233757108449936, + "learning_rate": 9.067841880341881e-06, + "loss": 0.0168, + "step": 110840 + }, + { + "epoch": 0.8193873628810502, + "grad_norm": 0.0720495656132698, + "learning_rate": 9.064132241215576e-06, + "loss": 0.0164, + "step": 110850 + }, + { + "epoch": 0.8194612814523521, + "grad_norm": 0.08925806730985641, + "learning_rate": 9.06042260208927e-06, + "loss": 0.017, + "step": 110860 + }, + { + "epoch": 0.819535200023654, + "grad_norm": 0.08104828745126724, + "learning_rate": 9.056712962962964e-06, + "loss": 0.0188, + "step": 110870 + }, + { + "epoch": 0.8196091185949558, + "grad_norm": 0.07082411646842957, + "learning_rate": 9.053003323836658e-06, + "loss": 0.0197, + "step": 110880 + }, + { + "epoch": 0.8196830371662577, + "grad_norm": 0.08721831440925598, + "learning_rate": 9.049293684710351e-06, + "loss": 0.0153, + "step": 110890 + }, + { + "epoch": 0.8197569557375595, + "grad_norm": 0.11455877870321274, + "learning_rate": 9.045584045584047e-06, + "loss": 0.0176, + "step": 110900 + }, + { + "epoch": 0.8198308743088614, + "grad_norm": 0.08649104833602905, + "learning_rate": 9.04187440645774e-06, + "loss": 0.0171, + "step": 110910 + }, + { + "epoch": 0.8199047928801632, + "grad_norm": 0.08350533246994019, + "learning_rate": 9.038164767331435e-06, + "loss": 0.0167, + "step": 110920 + }, + { + "epoch": 0.8199787114514651, + "grad_norm": 0.0666193887591362, + "learning_rate": 9.034455128205128e-06, + "loss": 0.0162, + "step": 110930 + }, + { + "epoch": 0.820052630022767, + "grad_norm": 0.08098326623439789, + "learning_rate": 9.030745489078824e-06, + "loss": 0.0157, + "step": 110940 + }, + { + "epoch": 0.8201265485940687, + "grad_norm": 0.06648987531661987, + "learning_rate": 9.027035849952517e-06, + "loss": 0.0163, + "step": 110950 + }, + { + "epoch": 0.8202004671653707, + "grad_norm": 0.09723273664712906, + "learning_rate": 9.02332621082621e-06, + "loss": 0.017, + "step": 110960 + }, + { + "epoch": 0.8202743857366724, + "grad_norm": 0.09789856523275375, + "learning_rate": 9.019616571699905e-06, + "loss": 0.0159, + "step": 110970 + }, + { + "epoch": 0.8203483043079743, + "grad_norm": 0.07456418126821518, + "learning_rate": 9.0159069325736e-06, + "loss": 0.0147, + 
"step": 110980 + }, + { + "epoch": 0.8204222228792762, + "grad_norm": 0.08231586217880249, + "learning_rate": 9.012197293447294e-06, + "loss": 0.0183, + "step": 110990 + }, + { + "epoch": 0.820496141450578, + "grad_norm": 0.0645674616098404, + "learning_rate": 9.008487654320988e-06, + "loss": 0.0158, + "step": 111000 + }, + { + "epoch": 0.8205700600218799, + "grad_norm": 0.07256720215082169, + "learning_rate": 9.004778015194682e-06, + "loss": 0.0199, + "step": 111010 + }, + { + "epoch": 0.8206439785931817, + "grad_norm": 0.09442053735256195, + "learning_rate": 9.001068376068376e-06, + "loss": 0.0171, + "step": 111020 + }, + { + "epoch": 0.8207178971644836, + "grad_norm": 0.08105292916297913, + "learning_rate": 8.99735873694207e-06, + "loss": 0.0162, + "step": 111030 + }, + { + "epoch": 0.8207918157357854, + "grad_norm": 0.06316964328289032, + "learning_rate": 8.993649097815765e-06, + "loss": 0.015, + "step": 111040 + }, + { + "epoch": 0.8208657343070873, + "grad_norm": 0.1590401977300644, + "learning_rate": 8.98993945868946e-06, + "loss": 0.016, + "step": 111050 + }, + { + "epoch": 0.8209396528783892, + "grad_norm": 0.1914466768503189, + "learning_rate": 8.986229819563154e-06, + "loss": 0.0171, + "step": 111060 + }, + { + "epoch": 0.821013571449691, + "grad_norm": 0.062346745282411575, + "learning_rate": 8.982520180436848e-06, + "loss": 0.0186, + "step": 111070 + }, + { + "epoch": 0.8210874900209929, + "grad_norm": 0.06994392722845078, + "learning_rate": 8.978810541310542e-06, + "loss": 0.018, + "step": 111080 + }, + { + "epoch": 0.8211614085922947, + "grad_norm": 0.05640175938606262, + "learning_rate": 8.975100902184236e-06, + "loss": 0.0151, + "step": 111090 + }, + { + "epoch": 0.8212353271635966, + "grad_norm": 0.08227109163999557, + "learning_rate": 8.97139126305793e-06, + "loss": 0.018, + "step": 111100 + }, + { + "epoch": 0.8213092457348984, + "grad_norm": 0.09539218246936798, + "learning_rate": 8.967681623931625e-06, + "loss": 0.0163, + "step": 111110 + }, + { + "epoch": 0.8213831643062003, + "grad_norm": 0.07617813348770142, + "learning_rate": 8.963971984805318e-06, + "loss": 0.0189, + "step": 111120 + }, + { + "epoch": 0.8214570828775022, + "grad_norm": 0.07727860659360886, + "learning_rate": 8.960262345679013e-06, + "loss": 0.0177, + "step": 111130 + }, + { + "epoch": 0.821531001448804, + "grad_norm": 0.08091080188751221, + "learning_rate": 8.956552706552706e-06, + "loss": 0.0169, + "step": 111140 + }, + { + "epoch": 0.8216049200201059, + "grad_norm": 0.07870712131261826, + "learning_rate": 8.952843067426402e-06, + "loss": 0.0177, + "step": 111150 + }, + { + "epoch": 0.8216788385914077, + "grad_norm": 0.07519575208425522, + "learning_rate": 8.949133428300095e-06, + "loss": 0.0181, + "step": 111160 + }, + { + "epoch": 0.8217527571627096, + "grad_norm": 0.09125541150569916, + "learning_rate": 8.94542378917379e-06, + "loss": 0.0165, + "step": 111170 + }, + { + "epoch": 0.8218266757340114, + "grad_norm": 0.07214830815792084, + "learning_rate": 8.941714150047483e-06, + "loss": 0.0163, + "step": 111180 + }, + { + "epoch": 0.8219005943053133, + "grad_norm": 0.07173985987901688, + "learning_rate": 8.938004510921177e-06, + "loss": 0.0184, + "step": 111190 + }, + { + "epoch": 0.8219745128766152, + "grad_norm": 0.06702403724193573, + "learning_rate": 8.934294871794872e-06, + "loss": 0.0185, + "step": 111200 + }, + { + "epoch": 0.822048431447917, + "grad_norm": 0.08879362046718597, + "learning_rate": 8.930585232668566e-06, + "loss": 0.0161, + "step": 111210 + }, + { + "epoch": 
0.8221223500192189, + "grad_norm": 0.10882284492254257, + "learning_rate": 8.92687559354226e-06, + "loss": 0.0192, + "step": 111220 + }, + { + "epoch": 0.8221962685905206, + "grad_norm": 0.08666688948869705, + "learning_rate": 8.923165954415955e-06, + "loss": 0.0189, + "step": 111230 + }, + { + "epoch": 0.8222701871618225, + "grad_norm": 0.08701031655073166, + "learning_rate": 8.919456315289649e-06, + "loss": 0.0175, + "step": 111240 + }, + { + "epoch": 0.8223441057331244, + "grad_norm": 0.08602787554264069, + "learning_rate": 8.915746676163343e-06, + "loss": 0.0189, + "step": 111250 + }, + { + "epoch": 0.8224180243044262, + "grad_norm": 0.0800880491733551, + "learning_rate": 8.912037037037037e-06, + "loss": 0.0164, + "step": 111260 + }, + { + "epoch": 0.8224919428757281, + "grad_norm": 0.08936553448438644, + "learning_rate": 8.908327397910732e-06, + "loss": 0.0168, + "step": 111270 + }, + { + "epoch": 0.8225658614470299, + "grad_norm": 0.06623522192239761, + "learning_rate": 8.904617758784426e-06, + "loss": 0.0152, + "step": 111280 + }, + { + "epoch": 0.8226397800183318, + "grad_norm": 0.07130187749862671, + "learning_rate": 8.90090811965812e-06, + "loss": 0.02, + "step": 111290 + }, + { + "epoch": 0.8227136985896336, + "grad_norm": 0.04986598715186119, + "learning_rate": 8.897198480531814e-06, + "loss": 0.0166, + "step": 111300 + }, + { + "epoch": 0.8227876171609355, + "grad_norm": 0.07575799524784088, + "learning_rate": 8.893488841405509e-06, + "loss": 0.0173, + "step": 111310 + }, + { + "epoch": 0.8228615357322374, + "grad_norm": 0.0706147775053978, + "learning_rate": 8.889779202279203e-06, + "loss": 0.0169, + "step": 111320 + }, + { + "epoch": 0.8229354543035392, + "grad_norm": 0.07670772075653076, + "learning_rate": 8.886069563152897e-06, + "loss": 0.0187, + "step": 111330 + }, + { + "epoch": 0.8230093728748411, + "grad_norm": 0.09012507647275925, + "learning_rate": 8.882359924026592e-06, + "loss": 0.0178, + "step": 111340 + }, + { + "epoch": 0.8230832914461429, + "grad_norm": 0.06751397997140884, + "learning_rate": 8.878650284900284e-06, + "loss": 0.0188, + "step": 111350 + }, + { + "epoch": 0.8231572100174448, + "grad_norm": 0.06585299223661423, + "learning_rate": 8.87494064577398e-06, + "loss": 0.0175, + "step": 111360 + }, + { + "epoch": 0.8232311285887466, + "grad_norm": 0.08249194175004959, + "learning_rate": 8.871231006647673e-06, + "loss": 0.0193, + "step": 111370 + }, + { + "epoch": 0.8233050471600485, + "grad_norm": 0.06290542334318161, + "learning_rate": 8.867521367521369e-06, + "loss": 0.0157, + "step": 111380 + }, + { + "epoch": 0.8233789657313504, + "grad_norm": 0.09153600037097931, + "learning_rate": 8.863811728395061e-06, + "loss": 0.0198, + "step": 111390 + }, + { + "epoch": 0.8234528843026522, + "grad_norm": 0.07733996957540512, + "learning_rate": 8.860102089268757e-06, + "loss": 0.0162, + "step": 111400 + }, + { + "epoch": 0.8235268028739541, + "grad_norm": 0.0745389461517334, + "learning_rate": 8.85639245014245e-06, + "loss": 0.0188, + "step": 111410 + }, + { + "epoch": 0.8236007214452559, + "grad_norm": 0.06475479900836945, + "learning_rate": 8.852682811016144e-06, + "loss": 0.0176, + "step": 111420 + }, + { + "epoch": 0.8236746400165578, + "grad_norm": 0.05812176316976547, + "learning_rate": 8.84897317188984e-06, + "loss": 0.0172, + "step": 111430 + }, + { + "epoch": 0.8237485585878596, + "grad_norm": 0.1021883636713028, + "learning_rate": 8.845263532763533e-06, + "loss": 0.0186, + "step": 111440 + }, + { + "epoch": 0.8238224771591615, + "grad_norm": 
0.06590006500482559, + "learning_rate": 8.841553893637229e-06, + "loss": 0.019, + "step": 111450 + }, + { + "epoch": 0.8238963957304634, + "grad_norm": 0.0770881250500679, + "learning_rate": 8.837844254510921e-06, + "loss": 0.0159, + "step": 111460 + }, + { + "epoch": 0.8239703143017651, + "grad_norm": 0.05062444135546684, + "learning_rate": 8.834134615384617e-06, + "loss": 0.0161, + "step": 111470 + }, + { + "epoch": 0.824044232873067, + "grad_norm": 0.08017632365226746, + "learning_rate": 8.83042497625831e-06, + "loss": 0.017, + "step": 111480 + }, + { + "epoch": 0.8241181514443688, + "grad_norm": 0.08347149938344955, + "learning_rate": 8.826715337132004e-06, + "loss": 0.0201, + "step": 111490 + }, + { + "epoch": 0.8241920700156707, + "grad_norm": 0.08138225227594376, + "learning_rate": 8.823005698005698e-06, + "loss": 0.0184, + "step": 111500 + }, + { + "epoch": 0.8242659885869726, + "grad_norm": 0.09080573171377182, + "learning_rate": 8.819296058879392e-06, + "loss": 0.017, + "step": 111510 + }, + { + "epoch": 0.8243399071582744, + "grad_norm": 0.09142883867025375, + "learning_rate": 8.815586419753087e-06, + "loss": 0.0189, + "step": 111520 + }, + { + "epoch": 0.8244138257295763, + "grad_norm": 0.08484750241041183, + "learning_rate": 8.811876780626781e-06, + "loss": 0.0146, + "step": 111530 + }, + { + "epoch": 0.8244877443008781, + "grad_norm": 0.08886722475290298, + "learning_rate": 8.808167141500475e-06, + "loss": 0.0166, + "step": 111540 + }, + { + "epoch": 0.82456166287218, + "grad_norm": 0.05006714537739754, + "learning_rate": 8.80445750237417e-06, + "loss": 0.0148, + "step": 111550 + }, + { + "epoch": 0.8246355814434818, + "grad_norm": 0.0701959878206253, + "learning_rate": 8.800747863247864e-06, + "loss": 0.015, + "step": 111560 + }, + { + "epoch": 0.8247095000147837, + "grad_norm": 0.06328166276216507, + "learning_rate": 8.797038224121558e-06, + "loss": 0.0169, + "step": 111570 + }, + { + "epoch": 0.8247834185860856, + "grad_norm": 0.10973607003688812, + "learning_rate": 8.793328584995252e-06, + "loss": 0.0186, + "step": 111580 + }, + { + "epoch": 0.8248573371573874, + "grad_norm": 0.05845305323600769, + "learning_rate": 8.789618945868947e-06, + "loss": 0.0163, + "step": 111590 + }, + { + "epoch": 0.8249312557286893, + "grad_norm": 0.0932711809873581, + "learning_rate": 8.785909306742641e-06, + "loss": 0.019, + "step": 111600 + }, + { + "epoch": 0.8250051742999911, + "grad_norm": 0.07008972018957138, + "learning_rate": 8.782199667616335e-06, + "loss": 0.0165, + "step": 111610 + }, + { + "epoch": 0.825079092871293, + "grad_norm": 0.1511513590812683, + "learning_rate": 8.77849002849003e-06, + "loss": 0.0185, + "step": 111620 + }, + { + "epoch": 0.8251530114425948, + "grad_norm": 0.08395200967788696, + "learning_rate": 8.774780389363724e-06, + "loss": 0.0155, + "step": 111630 + }, + { + "epoch": 0.8252269300138967, + "grad_norm": 0.06920085102319717, + "learning_rate": 8.771070750237418e-06, + "loss": 0.0199, + "step": 111640 + }, + { + "epoch": 0.8253008485851986, + "grad_norm": 0.06494973599910736, + "learning_rate": 8.76736111111111e-06, + "loss": 0.0157, + "step": 111650 + }, + { + "epoch": 0.8253747671565004, + "grad_norm": 0.09502291679382324, + "learning_rate": 8.763651471984807e-06, + "loss": 0.016, + "step": 111660 + }, + { + "epoch": 0.8254486857278023, + "grad_norm": 0.06811536103487015, + "learning_rate": 8.759941832858499e-06, + "loss": 0.0157, + "step": 111670 + }, + { + "epoch": 0.8255226042991041, + "grad_norm": 0.051820866763591766, + "learning_rate": 
8.756232193732195e-06, + "loss": 0.0165, + "step": 111680 + }, + { + "epoch": 0.825596522870406, + "grad_norm": 0.08794544637203217, + "learning_rate": 8.752522554605888e-06, + "loss": 0.0168, + "step": 111690 + }, + { + "epoch": 0.8256704414417079, + "grad_norm": 0.08136577159166336, + "learning_rate": 8.748812915479584e-06, + "loss": 0.0166, + "step": 111700 + }, + { + "epoch": 0.8257443600130097, + "grad_norm": 0.06894486397504807, + "learning_rate": 8.745103276353276e-06, + "loss": 0.0159, + "step": 111710 + }, + { + "epoch": 0.8258182785843116, + "grad_norm": 0.07653698325157166, + "learning_rate": 8.74139363722697e-06, + "loss": 0.0179, + "step": 111720 + }, + { + "epoch": 0.8258921971556134, + "grad_norm": 0.0793883353471756, + "learning_rate": 8.737683998100665e-06, + "loss": 0.0166, + "step": 111730 + }, + { + "epoch": 0.8259661157269153, + "grad_norm": 0.06822627782821655, + "learning_rate": 8.733974358974359e-06, + "loss": 0.0163, + "step": 111740 + }, + { + "epoch": 0.826040034298217, + "grad_norm": 0.0709918662905693, + "learning_rate": 8.730264719848053e-06, + "loss": 0.0179, + "step": 111750 + }, + { + "epoch": 0.8261139528695189, + "grad_norm": 0.09269371628761292, + "learning_rate": 8.726555080721748e-06, + "loss": 0.019, + "step": 111760 + }, + { + "epoch": 0.8261878714408208, + "grad_norm": 0.09144306927919388, + "learning_rate": 8.722845441595442e-06, + "loss": 0.0173, + "step": 111770 + }, + { + "epoch": 0.8262617900121226, + "grad_norm": 0.06571599096059799, + "learning_rate": 8.719135802469136e-06, + "loss": 0.0165, + "step": 111780 + }, + { + "epoch": 0.8263357085834245, + "grad_norm": 0.0643790140748024, + "learning_rate": 8.71542616334283e-06, + "loss": 0.0165, + "step": 111790 + }, + { + "epoch": 0.8264096271547263, + "grad_norm": 0.06768159568309784, + "learning_rate": 8.711716524216525e-06, + "loss": 0.0177, + "step": 111800 + }, + { + "epoch": 0.8264835457260282, + "grad_norm": 0.08055918663740158, + "learning_rate": 8.708006885090219e-06, + "loss": 0.0172, + "step": 111810 + }, + { + "epoch": 0.82655746429733, + "grad_norm": 0.06265651434659958, + "learning_rate": 8.704297245963913e-06, + "loss": 0.0157, + "step": 111820 + }, + { + "epoch": 0.8266313828686319, + "grad_norm": 0.08364095538854599, + "learning_rate": 8.700587606837607e-06, + "loss": 0.0158, + "step": 111830 + }, + { + "epoch": 0.8267053014399338, + "grad_norm": 0.08273705095052719, + "learning_rate": 8.696877967711302e-06, + "loss": 0.0152, + "step": 111840 + }, + { + "epoch": 0.8267792200112356, + "grad_norm": 0.09029737114906311, + "learning_rate": 8.693168328584996e-06, + "loss": 0.0174, + "step": 111850 + }, + { + "epoch": 0.8268531385825375, + "grad_norm": 0.0713384598493576, + "learning_rate": 8.68945868945869e-06, + "loss": 0.0165, + "step": 111860 + }, + { + "epoch": 0.8269270571538393, + "grad_norm": 0.08109275251626968, + "learning_rate": 8.685749050332385e-06, + "loss": 0.0181, + "step": 111870 + }, + { + "epoch": 0.8270009757251412, + "grad_norm": 0.0740184634923935, + "learning_rate": 8.682039411206077e-06, + "loss": 0.0181, + "step": 111880 + }, + { + "epoch": 0.827074894296443, + "grad_norm": 0.07795722037553787, + "learning_rate": 8.678329772079773e-06, + "loss": 0.0159, + "step": 111890 + }, + { + "epoch": 0.8271488128677449, + "grad_norm": 0.06488138437271118, + "learning_rate": 8.674620132953466e-06, + "loss": 0.0163, + "step": 111900 + }, + { + "epoch": 0.8272227314390468, + "grad_norm": 0.07889160513877869, + "learning_rate": 8.670910493827162e-06, + "loss": 0.0164, + 
"step": 111910 + }, + { + "epoch": 0.8272966500103486, + "grad_norm": 0.07143231481313705, + "learning_rate": 8.667200854700854e-06, + "loss": 0.0149, + "step": 111920 + }, + { + "epoch": 0.8273705685816505, + "grad_norm": 0.07042443007230759, + "learning_rate": 8.66349121557455e-06, + "loss": 0.0163, + "step": 111930 + }, + { + "epoch": 0.8274444871529523, + "grad_norm": 0.08917763829231262, + "learning_rate": 8.659781576448243e-06, + "loss": 0.0179, + "step": 111940 + }, + { + "epoch": 0.8275184057242542, + "grad_norm": 0.06888213753700256, + "learning_rate": 8.656071937321937e-06, + "loss": 0.015, + "step": 111950 + }, + { + "epoch": 0.8275923242955561, + "grad_norm": 0.07403473556041718, + "learning_rate": 8.652362298195631e-06, + "loss": 0.0167, + "step": 111960 + }, + { + "epoch": 0.8276662428668579, + "grad_norm": 0.06935565918684006, + "learning_rate": 8.648652659069326e-06, + "loss": 0.0189, + "step": 111970 + }, + { + "epoch": 0.8277401614381598, + "grad_norm": 0.07324974238872528, + "learning_rate": 8.64494301994302e-06, + "loss": 0.0167, + "step": 111980 + }, + { + "epoch": 0.8278140800094616, + "grad_norm": 0.06783628463745117, + "learning_rate": 8.641233380816714e-06, + "loss": 0.0177, + "step": 111990 + }, + { + "epoch": 0.8278879985807635, + "grad_norm": 0.07136841118335724, + "learning_rate": 8.637523741690408e-06, + "loss": 0.0169, + "step": 112000 + }, + { + "epoch": 0.8279619171520652, + "grad_norm": 0.058445774018764496, + "learning_rate": 8.633814102564103e-06, + "loss": 0.0187, + "step": 112010 + }, + { + "epoch": 0.8280358357233671, + "grad_norm": 0.07820957899093628, + "learning_rate": 8.630104463437797e-06, + "loss": 0.0164, + "step": 112020 + }, + { + "epoch": 0.828109754294669, + "grad_norm": 0.0695708617568016, + "learning_rate": 8.626394824311491e-06, + "loss": 0.0179, + "step": 112030 + }, + { + "epoch": 0.8281836728659708, + "grad_norm": 0.10643965005874634, + "learning_rate": 8.622685185185186e-06, + "loss": 0.0176, + "step": 112040 + }, + { + "epoch": 0.8282575914372727, + "grad_norm": 0.06634803116321564, + "learning_rate": 8.61897554605888e-06, + "loss": 0.0144, + "step": 112050 + }, + { + "epoch": 0.8283315100085745, + "grad_norm": 0.09435508400201797, + "learning_rate": 8.615265906932574e-06, + "loss": 0.018, + "step": 112060 + }, + { + "epoch": 0.8284054285798764, + "grad_norm": 0.08138860762119293, + "learning_rate": 8.611556267806268e-06, + "loss": 0.0169, + "step": 112070 + }, + { + "epoch": 0.8284793471511782, + "grad_norm": 0.07405047863721848, + "learning_rate": 8.607846628679963e-06, + "loss": 0.018, + "step": 112080 + }, + { + "epoch": 0.8285532657224801, + "grad_norm": 0.08439838141202927, + "learning_rate": 8.604136989553657e-06, + "loss": 0.0183, + "step": 112090 + }, + { + "epoch": 0.828627184293782, + "grad_norm": 0.09000565856695175, + "learning_rate": 8.600427350427351e-06, + "loss": 0.0182, + "step": 112100 + }, + { + "epoch": 0.8287011028650838, + "grad_norm": 0.11695726960897446, + "learning_rate": 8.596717711301044e-06, + "loss": 0.0151, + "step": 112110 + }, + { + "epoch": 0.8287750214363857, + "grad_norm": 0.07910315692424774, + "learning_rate": 8.59300807217474e-06, + "loss": 0.0174, + "step": 112120 + }, + { + "epoch": 0.8288489400076875, + "grad_norm": 0.07704861462116241, + "learning_rate": 8.589298433048432e-06, + "loss": 0.0166, + "step": 112130 + }, + { + "epoch": 0.8289228585789894, + "grad_norm": 0.05719153210520744, + "learning_rate": 8.585588793922128e-06, + "loss": 0.0181, + "step": 112140 + }, + { + "epoch": 
0.8289967771502912, + "grad_norm": 0.0725087821483612, + "learning_rate": 8.58187915479582e-06, + "loss": 0.0165, + "step": 112150 + }, + { + "epoch": 0.8290706957215931, + "grad_norm": 0.10840688645839691, + "learning_rate": 8.578169515669517e-06, + "loss": 0.0165, + "step": 112160 + }, + { + "epoch": 0.829144614292895, + "grad_norm": 0.10662669688463211, + "learning_rate": 8.57445987654321e-06, + "loss": 0.0185, + "step": 112170 + }, + { + "epoch": 0.8292185328641968, + "grad_norm": 0.07385453581809998, + "learning_rate": 8.570750237416904e-06, + "loss": 0.018, + "step": 112180 + }, + { + "epoch": 0.8292924514354987, + "grad_norm": 0.06940528750419617, + "learning_rate": 8.5670405982906e-06, + "loss": 0.0162, + "step": 112190 + }, + { + "epoch": 0.8293663700068005, + "grad_norm": 0.08754272758960724, + "learning_rate": 8.563330959164292e-06, + "loss": 0.0178, + "step": 112200 + }, + { + "epoch": 0.8294402885781024, + "grad_norm": 0.09364213049411774, + "learning_rate": 8.559621320037988e-06, + "loss": 0.0163, + "step": 112210 + }, + { + "epoch": 0.8295142071494043, + "grad_norm": 0.050510212779045105, + "learning_rate": 8.55591168091168e-06, + "loss": 0.0163, + "step": 112220 + }, + { + "epoch": 0.8295881257207061, + "grad_norm": 0.07437706738710403, + "learning_rate": 8.552202041785377e-06, + "loss": 0.0168, + "step": 112230 + }, + { + "epoch": 0.829662044292008, + "grad_norm": 0.0602116733789444, + "learning_rate": 8.54849240265907e-06, + "loss": 0.0152, + "step": 112240 + }, + { + "epoch": 0.8297359628633098, + "grad_norm": 0.08567924797534943, + "learning_rate": 8.544782763532765e-06, + "loss": 0.0193, + "step": 112250 + }, + { + "epoch": 0.8298098814346117, + "grad_norm": 0.07743331044912338, + "learning_rate": 8.541073124406458e-06, + "loss": 0.0143, + "step": 112260 + }, + { + "epoch": 0.8298838000059134, + "grad_norm": 0.10275121033191681, + "learning_rate": 8.537363485280152e-06, + "loss": 0.0198, + "step": 112270 + }, + { + "epoch": 0.8299577185772153, + "grad_norm": 0.09203140437602997, + "learning_rate": 8.533653846153846e-06, + "loss": 0.0159, + "step": 112280 + }, + { + "epoch": 0.8300316371485172, + "grad_norm": 0.09986036270856857, + "learning_rate": 8.52994420702754e-06, + "loss": 0.0179, + "step": 112290 + }, + { + "epoch": 0.830105555719819, + "grad_norm": 0.08891689032316208, + "learning_rate": 8.526234567901235e-06, + "loss": 0.0179, + "step": 112300 + }, + { + "epoch": 0.8301794742911209, + "grad_norm": 0.0643787607550621, + "learning_rate": 8.52252492877493e-06, + "loss": 0.0159, + "step": 112310 + }, + { + "epoch": 0.8302533928624227, + "grad_norm": 0.08382133394479752, + "learning_rate": 8.518815289648623e-06, + "loss": 0.0169, + "step": 112320 + }, + { + "epoch": 0.8303273114337246, + "grad_norm": 0.06668587774038315, + "learning_rate": 8.515105650522318e-06, + "loss": 0.0168, + "step": 112330 + }, + { + "epoch": 0.8304012300050264, + "grad_norm": 0.04851803183555603, + "learning_rate": 8.51139601139601e-06, + "loss": 0.0154, + "step": 112340 + }, + { + "epoch": 0.8304751485763283, + "grad_norm": 0.0806727260351181, + "learning_rate": 8.507686372269706e-06, + "loss": 0.0156, + "step": 112350 + }, + { + "epoch": 0.8305490671476302, + "grad_norm": 0.09491311013698578, + "learning_rate": 8.5039767331434e-06, + "loss": 0.0188, + "step": 112360 + }, + { + "epoch": 0.830622985718932, + "grad_norm": 0.08105544745922089, + "learning_rate": 8.500267094017095e-06, + "loss": 0.0191, + "step": 112370 + }, + { + "epoch": 0.8306969042902339, + "grad_norm": 
0.10776547342538834, + "learning_rate": 8.496557454890789e-06, + "loss": 0.0159, + "step": 112380 + }, + { + "epoch": 0.8307708228615357, + "grad_norm": 0.06740918010473251, + "learning_rate": 8.492847815764483e-06, + "loss": 0.0165, + "step": 112390 + }, + { + "epoch": 0.8308447414328376, + "grad_norm": 0.07434792071580887, + "learning_rate": 8.489138176638178e-06, + "loss": 0.0168, + "step": 112400 + }, + { + "epoch": 0.8309186600041394, + "grad_norm": 0.097455695271492, + "learning_rate": 8.485428537511872e-06, + "loss": 0.0176, + "step": 112410 + }, + { + "epoch": 0.8309925785754413, + "grad_norm": 0.09368898719549179, + "learning_rate": 8.481718898385566e-06, + "loss": 0.0186, + "step": 112420 + }, + { + "epoch": 0.8310664971467432, + "grad_norm": 0.07678214460611343, + "learning_rate": 8.478009259259259e-06, + "loss": 0.0192, + "step": 112430 + }, + { + "epoch": 0.831140415718045, + "grad_norm": 0.0795578882098198, + "learning_rate": 8.474299620132955e-06, + "loss": 0.0177, + "step": 112440 + }, + { + "epoch": 0.8312143342893469, + "grad_norm": 0.08029112219810486, + "learning_rate": 8.470589981006647e-06, + "loss": 0.0175, + "step": 112450 + }, + { + "epoch": 0.8312882528606487, + "grad_norm": 0.09500591456890106, + "learning_rate": 8.466880341880343e-06, + "loss": 0.0193, + "step": 112460 + }, + { + "epoch": 0.8313621714319506, + "grad_norm": 0.06177762895822525, + "learning_rate": 8.463170702754036e-06, + "loss": 0.0169, + "step": 112470 + }, + { + "epoch": 0.8314360900032525, + "grad_norm": 0.0490424744784832, + "learning_rate": 8.459461063627732e-06, + "loss": 0.0189, + "step": 112480 + }, + { + "epoch": 0.8315100085745543, + "grad_norm": 0.06845315545797348, + "learning_rate": 8.455751424501424e-06, + "loss": 0.0163, + "step": 112490 + }, + { + "epoch": 0.8315839271458562, + "grad_norm": 0.0934458076953888, + "learning_rate": 8.452041785375119e-06, + "loss": 0.0179, + "step": 112500 + }, + { + "epoch": 0.831657845717158, + "grad_norm": 0.07834131270647049, + "learning_rate": 8.448332146248813e-06, + "loss": 0.0175, + "step": 112510 + }, + { + "epoch": 0.8317317642884599, + "grad_norm": 0.08731988817453384, + "learning_rate": 8.444622507122507e-06, + "loss": 0.0177, + "step": 112520 + }, + { + "epoch": 0.8318056828597616, + "grad_norm": 0.10199026763439178, + "learning_rate": 8.440912867996202e-06, + "loss": 0.0173, + "step": 112530 + }, + { + "epoch": 0.8318796014310635, + "grad_norm": 0.06616607308387756, + "learning_rate": 8.437203228869896e-06, + "loss": 0.018, + "step": 112540 + }, + { + "epoch": 0.8319535200023654, + "grad_norm": 0.07040773332118988, + "learning_rate": 8.43349358974359e-06, + "loss": 0.0159, + "step": 112550 + }, + { + "epoch": 0.8320274385736672, + "grad_norm": 0.07667144387960434, + "learning_rate": 8.429783950617284e-06, + "loss": 0.0156, + "step": 112560 + }, + { + "epoch": 0.8321013571449691, + "grad_norm": 0.06196437403559685, + "learning_rate": 8.426074311490979e-06, + "loss": 0.0165, + "step": 112570 + }, + { + "epoch": 0.8321752757162709, + "grad_norm": 0.07039918005466461, + "learning_rate": 8.422364672364673e-06, + "loss": 0.0163, + "step": 112580 + }, + { + "epoch": 0.8322491942875728, + "grad_norm": 0.11908307671546936, + "learning_rate": 8.418655033238367e-06, + "loss": 0.0159, + "step": 112590 + }, + { + "epoch": 0.8323231128588746, + "grad_norm": 0.07255138456821442, + "learning_rate": 8.414945394112061e-06, + "loss": 0.0178, + "step": 112600 + }, + { + "epoch": 0.8323970314301765, + "grad_norm": 0.07477567344903946, + "learning_rate": 
8.411235754985756e-06, + "loss": 0.0164, + "step": 112610 + }, + { + "epoch": 0.8324709500014784, + "grad_norm": 0.0650930404663086, + "learning_rate": 8.40752611585945e-06, + "loss": 0.0189, + "step": 112620 + }, + { + "epoch": 0.8325448685727802, + "grad_norm": 0.09028556942939758, + "learning_rate": 8.403816476733144e-06, + "loss": 0.0174, + "step": 112630 + }, + { + "epoch": 0.8326187871440821, + "grad_norm": 0.08841590583324432, + "learning_rate": 8.400106837606839e-06, + "loss": 0.0214, + "step": 112640 + }, + { + "epoch": 0.8326927057153839, + "grad_norm": 0.06070448085665703, + "learning_rate": 8.396397198480533e-06, + "loss": 0.0169, + "step": 112650 + }, + { + "epoch": 0.8327666242866858, + "grad_norm": 0.0896754264831543, + "learning_rate": 8.392687559354225e-06, + "loss": 0.0157, + "step": 112660 + }, + { + "epoch": 0.8328405428579876, + "grad_norm": 0.09884040802717209, + "learning_rate": 8.388977920227921e-06, + "loss": 0.0173, + "step": 112670 + }, + { + "epoch": 0.8329144614292895, + "grad_norm": 0.06297778338193893, + "learning_rate": 8.385268281101614e-06, + "loss": 0.0187, + "step": 112680 + }, + { + "epoch": 0.8329883800005914, + "grad_norm": 0.07070671766996384, + "learning_rate": 8.38155864197531e-06, + "loss": 0.0193, + "step": 112690 + }, + { + "epoch": 0.8330622985718932, + "grad_norm": 0.06912417709827423, + "learning_rate": 8.377849002849002e-06, + "loss": 0.0187, + "step": 112700 + }, + { + "epoch": 0.8331362171431951, + "grad_norm": 0.05659592151641846, + "learning_rate": 8.374139363722698e-06, + "loss": 0.0166, + "step": 112710 + }, + { + "epoch": 0.8332101357144969, + "grad_norm": 0.06409688293933868, + "learning_rate": 8.370429724596391e-06, + "loss": 0.0154, + "step": 112720 + }, + { + "epoch": 0.8332840542857988, + "grad_norm": 0.05440429225564003, + "learning_rate": 8.366720085470085e-06, + "loss": 0.0159, + "step": 112730 + }, + { + "epoch": 0.8333579728571007, + "grad_norm": 0.07336640357971191, + "learning_rate": 8.36301044634378e-06, + "loss": 0.0179, + "step": 112740 + }, + { + "epoch": 0.8334318914284025, + "grad_norm": 0.08996220678091049, + "learning_rate": 8.359300807217474e-06, + "loss": 0.0165, + "step": 112750 + }, + { + "epoch": 0.8335058099997044, + "grad_norm": 0.05664195865392685, + "learning_rate": 8.355591168091168e-06, + "loss": 0.0162, + "step": 112760 + }, + { + "epoch": 0.8335797285710062, + "grad_norm": 0.07927602529525757, + "learning_rate": 8.351881528964862e-06, + "loss": 0.0181, + "step": 112770 + }, + { + "epoch": 0.833653647142308, + "grad_norm": 0.08353345841169357, + "learning_rate": 8.348171889838557e-06, + "loss": 0.0162, + "step": 112780 + }, + { + "epoch": 0.8337275657136098, + "grad_norm": 0.07481536269187927, + "learning_rate": 8.344462250712251e-06, + "loss": 0.0179, + "step": 112790 + }, + { + "epoch": 0.8338014842849117, + "grad_norm": 0.07783810794353485, + "learning_rate": 8.340752611585945e-06, + "loss": 0.0164, + "step": 112800 + }, + { + "epoch": 0.8338754028562136, + "grad_norm": 0.0775987058877945, + "learning_rate": 8.33704297245964e-06, + "loss": 0.0181, + "step": 112810 + }, + { + "epoch": 0.8339493214275154, + "grad_norm": 0.07131469249725342, + "learning_rate": 8.333333333333334e-06, + "loss": 0.018, + "step": 112820 + }, + { + "epoch": 0.8340232399988173, + "grad_norm": 0.088409923017025, + "learning_rate": 8.329623694207028e-06, + "loss": 0.0185, + "step": 112830 + }, + { + "epoch": 0.8340971585701191, + "grad_norm": 0.09589328616857529, + "learning_rate": 8.325914055080722e-06, + "loss": 0.0154, + 
"step": 112840 + }, + { + "epoch": 0.834171077141421, + "grad_norm": 0.06863109767436981, + "learning_rate": 8.322204415954417e-06, + "loss": 0.0167, + "step": 112850 + }, + { + "epoch": 0.8342449957127228, + "grad_norm": 0.08470222353935242, + "learning_rate": 8.31849477682811e-06, + "loss": 0.0143, + "step": 112860 + }, + { + "epoch": 0.8343189142840247, + "grad_norm": 0.07842252403497696, + "learning_rate": 8.314785137701805e-06, + "loss": 0.0182, + "step": 112870 + }, + { + "epoch": 0.8343928328553266, + "grad_norm": 0.08063441514968872, + "learning_rate": 8.3110754985755e-06, + "loss": 0.0193, + "step": 112880 + }, + { + "epoch": 0.8344667514266284, + "grad_norm": 0.05889301747083664, + "learning_rate": 8.307365859449192e-06, + "loss": 0.0186, + "step": 112890 + }, + { + "epoch": 0.8345406699979303, + "grad_norm": 0.06008196622133255, + "learning_rate": 8.303656220322888e-06, + "loss": 0.0187, + "step": 112900 + }, + { + "epoch": 0.8346145885692321, + "grad_norm": 0.07193055003881454, + "learning_rate": 8.29994658119658e-06, + "loss": 0.0203, + "step": 112910 + }, + { + "epoch": 0.834688507140534, + "grad_norm": 0.1248464584350586, + "learning_rate": 8.296236942070276e-06, + "loss": 0.0156, + "step": 112920 + }, + { + "epoch": 0.8347624257118358, + "grad_norm": 0.06551370769739151, + "learning_rate": 8.292527302943969e-06, + "loss": 0.0189, + "step": 112930 + }, + { + "epoch": 0.8348363442831377, + "grad_norm": 0.08450902998447418, + "learning_rate": 8.288817663817665e-06, + "loss": 0.0194, + "step": 112940 + }, + { + "epoch": 0.8349102628544396, + "grad_norm": 0.07757827639579773, + "learning_rate": 8.28510802469136e-06, + "loss": 0.0182, + "step": 112950 + }, + { + "epoch": 0.8349841814257414, + "grad_norm": 0.08068808168172836, + "learning_rate": 8.281398385565052e-06, + "loss": 0.0172, + "step": 112960 + }, + { + "epoch": 0.8350580999970433, + "grad_norm": 0.07426097244024277, + "learning_rate": 8.277688746438748e-06, + "loss": 0.0185, + "step": 112970 + }, + { + "epoch": 0.8351320185683451, + "grad_norm": 0.07921002060174942, + "learning_rate": 8.27397910731244e-06, + "loss": 0.019, + "step": 112980 + }, + { + "epoch": 0.835205937139647, + "grad_norm": 0.06329020857810974, + "learning_rate": 8.270269468186136e-06, + "loss": 0.0171, + "step": 112990 + }, + { + "epoch": 0.8352798557109489, + "grad_norm": 0.062443338334560394, + "learning_rate": 8.266559829059829e-06, + "loss": 0.0142, + "step": 113000 + }, + { + "epoch": 0.8353537742822507, + "grad_norm": 0.10196477174758911, + "learning_rate": 8.262850189933525e-06, + "loss": 0.0195, + "step": 113010 + }, + { + "epoch": 0.8354276928535526, + "grad_norm": 0.0638945996761322, + "learning_rate": 8.259140550807217e-06, + "loss": 0.0175, + "step": 113020 + }, + { + "epoch": 0.8355016114248544, + "grad_norm": 0.0810115784406662, + "learning_rate": 8.255430911680912e-06, + "loss": 0.0151, + "step": 113030 + }, + { + "epoch": 0.8355755299961563, + "grad_norm": 0.07323488593101501, + "learning_rate": 8.251721272554606e-06, + "loss": 0.0189, + "step": 113040 + }, + { + "epoch": 0.835649448567458, + "grad_norm": 0.06533502042293549, + "learning_rate": 8.2480116334283e-06, + "loss": 0.0162, + "step": 113050 + }, + { + "epoch": 0.8357233671387599, + "grad_norm": 0.08269846439361572, + "learning_rate": 8.244301994301995e-06, + "loss": 0.0189, + "step": 113060 + }, + { + "epoch": 0.8357972857100618, + "grad_norm": 0.05778762325644493, + "learning_rate": 8.240592355175689e-06, + "loss": 0.016, + "step": 113070 + }, + { + "epoch": 
0.8358712042813636, + "grad_norm": 0.0773656889796257, + "learning_rate": 8.236882716049383e-06, + "loss": 0.0168, + "step": 113080 + }, + { + "epoch": 0.8359451228526655, + "grad_norm": 0.08560159057378769, + "learning_rate": 8.233173076923077e-06, + "loss": 0.0179, + "step": 113090 + }, + { + "epoch": 0.8360190414239673, + "grad_norm": 0.060995034873485565, + "learning_rate": 8.229463437796772e-06, + "loss": 0.0185, + "step": 113100 + }, + { + "epoch": 0.8360929599952692, + "grad_norm": 0.10020701587200165, + "learning_rate": 8.225753798670466e-06, + "loss": 0.0171, + "step": 113110 + }, + { + "epoch": 0.836166878566571, + "grad_norm": 0.06637522578239441, + "learning_rate": 8.22204415954416e-06, + "loss": 0.0157, + "step": 113120 + }, + { + "epoch": 0.8362407971378729, + "grad_norm": 0.07538586109876633, + "learning_rate": 8.218334520417854e-06, + "loss": 0.0171, + "step": 113130 + }, + { + "epoch": 0.8363147157091748, + "grad_norm": 0.061674199998378754, + "learning_rate": 8.214624881291549e-06, + "loss": 0.0137, + "step": 113140 + }, + { + "epoch": 0.8363886342804766, + "grad_norm": 0.10180699080228806, + "learning_rate": 8.210915242165243e-06, + "loss": 0.0157, + "step": 113150 + }, + { + "epoch": 0.8364625528517785, + "grad_norm": 0.07494241744279861, + "learning_rate": 8.207205603038937e-06, + "loss": 0.0165, + "step": 113160 + }, + { + "epoch": 0.8365364714230803, + "grad_norm": 0.07682108879089355, + "learning_rate": 8.203495963912632e-06, + "loss": 0.0158, + "step": 113170 + }, + { + "epoch": 0.8366103899943822, + "grad_norm": 0.09721241891384125, + "learning_rate": 8.199786324786326e-06, + "loss": 0.0157, + "step": 113180 + }, + { + "epoch": 0.836684308565684, + "grad_norm": 0.11956478655338287, + "learning_rate": 8.196076685660018e-06, + "loss": 0.0177, + "step": 113190 + }, + { + "epoch": 0.8367582271369859, + "grad_norm": 0.08993244171142578, + "learning_rate": 8.192367046533714e-06, + "loss": 0.0167, + "step": 113200 + }, + { + "epoch": 0.8368321457082878, + "grad_norm": 0.08010384440422058, + "learning_rate": 8.188657407407407e-06, + "loss": 0.0182, + "step": 113210 + }, + { + "epoch": 0.8369060642795896, + "grad_norm": 0.06815112382173538, + "learning_rate": 8.184947768281103e-06, + "loss": 0.014, + "step": 113220 + }, + { + "epoch": 0.8369799828508915, + "grad_norm": 0.06534769386053085, + "learning_rate": 8.181238129154796e-06, + "loss": 0.0165, + "step": 113230 + }, + { + "epoch": 0.8370539014221933, + "grad_norm": 0.0753227025270462, + "learning_rate": 8.177528490028492e-06, + "loss": 0.0182, + "step": 113240 + }, + { + "epoch": 0.8371278199934952, + "grad_norm": 0.10362353920936584, + "learning_rate": 8.173818850902184e-06, + "loss": 0.018, + "step": 113250 + }, + { + "epoch": 0.8372017385647971, + "grad_norm": 0.08055713027715683, + "learning_rate": 8.170109211775878e-06, + "loss": 0.0168, + "step": 113260 + }, + { + "epoch": 0.8372756571360989, + "grad_norm": 0.07950981706380844, + "learning_rate": 8.166399572649573e-06, + "loss": 0.0168, + "step": 113270 + }, + { + "epoch": 0.8373495757074008, + "grad_norm": 0.07329950481653214, + "learning_rate": 8.162689933523267e-06, + "loss": 0.0159, + "step": 113280 + }, + { + "epoch": 0.8374234942787026, + "grad_norm": 0.08244430273771286, + "learning_rate": 8.158980294396961e-06, + "loss": 0.0189, + "step": 113290 + }, + { + "epoch": 0.8374974128500045, + "grad_norm": 0.08308271318674088, + "learning_rate": 8.155270655270655e-06, + "loss": 0.0162, + "step": 113300 + }, + { + "epoch": 0.8375713314213062, + "grad_norm": 
0.08598320931196213, + "learning_rate": 8.15156101614435e-06, + "loss": 0.0169, + "step": 113310 + }, + { + "epoch": 0.8376452499926081, + "grad_norm": 0.06367569416761398, + "learning_rate": 8.147851377018044e-06, + "loss": 0.0162, + "step": 113320 + }, + { + "epoch": 0.83771916856391, + "grad_norm": 0.058816634118556976, + "learning_rate": 8.144141737891738e-06, + "loss": 0.0183, + "step": 113330 + }, + { + "epoch": 0.8377930871352118, + "grad_norm": 0.0877440795302391, + "learning_rate": 8.140432098765433e-06, + "loss": 0.0161, + "step": 113340 + }, + { + "epoch": 0.8378670057065137, + "grad_norm": 0.054381802678108215, + "learning_rate": 8.136722459639127e-06, + "loss": 0.0167, + "step": 113350 + }, + { + "epoch": 0.8379409242778155, + "grad_norm": 0.05709408223628998, + "learning_rate": 8.133012820512821e-06, + "loss": 0.0197, + "step": 113360 + }, + { + "epoch": 0.8380148428491174, + "grad_norm": 0.058122724294662476, + "learning_rate": 8.129303181386515e-06, + "loss": 0.0169, + "step": 113370 + }, + { + "epoch": 0.8380887614204192, + "grad_norm": 0.07336383312940598, + "learning_rate": 8.12559354226021e-06, + "loss": 0.0178, + "step": 113380 + }, + { + "epoch": 0.8381626799917211, + "grad_norm": 0.07232995331287384, + "learning_rate": 8.121883903133904e-06, + "loss": 0.0174, + "step": 113390 + }, + { + "epoch": 0.838236598563023, + "grad_norm": 0.08827503025531769, + "learning_rate": 8.118174264007598e-06, + "loss": 0.0192, + "step": 113400 + }, + { + "epoch": 0.8383105171343248, + "grad_norm": 0.059943169355392456, + "learning_rate": 8.114464624881292e-06, + "loss": 0.0168, + "step": 113410 + }, + { + "epoch": 0.8383844357056267, + "grad_norm": 0.07064764946699142, + "learning_rate": 8.110754985754985e-06, + "loss": 0.0161, + "step": 113420 + }, + { + "epoch": 0.8384583542769285, + "grad_norm": 0.08875328302383423, + "learning_rate": 8.107045346628681e-06, + "loss": 0.0172, + "step": 113430 + }, + { + "epoch": 0.8385322728482304, + "grad_norm": 0.08653685450553894, + "learning_rate": 8.103335707502374e-06, + "loss": 0.0184, + "step": 113440 + }, + { + "epoch": 0.8386061914195323, + "grad_norm": 0.0637197494506836, + "learning_rate": 8.09962606837607e-06, + "loss": 0.0154, + "step": 113450 + }, + { + "epoch": 0.8386801099908341, + "grad_norm": 0.09255864471197128, + "learning_rate": 8.095916429249762e-06, + "loss": 0.0195, + "step": 113460 + }, + { + "epoch": 0.838754028562136, + "grad_norm": 0.06862996518611908, + "learning_rate": 8.092206790123458e-06, + "loss": 0.0159, + "step": 113470 + }, + { + "epoch": 0.8388279471334378, + "grad_norm": 0.08368046581745148, + "learning_rate": 8.08849715099715e-06, + "loss": 0.0171, + "step": 113480 + }, + { + "epoch": 0.8389018657047397, + "grad_norm": 0.07304484397172928, + "learning_rate": 8.084787511870845e-06, + "loss": 0.0167, + "step": 113490 + }, + { + "epoch": 0.8389757842760415, + "grad_norm": 0.07190733402967453, + "learning_rate": 8.08107787274454e-06, + "loss": 0.0157, + "step": 113500 + }, + { + "epoch": 0.8390497028473434, + "grad_norm": 0.08199194818735123, + "learning_rate": 8.077368233618233e-06, + "loss": 0.0168, + "step": 113510 + }, + { + "epoch": 0.8391236214186453, + "grad_norm": 0.050676293671131134, + "learning_rate": 8.073658594491928e-06, + "loss": 0.0187, + "step": 113520 + }, + { + "epoch": 0.8391975399899471, + "grad_norm": 0.0652499794960022, + "learning_rate": 8.069948955365622e-06, + "loss": 0.016, + "step": 113530 + }, + { + "epoch": 0.839271458561249, + "grad_norm": 0.09222967177629471, + "learning_rate": 
8.066239316239316e-06, + "loss": 0.0161, + "step": 113540 + }, + { + "epoch": 0.8393453771325508, + "grad_norm": 0.10128255933523178, + "learning_rate": 8.06252967711301e-06, + "loss": 0.0154, + "step": 113550 + }, + { + "epoch": 0.8394192957038527, + "grad_norm": 0.06996087729930878, + "learning_rate": 8.058820037986707e-06, + "loss": 0.0186, + "step": 113560 + }, + { + "epoch": 0.8394932142751544, + "grad_norm": 0.06131841987371445, + "learning_rate": 8.055110398860399e-06, + "loss": 0.0178, + "step": 113570 + }, + { + "epoch": 0.8395671328464563, + "grad_norm": 0.08808259665966034, + "learning_rate": 8.051400759734093e-06, + "loss": 0.0175, + "step": 113580 + }, + { + "epoch": 0.8396410514177582, + "grad_norm": 0.050261400640010834, + "learning_rate": 8.047691120607788e-06, + "loss": 0.0153, + "step": 113590 + }, + { + "epoch": 0.83971496998906, + "grad_norm": 0.0644235759973526, + "learning_rate": 8.043981481481482e-06, + "loss": 0.0163, + "step": 113600 + }, + { + "epoch": 0.8397888885603619, + "grad_norm": 0.07216489315032959, + "learning_rate": 8.040271842355176e-06, + "loss": 0.0164, + "step": 113610 + }, + { + "epoch": 0.8398628071316637, + "grad_norm": 0.08070338517427444, + "learning_rate": 8.03656220322887e-06, + "loss": 0.0183, + "step": 113620 + }, + { + "epoch": 0.8399367257029656, + "grad_norm": 0.06366539746522903, + "learning_rate": 8.032852564102565e-06, + "loss": 0.0139, + "step": 113630 + }, + { + "epoch": 0.8400106442742674, + "grad_norm": 0.08025801181793213, + "learning_rate": 8.029142924976259e-06, + "loss": 0.0162, + "step": 113640 + }, + { + "epoch": 0.8400845628455693, + "grad_norm": 0.0852746069431305, + "learning_rate": 8.025433285849952e-06, + "loss": 0.0178, + "step": 113650 + }, + { + "epoch": 0.8401584814168712, + "grad_norm": 0.07929253578186035, + "learning_rate": 8.021723646723648e-06, + "loss": 0.0188, + "step": 113660 + }, + { + "epoch": 0.840232399988173, + "grad_norm": 0.06703963130712509, + "learning_rate": 8.01801400759734e-06, + "loss": 0.0181, + "step": 113670 + }, + { + "epoch": 0.8403063185594749, + "grad_norm": 0.08296431601047516, + "learning_rate": 8.014304368471036e-06, + "loss": 0.0159, + "step": 113680 + }, + { + "epoch": 0.8403802371307767, + "grad_norm": 0.07311347126960754, + "learning_rate": 8.010594729344729e-06, + "loss": 0.0172, + "step": 113690 + }, + { + "epoch": 0.8404541557020786, + "grad_norm": 0.0729018971323967, + "learning_rate": 8.006885090218425e-06, + "loss": 0.0172, + "step": 113700 + }, + { + "epoch": 0.8405280742733805, + "grad_norm": 0.07265544682741165, + "learning_rate": 8.003175451092117e-06, + "loss": 0.0166, + "step": 113710 + }, + { + "epoch": 0.8406019928446823, + "grad_norm": 0.057415880262851715, + "learning_rate": 7.999465811965812e-06, + "loss": 0.0162, + "step": 113720 + }, + { + "epoch": 0.8406759114159842, + "grad_norm": 0.062344666570425034, + "learning_rate": 7.995756172839507e-06, + "loss": 0.0188, + "step": 113730 + }, + { + "epoch": 0.840749829987286, + "grad_norm": 0.08408903330564499, + "learning_rate": 7.9920465337132e-06, + "loss": 0.015, + "step": 113740 + }, + { + "epoch": 0.8408237485585879, + "grad_norm": 0.07730511575937271, + "learning_rate": 7.988336894586896e-06, + "loss": 0.0167, + "step": 113750 + }, + { + "epoch": 0.8408976671298897, + "grad_norm": 0.10464189946651459, + "learning_rate": 7.984627255460589e-06, + "loss": 0.0176, + "step": 113760 + }, + { + "epoch": 0.8409715857011916, + "grad_norm": 0.09843303263187408, + "learning_rate": 7.980917616334285e-06, + "loss": 0.0179, + 
"step": 113770 + }, + { + "epoch": 0.8410455042724935, + "grad_norm": 0.07486625015735626, + "learning_rate": 7.977207977207977e-06, + "loss": 0.0155, + "step": 113780 + }, + { + "epoch": 0.8411194228437953, + "grad_norm": 0.07983004301786423, + "learning_rate": 7.973498338081673e-06, + "loss": 0.017, + "step": 113790 + }, + { + "epoch": 0.8411933414150972, + "grad_norm": 0.07761627435684204, + "learning_rate": 7.969788698955366e-06, + "loss": 0.0163, + "step": 113800 + }, + { + "epoch": 0.841267259986399, + "grad_norm": 0.06571881473064423, + "learning_rate": 7.96607905982906e-06, + "loss": 0.0157, + "step": 113810 + }, + { + "epoch": 0.8413411785577009, + "grad_norm": 0.08808450400829315, + "learning_rate": 7.962369420702754e-06, + "loss": 0.0181, + "step": 113820 + }, + { + "epoch": 0.8414150971290026, + "grad_norm": 0.09128312021493912, + "learning_rate": 7.958659781576449e-06, + "loss": 0.0166, + "step": 113830 + }, + { + "epoch": 0.8414890157003045, + "grad_norm": 0.054315246641635895, + "learning_rate": 7.954950142450143e-06, + "loss": 0.0174, + "step": 113840 + }, + { + "epoch": 0.8415629342716064, + "grad_norm": 0.08689109981060028, + "learning_rate": 7.951240503323837e-06, + "loss": 0.0174, + "step": 113850 + }, + { + "epoch": 0.8416368528429082, + "grad_norm": 0.06928656995296478, + "learning_rate": 7.947530864197531e-06, + "loss": 0.0175, + "step": 113860 + }, + { + "epoch": 0.8417107714142101, + "grad_norm": 0.0875593051314354, + "learning_rate": 7.943821225071226e-06, + "loss": 0.0177, + "step": 113870 + }, + { + "epoch": 0.8417846899855119, + "grad_norm": 0.07347662001848221, + "learning_rate": 7.94011158594492e-06, + "loss": 0.0158, + "step": 113880 + }, + { + "epoch": 0.8418586085568138, + "grad_norm": 0.08090215176343918, + "learning_rate": 7.936401946818614e-06, + "loss": 0.019, + "step": 113890 + }, + { + "epoch": 0.8419325271281156, + "grad_norm": 0.06561324000358582, + "learning_rate": 7.932692307692308e-06, + "loss": 0.0187, + "step": 113900 + }, + { + "epoch": 0.8420064456994175, + "grad_norm": 0.0978066697716713, + "learning_rate": 7.928982668566003e-06, + "loss": 0.0154, + "step": 113910 + }, + { + "epoch": 0.8420803642707194, + "grad_norm": 0.06843017041683197, + "learning_rate": 7.925273029439697e-06, + "loss": 0.0155, + "step": 113920 + }, + { + "epoch": 0.8421542828420212, + "grad_norm": 0.051617275923490524, + "learning_rate": 7.921563390313391e-06, + "loss": 0.0163, + "step": 113930 + }, + { + "epoch": 0.8422282014133231, + "grad_norm": 0.07422637194395065, + "learning_rate": 7.917853751187086e-06, + "loss": 0.0187, + "step": 113940 + }, + { + "epoch": 0.8423021199846249, + "grad_norm": 0.06312904506921768, + "learning_rate": 7.914144112060778e-06, + "loss": 0.0182, + "step": 113950 + }, + { + "epoch": 0.8423760385559268, + "grad_norm": 0.056918200105428696, + "learning_rate": 7.910434472934474e-06, + "loss": 0.0149, + "step": 113960 + }, + { + "epoch": 0.8424499571272287, + "grad_norm": 0.08495678007602692, + "learning_rate": 7.906724833808167e-06, + "loss": 0.0206, + "step": 113970 + }, + { + "epoch": 0.8425238756985305, + "grad_norm": 0.07521885633468628, + "learning_rate": 7.903015194681863e-06, + "loss": 0.0168, + "step": 113980 + }, + { + "epoch": 0.8425977942698324, + "grad_norm": 0.06670526415109634, + "learning_rate": 7.899305555555555e-06, + "loss": 0.0184, + "step": 113990 + }, + { + "epoch": 0.8426717128411342, + "grad_norm": 0.08714757114648819, + "learning_rate": 7.895595916429251e-06, + "loss": 0.0205, + "step": 114000 + }, + { + "epoch": 
0.8427456314124361, + "grad_norm": 0.07835818082094193, + "learning_rate": 7.891886277302944e-06, + "loss": 0.0182, + "step": 114010 + }, + { + "epoch": 0.8428195499837379, + "grad_norm": 0.0721108689904213, + "learning_rate": 7.88817663817664e-06, + "loss": 0.0178, + "step": 114020 + }, + { + "epoch": 0.8428934685550398, + "grad_norm": 0.07580531388521194, + "learning_rate": 7.884466999050332e-06, + "loss": 0.0195, + "step": 114030 + }, + { + "epoch": 0.8429673871263417, + "grad_norm": 0.06839949637651443, + "learning_rate": 7.880757359924027e-06, + "loss": 0.0174, + "step": 114040 + }, + { + "epoch": 0.8430413056976435, + "grad_norm": 0.07175683975219727, + "learning_rate": 7.87704772079772e-06, + "loss": 0.0197, + "step": 114050 + }, + { + "epoch": 0.8431152242689454, + "grad_norm": 0.0730174258351326, + "learning_rate": 7.873338081671415e-06, + "loss": 0.0154, + "step": 114060 + }, + { + "epoch": 0.8431891428402472, + "grad_norm": 0.06311491131782532, + "learning_rate": 7.86962844254511e-06, + "loss": 0.0169, + "step": 114070 + }, + { + "epoch": 0.843263061411549, + "grad_norm": 0.11842292547225952, + "learning_rate": 7.865918803418804e-06, + "loss": 0.0215, + "step": 114080 + }, + { + "epoch": 0.8433369799828508, + "grad_norm": 0.06746391952037811, + "learning_rate": 7.862209164292498e-06, + "loss": 0.0168, + "step": 114090 + }, + { + "epoch": 0.8434108985541527, + "grad_norm": 0.0839524045586586, + "learning_rate": 7.858499525166192e-06, + "loss": 0.0203, + "step": 114100 + }, + { + "epoch": 0.8434848171254546, + "grad_norm": 0.07548878341913223, + "learning_rate": 7.854789886039886e-06, + "loss": 0.0172, + "step": 114110 + }, + { + "epoch": 0.8435587356967564, + "grad_norm": 0.07536860555410385, + "learning_rate": 7.85108024691358e-06, + "loss": 0.0155, + "step": 114120 + }, + { + "epoch": 0.8436326542680583, + "grad_norm": 0.06918346881866455, + "learning_rate": 7.847370607787275e-06, + "loss": 0.0182, + "step": 114130 + }, + { + "epoch": 0.8437065728393601, + "grad_norm": 0.06039072573184967, + "learning_rate": 7.84366096866097e-06, + "loss": 0.0162, + "step": 114140 + }, + { + "epoch": 0.843780491410662, + "grad_norm": 0.06949473172426224, + "learning_rate": 7.839951329534664e-06, + "loss": 0.0159, + "step": 114150 + }, + { + "epoch": 0.8438544099819638, + "grad_norm": 0.07249496132135391, + "learning_rate": 7.836241690408358e-06, + "loss": 0.0187, + "step": 114160 + }, + { + "epoch": 0.8439283285532657, + "grad_norm": 0.05909942835569382, + "learning_rate": 7.832532051282052e-06, + "loss": 0.0156, + "step": 114170 + }, + { + "epoch": 0.8440022471245676, + "grad_norm": 0.08391299843788147, + "learning_rate": 7.828822412155745e-06, + "loss": 0.0171, + "step": 114180 + }, + { + "epoch": 0.8440761656958694, + "grad_norm": 0.07751631736755371, + "learning_rate": 7.82511277302944e-06, + "loss": 0.016, + "step": 114190 + }, + { + "epoch": 0.8441500842671713, + "grad_norm": 0.07122786343097687, + "learning_rate": 7.821403133903133e-06, + "loss": 0.016, + "step": 114200 + }, + { + "epoch": 0.8442240028384731, + "grad_norm": 0.07838122546672821, + "learning_rate": 7.81769349477683e-06, + "loss": 0.0172, + "step": 114210 + }, + { + "epoch": 0.844297921409775, + "grad_norm": 0.06461747735738754, + "learning_rate": 7.813983855650522e-06, + "loss": 0.0169, + "step": 114220 + }, + { + "epoch": 0.8443718399810769, + "grad_norm": 0.0487394705414772, + "learning_rate": 7.810274216524218e-06, + "loss": 0.0173, + "step": 114230 + }, + { + "epoch": 0.8444457585523787, + "grad_norm": 
0.07009081542491913, + "learning_rate": 7.80656457739791e-06, + "loss": 0.0168, + "step": 114240 + }, + { + "epoch": 0.8445196771236806, + "grad_norm": 0.06905744224786758, + "learning_rate": 7.802854938271606e-06, + "loss": 0.016, + "step": 114250 + }, + { + "epoch": 0.8445935956949824, + "grad_norm": 0.09879184514284134, + "learning_rate": 7.799145299145299e-06, + "loss": 0.0168, + "step": 114260 + }, + { + "epoch": 0.8446675142662843, + "grad_norm": 0.0833684578537941, + "learning_rate": 7.795435660018993e-06, + "loss": 0.0182, + "step": 114270 + }, + { + "epoch": 0.8447414328375861, + "grad_norm": 0.16463328897953033, + "learning_rate": 7.791726020892687e-06, + "loss": 0.0156, + "step": 114280 + }, + { + "epoch": 0.844815351408888, + "grad_norm": 0.07162051647901535, + "learning_rate": 7.788016381766382e-06, + "loss": 0.0172, + "step": 114290 + }, + { + "epoch": 0.8448892699801899, + "grad_norm": 0.06895631551742554, + "learning_rate": 7.784306742640076e-06, + "loss": 0.0169, + "step": 114300 + }, + { + "epoch": 0.8449631885514917, + "grad_norm": 0.0861455574631691, + "learning_rate": 7.78059710351377e-06, + "loss": 0.0155, + "step": 114310 + }, + { + "epoch": 0.8450371071227936, + "grad_norm": 0.06159092113375664, + "learning_rate": 7.776887464387465e-06, + "loss": 0.0143, + "step": 114320 + }, + { + "epoch": 0.8451110256940954, + "grad_norm": 0.06688378751277924, + "learning_rate": 7.773177825261159e-06, + "loss": 0.0159, + "step": 114330 + }, + { + "epoch": 0.8451849442653973, + "grad_norm": 0.05591675639152527, + "learning_rate": 7.769468186134853e-06, + "loss": 0.0153, + "step": 114340 + }, + { + "epoch": 0.845258862836699, + "grad_norm": 0.07992582023143768, + "learning_rate": 7.765758547008547e-06, + "loss": 0.0149, + "step": 114350 + }, + { + "epoch": 0.8453327814080009, + "grad_norm": 0.07913650572299957, + "learning_rate": 7.762048907882242e-06, + "loss": 0.0152, + "step": 114360 + }, + { + "epoch": 0.8454066999793028, + "grad_norm": 0.09735318273305893, + "learning_rate": 7.758339268755936e-06, + "loss": 0.0202, + "step": 114370 + }, + { + "epoch": 0.8454806185506046, + "grad_norm": 0.10346204042434692, + "learning_rate": 7.75462962962963e-06, + "loss": 0.018, + "step": 114380 + }, + { + "epoch": 0.8455545371219065, + "grad_norm": 0.06874750554561615, + "learning_rate": 7.750919990503324e-06, + "loss": 0.0178, + "step": 114390 + }, + { + "epoch": 0.8456284556932083, + "grad_norm": 0.06304837018251419, + "learning_rate": 7.747210351377019e-06, + "loss": 0.0168, + "step": 114400 + }, + { + "epoch": 0.8457023742645102, + "grad_norm": 0.09782323241233826, + "learning_rate": 7.743500712250711e-06, + "loss": 0.0171, + "step": 114410 + }, + { + "epoch": 0.845776292835812, + "grad_norm": 0.07600420713424683, + "learning_rate": 7.739791073124407e-06, + "loss": 0.0175, + "step": 114420 + }, + { + "epoch": 0.8458502114071139, + "grad_norm": 0.06809110939502716, + "learning_rate": 7.7360814339981e-06, + "loss": 0.0185, + "step": 114430 + }, + { + "epoch": 0.8459241299784158, + "grad_norm": 0.09009858220815659, + "learning_rate": 7.732371794871796e-06, + "loss": 0.0132, + "step": 114440 + }, + { + "epoch": 0.8459980485497176, + "grad_norm": 0.0723966658115387, + "learning_rate": 7.728662155745488e-06, + "loss": 0.0167, + "step": 114450 + }, + { + "epoch": 0.8460719671210195, + "grad_norm": 0.08759031444787979, + "learning_rate": 7.724952516619184e-06, + "loss": 0.0158, + "step": 114460 + }, + { + "epoch": 0.8461458856923213, + "grad_norm": 0.070323146879673, + "learning_rate": 
7.721242877492877e-06, + "loss": 0.0161, + "step": 114470 + }, + { + "epoch": 0.8462198042636232, + "grad_norm": 0.07351279258728027, + "learning_rate": 7.717533238366573e-06, + "loss": 0.0162, + "step": 114480 + }, + { + "epoch": 0.8462937228349251, + "grad_norm": 0.057759840041399, + "learning_rate": 7.713823599240267e-06, + "loss": 0.0161, + "step": 114490 + }, + { + "epoch": 0.8463676414062269, + "grad_norm": 0.08374207466840744, + "learning_rate": 7.71011396011396e-06, + "loss": 0.0179, + "step": 114500 + }, + { + "epoch": 0.8464415599775288, + "grad_norm": 0.09012964367866516, + "learning_rate": 7.706404320987656e-06, + "loss": 0.0176, + "step": 114510 + }, + { + "epoch": 0.8465154785488306, + "grad_norm": 0.06130051985383034, + "learning_rate": 7.702694681861348e-06, + "loss": 0.0156, + "step": 114520 + }, + { + "epoch": 0.8465893971201325, + "grad_norm": 0.07411500811576843, + "learning_rate": 7.698985042735044e-06, + "loss": 0.0158, + "step": 114530 + }, + { + "epoch": 0.8466633156914343, + "grad_norm": 0.06992583721876144, + "learning_rate": 7.695275403608737e-06, + "loss": 0.0164, + "step": 114540 + }, + { + "epoch": 0.8467372342627362, + "grad_norm": 0.07129481434822083, + "learning_rate": 7.691565764482433e-06, + "loss": 0.0175, + "step": 114550 + }, + { + "epoch": 0.8468111528340381, + "grad_norm": 0.08502853661775589, + "learning_rate": 7.687856125356125e-06, + "loss": 0.0144, + "step": 114560 + }, + { + "epoch": 0.8468850714053399, + "grad_norm": 0.09787731617689133, + "learning_rate": 7.68414648622982e-06, + "loss": 0.0183, + "step": 114570 + }, + { + "epoch": 0.8469589899766418, + "grad_norm": 0.06659726053476334, + "learning_rate": 7.680436847103514e-06, + "loss": 0.0159, + "step": 114580 + }, + { + "epoch": 0.8470329085479436, + "grad_norm": 0.11812327802181244, + "learning_rate": 7.676727207977208e-06, + "loss": 0.0174, + "step": 114590 + }, + { + "epoch": 0.8471068271192455, + "grad_norm": 0.08446000516414642, + "learning_rate": 7.673017568850902e-06, + "loss": 0.0155, + "step": 114600 + }, + { + "epoch": 0.8471807456905472, + "grad_norm": 0.08022765070199966, + "learning_rate": 7.669307929724597e-06, + "loss": 0.017, + "step": 114610 + }, + { + "epoch": 0.8472546642618491, + "grad_norm": 0.06662660837173462, + "learning_rate": 7.665598290598291e-06, + "loss": 0.0166, + "step": 114620 + }, + { + "epoch": 0.847328582833151, + "grad_norm": 0.09599727392196655, + "learning_rate": 7.661888651471985e-06, + "loss": 0.0174, + "step": 114630 + }, + { + "epoch": 0.8474025014044528, + "grad_norm": 0.059651460498571396, + "learning_rate": 7.658179012345678e-06, + "loss": 0.0197, + "step": 114640 + }, + { + "epoch": 0.8474764199757547, + "grad_norm": 0.08370509743690491, + "learning_rate": 7.654469373219374e-06, + "loss": 0.0161, + "step": 114650 + }, + { + "epoch": 0.8475503385470565, + "grad_norm": 0.09393087029457092, + "learning_rate": 7.650759734093068e-06, + "loss": 0.019, + "step": 114660 + }, + { + "epoch": 0.8476242571183584, + "grad_norm": 0.09295108169317245, + "learning_rate": 7.647050094966762e-06, + "loss": 0.0168, + "step": 114670 + }, + { + "epoch": 0.8476981756896602, + "grad_norm": 0.06544492393732071, + "learning_rate": 7.643340455840457e-06, + "loss": 0.0174, + "step": 114680 + }, + { + "epoch": 0.8477720942609621, + "grad_norm": 0.06274139136075974, + "learning_rate": 7.639630816714151e-06, + "loss": 0.0188, + "step": 114690 + }, + { + "epoch": 0.847846012832264, + "grad_norm": 0.06864412128925323, + "learning_rate": 7.635921177587845e-06, + "loss": 0.0177, + 
"step": 114700 + }, + { + "epoch": 0.8479199314035658, + "grad_norm": 0.06328914314508438, + "learning_rate": 7.63221153846154e-06, + "loss": 0.0168, + "step": 114710 + }, + { + "epoch": 0.8479938499748677, + "grad_norm": 0.07179394364356995, + "learning_rate": 7.628501899335233e-06, + "loss": 0.0171, + "step": 114720 + }, + { + "epoch": 0.8480677685461695, + "grad_norm": 0.07428892701864243, + "learning_rate": 7.624792260208926e-06, + "loss": 0.0162, + "step": 114730 + }, + { + "epoch": 0.8481416871174714, + "grad_norm": 0.08414027839899063, + "learning_rate": 7.6210826210826214e-06, + "loss": 0.017, + "step": 114740 + }, + { + "epoch": 0.8482156056887733, + "grad_norm": 0.06821225583553314, + "learning_rate": 7.617372981956315e-06, + "loss": 0.018, + "step": 114750 + }, + { + "epoch": 0.8482895242600751, + "grad_norm": 0.08133241534233093, + "learning_rate": 7.61366334283001e-06, + "loss": 0.0171, + "step": 114760 + }, + { + "epoch": 0.848363442831377, + "grad_norm": 0.075186587870121, + "learning_rate": 7.609953703703704e-06, + "loss": 0.017, + "step": 114770 + }, + { + "epoch": 0.8484373614026788, + "grad_norm": 0.10400836914777756, + "learning_rate": 7.6062440645773985e-06, + "loss": 0.017, + "step": 114780 + }, + { + "epoch": 0.8485112799739807, + "grad_norm": 0.07053770869970322, + "learning_rate": 7.602534425451093e-06, + "loss": 0.0165, + "step": 114790 + }, + { + "epoch": 0.8485851985452825, + "grad_norm": 0.08867017924785614, + "learning_rate": 7.598824786324786e-06, + "loss": 0.0179, + "step": 114800 + }, + { + "epoch": 0.8486591171165844, + "grad_norm": 0.07490748167037964, + "learning_rate": 7.595115147198481e-06, + "loss": 0.0187, + "step": 114810 + }, + { + "epoch": 0.8487330356878863, + "grad_norm": 0.06685304641723633, + "learning_rate": 7.591405508072175e-06, + "loss": 0.0169, + "step": 114820 + }, + { + "epoch": 0.8488069542591881, + "grad_norm": 0.07209019362926483, + "learning_rate": 7.58769586894587e-06, + "loss": 0.0165, + "step": 114830 + }, + { + "epoch": 0.84888087283049, + "grad_norm": 0.057875972241163254, + "learning_rate": 7.583986229819563e-06, + "loss": 0.0168, + "step": 114840 + }, + { + "epoch": 0.8489547914017918, + "grad_norm": 0.06630287319421768, + "learning_rate": 7.5802765906932584e-06, + "loss": 0.0159, + "step": 114850 + }, + { + "epoch": 0.8490287099730937, + "grad_norm": 0.06411104649305344, + "learning_rate": 7.576566951566952e-06, + "loss": 0.0159, + "step": 114860 + }, + { + "epoch": 0.8491026285443954, + "grad_norm": 0.07323815673589706, + "learning_rate": 7.572857312440645e-06, + "loss": 0.0169, + "step": 114870 + }, + { + "epoch": 0.8491765471156973, + "grad_norm": 0.11291278153657913, + "learning_rate": 7.56914767331434e-06, + "loss": 0.0185, + "step": 114880 + }, + { + "epoch": 0.8492504656869992, + "grad_norm": 0.1187766045331955, + "learning_rate": 7.565438034188034e-06, + "loss": 0.0203, + "step": 114890 + }, + { + "epoch": 0.849324384258301, + "grad_norm": 0.0863991230726242, + "learning_rate": 7.561728395061729e-06, + "loss": 0.0174, + "step": 114900 + }, + { + "epoch": 0.8493983028296029, + "grad_norm": 0.10659705102443695, + "learning_rate": 7.558018755935422e-06, + "loss": 0.0158, + "step": 114910 + }, + { + "epoch": 0.8494722214009047, + "grad_norm": 0.10216405987739563, + "learning_rate": 7.5543091168091175e-06, + "loss": 0.0176, + "step": 114920 + }, + { + "epoch": 0.8495461399722066, + "grad_norm": 0.10233411192893982, + "learning_rate": 7.550599477682811e-06, + "loss": 0.0183, + "step": 114930 + }, + { + "epoch": 
0.8496200585435084, + "grad_norm": 0.07432316243648529, + "learning_rate": 7.546889838556506e-06, + "loss": 0.0173, + "step": 114940 + }, + { + "epoch": 0.8496939771148103, + "grad_norm": 0.06906505674123764, + "learning_rate": 7.5431801994301995e-06, + "loss": 0.0168, + "step": 114950 + }, + { + "epoch": 0.8497678956861122, + "grad_norm": 0.060751430690288544, + "learning_rate": 7.539470560303894e-06, + "loss": 0.0158, + "step": 114960 + }, + { + "epoch": 0.849841814257414, + "grad_norm": 0.08791016787290573, + "learning_rate": 7.535760921177588e-06, + "loss": 0.0168, + "step": 114970 + }, + { + "epoch": 0.8499157328287159, + "grad_norm": 0.08157749474048615, + "learning_rate": 7.532051282051282e-06, + "loss": 0.0186, + "step": 114980 + }, + { + "epoch": 0.8499896514000177, + "grad_norm": 0.07384663820266724, + "learning_rate": 7.528341642924977e-06, + "loss": 0.018, + "step": 114990 + }, + { + "epoch": 0.8500635699713196, + "grad_norm": 0.05789195001125336, + "learning_rate": 7.524632003798671e-06, + "loss": 0.0173, + "step": 115000 + }, + { + "epoch": 0.8501374885426215, + "grad_norm": 0.07224296778440475, + "learning_rate": 7.520922364672366e-06, + "loss": 0.0174, + "step": 115010 + }, + { + "epoch": 0.8502114071139233, + "grad_norm": 0.0817876011133194, + "learning_rate": 7.517212725546059e-06, + "loss": 0.0203, + "step": 115020 + }, + { + "epoch": 0.8502853256852252, + "grad_norm": 0.10659945011138916, + "learning_rate": 7.513503086419753e-06, + "loss": 0.0158, + "step": 115030 + }, + { + "epoch": 0.850359244256527, + "grad_norm": 0.06013638898730278, + "learning_rate": 7.509793447293448e-06, + "loss": 0.0139, + "step": 115040 + }, + { + "epoch": 0.8504331628278289, + "grad_norm": 0.12041322141885757, + "learning_rate": 7.506083808167141e-06, + "loss": 0.0154, + "step": 115050 + }, + { + "epoch": 0.8505070813991307, + "grad_norm": 0.08451178669929504, + "learning_rate": 7.5023741690408365e-06, + "loss": 0.0181, + "step": 115060 + }, + { + "epoch": 0.8505809999704326, + "grad_norm": 0.05861854925751686, + "learning_rate": 7.49866452991453e-06, + "loss": 0.0162, + "step": 115070 + }, + { + "epoch": 0.8506549185417345, + "grad_norm": 0.05970502272248268, + "learning_rate": 7.494954890788225e-06, + "loss": 0.0173, + "step": 115080 + }, + { + "epoch": 0.8507288371130363, + "grad_norm": 0.06429281830787659, + "learning_rate": 7.4912452516619184e-06, + "loss": 0.019, + "step": 115090 + }, + { + "epoch": 0.8508027556843382, + "grad_norm": 0.05995741859078407, + "learning_rate": 7.4875356125356136e-06, + "loss": 0.0196, + "step": 115100 + }, + { + "epoch": 0.85087667425564, + "grad_norm": 0.13881725072860718, + "learning_rate": 7.483825973409307e-06, + "loss": 0.0185, + "step": 115110 + }, + { + "epoch": 0.8509505928269419, + "grad_norm": 0.08064279705286026, + "learning_rate": 7.480116334283e-06, + "loss": 0.0167, + "step": 115120 + }, + { + "epoch": 0.8510245113982436, + "grad_norm": 0.10625021904706955, + "learning_rate": 7.4764066951566955e-06, + "loss": 0.0165, + "step": 115130 + }, + { + "epoch": 0.8510984299695455, + "grad_norm": 0.09444019198417664, + "learning_rate": 7.47269705603039e-06, + "loss": 0.0186, + "step": 115140 + }, + { + "epoch": 0.8511723485408474, + "grad_norm": 0.07402213662862778, + "learning_rate": 7.468987416904084e-06, + "loss": 0.0172, + "step": 115150 + }, + { + "epoch": 0.8512462671121492, + "grad_norm": 0.07309866696596146, + "learning_rate": 7.465277777777778e-06, + "loss": 0.0181, + "step": 115160 + }, + { + "epoch": 0.8513201856834511, + "grad_norm": 
0.08333403617143631, + "learning_rate": 7.461568138651473e-06, + "loss": 0.017, + "step": 115170 + }, + { + "epoch": 0.8513941042547529, + "grad_norm": 0.06300780922174454, + "learning_rate": 7.457858499525167e-06, + "loss": 0.0162, + "step": 115180 + }, + { + "epoch": 0.8514680228260548, + "grad_norm": 0.06013835594058037, + "learning_rate": 7.45414886039886e-06, + "loss": 0.0173, + "step": 115190 + }, + { + "epoch": 0.8515419413973566, + "grad_norm": 0.06843134760856628, + "learning_rate": 7.4504392212725554e-06, + "loss": 0.0154, + "step": 115200 + }, + { + "epoch": 0.8516158599686585, + "grad_norm": 0.0664408728480339, + "learning_rate": 7.446729582146249e-06, + "loss": 0.0177, + "step": 115210 + }, + { + "epoch": 0.8516897785399604, + "grad_norm": 0.08137643337249756, + "learning_rate": 7.443019943019944e-06, + "loss": 0.015, + "step": 115220 + }, + { + "epoch": 0.8517636971112622, + "grad_norm": 0.07489970326423645, + "learning_rate": 7.439310303893637e-06, + "loss": 0.0156, + "step": 115230 + }, + { + "epoch": 0.8518376156825641, + "grad_norm": 0.08225582540035248, + "learning_rate": 7.4356006647673325e-06, + "loss": 0.0179, + "step": 115240 + }, + { + "epoch": 0.8519115342538659, + "grad_norm": 0.09738589823246002, + "learning_rate": 7.431891025641026e-06, + "loss": 0.0166, + "step": 115250 + }, + { + "epoch": 0.8519854528251678, + "grad_norm": 0.11694802343845367, + "learning_rate": 7.428181386514719e-06, + "loss": 0.0179, + "step": 115260 + }, + { + "epoch": 0.8520593713964697, + "grad_norm": 0.08086628466844559, + "learning_rate": 7.4244717473884145e-06, + "loss": 0.0187, + "step": 115270 + }, + { + "epoch": 0.8521332899677715, + "grad_norm": 0.08938246220350266, + "learning_rate": 7.420762108262108e-06, + "loss": 0.0148, + "step": 115280 + }, + { + "epoch": 0.8522072085390734, + "grad_norm": 0.06674882769584656, + "learning_rate": 7.417052469135803e-06, + "loss": 0.0145, + "step": 115290 + }, + { + "epoch": 0.8522811271103752, + "grad_norm": 0.08584784716367722, + "learning_rate": 7.4133428300094965e-06, + "loss": 0.016, + "step": 115300 + }, + { + "epoch": 0.8523550456816771, + "grad_norm": 0.09939748048782349, + "learning_rate": 7.409633190883192e-06, + "loss": 0.0155, + "step": 115310 + }, + { + "epoch": 0.8524289642529789, + "grad_norm": 0.0794292464852333, + "learning_rate": 7.405923551756885e-06, + "loss": 0.0172, + "step": 115320 + }, + { + "epoch": 0.8525028828242808, + "grad_norm": 0.08377964794635773, + "learning_rate": 7.40221391263058e-06, + "loss": 0.0163, + "step": 115330 + }, + { + "epoch": 0.8525768013955827, + "grad_norm": 0.06709586083889008, + "learning_rate": 7.3985042735042736e-06, + "loss": 0.0192, + "step": 115340 + }, + { + "epoch": 0.8526507199668845, + "grad_norm": 0.05004974827170372, + "learning_rate": 7.394794634377968e-06, + "loss": 0.0178, + "step": 115350 + }, + { + "epoch": 0.8527246385381864, + "grad_norm": 0.0567706897854805, + "learning_rate": 7.391084995251663e-06, + "loss": 0.0136, + "step": 115360 + }, + { + "epoch": 0.8527985571094882, + "grad_norm": 0.07427140325307846, + "learning_rate": 7.387375356125356e-06, + "loss": 0.0169, + "step": 115370 + }, + { + "epoch": 0.85287247568079, + "grad_norm": 0.08540726453065872, + "learning_rate": 7.3836657169990515e-06, + "loss": 0.0175, + "step": 115380 + }, + { + "epoch": 0.8529463942520918, + "grad_norm": 0.07391873747110367, + "learning_rate": 7.379956077872745e-06, + "loss": 0.0176, + "step": 115390 + }, + { + "epoch": 0.8530203128233937, + "grad_norm": 0.09952632337808609, + "learning_rate": 
7.37624643874644e-06, + "loss": 0.0207, + "step": 115400 + }, + { + "epoch": 0.8530942313946956, + "grad_norm": 0.08218296617269516, + "learning_rate": 7.3725367996201335e-06, + "loss": 0.016, + "step": 115410 + }, + { + "epoch": 0.8531681499659974, + "grad_norm": 0.07376095652580261, + "learning_rate": 7.368827160493827e-06, + "loss": 0.017, + "step": 115420 + }, + { + "epoch": 0.8532420685372993, + "grad_norm": 0.07043536007404327, + "learning_rate": 7.365117521367522e-06, + "loss": 0.0175, + "step": 115430 + }, + { + "epoch": 0.8533159871086011, + "grad_norm": 0.1102387085556984, + "learning_rate": 7.3614078822412154e-06, + "loss": 0.0178, + "step": 115440 + }, + { + "epoch": 0.853389905679903, + "grad_norm": 0.06748242676258087, + "learning_rate": 7.3576982431149106e-06, + "loss": 0.0177, + "step": 115450 + }, + { + "epoch": 0.8534638242512049, + "grad_norm": 0.11292647570371628, + "learning_rate": 7.353988603988604e-06, + "loss": 0.0196, + "step": 115460 + }, + { + "epoch": 0.8535377428225067, + "grad_norm": 0.05069030448794365, + "learning_rate": 7.350278964862299e-06, + "loss": 0.0145, + "step": 115470 + }, + { + "epoch": 0.8536116613938086, + "grad_norm": 0.09397978335618973, + "learning_rate": 7.3465693257359925e-06, + "loss": 0.0163, + "step": 115480 + }, + { + "epoch": 0.8536855799651104, + "grad_norm": 0.07535222917795181, + "learning_rate": 7.342859686609686e-06, + "loss": 0.0156, + "step": 115490 + }, + { + "epoch": 0.8537594985364123, + "grad_norm": 0.0793633908033371, + "learning_rate": 7.339150047483381e-06, + "loss": 0.0155, + "step": 115500 + }, + { + "epoch": 0.8538334171077141, + "grad_norm": 0.06356097757816315, + "learning_rate": 7.3354404083570745e-06, + "loss": 0.0158, + "step": 115510 + }, + { + "epoch": 0.853907335679016, + "grad_norm": 0.07921180874109268, + "learning_rate": 7.33173076923077e-06, + "loss": 0.0171, + "step": 115520 + }, + { + "epoch": 0.8539812542503179, + "grad_norm": 0.08289998024702072, + "learning_rate": 7.328021130104464e-06, + "loss": 0.0175, + "step": 115530 + }, + { + "epoch": 0.8540551728216197, + "grad_norm": 0.09399142861366272, + "learning_rate": 7.324311490978158e-06, + "loss": 0.0181, + "step": 115540 + }, + { + "epoch": 0.8541290913929216, + "grad_norm": 0.10622493177652359, + "learning_rate": 7.3206018518518524e-06, + "loss": 0.0162, + "step": 115550 + }, + { + "epoch": 0.8542030099642234, + "grad_norm": 0.09257388859987259, + "learning_rate": 7.316892212725547e-06, + "loss": 0.016, + "step": 115560 + }, + { + "epoch": 0.8542769285355253, + "grad_norm": 0.06779754906892776, + "learning_rate": 7.313182573599241e-06, + "loss": 0.0167, + "step": 115570 + }, + { + "epoch": 0.8543508471068271, + "grad_norm": 0.050333622843027115, + "learning_rate": 7.309472934472934e-06, + "loss": 0.0159, + "step": 115580 + }, + { + "epoch": 0.854424765678129, + "grad_norm": 0.0706227645277977, + "learning_rate": 7.3057632953466295e-06, + "loss": 0.0177, + "step": 115590 + }, + { + "epoch": 0.8544986842494309, + "grad_norm": 0.06861665844917297, + "learning_rate": 7.302053656220323e-06, + "loss": 0.0167, + "step": 115600 + }, + { + "epoch": 0.8545726028207327, + "grad_norm": 0.08953040838241577, + "learning_rate": 7.298344017094018e-06, + "loss": 0.0152, + "step": 115610 + }, + { + "epoch": 0.8546465213920346, + "grad_norm": 0.08950774371623993, + "learning_rate": 7.2946343779677115e-06, + "loss": 0.0195, + "step": 115620 + }, + { + "epoch": 0.8547204399633364, + "grad_norm": 0.08176089078187943, + "learning_rate": 7.290924738841407e-06, + "loss": 
0.0184, + "step": 115630 + }, + { + "epoch": 0.8547943585346383, + "grad_norm": 0.07711353898048401, + "learning_rate": 7.2872150997151e-06, + "loss": 0.0178, + "step": 115640 + }, + { + "epoch": 0.85486827710594, + "grad_norm": 0.0611676424741745, + "learning_rate": 7.2835054605887935e-06, + "loss": 0.0152, + "step": 115650 + }, + { + "epoch": 0.854942195677242, + "grad_norm": 0.06824205815792084, + "learning_rate": 7.279795821462489e-06, + "loss": 0.015, + "step": 115660 + }, + { + "epoch": 0.8550161142485438, + "grad_norm": 0.10189816355705261, + "learning_rate": 7.276086182336182e-06, + "loss": 0.0187, + "step": 115670 + }, + { + "epoch": 0.8550900328198456, + "grad_norm": 0.05837101861834526, + "learning_rate": 7.272376543209877e-06, + "loss": 0.0198, + "step": 115680 + }, + { + "epoch": 0.8551639513911475, + "grad_norm": 0.060416217893362045, + "learning_rate": 7.2686669040835706e-06, + "loss": 0.0182, + "step": 115690 + }, + { + "epoch": 0.8552378699624493, + "grad_norm": 0.081768698990345, + "learning_rate": 7.264957264957266e-06, + "loss": 0.0167, + "step": 115700 + }, + { + "epoch": 0.8553117885337512, + "grad_norm": 0.07554604113101959, + "learning_rate": 7.261247625830959e-06, + "loss": 0.0173, + "step": 115710 + }, + { + "epoch": 0.8553857071050531, + "grad_norm": 0.0834031030535698, + "learning_rate": 7.257537986704653e-06, + "loss": 0.0173, + "step": 115720 + }, + { + "epoch": 0.8554596256763549, + "grad_norm": 0.08713842928409576, + "learning_rate": 7.253828347578348e-06, + "loss": 0.0173, + "step": 115730 + }, + { + "epoch": 0.8555335442476568, + "grad_norm": 0.06996472924947739, + "learning_rate": 7.250118708452042e-06, + "loss": 0.015, + "step": 115740 + }, + { + "epoch": 0.8556074628189586, + "grad_norm": 0.08058460801839828, + "learning_rate": 7.246409069325737e-06, + "loss": 0.0171, + "step": 115750 + }, + { + "epoch": 0.8556813813902605, + "grad_norm": 0.12176497280597687, + "learning_rate": 7.2426994301994305e-06, + "loss": 0.0183, + "step": 115760 + }, + { + "epoch": 0.8557552999615623, + "grad_norm": 0.07371469587087631, + "learning_rate": 7.238989791073126e-06, + "loss": 0.0178, + "step": 115770 + }, + { + "epoch": 0.8558292185328642, + "grad_norm": 0.09114787727594376, + "learning_rate": 7.235280151946819e-06, + "loss": 0.0203, + "step": 115780 + }, + { + "epoch": 0.8559031371041661, + "grad_norm": 0.07811260223388672, + "learning_rate": 7.231570512820514e-06, + "loss": 0.0154, + "step": 115790 + }, + { + "epoch": 0.8559770556754679, + "grad_norm": 0.07033334672451019, + "learning_rate": 7.227860873694208e-06, + "loss": 0.0187, + "step": 115800 + }, + { + "epoch": 0.8560509742467698, + "grad_norm": 0.08371198922395706, + "learning_rate": 7.224151234567901e-06, + "loss": 0.0185, + "step": 115810 + }, + { + "epoch": 0.8561248928180716, + "grad_norm": 0.07226774841547012, + "learning_rate": 7.220441595441596e-06, + "loss": 0.016, + "step": 115820 + }, + { + "epoch": 0.8561988113893735, + "grad_norm": 0.05450865626335144, + "learning_rate": 7.2167319563152896e-06, + "loss": 0.0179, + "step": 115830 + }, + { + "epoch": 0.8562727299606753, + "grad_norm": 0.08719339966773987, + "learning_rate": 7.213022317188985e-06, + "loss": 0.0187, + "step": 115840 + }, + { + "epoch": 0.8563466485319772, + "grad_norm": 0.08201997727155685, + "learning_rate": 7.209312678062678e-06, + "loss": 0.0155, + "step": 115850 + }, + { + "epoch": 0.8564205671032791, + "grad_norm": 0.07213526964187622, + "learning_rate": 7.205603038936373e-06, + "loss": 0.018, + "step": 115860 + }, + { + 
"epoch": 0.8564944856745809, + "grad_norm": 0.08951819688081741, + "learning_rate": 7.201893399810067e-06, + "loss": 0.0182, + "step": 115870 + }, + { + "epoch": 0.8565684042458828, + "grad_norm": 0.10166043043136597, + "learning_rate": 7.19818376068376e-06, + "loss": 0.0182, + "step": 115880 + }, + { + "epoch": 0.8566423228171846, + "grad_norm": 0.07484026253223419, + "learning_rate": 7.194474121557455e-06, + "loss": 0.0178, + "step": 115890 + }, + { + "epoch": 0.8567162413884865, + "grad_norm": 0.07420425862073898, + "learning_rate": 7.190764482431149e-06, + "loss": 0.0173, + "step": 115900 + }, + { + "epoch": 0.8567901599597882, + "grad_norm": 0.06019234657287598, + "learning_rate": 7.187054843304844e-06, + "loss": 0.0181, + "step": 115910 + }, + { + "epoch": 0.8568640785310901, + "grad_norm": 0.08211581408977509, + "learning_rate": 7.183345204178538e-06, + "loss": 0.0175, + "step": 115920 + }, + { + "epoch": 0.856937997102392, + "grad_norm": 0.08052533864974976, + "learning_rate": 7.179635565052232e-06, + "loss": 0.0164, + "step": 115930 + }, + { + "epoch": 0.8570119156736938, + "grad_norm": 0.07633931189775467, + "learning_rate": 7.1759259259259266e-06, + "loss": 0.0166, + "step": 115940 + }, + { + "epoch": 0.8570858342449957, + "grad_norm": 0.06322506070137024, + "learning_rate": 7.17221628679962e-06, + "loss": 0.0168, + "step": 115950 + }, + { + "epoch": 0.8571597528162975, + "grad_norm": 0.11195466667413712, + "learning_rate": 7.168506647673315e-06, + "loss": 0.0174, + "step": 115960 + }, + { + "epoch": 0.8572336713875994, + "grad_norm": 0.07432009279727936, + "learning_rate": 7.1647970085470085e-06, + "loss": 0.0178, + "step": 115970 + }, + { + "epoch": 0.8573075899589013, + "grad_norm": 0.06407511979341507, + "learning_rate": 7.161087369420704e-06, + "loss": 0.0159, + "step": 115980 + }, + { + "epoch": 0.8573815085302031, + "grad_norm": 0.07257718592882156, + "learning_rate": 7.157377730294397e-06, + "loss": 0.017, + "step": 115990 + }, + { + "epoch": 0.857455427101505, + "grad_norm": 0.06976402550935745, + "learning_rate": 7.153668091168092e-06, + "loss": 0.0153, + "step": 116000 + }, + { + "epoch": 0.8575293456728068, + "grad_norm": 0.06721027195453644, + "learning_rate": 7.149958452041786e-06, + "loss": 0.0167, + "step": 116010 + }, + { + "epoch": 0.8576032642441087, + "grad_norm": 0.16653594374656677, + "learning_rate": 7.146248812915481e-06, + "loss": 0.0178, + "step": 116020 + }, + { + "epoch": 0.8576771828154105, + "grad_norm": 0.08466430008411407, + "learning_rate": 7.142539173789174e-06, + "loss": 0.0177, + "step": 116030 + }, + { + "epoch": 0.8577511013867124, + "grad_norm": 0.06317166984081268, + "learning_rate": 7.138829534662868e-06, + "loss": 0.0184, + "step": 116040 + }, + { + "epoch": 0.8578250199580143, + "grad_norm": 0.06002514809370041, + "learning_rate": 7.135119895536563e-06, + "loss": 0.0171, + "step": 116050 + }, + { + "epoch": 0.8578989385293161, + "grad_norm": 0.07541421055793762, + "learning_rate": 7.131410256410256e-06, + "loss": 0.0194, + "step": 116060 + }, + { + "epoch": 0.857972857100618, + "grad_norm": 0.07419611513614655, + "learning_rate": 7.127700617283951e-06, + "loss": 0.0154, + "step": 116070 + }, + { + "epoch": 0.8580467756719198, + "grad_norm": 0.07317450642585754, + "learning_rate": 7.123990978157645e-06, + "loss": 0.017, + "step": 116080 + }, + { + "epoch": 0.8581206942432217, + "grad_norm": 0.08183526247739792, + "learning_rate": 7.12028133903134e-06, + "loss": 0.0194, + "step": 116090 + }, + { + "epoch": 0.8581946128145235, + 
"grad_norm": 0.08646831661462784, + "learning_rate": 7.116571699905033e-06, + "loss": 0.0164, + "step": 116100 + }, + { + "epoch": 0.8582685313858254, + "grad_norm": 0.11067859828472137, + "learning_rate": 7.1128620607787275e-06, + "loss": 0.0196, + "step": 116110 + }, + { + "epoch": 0.8583424499571273, + "grad_norm": 0.0796673446893692, + "learning_rate": 7.109152421652422e-06, + "loss": 0.0156, + "step": 116120 + }, + { + "epoch": 0.8584163685284291, + "grad_norm": 0.08841119706630707, + "learning_rate": 7.105442782526116e-06, + "loss": 0.0187, + "step": 116130 + }, + { + "epoch": 0.858490287099731, + "grad_norm": 0.08489008247852325, + "learning_rate": 7.101733143399811e-06, + "loss": 0.0174, + "step": 116140 + }, + { + "epoch": 0.8585642056710328, + "grad_norm": 0.06625782698392868, + "learning_rate": 7.098023504273505e-06, + "loss": 0.0175, + "step": 116150 + }, + { + "epoch": 0.8586381242423347, + "grad_norm": 0.06949188560247421, + "learning_rate": 7.0943138651472e-06, + "loss": 0.0174, + "step": 116160 + }, + { + "epoch": 0.8587120428136364, + "grad_norm": 0.07019592821598053, + "learning_rate": 7.090604226020893e-06, + "loss": 0.0165, + "step": 116170 + }, + { + "epoch": 0.8587859613849383, + "grad_norm": 0.06393100321292877, + "learning_rate": 7.0868945868945866e-06, + "loss": 0.0165, + "step": 116180 + }, + { + "epoch": 0.8588598799562402, + "grad_norm": 0.07292962074279785, + "learning_rate": 7.083184947768282e-06, + "loss": 0.0176, + "step": 116190 + }, + { + "epoch": 0.858933798527542, + "grad_norm": 0.0762207955121994, + "learning_rate": 7.079475308641975e-06, + "loss": 0.0159, + "step": 116200 + }, + { + "epoch": 0.8590077170988439, + "grad_norm": 0.0796966627240181, + "learning_rate": 7.07576566951567e-06, + "loss": 0.0163, + "step": 116210 + }, + { + "epoch": 0.8590816356701457, + "grad_norm": 0.0938892513513565, + "learning_rate": 7.072056030389364e-06, + "loss": 0.017, + "step": 116220 + }, + { + "epoch": 0.8591555542414476, + "grad_norm": 0.0776246041059494, + "learning_rate": 7.068346391263059e-06, + "loss": 0.0162, + "step": 116230 + }, + { + "epoch": 0.8592294728127495, + "grad_norm": 0.05977245792746544, + "learning_rate": 7.064636752136752e-06, + "loss": 0.0171, + "step": 116240 + }, + { + "epoch": 0.8593033913840513, + "grad_norm": 0.080894835293293, + "learning_rate": 7.060927113010447e-06, + "loss": 0.0175, + "step": 116250 + }, + { + "epoch": 0.8593773099553532, + "grad_norm": 0.0629294291138649, + "learning_rate": 7.057217473884141e-06, + "loss": 0.0187, + "step": 116260 + }, + { + "epoch": 0.859451228526655, + "grad_norm": 0.07322567701339722, + "learning_rate": 7.053507834757834e-06, + "loss": 0.0174, + "step": 116270 + }, + { + "epoch": 0.8595251470979569, + "grad_norm": 0.09472065418958664, + "learning_rate": 7.049798195631529e-06, + "loss": 0.0202, + "step": 116280 + }, + { + "epoch": 0.8595990656692587, + "grad_norm": 0.06628761440515518, + "learning_rate": 7.0460885565052236e-06, + "loss": 0.0172, + "step": 116290 + }, + { + "epoch": 0.8596729842405606, + "grad_norm": 0.0796431228518486, + "learning_rate": 7.042378917378918e-06, + "loss": 0.0165, + "step": 116300 + }, + { + "epoch": 0.8597469028118625, + "grad_norm": 0.10444629937410355, + "learning_rate": 7.038669278252612e-06, + "loss": 0.0175, + "step": 116310 + }, + { + "epoch": 0.8598208213831643, + "grad_norm": 0.09770618379116058, + "learning_rate": 7.034959639126306e-06, + "loss": 0.0184, + "step": 116320 + }, + { + "epoch": 0.8598947399544662, + "grad_norm": 0.072440005838871, + 
"learning_rate": 7.031250000000001e-06, + "loss": 0.0155, + "step": 116330 + }, + { + "epoch": 0.859968658525768, + "grad_norm": 0.1179017499089241, + "learning_rate": 7.027540360873694e-06, + "loss": 0.0174, + "step": 116340 + }, + { + "epoch": 0.8600425770970699, + "grad_norm": 0.0868251696228981, + "learning_rate": 7.023830721747389e-06, + "loss": 0.0182, + "step": 116350 + }, + { + "epoch": 0.8601164956683717, + "grad_norm": 0.09221919625997543, + "learning_rate": 7.020121082621083e-06, + "loss": 0.017, + "step": 116360 + }, + { + "epoch": 0.8601904142396736, + "grad_norm": 0.06533008813858032, + "learning_rate": 7.016411443494778e-06, + "loss": 0.0162, + "step": 116370 + }, + { + "epoch": 0.8602643328109755, + "grad_norm": 0.0642094761133194, + "learning_rate": 7.012701804368471e-06, + "loss": 0.0169, + "step": 116380 + }, + { + "epoch": 0.8603382513822773, + "grad_norm": 0.0782715305685997, + "learning_rate": 7.008992165242166e-06, + "loss": 0.0163, + "step": 116390 + }, + { + "epoch": 0.8604121699535792, + "grad_norm": 0.07866599410772324, + "learning_rate": 7.00528252611586e-06, + "loss": 0.0164, + "step": 116400 + }, + { + "epoch": 0.860486088524881, + "grad_norm": 0.057049378752708435, + "learning_rate": 7.001572886989553e-06, + "loss": 0.0173, + "step": 116410 + }, + { + "epoch": 0.8605600070961829, + "grad_norm": 0.07760420441627502, + "learning_rate": 6.997863247863248e-06, + "loss": 0.0196, + "step": 116420 + }, + { + "epoch": 0.8606339256674846, + "grad_norm": 0.07585640251636505, + "learning_rate": 6.994153608736942e-06, + "loss": 0.0148, + "step": 116430 + }, + { + "epoch": 0.8607078442387865, + "grad_norm": 0.0873769074678421, + "learning_rate": 6.990443969610637e-06, + "loss": 0.0181, + "step": 116440 + }, + { + "epoch": 0.8607817628100884, + "grad_norm": 0.07950405776500702, + "learning_rate": 6.98673433048433e-06, + "loss": 0.0188, + "step": 116450 + }, + { + "epoch": 0.8608556813813902, + "grad_norm": 0.054485294967889786, + "learning_rate": 6.983024691358025e-06, + "loss": 0.0158, + "step": 116460 + }, + { + "epoch": 0.8609295999526921, + "grad_norm": 0.10393580794334412, + "learning_rate": 6.979315052231719e-06, + "loss": 0.0188, + "step": 116470 + }, + { + "epoch": 0.8610035185239939, + "grad_norm": 0.0881158709526062, + "learning_rate": 6.975605413105414e-06, + "loss": 0.0162, + "step": 116480 + }, + { + "epoch": 0.8610774370952958, + "grad_norm": 0.07192538678646088, + "learning_rate": 6.971895773979107e-06, + "loss": 0.0157, + "step": 116490 + }, + { + "epoch": 0.8611513556665977, + "grad_norm": 0.05255180597305298, + "learning_rate": 6.968186134852802e-06, + "loss": 0.0192, + "step": 116500 + }, + { + "epoch": 0.8612252742378995, + "grad_norm": 0.063807412981987, + "learning_rate": 6.964476495726497e-06, + "loss": 0.0171, + "step": 116510 + }, + { + "epoch": 0.8612991928092014, + "grad_norm": 0.08546123653650284, + "learning_rate": 6.96076685660019e-06, + "loss": 0.0169, + "step": 116520 + }, + { + "epoch": 0.8613731113805032, + "grad_norm": 0.11296427994966507, + "learning_rate": 6.957057217473885e-06, + "loss": 0.0174, + "step": 116530 + }, + { + "epoch": 0.8614470299518051, + "grad_norm": 0.07347305119037628, + "learning_rate": 6.953347578347579e-06, + "loss": 0.0164, + "step": 116540 + }, + { + "epoch": 0.8615209485231069, + "grad_norm": 0.06195492297410965, + "learning_rate": 6.949637939221274e-06, + "loss": 0.017, + "step": 116550 + }, + { + "epoch": 0.8615948670944088, + "grad_norm": 0.05554427579045296, + "learning_rate": 6.945928300094967e-06, + 
"loss": 0.0159, + "step": 116560 + }, + { + "epoch": 0.8616687856657107, + "grad_norm": 0.06394603103399277, + "learning_rate": 6.942218660968661e-06, + "loss": 0.0151, + "step": 116570 + }, + { + "epoch": 0.8617427042370125, + "grad_norm": 0.0576290637254715, + "learning_rate": 6.938509021842356e-06, + "loss": 0.0172, + "step": 116580 + }, + { + "epoch": 0.8618166228083144, + "grad_norm": 0.09491167217493057, + "learning_rate": 6.934799382716049e-06, + "loss": 0.0194, + "step": 116590 + }, + { + "epoch": 0.8618905413796162, + "grad_norm": 0.07047645002603531, + "learning_rate": 6.931089743589744e-06, + "loss": 0.0155, + "step": 116600 + }, + { + "epoch": 0.8619644599509181, + "grad_norm": 0.06572113931179047, + "learning_rate": 6.927380104463438e-06, + "loss": 0.015, + "step": 116610 + }, + { + "epoch": 0.8620383785222199, + "grad_norm": 0.08306535333395004, + "learning_rate": 6.923670465337133e-06, + "loss": 0.0155, + "step": 116620 + }, + { + "epoch": 0.8621122970935218, + "grad_norm": 0.08482124656438828, + "learning_rate": 6.919960826210826e-06, + "loss": 0.0176, + "step": 116630 + }, + { + "epoch": 0.8621862156648237, + "grad_norm": 0.08405287563800812, + "learning_rate": 6.91625118708452e-06, + "loss": 0.0183, + "step": 116640 + }, + { + "epoch": 0.8622601342361255, + "grad_norm": 0.07583169639110565, + "learning_rate": 6.912541547958215e-06, + "loss": 0.0161, + "step": 116650 + }, + { + "epoch": 0.8623340528074274, + "grad_norm": 0.07283152639865875, + "learning_rate": 6.908831908831908e-06, + "loss": 0.0196, + "step": 116660 + }, + { + "epoch": 0.8624079713787292, + "grad_norm": 0.13090991973876953, + "learning_rate": 6.905122269705603e-06, + "loss": 0.0167, + "step": 116670 + }, + { + "epoch": 0.862481889950031, + "grad_norm": 0.07329442352056503, + "learning_rate": 6.901412630579298e-06, + "loss": 0.0171, + "step": 116680 + }, + { + "epoch": 0.8625558085213328, + "grad_norm": 0.06055987626314163, + "learning_rate": 6.897702991452992e-06, + "loss": 0.018, + "step": 116690 + }, + { + "epoch": 0.8626297270926347, + "grad_norm": 0.06084635481238365, + "learning_rate": 6.893993352326686e-06, + "loss": 0.0177, + "step": 116700 + }, + { + "epoch": 0.8627036456639366, + "grad_norm": 0.09046801179647446, + "learning_rate": 6.8902837132003805e-06, + "loss": 0.0158, + "step": 116710 + }, + { + "epoch": 0.8627775642352384, + "grad_norm": 0.08746856451034546, + "learning_rate": 6.886574074074075e-06, + "loss": 0.0142, + "step": 116720 + }, + { + "epoch": 0.8628514828065403, + "grad_norm": 0.09596207737922668, + "learning_rate": 6.882864434947768e-06, + "loss": 0.0169, + "step": 116730 + }, + { + "epoch": 0.8629254013778421, + "grad_norm": 0.08952523022890091, + "learning_rate": 6.879154795821463e-06, + "loss": 0.0144, + "step": 116740 + }, + { + "epoch": 0.862999319949144, + "grad_norm": 0.08711504936218262, + "learning_rate": 6.875445156695157e-06, + "loss": 0.0157, + "step": 116750 + }, + { + "epoch": 0.8630732385204459, + "grad_norm": 0.0730694904923439, + "learning_rate": 6.871735517568852e-06, + "loss": 0.0175, + "step": 116760 + }, + { + "epoch": 0.8631471570917477, + "grad_norm": 0.0819726511836052, + "learning_rate": 6.868025878442545e-06, + "loss": 0.0155, + "step": 116770 + }, + { + "epoch": 0.8632210756630496, + "grad_norm": 0.07040496915578842, + "learning_rate": 6.86431623931624e-06, + "loss": 0.0182, + "step": 116780 + }, + { + "epoch": 0.8632949942343514, + "grad_norm": 0.09829843789339066, + "learning_rate": 6.860606600189934e-06, + "loss": 0.018, + "step": 116790 + }, + { + 
"epoch": 0.8633689128056533, + "grad_norm": 0.08462739735841751, + "learning_rate": 6.856896961063627e-06, + "loss": 0.0166, + "step": 116800 + }, + { + "epoch": 0.8634428313769551, + "grad_norm": 0.08167954534292221, + "learning_rate": 6.853187321937322e-06, + "loss": 0.0183, + "step": 116810 + }, + { + "epoch": 0.863516749948257, + "grad_norm": 0.0685417652130127, + "learning_rate": 6.849477682811016e-06, + "loss": 0.0152, + "step": 116820 + }, + { + "epoch": 0.8635906685195589, + "grad_norm": 0.06929564476013184, + "learning_rate": 6.845768043684711e-06, + "loss": 0.016, + "step": 116830 + }, + { + "epoch": 0.8636645870908607, + "grad_norm": 0.08290966600179672, + "learning_rate": 6.842058404558404e-06, + "loss": 0.0182, + "step": 116840 + }, + { + "epoch": 0.8637385056621626, + "grad_norm": 0.08525334298610687, + "learning_rate": 6.8383487654320995e-06, + "loss": 0.0168, + "step": 116850 + }, + { + "epoch": 0.8638124242334644, + "grad_norm": 0.08763402700424194, + "learning_rate": 6.834639126305793e-06, + "loss": 0.0152, + "step": 116860 + }, + { + "epoch": 0.8638863428047663, + "grad_norm": 0.08820448815822601, + "learning_rate": 6.830929487179487e-06, + "loss": 0.0166, + "step": 116870 + }, + { + "epoch": 0.8639602613760681, + "grad_norm": 0.08678741008043289, + "learning_rate": 6.8272198480531814e-06, + "loss": 0.0184, + "step": 116880 + }, + { + "epoch": 0.86403417994737, + "grad_norm": 0.09588257223367691, + "learning_rate": 6.823510208926876e-06, + "loss": 0.019, + "step": 116890 + }, + { + "epoch": 0.8641080985186719, + "grad_norm": 0.0747290551662445, + "learning_rate": 6.819800569800571e-06, + "loss": 0.0174, + "step": 116900 + }, + { + "epoch": 0.8641820170899737, + "grad_norm": 0.10844788700342178, + "learning_rate": 6.816090930674264e-06, + "loss": 0.018, + "step": 116910 + }, + { + "epoch": 0.8642559356612756, + "grad_norm": 0.07840590178966522, + "learning_rate": 6.812381291547959e-06, + "loss": 0.0166, + "step": 116920 + }, + { + "epoch": 0.8643298542325774, + "grad_norm": 0.07529750466346741, + "learning_rate": 6.808671652421653e-06, + "loss": 0.0189, + "step": 116930 + }, + { + "epoch": 0.8644037728038793, + "grad_norm": 0.08012118935585022, + "learning_rate": 6.804962013295348e-06, + "loss": 0.0166, + "step": 116940 + }, + { + "epoch": 0.864477691375181, + "grad_norm": 0.07295463234186172, + "learning_rate": 6.801252374169041e-06, + "loss": 0.0199, + "step": 116950 + }, + { + "epoch": 0.864551609946483, + "grad_norm": 0.0702839270234108, + "learning_rate": 6.797542735042735e-06, + "loss": 0.0155, + "step": 116960 + }, + { + "epoch": 0.8646255285177848, + "grad_norm": 0.060946814715862274, + "learning_rate": 6.79383309591643e-06, + "loss": 0.015, + "step": 116970 + }, + { + "epoch": 0.8646994470890866, + "grad_norm": 0.07480543851852417, + "learning_rate": 6.790123456790123e-06, + "loss": 0.019, + "step": 116980 + }, + { + "epoch": 0.8647733656603885, + "grad_norm": 0.07786723971366882, + "learning_rate": 6.7864138176638184e-06, + "loss": 0.0149, + "step": 116990 + }, + { + "epoch": 0.8648472842316903, + "grad_norm": 0.08606283366680145, + "learning_rate": 6.782704178537512e-06, + "loss": 0.0165, + "step": 117000 + }, + { + "epoch": 0.8649212028029922, + "grad_norm": 0.08000627905130386, + "learning_rate": 6.778994539411207e-06, + "loss": 0.0152, + "step": 117010 + }, + { + "epoch": 0.8649951213742941, + "grad_norm": 0.0888182744383812, + "learning_rate": 6.7752849002849e-06, + "loss": 0.0146, + "step": 117020 + }, + { + "epoch": 0.8650690399455959, + "grad_norm": 
0.06171491742134094, + "learning_rate": 6.771575261158594e-06, + "loss": 0.016, + "step": 117030 + }, + { + "epoch": 0.8651429585168978, + "grad_norm": 0.08448406308889389, + "learning_rate": 6.767865622032289e-06, + "loss": 0.0152, + "step": 117040 + }, + { + "epoch": 0.8652168770881996, + "grad_norm": 0.07047073543071747, + "learning_rate": 6.764155982905982e-06, + "loss": 0.0158, + "step": 117050 + }, + { + "epoch": 0.8652907956595015, + "grad_norm": 0.06946824491024017, + "learning_rate": 6.7604463437796775e-06, + "loss": 0.0156, + "step": 117060 + }, + { + "epoch": 0.8653647142308033, + "grad_norm": 0.08859638124704361, + "learning_rate": 6.756736704653372e-06, + "loss": 0.0166, + "step": 117070 + }, + { + "epoch": 0.8654386328021052, + "grad_norm": 0.08528061211109161, + "learning_rate": 6.753027065527066e-06, + "loss": 0.0161, + "step": 117080 + }, + { + "epoch": 0.8655125513734071, + "grad_norm": 0.08984003961086273, + "learning_rate": 6.74931742640076e-06, + "loss": 0.0185, + "step": 117090 + }, + { + "epoch": 0.8655864699447089, + "grad_norm": 0.08175487071275711, + "learning_rate": 6.745607787274454e-06, + "loss": 0.0164, + "step": 117100 + }, + { + "epoch": 0.8656603885160108, + "grad_norm": 0.05711497738957405, + "learning_rate": 6.741898148148149e-06, + "loss": 0.0184, + "step": 117110 + }, + { + "epoch": 0.8657343070873126, + "grad_norm": 0.081973135471344, + "learning_rate": 6.738188509021842e-06, + "loss": 0.0209, + "step": 117120 + }, + { + "epoch": 0.8658082256586145, + "grad_norm": 0.06424666941165924, + "learning_rate": 6.734478869895537e-06, + "loss": 0.0157, + "step": 117130 + }, + { + "epoch": 0.8658821442299163, + "grad_norm": 0.08565365523099899, + "learning_rate": 6.730769230769231e-06, + "loss": 0.0185, + "step": 117140 + }, + { + "epoch": 0.8659560628012182, + "grad_norm": 0.06290718913078308, + "learning_rate": 6.727059591642926e-06, + "loss": 0.0173, + "step": 117150 + }, + { + "epoch": 0.8660299813725201, + "grad_norm": 0.06647878885269165, + "learning_rate": 6.723349952516619e-06, + "loss": 0.0148, + "step": 117160 + }, + { + "epoch": 0.8661038999438219, + "grad_norm": 0.08329624682664871, + "learning_rate": 6.7196403133903145e-06, + "loss": 0.0152, + "step": 117170 + }, + { + "epoch": 0.8661778185151238, + "grad_norm": 0.053988419473171234, + "learning_rate": 6.715930674264008e-06, + "loss": 0.0151, + "step": 117180 + }, + { + "epoch": 0.8662517370864256, + "grad_norm": 0.06262490898370743, + "learning_rate": 6.712221035137701e-06, + "loss": 0.0159, + "step": 117190 + }, + { + "epoch": 0.8663256556577275, + "grad_norm": 0.07730703055858612, + "learning_rate": 6.7085113960113965e-06, + "loss": 0.0202, + "step": 117200 + }, + { + "epoch": 0.8663995742290294, + "grad_norm": 0.06828780472278595, + "learning_rate": 6.70480175688509e-06, + "loss": 0.016, + "step": 117210 + }, + { + "epoch": 0.8664734928003311, + "grad_norm": 0.0874919444322586, + "learning_rate": 6.701092117758785e-06, + "loss": 0.016, + "step": 117220 + }, + { + "epoch": 0.866547411371633, + "grad_norm": 0.07567555457353592, + "learning_rate": 6.6973824786324784e-06, + "loss": 0.0198, + "step": 117230 + }, + { + "epoch": 0.8666213299429348, + "grad_norm": 0.08014479279518127, + "learning_rate": 6.6936728395061736e-06, + "loss": 0.0175, + "step": 117240 + }, + { + "epoch": 0.8666952485142367, + "grad_norm": 0.08009376376867294, + "learning_rate": 6.689963200379867e-06, + "loss": 0.0157, + "step": 117250 + }, + { + "epoch": 0.8667691670855385, + "grad_norm": 0.0808338075876236, + "learning_rate": 
6.686253561253561e-06, + "loss": 0.0149, + "step": 117260 + }, + { + "epoch": 0.8668430856568404, + "grad_norm": 0.07933666557073593, + "learning_rate": 6.6825439221272555e-06, + "loss": 0.0153, + "step": 117270 + }, + { + "epoch": 0.8669170042281423, + "grad_norm": 0.07114316523075104, + "learning_rate": 6.67883428300095e-06, + "loss": 0.0164, + "step": 117280 + }, + { + "epoch": 0.8669909227994441, + "grad_norm": 0.072788305580616, + "learning_rate": 6.675124643874645e-06, + "loss": 0.0176, + "step": 117290 + }, + { + "epoch": 0.867064841370746, + "grad_norm": 0.06929667294025421, + "learning_rate": 6.671415004748338e-06, + "loss": 0.0154, + "step": 117300 + }, + { + "epoch": 0.8671387599420478, + "grad_norm": 0.077360600233078, + "learning_rate": 6.6677053656220335e-06, + "loss": 0.017, + "step": 117310 + }, + { + "epoch": 0.8672126785133497, + "grad_norm": 0.06290005892515182, + "learning_rate": 6.663995726495727e-06, + "loss": 0.0175, + "step": 117320 + }, + { + "epoch": 0.8672865970846515, + "grad_norm": 0.06111391261219978, + "learning_rate": 6.66028608736942e-06, + "loss": 0.0163, + "step": 117330 + }, + { + "epoch": 0.8673605156559534, + "grad_norm": 0.0982351154088974, + "learning_rate": 6.6565764482431154e-06, + "loss": 0.0175, + "step": 117340 + }, + { + "epoch": 0.8674344342272553, + "grad_norm": 0.07444393634796143, + "learning_rate": 6.652866809116809e-06, + "loss": 0.0161, + "step": 117350 + }, + { + "epoch": 0.8675083527985571, + "grad_norm": 0.062405869364738464, + "learning_rate": 6.649157169990504e-06, + "loss": 0.0172, + "step": 117360 + }, + { + "epoch": 0.867582271369859, + "grad_norm": 0.08480317890644073, + "learning_rate": 6.645447530864197e-06, + "loss": 0.0188, + "step": 117370 + }, + { + "epoch": 0.8676561899411608, + "grad_norm": 0.0864877700805664, + "learning_rate": 6.6417378917378925e-06, + "loss": 0.0166, + "step": 117380 + }, + { + "epoch": 0.8677301085124627, + "grad_norm": 0.07572106271982193, + "learning_rate": 6.638028252611586e-06, + "loss": 0.018, + "step": 117390 + }, + { + "epoch": 0.8678040270837645, + "grad_norm": 0.09496478736400604, + "learning_rate": 6.634318613485281e-06, + "loss": 0.0178, + "step": 117400 + }, + { + "epoch": 0.8678779456550664, + "grad_norm": 0.057836130261421204, + "learning_rate": 6.6306089743589745e-06, + "loss": 0.0167, + "step": 117410 + }, + { + "epoch": 0.8679518642263683, + "grad_norm": 0.06510256230831146, + "learning_rate": 6.626899335232668e-06, + "loss": 0.0158, + "step": 117420 + }, + { + "epoch": 0.8680257827976701, + "grad_norm": 0.07079634070396423, + "learning_rate": 6.623189696106363e-06, + "loss": 0.0178, + "step": 117430 + }, + { + "epoch": 0.868099701368972, + "grad_norm": 0.09129411727190018, + "learning_rate": 6.619480056980057e-06, + "loss": 0.019, + "step": 117440 + }, + { + "epoch": 0.8681736199402738, + "grad_norm": 0.07951433956623077, + "learning_rate": 6.615770417853752e-06, + "loss": 0.0192, + "step": 117450 + }, + { + "epoch": 0.8682475385115757, + "grad_norm": 0.060429371893405914, + "learning_rate": 6.612060778727446e-06, + "loss": 0.0153, + "step": 117460 + }, + { + "epoch": 0.8683214570828776, + "grad_norm": 0.07009291648864746, + "learning_rate": 6.60835113960114e-06, + "loss": 0.0167, + "step": 117470 + }, + { + "epoch": 0.8683953756541793, + "grad_norm": 0.08836237341165543, + "learning_rate": 6.604641500474834e-06, + "loss": 0.0172, + "step": 117480 + }, + { + "epoch": 0.8684692942254812, + "grad_norm": 0.09908463060855865, + "learning_rate": 6.600931861348528e-06, + "loss": 0.0189, + 
"step": 117490 + }, + { + "epoch": 0.868543212796783, + "grad_norm": 0.08165255934000015, + "learning_rate": 6.597222222222223e-06, + "loss": 0.0167, + "step": 117500 + }, + { + "epoch": 0.8686171313680849, + "grad_norm": 0.0675068348646164, + "learning_rate": 6.593512583095916e-06, + "loss": 0.0165, + "step": 117510 + }, + { + "epoch": 0.8686910499393867, + "grad_norm": 0.07735388725996017, + "learning_rate": 6.5898029439696115e-06, + "loss": 0.0171, + "step": 117520 + }, + { + "epoch": 0.8687649685106886, + "grad_norm": 0.1081601232290268, + "learning_rate": 6.586093304843305e-06, + "loss": 0.0158, + "step": 117530 + }, + { + "epoch": 0.8688388870819905, + "grad_norm": 0.0819975882768631, + "learning_rate": 6.582383665717e-06, + "loss": 0.0158, + "step": 117540 + }, + { + "epoch": 0.8689128056532923, + "grad_norm": 0.07920730113983154, + "learning_rate": 6.5786740265906935e-06, + "loss": 0.0185, + "step": 117550 + }, + { + "epoch": 0.8689867242245942, + "grad_norm": 0.0685114786028862, + "learning_rate": 6.574964387464387e-06, + "loss": 0.0172, + "step": 117560 + }, + { + "epoch": 0.869060642795896, + "grad_norm": 0.07959503680467606, + "learning_rate": 6.571254748338082e-06, + "loss": 0.015, + "step": 117570 + }, + { + "epoch": 0.8691345613671979, + "grad_norm": 0.10559289902448654, + "learning_rate": 6.5675451092117755e-06, + "loss": 0.0196, + "step": 117580 + }, + { + "epoch": 0.8692084799384997, + "grad_norm": 0.08839577436447144, + "learning_rate": 6.5638354700854706e-06, + "loss": 0.0187, + "step": 117590 + }, + { + "epoch": 0.8692823985098016, + "grad_norm": 0.06458381563425064, + "learning_rate": 6.560125830959164e-06, + "loss": 0.017, + "step": 117600 + }, + { + "epoch": 0.8693563170811035, + "grad_norm": 0.08625758439302444, + "learning_rate": 6.556416191832859e-06, + "loss": 0.0179, + "step": 117610 + }, + { + "epoch": 0.8694302356524053, + "grad_norm": 0.07499458640813828, + "learning_rate": 6.5527065527065525e-06, + "loss": 0.0174, + "step": 117620 + }, + { + "epoch": 0.8695041542237072, + "grad_norm": 0.07140425592660904, + "learning_rate": 6.548996913580248e-06, + "loss": 0.0176, + "step": 117630 + }, + { + "epoch": 0.869578072795009, + "grad_norm": 0.0880943313241005, + "learning_rate": 6.545287274453941e-06, + "loss": 0.0141, + "step": 117640 + }, + { + "epoch": 0.8696519913663109, + "grad_norm": 0.0611373633146286, + "learning_rate": 6.541577635327635e-06, + "loss": 0.0164, + "step": 117650 + }, + { + "epoch": 0.8697259099376127, + "grad_norm": 0.06848348677158356, + "learning_rate": 6.5378679962013305e-06, + "loss": 0.0156, + "step": 117660 + }, + { + "epoch": 0.8697998285089146, + "grad_norm": 0.07925549894571304, + "learning_rate": 6.534158357075024e-06, + "loss": 0.0169, + "step": 117670 + }, + { + "epoch": 0.8698737470802165, + "grad_norm": 0.08508136868476868, + "learning_rate": 6.530448717948719e-06, + "loss": 0.0163, + "step": 117680 + }, + { + "epoch": 0.8699476656515183, + "grad_norm": 0.08420059829950333, + "learning_rate": 6.5267390788224125e-06, + "loss": 0.0152, + "step": 117690 + }, + { + "epoch": 0.8700215842228202, + "grad_norm": 0.07651443034410477, + "learning_rate": 6.5230294396961076e-06, + "loss": 0.0164, + "step": 117700 + }, + { + "epoch": 0.870095502794122, + "grad_norm": 0.0796276330947876, + "learning_rate": 6.519319800569801e-06, + "loss": 0.0177, + "step": 117710 + }, + { + "epoch": 0.8701694213654239, + "grad_norm": 0.0848752036690712, + "learning_rate": 6.5156101614434944e-06, + "loss": 0.0175, + "step": 117720 + }, + { + "epoch": 
0.8702433399367258, + "grad_norm": 0.06675256788730621, + "learning_rate": 6.5119005223171895e-06, + "loss": 0.0164, + "step": 117730 + }, + { + "epoch": 0.8703172585080275, + "grad_norm": 0.10622403770685196, + "learning_rate": 6.508190883190883e-06, + "loss": 0.0182, + "step": 117740 + }, + { + "epoch": 0.8703911770793294, + "grad_norm": 0.05470990762114525, + "learning_rate": 6.504481244064578e-06, + "loss": 0.0158, + "step": 117750 + }, + { + "epoch": 0.8704650956506312, + "grad_norm": 0.07306019216775894, + "learning_rate": 6.5007716049382715e-06, + "loss": 0.0159, + "step": 117760 + }, + { + "epoch": 0.8705390142219331, + "grad_norm": 0.07729344069957733, + "learning_rate": 6.497061965811967e-06, + "loss": 0.0161, + "step": 117770 + }, + { + "epoch": 0.8706129327932349, + "grad_norm": 0.06492874026298523, + "learning_rate": 6.49335232668566e-06, + "loss": 0.0183, + "step": 117780 + }, + { + "epoch": 0.8706868513645368, + "grad_norm": 0.05055154487490654, + "learning_rate": 6.489642687559355e-06, + "loss": 0.0153, + "step": 117790 + }, + { + "epoch": 0.8707607699358387, + "grad_norm": 0.0782744288444519, + "learning_rate": 6.485933048433049e-06, + "loss": 0.0177, + "step": 117800 + }, + { + "epoch": 0.8708346885071405, + "grad_norm": 0.06355128437280655, + "learning_rate": 6.482223409306742e-06, + "loss": 0.0167, + "step": 117810 + }, + { + "epoch": 0.8709086070784424, + "grad_norm": 0.07428276538848877, + "learning_rate": 6.478513770180437e-06, + "loss": 0.0183, + "step": 117820 + }, + { + "epoch": 0.8709825256497442, + "grad_norm": 0.07489913702011108, + "learning_rate": 6.4748041310541314e-06, + "loss": 0.0175, + "step": 117830 + }, + { + "epoch": 0.8710564442210461, + "grad_norm": 0.08711172640323639, + "learning_rate": 6.471094491927826e-06, + "loss": 0.0154, + "step": 117840 + }, + { + "epoch": 0.8711303627923479, + "grad_norm": 0.08286292850971222, + "learning_rate": 6.46738485280152e-06, + "loss": 0.0173, + "step": 117850 + }, + { + "epoch": 0.8712042813636498, + "grad_norm": 0.09852740913629532, + "learning_rate": 6.463675213675214e-06, + "loss": 0.0189, + "step": 117860 + }, + { + "epoch": 0.8712781999349517, + "grad_norm": 0.06941636651754379, + "learning_rate": 6.4599655745489085e-06, + "loss": 0.0176, + "step": 117870 + }, + { + "epoch": 0.8713521185062535, + "grad_norm": 0.08215862512588501, + "learning_rate": 6.456255935422602e-06, + "loss": 0.0184, + "step": 117880 + }, + { + "epoch": 0.8714260370775554, + "grad_norm": 0.07836437970399857, + "learning_rate": 6.452546296296297e-06, + "loss": 0.0168, + "step": 117890 + }, + { + "epoch": 0.8714999556488572, + "grad_norm": 0.09308689087629318, + "learning_rate": 6.4488366571699905e-06, + "loss": 0.0188, + "step": 117900 + }, + { + "epoch": 0.8715738742201591, + "grad_norm": 0.0990125983953476, + "learning_rate": 6.445127018043686e-06, + "loss": 0.0165, + "step": 117910 + }, + { + "epoch": 0.8716477927914609, + "grad_norm": 0.0700770914554596, + "learning_rate": 6.441417378917379e-06, + "loss": 0.0162, + "step": 117920 + }, + { + "epoch": 0.8717217113627628, + "grad_norm": 0.05903575196862221, + "learning_rate": 6.437707739791074e-06, + "loss": 0.0177, + "step": 117930 + }, + { + "epoch": 0.8717956299340647, + "grad_norm": 0.05856722965836525, + "learning_rate": 6.433998100664768e-06, + "loss": 0.0164, + "step": 117940 + }, + { + "epoch": 0.8718695485053665, + "grad_norm": 0.06907519698143005, + "learning_rate": 6.430288461538461e-06, + "loss": 0.0175, + "step": 117950 + }, + { + "epoch": 0.8719434670766684, + "grad_norm": 
0.09158388525247574, + "learning_rate": 6.426578822412156e-06, + "loss": 0.0193, + "step": 117960 + }, + { + "epoch": 0.8720173856479702, + "grad_norm": 0.10444381088018417, + "learning_rate": 6.4228691832858496e-06, + "loss": 0.0176, + "step": 117970 + }, + { + "epoch": 0.8720913042192721, + "grad_norm": 0.06845099478960037, + "learning_rate": 6.419159544159545e-06, + "loss": 0.0196, + "step": 117980 + }, + { + "epoch": 0.872165222790574, + "grad_norm": 0.05813978984951973, + "learning_rate": 6.415449905033238e-06, + "loss": 0.0161, + "step": 117990 + }, + { + "epoch": 0.8722391413618757, + "grad_norm": 0.08608116954565048, + "learning_rate": 6.411740265906933e-06, + "loss": 0.0175, + "step": 118000 + }, + { + "epoch": 0.8723130599331776, + "grad_norm": 0.06167742609977722, + "learning_rate": 6.408030626780627e-06, + "loss": 0.0155, + "step": 118010 + }, + { + "epoch": 0.8723869785044794, + "grad_norm": 0.09053590893745422, + "learning_rate": 6.404320987654322e-06, + "loss": 0.0164, + "step": 118020 + }, + { + "epoch": 0.8724608970757813, + "grad_norm": 0.07437341660261154, + "learning_rate": 6.400611348528015e-06, + "loss": 0.0182, + "step": 118030 + }, + { + "epoch": 0.8725348156470831, + "grad_norm": 0.052046339958906174, + "learning_rate": 6.3969017094017095e-06, + "loss": 0.0162, + "step": 118040 + }, + { + "epoch": 0.872608734218385, + "grad_norm": 0.06491658091545105, + "learning_rate": 6.393192070275405e-06, + "loss": 0.0155, + "step": 118050 + }, + { + "epoch": 0.8726826527896869, + "grad_norm": 0.10303626209497452, + "learning_rate": 6.389482431149098e-06, + "loss": 0.0159, + "step": 118060 + }, + { + "epoch": 0.8727565713609887, + "grad_norm": 0.11025142669677734, + "learning_rate": 6.385772792022793e-06, + "loss": 0.0162, + "step": 118070 + }, + { + "epoch": 0.8728304899322906, + "grad_norm": 0.08896782994270325, + "learning_rate": 6.3820631528964866e-06, + "loss": 0.0171, + "step": 118080 + }, + { + "epoch": 0.8729044085035924, + "grad_norm": 0.07671711593866348, + "learning_rate": 6.378353513770182e-06, + "loss": 0.0146, + "step": 118090 + }, + { + "epoch": 0.8729783270748943, + "grad_norm": 0.07379849255084991, + "learning_rate": 6.374643874643875e-06, + "loss": 0.0166, + "step": 118100 + }, + { + "epoch": 0.8730522456461961, + "grad_norm": 0.07860969752073288, + "learning_rate": 6.3709342355175685e-06, + "loss": 0.0176, + "step": 118110 + }, + { + "epoch": 0.873126164217498, + "grad_norm": 0.08312699943780899, + "learning_rate": 6.367224596391264e-06, + "loss": 0.0151, + "step": 118120 + }, + { + "epoch": 0.8732000827887999, + "grad_norm": 0.06143682450056076, + "learning_rate": 6.363514957264957e-06, + "loss": 0.0174, + "step": 118130 + }, + { + "epoch": 0.8732740013601017, + "grad_norm": 0.06969328969717026, + "learning_rate": 6.359805318138652e-06, + "loss": 0.0175, + "step": 118140 + }, + { + "epoch": 0.8733479199314036, + "grad_norm": 0.07606179267168045, + "learning_rate": 6.356095679012346e-06, + "loss": 0.0166, + "step": 118150 + }, + { + "epoch": 0.8734218385027054, + "grad_norm": 0.06871391087770462, + "learning_rate": 6.352386039886041e-06, + "loss": 0.0164, + "step": 118160 + }, + { + "epoch": 0.8734957570740073, + "grad_norm": 0.06539017707109451, + "learning_rate": 6.348676400759734e-06, + "loss": 0.015, + "step": 118170 + }, + { + "epoch": 0.8735696756453091, + "grad_norm": 0.058919504284858704, + "learning_rate": 6.344966761633428e-06, + "loss": 0.0166, + "step": 118180 + }, + { + "epoch": 0.873643594216611, + "grad_norm": 0.07681267708539963, + 
"learning_rate": 6.341257122507123e-06, + "loss": 0.0165, + "step": 118190 + }, + { + "epoch": 0.8737175127879129, + "grad_norm": 0.09190231561660767, + "learning_rate": 6.337547483380816e-06, + "loss": 0.0195, + "step": 118200 + }, + { + "epoch": 0.8737914313592147, + "grad_norm": 0.056361399590969086, + "learning_rate": 6.333837844254511e-06, + "loss": 0.0174, + "step": 118210 + }, + { + "epoch": 0.8738653499305166, + "grad_norm": 0.07933057844638824, + "learning_rate": 6.3301282051282055e-06, + "loss": 0.0188, + "step": 118220 + }, + { + "epoch": 0.8739392685018184, + "grad_norm": 0.08587239682674408, + "learning_rate": 6.3264185660019e-06, + "loss": 0.0173, + "step": 118230 + }, + { + "epoch": 0.8740131870731203, + "grad_norm": 0.07572540640830994, + "learning_rate": 6.322708926875594e-06, + "loss": 0.0171, + "step": 118240 + }, + { + "epoch": 0.8740871056444222, + "grad_norm": 0.09864775091409683, + "learning_rate": 6.318999287749288e-06, + "loss": 0.0162, + "step": 118250 + }, + { + "epoch": 0.874161024215724, + "grad_norm": 0.053165651857852936, + "learning_rate": 6.315289648622983e-06, + "loss": 0.016, + "step": 118260 + }, + { + "epoch": 0.8742349427870258, + "grad_norm": 0.09423815459012985, + "learning_rate": 6.311580009496676e-06, + "loss": 0.0166, + "step": 118270 + }, + { + "epoch": 0.8743088613583276, + "grad_norm": 0.07985405623912811, + "learning_rate": 6.307870370370371e-06, + "loss": 0.0175, + "step": 118280 + }, + { + "epoch": 0.8743827799296295, + "grad_norm": 0.07381712645292282, + "learning_rate": 6.304160731244065e-06, + "loss": 0.0143, + "step": 118290 + }, + { + "epoch": 0.8744566985009313, + "grad_norm": 0.0779792070388794, + "learning_rate": 6.30045109211776e-06, + "loss": 0.0167, + "step": 118300 + }, + { + "epoch": 0.8745306170722332, + "grad_norm": 0.06026535481214523, + "learning_rate": 6.296741452991453e-06, + "loss": 0.0154, + "step": 118310 + }, + { + "epoch": 0.8746045356435351, + "grad_norm": 0.06948491185903549, + "learning_rate": 6.293031813865148e-06, + "loss": 0.0181, + "step": 118320 + }, + { + "epoch": 0.8746784542148369, + "grad_norm": 0.09600476175546646, + "learning_rate": 6.289322174738842e-06, + "loss": 0.0181, + "step": 118330 + }, + { + "epoch": 0.8747523727861388, + "grad_norm": 0.07605545967817307, + "learning_rate": 6.285612535612535e-06, + "loss": 0.017, + "step": 118340 + }, + { + "epoch": 0.8748262913574406, + "grad_norm": 0.07898372411727905, + "learning_rate": 6.28190289648623e-06, + "loss": 0.0182, + "step": 118350 + }, + { + "epoch": 0.8749002099287425, + "grad_norm": 0.07002683728933334, + "learning_rate": 6.278193257359924e-06, + "loss": 0.016, + "step": 118360 + }, + { + "epoch": 0.8749741285000443, + "grad_norm": 0.06312119960784912, + "learning_rate": 6.274483618233619e-06, + "loss": 0.0173, + "step": 118370 + }, + { + "epoch": 0.8750480470713462, + "grad_norm": 0.07424383610486984, + "learning_rate": 6.270773979107312e-06, + "loss": 0.0188, + "step": 118380 + }, + { + "epoch": 0.8751219656426481, + "grad_norm": 0.09007822722196579, + "learning_rate": 6.267064339981007e-06, + "loss": 0.0151, + "step": 118390 + }, + { + "epoch": 0.8751958842139499, + "grad_norm": 0.07628867030143738, + "learning_rate": 6.263354700854701e-06, + "loss": 0.017, + "step": 118400 + }, + { + "epoch": 0.8752698027852518, + "grad_norm": 0.09205362200737, + "learning_rate": 6.259645061728395e-06, + "loss": 0.0191, + "step": 118410 + }, + { + "epoch": 0.8753437213565536, + "grad_norm": 0.08055655658245087, + "learning_rate": 6.255935422602089e-06, + 
"loss": 0.0165, + "step": 118420 + }, + { + "epoch": 0.8754176399278555, + "grad_norm": 0.06153779849410057, + "learning_rate": 6.2522257834757836e-06, + "loss": 0.0166, + "step": 118430 + }, + { + "epoch": 0.8754915584991573, + "grad_norm": 0.07093429565429688, + "learning_rate": 6.248516144349479e-06, + "loss": 0.0175, + "step": 118440 + }, + { + "epoch": 0.8755654770704592, + "grad_norm": 0.07778339833021164, + "learning_rate": 6.244806505223172e-06, + "loss": 0.0178, + "step": 118450 + }, + { + "epoch": 0.8756393956417611, + "grad_norm": 0.0749053806066513, + "learning_rate": 6.241096866096866e-06, + "loss": 0.0198, + "step": 118460 + }, + { + "epoch": 0.8757133142130629, + "grad_norm": 0.07246764004230499, + "learning_rate": 6.237387226970561e-06, + "loss": 0.0173, + "step": 118470 + }, + { + "epoch": 0.8757872327843648, + "grad_norm": 0.062185708433389664, + "learning_rate": 6.233677587844255e-06, + "loss": 0.0173, + "step": 118480 + }, + { + "epoch": 0.8758611513556666, + "grad_norm": 0.08127869665622711, + "learning_rate": 6.229967948717949e-06, + "loss": 0.017, + "step": 118490 + }, + { + "epoch": 0.8759350699269685, + "grad_norm": 0.06540331989526749, + "learning_rate": 6.2262583095916435e-06, + "loss": 0.0184, + "step": 118500 + }, + { + "epoch": 0.8760089884982704, + "grad_norm": 0.10836683958768845, + "learning_rate": 6.222548670465338e-06, + "loss": 0.0191, + "step": 118510 + }, + { + "epoch": 0.8760829070695721, + "grad_norm": 0.07994542270898819, + "learning_rate": 6.218839031339032e-06, + "loss": 0.0183, + "step": 118520 + }, + { + "epoch": 0.876156825640874, + "grad_norm": 0.07900182902812958, + "learning_rate": 6.2151293922127254e-06, + "loss": 0.0174, + "step": 118530 + }, + { + "epoch": 0.8762307442121758, + "grad_norm": 0.07550885528326035, + "learning_rate": 6.21141975308642e-06, + "loss": 0.0159, + "step": 118540 + }, + { + "epoch": 0.8763046627834777, + "grad_norm": 0.07126692682504654, + "learning_rate": 6.207710113960114e-06, + "loss": 0.0153, + "step": 118550 + }, + { + "epoch": 0.8763785813547795, + "grad_norm": 0.06923384964466095, + "learning_rate": 6.204000474833808e-06, + "loss": 0.0156, + "step": 118560 + }, + { + "epoch": 0.8764524999260814, + "grad_norm": 0.0925263836979866, + "learning_rate": 6.2002908357075025e-06, + "loss": 0.0176, + "step": 118570 + }, + { + "epoch": 0.8765264184973833, + "grad_norm": 0.06728041917085648, + "learning_rate": 6.196581196581197e-06, + "loss": 0.0153, + "step": 118580 + }, + { + "epoch": 0.8766003370686851, + "grad_norm": 0.06608957797288895, + "learning_rate": 6.192871557454891e-06, + "loss": 0.0167, + "step": 118590 + }, + { + "epoch": 0.876674255639987, + "grad_norm": 0.07481256872415543, + "learning_rate": 6.189161918328585e-06, + "loss": 0.0153, + "step": 118600 + }, + { + "epoch": 0.8767481742112888, + "grad_norm": 0.07264452427625656, + "learning_rate": 6.18545227920228e-06, + "loss": 0.0172, + "step": 118610 + }, + { + "epoch": 0.8768220927825907, + "grad_norm": 0.08074048906564713, + "learning_rate": 6.181742640075974e-06, + "loss": 0.0178, + "step": 118620 + }, + { + "epoch": 0.8768960113538925, + "grad_norm": 0.09411899000406265, + "learning_rate": 6.178033000949668e-06, + "loss": 0.0175, + "step": 118630 + }, + { + "epoch": 0.8769699299251944, + "grad_norm": 0.0678592100739479, + "learning_rate": 6.1743233618233625e-06, + "loss": 0.0169, + "step": 118640 + }, + { + "epoch": 0.8770438484964963, + "grad_norm": 0.06519462913274765, + "learning_rate": 6.170613722697057e-06, + "loss": 0.0165, + "step": 118650 + }, 
+ { + "epoch": 0.8771177670677981, + "grad_norm": 0.07327589392662048, + "learning_rate": 6.166904083570751e-06, + "loss": 0.0165, + "step": 118660 + }, + { + "epoch": 0.8771916856391, + "grad_norm": 0.06687984615564346, + "learning_rate": 6.163194444444445e-06, + "loss": 0.0161, + "step": 118670 + }, + { + "epoch": 0.8772656042104018, + "grad_norm": 0.08779148012399673, + "learning_rate": 6.159484805318139e-06, + "loss": 0.0177, + "step": 118680 + }, + { + "epoch": 0.8773395227817037, + "grad_norm": 0.12947805225849152, + "learning_rate": 6.155775166191833e-06, + "loss": 0.0156, + "step": 118690 + }, + { + "epoch": 0.8774134413530055, + "grad_norm": 0.07689861208200455, + "learning_rate": 6.152065527065527e-06, + "loss": 0.0158, + "step": 118700 + }, + { + "epoch": 0.8774873599243074, + "grad_norm": 0.06581514328718185, + "learning_rate": 6.1483558879392215e-06, + "loss": 0.0175, + "step": 118710 + }, + { + "epoch": 0.8775612784956093, + "grad_norm": 0.07931865751743317, + "learning_rate": 6.144646248812916e-06, + "loss": 0.0154, + "step": 118720 + }, + { + "epoch": 0.8776351970669111, + "grad_norm": 0.10354472696781158, + "learning_rate": 6.14093660968661e-06, + "loss": 0.0183, + "step": 118730 + }, + { + "epoch": 0.877709115638213, + "grad_norm": 0.06549788266420364, + "learning_rate": 6.137226970560304e-06, + "loss": 0.0182, + "step": 118740 + }, + { + "epoch": 0.8777830342095148, + "grad_norm": 0.059177011251449585, + "learning_rate": 6.133517331433999e-06, + "loss": 0.0162, + "step": 118750 + }, + { + "epoch": 0.8778569527808167, + "grad_norm": 0.08157137036323547, + "learning_rate": 6.129807692307692e-06, + "loss": 0.0184, + "step": 118760 + }, + { + "epoch": 0.8779308713521186, + "grad_norm": 0.08282024413347244, + "learning_rate": 6.126098053181386e-06, + "loss": 0.0153, + "step": 118770 + }, + { + "epoch": 0.8780047899234203, + "grad_norm": 0.08915043622255325, + "learning_rate": 6.122388414055081e-06, + "loss": 0.0161, + "step": 118780 + }, + { + "epoch": 0.8780787084947222, + "grad_norm": 0.06341081857681274, + "learning_rate": 6.118678774928775e-06, + "loss": 0.0152, + "step": 118790 + }, + { + "epoch": 0.878152627066024, + "grad_norm": 0.09454089403152466, + "learning_rate": 6.114969135802469e-06, + "loss": 0.0163, + "step": 118800 + }, + { + "epoch": 0.8782265456373259, + "grad_norm": 0.09894683212041855, + "learning_rate": 6.111259496676164e-06, + "loss": 0.0168, + "step": 118810 + }, + { + "epoch": 0.8783004642086277, + "grad_norm": 0.07308226823806763, + "learning_rate": 6.1075498575498585e-06, + "loss": 0.0157, + "step": 118820 + }, + { + "epoch": 0.8783743827799296, + "grad_norm": 0.0727970078587532, + "learning_rate": 6.103840218423552e-06, + "loss": 0.0184, + "step": 118830 + }, + { + "epoch": 0.8784483013512315, + "grad_norm": 0.08420936018228531, + "learning_rate": 6.100130579297246e-06, + "loss": 0.0174, + "step": 118840 + }, + { + "epoch": 0.8785222199225333, + "grad_norm": 0.07505591958761215, + "learning_rate": 6.0964209401709405e-06, + "loss": 0.0159, + "step": 118850 + }, + { + "epoch": 0.8785961384938352, + "grad_norm": 0.06973277777433395, + "learning_rate": 6.092711301044635e-06, + "loss": 0.0195, + "step": 118860 + }, + { + "epoch": 0.878670057065137, + "grad_norm": 0.07956535369157791, + "learning_rate": 6.089001661918329e-06, + "loss": 0.0173, + "step": 118870 + }, + { + "epoch": 0.8787439756364389, + "grad_norm": 0.07417246699333191, + "learning_rate": 6.085292022792023e-06, + "loss": 0.0169, + "step": 118880 + }, + { + "epoch": 0.8788178942077407, + 
"grad_norm": 0.054359957575798035, + "learning_rate": 6.081582383665718e-06, + "loss": 0.016, + "step": 118890 + }, + { + "epoch": 0.8788918127790426, + "grad_norm": 0.08991627395153046, + "learning_rate": 6.077872744539412e-06, + "loss": 0.0167, + "step": 118900 + }, + { + "epoch": 0.8789657313503445, + "grad_norm": 0.06934183090925217, + "learning_rate": 6.074163105413105e-06, + "loss": 0.0166, + "step": 118910 + }, + { + "epoch": 0.8790396499216463, + "grad_norm": 0.07893542945384979, + "learning_rate": 6.0704534662867996e-06, + "loss": 0.0177, + "step": 118920 + }, + { + "epoch": 0.8791135684929482, + "grad_norm": 0.07071886956691742, + "learning_rate": 6.066743827160494e-06, + "loss": 0.0188, + "step": 118930 + }, + { + "epoch": 0.87918748706425, + "grad_norm": 0.10038921236991882, + "learning_rate": 6.063034188034188e-06, + "loss": 0.0153, + "step": 118940 + }, + { + "epoch": 0.8792614056355519, + "grad_norm": 0.07400741428136826, + "learning_rate": 6.059324548907882e-06, + "loss": 0.0182, + "step": 118950 + }, + { + "epoch": 0.8793353242068537, + "grad_norm": 0.06069588661193848, + "learning_rate": 6.055614909781577e-06, + "loss": 0.0151, + "step": 118960 + }, + { + "epoch": 0.8794092427781556, + "grad_norm": 0.08919277042150497, + "learning_rate": 6.051905270655271e-06, + "loss": 0.017, + "step": 118970 + }, + { + "epoch": 0.8794831613494575, + "grad_norm": 0.08653470873832703, + "learning_rate": 6.048195631528965e-06, + "loss": 0.0177, + "step": 118980 + }, + { + "epoch": 0.8795570799207593, + "grad_norm": 0.07246468216180801, + "learning_rate": 6.0444859924026595e-06, + "loss": 0.0161, + "step": 118990 + }, + { + "epoch": 0.8796309984920612, + "grad_norm": 0.08383968472480774, + "learning_rate": 6.040776353276354e-06, + "loss": 0.0162, + "step": 119000 + }, + { + "epoch": 0.879704917063363, + "grad_norm": 0.062004826962947845, + "learning_rate": 6.037066714150048e-06, + "loss": 0.0165, + "step": 119010 + }, + { + "epoch": 0.8797788356346649, + "grad_norm": 0.056051649153232574, + "learning_rate": 6.033357075023742e-06, + "loss": 0.0164, + "step": 119020 + }, + { + "epoch": 0.8798527542059668, + "grad_norm": 0.07480798661708832, + "learning_rate": 6.0296474358974366e-06, + "loss": 0.0168, + "step": 119030 + }, + { + "epoch": 0.8799266727772685, + "grad_norm": 0.07464089244604111, + "learning_rate": 6.025937796771131e-06, + "loss": 0.0169, + "step": 119040 + }, + { + "epoch": 0.8800005913485704, + "grad_norm": 0.06516869366168976, + "learning_rate": 6.022228157644825e-06, + "loss": 0.0153, + "step": 119050 + }, + { + "epoch": 0.8800745099198722, + "grad_norm": 0.06231831759214401, + "learning_rate": 6.0185185185185185e-06, + "loss": 0.0148, + "step": 119060 + }, + { + "epoch": 0.8801484284911741, + "grad_norm": 0.09648638218641281, + "learning_rate": 6.014808879392213e-06, + "loss": 0.017, + "step": 119070 + }, + { + "epoch": 0.8802223470624759, + "grad_norm": 0.07397064566612244, + "learning_rate": 6.011099240265907e-06, + "loss": 0.0159, + "step": 119080 + }, + { + "epoch": 0.8802962656337778, + "grad_norm": 0.07799141854047775, + "learning_rate": 6.007389601139601e-06, + "loss": 0.0165, + "step": 119090 + }, + { + "epoch": 0.8803701842050797, + "grad_norm": 0.08395656943321228, + "learning_rate": 6.003679962013296e-06, + "loss": 0.018, + "step": 119100 + }, + { + "epoch": 0.8804441027763815, + "grad_norm": 0.07837633043527603, + "learning_rate": 5.99997032288699e-06, + "loss": 0.0178, + "step": 119110 + }, + { + "epoch": 0.8805180213476834, + "grad_norm": 0.12159515917301178, + 
"learning_rate": 5.996260683760684e-06, + "loss": 0.0191, + "step": 119120 + }, + { + "epoch": 0.8805919399189852, + "grad_norm": 0.06409422308206558, + "learning_rate": 5.9925510446343784e-06, + "loss": 0.0169, + "step": 119130 + }, + { + "epoch": 0.8806658584902871, + "grad_norm": 0.09560755640268326, + "learning_rate": 5.988841405508072e-06, + "loss": 0.0158, + "step": 119140 + }, + { + "epoch": 0.8807397770615889, + "grad_norm": 0.07393410056829453, + "learning_rate": 5.985131766381766e-06, + "loss": 0.0193, + "step": 119150 + }, + { + "epoch": 0.8808136956328908, + "grad_norm": 0.10730542987585068, + "learning_rate": 5.98142212725546e-06, + "loss": 0.0177, + "step": 119160 + }, + { + "epoch": 0.8808876142041927, + "grad_norm": 0.05886324122548103, + "learning_rate": 5.977712488129155e-06, + "loss": 0.0163, + "step": 119170 + }, + { + "epoch": 0.8809615327754945, + "grad_norm": 0.09222765266895294, + "learning_rate": 5.974002849002849e-06, + "loss": 0.0157, + "step": 119180 + }, + { + "epoch": 0.8810354513467964, + "grad_norm": 0.07070504128932953, + "learning_rate": 5.970293209876543e-06, + "loss": 0.016, + "step": 119190 + }, + { + "epoch": 0.8811093699180982, + "grad_norm": 0.055727239698171616, + "learning_rate": 5.966583570750238e-06, + "loss": 0.0183, + "step": 119200 + }, + { + "epoch": 0.8811832884894001, + "grad_norm": 0.06888076663017273, + "learning_rate": 5.962873931623933e-06, + "loss": 0.0166, + "step": 119210 + }, + { + "epoch": 0.881257207060702, + "grad_norm": 0.10308843851089478, + "learning_rate": 5.959164292497626e-06, + "loss": 0.0173, + "step": 119220 + }, + { + "epoch": 0.8813311256320038, + "grad_norm": 0.10609335452318192, + "learning_rate": 5.95545465337132e-06, + "loss": 0.0177, + "step": 119230 + }, + { + "epoch": 0.8814050442033057, + "grad_norm": 0.07922697812318802, + "learning_rate": 5.951745014245015e-06, + "loss": 0.0162, + "step": 119240 + }, + { + "epoch": 0.8814789627746075, + "grad_norm": 0.050138652324676514, + "learning_rate": 5.948035375118709e-06, + "loss": 0.0136, + "step": 119250 + }, + { + "epoch": 0.8815528813459094, + "grad_norm": 0.0792040005326271, + "learning_rate": 5.944325735992403e-06, + "loss": 0.0164, + "step": 119260 + }, + { + "epoch": 0.8816267999172112, + "grad_norm": 0.0703444704413414, + "learning_rate": 5.940616096866097e-06, + "loss": 0.0156, + "step": 119270 + }, + { + "epoch": 0.8817007184885131, + "grad_norm": 0.07934896647930145, + "learning_rate": 5.936906457739792e-06, + "loss": 0.0178, + "step": 119280 + }, + { + "epoch": 0.881774637059815, + "grad_norm": 0.08978631347417831, + "learning_rate": 5.933196818613485e-06, + "loss": 0.0188, + "step": 119290 + }, + { + "epoch": 0.8818485556311167, + "grad_norm": 0.07721249014139175, + "learning_rate": 5.929487179487179e-06, + "loss": 0.0176, + "step": 119300 + }, + { + "epoch": 0.8819224742024186, + "grad_norm": 0.07975966483354568, + "learning_rate": 5.925777540360874e-06, + "loss": 0.0142, + "step": 119310 + }, + { + "epoch": 0.8819963927737204, + "grad_norm": 0.06769223511219025, + "learning_rate": 5.922067901234568e-06, + "loss": 0.0153, + "step": 119320 + }, + { + "epoch": 0.8820703113450223, + "grad_norm": 0.06781387329101562, + "learning_rate": 5.918358262108262e-06, + "loss": 0.017, + "step": 119330 + }, + { + "epoch": 0.8821442299163241, + "grad_norm": 0.07139957696199417, + "learning_rate": 5.9146486229819565e-06, + "loss": 0.0195, + "step": 119340 + }, + { + "epoch": 0.882218148487626, + "grad_norm": 0.09100551903247833, + "learning_rate": 5.910938983855651e-06, 
+ "loss": 0.0186, + "step": 119350 + }, + { + "epoch": 0.8822920670589279, + "grad_norm": 0.08640255779027939, + "learning_rate": 5.907229344729345e-06, + "loss": 0.0189, + "step": 119360 + }, + { + "epoch": 0.8823659856302297, + "grad_norm": 0.06398281455039978, + "learning_rate": 5.903519705603039e-06, + "loss": 0.0176, + "step": 119370 + }, + { + "epoch": 0.8824399042015316, + "grad_norm": 0.08789733052253723, + "learning_rate": 5.8998100664767336e-06, + "loss": 0.0187, + "step": 119380 + }, + { + "epoch": 0.8825138227728334, + "grad_norm": 0.06949253380298615, + "learning_rate": 5.896100427350428e-06, + "loss": 0.0162, + "step": 119390 + }, + { + "epoch": 0.8825877413441353, + "grad_norm": 0.07115206122398376, + "learning_rate": 5.892390788224122e-06, + "loss": 0.0154, + "step": 119400 + }, + { + "epoch": 0.8826616599154371, + "grad_norm": 0.07384328544139862, + "learning_rate": 5.888681149097816e-06, + "loss": 0.0166, + "step": 119410 + }, + { + "epoch": 0.882735578486739, + "grad_norm": 0.07285971939563751, + "learning_rate": 5.884971509971511e-06, + "loss": 0.0168, + "step": 119420 + }, + { + "epoch": 0.8828094970580409, + "grad_norm": 0.07207538932561874, + "learning_rate": 5.881261870845205e-06, + "loss": 0.0171, + "step": 119430 + }, + { + "epoch": 0.8828834156293427, + "grad_norm": 0.08270884305238724, + "learning_rate": 5.877552231718899e-06, + "loss": 0.0178, + "step": 119440 + }, + { + "epoch": 0.8829573342006446, + "grad_norm": 0.11721400171518326, + "learning_rate": 5.873842592592593e-06, + "loss": 0.0168, + "step": 119450 + }, + { + "epoch": 0.8830312527719464, + "grad_norm": 0.07226939499378204, + "learning_rate": 5.870132953466287e-06, + "loss": 0.0154, + "step": 119460 + }, + { + "epoch": 0.8831051713432483, + "grad_norm": 0.05731596797704697, + "learning_rate": 5.866423314339981e-06, + "loss": 0.0173, + "step": 119470 + }, + { + "epoch": 0.8831790899145502, + "grad_norm": 0.07784925401210785, + "learning_rate": 5.8627136752136754e-06, + "loss": 0.0157, + "step": 119480 + }, + { + "epoch": 0.883253008485852, + "grad_norm": 0.057738661766052246, + "learning_rate": 5.85900403608737e-06, + "loss": 0.0164, + "step": 119490 + }, + { + "epoch": 0.8833269270571539, + "grad_norm": 0.06912290304899216, + "learning_rate": 5.855294396961064e-06, + "loss": 0.0183, + "step": 119500 + }, + { + "epoch": 0.8834008456284557, + "grad_norm": 0.07481943070888519, + "learning_rate": 5.851584757834758e-06, + "loss": 0.0155, + "step": 119510 + }, + { + "epoch": 0.8834747641997576, + "grad_norm": 0.14108766615390778, + "learning_rate": 5.847875118708452e-06, + "loss": 0.0166, + "step": 119520 + }, + { + "epoch": 0.8835486827710594, + "grad_norm": 0.06856310367584229, + "learning_rate": 5.844165479582146e-06, + "loss": 0.0158, + "step": 119530 + }, + { + "epoch": 0.8836226013423613, + "grad_norm": 0.09352389723062515, + "learning_rate": 5.84045584045584e-06, + "loss": 0.0154, + "step": 119540 + }, + { + "epoch": 0.8836965199136632, + "grad_norm": 0.06415341049432755, + "learning_rate": 5.8367462013295345e-06, + "loss": 0.0175, + "step": 119550 + }, + { + "epoch": 0.883770438484965, + "grad_norm": 0.07106293737888336, + "learning_rate": 5.833036562203229e-06, + "loss": 0.0182, + "step": 119560 + }, + { + "epoch": 0.8838443570562668, + "grad_norm": 0.0817696824669838, + "learning_rate": 5.829326923076923e-06, + "loss": 0.0154, + "step": 119570 + }, + { + "epoch": 0.8839182756275686, + "grad_norm": 0.07956176996231079, + "learning_rate": 5.825617283950618e-06, + "loss": 0.0172, + "step": 119580 + 
}, + { + "epoch": 0.8839921941988705, + "grad_norm": 0.07482311129570007, + "learning_rate": 5.8219076448243124e-06, + "loss": 0.0154, + "step": 119590 + }, + { + "epoch": 0.8840661127701723, + "grad_norm": 0.09065963327884674, + "learning_rate": 5.818198005698006e-06, + "loss": 0.0151, + "step": 119600 + }, + { + "epoch": 0.8841400313414742, + "grad_norm": 0.12129585444927216, + "learning_rate": 5.8144883665717e-06, + "loss": 0.0164, + "step": 119610 + }, + { + "epoch": 0.8842139499127761, + "grad_norm": 0.11287616938352585, + "learning_rate": 5.810778727445394e-06, + "loss": 0.0173, + "step": 119620 + }, + { + "epoch": 0.8842878684840779, + "grad_norm": 0.05906311795115471, + "learning_rate": 5.807069088319089e-06, + "loss": 0.0157, + "step": 119630 + }, + { + "epoch": 0.8843617870553798, + "grad_norm": 0.11285065859556198, + "learning_rate": 5.803359449192783e-06, + "loss": 0.0193, + "step": 119640 + }, + { + "epoch": 0.8844357056266816, + "grad_norm": 0.07003043591976166, + "learning_rate": 5.799649810066477e-06, + "loss": 0.0173, + "step": 119650 + }, + { + "epoch": 0.8845096241979835, + "grad_norm": 0.06678735464811325, + "learning_rate": 5.7959401709401715e-06, + "loss": 0.0148, + "step": 119660 + }, + { + "epoch": 0.8845835427692853, + "grad_norm": 0.05683312565088272, + "learning_rate": 5.792230531813866e-06, + "loss": 0.0147, + "step": 119670 + }, + { + "epoch": 0.8846574613405872, + "grad_norm": 0.08991986513137817, + "learning_rate": 5.788520892687559e-06, + "loss": 0.0144, + "step": 119680 + }, + { + "epoch": 0.8847313799118891, + "grad_norm": 0.08308696746826172, + "learning_rate": 5.7848112535612535e-06, + "loss": 0.017, + "step": 119690 + }, + { + "epoch": 0.8848052984831909, + "grad_norm": 0.07566976547241211, + "learning_rate": 5.781101614434948e-06, + "loss": 0.0162, + "step": 119700 + }, + { + "epoch": 0.8848792170544928, + "grad_norm": 0.08281205594539642, + "learning_rate": 5.777391975308642e-06, + "loss": 0.0182, + "step": 119710 + }, + { + "epoch": 0.8849531356257946, + "grad_norm": 0.0822274386882782, + "learning_rate": 5.773682336182336e-06, + "loss": 0.0147, + "step": 119720 + }, + { + "epoch": 0.8850270541970965, + "grad_norm": 0.07982059568166733, + "learning_rate": 5.7699726970560306e-06, + "loss": 0.0161, + "step": 119730 + }, + { + "epoch": 0.8851009727683984, + "grad_norm": 0.10032445192337036, + "learning_rate": 5.766263057929725e-06, + "loss": 0.0184, + "step": 119740 + }, + { + "epoch": 0.8851748913397002, + "grad_norm": 0.05839885026216507, + "learning_rate": 5.762553418803419e-06, + "loss": 0.016, + "step": 119750 + }, + { + "epoch": 0.8852488099110021, + "grad_norm": 0.08391750603914261, + "learning_rate": 5.758843779677113e-06, + "loss": 0.0151, + "step": 119760 + }, + { + "epoch": 0.8853227284823039, + "grad_norm": 0.09729672968387604, + "learning_rate": 5.755134140550808e-06, + "loss": 0.0169, + "step": 119770 + }, + { + "epoch": 0.8853966470536058, + "grad_norm": 0.06830310076475143, + "learning_rate": 5.751424501424502e-06, + "loss": 0.0172, + "step": 119780 + }, + { + "epoch": 0.8854705656249076, + "grad_norm": 0.09163478761911392, + "learning_rate": 5.747714862298196e-06, + "loss": 0.0181, + "step": 119790 + }, + { + "epoch": 0.8855444841962095, + "grad_norm": 0.047696489840745926, + "learning_rate": 5.7440052231718905e-06, + "loss": 0.0178, + "step": 119800 + }, + { + "epoch": 0.8856184027675114, + "grad_norm": 0.09021651744842529, + "learning_rate": 5.740295584045585e-06, + "loss": 0.0164, + "step": 119810 + }, + { + "epoch": 
0.8856923213388131, + "grad_norm": 0.0775601714849472, + "learning_rate": 5.736585944919279e-06, + "loss": 0.0174, + "step": 119820 + }, + { + "epoch": 0.885766239910115, + "grad_norm": 0.09051928669214249, + "learning_rate": 5.7328763057929725e-06, + "loss": 0.0192, + "step": 119830 + }, + { + "epoch": 0.8858401584814168, + "grad_norm": 0.07663477212190628, + "learning_rate": 5.729166666666667e-06, + "loss": 0.0196, + "step": 119840 + }, + { + "epoch": 0.8859140770527187, + "grad_norm": 0.09055771678686142, + "learning_rate": 5.725457027540361e-06, + "loss": 0.0156, + "step": 119850 + }, + { + "epoch": 0.8859879956240205, + "grad_norm": 0.08544610440731049, + "learning_rate": 5.721747388414055e-06, + "loss": 0.0187, + "step": 119860 + }, + { + "epoch": 0.8860619141953224, + "grad_norm": 0.09478023648262024, + "learning_rate": 5.7180377492877495e-06, + "loss": 0.0182, + "step": 119870 + }, + { + "epoch": 0.8861358327666243, + "grad_norm": 0.0887848511338234, + "learning_rate": 5.714328110161444e-06, + "loss": 0.0174, + "step": 119880 + }, + { + "epoch": 0.8862097513379261, + "grad_norm": 0.10478053241968155, + "learning_rate": 5.710618471035138e-06, + "loss": 0.0172, + "step": 119890 + }, + { + "epoch": 0.886283669909228, + "grad_norm": 0.06299924105405807, + "learning_rate": 5.706908831908832e-06, + "loss": 0.0168, + "step": 119900 + }, + { + "epoch": 0.8863575884805298, + "grad_norm": 0.07549279928207397, + "learning_rate": 5.703199192782526e-06, + "loss": 0.0168, + "step": 119910 + }, + { + "epoch": 0.8864315070518317, + "grad_norm": 0.0627603754401207, + "learning_rate": 5.69948955365622e-06, + "loss": 0.0166, + "step": 119920 + }, + { + "epoch": 0.8865054256231335, + "grad_norm": 0.08275282382965088, + "learning_rate": 5.695779914529914e-06, + "loss": 0.0159, + "step": 119930 + }, + { + "epoch": 0.8865793441944354, + "grad_norm": 0.06280075758695602, + "learning_rate": 5.692070275403609e-06, + "loss": 0.0174, + "step": 119940 + }, + { + "epoch": 0.8866532627657373, + "grad_norm": 0.07194601744413376, + "learning_rate": 5.688360636277303e-06, + "loss": 0.0166, + "step": 119950 + }, + { + "epoch": 0.8867271813370391, + "grad_norm": 0.1123146116733551, + "learning_rate": 5.684650997150998e-06, + "loss": 0.02, + "step": 119960 + }, + { + "epoch": 0.886801099908341, + "grad_norm": 0.07987431436777115, + "learning_rate": 5.680941358024692e-06, + "loss": 0.0159, + "step": 119970 + }, + { + "epoch": 0.8868750184796428, + "grad_norm": 0.07852199673652649, + "learning_rate": 5.6772317188983866e-06, + "loss": 0.0182, + "step": 119980 + }, + { + "epoch": 0.8869489370509447, + "grad_norm": 0.08840122073888779, + "learning_rate": 5.67352207977208e-06, + "loss": 0.0163, + "step": 119990 + }, + { + "epoch": 0.8870228556222466, + "grad_norm": 0.05927702784538269, + "learning_rate": 5.669812440645774e-06, + "loss": 0.0161, + "step": 120000 + }, + { + "epoch": 0.8870228556222466, + "eval_f1": 0.6344552032256219, + "eval_loss": 0.016682270914316177, + "eval_precision": 0.5062286369185682, + "eval_recall": 0.8496762787442149, + "eval_runtime": 2912.7696, + "eval_samples_per_second": 185.78, + "eval_steps_per_second": 2.903, + "step": 120000 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.09450909495353699, + "learning_rate": 5.6661028015194685e-06, + "loss": 0.0155, + "step": 120010 + }, + { + "epoch": 0.8871706927648503, + "grad_norm": 0.06688588857650757, + "learning_rate": 5.662393162393163e-06, + "loss": 0.0183, + "step": 120020 + }, + { + "epoch": 0.8872446113361521, + "grad_norm": 
0.09937802702188492, + "learning_rate": 5.658683523266857e-06, + "loss": 0.0192, + "step": 120030 + }, + { + "epoch": 0.887318529907454, + "grad_norm": 0.06747591495513916, + "learning_rate": 5.654973884140551e-06, + "loss": 0.0152, + "step": 120040 + }, + { + "epoch": 0.8873924484787558, + "grad_norm": 0.07164330780506134, + "learning_rate": 5.651264245014246e-06, + "loss": 0.0153, + "step": 120050 + }, + { + "epoch": 0.8874663670500577, + "grad_norm": 0.07619501650333405, + "learning_rate": 5.647554605887939e-06, + "loss": 0.0156, + "step": 120060 + }, + { + "epoch": 0.8875402856213596, + "grad_norm": 0.07995057851076126, + "learning_rate": 5.643844966761633e-06, + "loss": 0.0181, + "step": 120070 + }, + { + "epoch": 0.8876142041926613, + "grad_norm": 0.06575370579957962, + "learning_rate": 5.640135327635328e-06, + "loss": 0.016, + "step": 120080 + }, + { + "epoch": 0.8876881227639632, + "grad_norm": 0.09357030689716339, + "learning_rate": 5.636425688509022e-06, + "loss": 0.0178, + "step": 120090 + }, + { + "epoch": 0.887762041335265, + "grad_norm": 0.1380130648612976, + "learning_rate": 5.632716049382716e-06, + "loss": 0.0189, + "step": 120100 + }, + { + "epoch": 0.8878359599065669, + "grad_norm": 0.08238451927900314, + "learning_rate": 5.62900641025641e-06, + "loss": 0.016, + "step": 120110 + }, + { + "epoch": 0.8879098784778687, + "grad_norm": 0.08737955242395401, + "learning_rate": 5.625296771130105e-06, + "loss": 0.0175, + "step": 120120 + }, + { + "epoch": 0.8879837970491706, + "grad_norm": 0.08896934986114502, + "learning_rate": 5.621587132003799e-06, + "loss": 0.0162, + "step": 120130 + }, + { + "epoch": 0.8880577156204725, + "grad_norm": 0.07118116319179535, + "learning_rate": 5.617877492877493e-06, + "loss": 0.0168, + "step": 120140 + }, + { + "epoch": 0.8881316341917743, + "grad_norm": 0.08958224952220917, + "learning_rate": 5.6141678537511875e-06, + "loss": 0.0165, + "step": 120150 + }, + { + "epoch": 0.8882055527630762, + "grad_norm": 0.08103135973215103, + "learning_rate": 5.610458214624882e-06, + "loss": 0.0171, + "step": 120160 + }, + { + "epoch": 0.888279471334378, + "grad_norm": 0.04269809275865555, + "learning_rate": 5.606748575498576e-06, + "loss": 0.0155, + "step": 120170 + }, + { + "epoch": 0.8883533899056799, + "grad_norm": 0.08776000887155533, + "learning_rate": 5.60303893637227e-06, + "loss": 0.0155, + "step": 120180 + }, + { + "epoch": 0.8884273084769817, + "grad_norm": 0.09480982273817062, + "learning_rate": 5.599329297245965e-06, + "loss": 0.0152, + "step": 120190 + }, + { + "epoch": 0.8885012270482836, + "grad_norm": 0.09173163771629333, + "learning_rate": 5.595619658119659e-06, + "loss": 0.0182, + "step": 120200 + }, + { + "epoch": 0.8885751456195855, + "grad_norm": 0.08384384214878082, + "learning_rate": 5.591910018993353e-06, + "loss": 0.0178, + "step": 120210 + }, + { + "epoch": 0.8886490641908873, + "grad_norm": 0.08577314764261246, + "learning_rate": 5.5882003798670466e-06, + "loss": 0.0193, + "step": 120220 + }, + { + "epoch": 0.8887229827621892, + "grad_norm": 0.05332199111580849, + "learning_rate": 5.584490740740741e-06, + "loss": 0.0142, + "step": 120230 + }, + { + "epoch": 0.888796901333491, + "grad_norm": 0.06766955554485321, + "learning_rate": 5.580781101614435e-06, + "loss": 0.0174, + "step": 120240 + }, + { + "epoch": 0.8888708199047929, + "grad_norm": 0.0663696676492691, + "learning_rate": 5.577071462488129e-06, + "loss": 0.0172, + "step": 120250 + }, + { + "epoch": 0.8889447384760948, + "grad_norm": 0.0493619330227375, + "learning_rate": 
5.573361823361824e-06, + "loss": 0.0148, + "step": 120260 + }, + { + "epoch": 0.8890186570473966, + "grad_norm": 0.08228102326393127, + "learning_rate": 5.569652184235518e-06, + "loss": 0.0193, + "step": 120270 + }, + { + "epoch": 0.8890925756186985, + "grad_norm": 0.07778341323137283, + "learning_rate": 5.565942545109212e-06, + "loss": 0.0175, + "step": 120280 + }, + { + "epoch": 0.8891664941900003, + "grad_norm": 0.09558829665184021, + "learning_rate": 5.562232905982906e-06, + "loss": 0.0186, + "step": 120290 + }, + { + "epoch": 0.8892404127613022, + "grad_norm": 0.09294694662094116, + "learning_rate": 5.5585232668566e-06, + "loss": 0.0149, + "step": 120300 + }, + { + "epoch": 0.889314331332604, + "grad_norm": 0.08595292270183563, + "learning_rate": 5.554813627730294e-06, + "loss": 0.0163, + "step": 120310 + }, + { + "epoch": 0.8893882499039059, + "grad_norm": 0.052427005022764206, + "learning_rate": 5.5511039886039884e-06, + "loss": 0.0158, + "step": 120320 + }, + { + "epoch": 0.8894621684752078, + "grad_norm": 0.09456674754619598, + "learning_rate": 5.547394349477683e-06, + "loss": 0.0172, + "step": 120330 + }, + { + "epoch": 0.8895360870465095, + "grad_norm": 0.07085176557302475, + "learning_rate": 5.543684710351377e-06, + "loss": 0.0176, + "step": 120340 + }, + { + "epoch": 0.8896100056178115, + "grad_norm": 0.10055804252624512, + "learning_rate": 5.539975071225072e-06, + "loss": 0.0163, + "step": 120350 + }, + { + "epoch": 0.8896839241891132, + "grad_norm": 0.08094070106744766, + "learning_rate": 5.536265432098766e-06, + "loss": 0.0184, + "step": 120360 + }, + { + "epoch": 0.8897578427604151, + "grad_norm": 0.07718300074338913, + "learning_rate": 5.53255579297246e-06, + "loss": 0.0169, + "step": 120370 + }, + { + "epoch": 0.8898317613317169, + "grad_norm": 0.07198705524206161, + "learning_rate": 5.528846153846154e-06, + "loss": 0.0161, + "step": 120380 + }, + { + "epoch": 0.8899056799030188, + "grad_norm": 0.09524102509021759, + "learning_rate": 5.525136514719848e-06, + "loss": 0.0176, + "step": 120390 + }, + { + "epoch": 0.8899795984743207, + "grad_norm": 0.06162939593195915, + "learning_rate": 5.521426875593543e-06, + "loss": 0.0158, + "step": 120400 + }, + { + "epoch": 0.8900535170456225, + "grad_norm": 0.0801302120089531, + "learning_rate": 5.517717236467237e-06, + "loss": 0.0147, + "step": 120410 + }, + { + "epoch": 0.8901274356169244, + "grad_norm": 0.07829449325799942, + "learning_rate": 5.514007597340931e-06, + "loss": 0.0172, + "step": 120420 + }, + { + "epoch": 0.8902013541882262, + "grad_norm": 0.11463841050863266, + "learning_rate": 5.5102979582146254e-06, + "loss": 0.0199, + "step": 120430 + }, + { + "epoch": 0.8902752727595281, + "grad_norm": 0.09118429571390152, + "learning_rate": 5.50658831908832e-06, + "loss": 0.0162, + "step": 120440 + }, + { + "epoch": 0.8903491913308299, + "grad_norm": 0.07519903779029846, + "learning_rate": 5.502878679962013e-06, + "loss": 0.0173, + "step": 120450 + }, + { + "epoch": 0.8904231099021318, + "grad_norm": 0.08263619244098663, + "learning_rate": 5.499169040835707e-06, + "loss": 0.0167, + "step": 120460 + }, + { + "epoch": 0.8904970284734337, + "grad_norm": 0.0880921334028244, + "learning_rate": 5.495459401709402e-06, + "loss": 0.0188, + "step": 120470 + }, + { + "epoch": 0.8905709470447355, + "grad_norm": 0.07637332379817963, + "learning_rate": 5.491749762583096e-06, + "loss": 0.0168, + "step": 120480 + }, + { + "epoch": 0.8906448656160374, + "grad_norm": 0.08179379999637604, + "learning_rate": 5.48804012345679e-06, + "loss": 0.0144, 
+ "step": 120490 + }, + { + "epoch": 0.8907187841873392, + "grad_norm": 0.0767722874879837, + "learning_rate": 5.4843304843304845e-06, + "loss": 0.0145, + "step": 120500 + }, + { + "epoch": 0.8907927027586411, + "grad_norm": 0.05685288831591606, + "learning_rate": 5.480620845204179e-06, + "loss": 0.0156, + "step": 120510 + }, + { + "epoch": 0.890866621329943, + "grad_norm": 0.0973455086350441, + "learning_rate": 5.476911206077873e-06, + "loss": 0.0157, + "step": 120520 + }, + { + "epoch": 0.8909405399012448, + "grad_norm": 0.06820717453956604, + "learning_rate": 5.473201566951567e-06, + "loss": 0.0164, + "step": 120530 + }, + { + "epoch": 0.8910144584725467, + "grad_norm": 0.08437089622020721, + "learning_rate": 5.469491927825262e-06, + "loss": 0.0166, + "step": 120540 + }, + { + "epoch": 0.8910883770438485, + "grad_norm": 0.08168923109769821, + "learning_rate": 5.465782288698956e-06, + "loss": 0.017, + "step": 120550 + }, + { + "epoch": 0.8911622956151504, + "grad_norm": 0.05618911609053612, + "learning_rate": 5.46207264957265e-06, + "loss": 0.0176, + "step": 120560 + }, + { + "epoch": 0.8912362141864522, + "grad_norm": 0.06802624464035034, + "learning_rate": 5.458363010446344e-06, + "loss": 0.0191, + "step": 120570 + }, + { + "epoch": 0.8913101327577541, + "grad_norm": 0.07354087382555008, + "learning_rate": 5.454653371320039e-06, + "loss": 0.0166, + "step": 120580 + }, + { + "epoch": 0.891384051329056, + "grad_norm": 0.08429256081581116, + "learning_rate": 5.450943732193733e-06, + "loss": 0.0163, + "step": 120590 + }, + { + "epoch": 0.8914579699003577, + "grad_norm": 0.06684058159589767, + "learning_rate": 5.447234093067426e-06, + "loss": 0.0152, + "step": 120600 + }, + { + "epoch": 0.8915318884716597, + "grad_norm": 0.06897986680269241, + "learning_rate": 5.443524453941121e-06, + "loss": 0.0159, + "step": 120610 + }, + { + "epoch": 0.8916058070429614, + "grad_norm": 0.07506728172302246, + "learning_rate": 5.439814814814815e-06, + "loss": 0.0179, + "step": 120620 + }, + { + "epoch": 0.8916797256142633, + "grad_norm": 0.08993647992610931, + "learning_rate": 5.436105175688509e-06, + "loss": 0.0175, + "step": 120630 + }, + { + "epoch": 0.8917536441855651, + "grad_norm": 0.0962747111916542, + "learning_rate": 5.4323955365622035e-06, + "loss": 0.0189, + "step": 120640 + }, + { + "epoch": 0.891827562756867, + "grad_norm": 0.07659200578927994, + "learning_rate": 5.428685897435898e-06, + "loss": 0.0184, + "step": 120650 + }, + { + "epoch": 0.8919014813281689, + "grad_norm": 0.10518428683280945, + "learning_rate": 5.424976258309592e-06, + "loss": 0.017, + "step": 120660 + }, + { + "epoch": 0.8919753998994707, + "grad_norm": 0.07588525116443634, + "learning_rate": 5.421266619183286e-06, + "loss": 0.0189, + "step": 120670 + }, + { + "epoch": 0.8920493184707726, + "grad_norm": 0.06094416230916977, + "learning_rate": 5.41755698005698e-06, + "loss": 0.0155, + "step": 120680 + }, + { + "epoch": 0.8921232370420744, + "grad_norm": 0.07991020381450653, + "learning_rate": 5.413847340930674e-06, + "loss": 0.0156, + "step": 120690 + }, + { + "epoch": 0.8921971556133763, + "grad_norm": 0.09702847898006439, + "learning_rate": 5.410137701804368e-06, + "loss": 0.018, + "step": 120700 + }, + { + "epoch": 0.8922710741846781, + "grad_norm": 0.07065367698669434, + "learning_rate": 5.4064280626780625e-06, + "loss": 0.0174, + "step": 120710 + }, + { + "epoch": 0.89234499275598, + "grad_norm": 0.0530664436519146, + "learning_rate": 5.402718423551757e-06, + "loss": 0.0159, + "step": 120720 + }, + { + "epoch": 
0.8924189113272819, + "grad_norm": 0.08903425186872482, + "learning_rate": 5.399008784425452e-06, + "loss": 0.018, + "step": 120730 + }, + { + "epoch": 0.8924928298985837, + "grad_norm": 0.07997802644968033, + "learning_rate": 5.395299145299146e-06, + "loss": 0.0177, + "step": 120740 + }, + { + "epoch": 0.8925667484698856, + "grad_norm": 0.06450887769460678, + "learning_rate": 5.39158950617284e-06, + "loss": 0.0171, + "step": 120750 + }, + { + "epoch": 0.8926406670411874, + "grad_norm": 0.06914730370044708, + "learning_rate": 5.387879867046534e-06, + "loss": 0.0138, + "step": 120760 + }, + { + "epoch": 0.8927145856124893, + "grad_norm": 0.05761079117655754, + "learning_rate": 5.384170227920228e-06, + "loss": 0.0166, + "step": 120770 + }, + { + "epoch": 0.8927885041837912, + "grad_norm": 0.07949425280094147, + "learning_rate": 5.3804605887939225e-06, + "loss": 0.0163, + "step": 120780 + }, + { + "epoch": 0.892862422755093, + "grad_norm": 0.06849084794521332, + "learning_rate": 5.376750949667617e-06, + "loss": 0.0158, + "step": 120790 + }, + { + "epoch": 0.8929363413263949, + "grad_norm": 0.07909443974494934, + "learning_rate": 5.373041310541311e-06, + "loss": 0.0178, + "step": 120800 + }, + { + "epoch": 0.8930102598976967, + "grad_norm": 0.08381883054971695, + "learning_rate": 5.369331671415005e-06, + "loss": 0.017, + "step": 120810 + }, + { + "epoch": 0.8930841784689986, + "grad_norm": 0.0874544307589531, + "learning_rate": 5.3656220322886995e-06, + "loss": 0.0179, + "step": 120820 + }, + { + "epoch": 0.8931580970403004, + "grad_norm": 0.08168869465589523, + "learning_rate": 5.361912393162393e-06, + "loss": 0.0206, + "step": 120830 + }, + { + "epoch": 0.8932320156116023, + "grad_norm": 0.10479090362787247, + "learning_rate": 5.358202754036087e-06, + "loss": 0.0195, + "step": 120840 + }, + { + "epoch": 0.8933059341829042, + "grad_norm": 0.10459177941083908, + "learning_rate": 5.3544931149097815e-06, + "loss": 0.019, + "step": 120850 + }, + { + "epoch": 0.893379852754206, + "grad_norm": 0.10548960417509079, + "learning_rate": 5.350783475783476e-06, + "loss": 0.0166, + "step": 120860 + }, + { + "epoch": 0.8934537713255079, + "grad_norm": 0.06455662101507187, + "learning_rate": 5.34707383665717e-06, + "loss": 0.015, + "step": 120870 + }, + { + "epoch": 0.8935276898968096, + "grad_norm": 0.10260767489671707, + "learning_rate": 5.343364197530864e-06, + "loss": 0.016, + "step": 120880 + }, + { + "epoch": 0.8936016084681115, + "grad_norm": 0.09701945632696152, + "learning_rate": 5.339654558404559e-06, + "loss": 0.0188, + "step": 120890 + }, + { + "epoch": 0.8936755270394133, + "grad_norm": 0.09393393993377686, + "learning_rate": 5.335944919278253e-06, + "loss": 0.0213, + "step": 120900 + }, + { + "epoch": 0.8937494456107152, + "grad_norm": 0.06040528416633606, + "learning_rate": 5.332235280151947e-06, + "loss": 0.0145, + "step": 120910 + }, + { + "epoch": 0.8938233641820171, + "grad_norm": 0.0938306599855423, + "learning_rate": 5.3285256410256414e-06, + "loss": 0.0176, + "step": 120920 + }, + { + "epoch": 0.8938972827533189, + "grad_norm": 0.10529477894306183, + "learning_rate": 5.324816001899336e-06, + "loss": 0.0161, + "step": 120930 + }, + { + "epoch": 0.8939712013246208, + "grad_norm": 0.08327803015708923, + "learning_rate": 5.32110636277303e-06, + "loss": 0.0193, + "step": 120940 + }, + { + "epoch": 0.8940451198959226, + "grad_norm": 0.09573381394147873, + "learning_rate": 5.317396723646724e-06, + "loss": 0.0188, + "step": 120950 + }, + { + "epoch": 0.8941190384672245, + "grad_norm": 
0.06852541863918304, + "learning_rate": 5.3136870845204185e-06, + "loss": 0.0154, + "step": 120960 + }, + { + "epoch": 0.8941929570385264, + "grad_norm": 0.08873478323221207, + "learning_rate": 5.309977445394113e-06, + "loss": 0.0166, + "step": 120970 + }, + { + "epoch": 0.8942668756098282, + "grad_norm": 0.09256771206855774, + "learning_rate": 5.306267806267807e-06, + "loss": 0.0173, + "step": 120980 + }, + { + "epoch": 0.8943407941811301, + "grad_norm": 0.08251002430915833, + "learning_rate": 5.3025581671415005e-06, + "loss": 0.0157, + "step": 120990 + }, + { + "epoch": 0.8944147127524319, + "grad_norm": 0.0775420218706131, + "learning_rate": 5.298848528015195e-06, + "loss": 0.0176, + "step": 121000 + }, + { + "epoch": 0.8944886313237338, + "grad_norm": 0.05607917532324791, + "learning_rate": 5.295138888888889e-06, + "loss": 0.0181, + "step": 121010 + }, + { + "epoch": 0.8945625498950356, + "grad_norm": 0.10188756138086319, + "learning_rate": 5.291429249762583e-06, + "loss": 0.0164, + "step": 121020 + }, + { + "epoch": 0.8946364684663375, + "grad_norm": 0.0746680498123169, + "learning_rate": 5.287719610636278e-06, + "loss": 0.0157, + "step": 121030 + }, + { + "epoch": 0.8947103870376394, + "grad_norm": 0.07992107421159744, + "learning_rate": 5.284009971509972e-06, + "loss": 0.0147, + "step": 121040 + }, + { + "epoch": 0.8947843056089412, + "grad_norm": 0.08044373244047165, + "learning_rate": 5.280300332383666e-06, + "loss": 0.0157, + "step": 121050 + }, + { + "epoch": 0.8948582241802431, + "grad_norm": 0.07756134122610092, + "learning_rate": 5.2765906932573596e-06, + "loss": 0.0178, + "step": 121060 + }, + { + "epoch": 0.8949321427515449, + "grad_norm": 0.08153444528579712, + "learning_rate": 5.272881054131054e-06, + "loss": 0.0156, + "step": 121070 + }, + { + "epoch": 0.8950060613228468, + "grad_norm": 0.06693438440561295, + "learning_rate": 5.269171415004748e-06, + "loss": 0.0167, + "step": 121080 + }, + { + "epoch": 0.8950799798941486, + "grad_norm": 0.07793907821178436, + "learning_rate": 5.265461775878442e-06, + "loss": 0.0166, + "step": 121090 + }, + { + "epoch": 0.8951538984654505, + "grad_norm": 0.06497281044721603, + "learning_rate": 5.261752136752137e-06, + "loss": 0.0153, + "step": 121100 + }, + { + "epoch": 0.8952278170367524, + "grad_norm": 0.0652812048792839, + "learning_rate": 5.258042497625832e-06, + "loss": 0.016, + "step": 121110 + }, + { + "epoch": 0.8953017356080542, + "grad_norm": 0.08423350006341934, + "learning_rate": 5.254332858499526e-06, + "loss": 0.0183, + "step": 121120 + }, + { + "epoch": 0.895375654179356, + "grad_norm": 0.09293195605278015, + "learning_rate": 5.25062321937322e-06, + "loss": 0.0187, + "step": 121130 + }, + { + "epoch": 0.8954495727506578, + "grad_norm": 0.07499924302101135, + "learning_rate": 5.246913580246914e-06, + "loss": 0.0153, + "step": 121140 + }, + { + "epoch": 0.8955234913219597, + "grad_norm": 0.07277142256498337, + "learning_rate": 5.243203941120608e-06, + "loss": 0.0164, + "step": 121150 + }, + { + "epoch": 0.8955974098932615, + "grad_norm": 0.07718726992607117, + "learning_rate": 5.239494301994302e-06, + "loss": 0.019, + "step": 121160 + }, + { + "epoch": 0.8956713284645634, + "grad_norm": 0.07208777219057083, + "learning_rate": 5.2357846628679966e-06, + "loss": 0.0162, + "step": 121170 + }, + { + "epoch": 0.8957452470358653, + "grad_norm": 0.06988029927015305, + "learning_rate": 5.232075023741691e-06, + "loss": 0.0161, + "step": 121180 + }, + { + "epoch": 0.8958191656071671, + "grad_norm": 0.07956439256668091, + 
"learning_rate": 5.228365384615385e-06, + "loss": 0.0144, + "step": 121190 + }, + { + "epoch": 0.895893084178469, + "grad_norm": 0.062169358134269714, + "learning_rate": 5.224655745489079e-06, + "loss": 0.0178, + "step": 121200 + }, + { + "epoch": 0.8959670027497708, + "grad_norm": 0.07907166332006454, + "learning_rate": 5.220946106362774e-06, + "loss": 0.0166, + "step": 121210 + }, + { + "epoch": 0.8960409213210727, + "grad_norm": 0.07376344501972198, + "learning_rate": 5.217236467236467e-06, + "loss": 0.0151, + "step": 121220 + }, + { + "epoch": 0.8961148398923746, + "grad_norm": 0.08888218551874161, + "learning_rate": 5.213526828110161e-06, + "loss": 0.0186, + "step": 121230 + }, + { + "epoch": 0.8961887584636764, + "grad_norm": 0.062063366174697876, + "learning_rate": 5.209817188983856e-06, + "loss": 0.0143, + "step": 121240 + }, + { + "epoch": 0.8962626770349783, + "grad_norm": 0.09087532758712769, + "learning_rate": 5.20610754985755e-06, + "loss": 0.0159, + "step": 121250 + }, + { + "epoch": 0.8963365956062801, + "grad_norm": 0.08101866394281387, + "learning_rate": 5.202397910731244e-06, + "loss": 0.0178, + "step": 121260 + }, + { + "epoch": 0.896410514177582, + "grad_norm": 0.07356608659029007, + "learning_rate": 5.1986882716049384e-06, + "loss": 0.0159, + "step": 121270 + }, + { + "epoch": 0.8964844327488838, + "grad_norm": 0.07567834854125977, + "learning_rate": 5.194978632478633e-06, + "loss": 0.018, + "step": 121280 + }, + { + "epoch": 0.8965583513201857, + "grad_norm": 0.08786658197641373, + "learning_rate": 5.191268993352327e-06, + "loss": 0.0186, + "step": 121290 + }, + { + "epoch": 0.8966322698914876, + "grad_norm": 0.0797184482216835, + "learning_rate": 5.187559354226021e-06, + "loss": 0.0166, + "step": 121300 + }, + { + "epoch": 0.8967061884627894, + "grad_norm": 0.08069688081741333, + "learning_rate": 5.1838497150997155e-06, + "loss": 0.0193, + "step": 121310 + }, + { + "epoch": 0.8967801070340913, + "grad_norm": 0.07624389231204987, + "learning_rate": 5.18014007597341e-06, + "loss": 0.0163, + "step": 121320 + }, + { + "epoch": 0.8968540256053931, + "grad_norm": 0.06398090720176697, + "learning_rate": 5.176430436847104e-06, + "loss": 0.0175, + "step": 121330 + }, + { + "epoch": 0.896927944176695, + "grad_norm": 0.09289465844631195, + "learning_rate": 5.172720797720798e-06, + "loss": 0.0161, + "step": 121340 + }, + { + "epoch": 0.8970018627479968, + "grad_norm": 0.07656008005142212, + "learning_rate": 5.169011158594493e-06, + "loss": 0.017, + "step": 121350 + }, + { + "epoch": 0.8970757813192987, + "grad_norm": 0.08518286049365997, + "learning_rate": 5.165301519468187e-06, + "loss": 0.0161, + "step": 121360 + }, + { + "epoch": 0.8971496998906006, + "grad_norm": 0.06318601220846176, + "learning_rate": 5.16159188034188e-06, + "loss": 0.0172, + "step": 121370 + }, + { + "epoch": 0.8972236184619024, + "grad_norm": 0.07610304653644562, + "learning_rate": 5.157882241215575e-06, + "loss": 0.0164, + "step": 121380 + }, + { + "epoch": 0.8972975370332043, + "grad_norm": 0.08971790969371796, + "learning_rate": 5.154172602089269e-06, + "loss": 0.0206, + "step": 121390 + }, + { + "epoch": 0.897371455604506, + "grad_norm": 0.07047516852617264, + "learning_rate": 5.150462962962963e-06, + "loss": 0.0166, + "step": 121400 + }, + { + "epoch": 0.8974453741758079, + "grad_norm": 0.08966632932424545, + "learning_rate": 5.146753323836657e-06, + "loss": 0.016, + "step": 121410 + }, + { + "epoch": 0.8975192927471097, + "grad_norm": 0.07193748652935028, + "learning_rate": 5.143043684710352e-06, + 
"loss": 0.014, + "step": 121420 + }, + { + "epoch": 0.8975932113184116, + "grad_norm": 0.07095572352409363, + "learning_rate": 5.139334045584046e-06, + "loss": 0.0152, + "step": 121430 + }, + { + "epoch": 0.8976671298897135, + "grad_norm": 0.08083576709032059, + "learning_rate": 5.13562440645774e-06, + "loss": 0.016, + "step": 121440 + }, + { + "epoch": 0.8977410484610153, + "grad_norm": 0.0850079134106636, + "learning_rate": 5.131914767331434e-06, + "loss": 0.0168, + "step": 121450 + }, + { + "epoch": 0.8978149670323172, + "grad_norm": 0.05830361321568489, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0155, + "step": 121460 + }, + { + "epoch": 0.897888885603619, + "grad_norm": 0.06578114628791809, + "learning_rate": 5.124495489078822e-06, + "loss": 0.0185, + "step": 121470 + }, + { + "epoch": 0.8979628041749209, + "grad_norm": 0.087006576359272, + "learning_rate": 5.1207858499525165e-06, + "loss": 0.0183, + "step": 121480 + }, + { + "epoch": 0.8980367227462228, + "grad_norm": 0.06513582915067673, + "learning_rate": 5.117076210826211e-06, + "loss": 0.0161, + "step": 121490 + }, + { + "epoch": 0.8981106413175246, + "grad_norm": 0.08822928369045258, + "learning_rate": 5.113366571699906e-06, + "loss": 0.0195, + "step": 121500 + }, + { + "epoch": 0.8981845598888265, + "grad_norm": 0.06557939201593399, + "learning_rate": 5.1096569325736e-06, + "loss": 0.0178, + "step": 121510 + }, + { + "epoch": 0.8982584784601283, + "grad_norm": 0.09345387667417526, + "learning_rate": 5.1059472934472936e-06, + "loss": 0.0184, + "step": 121520 + }, + { + "epoch": 0.8983323970314302, + "grad_norm": 0.10218565911054611, + "learning_rate": 5.102237654320988e-06, + "loss": 0.0198, + "step": 121530 + }, + { + "epoch": 0.898406315602732, + "grad_norm": 0.07959327101707458, + "learning_rate": 5.098528015194682e-06, + "loss": 0.0178, + "step": 121540 + }, + { + "epoch": 0.8984802341740339, + "grad_norm": 0.06820554286241531, + "learning_rate": 5.094818376068376e-06, + "loss": 0.0144, + "step": 121550 + }, + { + "epoch": 0.8985541527453358, + "grad_norm": 0.09927608072757721, + "learning_rate": 5.091108736942071e-06, + "loss": 0.0159, + "step": 121560 + }, + { + "epoch": 0.8986280713166376, + "grad_norm": 0.07535196840763092, + "learning_rate": 5.087399097815765e-06, + "loss": 0.0147, + "step": 121570 + }, + { + "epoch": 0.8987019898879395, + "grad_norm": 0.06772439926862717, + "learning_rate": 5.083689458689459e-06, + "loss": 0.0161, + "step": 121580 + }, + { + "epoch": 0.8987759084592413, + "grad_norm": 0.06075502559542656, + "learning_rate": 5.0799798195631535e-06, + "loss": 0.0164, + "step": 121590 + }, + { + "epoch": 0.8988498270305432, + "grad_norm": 0.09440341591835022, + "learning_rate": 5.076270180436847e-06, + "loss": 0.0165, + "step": 121600 + }, + { + "epoch": 0.898923745601845, + "grad_norm": 0.0991566926240921, + "learning_rate": 5.072560541310541e-06, + "loss": 0.0167, + "step": 121610 + }, + { + "epoch": 0.8989976641731469, + "grad_norm": 0.05196934938430786, + "learning_rate": 5.0688509021842354e-06, + "loss": 0.0187, + "step": 121620 + }, + { + "epoch": 0.8990715827444488, + "grad_norm": 0.07425817847251892, + "learning_rate": 5.06514126305793e-06, + "loss": 0.0174, + "step": 121630 + }, + { + "epoch": 0.8991455013157506, + "grad_norm": 0.07375023514032364, + "learning_rate": 5.061431623931624e-06, + "loss": 0.0155, + "step": 121640 + }, + { + "epoch": 0.8992194198870525, + "grad_norm": 0.07478194683790207, + "learning_rate": 5.057721984805318e-06, + "loss": 0.0172, + "step": 121650 + }, + { + 
"epoch": 0.8992933384583542, + "grad_norm": 0.07814273983240128, + "learning_rate": 5.0540123456790125e-06, + "loss": 0.0163, + "step": 121660 + }, + { + "epoch": 0.8993672570296561, + "grad_norm": 0.09756813198328018, + "learning_rate": 5.050302706552707e-06, + "loss": 0.0163, + "step": 121670 + }, + { + "epoch": 0.8994411756009579, + "grad_norm": 0.09019820392131805, + "learning_rate": 5.046593067426401e-06, + "loss": 0.0189, + "step": 121680 + }, + { + "epoch": 0.8995150941722598, + "grad_norm": 0.07590436935424805, + "learning_rate": 5.042883428300095e-06, + "loss": 0.0189, + "step": 121690 + }, + { + "epoch": 0.8995890127435617, + "grad_norm": 0.05467082932591438, + "learning_rate": 5.03917378917379e-06, + "loss": 0.0158, + "step": 121700 + }, + { + "epoch": 0.8996629313148635, + "grad_norm": 0.07867514342069626, + "learning_rate": 5.035464150047484e-06, + "loss": 0.0178, + "step": 121710 + }, + { + "epoch": 0.8997368498861654, + "grad_norm": 0.07019774615764618, + "learning_rate": 5.031754510921178e-06, + "loss": 0.0143, + "step": 121720 + }, + { + "epoch": 0.8998107684574672, + "grad_norm": 0.06641940027475357, + "learning_rate": 5.0280448717948725e-06, + "loss": 0.0175, + "step": 121730 + }, + { + "epoch": 0.8998846870287691, + "grad_norm": 0.07111826539039612, + "learning_rate": 5.024335232668567e-06, + "loss": 0.0172, + "step": 121740 + }, + { + "epoch": 0.899958605600071, + "grad_norm": 0.08311285078525543, + "learning_rate": 5.02062559354226e-06, + "loss": 0.0177, + "step": 121750 + }, + { + "epoch": 0.9000325241713728, + "grad_norm": 0.06598973274230957, + "learning_rate": 5.016915954415954e-06, + "loss": 0.0163, + "step": 121760 + }, + { + "epoch": 0.9001064427426747, + "grad_norm": 0.06733829528093338, + "learning_rate": 5.013206315289649e-06, + "loss": 0.0153, + "step": 121770 + }, + { + "epoch": 0.9001803613139765, + "grad_norm": 0.06981953978538513, + "learning_rate": 5.009496676163343e-06, + "loss": 0.0171, + "step": 121780 + }, + { + "epoch": 0.9002542798852784, + "grad_norm": 0.10032287985086441, + "learning_rate": 5.005787037037037e-06, + "loss": 0.0158, + "step": 121790 + }, + { + "epoch": 0.9003281984565802, + "grad_norm": 0.07456158101558685, + "learning_rate": 5.0020773979107315e-06, + "loss": 0.0177, + "step": 121800 + }, + { + "epoch": 0.9004021170278821, + "grad_norm": 0.0792987272143364, + "learning_rate": 4.998367758784426e-06, + "loss": 0.0165, + "step": 121810 + }, + { + "epoch": 0.900476035599184, + "grad_norm": 0.07135865837335587, + "learning_rate": 4.99465811965812e-06, + "loss": 0.0171, + "step": 121820 + }, + { + "epoch": 0.9005499541704858, + "grad_norm": 0.08674946427345276, + "learning_rate": 4.9909484805318135e-06, + "loss": 0.0175, + "step": 121830 + }, + { + "epoch": 0.9006238727417877, + "grad_norm": 0.06915190815925598, + "learning_rate": 4.987238841405508e-06, + "loss": 0.016, + "step": 121840 + }, + { + "epoch": 0.9006977913130895, + "grad_norm": 0.09359006583690643, + "learning_rate": 4.983529202279202e-06, + "loss": 0.0159, + "step": 121850 + }, + { + "epoch": 0.9007717098843914, + "grad_norm": 0.08661477267742157, + "learning_rate": 4.979819563152896e-06, + "loss": 0.0175, + "step": 121860 + }, + { + "epoch": 0.9008456284556932, + "grad_norm": 0.06485380232334137, + "learning_rate": 4.976109924026591e-06, + "loss": 0.0171, + "step": 121870 + }, + { + "epoch": 0.9009195470269951, + "grad_norm": 0.0564052015542984, + "learning_rate": 4.972400284900286e-06, + "loss": 0.0157, + "step": 121880 + }, + { + "epoch": 0.900993465598297, + 
"grad_norm": 0.06444479525089264, + "learning_rate": 4.96869064577398e-06, + "loss": 0.0175, + "step": 121890 + }, + { + "epoch": 0.9010673841695988, + "grad_norm": 0.0819358304142952, + "learning_rate": 4.964981006647674e-06, + "loss": 0.0177, + "step": 121900 + }, + { + "epoch": 0.9011413027409007, + "grad_norm": 0.10272148251533508, + "learning_rate": 4.961271367521368e-06, + "loss": 0.0178, + "step": 121910 + }, + { + "epoch": 0.9012152213122024, + "grad_norm": 0.07735146582126617, + "learning_rate": 4.957561728395062e-06, + "loss": 0.0184, + "step": 121920 + }, + { + "epoch": 0.9012891398835043, + "grad_norm": 0.06250828504562378, + "learning_rate": 4.953852089268756e-06, + "loss": 0.0157, + "step": 121930 + }, + { + "epoch": 0.9013630584548061, + "grad_norm": 0.05823352932929993, + "learning_rate": 4.9501424501424505e-06, + "loss": 0.0142, + "step": 121940 + }, + { + "epoch": 0.901436977026108, + "grad_norm": 0.09979364275932312, + "learning_rate": 4.946432811016145e-06, + "loss": 0.0142, + "step": 121950 + }, + { + "epoch": 0.9015108955974099, + "grad_norm": 0.07167592644691467, + "learning_rate": 4.942723171889839e-06, + "loss": 0.0169, + "step": 121960 + }, + { + "epoch": 0.9015848141687117, + "grad_norm": 0.09357810020446777, + "learning_rate": 4.939013532763533e-06, + "loss": 0.016, + "step": 121970 + }, + { + "epoch": 0.9016587327400136, + "grad_norm": 0.06767541170120239, + "learning_rate": 4.935303893637227e-06, + "loss": 0.0159, + "step": 121980 + }, + { + "epoch": 0.9017326513113154, + "grad_norm": 0.06723565608263016, + "learning_rate": 4.931594254510921e-06, + "loss": 0.0161, + "step": 121990 + }, + { + "epoch": 0.9018065698826173, + "grad_norm": 0.08712873607873917, + "learning_rate": 4.927884615384615e-06, + "loss": 0.0157, + "step": 122000 + }, + { + "epoch": 0.9018804884539192, + "grad_norm": 0.08881501108407974, + "learning_rate": 4.9241749762583096e-06, + "loss": 0.0182, + "step": 122010 + }, + { + "epoch": 0.901954407025221, + "grad_norm": 0.08723103255033493, + "learning_rate": 4.920465337132004e-06, + "loss": 0.0193, + "step": 122020 + }, + { + "epoch": 0.9020283255965229, + "grad_norm": 0.09474994242191315, + "learning_rate": 4.916755698005698e-06, + "loss": 0.0147, + "step": 122030 + }, + { + "epoch": 0.9021022441678247, + "grad_norm": 0.0643463209271431, + "learning_rate": 4.913046058879392e-06, + "loss": 0.0192, + "step": 122040 + }, + { + "epoch": 0.9021761627391266, + "grad_norm": 0.06547664850950241, + "learning_rate": 4.909336419753087e-06, + "loss": 0.0173, + "step": 122050 + }, + { + "epoch": 0.9022500813104284, + "grad_norm": 0.08374817669391632, + "learning_rate": 4.905626780626781e-06, + "loss": 0.0194, + "step": 122060 + }, + { + "epoch": 0.9023239998817303, + "grad_norm": 0.07471413910388947, + "learning_rate": 4.901917141500475e-06, + "loss": 0.0169, + "step": 122070 + }, + { + "epoch": 0.9023979184530322, + "grad_norm": 0.07656655460596085, + "learning_rate": 4.8982075023741695e-06, + "loss": 0.0154, + "step": 122080 + }, + { + "epoch": 0.902471837024334, + "grad_norm": 0.07885884493589401, + "learning_rate": 4.894497863247864e-06, + "loss": 0.0161, + "step": 122090 + }, + { + "epoch": 0.9025457555956359, + "grad_norm": 0.07954913377761841, + "learning_rate": 4.890788224121558e-06, + "loss": 0.0171, + "step": 122100 + }, + { + "epoch": 0.9026196741669377, + "grad_norm": 0.10426130145788193, + "learning_rate": 4.887078584995252e-06, + "loss": 0.0195, + "step": 122110 + }, + { + "epoch": 0.9026935927382396, + "grad_norm": 0.08550582826137543, + 
"learning_rate": 4.8833689458689466e-06, + "loss": 0.0189, + "step": 122120 + }, + { + "epoch": 0.9027675113095414, + "grad_norm": 0.08839669823646545, + "learning_rate": 4.879659306742641e-06, + "loss": 0.0167, + "step": 122130 + }, + { + "epoch": 0.9028414298808433, + "grad_norm": 0.0725245326757431, + "learning_rate": 4.875949667616334e-06, + "loss": 0.0164, + "step": 122140 + }, + { + "epoch": 0.9029153484521452, + "grad_norm": 0.061979878693819046, + "learning_rate": 4.8722400284900285e-06, + "loss": 0.0162, + "step": 122150 + }, + { + "epoch": 0.902989267023447, + "grad_norm": 0.059237040579319, + "learning_rate": 4.868530389363723e-06, + "loss": 0.0177, + "step": 122160 + }, + { + "epoch": 0.9030631855947489, + "grad_norm": 0.07624328136444092, + "learning_rate": 4.864820750237417e-06, + "loss": 0.0179, + "step": 122170 + }, + { + "epoch": 0.9031371041660506, + "grad_norm": 0.09410177916288376, + "learning_rate": 4.861111111111111e-06, + "loss": 0.0184, + "step": 122180 + }, + { + "epoch": 0.9032110227373525, + "grad_norm": 0.09074148535728455, + "learning_rate": 4.857401471984806e-06, + "loss": 0.0158, + "step": 122190 + }, + { + "epoch": 0.9032849413086543, + "grad_norm": 0.07468174397945404, + "learning_rate": 4.8536918328585e-06, + "loss": 0.0165, + "step": 122200 + }, + { + "epoch": 0.9033588598799562, + "grad_norm": 0.08903239667415619, + "learning_rate": 4.849982193732193e-06, + "loss": 0.0175, + "step": 122210 + }, + { + "epoch": 0.9034327784512581, + "grad_norm": 0.065603107213974, + "learning_rate": 4.846272554605888e-06, + "loss": 0.0148, + "step": 122220 + }, + { + "epoch": 0.9035066970225599, + "grad_norm": 0.08339151740074158, + "learning_rate": 4.842562915479582e-06, + "loss": 0.015, + "step": 122230 + }, + { + "epoch": 0.9035806155938618, + "grad_norm": 0.0531030036509037, + "learning_rate": 4.838853276353276e-06, + "loss": 0.0184, + "step": 122240 + }, + { + "epoch": 0.9036545341651636, + "grad_norm": 0.07239489257335663, + "learning_rate": 4.83514363722697e-06, + "loss": 0.0178, + "step": 122250 + }, + { + "epoch": 0.9037284527364655, + "grad_norm": 0.09008802473545074, + "learning_rate": 4.8314339981006655e-06, + "loss": 0.0168, + "step": 122260 + }, + { + "epoch": 0.9038023713077674, + "grad_norm": 0.06578698009252548, + "learning_rate": 4.82772435897436e-06, + "loss": 0.0172, + "step": 122270 + }, + { + "epoch": 0.9038762898790692, + "grad_norm": 0.0775492787361145, + "learning_rate": 4.824014719848054e-06, + "loss": 0.0168, + "step": 122280 + }, + { + "epoch": 0.9039502084503711, + "grad_norm": 0.09722190350294113, + "learning_rate": 4.8203050807217475e-06, + "loss": 0.0181, + "step": 122290 + }, + { + "epoch": 0.9040241270216729, + "grad_norm": 0.09262166172266006, + "learning_rate": 4.816595441595442e-06, + "loss": 0.0154, + "step": 122300 + }, + { + "epoch": 0.9040980455929748, + "grad_norm": 0.05689336732029915, + "learning_rate": 4.812885802469136e-06, + "loss": 0.0155, + "step": 122310 + }, + { + "epoch": 0.9041719641642766, + "grad_norm": 0.07098221778869629, + "learning_rate": 4.80917616334283e-06, + "loss": 0.0177, + "step": 122320 + }, + { + "epoch": 0.9042458827355785, + "grad_norm": 0.08449529111385345, + "learning_rate": 4.805466524216525e-06, + "loss": 0.016, + "step": 122330 + }, + { + "epoch": 0.9043198013068804, + "grad_norm": 0.061818934977054596, + "learning_rate": 4.801756885090219e-06, + "loss": 0.0158, + "step": 122340 + }, + { + "epoch": 0.9043937198781822, + "grad_norm": 0.052378345280885696, + "learning_rate": 4.798047245963913e-06, + 
"loss": 0.0171, + "step": 122350 + }, + { + "epoch": 0.9044676384494841, + "grad_norm": 0.08641374856233597, + "learning_rate": 4.794337606837607e-06, + "loss": 0.016, + "step": 122360 + }, + { + "epoch": 0.9045415570207859, + "grad_norm": 0.07966215908527374, + "learning_rate": 4.790627967711301e-06, + "loss": 0.0163, + "step": 122370 + }, + { + "epoch": 0.9046154755920878, + "grad_norm": 0.06808850914239883, + "learning_rate": 4.786918328584995e-06, + "loss": 0.0174, + "step": 122380 + }, + { + "epoch": 0.9046893941633896, + "grad_norm": 0.0724160447716713, + "learning_rate": 4.783208689458689e-06, + "loss": 0.0165, + "step": 122390 + }, + { + "epoch": 0.9047633127346915, + "grad_norm": 0.08387361466884613, + "learning_rate": 4.779499050332384e-06, + "loss": 0.0164, + "step": 122400 + }, + { + "epoch": 0.9048372313059934, + "grad_norm": 0.06441134214401245, + "learning_rate": 4.775789411206078e-06, + "loss": 0.0144, + "step": 122410 + }, + { + "epoch": 0.9049111498772952, + "grad_norm": 0.09223717451095581, + "learning_rate": 4.772079772079772e-06, + "loss": 0.0182, + "step": 122420 + }, + { + "epoch": 0.904985068448597, + "grad_norm": 0.08156295120716095, + "learning_rate": 4.7683701329534665e-06, + "loss": 0.0166, + "step": 122430 + }, + { + "epoch": 0.9050589870198988, + "grad_norm": 0.06494969874620438, + "learning_rate": 4.764660493827161e-06, + "loss": 0.0134, + "step": 122440 + }, + { + "epoch": 0.9051329055912007, + "grad_norm": 0.07923052459955215, + "learning_rate": 4.760950854700855e-06, + "loss": 0.0163, + "step": 122450 + }, + { + "epoch": 0.9052068241625025, + "grad_norm": 0.08734191954135895, + "learning_rate": 4.757241215574549e-06, + "loss": 0.0196, + "step": 122460 + }, + { + "epoch": 0.9052807427338044, + "grad_norm": 0.05656689032912254, + "learning_rate": 4.7535315764482436e-06, + "loss": 0.0157, + "step": 122470 + }, + { + "epoch": 0.9053546613051063, + "grad_norm": 0.08763660490512848, + "learning_rate": 4.749821937321938e-06, + "loss": 0.0186, + "step": 122480 + }, + { + "epoch": 0.9054285798764081, + "grad_norm": 0.08455368131399155, + "learning_rate": 4.746112298195632e-06, + "loss": 0.0188, + "step": 122490 + }, + { + "epoch": 0.90550249844771, + "grad_norm": 0.1215326115489006, + "learning_rate": 4.742402659069326e-06, + "loss": 0.0161, + "step": 122500 + }, + { + "epoch": 0.9055764170190118, + "grad_norm": 0.07503091543912888, + "learning_rate": 4.738693019943021e-06, + "loss": 0.0163, + "step": 122510 + }, + { + "epoch": 0.9056503355903137, + "grad_norm": 0.061274394392967224, + "learning_rate": 4.734983380816714e-06, + "loss": 0.0191, + "step": 122520 + }, + { + "epoch": 0.9057242541616156, + "grad_norm": 0.05623488873243332, + "learning_rate": 4.731273741690408e-06, + "loss": 0.0152, + "step": 122530 + }, + { + "epoch": 0.9057981727329174, + "grad_norm": 0.09959295392036438, + "learning_rate": 4.727564102564103e-06, + "loss": 0.016, + "step": 122540 + }, + { + "epoch": 0.9058720913042193, + "grad_norm": 0.06970833986997604, + "learning_rate": 4.723854463437797e-06, + "loss": 0.0168, + "step": 122550 + }, + { + "epoch": 0.9059460098755211, + "grad_norm": 0.0789426937699318, + "learning_rate": 4.720144824311491e-06, + "loss": 0.0146, + "step": 122560 + }, + { + "epoch": 0.906019928446823, + "grad_norm": 0.058944690972566605, + "learning_rate": 4.7164351851851854e-06, + "loss": 0.0154, + "step": 122570 + }, + { + "epoch": 0.9060938470181248, + "grad_norm": 0.11136980354785919, + "learning_rate": 4.71272554605888e-06, + "loss": 0.0192, + "step": 122580 + }, + 
{ + "epoch": 0.9061677655894267, + "grad_norm": 0.07429955899715424, + "learning_rate": 4.709015906932574e-06, + "loss": 0.0179, + "step": 122590 + }, + { + "epoch": 0.9062416841607286, + "grad_norm": 0.06887314468622208, + "learning_rate": 4.705306267806267e-06, + "loss": 0.0161, + "step": 122600 + }, + { + "epoch": 0.9063156027320304, + "grad_norm": 0.06658325344324112, + "learning_rate": 4.701596628679962e-06, + "loss": 0.0186, + "step": 122610 + }, + { + "epoch": 0.9063895213033323, + "grad_norm": 0.09197299927473068, + "learning_rate": 4.697886989553656e-06, + "loss": 0.0161, + "step": 122620 + }, + { + "epoch": 0.9064634398746341, + "grad_norm": 0.08984216302633286, + "learning_rate": 4.69417735042735e-06, + "loss": 0.0177, + "step": 122630 + }, + { + "epoch": 0.906537358445936, + "grad_norm": 0.07162947952747345, + "learning_rate": 4.6904677113010445e-06, + "loss": 0.0157, + "step": 122640 + }, + { + "epoch": 0.9066112770172378, + "grad_norm": 0.06426996737718582, + "learning_rate": 4.68675807217474e-06, + "loss": 0.0163, + "step": 122650 + }, + { + "epoch": 0.9066851955885397, + "grad_norm": 0.09726285934448242, + "learning_rate": 4.683048433048434e-06, + "loss": 0.0179, + "step": 122660 + }, + { + "epoch": 0.9067591141598416, + "grad_norm": 0.09400387853384018, + "learning_rate": 4.679338793922128e-06, + "loss": 0.0158, + "step": 122670 + }, + { + "epoch": 0.9068330327311434, + "grad_norm": 0.08956044912338257, + "learning_rate": 4.675629154795822e-06, + "loss": 0.0155, + "step": 122680 + }, + { + "epoch": 0.9069069513024453, + "grad_norm": 0.07334977388381958, + "learning_rate": 4.671919515669516e-06, + "loss": 0.0173, + "step": 122690 + }, + { + "epoch": 0.906980869873747, + "grad_norm": 0.05719450116157532, + "learning_rate": 4.66820987654321e-06, + "loss": 0.0183, + "step": 122700 + }, + { + "epoch": 0.9070547884450489, + "grad_norm": 0.059781696647405624, + "learning_rate": 4.664500237416904e-06, + "loss": 0.0164, + "step": 122710 + }, + { + "epoch": 0.9071287070163507, + "grad_norm": 0.0924462080001831, + "learning_rate": 4.660790598290599e-06, + "loss": 0.0166, + "step": 122720 + }, + { + "epoch": 0.9072026255876526, + "grad_norm": 0.06804405897855759, + "learning_rate": 4.657080959164293e-06, + "loss": 0.0173, + "step": 122730 + }, + { + "epoch": 0.9072765441589545, + "grad_norm": 0.08799871802330017, + "learning_rate": 4.653371320037987e-06, + "loss": 0.0186, + "step": 122740 + }, + { + "epoch": 0.9073504627302563, + "grad_norm": 0.08435172587633133, + "learning_rate": 4.649661680911681e-06, + "loss": 0.0163, + "step": 122750 + }, + { + "epoch": 0.9074243813015582, + "grad_norm": 0.09052610397338867, + "learning_rate": 4.645952041785375e-06, + "loss": 0.019, + "step": 122760 + }, + { + "epoch": 0.90749829987286, + "grad_norm": 0.055818017572164536, + "learning_rate": 4.642242402659069e-06, + "loss": 0.0177, + "step": 122770 + }, + { + "epoch": 0.9075722184441619, + "grad_norm": 0.07775738090276718, + "learning_rate": 4.6385327635327635e-06, + "loss": 0.0165, + "step": 122780 + }, + { + "epoch": 0.9076461370154638, + "grad_norm": 0.10018419474363327, + "learning_rate": 4.634823124406458e-06, + "loss": 0.0179, + "step": 122790 + }, + { + "epoch": 0.9077200555867656, + "grad_norm": 0.068797767162323, + "learning_rate": 4.631113485280152e-06, + "loss": 0.0183, + "step": 122800 + }, + { + "epoch": 0.9077939741580675, + "grad_norm": 0.07335837185382843, + "learning_rate": 4.627403846153846e-06, + "loss": 0.0173, + "step": 122810 + }, + { + "epoch": 0.9078678927293693, + 
"grad_norm": 0.05817072093486786, + "learning_rate": 4.6236942070275406e-06, + "loss": 0.0157, + "step": 122820 + }, + { + "epoch": 0.9079418113006712, + "grad_norm": 0.0800141766667366, + "learning_rate": 4.619984567901235e-06, + "loss": 0.0176, + "step": 122830 + }, + { + "epoch": 0.908015729871973, + "grad_norm": 0.12860678136348724, + "learning_rate": 4.616274928774929e-06, + "loss": 0.0183, + "step": 122840 + }, + { + "epoch": 0.9080896484432749, + "grad_norm": 0.09741950035095215, + "learning_rate": 4.612565289648623e-06, + "loss": 0.0187, + "step": 122850 + }, + { + "epoch": 0.9081635670145768, + "grad_norm": 0.06886820495128632, + "learning_rate": 4.608855650522318e-06, + "loss": 0.0174, + "step": 122860 + }, + { + "epoch": 0.9082374855858786, + "grad_norm": 0.0636502280831337, + "learning_rate": 4.605146011396012e-06, + "loss": 0.0184, + "step": 122870 + }, + { + "epoch": 0.9083114041571805, + "grad_norm": 0.06838538497686386, + "learning_rate": 4.601436372269706e-06, + "loss": 0.017, + "step": 122880 + }, + { + "epoch": 0.9083853227284823, + "grad_norm": 0.0679185763001442, + "learning_rate": 4.5977267331434005e-06, + "loss": 0.0171, + "step": 122890 + }, + { + "epoch": 0.9084592412997842, + "grad_norm": 0.08525998890399933, + "learning_rate": 4.594017094017095e-06, + "loss": 0.0181, + "step": 122900 + }, + { + "epoch": 0.908533159871086, + "grad_norm": 0.10945747792720795, + "learning_rate": 4.590307454890788e-06, + "loss": 0.0172, + "step": 122910 + }, + { + "epoch": 0.9086070784423879, + "grad_norm": 0.06596176326274872, + "learning_rate": 4.5865978157644825e-06, + "loss": 0.0168, + "step": 122920 + }, + { + "epoch": 0.9086809970136898, + "grad_norm": 0.06511720269918442, + "learning_rate": 4.582888176638177e-06, + "loss": 0.0165, + "step": 122930 + }, + { + "epoch": 0.9087549155849916, + "grad_norm": 0.10036970674991608, + "learning_rate": 4.579178537511871e-06, + "loss": 0.02, + "step": 122940 + }, + { + "epoch": 0.9088288341562935, + "grad_norm": 0.07263343781232834, + "learning_rate": 4.575468898385565e-06, + "loss": 0.0181, + "step": 122950 + }, + { + "epoch": 0.9089027527275952, + "grad_norm": 0.06853675842285156, + "learning_rate": 4.5717592592592595e-06, + "loss": 0.0164, + "step": 122960 + }, + { + "epoch": 0.9089766712988971, + "grad_norm": 0.08699023723602295, + "learning_rate": 4.568049620132954e-06, + "loss": 0.0161, + "step": 122970 + }, + { + "epoch": 0.909050589870199, + "grad_norm": 0.08658810704946518, + "learning_rate": 4.564339981006647e-06, + "loss": 0.0181, + "step": 122980 + }, + { + "epoch": 0.9091245084415008, + "grad_norm": 0.10963263362646103, + "learning_rate": 4.5606303418803415e-06, + "loss": 0.0182, + "step": 122990 + }, + { + "epoch": 0.9091984270128027, + "grad_norm": 0.09154821932315826, + "learning_rate": 4.556920702754036e-06, + "loss": 0.0169, + "step": 123000 + }, + { + "epoch": 0.9092723455841045, + "grad_norm": 0.07183562219142914, + "learning_rate": 4.55321106362773e-06, + "loss": 0.0141, + "step": 123010 + }, + { + "epoch": 0.9093462641554064, + "grad_norm": 0.07396021485328674, + "learning_rate": 4.549501424501424e-06, + "loss": 0.0173, + "step": 123020 + }, + { + "epoch": 0.9094201827267082, + "grad_norm": 0.07018588483333588, + "learning_rate": 4.5457917853751195e-06, + "loss": 0.0172, + "step": 123030 + }, + { + "epoch": 0.9094941012980101, + "grad_norm": 0.08253616839647293, + "learning_rate": 4.542082146248814e-06, + "loss": 0.016, + "step": 123040 + }, + { + "epoch": 0.909568019869312, + "grad_norm": 0.11334005743265152, + 
"learning_rate": 4.538372507122508e-06, + "loss": 0.0162, + "step": 123050 + }, + { + "epoch": 0.9096419384406138, + "grad_norm": 0.09180520474910736, + "learning_rate": 4.5346628679962014e-06, + "loss": 0.0167, + "step": 123060 + }, + { + "epoch": 0.9097158570119157, + "grad_norm": 0.08304790407419205, + "learning_rate": 4.530953228869896e-06, + "loss": 0.0169, + "step": 123070 + }, + { + "epoch": 0.9097897755832175, + "grad_norm": 0.08140013366937637, + "learning_rate": 4.52724358974359e-06, + "loss": 0.0149, + "step": 123080 + }, + { + "epoch": 0.9098636941545194, + "grad_norm": 0.07130993902683258, + "learning_rate": 4.523533950617284e-06, + "loss": 0.0187, + "step": 123090 + }, + { + "epoch": 0.9099376127258212, + "grad_norm": 0.07848509401082993, + "learning_rate": 4.5198243114909785e-06, + "loss": 0.0163, + "step": 123100 + }, + { + "epoch": 0.9100115312971231, + "grad_norm": 0.07500548660755157, + "learning_rate": 4.516114672364673e-06, + "loss": 0.0167, + "step": 123110 + }, + { + "epoch": 0.910085449868425, + "grad_norm": 0.06861239671707153, + "learning_rate": 4.512405033238367e-06, + "loss": 0.0144, + "step": 123120 + }, + { + "epoch": 0.9101593684397268, + "grad_norm": 0.07011248916387558, + "learning_rate": 4.508695394112061e-06, + "loss": 0.0182, + "step": 123130 + }, + { + "epoch": 0.9102332870110287, + "grad_norm": 0.07584835588932037, + "learning_rate": 4.504985754985755e-06, + "loss": 0.0162, + "step": 123140 + }, + { + "epoch": 0.9103072055823305, + "grad_norm": 0.07313469797372818, + "learning_rate": 4.501276115859449e-06, + "loss": 0.0191, + "step": 123150 + }, + { + "epoch": 0.9103811241536324, + "grad_norm": 0.07153768092393875, + "learning_rate": 4.497566476733143e-06, + "loss": 0.0157, + "step": 123160 + }, + { + "epoch": 0.9104550427249342, + "grad_norm": 0.059805650264024734, + "learning_rate": 4.493856837606838e-06, + "loss": 0.0146, + "step": 123170 + }, + { + "epoch": 0.9105289612962361, + "grad_norm": 0.07940264791250229, + "learning_rate": 4.490147198480532e-06, + "loss": 0.0141, + "step": 123180 + }, + { + "epoch": 0.910602879867538, + "grad_norm": 0.07751365751028061, + "learning_rate": 4.486437559354226e-06, + "loss": 0.0144, + "step": 123190 + }, + { + "epoch": 0.9106767984388398, + "grad_norm": 0.07345723360776901, + "learning_rate": 4.48272792022792e-06, + "loss": 0.0177, + "step": 123200 + }, + { + "epoch": 0.9107507170101417, + "grad_norm": 0.14844071865081787, + "learning_rate": 4.479018281101615e-06, + "loss": 0.0163, + "step": 123210 + }, + { + "epoch": 0.9108246355814434, + "grad_norm": 0.0766979455947876, + "learning_rate": 4.475308641975309e-06, + "loss": 0.0147, + "step": 123220 + }, + { + "epoch": 0.9108985541527453, + "grad_norm": 0.054067812860012054, + "learning_rate": 4.471599002849003e-06, + "loss": 0.0146, + "step": 123230 + }, + { + "epoch": 0.9109724727240472, + "grad_norm": 0.16767457127571106, + "learning_rate": 4.4678893637226975e-06, + "loss": 0.0187, + "step": 123240 + }, + { + "epoch": 0.911046391295349, + "grad_norm": 0.07993284612894058, + "learning_rate": 4.464179724596392e-06, + "loss": 0.0174, + "step": 123250 + }, + { + "epoch": 0.9111203098666509, + "grad_norm": 0.06665073335170746, + "learning_rate": 4.460470085470086e-06, + "loss": 0.0168, + "step": 123260 + }, + { + "epoch": 0.9111942284379527, + "grad_norm": 0.10466733574867249, + "learning_rate": 4.45676044634378e-06, + "loss": 0.0186, + "step": 123270 + }, + { + "epoch": 0.9112681470092546, + "grad_norm": 0.08831361681222916, + "learning_rate": 
4.453050807217475e-06, + "loss": 0.0165, + "step": 123280 + }, + { + "epoch": 0.9113420655805564, + "grad_norm": 0.08201731741428375, + "learning_rate": 4.449341168091168e-06, + "loss": 0.0175, + "step": 123290 + }, + { + "epoch": 0.9114159841518583, + "grad_norm": 0.066454216837883, + "learning_rate": 4.445631528964862e-06, + "loss": 0.0168, + "step": 123300 + }, + { + "epoch": 0.9114899027231602, + "grad_norm": 0.07183511555194855, + "learning_rate": 4.4419218898385566e-06, + "loss": 0.018, + "step": 123310 + }, + { + "epoch": 0.911563821294462, + "grad_norm": 0.08452675491571426, + "learning_rate": 4.438212250712251e-06, + "loss": 0.0174, + "step": 123320 + }, + { + "epoch": 0.9116377398657639, + "grad_norm": 0.07579536736011505, + "learning_rate": 4.434502611585945e-06, + "loss": 0.0171, + "step": 123330 + }, + { + "epoch": 0.9117116584370657, + "grad_norm": 0.0779586210846901, + "learning_rate": 4.430792972459639e-06, + "loss": 0.0166, + "step": 123340 + }, + { + "epoch": 0.9117855770083676, + "grad_norm": 0.08135387301445007, + "learning_rate": 4.427083333333334e-06, + "loss": 0.0152, + "step": 123350 + }, + { + "epoch": 0.9118594955796694, + "grad_norm": 0.08736226707696915, + "learning_rate": 4.423373694207028e-06, + "loss": 0.017, + "step": 123360 + }, + { + "epoch": 0.9119334141509713, + "grad_norm": 0.07891429215669632, + "learning_rate": 4.419664055080721e-06, + "loss": 0.0164, + "step": 123370 + }, + { + "epoch": 0.9120073327222732, + "grad_norm": 0.10436808317899704, + "learning_rate": 4.415954415954416e-06, + "loss": 0.0182, + "step": 123380 + }, + { + "epoch": 0.912081251293575, + "grad_norm": 0.07729527354240417, + "learning_rate": 4.41224477682811e-06, + "loss": 0.0167, + "step": 123390 + }, + { + "epoch": 0.9121551698648769, + "grad_norm": 0.06779775768518448, + "learning_rate": 4.408535137701804e-06, + "loss": 0.0139, + "step": 123400 + }, + { + "epoch": 0.9122290884361787, + "grad_norm": 0.09348805993795395, + "learning_rate": 4.404825498575499e-06, + "loss": 0.0151, + "step": 123410 + }, + { + "epoch": 0.9123030070074806, + "grad_norm": 0.07028793543577194, + "learning_rate": 4.4011158594491936e-06, + "loss": 0.0168, + "step": 123420 + }, + { + "epoch": 0.9123769255787824, + "grad_norm": 0.10762038826942444, + "learning_rate": 4.397406220322888e-06, + "loss": 0.0179, + "step": 123430 + }, + { + "epoch": 0.9124508441500843, + "grad_norm": 0.08728470653295517, + "learning_rate": 4.393696581196581e-06, + "loss": 0.0177, + "step": 123440 + }, + { + "epoch": 0.9125247627213862, + "grad_norm": 0.07320371270179749, + "learning_rate": 4.3899869420702755e-06, + "loss": 0.0151, + "step": 123450 + }, + { + "epoch": 0.912598681292688, + "grad_norm": 0.10701316595077515, + "learning_rate": 4.38627730294397e-06, + "loss": 0.0181, + "step": 123460 + }, + { + "epoch": 0.9126725998639899, + "grad_norm": 0.07376556843519211, + "learning_rate": 4.382567663817664e-06, + "loss": 0.0164, + "step": 123470 + }, + { + "epoch": 0.9127465184352916, + "grad_norm": 0.07854204624891281, + "learning_rate": 4.378858024691358e-06, + "loss": 0.019, + "step": 123480 + }, + { + "epoch": 0.9128204370065935, + "grad_norm": 0.0887388065457344, + "learning_rate": 4.375148385565053e-06, + "loss": 0.0192, + "step": 123490 + }, + { + "epoch": 0.9128943555778954, + "grad_norm": 0.0878068134188652, + "learning_rate": 4.371438746438747e-06, + "loss": 0.0171, + "step": 123500 + }, + { + "epoch": 0.9129682741491972, + "grad_norm": 0.0852675661444664, + "learning_rate": 4.367729107312441e-06, + "loss": 0.0136, + 
"step": 123510 + }, + { + "epoch": 0.9130421927204991, + "grad_norm": 0.08798334002494812, + "learning_rate": 4.364019468186135e-06, + "loss": 0.0188, + "step": 123520 + }, + { + "epoch": 0.9131161112918009, + "grad_norm": 0.06569161266088486, + "learning_rate": 4.360309829059829e-06, + "loss": 0.0176, + "step": 123530 + }, + { + "epoch": 0.9131900298631028, + "grad_norm": 0.05657649040222168, + "learning_rate": 4.356600189933523e-06, + "loss": 0.0163, + "step": 123540 + }, + { + "epoch": 0.9132639484344046, + "grad_norm": 0.07725798338651657, + "learning_rate": 4.352890550807217e-06, + "loss": 0.0179, + "step": 123550 + }, + { + "epoch": 0.9133378670057065, + "grad_norm": 0.07964330911636353, + "learning_rate": 4.349180911680912e-06, + "loss": 0.018, + "step": 123560 + }, + { + "epoch": 0.9134117855770084, + "grad_norm": 0.0666557103395462, + "learning_rate": 4.345471272554606e-06, + "loss": 0.0189, + "step": 123570 + }, + { + "epoch": 0.9134857041483102, + "grad_norm": 0.11181606352329254, + "learning_rate": 4.3417616334283e-06, + "loss": 0.0175, + "step": 123580 + }, + { + "epoch": 0.9135596227196121, + "grad_norm": 0.07617869973182678, + "learning_rate": 4.3380519943019945e-06, + "loss": 0.0156, + "step": 123590 + }, + { + "epoch": 0.9136335412909139, + "grad_norm": 0.0608520433306694, + "learning_rate": 4.334342355175689e-06, + "loss": 0.0158, + "step": 123600 + }, + { + "epoch": 0.9137074598622158, + "grad_norm": 0.07510219514369965, + "learning_rate": 4.330632716049383e-06, + "loss": 0.0167, + "step": 123610 + }, + { + "epoch": 0.9137813784335176, + "grad_norm": 0.07799938321113586, + "learning_rate": 4.326923076923077e-06, + "loss": 0.0145, + "step": 123620 + }, + { + "epoch": 0.9138552970048195, + "grad_norm": 0.09128500521183014, + "learning_rate": 4.323213437796772e-06, + "loss": 0.0195, + "step": 123630 + }, + { + "epoch": 0.9139292155761214, + "grad_norm": 0.07458452880382538, + "learning_rate": 4.319503798670466e-06, + "loss": 0.0159, + "step": 123640 + }, + { + "epoch": 0.9140031341474232, + "grad_norm": 0.06051633879542351, + "learning_rate": 4.31579415954416e-06, + "loss": 0.0158, + "step": 123650 + }, + { + "epoch": 0.9140770527187251, + "grad_norm": 0.068628691136837, + "learning_rate": 4.312084520417854e-06, + "loss": 0.0176, + "step": 123660 + }, + { + "epoch": 0.9141509712900269, + "grad_norm": 0.05428208038210869, + "learning_rate": 4.308374881291549e-06, + "loss": 0.0146, + "step": 123670 + }, + { + "epoch": 0.9142248898613288, + "grad_norm": 0.08900542557239532, + "learning_rate": 4.304665242165242e-06, + "loss": 0.0169, + "step": 123680 + }, + { + "epoch": 0.9142988084326306, + "grad_norm": 0.05679711326956749, + "learning_rate": 4.300955603038936e-06, + "loss": 0.0184, + "step": 123690 + }, + { + "epoch": 0.9143727270039325, + "grad_norm": 0.07910363376140594, + "learning_rate": 4.297245963912631e-06, + "loss": 0.0168, + "step": 123700 + }, + { + "epoch": 0.9144466455752344, + "grad_norm": 0.06929022818803787, + "learning_rate": 4.293536324786325e-06, + "loss": 0.0179, + "step": 123710 + }, + { + "epoch": 0.9145205641465362, + "grad_norm": 0.08467814326286316, + "learning_rate": 4.289826685660019e-06, + "loss": 0.0148, + "step": 123720 + }, + { + "epoch": 0.914594482717838, + "grad_norm": 0.08828160911798477, + "learning_rate": 4.2861170465337135e-06, + "loss": 0.0164, + "step": 123730 + }, + { + "epoch": 0.9146684012891398, + "grad_norm": 0.08820133656263351, + "learning_rate": 4.282407407407408e-06, + "loss": 0.0158, + "step": 123740 + }, + { + "epoch": 
0.9147423198604417, + "grad_norm": 0.06262107938528061, + "learning_rate": 4.278697768281101e-06, + "loss": 0.0197, + "step": 123750 + }, + { + "epoch": 0.9148162384317436, + "grad_norm": 0.07561814039945602, + "learning_rate": 4.2749881291547955e-06, + "loss": 0.0179, + "step": 123760 + }, + { + "epoch": 0.9148901570030454, + "grad_norm": 0.08782166242599487, + "learning_rate": 4.27127849002849e-06, + "loss": 0.0178, + "step": 123770 + }, + { + "epoch": 0.9149640755743473, + "grad_norm": 0.07191344350576401, + "learning_rate": 4.267568850902184e-06, + "loss": 0.0177, + "step": 123780 + }, + { + "epoch": 0.9150379941456491, + "grad_norm": 0.10342707484960556, + "learning_rate": 4.263859211775878e-06, + "loss": 0.0186, + "step": 123790 + }, + { + "epoch": 0.915111912716951, + "grad_norm": 0.0876234620809555, + "learning_rate": 4.260149572649573e-06, + "loss": 0.0171, + "step": 123800 + }, + { + "epoch": 0.9151858312882528, + "grad_norm": 0.07162667065858841, + "learning_rate": 4.256439933523268e-06, + "loss": 0.0185, + "step": 123810 + }, + { + "epoch": 0.9152597498595547, + "grad_norm": 0.07592875510454178, + "learning_rate": 4.252730294396962e-06, + "loss": 0.0168, + "step": 123820 + }, + { + "epoch": 0.9153336684308566, + "grad_norm": 0.08524484187364578, + "learning_rate": 4.249020655270655e-06, + "loss": 0.0176, + "step": 123830 + }, + { + "epoch": 0.9154075870021584, + "grad_norm": 0.09293634444475174, + "learning_rate": 4.24531101614435e-06, + "loss": 0.0178, + "step": 123840 + }, + { + "epoch": 0.9154815055734603, + "grad_norm": 0.10401646792888641, + "learning_rate": 4.241601377018044e-06, + "loss": 0.0167, + "step": 123850 + }, + { + "epoch": 0.9155554241447621, + "grad_norm": 0.0697089210152626, + "learning_rate": 4.237891737891738e-06, + "loss": 0.0171, + "step": 123860 + }, + { + "epoch": 0.915629342716064, + "grad_norm": 0.06575757265090942, + "learning_rate": 4.2341820987654325e-06, + "loss": 0.0177, + "step": 123870 + }, + { + "epoch": 0.9157032612873658, + "grad_norm": 0.05003274977207184, + "learning_rate": 4.230472459639127e-06, + "loss": 0.0172, + "step": 123880 + }, + { + "epoch": 0.9157771798586677, + "grad_norm": 0.08424724638462067, + "learning_rate": 4.226762820512821e-06, + "loss": 0.0207, + "step": 123890 + }, + { + "epoch": 0.9158510984299696, + "grad_norm": 0.06601136922836304, + "learning_rate": 4.223053181386515e-06, + "loss": 0.0188, + "step": 123900 + }, + { + "epoch": 0.9159250170012714, + "grad_norm": 0.082279734313488, + "learning_rate": 4.219343542260209e-06, + "loss": 0.0167, + "step": 123910 + }, + { + "epoch": 0.9159989355725733, + "grad_norm": 0.06980005651712418, + "learning_rate": 4.215633903133903e-06, + "loss": 0.0175, + "step": 123920 + }, + { + "epoch": 0.9160728541438751, + "grad_norm": 0.08342483639717102, + "learning_rate": 4.211924264007597e-06, + "loss": 0.0176, + "step": 123930 + }, + { + "epoch": 0.916146772715177, + "grad_norm": 0.07356125116348267, + "learning_rate": 4.2082146248812915e-06, + "loss": 0.0143, + "step": 123940 + }, + { + "epoch": 0.9162206912864788, + "grad_norm": 0.06317978352308273, + "learning_rate": 4.204504985754986e-06, + "loss": 0.0179, + "step": 123950 + }, + { + "epoch": 0.9162946098577807, + "grad_norm": 0.08833855390548706, + "learning_rate": 4.20079534662868e-06, + "loss": 0.0164, + "step": 123960 + }, + { + "epoch": 0.9163685284290826, + "grad_norm": 0.07598882913589478, + "learning_rate": 4.197085707502374e-06, + "loss": 0.0166, + "step": 123970 + }, + { + "epoch": 0.9164424470003844, + "grad_norm": 
0.08852225542068481, + "learning_rate": 4.193376068376069e-06, + "loss": 0.0167, + "step": 123980 + }, + { + "epoch": 0.9165163655716863, + "grad_norm": 0.08856448531150818, + "learning_rate": 4.189666429249763e-06, + "loss": 0.0187, + "step": 123990 + }, + { + "epoch": 0.916590284142988, + "grad_norm": 0.08729839324951172, + "learning_rate": 4.185956790123457e-06, + "loss": 0.0182, + "step": 124000 + }, + { + "epoch": 0.9166642027142899, + "grad_norm": 0.07654992491006851, + "learning_rate": 4.1822471509971514e-06, + "loss": 0.0146, + "step": 124010 + }, + { + "epoch": 0.9167381212855918, + "grad_norm": 0.06819306313991547, + "learning_rate": 4.178537511870846e-06, + "loss": 0.017, + "step": 124020 + }, + { + "epoch": 0.9168120398568936, + "grad_norm": 0.059470996260643005, + "learning_rate": 4.17482787274454e-06, + "loss": 0.0181, + "step": 124030 + }, + { + "epoch": 0.9168859584281955, + "grad_norm": 0.0711124911904335, + "learning_rate": 4.171118233618234e-06, + "loss": 0.0163, + "step": 124040 + }, + { + "epoch": 0.9169598769994973, + "grad_norm": 0.09192194789648056, + "learning_rate": 4.1674085944919285e-06, + "loss": 0.0171, + "step": 124050 + }, + { + "epoch": 0.9170337955707992, + "grad_norm": 0.07689784467220306, + "learning_rate": 4.163698955365622e-06, + "loss": 0.0167, + "step": 124060 + }, + { + "epoch": 0.917107714142101, + "grad_norm": 0.0563366636633873, + "learning_rate": 4.159989316239316e-06, + "loss": 0.016, + "step": 124070 + }, + { + "epoch": 0.9171816327134029, + "grad_norm": 0.10087716579437256, + "learning_rate": 4.1562796771130105e-06, + "loss": 0.0161, + "step": 124080 + }, + { + "epoch": 0.9172555512847048, + "grad_norm": 0.12589262425899506, + "learning_rate": 4.152570037986705e-06, + "loss": 0.0194, + "step": 124090 + }, + { + "epoch": 0.9173294698560066, + "grad_norm": 0.06327742338180542, + "learning_rate": 4.148860398860399e-06, + "loss": 0.0153, + "step": 124100 + }, + { + "epoch": 0.9174033884273085, + "grad_norm": 0.09386986494064331, + "learning_rate": 4.145150759734093e-06, + "loss": 0.0151, + "step": 124110 + }, + { + "epoch": 0.9174773069986103, + "grad_norm": 0.07422593981027603, + "learning_rate": 4.141441120607788e-06, + "loss": 0.0186, + "step": 124120 + }, + { + "epoch": 0.9175512255699122, + "grad_norm": 0.07547813653945923, + "learning_rate": 4.137731481481482e-06, + "loss": 0.0137, + "step": 124130 + }, + { + "epoch": 0.917625144141214, + "grad_norm": 0.06815105676651001, + "learning_rate": 4.134021842355175e-06, + "loss": 0.0182, + "step": 124140 + }, + { + "epoch": 0.9176990627125159, + "grad_norm": 0.08372704684734344, + "learning_rate": 4.1303122032288696e-06, + "loss": 0.0165, + "step": 124150 + }, + { + "epoch": 0.9177729812838178, + "grad_norm": 0.06949080526828766, + "learning_rate": 4.126602564102564e-06, + "loss": 0.0171, + "step": 124160 + }, + { + "epoch": 0.9178468998551196, + "grad_norm": 0.07713343948125839, + "learning_rate": 4.122892924976258e-06, + "loss": 0.0166, + "step": 124170 + }, + { + "epoch": 0.9179208184264215, + "grad_norm": 0.05164538323879242, + "learning_rate": 4.119183285849953e-06, + "loss": 0.019, + "step": 124180 + }, + { + "epoch": 0.9179947369977233, + "grad_norm": 0.08802802860736847, + "learning_rate": 4.1154736467236475e-06, + "loss": 0.019, + "step": 124190 + }, + { + "epoch": 0.9180686555690252, + "grad_norm": 0.043701138347387314, + "learning_rate": 4.111764007597342e-06, + "loss": 0.0164, + "step": 124200 + }, + { + "epoch": 0.918142574140327, + "grad_norm": 0.05825362727046013, + "learning_rate": 
4.108054368471035e-06, + "loss": 0.0176, + "step": 124210 + }, + { + "epoch": 0.9182164927116289, + "grad_norm": 0.07954519242048264, + "learning_rate": 4.1043447293447295e-06, + "loss": 0.0177, + "step": 124220 + }, + { + "epoch": 0.9182904112829308, + "grad_norm": 0.0765024796128273, + "learning_rate": 4.100635090218424e-06, + "loss": 0.0178, + "step": 124230 + }, + { + "epoch": 0.9183643298542326, + "grad_norm": 0.08571942150592804, + "learning_rate": 4.096925451092118e-06, + "loss": 0.0172, + "step": 124240 + }, + { + "epoch": 0.9184382484255345, + "grad_norm": 0.046985041350126266, + "learning_rate": 4.093215811965812e-06, + "loss": 0.015, + "step": 124250 + }, + { + "epoch": 0.9185121669968362, + "grad_norm": 0.0847211554646492, + "learning_rate": 4.0895061728395066e-06, + "loss": 0.0169, + "step": 124260 + }, + { + "epoch": 0.9185860855681381, + "grad_norm": 0.0697716549038887, + "learning_rate": 4.085796533713201e-06, + "loss": 0.0185, + "step": 124270 + }, + { + "epoch": 0.91866000413944, + "grad_norm": 0.06355872005224228, + "learning_rate": 4.082086894586895e-06, + "loss": 0.0153, + "step": 124280 + }, + { + "epoch": 0.9187339227107418, + "grad_norm": 0.11508794873952866, + "learning_rate": 4.0783772554605885e-06, + "loss": 0.0168, + "step": 124290 + }, + { + "epoch": 0.9188078412820437, + "grad_norm": 0.07200739532709122, + "learning_rate": 4.074667616334283e-06, + "loss": 0.0175, + "step": 124300 + }, + { + "epoch": 0.9188817598533455, + "grad_norm": 0.09681064635515213, + "learning_rate": 4.070957977207977e-06, + "loss": 0.0173, + "step": 124310 + }, + { + "epoch": 0.9189556784246474, + "grad_norm": 0.06718669086694717, + "learning_rate": 4.067248338081671e-06, + "loss": 0.0163, + "step": 124320 + }, + { + "epoch": 0.9190295969959492, + "grad_norm": 0.05894205719232559, + "learning_rate": 4.063538698955366e-06, + "loss": 0.0152, + "step": 124330 + }, + { + "epoch": 0.9191035155672511, + "grad_norm": 0.14502786099910736, + "learning_rate": 4.05982905982906e-06, + "loss": 0.0198, + "step": 124340 + }, + { + "epoch": 0.919177434138553, + "grad_norm": 0.07610470056533813, + "learning_rate": 4.056119420702754e-06, + "loss": 0.0142, + "step": 124350 + }, + { + "epoch": 0.9192513527098548, + "grad_norm": 0.07440314441919327, + "learning_rate": 4.0524097815764484e-06, + "loss": 0.0149, + "step": 124360 + }, + { + "epoch": 0.9193252712811567, + "grad_norm": 0.05952728912234306, + "learning_rate": 4.048700142450143e-06, + "loss": 0.0153, + "step": 124370 + }, + { + "epoch": 0.9193991898524585, + "grad_norm": 0.08123992383480072, + "learning_rate": 4.044990503323837e-06, + "loss": 0.0171, + "step": 124380 + }, + { + "epoch": 0.9194731084237604, + "grad_norm": 0.08689317107200623, + "learning_rate": 4.041280864197531e-06, + "loss": 0.015, + "step": 124390 + }, + { + "epoch": 0.9195470269950622, + "grad_norm": 0.06676415354013443, + "learning_rate": 4.0375712250712255e-06, + "loss": 0.0155, + "step": 124400 + }, + { + "epoch": 0.9196209455663641, + "grad_norm": 0.06977392733097076, + "learning_rate": 4.03386158594492e-06, + "loss": 0.0163, + "step": 124410 + }, + { + "epoch": 0.919694864137666, + "grad_norm": 0.07469875365495682, + "learning_rate": 4.030151946818614e-06, + "loss": 0.0127, + "step": 124420 + }, + { + "epoch": 0.9197687827089678, + "grad_norm": 0.06375137716531754, + "learning_rate": 4.026442307692308e-06, + "loss": 0.0152, + "step": 124430 + }, + { + "epoch": 0.9198427012802697, + "grad_norm": 0.06735789030790329, + "learning_rate": 4.022732668566002e-06, + "loss": 0.018, 
+ "step": 124440 + }, + { + "epoch": 0.9199166198515715, + "grad_norm": 0.09881290048360825, + "learning_rate": 4.019023029439696e-06, + "loss": 0.0198, + "step": 124450 + }, + { + "epoch": 0.9199905384228734, + "grad_norm": 0.13130244612693787, + "learning_rate": 4.01531339031339e-06, + "loss": 0.0203, + "step": 124460 + }, + { + "epoch": 0.9200644569941752, + "grad_norm": 0.08290501683950424, + "learning_rate": 4.011603751187085e-06, + "loss": 0.0196, + "step": 124470 + }, + { + "epoch": 0.9201383755654771, + "grad_norm": 0.07831768691539764, + "learning_rate": 4.007894112060779e-06, + "loss": 0.0176, + "step": 124480 + }, + { + "epoch": 0.920212294136779, + "grad_norm": 0.10180113464593887, + "learning_rate": 4.004184472934473e-06, + "loss": 0.0169, + "step": 124490 + }, + { + "epoch": 0.9202862127080808, + "grad_norm": 0.05810206010937691, + "learning_rate": 4.000474833808167e-06, + "loss": 0.017, + "step": 124500 + }, + { + "epoch": 0.9203601312793827, + "grad_norm": 0.06321538984775543, + "learning_rate": 3.996765194681862e-06, + "loss": 0.0154, + "step": 124510 + }, + { + "epoch": 0.9204340498506844, + "grad_norm": 0.11284110695123672, + "learning_rate": 3.993055555555555e-06, + "loss": 0.0161, + "step": 124520 + }, + { + "epoch": 0.9205079684219863, + "grad_norm": 0.07424329966306686, + "learning_rate": 3.989345916429249e-06, + "loss": 0.0159, + "step": 124530 + }, + { + "epoch": 0.9205818869932882, + "grad_norm": 0.05204179137945175, + "learning_rate": 3.985636277302944e-06, + "loss": 0.016, + "step": 124540 + }, + { + "epoch": 0.92065580556459, + "grad_norm": 0.0912189781665802, + "learning_rate": 3.981926638176638e-06, + "loss": 0.0176, + "step": 124550 + }, + { + "epoch": 0.9207297241358919, + "grad_norm": 0.08071061968803406, + "learning_rate": 3.978216999050333e-06, + "loss": 0.0169, + "step": 124560 + }, + { + "epoch": 0.9208036427071937, + "grad_norm": 0.08048860728740692, + "learning_rate": 3.974507359924027e-06, + "loss": 0.0162, + "step": 124570 + }, + { + "epoch": 0.9208775612784956, + "grad_norm": 0.09860378503799438, + "learning_rate": 3.970797720797722e-06, + "loss": 0.0173, + "step": 124580 + }, + { + "epoch": 0.9209514798497974, + "grad_norm": 0.06638229638338089, + "learning_rate": 3.967088081671416e-06, + "loss": 0.0176, + "step": 124590 + }, + { + "epoch": 0.9210253984210993, + "grad_norm": 0.09299634397029877, + "learning_rate": 3.963378442545109e-06, + "loss": 0.0187, + "step": 124600 + }, + { + "epoch": 0.9210993169924012, + "grad_norm": 0.07731974869966507, + "learning_rate": 3.9596688034188036e-06, + "loss": 0.0169, + "step": 124610 + }, + { + "epoch": 0.921173235563703, + "grad_norm": 0.05448630824685097, + "learning_rate": 3.955959164292498e-06, + "loss": 0.0147, + "step": 124620 + }, + { + "epoch": 0.9212471541350049, + "grad_norm": 0.0719091147184372, + "learning_rate": 3.952249525166192e-06, + "loss": 0.016, + "step": 124630 + }, + { + "epoch": 0.9213210727063067, + "grad_norm": 0.09676989912986755, + "learning_rate": 3.948539886039886e-06, + "loss": 0.0157, + "step": 124640 + }, + { + "epoch": 0.9213949912776086, + "grad_norm": 0.07981038093566895, + "learning_rate": 3.944830246913581e-06, + "loss": 0.0186, + "step": 124650 + }, + { + "epoch": 0.9214689098489104, + "grad_norm": 0.06857098639011383, + "learning_rate": 3.941120607787275e-06, + "loss": 0.0188, + "step": 124660 + }, + { + "epoch": 0.9215428284202123, + "grad_norm": 0.06626041978597641, + "learning_rate": 3.937410968660968e-06, + "loss": 0.0164, + "step": 124670 + }, + { + "epoch": 
0.9216167469915142, + "grad_norm": 0.08632654696702957, + "learning_rate": 3.933701329534663e-06, + "loss": 0.0155, + "step": 124680 + }, + { + "epoch": 0.921690665562816, + "grad_norm": 0.06686916947364807, + "learning_rate": 3.929991690408357e-06, + "loss": 0.0158, + "step": 124690 + }, + { + "epoch": 0.9217645841341179, + "grad_norm": 0.11379338055849075, + "learning_rate": 3.926282051282051e-06, + "loss": 0.0169, + "step": 124700 + }, + { + "epoch": 0.9218385027054197, + "grad_norm": 0.05981620028614998, + "learning_rate": 3.9225724121557454e-06, + "loss": 0.018, + "step": 124710 + }, + { + "epoch": 0.9219124212767216, + "grad_norm": 0.0945410206913948, + "learning_rate": 3.91886277302944e-06, + "loss": 0.0174, + "step": 124720 + }, + { + "epoch": 0.9219863398480235, + "grad_norm": 0.08452748507261276, + "learning_rate": 3.915153133903134e-06, + "loss": 0.0148, + "step": 124730 + }, + { + "epoch": 0.9220602584193253, + "grad_norm": 0.06958349049091339, + "learning_rate": 3.911443494776828e-06, + "loss": 0.0161, + "step": 124740 + }, + { + "epoch": 0.9221341769906272, + "grad_norm": 0.061324093490839005, + "learning_rate": 3.9077338556505225e-06, + "loss": 0.0157, + "step": 124750 + }, + { + "epoch": 0.922208095561929, + "grad_norm": 0.07830534875392914, + "learning_rate": 3.904024216524217e-06, + "loss": 0.0149, + "step": 124760 + }, + { + "epoch": 0.9222820141332309, + "grad_norm": 0.0901099294424057, + "learning_rate": 3.900314577397911e-06, + "loss": 0.0148, + "step": 124770 + }, + { + "epoch": 0.9223559327045326, + "grad_norm": 0.06344560533761978, + "learning_rate": 3.896604938271605e-06, + "loss": 0.016, + "step": 124780 + }, + { + "epoch": 0.9224298512758345, + "grad_norm": 0.08566906303167343, + "learning_rate": 3.8928952991453e-06, + "loss": 0.016, + "step": 124790 + }, + { + "epoch": 0.9225037698471364, + "grad_norm": 0.05252756178379059, + "learning_rate": 3.889185660018994e-06, + "loss": 0.0142, + "step": 124800 + }, + { + "epoch": 0.9225776884184382, + "grad_norm": 0.07893887907266617, + "learning_rate": 3.885476020892688e-06, + "loss": 0.0194, + "step": 124810 + }, + { + "epoch": 0.9226516069897401, + "grad_norm": 0.07250846922397614, + "learning_rate": 3.8817663817663825e-06, + "loss": 0.0144, + "step": 124820 + }, + { + "epoch": 0.9227255255610419, + "grad_norm": 0.07483262568712234, + "learning_rate": 3.878056742640076e-06, + "loss": 0.0209, + "step": 124830 + }, + { + "epoch": 0.9227994441323438, + "grad_norm": 0.09314266592264175, + "learning_rate": 3.87434710351377e-06, + "loss": 0.0191, + "step": 124840 + }, + { + "epoch": 0.9228733627036456, + "grad_norm": 0.06463508307933807, + "learning_rate": 3.8706374643874644e-06, + "loss": 0.0194, + "step": 124850 + }, + { + "epoch": 0.9229472812749475, + "grad_norm": 0.07391108572483063, + "learning_rate": 3.866927825261159e-06, + "loss": 0.0157, + "step": 124860 + }, + { + "epoch": 0.9230211998462494, + "grad_norm": 0.06343664973974228, + "learning_rate": 3.863218186134853e-06, + "loss": 0.0191, + "step": 124870 + }, + { + "epoch": 0.9230951184175512, + "grad_norm": 0.09996242076158524, + "learning_rate": 3.859508547008547e-06, + "loss": 0.016, + "step": 124880 + }, + { + "epoch": 0.9231690369888531, + "grad_norm": 0.07135334610939026, + "learning_rate": 3.8557989078822415e-06, + "loss": 0.0198, + "step": 124890 + }, + { + "epoch": 0.9232429555601549, + "grad_norm": 0.071023128926754, + "learning_rate": 3.852089268755936e-06, + "loss": 0.0176, + "step": 124900 + }, + { + "epoch": 0.9233168741314568, + "grad_norm": 
0.07268857210874557, + "learning_rate": 3.848379629629629e-06, + "loss": 0.0176, + "step": 124910 + }, + { + "epoch": 0.9233907927027586, + "grad_norm": 0.10081833600997925, + "learning_rate": 3.8446699905033235e-06, + "loss": 0.0179, + "step": 124920 + }, + { + "epoch": 0.9234647112740605, + "grad_norm": 0.08457359671592712, + "learning_rate": 3.840960351377018e-06, + "loss": 0.0157, + "step": 124930 + }, + { + "epoch": 0.9235386298453624, + "grad_norm": 0.07194271683692932, + "learning_rate": 3.837250712250712e-06, + "loss": 0.0167, + "step": 124940 + }, + { + "epoch": 0.9236125484166642, + "grad_norm": 0.06114126741886139, + "learning_rate": 3.833541073124407e-06, + "loss": 0.0164, + "step": 124950 + }, + { + "epoch": 0.9236864669879661, + "grad_norm": 0.08034610003232956, + "learning_rate": 3.8298314339981014e-06, + "loss": 0.0191, + "step": 124960 + }, + { + "epoch": 0.9237603855592679, + "grad_norm": 0.061103809624910355, + "learning_rate": 3.826121794871796e-06, + "loss": 0.0182, + "step": 124970 + }, + { + "epoch": 0.9238343041305698, + "grad_norm": 0.0654354989528656, + "learning_rate": 3.822412155745489e-06, + "loss": 0.0166, + "step": 124980 + }, + { + "epoch": 0.9239082227018717, + "grad_norm": 0.07981405407190323, + "learning_rate": 3.818702516619183e-06, + "loss": 0.0202, + "step": 124990 + }, + { + "epoch": 0.9239821412731735, + "grad_norm": 0.08441367000341415, + "learning_rate": 3.814992877492878e-06, + "loss": 0.0169, + "step": 125000 + }, + { + "epoch": 0.9240560598444754, + "grad_norm": 0.08189128339290619, + "learning_rate": 3.811283238366572e-06, + "loss": 0.0176, + "step": 125010 + }, + { + "epoch": 0.9241299784157772, + "grad_norm": 0.08971022814512253, + "learning_rate": 3.8075735992402662e-06, + "loss": 0.0162, + "step": 125020 + }, + { + "epoch": 0.924203896987079, + "grad_norm": 0.07982345670461655, + "learning_rate": 3.8038639601139605e-06, + "loss": 0.0153, + "step": 125030 + }, + { + "epoch": 0.9242778155583808, + "grad_norm": 0.074432373046875, + "learning_rate": 3.8001543209876548e-06, + "loss": 0.0174, + "step": 125040 + }, + { + "epoch": 0.9243517341296827, + "grad_norm": 0.08879147469997406, + "learning_rate": 3.796444681861349e-06, + "loss": 0.0159, + "step": 125050 + }, + { + "epoch": 0.9244256527009846, + "grad_norm": 0.07412946224212646, + "learning_rate": 3.7927350427350425e-06, + "loss": 0.0178, + "step": 125060 + }, + { + "epoch": 0.9244995712722864, + "grad_norm": 0.08697675913572311, + "learning_rate": 3.7890254036087367e-06, + "loss": 0.0187, + "step": 125070 + }, + { + "epoch": 0.9245734898435883, + "grad_norm": 0.09308616816997528, + "learning_rate": 3.7853157644824314e-06, + "loss": 0.0169, + "step": 125080 + }, + { + "epoch": 0.9246474084148901, + "grad_norm": 0.07256640493869781, + "learning_rate": 3.7816061253561257e-06, + "loss": 0.0193, + "step": 125090 + }, + { + "epoch": 0.924721326986192, + "grad_norm": 0.09890392422676086, + "learning_rate": 3.77789648622982e-06, + "loss": 0.0173, + "step": 125100 + }, + { + "epoch": 0.9247952455574938, + "grad_norm": 0.07536329329013824, + "learning_rate": 3.7741868471035142e-06, + "loss": 0.0182, + "step": 125110 + }, + { + "epoch": 0.9248691641287957, + "grad_norm": 0.05913927033543587, + "learning_rate": 3.7704772079772085e-06, + "loss": 0.0164, + "step": 125120 + }, + { + "epoch": 0.9249430827000976, + "grad_norm": 0.09645496308803558, + "learning_rate": 3.766767568850903e-06, + "loss": 0.02, + "step": 125130 + }, + { + "epoch": 0.9250170012713994, + "grad_norm": 0.08335822820663452, + 
"learning_rate": 3.7630579297245962e-06, + "loss": 0.0162, + "step": 125140 + }, + { + "epoch": 0.9250909198427013, + "grad_norm": 0.1134815365076065, + "learning_rate": 3.7593482905982905e-06, + "loss": 0.0183, + "step": 125150 + }, + { + "epoch": 0.9251648384140031, + "grad_norm": 0.07296968251466751, + "learning_rate": 3.7556386514719848e-06, + "loss": 0.0156, + "step": 125160 + }, + { + "epoch": 0.925238756985305, + "grad_norm": 0.06772264093160629, + "learning_rate": 3.751929012345679e-06, + "loss": 0.0155, + "step": 125170 + }, + { + "epoch": 0.9253126755566068, + "grad_norm": 0.06695547699928284, + "learning_rate": 3.7482193732193733e-06, + "loss": 0.0163, + "step": 125180 + }, + { + "epoch": 0.9253865941279087, + "grad_norm": 0.0516495443880558, + "learning_rate": 3.7445097340930676e-06, + "loss": 0.0177, + "step": 125190 + }, + { + "epoch": 0.9254605126992106, + "grad_norm": 0.09707432985305786, + "learning_rate": 3.7408000949667623e-06, + "loss": 0.0153, + "step": 125200 + }, + { + "epoch": 0.9255344312705124, + "grad_norm": 0.06112994998693466, + "learning_rate": 3.7370904558404557e-06, + "loss": 0.0163, + "step": 125210 + }, + { + "epoch": 0.9256083498418143, + "grad_norm": 0.07663624733686447, + "learning_rate": 3.73338081671415e-06, + "loss": 0.0182, + "step": 125220 + }, + { + "epoch": 0.9256822684131161, + "grad_norm": 0.05725317820906639, + "learning_rate": 3.7296711775878443e-06, + "loss": 0.0144, + "step": 125230 + }, + { + "epoch": 0.925756186984418, + "grad_norm": 0.09255700558423996, + "learning_rate": 3.7259615384615385e-06, + "loss": 0.0183, + "step": 125240 + }, + { + "epoch": 0.9258301055557199, + "grad_norm": 0.05431872233748436, + "learning_rate": 3.722251899335233e-06, + "loss": 0.0145, + "step": 125250 + }, + { + "epoch": 0.9259040241270217, + "grad_norm": 0.06756569445133209, + "learning_rate": 3.718542260208927e-06, + "loss": 0.0152, + "step": 125260 + }, + { + "epoch": 0.9259779426983236, + "grad_norm": 0.08588846772909164, + "learning_rate": 3.7148326210826213e-06, + "loss": 0.0165, + "step": 125270 + }, + { + "epoch": 0.9260518612696254, + "grad_norm": 0.0836460068821907, + "learning_rate": 3.7111229819563156e-06, + "loss": 0.018, + "step": 125280 + }, + { + "epoch": 0.9261257798409273, + "grad_norm": 0.08511676639318466, + "learning_rate": 3.7074133428300095e-06, + "loss": 0.0156, + "step": 125290 + }, + { + "epoch": 0.926199698412229, + "grad_norm": 0.08848274499177933, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0172, + "step": 125300 + }, + { + "epoch": 0.926273616983531, + "grad_norm": 0.08268193900585175, + "learning_rate": 3.699994064577398e-06, + "loss": 0.0157, + "step": 125310 + }, + { + "epoch": 0.9263475355548328, + "grad_norm": 0.056754596531391144, + "learning_rate": 3.6962844254510923e-06, + "loss": 0.0169, + "step": 125320 + }, + { + "epoch": 0.9264214541261346, + "grad_norm": 0.0771530270576477, + "learning_rate": 3.6925747863247866e-06, + "loss": 0.018, + "step": 125330 + }, + { + "epoch": 0.9264953726974365, + "grad_norm": 0.08571093529462814, + "learning_rate": 3.688865147198481e-06, + "loss": 0.0193, + "step": 125340 + }, + { + "epoch": 0.9265692912687383, + "grad_norm": 0.08762447535991669, + "learning_rate": 3.685155508072175e-06, + "loss": 0.0169, + "step": 125350 + }, + { + "epoch": 0.9266432098400402, + "grad_norm": 0.0783274844288826, + "learning_rate": 3.6814458689458694e-06, + "loss": 0.0185, + "step": 125360 + }, + { + "epoch": 0.926717128411342, + "grad_norm": 0.08024732768535614, + "learning_rate": 
3.6777362298195632e-06, + "loss": 0.0143, + "step": 125370 + }, + { + "epoch": 0.9267910469826439, + "grad_norm": 0.09043636173009872, + "learning_rate": 3.6740265906932575e-06, + "loss": 0.0176, + "step": 125380 + }, + { + "epoch": 0.9268649655539458, + "grad_norm": 0.07826859503984451, + "learning_rate": 3.6703169515669518e-06, + "loss": 0.0164, + "step": 125390 + }, + { + "epoch": 0.9269388841252476, + "grad_norm": 0.06709279119968414, + "learning_rate": 3.666607312440646e-06, + "loss": 0.0175, + "step": 125400 + }, + { + "epoch": 0.9270128026965495, + "grad_norm": 0.0865020826458931, + "learning_rate": 3.6628976733143403e-06, + "loss": 0.0167, + "step": 125410 + }, + { + "epoch": 0.9270867212678513, + "grad_norm": 0.09275317937135696, + "learning_rate": 3.6591880341880346e-06, + "loss": 0.0169, + "step": 125420 + }, + { + "epoch": 0.9271606398391532, + "grad_norm": 0.06730522960424423, + "learning_rate": 3.655478395061729e-06, + "loss": 0.0197, + "step": 125430 + }, + { + "epoch": 0.927234558410455, + "grad_norm": 0.06249605491757393, + "learning_rate": 3.6517687559354223e-06, + "loss": 0.015, + "step": 125440 + }, + { + "epoch": 0.9273084769817569, + "grad_norm": 0.09848795086145401, + "learning_rate": 3.6480591168091166e-06, + "loss": 0.0178, + "step": 125450 + }, + { + "epoch": 0.9273823955530588, + "grad_norm": 0.09894700348377228, + "learning_rate": 3.644349477682811e-06, + "loss": 0.0194, + "step": 125460 + }, + { + "epoch": 0.9274563141243606, + "grad_norm": 0.0778171494603157, + "learning_rate": 3.6406398385565055e-06, + "loss": 0.017, + "step": 125470 + }, + { + "epoch": 0.9275302326956625, + "grad_norm": 0.08636230230331421, + "learning_rate": 3.6369301994302e-06, + "loss": 0.0192, + "step": 125480 + }, + { + "epoch": 0.9276041512669643, + "grad_norm": 0.06680939346551895, + "learning_rate": 3.633220560303894e-06, + "loss": 0.0165, + "step": 125490 + }, + { + "epoch": 0.9276780698382662, + "grad_norm": 0.08700402081012726, + "learning_rate": 3.6295109211775884e-06, + "loss": 0.0161, + "step": 125500 + }, + { + "epoch": 0.9277519884095681, + "grad_norm": 0.07302654534578323, + "learning_rate": 3.6258012820512826e-06, + "loss": 0.0169, + "step": 125510 + }, + { + "epoch": 0.9278259069808699, + "grad_norm": 0.08297467976808548, + "learning_rate": 3.622091642924976e-06, + "loss": 0.0167, + "step": 125520 + }, + { + "epoch": 0.9278998255521718, + "grad_norm": 0.12444033473730087, + "learning_rate": 3.6183820037986703e-06, + "loss": 0.0184, + "step": 125530 + }, + { + "epoch": 0.9279737441234736, + "grad_norm": 0.09066224098205566, + "learning_rate": 3.6146723646723646e-06, + "loss": 0.0178, + "step": 125540 + }, + { + "epoch": 0.9280476626947755, + "grad_norm": 0.08981047570705414, + "learning_rate": 3.610962725546059e-06, + "loss": 0.0184, + "step": 125550 + }, + { + "epoch": 0.9281215812660772, + "grad_norm": 0.07104262709617615, + "learning_rate": 3.607253086419753e-06, + "loss": 0.0181, + "step": 125560 + }, + { + "epoch": 0.9281954998373791, + "grad_norm": 0.08694016188383102, + "learning_rate": 3.6035434472934474e-06, + "loss": 0.0172, + "step": 125570 + }, + { + "epoch": 0.928269418408681, + "grad_norm": 0.09042564779520035, + "learning_rate": 3.599833808167142e-06, + "loss": 0.0177, + "step": 125580 + }, + { + "epoch": 0.9283433369799828, + "grad_norm": 0.054768797010183334, + "learning_rate": 3.5961241690408364e-06, + "loss": 0.0163, + "step": 125590 + }, + { + "epoch": 0.9284172555512847, + "grad_norm": 0.07020504772663116, + "learning_rate": 3.59241452991453e-06, + 
"loss": 0.0174, + "step": 125600 + }, + { + "epoch": 0.9284911741225865, + "grad_norm": 0.08846597373485565, + "learning_rate": 3.588704890788224e-06, + "loss": 0.0172, + "step": 125610 + }, + { + "epoch": 0.9285650926938884, + "grad_norm": 0.07999665290117264, + "learning_rate": 3.5849952516619184e-06, + "loss": 0.0168, + "step": 125620 + }, + { + "epoch": 0.9286390112651902, + "grad_norm": 0.06280770152807236, + "learning_rate": 3.5812856125356126e-06, + "loss": 0.0173, + "step": 125630 + }, + { + "epoch": 0.9287129298364921, + "grad_norm": 0.07466577738523483, + "learning_rate": 3.577575973409307e-06, + "loss": 0.0164, + "step": 125640 + }, + { + "epoch": 0.928786848407794, + "grad_norm": 0.0839623361825943, + "learning_rate": 3.573866334283001e-06, + "loss": 0.0168, + "step": 125650 + }, + { + "epoch": 0.9288607669790958, + "grad_norm": 0.0837130919098854, + "learning_rate": 3.5701566951566954e-06, + "loss": 0.0172, + "step": 125660 + }, + { + "epoch": 0.9289346855503977, + "grad_norm": 0.07399124652147293, + "learning_rate": 3.5664470560303893e-06, + "loss": 0.0178, + "step": 125670 + }, + { + "epoch": 0.9290086041216995, + "grad_norm": 0.06902366876602173, + "learning_rate": 3.5627374169040836e-06, + "loss": 0.0178, + "step": 125680 + }, + { + "epoch": 0.9290825226930014, + "grad_norm": 0.09093683958053589, + "learning_rate": 3.559027777777778e-06, + "loss": 0.0161, + "step": 125690 + }, + { + "epoch": 0.9291564412643032, + "grad_norm": 0.09570712596178055, + "learning_rate": 3.555318138651472e-06, + "loss": 0.0185, + "step": 125700 + }, + { + "epoch": 0.9292303598356051, + "grad_norm": 0.07997146993875504, + "learning_rate": 3.5516084995251664e-06, + "loss": 0.0148, + "step": 125710 + }, + { + "epoch": 0.929304278406907, + "grad_norm": 0.058948636054992676, + "learning_rate": 3.5478988603988607e-06, + "loss": 0.0156, + "step": 125720 + }, + { + "epoch": 0.9293781969782088, + "grad_norm": 0.09420224279165268, + "learning_rate": 3.544189221272555e-06, + "loss": 0.0148, + "step": 125730 + }, + { + "epoch": 0.9294521155495107, + "grad_norm": 0.08917292952537537, + "learning_rate": 3.540479582146249e-06, + "loss": 0.0149, + "step": 125740 + }, + { + "epoch": 0.9295260341208125, + "grad_norm": 0.058667514473199844, + "learning_rate": 3.536769943019943e-06, + "loss": 0.0165, + "step": 125750 + }, + { + "epoch": 0.9295999526921144, + "grad_norm": 0.07369853556156158, + "learning_rate": 3.5330603038936373e-06, + "loss": 0.0171, + "step": 125760 + }, + { + "epoch": 0.9296738712634163, + "grad_norm": 0.06561733782291412, + "learning_rate": 3.5293506647673316e-06, + "loss": 0.0152, + "step": 125770 + }, + { + "epoch": 0.9297477898347181, + "grad_norm": 0.05797674506902695, + "learning_rate": 3.525641025641026e-06, + "loss": 0.0175, + "step": 125780 + }, + { + "epoch": 0.92982170840602, + "grad_norm": 0.09540330618619919, + "learning_rate": 3.52193138651472e-06, + "loss": 0.0149, + "step": 125790 + }, + { + "epoch": 0.9298956269773218, + "grad_norm": 0.08474097400903702, + "learning_rate": 3.5182217473884144e-06, + "loss": 0.0184, + "step": 125800 + }, + { + "epoch": 0.9299695455486237, + "grad_norm": 0.0823434367775917, + "learning_rate": 3.5145121082621087e-06, + "loss": 0.0167, + "step": 125810 + }, + { + "epoch": 0.9300434641199254, + "grad_norm": 0.10081304609775543, + "learning_rate": 3.510802469135803e-06, + "loss": 0.0173, + "step": 125820 + }, + { + "epoch": 0.9301173826912273, + "grad_norm": 0.0666218101978302, + "learning_rate": 3.5070928300094964e-06, + "loss": 0.0164, + "step": 
125830 + }, + { + "epoch": 0.9301913012625292, + "grad_norm": 0.06705756485462189, + "learning_rate": 3.5033831908831907e-06, + "loss": 0.0147, + "step": 125840 + }, + { + "epoch": 0.930265219833831, + "grad_norm": 0.0807531476020813, + "learning_rate": 3.4996735517568854e-06, + "loss": 0.0168, + "step": 125850 + }, + { + "epoch": 0.9303391384051329, + "grad_norm": 0.08411208540201187, + "learning_rate": 3.4959639126305796e-06, + "loss": 0.0167, + "step": 125860 + }, + { + "epoch": 0.9304130569764347, + "grad_norm": 0.085993692278862, + "learning_rate": 3.492254273504274e-06, + "loss": 0.017, + "step": 125870 + }, + { + "epoch": 0.9304869755477366, + "grad_norm": 0.07423291355371475, + "learning_rate": 3.488544634377968e-06, + "loss": 0.0142, + "step": 125880 + }, + { + "epoch": 0.9305608941190384, + "grad_norm": 0.05742659792304039, + "learning_rate": 3.4848349952516625e-06, + "loss": 0.0166, + "step": 125890 + }, + { + "epoch": 0.9306348126903403, + "grad_norm": 0.06945046037435532, + "learning_rate": 3.481125356125356e-06, + "loss": 0.0193, + "step": 125900 + }, + { + "epoch": 0.9307087312616422, + "grad_norm": 0.0680810883641243, + "learning_rate": 3.47741571699905e-06, + "loss": 0.0168, + "step": 125910 + }, + { + "epoch": 0.930782649832944, + "grad_norm": 0.06685182452201843, + "learning_rate": 3.4737060778727444e-06, + "loss": 0.0167, + "step": 125920 + }, + { + "epoch": 0.9308565684042459, + "grad_norm": 0.064430370926857, + "learning_rate": 3.4699964387464387e-06, + "loss": 0.0164, + "step": 125930 + }, + { + "epoch": 0.9309304869755477, + "grad_norm": 0.06506548076868057, + "learning_rate": 3.466286799620133e-06, + "loss": 0.0155, + "step": 125940 + }, + { + "epoch": 0.9310044055468496, + "grad_norm": 0.08693241328001022, + "learning_rate": 3.4625771604938272e-06, + "loss": 0.0187, + "step": 125950 + }, + { + "epoch": 0.9310783241181514, + "grad_norm": 0.10978690534830093, + "learning_rate": 3.458867521367522e-06, + "loss": 0.0193, + "step": 125960 + }, + { + "epoch": 0.9311522426894533, + "grad_norm": 0.09855156391859055, + "learning_rate": 3.4551578822412162e-06, + "loss": 0.0163, + "step": 125970 + }, + { + "epoch": 0.9312261612607552, + "grad_norm": 0.06699176877737045, + "learning_rate": 3.4514482431149096e-06, + "loss": 0.0166, + "step": 125980 + }, + { + "epoch": 0.931300079832057, + "grad_norm": 0.064228355884552, + "learning_rate": 3.447738603988604e-06, + "loss": 0.0139, + "step": 125990 + }, + { + "epoch": 0.9313739984033589, + "grad_norm": 0.10097555071115494, + "learning_rate": 3.444028964862298e-06, + "loss": 0.0181, + "step": 126000 + }, + { + "epoch": 0.9314479169746607, + "grad_norm": 0.07844749093055725, + "learning_rate": 3.4403193257359925e-06, + "loss": 0.017, + "step": 126010 + }, + { + "epoch": 0.9315218355459626, + "grad_norm": 0.06773234158754349, + "learning_rate": 3.4366096866096867e-06, + "loss": 0.0182, + "step": 126020 + }, + { + "epoch": 0.9315957541172645, + "grad_norm": 0.08484964072704315, + "learning_rate": 3.432900047483381e-06, + "loss": 0.0201, + "step": 126030 + }, + { + "epoch": 0.9316696726885663, + "grad_norm": 0.05464823171496391, + "learning_rate": 3.4291904083570753e-06, + "loss": 0.0163, + "step": 126040 + }, + { + "epoch": 0.9317435912598682, + "grad_norm": 0.07915545254945755, + "learning_rate": 3.4254807692307695e-06, + "loss": 0.0178, + "step": 126050 + }, + { + "epoch": 0.93181750983117, + "grad_norm": 0.08858298510313034, + "learning_rate": 3.4217711301044634e-06, + "loss": 0.0167, + "step": 126060 + }, + { + "epoch": 
0.9318914284024719, + "grad_norm": 0.10065144300460815, + "learning_rate": 3.4180614909781577e-06, + "loss": 0.019, + "step": 126070 + }, + { + "epoch": 0.9319653469737736, + "grad_norm": 0.04998321086168289, + "learning_rate": 3.414351851851852e-06, + "loss": 0.0144, + "step": 126080 + }, + { + "epoch": 0.9320392655450755, + "grad_norm": 0.07834048569202423, + "learning_rate": 3.4106422127255462e-06, + "loss": 0.0191, + "step": 126090 + }, + { + "epoch": 0.9321131841163774, + "grad_norm": 0.07724786549806595, + "learning_rate": 3.4069325735992405e-06, + "loss": 0.016, + "step": 126100 + }, + { + "epoch": 0.9321871026876792, + "grad_norm": 0.06425752490758896, + "learning_rate": 3.4032229344729348e-06, + "loss": 0.0164, + "step": 126110 + }, + { + "epoch": 0.9322610212589811, + "grad_norm": 0.0907018855214119, + "learning_rate": 3.399513295346629e-06, + "loss": 0.0186, + "step": 126120 + }, + { + "epoch": 0.9323349398302829, + "grad_norm": 0.09153434634208679, + "learning_rate": 3.395803656220323e-06, + "loss": 0.0168, + "step": 126130 + }, + { + "epoch": 0.9324088584015848, + "grad_norm": 0.09171310812234879, + "learning_rate": 3.392094017094017e-06, + "loss": 0.0177, + "step": 126140 + }, + { + "epoch": 0.9324827769728866, + "grad_norm": 0.08220788836479187, + "learning_rate": 3.3883843779677114e-06, + "loss": 0.0167, + "step": 126150 + }, + { + "epoch": 0.9325566955441885, + "grad_norm": 0.06586333364248276, + "learning_rate": 3.3846747388414057e-06, + "loss": 0.0159, + "step": 126160 + }, + { + "epoch": 0.9326306141154904, + "grad_norm": 0.06101555377244949, + "learning_rate": 3.3809650997151e-06, + "loss": 0.017, + "step": 126170 + }, + { + "epoch": 0.9327045326867922, + "grad_norm": 0.13610626757144928, + "learning_rate": 3.3772554605887943e-06, + "loss": 0.0181, + "step": 126180 + }, + { + "epoch": 0.9327784512580941, + "grad_norm": 0.07935914397239685, + "learning_rate": 3.3735458214624885e-06, + "loss": 0.017, + "step": 126190 + }, + { + "epoch": 0.9328523698293959, + "grad_norm": 0.08567145466804504, + "learning_rate": 3.369836182336183e-06, + "loss": 0.0168, + "step": 126200 + }, + { + "epoch": 0.9329262884006978, + "grad_norm": 0.09398678690195084, + "learning_rate": 3.3661265432098762e-06, + "loss": 0.0182, + "step": 126210 + }, + { + "epoch": 0.9330002069719996, + "grad_norm": 0.07996780425310135, + "learning_rate": 3.3624169040835705e-06, + "loss": 0.0172, + "step": 126220 + }, + { + "epoch": 0.9330741255433015, + "grad_norm": 0.08147966116666794, + "learning_rate": 3.358707264957265e-06, + "loss": 0.0174, + "step": 126230 + }, + { + "epoch": 0.9331480441146034, + "grad_norm": 0.06722401827573776, + "learning_rate": 3.3549976258309595e-06, + "loss": 0.014, + "step": 126240 + }, + { + "epoch": 0.9332219626859052, + "grad_norm": 0.08256746828556061, + "learning_rate": 3.3512879867046537e-06, + "loss": 0.0161, + "step": 126250 + }, + { + "epoch": 0.9332958812572071, + "grad_norm": 0.07081350684165955, + "learning_rate": 3.347578347578348e-06, + "loss": 0.0181, + "step": 126260 + }, + { + "epoch": 0.9333697998285089, + "grad_norm": 0.06583075225353241, + "learning_rate": 3.3438687084520423e-06, + "loss": 0.0183, + "step": 126270 + }, + { + "epoch": 0.9334437183998108, + "grad_norm": 0.0817515179514885, + "learning_rate": 3.3401590693257366e-06, + "loss": 0.015, + "step": 126280 + }, + { + "epoch": 0.9335176369711127, + "grad_norm": 0.08005262911319733, + "learning_rate": 3.33644943019943e-06, + "loss": 0.0171, + "step": 126290 + }, + { + "epoch": 0.9335915555424145, + 
"grad_norm": 0.09398916363716125, + "learning_rate": 3.3327397910731243e-06, + "loss": 0.0178, + "step": 126300 + }, + { + "epoch": 0.9336654741137164, + "grad_norm": 0.07812084257602692, + "learning_rate": 3.3290301519468185e-06, + "loss": 0.0176, + "step": 126310 + }, + { + "epoch": 0.9337393926850182, + "grad_norm": 0.06297401338815689, + "learning_rate": 3.325320512820513e-06, + "loss": 0.019, + "step": 126320 + }, + { + "epoch": 0.9338133112563201, + "grad_norm": 0.054416362196207047, + "learning_rate": 3.321610873694207e-06, + "loss": 0.016, + "step": 126330 + }, + { + "epoch": 0.9338872298276218, + "grad_norm": 0.08170922100543976, + "learning_rate": 3.3179012345679013e-06, + "loss": 0.017, + "step": 126340 + }, + { + "epoch": 0.9339611483989237, + "grad_norm": 0.0713760256767273, + "learning_rate": 3.314191595441596e-06, + "loss": 0.0174, + "step": 126350 + }, + { + "epoch": 0.9340350669702256, + "grad_norm": 0.10220906138420105, + "learning_rate": 3.3104819563152903e-06, + "loss": 0.0152, + "step": 126360 + }, + { + "epoch": 0.9341089855415274, + "grad_norm": 0.081649549305439, + "learning_rate": 3.3067723171889837e-06, + "loss": 0.018, + "step": 126370 + }, + { + "epoch": 0.9341829041128293, + "grad_norm": 0.09127277880907059, + "learning_rate": 3.303062678062678e-06, + "loss": 0.0169, + "step": 126380 + }, + { + "epoch": 0.9342568226841311, + "grad_norm": 0.08659278601408005, + "learning_rate": 3.2993530389363723e-06, + "loss": 0.016, + "step": 126390 + }, + { + "epoch": 0.934330741255433, + "grad_norm": 0.08412818610668182, + "learning_rate": 3.2956433998100666e-06, + "loss": 0.0161, + "step": 126400 + }, + { + "epoch": 0.9344046598267348, + "grad_norm": 0.074937604367733, + "learning_rate": 3.291933760683761e-06, + "loss": 0.0147, + "step": 126410 + }, + { + "epoch": 0.9344785783980367, + "grad_norm": 0.09494459629058838, + "learning_rate": 3.288224121557455e-06, + "loss": 0.015, + "step": 126420 + }, + { + "epoch": 0.9345524969693386, + "grad_norm": 0.06063415855169296, + "learning_rate": 3.2845144824311494e-06, + "loss": 0.0139, + "step": 126430 + }, + { + "epoch": 0.9346264155406404, + "grad_norm": 0.06621581315994263, + "learning_rate": 3.2808048433048432e-06, + "loss": 0.0155, + "step": 126440 + }, + { + "epoch": 0.9347003341119423, + "grad_norm": 0.08298254758119583, + "learning_rate": 3.2770952041785375e-06, + "loss": 0.0153, + "step": 126450 + }, + { + "epoch": 0.9347742526832441, + "grad_norm": 0.09773992747068405, + "learning_rate": 3.2733855650522318e-06, + "loss": 0.0157, + "step": 126460 + }, + { + "epoch": 0.934848171254546, + "grad_norm": 0.07338134944438934, + "learning_rate": 3.269675925925926e-06, + "loss": 0.0192, + "step": 126470 + }, + { + "epoch": 0.9349220898258478, + "grad_norm": 0.06597955524921417, + "learning_rate": 3.2659662867996203e-06, + "loss": 0.0197, + "step": 126480 + }, + { + "epoch": 0.9349960083971497, + "grad_norm": 0.09893263131380081, + "learning_rate": 3.2622566476733146e-06, + "loss": 0.015, + "step": 126490 + }, + { + "epoch": 0.9350699269684516, + "grad_norm": 0.0914556086063385, + "learning_rate": 3.258547008547009e-06, + "loss": 0.0168, + "step": 126500 + }, + { + "epoch": 0.9351438455397534, + "grad_norm": 0.07962629944086075, + "learning_rate": 3.254837369420703e-06, + "loss": 0.0177, + "step": 126510 + }, + { + "epoch": 0.9352177641110553, + "grad_norm": 0.08136551082134247, + "learning_rate": 3.251127730294397e-06, + "loss": 0.0176, + "step": 126520 + }, + { + "epoch": 0.9352916826823571, + "grad_norm": 0.07951479405164719, + 
"learning_rate": 3.2474180911680913e-06, + "loss": 0.0183, + "step": 126530 + }, + { + "epoch": 0.935365601253659, + "grad_norm": 0.06223934143781662, + "learning_rate": 3.2437084520417855e-06, + "loss": 0.015, + "step": 126540 + }, + { + "epoch": 0.9354395198249609, + "grad_norm": 0.07506666332483292, + "learning_rate": 3.23999881291548e-06, + "loss": 0.0171, + "step": 126550 + }, + { + "epoch": 0.9355134383962627, + "grad_norm": 0.10078860819339752, + "learning_rate": 3.236289173789174e-06, + "loss": 0.0139, + "step": 126560 + }, + { + "epoch": 0.9355873569675646, + "grad_norm": 0.0667853131890297, + "learning_rate": 3.2325795346628684e-06, + "loss": 0.0168, + "step": 126570 + }, + { + "epoch": 0.9356612755388664, + "grad_norm": 0.09738816320896149, + "learning_rate": 3.2288698955365626e-06, + "loss": 0.0165, + "step": 126580 + }, + { + "epoch": 0.9357351941101683, + "grad_norm": 0.0639905333518982, + "learning_rate": 3.225160256410257e-06, + "loss": 0.0186, + "step": 126590 + }, + { + "epoch": 0.93580911268147, + "grad_norm": 0.08258546143770218, + "learning_rate": 3.2214506172839503e-06, + "loss": 0.017, + "step": 126600 + }, + { + "epoch": 0.935883031252772, + "grad_norm": 0.08823540061712265, + "learning_rate": 3.2177409781576446e-06, + "loss": 0.0199, + "step": 126610 + }, + { + "epoch": 0.9359569498240738, + "grad_norm": 0.09814473241567612, + "learning_rate": 3.2140313390313393e-06, + "loss": 0.016, + "step": 126620 + }, + { + "epoch": 0.9360308683953756, + "grad_norm": 0.07945670187473297, + "learning_rate": 3.2103216999050336e-06, + "loss": 0.0157, + "step": 126630 + }, + { + "epoch": 0.9361047869666775, + "grad_norm": 0.098643459379673, + "learning_rate": 3.206612060778728e-06, + "loss": 0.0163, + "step": 126640 + }, + { + "epoch": 0.9361787055379793, + "grad_norm": 0.08727847039699554, + "learning_rate": 3.202902421652422e-06, + "loss": 0.0185, + "step": 126650 + }, + { + "epoch": 0.9362526241092812, + "grad_norm": 0.07201675325632095, + "learning_rate": 3.1991927825261164e-06, + "loss": 0.015, + "step": 126660 + }, + { + "epoch": 0.936326542680583, + "grad_norm": 0.07368409633636475, + "learning_rate": 3.19548314339981e-06, + "loss": 0.0191, + "step": 126670 + }, + { + "epoch": 0.9364004612518849, + "grad_norm": 0.09178756922483444, + "learning_rate": 3.191773504273504e-06, + "loss": 0.0182, + "step": 126680 + }, + { + "epoch": 0.9364743798231868, + "grad_norm": 0.05671729892492294, + "learning_rate": 3.1880638651471984e-06, + "loss": 0.0155, + "step": 126690 + }, + { + "epoch": 0.9365482983944886, + "grad_norm": 0.07169840484857559, + "learning_rate": 3.1843542260208926e-06, + "loss": 0.0169, + "step": 126700 + }, + { + "epoch": 0.9366222169657905, + "grad_norm": 0.08545159548521042, + "learning_rate": 3.180644586894587e-06, + "loss": 0.0157, + "step": 126710 + }, + { + "epoch": 0.9366961355370923, + "grad_norm": 0.0851563885807991, + "learning_rate": 3.176934947768281e-06, + "loss": 0.0168, + "step": 126720 + }, + { + "epoch": 0.9367700541083942, + "grad_norm": 0.06378570944070816, + "learning_rate": 3.173225308641976e-06, + "loss": 0.0191, + "step": 126730 + }, + { + "epoch": 0.9368439726796961, + "grad_norm": 0.07919839024543762, + "learning_rate": 3.16951566951567e-06, + "loss": 0.0187, + "step": 126740 + }, + { + "epoch": 0.9369178912509979, + "grad_norm": 0.0702909529209137, + "learning_rate": 3.1658060303893636e-06, + "loss": 0.0143, + "step": 126750 + }, + { + "epoch": 0.9369918098222998, + "grad_norm": 0.07548509538173676, + "learning_rate": 3.162096391263058e-06, + 
"loss": 0.0173, + "step": 126760 + }, + { + "epoch": 0.9370657283936016, + "grad_norm": 0.09032569825649261, + "learning_rate": 3.158386752136752e-06, + "loss": 0.0203, + "step": 126770 + }, + { + "epoch": 0.9371396469649035, + "grad_norm": 0.06742248684167862, + "learning_rate": 3.1546771130104464e-06, + "loss": 0.0161, + "step": 126780 + }, + { + "epoch": 0.9372135655362053, + "grad_norm": 0.07519455999135971, + "learning_rate": 3.1509674738841407e-06, + "loss": 0.0172, + "step": 126790 + }, + { + "epoch": 0.9372874841075072, + "grad_norm": 0.09951967000961304, + "learning_rate": 3.147257834757835e-06, + "loss": 0.0173, + "step": 126800 + }, + { + "epoch": 0.9373614026788091, + "grad_norm": 0.11275292932987213, + "learning_rate": 3.143548195631529e-06, + "loss": 0.0186, + "step": 126810 + }, + { + "epoch": 0.9374353212501109, + "grad_norm": 0.06716936826705933, + "learning_rate": 3.1398385565052235e-06, + "loss": 0.0174, + "step": 126820 + }, + { + "epoch": 0.9375092398214128, + "grad_norm": 0.07388816773891449, + "learning_rate": 3.1361289173789173e-06, + "loss": 0.0173, + "step": 126830 + }, + { + "epoch": 0.9375831583927146, + "grad_norm": 0.07803674042224884, + "learning_rate": 3.1324192782526116e-06, + "loss": 0.0186, + "step": 126840 + }, + { + "epoch": 0.9376570769640165, + "grad_norm": 0.084476538002491, + "learning_rate": 3.128709639126306e-06, + "loss": 0.0174, + "step": 126850 + }, + { + "epoch": 0.9377309955353182, + "grad_norm": 0.08462337404489517, + "learning_rate": 3.125e-06, + "loss": 0.0172, + "step": 126860 + }, + { + "epoch": 0.9378049141066201, + "grad_norm": 0.09327008575201035, + "learning_rate": 3.1212903608736944e-06, + "loss": 0.0182, + "step": 126870 + }, + { + "epoch": 0.937878832677922, + "grad_norm": 0.07979398220777512, + "learning_rate": 3.1175807217473883e-06, + "loss": 0.0163, + "step": 126880 + }, + { + "epoch": 0.9379527512492238, + "grad_norm": 0.09190664440393448, + "learning_rate": 3.1138710826210825e-06, + "loss": 0.0177, + "step": 126890 + }, + { + "epoch": 0.9380266698205257, + "grad_norm": 0.07389857620000839, + "learning_rate": 3.110161443494777e-06, + "loss": 0.0176, + "step": 126900 + }, + { + "epoch": 0.9381005883918275, + "grad_norm": 0.06767547875642776, + "learning_rate": 3.106451804368471e-06, + "loss": 0.0154, + "step": 126910 + }, + { + "epoch": 0.9381745069631294, + "grad_norm": 0.10601852089166641, + "learning_rate": 3.1027421652421654e-06, + "loss": 0.018, + "step": 126920 + }, + { + "epoch": 0.9382484255344312, + "grad_norm": 0.06298653781414032, + "learning_rate": 3.0990325261158596e-06, + "loss": 0.0143, + "step": 126930 + }, + { + "epoch": 0.9383223441057331, + "grad_norm": 0.06366831809282303, + "learning_rate": 3.095322886989554e-06, + "loss": 0.0147, + "step": 126940 + }, + { + "epoch": 0.938396262677035, + "grad_norm": 0.06097415089607239, + "learning_rate": 3.091613247863248e-06, + "loss": 0.0149, + "step": 126950 + }, + { + "epoch": 0.9384701812483368, + "grad_norm": 0.04413948208093643, + "learning_rate": 3.087903608736942e-06, + "loss": 0.0144, + "step": 126960 + }, + { + "epoch": 0.9385440998196387, + "grad_norm": 0.07926429063081741, + "learning_rate": 3.0841939696106363e-06, + "loss": 0.0185, + "step": 126970 + }, + { + "epoch": 0.9386180183909405, + "grad_norm": 0.1001443862915039, + "learning_rate": 3.0804843304843306e-06, + "loss": 0.0161, + "step": 126980 + }, + { + "epoch": 0.9386919369622424, + "grad_norm": 0.11276740580797195, + "learning_rate": 3.076774691358025e-06, + "loss": 0.0169, + "step": 126990 + }, + { 
+ "epoch": 0.9387658555335443, + "grad_norm": 0.087139792740345, + "learning_rate": 3.073065052231719e-06, + "loss": 0.018, + "step": 127000 + }, + { + "epoch": 0.9388397741048461, + "grad_norm": 0.06877847015857697, + "learning_rate": 3.0693554131054134e-06, + "loss": 0.0169, + "step": 127010 + }, + { + "epoch": 0.938913692676148, + "grad_norm": 0.09786475449800491, + "learning_rate": 3.0656457739791077e-06, + "loss": 0.0156, + "step": 127020 + }, + { + "epoch": 0.9389876112474498, + "grad_norm": 0.06537863612174988, + "learning_rate": 3.061936134852802e-06, + "loss": 0.0167, + "step": 127030 + }, + { + "epoch": 0.9390615298187517, + "grad_norm": 0.06823769956827164, + "learning_rate": 3.058226495726496e-06, + "loss": 0.0177, + "step": 127040 + }, + { + "epoch": 0.9391354483900535, + "grad_norm": 0.12075066566467285, + "learning_rate": 3.05451685660019e-06, + "loss": 0.0186, + "step": 127050 + }, + { + "epoch": 0.9392093669613554, + "grad_norm": 0.06154416501522064, + "learning_rate": 3.0508072174738843e-06, + "loss": 0.0178, + "step": 127060 + }, + { + "epoch": 0.9392832855326573, + "grad_norm": 0.0991363450884819, + "learning_rate": 3.0470975783475786e-06, + "loss": 0.0163, + "step": 127070 + }, + { + "epoch": 0.9393572041039591, + "grad_norm": 0.07791083306074142, + "learning_rate": 3.0433879392212725e-06, + "loss": 0.0185, + "step": 127080 + }, + { + "epoch": 0.939431122675261, + "grad_norm": 0.07921697944402695, + "learning_rate": 3.0396783000949667e-06, + "loss": 0.0163, + "step": 127090 + }, + { + "epoch": 0.9395050412465628, + "grad_norm": 0.07544318586587906, + "learning_rate": 3.035968660968661e-06, + "loss": 0.0154, + "step": 127100 + }, + { + "epoch": 0.9395789598178647, + "grad_norm": 0.06722431629896164, + "learning_rate": 3.0322590218423553e-06, + "loss": 0.0187, + "step": 127110 + }, + { + "epoch": 0.9396528783891664, + "grad_norm": 0.06898238509893417, + "learning_rate": 3.0285493827160496e-06, + "loss": 0.0174, + "step": 127120 + }, + { + "epoch": 0.9397267969604683, + "grad_norm": 0.10192032903432846, + "learning_rate": 3.024839743589744e-06, + "loss": 0.0182, + "step": 127130 + }, + { + "epoch": 0.9398007155317702, + "grad_norm": 0.0944291204214096, + "learning_rate": 3.021130104463438e-06, + "loss": 0.0161, + "step": 127140 + }, + { + "epoch": 0.939874634103072, + "grad_norm": 0.05713077634572983, + "learning_rate": 3.017420465337132e-06, + "loss": 0.0159, + "step": 127150 + }, + { + "epoch": 0.9399485526743739, + "grad_norm": 0.09956385940313339, + "learning_rate": 3.0137108262108262e-06, + "loss": 0.0168, + "step": 127160 + }, + { + "epoch": 0.9400224712456757, + "grad_norm": 0.05738692730665207, + "learning_rate": 3.0100011870845205e-06, + "loss": 0.0179, + "step": 127170 + }, + { + "epoch": 0.9400963898169776, + "grad_norm": 0.11991485953330994, + "learning_rate": 3.0062915479582148e-06, + "loss": 0.0158, + "step": 127180 + }, + { + "epoch": 0.9401703083882794, + "grad_norm": 0.07158111780881882, + "learning_rate": 3.002581908831909e-06, + "loss": 0.0162, + "step": 127190 + }, + { + "epoch": 0.9402442269595813, + "grad_norm": 0.08031366765499115, + "learning_rate": 2.9988722697056033e-06, + "loss": 0.0188, + "step": 127200 + }, + { + "epoch": 0.9403181455308832, + "grad_norm": 0.060922037810087204, + "learning_rate": 2.9951626305792976e-06, + "loss": 0.0164, + "step": 127210 + }, + { + "epoch": 0.940392064102185, + "grad_norm": 0.0666850134730339, + "learning_rate": 2.991452991452992e-06, + "loss": 0.0172, + "step": 127220 + }, + { + "epoch": 0.9404659826734869, + 
"grad_norm": 0.06392525881528854, + "learning_rate": 2.9877433523266857e-06, + "loss": 0.0155, + "step": 127230 + }, + { + "epoch": 0.9405399012447887, + "grad_norm": 0.07114558666944504, + "learning_rate": 2.98403371320038e-06, + "loss": 0.0167, + "step": 127240 + }, + { + "epoch": 0.9406138198160906, + "grad_norm": 0.10076846182346344, + "learning_rate": 2.9803240740740743e-06, + "loss": 0.0204, + "step": 127250 + }, + { + "epoch": 0.9406877383873925, + "grad_norm": 0.08315786719322205, + "learning_rate": 2.9766144349477685e-06, + "loss": 0.0152, + "step": 127260 + }, + { + "epoch": 0.9407616569586943, + "grad_norm": 0.07295054942369461, + "learning_rate": 2.9729047958214624e-06, + "loss": 0.0159, + "step": 127270 + }, + { + "epoch": 0.9408355755299962, + "grad_norm": 0.07902075350284576, + "learning_rate": 2.9691951566951566e-06, + "loss": 0.0185, + "step": 127280 + }, + { + "epoch": 0.940909494101298, + "grad_norm": 0.08938170969486237, + "learning_rate": 2.965485517568851e-06, + "loss": 0.0179, + "step": 127290 + }, + { + "epoch": 0.9409834126725999, + "grad_norm": 0.07118421047925949, + "learning_rate": 2.9617758784425456e-06, + "loss": 0.0178, + "step": 127300 + }, + { + "epoch": 0.9410573312439017, + "grad_norm": 0.09154824167490005, + "learning_rate": 2.9580662393162395e-06, + "loss": 0.018, + "step": 127310 + }, + { + "epoch": 0.9411312498152036, + "grad_norm": 0.058051031082868576, + "learning_rate": 2.9543566001899337e-06, + "loss": 0.0185, + "step": 127320 + }, + { + "epoch": 0.9412051683865055, + "grad_norm": 0.058608926832675934, + "learning_rate": 2.950646961063628e-06, + "loss": 0.0167, + "step": 127330 + }, + { + "epoch": 0.9412790869578073, + "grad_norm": 0.08225753158330917, + "learning_rate": 2.9469373219373223e-06, + "loss": 0.0158, + "step": 127340 + }, + { + "epoch": 0.9413530055291092, + "grad_norm": 0.06874485313892365, + "learning_rate": 2.943227682811016e-06, + "loss": 0.018, + "step": 127350 + }, + { + "epoch": 0.941426924100411, + "grad_norm": 0.09509554505348206, + "learning_rate": 2.9395180436847104e-06, + "loss": 0.0172, + "step": 127360 + }, + { + "epoch": 0.9415008426717129, + "grad_norm": 0.06457925587892532, + "learning_rate": 2.9358084045584047e-06, + "loss": 0.0183, + "step": 127370 + }, + { + "epoch": 0.9415747612430146, + "grad_norm": 0.09607307612895966, + "learning_rate": 2.932098765432099e-06, + "loss": 0.0164, + "step": 127380 + }, + { + "epoch": 0.9416486798143165, + "grad_norm": 0.08715007454156876, + "learning_rate": 2.9283891263057932e-06, + "loss": 0.0181, + "step": 127390 + }, + { + "epoch": 0.9417225983856184, + "grad_norm": 0.10236938297748566, + "learning_rate": 2.9246794871794875e-06, + "loss": 0.0172, + "step": 127400 + }, + { + "epoch": 0.9417965169569202, + "grad_norm": 0.11540023237466812, + "learning_rate": 2.9209698480531818e-06, + "loss": 0.0185, + "step": 127410 + }, + { + "epoch": 0.9418704355282221, + "grad_norm": 0.06768453121185303, + "learning_rate": 2.9172602089268756e-06, + "loss": 0.0185, + "step": 127420 + }, + { + "epoch": 0.9419443540995239, + "grad_norm": 0.05372827127575874, + "learning_rate": 2.91355056980057e-06, + "loss": 0.0144, + "step": 127430 + }, + { + "epoch": 0.9420182726708258, + "grad_norm": 0.05831387639045715, + "learning_rate": 2.909840930674264e-06, + "loss": 0.0154, + "step": 127440 + }, + { + "epoch": 0.9420921912421276, + "grad_norm": 0.0981128066778183, + "learning_rate": 2.9061312915479584e-06, + "loss": 0.0166, + "step": 127450 + }, + { + "epoch": 0.9421661098134295, + "grad_norm": 
0.07359275966882706, + "learning_rate": 2.9024216524216523e-06, + "loss": 0.0171, + "step": 127460 + }, + { + "epoch": 0.9422400283847314, + "grad_norm": 0.06113864481449127, + "learning_rate": 2.8987120132953466e-06, + "loss": 0.0167, + "step": 127470 + }, + { + "epoch": 0.9423139469560332, + "grad_norm": 0.053697239607572556, + "learning_rate": 2.895002374169041e-06, + "loss": 0.015, + "step": 127480 + }, + { + "epoch": 0.9423878655273351, + "grad_norm": 0.07835987955331802, + "learning_rate": 2.891292735042735e-06, + "loss": 0.0156, + "step": 127490 + }, + { + "epoch": 0.9424617840986369, + "grad_norm": 0.0714726373553276, + "learning_rate": 2.8875830959164294e-06, + "loss": 0.0173, + "step": 127500 + }, + { + "epoch": 0.9425357026699388, + "grad_norm": 0.08675273507833481, + "learning_rate": 2.8838734567901237e-06, + "loss": 0.0148, + "step": 127510 + }, + { + "epoch": 0.9426096212412407, + "grad_norm": 0.05766710638999939, + "learning_rate": 2.880163817663818e-06, + "loss": 0.0156, + "step": 127520 + }, + { + "epoch": 0.9426835398125425, + "grad_norm": 0.07460221648216248, + "learning_rate": 2.876454178537512e-06, + "loss": 0.0168, + "step": 127530 + }, + { + "epoch": 0.9427574583838444, + "grad_norm": 0.0762360617518425, + "learning_rate": 2.872744539411206e-06, + "loss": 0.0157, + "step": 127540 + }, + { + "epoch": 0.9428313769551462, + "grad_norm": 0.06541568040847778, + "learning_rate": 2.8690349002849003e-06, + "loss": 0.0171, + "step": 127550 + }, + { + "epoch": 0.9429052955264481, + "grad_norm": 0.07995308935642242, + "learning_rate": 2.8653252611585946e-06, + "loss": 0.0171, + "step": 127560 + }, + { + "epoch": 0.9429792140977499, + "grad_norm": 0.08494476228952408, + "learning_rate": 2.861615622032289e-06, + "loss": 0.017, + "step": 127570 + }, + { + "epoch": 0.9430531326690518, + "grad_norm": 0.05350199714303017, + "learning_rate": 2.857905982905983e-06, + "loss": 0.0174, + "step": 127580 + }, + { + "epoch": 0.9431270512403537, + "grad_norm": 0.08649630844593048, + "learning_rate": 2.8541963437796774e-06, + "loss": 0.0187, + "step": 127590 + }, + { + "epoch": 0.9432009698116555, + "grad_norm": 0.07305306941270828, + "learning_rate": 2.8504867046533717e-06, + "loss": 0.0143, + "step": 127600 + }, + { + "epoch": 0.9432748883829574, + "grad_norm": 0.075900137424469, + "learning_rate": 2.8467770655270655e-06, + "loss": 0.0157, + "step": 127610 + }, + { + "epoch": 0.9433488069542592, + "grad_norm": 0.05308043584227562, + "learning_rate": 2.84306742640076e-06, + "loss": 0.0152, + "step": 127620 + }, + { + "epoch": 0.9434227255255611, + "grad_norm": 0.08873714506626129, + "learning_rate": 2.839357787274454e-06, + "loss": 0.0179, + "step": 127630 + }, + { + "epoch": 0.9434966440968628, + "grad_norm": 0.08229392766952515, + "learning_rate": 2.8356481481481484e-06, + "loss": 0.019, + "step": 127640 + }, + { + "epoch": 0.9435705626681647, + "grad_norm": 0.09823070466518402, + "learning_rate": 2.831938509021842e-06, + "loss": 0.0156, + "step": 127650 + }, + { + "epoch": 0.9436444812394666, + "grad_norm": 0.07842585444450378, + "learning_rate": 2.8282288698955365e-06, + "loss": 0.0167, + "step": 127660 + }, + { + "epoch": 0.9437183998107684, + "grad_norm": 0.08402416855096817, + "learning_rate": 2.8245192307692307e-06, + "loss": 0.0174, + "step": 127670 + }, + { + "epoch": 0.9437923183820703, + "grad_norm": 0.07629430294036865, + "learning_rate": 2.820809591642925e-06, + "loss": 0.0154, + "step": 127680 + }, + { + "epoch": 0.9438662369533721, + "grad_norm": 0.10126028209924698, + 
"learning_rate": 2.8170999525166193e-06, + "loss": 0.0165, + "step": 127690 + }, + { + "epoch": 0.943940155524674, + "grad_norm": 0.10771343111991882, + "learning_rate": 2.8133903133903136e-06, + "loss": 0.0172, + "step": 127700 + }, + { + "epoch": 0.9440140740959758, + "grad_norm": 0.11257972568273544, + "learning_rate": 2.809680674264008e-06, + "loss": 0.017, + "step": 127710 + }, + { + "epoch": 0.9440879926672777, + "grad_norm": 0.06903264671564102, + "learning_rate": 2.805971035137702e-06, + "loss": 0.0179, + "step": 127720 + }, + { + "epoch": 0.9441619112385796, + "grad_norm": 0.07695875316858292, + "learning_rate": 2.802261396011396e-06, + "loss": 0.0153, + "step": 127730 + }, + { + "epoch": 0.9442358298098814, + "grad_norm": 0.06173216924071312, + "learning_rate": 2.7985517568850902e-06, + "loss": 0.0183, + "step": 127740 + }, + { + "epoch": 0.9443097483811833, + "grad_norm": 0.06688038259744644, + "learning_rate": 2.7948421177587845e-06, + "loss": 0.0167, + "step": 127750 + }, + { + "epoch": 0.9443836669524851, + "grad_norm": 0.08888844400644302, + "learning_rate": 2.7911324786324788e-06, + "loss": 0.0177, + "step": 127760 + }, + { + "epoch": 0.944457585523787, + "grad_norm": 0.106315016746521, + "learning_rate": 2.787422839506173e-06, + "loss": 0.0185, + "step": 127770 + }, + { + "epoch": 0.9445315040950889, + "grad_norm": 0.08033721894025803, + "learning_rate": 2.7837132003798673e-06, + "loss": 0.0162, + "step": 127780 + }, + { + "epoch": 0.9446054226663907, + "grad_norm": 0.0840701088309288, + "learning_rate": 2.7800035612535616e-06, + "loss": 0.0197, + "step": 127790 + }, + { + "epoch": 0.9446793412376926, + "grad_norm": 0.060849256813526154, + "learning_rate": 2.776293922127256e-06, + "loss": 0.0177, + "step": 127800 + }, + { + "epoch": 0.9447532598089944, + "grad_norm": 0.08838897198438644, + "learning_rate": 2.7725842830009497e-06, + "loss": 0.0153, + "step": 127810 + }, + { + "epoch": 0.9448271783802963, + "grad_norm": 0.06309714913368225, + "learning_rate": 2.768874643874644e-06, + "loss": 0.0161, + "step": 127820 + }, + { + "epoch": 0.9449010969515981, + "grad_norm": 0.0807114914059639, + "learning_rate": 2.7651650047483383e-06, + "loss": 0.0151, + "step": 127830 + }, + { + "epoch": 0.9449750155229, + "grad_norm": 0.06764519959688187, + "learning_rate": 2.761455365622032e-06, + "loss": 0.0162, + "step": 127840 + }, + { + "epoch": 0.9450489340942019, + "grad_norm": 0.09021104127168655, + "learning_rate": 2.7577457264957264e-06, + "loss": 0.0171, + "step": 127850 + }, + { + "epoch": 0.9451228526655037, + "grad_norm": 0.06262607127428055, + "learning_rate": 2.7540360873694207e-06, + "loss": 0.0194, + "step": 127860 + }, + { + "epoch": 0.9451967712368056, + "grad_norm": 0.08771193772554398, + "learning_rate": 2.750326448243115e-06, + "loss": 0.0138, + "step": 127870 + }, + { + "epoch": 0.9452706898081074, + "grad_norm": 0.10063749551773071, + "learning_rate": 2.746616809116809e-06, + "loss": 0.0216, + "step": 127880 + }, + { + "epoch": 0.9453446083794093, + "grad_norm": 0.07008014619350433, + "learning_rate": 2.7429071699905035e-06, + "loss": 0.016, + "step": 127890 + }, + { + "epoch": 0.945418526950711, + "grad_norm": 0.06852864474058151, + "learning_rate": 2.7391975308641978e-06, + "loss": 0.0171, + "step": 127900 + }, + { + "epoch": 0.945492445522013, + "grad_norm": 0.056721948087215424, + "learning_rate": 2.735487891737892e-06, + "loss": 0.0145, + "step": 127910 + }, + { + "epoch": 0.9455663640933148, + "grad_norm": 0.06805742532014847, + "learning_rate": 
2.731778252611586e-06, + "loss": 0.0166, + "step": 127920 + }, + { + "epoch": 0.9456402826646166, + "grad_norm": 0.06866194307804108, + "learning_rate": 2.72806861348528e-06, + "loss": 0.0179, + "step": 127930 + }, + { + "epoch": 0.9457142012359185, + "grad_norm": 0.12696869671344757, + "learning_rate": 2.7243589743589744e-06, + "loss": 0.0174, + "step": 127940 + }, + { + "epoch": 0.9457881198072203, + "grad_norm": 0.05472124367952347, + "learning_rate": 2.7206493352326687e-06, + "loss": 0.014, + "step": 127950 + }, + { + "epoch": 0.9458620383785222, + "grad_norm": 0.08839566260576248, + "learning_rate": 2.716939696106363e-06, + "loss": 0.0159, + "step": 127960 + }, + { + "epoch": 0.945935956949824, + "grad_norm": 0.09860622137784958, + "learning_rate": 2.7132300569800572e-06, + "loss": 0.02, + "step": 127970 + }, + { + "epoch": 0.9460098755211259, + "grad_norm": 0.07679364830255508, + "learning_rate": 2.7095204178537515e-06, + "loss": 0.0189, + "step": 127980 + }, + { + "epoch": 0.9460837940924278, + "grad_norm": 0.07669613510370255, + "learning_rate": 2.705810778727446e-06, + "loss": 0.0168, + "step": 127990 + }, + { + "epoch": 0.9461577126637296, + "grad_norm": 0.13627739250659943, + "learning_rate": 2.7021011396011396e-06, + "loss": 0.0178, + "step": 128000 + }, + { + "epoch": 0.9462316312350315, + "grad_norm": 0.09532894939184189, + "learning_rate": 2.698391500474834e-06, + "loss": 0.0186, + "step": 128010 + }, + { + "epoch": 0.9463055498063333, + "grad_norm": 0.07109887897968292, + "learning_rate": 2.694681861348528e-06, + "loss": 0.0175, + "step": 128020 + }, + { + "epoch": 0.9463794683776352, + "grad_norm": 0.08016806095838547, + "learning_rate": 2.6909722222222225e-06, + "loss": 0.0161, + "step": 128030 + }, + { + "epoch": 0.9464533869489371, + "grad_norm": 0.10615555942058563, + "learning_rate": 2.6872625830959163e-06, + "loss": 0.0143, + "step": 128040 + }, + { + "epoch": 0.9465273055202389, + "grad_norm": 0.0732378363609314, + "learning_rate": 2.6835529439696106e-06, + "loss": 0.0156, + "step": 128050 + }, + { + "epoch": 0.9466012240915408, + "grad_norm": 0.06836659461259842, + "learning_rate": 2.679843304843305e-06, + "loss": 0.0177, + "step": 128060 + }, + { + "epoch": 0.9466751426628426, + "grad_norm": 0.06310658156871796, + "learning_rate": 2.6761336657169995e-06, + "loss": 0.0178, + "step": 128070 + }, + { + "epoch": 0.9467490612341445, + "grad_norm": 0.07476349174976349, + "learning_rate": 2.6724240265906934e-06, + "loss": 0.0153, + "step": 128080 + }, + { + "epoch": 0.9468229798054463, + "grad_norm": 0.061532557010650635, + "learning_rate": 2.6687143874643877e-06, + "loss": 0.0154, + "step": 128090 + }, + { + "epoch": 0.9468968983767482, + "grad_norm": 0.07229252904653549, + "learning_rate": 2.665004748338082e-06, + "loss": 0.0164, + "step": 128100 + }, + { + "epoch": 0.9469708169480501, + "grad_norm": 0.09586562216281891, + "learning_rate": 2.661295109211776e-06, + "loss": 0.0187, + "step": 128110 + }, + { + "epoch": 0.9470447355193519, + "grad_norm": 0.06675871461629868, + "learning_rate": 2.65758547008547e-06, + "loss": 0.0172, + "step": 128120 + }, + { + "epoch": 0.9471186540906538, + "grad_norm": 0.05785641446709633, + "learning_rate": 2.6538758309591643e-06, + "loss": 0.0165, + "step": 128130 + }, + { + "epoch": 0.9471925726619556, + "grad_norm": 0.08318644762039185, + "learning_rate": 2.6501661918328586e-06, + "loss": 0.0164, + "step": 128140 + }, + { + "epoch": 0.9472664912332575, + "grad_norm": 0.08673836290836334, + "learning_rate": 2.646456552706553e-06, + 
"loss": 0.0183, + "step": 128150 + }, + { + "epoch": 0.9473404098045592, + "grad_norm": 0.07311297208070755, + "learning_rate": 2.642746913580247e-06, + "loss": 0.0173, + "step": 128160 + }, + { + "epoch": 0.9474143283758611, + "grad_norm": 0.09003617614507675, + "learning_rate": 2.6390372744539414e-06, + "loss": 0.0186, + "step": 128170 + }, + { + "epoch": 0.947488246947163, + "grad_norm": 0.059183746576309204, + "learning_rate": 2.6353276353276357e-06, + "loss": 0.0159, + "step": 128180 + }, + { + "epoch": 0.9475621655184648, + "grad_norm": 0.0824420377612114, + "learning_rate": 2.6316179962013296e-06, + "loss": 0.017, + "step": 128190 + }, + { + "epoch": 0.9476360840897667, + "grad_norm": 0.07933705300092697, + "learning_rate": 2.627908357075024e-06, + "loss": 0.0156, + "step": 128200 + }, + { + "epoch": 0.9477100026610685, + "grad_norm": 0.09012126922607422, + "learning_rate": 2.624198717948718e-06, + "loss": 0.02, + "step": 128210 + }, + { + "epoch": 0.9477839212323704, + "grad_norm": 0.0675535500049591, + "learning_rate": 2.6204890788224124e-06, + "loss": 0.0142, + "step": 128220 + }, + { + "epoch": 0.9478578398036722, + "grad_norm": 0.08034668117761612, + "learning_rate": 2.6167794396961062e-06, + "loss": 0.0162, + "step": 128230 + }, + { + "epoch": 0.9479317583749741, + "grad_norm": 0.06311357021331787, + "learning_rate": 2.6130698005698005e-06, + "loss": 0.0191, + "step": 128240 + }, + { + "epoch": 0.948005676946276, + "grad_norm": 0.0800691470503807, + "learning_rate": 2.6093601614434948e-06, + "loss": 0.0148, + "step": 128250 + }, + { + "epoch": 0.9480795955175778, + "grad_norm": 0.10849236696958542, + "learning_rate": 2.6056505223171895e-06, + "loss": 0.0168, + "step": 128260 + }, + { + "epoch": 0.9481535140888797, + "grad_norm": 0.08853597939014435, + "learning_rate": 2.6019408831908833e-06, + "loss": 0.0182, + "step": 128270 + }, + { + "epoch": 0.9482274326601815, + "grad_norm": 0.09330402314662933, + "learning_rate": 2.5982312440645776e-06, + "loss": 0.0138, + "step": 128280 + }, + { + "epoch": 0.9483013512314834, + "grad_norm": 0.06071847304701805, + "learning_rate": 2.594521604938272e-06, + "loss": 0.0162, + "step": 128290 + }, + { + "epoch": 0.9483752698027853, + "grad_norm": 0.07208188623189926, + "learning_rate": 2.590811965811966e-06, + "loss": 0.0166, + "step": 128300 + }, + { + "epoch": 0.9484491883740871, + "grad_norm": 0.0893167182803154, + "learning_rate": 2.58710232668566e-06, + "loss": 0.0155, + "step": 128310 + }, + { + "epoch": 0.948523106945389, + "grad_norm": 0.07082314789295197, + "learning_rate": 2.5833926875593543e-06, + "loss": 0.0176, + "step": 128320 + }, + { + "epoch": 0.9485970255166908, + "grad_norm": 0.08515604585409164, + "learning_rate": 2.5796830484330485e-06, + "loss": 0.0191, + "step": 128330 + }, + { + "epoch": 0.9486709440879927, + "grad_norm": 0.11537264287471771, + "learning_rate": 2.575973409306743e-06, + "loss": 0.017, + "step": 128340 + }, + { + "epoch": 0.9487448626592945, + "grad_norm": 0.07748724520206451, + "learning_rate": 2.572263770180437e-06, + "loss": 0.02, + "step": 128350 + }, + { + "epoch": 0.9488187812305964, + "grad_norm": 0.06140293553471565, + "learning_rate": 2.5685541310541313e-06, + "loss": 0.0162, + "step": 128360 + }, + { + "epoch": 0.9488926998018983, + "grad_norm": 0.08523867279291153, + "learning_rate": 2.5648444919278256e-06, + "loss": 0.0177, + "step": 128370 + }, + { + "epoch": 0.9489666183732001, + "grad_norm": 0.06785237044095993, + "learning_rate": 2.5611348528015195e-06, + "loss": 0.0171, + "step": 128380 
+ }, + { + "epoch": 0.949040536944502, + "grad_norm": 0.07030639052391052, + "learning_rate": 2.5574252136752137e-06, + "loss": 0.0174, + "step": 128390 + }, + { + "epoch": 0.9491144555158038, + "grad_norm": 0.06864111125469208, + "learning_rate": 2.553715574548908e-06, + "loss": 0.0193, + "step": 128400 + }, + { + "epoch": 0.9491883740871057, + "grad_norm": 0.09462801367044449, + "learning_rate": 2.5500059354226023e-06, + "loss": 0.0157, + "step": 128410 + }, + { + "epoch": 0.9492622926584074, + "grad_norm": 0.08532844483852386, + "learning_rate": 2.546296296296296e-06, + "loss": 0.0181, + "step": 128420 + }, + { + "epoch": 0.9493362112297093, + "grad_norm": 0.0970272496342659, + "learning_rate": 2.5425866571699904e-06, + "loss": 0.0181, + "step": 128430 + }, + { + "epoch": 0.9494101298010112, + "grad_norm": 0.07665450870990753, + "learning_rate": 2.5388770180436847e-06, + "loss": 0.0179, + "step": 128440 + }, + { + "epoch": 0.949484048372313, + "grad_norm": 0.07551456242799759, + "learning_rate": 2.5351673789173794e-06, + "loss": 0.0178, + "step": 128450 + }, + { + "epoch": 0.9495579669436149, + "grad_norm": 0.0852522924542427, + "learning_rate": 2.5314577397910732e-06, + "loss": 0.0189, + "step": 128460 + }, + { + "epoch": 0.9496318855149167, + "grad_norm": 0.07986302673816681, + "learning_rate": 2.5277481006647675e-06, + "loss": 0.0162, + "step": 128470 + }, + { + "epoch": 0.9497058040862186, + "grad_norm": 0.06319686025381088, + "learning_rate": 2.5240384615384618e-06, + "loss": 0.0168, + "step": 128480 + }, + { + "epoch": 0.9497797226575205, + "grad_norm": 0.09003529697656631, + "learning_rate": 2.520328822412156e-06, + "loss": 0.0172, + "step": 128490 + }, + { + "epoch": 0.9498536412288223, + "grad_norm": 0.08166031539440155, + "learning_rate": 2.51661918328585e-06, + "loss": 0.0157, + "step": 128500 + }, + { + "epoch": 0.9499275598001242, + "grad_norm": 0.07876944541931152, + "learning_rate": 2.512909544159544e-06, + "loss": 0.0153, + "step": 128510 + }, + { + "epoch": 0.950001478371426, + "grad_norm": 0.08147630095481873, + "learning_rate": 2.5091999050332384e-06, + "loss": 0.0162, + "step": 128520 + }, + { + "epoch": 0.9500753969427279, + "grad_norm": 0.07436820864677429, + "learning_rate": 2.5054902659069327e-06, + "loss": 0.0158, + "step": 128530 + }, + { + "epoch": 0.9501493155140297, + "grad_norm": 0.07813509553670883, + "learning_rate": 2.501780626780627e-06, + "loss": 0.0162, + "step": 128540 + }, + { + "epoch": 0.9502232340853316, + "grad_norm": 0.07872185111045837, + "learning_rate": 2.4980709876543213e-06, + "loss": 0.0147, + "step": 128550 + }, + { + "epoch": 0.9502971526566335, + "grad_norm": 0.09251312166452408, + "learning_rate": 2.4943613485280155e-06, + "loss": 0.0183, + "step": 128560 + }, + { + "epoch": 0.9503710712279353, + "grad_norm": 0.08193115890026093, + "learning_rate": 2.49065170940171e-06, + "loss": 0.0152, + "step": 128570 + }, + { + "epoch": 0.9504449897992372, + "grad_norm": 0.07224851846694946, + "learning_rate": 2.4869420702754037e-06, + "loss": 0.0176, + "step": 128580 + }, + { + "epoch": 0.950518908370539, + "grad_norm": 0.06449359655380249, + "learning_rate": 2.483232431149098e-06, + "loss": 0.016, + "step": 128590 + }, + { + "epoch": 0.9505928269418409, + "grad_norm": 0.06900657713413239, + "learning_rate": 2.479522792022792e-06, + "loss": 0.0182, + "step": 128600 + }, + { + "epoch": 0.9506667455131427, + "grad_norm": 0.08904869109392166, + "learning_rate": 2.475813152896486e-06, + "loss": 0.0172, + "step": 128610 + }, + { + "epoch": 
0.9507406640844446, + "grad_norm": 0.07366520911455154, + "learning_rate": 2.4721035137701803e-06, + "loss": 0.0165, + "step": 128620 + }, + { + "epoch": 0.9508145826557465, + "grad_norm": 0.06534479558467865, + "learning_rate": 2.4683938746438746e-06, + "loss": 0.0184, + "step": 128630 + }, + { + "epoch": 0.9508885012270483, + "grad_norm": 0.08039779961109161, + "learning_rate": 2.464684235517569e-06, + "loss": 0.0154, + "step": 128640 + }, + { + "epoch": 0.9509624197983502, + "grad_norm": 0.08867663890123367, + "learning_rate": 2.460974596391263e-06, + "loss": 0.0169, + "step": 128650 + }, + { + "epoch": 0.951036338369652, + "grad_norm": 0.0995791032910347, + "learning_rate": 2.4572649572649574e-06, + "loss": 0.0157, + "step": 128660 + }, + { + "epoch": 0.9511102569409539, + "grad_norm": 0.07323886454105377, + "learning_rate": 2.4535553181386517e-06, + "loss": 0.0161, + "step": 128670 + }, + { + "epoch": 0.9511841755122556, + "grad_norm": 0.06839929521083832, + "learning_rate": 2.449845679012346e-06, + "loss": 0.0166, + "step": 128680 + }, + { + "epoch": 0.9512580940835575, + "grad_norm": 0.06438833475112915, + "learning_rate": 2.44613603988604e-06, + "loss": 0.0145, + "step": 128690 + }, + { + "epoch": 0.9513320126548594, + "grad_norm": 0.07878058403730392, + "learning_rate": 2.442426400759734e-06, + "loss": 0.0159, + "step": 128700 + }, + { + "epoch": 0.9514059312261612, + "grad_norm": 0.06943828612565994, + "learning_rate": 2.4387167616334284e-06, + "loss": 0.0176, + "step": 128710 + }, + { + "epoch": 0.9514798497974631, + "grad_norm": 0.08369556069374084, + "learning_rate": 2.4350071225071226e-06, + "loss": 0.0164, + "step": 128720 + }, + { + "epoch": 0.9515537683687649, + "grad_norm": 0.10918588936328888, + "learning_rate": 2.431297483380817e-06, + "loss": 0.0171, + "step": 128730 + }, + { + "epoch": 0.9516276869400668, + "grad_norm": 0.06301585584878922, + "learning_rate": 2.427587844254511e-06, + "loss": 0.0156, + "step": 128740 + }, + { + "epoch": 0.9517016055113687, + "grad_norm": 0.08701598644256592, + "learning_rate": 2.4238782051282054e-06, + "loss": 0.0164, + "step": 128750 + }, + { + "epoch": 0.9517755240826705, + "grad_norm": 0.09200041741132736, + "learning_rate": 2.4201685660018997e-06, + "loss": 0.0186, + "step": 128760 + }, + { + "epoch": 0.9518494426539724, + "grad_norm": 0.07510281354188919, + "learning_rate": 2.4164589268755936e-06, + "loss": 0.018, + "step": 128770 + }, + { + "epoch": 0.9519233612252742, + "grad_norm": 0.07271352410316467, + "learning_rate": 2.412749287749288e-06, + "loss": 0.013, + "step": 128780 + }, + { + "epoch": 0.9519972797965761, + "grad_norm": 0.09494227916002274, + "learning_rate": 2.409039648622982e-06, + "loss": 0.0191, + "step": 128790 + }, + { + "epoch": 0.9520711983678779, + "grad_norm": 0.07326217740774155, + "learning_rate": 2.4053300094966764e-06, + "loss": 0.0157, + "step": 128800 + }, + { + "epoch": 0.9521451169391798, + "grad_norm": 0.08859758824110031, + "learning_rate": 2.4016203703703702e-06, + "loss": 0.0159, + "step": 128810 + }, + { + "epoch": 0.9522190355104817, + "grad_norm": 0.06138451024889946, + "learning_rate": 2.3979107312440645e-06, + "loss": 0.0147, + "step": 128820 + }, + { + "epoch": 0.9522929540817835, + "grad_norm": 0.09262833744287491, + "learning_rate": 2.3942010921177588e-06, + "loss": 0.0164, + "step": 128830 + }, + { + "epoch": 0.9523668726530854, + "grad_norm": 0.06121642515063286, + "learning_rate": 2.390491452991453e-06, + "loss": 0.016, + "step": 128840 + }, + { + "epoch": 0.9524407912243872, + 
"grad_norm": 0.055575739592313766, + "learning_rate": 2.3867818138651473e-06, + "loss": 0.0149, + "step": 128850 + }, + { + "epoch": 0.9525147097956891, + "grad_norm": 0.06918497383594513, + "learning_rate": 2.3830721747388416e-06, + "loss": 0.0144, + "step": 128860 + }, + { + "epoch": 0.9525886283669909, + "grad_norm": 0.12187926471233368, + "learning_rate": 2.379362535612536e-06, + "loss": 0.0206, + "step": 128870 + }, + { + "epoch": 0.9526625469382928, + "grad_norm": 0.054340194910764694, + "learning_rate": 2.3756528964862297e-06, + "loss": 0.0168, + "step": 128880 + }, + { + "epoch": 0.9527364655095947, + "grad_norm": 0.07291319966316223, + "learning_rate": 2.371943257359924e-06, + "loss": 0.0182, + "step": 128890 + }, + { + "epoch": 0.9528103840808965, + "grad_norm": 0.05469479411840439, + "learning_rate": 2.3682336182336183e-06, + "loss": 0.0156, + "step": 128900 + }, + { + "epoch": 0.9528843026521984, + "grad_norm": 0.07560256868600845, + "learning_rate": 2.3645239791073125e-06, + "loss": 0.0154, + "step": 128910 + }, + { + "epoch": 0.9529582212235002, + "grad_norm": 0.08988160640001297, + "learning_rate": 2.360814339981007e-06, + "loss": 0.017, + "step": 128920 + }, + { + "epoch": 0.9530321397948021, + "grad_norm": 0.08384236693382263, + "learning_rate": 2.357104700854701e-06, + "loss": 0.0198, + "step": 128930 + }, + { + "epoch": 0.9531060583661038, + "grad_norm": 0.09252948313951492, + "learning_rate": 2.3533950617283954e-06, + "loss": 0.0167, + "step": 128940 + }, + { + "epoch": 0.9531799769374057, + "grad_norm": 0.10098661482334137, + "learning_rate": 2.3496854226020896e-06, + "loss": 0.0164, + "step": 128950 + }, + { + "epoch": 0.9532538955087076, + "grad_norm": 0.07959005981683731, + "learning_rate": 2.3459757834757835e-06, + "loss": 0.0164, + "step": 128960 + }, + { + "epoch": 0.9533278140800094, + "grad_norm": 0.07650546729564667, + "learning_rate": 2.3422661443494778e-06, + "loss": 0.0177, + "step": 128970 + }, + { + "epoch": 0.9534017326513113, + "grad_norm": 0.07977327704429626, + "learning_rate": 2.338556505223172e-06, + "loss": 0.0167, + "step": 128980 + }, + { + "epoch": 0.9534756512226131, + "grad_norm": 0.09248465299606323, + "learning_rate": 2.3348468660968663e-06, + "loss": 0.019, + "step": 128990 + }, + { + "epoch": 0.953549569793915, + "grad_norm": 0.06973160803318024, + "learning_rate": 2.33113722697056e-06, + "loss": 0.0169, + "step": 129000 + }, + { + "epoch": 0.9536234883652169, + "grad_norm": 0.0796571671962738, + "learning_rate": 2.3274275878442544e-06, + "loss": 0.015, + "step": 129010 + }, + { + "epoch": 0.9536974069365187, + "grad_norm": 0.07682634890079498, + "learning_rate": 2.3237179487179487e-06, + "loss": 0.0152, + "step": 129020 + }, + { + "epoch": 0.9537713255078206, + "grad_norm": 0.06715840101242065, + "learning_rate": 2.3200083095916434e-06, + "loss": 0.0136, + "step": 129030 + }, + { + "epoch": 0.9538452440791224, + "grad_norm": 0.06069952994585037, + "learning_rate": 2.3162986704653372e-06, + "loss": 0.017, + "step": 129040 + }, + { + "epoch": 0.9539191626504243, + "grad_norm": 0.08695065975189209, + "learning_rate": 2.3125890313390315e-06, + "loss": 0.0189, + "step": 129050 + }, + { + "epoch": 0.9539930812217261, + "grad_norm": 0.08549745380878448, + "learning_rate": 2.308879392212726e-06, + "loss": 0.0178, + "step": 129060 + }, + { + "epoch": 0.954066999793028, + "grad_norm": 0.0622650645673275, + "learning_rate": 2.3051697530864196e-06, + "loss": 0.0157, + "step": 129070 + }, + { + "epoch": 0.9541409183643299, + "grad_norm": 
0.057839758694171906, + "learning_rate": 2.301460113960114e-06, + "loss": 0.0168, + "step": 129080 + }, + { + "epoch": 0.9542148369356317, + "grad_norm": 0.10336287319660187, + "learning_rate": 2.297750474833808e-06, + "loss": 0.0202, + "step": 129090 + }, + { + "epoch": 0.9542887555069336, + "grad_norm": 0.10132849961519241, + "learning_rate": 2.2940408357075025e-06, + "loss": 0.019, + "step": 129100 + }, + { + "epoch": 0.9543626740782354, + "grad_norm": 0.0783289447426796, + "learning_rate": 2.2903311965811967e-06, + "loss": 0.0167, + "step": 129110 + }, + { + "epoch": 0.9544365926495373, + "grad_norm": 0.07380151748657227, + "learning_rate": 2.286621557454891e-06, + "loss": 0.0179, + "step": 129120 + }, + { + "epoch": 0.9545105112208391, + "grad_norm": 0.06636784225702286, + "learning_rate": 2.2829119183285853e-06, + "loss": 0.0182, + "step": 129130 + }, + { + "epoch": 0.954584429792141, + "grad_norm": 0.07031899690628052, + "learning_rate": 2.2792022792022796e-06, + "loss": 0.018, + "step": 129140 + }, + { + "epoch": 0.9546583483634429, + "grad_norm": 0.08217660337686539, + "learning_rate": 2.2754926400759734e-06, + "loss": 0.0181, + "step": 129150 + }, + { + "epoch": 0.9547322669347447, + "grad_norm": 0.08827585726976395, + "learning_rate": 2.2717830009496677e-06, + "loss": 0.0161, + "step": 129160 + }, + { + "epoch": 0.9548061855060466, + "grad_norm": 0.07113460451364517, + "learning_rate": 2.268073361823362e-06, + "loss": 0.0165, + "step": 129170 + }, + { + "epoch": 0.9548801040773484, + "grad_norm": 0.09373348206281662, + "learning_rate": 2.2643637226970562e-06, + "loss": 0.0162, + "step": 129180 + }, + { + "epoch": 0.9549540226486503, + "grad_norm": 0.07932069897651672, + "learning_rate": 2.26065408357075e-06, + "loss": 0.0176, + "step": 129190 + }, + { + "epoch": 0.955027941219952, + "grad_norm": 0.06112619861960411, + "learning_rate": 2.2569444444444443e-06, + "loss": 0.0152, + "step": 129200 + }, + { + "epoch": 0.955101859791254, + "grad_norm": 0.07475372403860092, + "learning_rate": 2.2532348053181386e-06, + "loss": 0.0149, + "step": 129210 + }, + { + "epoch": 0.9551757783625558, + "grad_norm": 0.09839701652526855, + "learning_rate": 2.2495251661918333e-06, + "loss": 0.0179, + "step": 129220 + }, + { + "epoch": 0.9552496969338576, + "grad_norm": 0.08877590298652649, + "learning_rate": 2.245815527065527e-06, + "loss": 0.0172, + "step": 129230 + }, + { + "epoch": 0.9553236155051595, + "grad_norm": 0.09224338829517365, + "learning_rate": 2.2421058879392214e-06, + "loss": 0.0176, + "step": 129240 + }, + { + "epoch": 0.9553975340764613, + "grad_norm": 0.07588019222021103, + "learning_rate": 2.2383962488129157e-06, + "loss": 0.0174, + "step": 129250 + }, + { + "epoch": 0.9554714526477632, + "grad_norm": 0.06307175010442734, + "learning_rate": 2.23468660968661e-06, + "loss": 0.0171, + "step": 129260 + }, + { + "epoch": 0.9555453712190651, + "grad_norm": 0.1095893457531929, + "learning_rate": 2.230976970560304e-06, + "loss": 0.0171, + "step": 129270 + }, + { + "epoch": 0.9556192897903669, + "grad_norm": 0.07829178124666214, + "learning_rate": 2.227267331433998e-06, + "loss": 0.016, + "step": 129280 + }, + { + "epoch": 0.9556932083616688, + "grad_norm": 0.09236520528793335, + "learning_rate": 2.2235576923076924e-06, + "loss": 0.0157, + "step": 129290 + }, + { + "epoch": 0.9557671269329706, + "grad_norm": 0.09746488928794861, + "learning_rate": 2.2198480531813866e-06, + "loss": 0.0202, + "step": 129300 + }, + { + "epoch": 0.9558410455042725, + "grad_norm": 0.09018874168395996, + 
"learning_rate": 2.216138414055081e-06, + "loss": 0.0164, + "step": 129310 + }, + { + "epoch": 0.9559149640755743, + "grad_norm": 0.06953372806310654, + "learning_rate": 2.212428774928775e-06, + "loss": 0.0175, + "step": 129320 + }, + { + "epoch": 0.9559888826468762, + "grad_norm": 0.08356118947267532, + "learning_rate": 2.2087191358024695e-06, + "loss": 0.0177, + "step": 129330 + }, + { + "epoch": 0.9560628012181781, + "grad_norm": 0.06745917350053787, + "learning_rate": 2.2050094966761633e-06, + "loss": 0.0139, + "step": 129340 + }, + { + "epoch": 0.9561367197894799, + "grad_norm": 0.06323149800300598, + "learning_rate": 2.2012998575498576e-06, + "loss": 0.0168, + "step": 129350 + }, + { + "epoch": 0.9562106383607818, + "grad_norm": 0.06741084158420563, + "learning_rate": 2.197590218423552e-06, + "loss": 0.0168, + "step": 129360 + }, + { + "epoch": 0.9562845569320836, + "grad_norm": 0.06664817035198212, + "learning_rate": 2.193880579297246e-06, + "loss": 0.0184, + "step": 129370 + }, + { + "epoch": 0.9563584755033855, + "grad_norm": 0.10696565359830856, + "learning_rate": 2.19017094017094e-06, + "loss": 0.0192, + "step": 129380 + }, + { + "epoch": 0.9564323940746873, + "grad_norm": 0.09820018708705902, + "learning_rate": 2.1864613010446343e-06, + "loss": 0.017, + "step": 129390 + }, + { + "epoch": 0.9565063126459892, + "grad_norm": 0.07728125154972076, + "learning_rate": 2.1827516619183285e-06, + "loss": 0.0156, + "step": 129400 + }, + { + "epoch": 0.9565802312172911, + "grad_norm": 0.046400558203458786, + "learning_rate": 2.1790420227920232e-06, + "loss": 0.0156, + "step": 129410 + }, + { + "epoch": 0.9566541497885929, + "grad_norm": 0.0884847566485405, + "learning_rate": 2.175332383665717e-06, + "loss": 0.0162, + "step": 129420 + }, + { + "epoch": 0.9567280683598948, + "grad_norm": 0.11267104744911194, + "learning_rate": 2.1716227445394113e-06, + "loss": 0.0196, + "step": 129430 + }, + { + "epoch": 0.9568019869311966, + "grad_norm": 0.0674358531832695, + "learning_rate": 2.1679131054131056e-06, + "loss": 0.0162, + "step": 129440 + }, + { + "epoch": 0.9568759055024985, + "grad_norm": 0.041945770382881165, + "learning_rate": 2.1642034662868e-06, + "loss": 0.0159, + "step": 129450 + }, + { + "epoch": 0.9569498240738002, + "grad_norm": 0.10006273537874222, + "learning_rate": 2.1604938271604937e-06, + "loss": 0.0153, + "step": 129460 + }, + { + "epoch": 0.9570237426451021, + "grad_norm": 0.06909745931625366, + "learning_rate": 2.156784188034188e-06, + "loss": 0.0172, + "step": 129470 + }, + { + "epoch": 0.957097661216404, + "grad_norm": 0.08743210136890411, + "learning_rate": 2.1530745489078823e-06, + "loss": 0.014, + "step": 129480 + }, + { + "epoch": 0.9571715797877058, + "grad_norm": 0.10677314549684525, + "learning_rate": 2.1493649097815766e-06, + "loss": 0.0165, + "step": 129490 + }, + { + "epoch": 0.9572454983590077, + "grad_norm": 0.0807448998093605, + "learning_rate": 2.145655270655271e-06, + "loss": 0.017, + "step": 129500 + }, + { + "epoch": 0.9573194169303095, + "grad_norm": 0.08446327596902847, + "learning_rate": 2.141945631528965e-06, + "loss": 0.0156, + "step": 129510 + }, + { + "epoch": 0.9573933355016114, + "grad_norm": 0.08753930777311325, + "learning_rate": 2.1382359924026594e-06, + "loss": 0.0165, + "step": 129520 + }, + { + "epoch": 0.9574672540729133, + "grad_norm": 0.061072368174791336, + "learning_rate": 2.1345263532763537e-06, + "loss": 0.0149, + "step": 129530 + }, + { + "epoch": 0.9575411726442151, + "grad_norm": 0.06465945392847061, + "learning_rate": 
2.1308167141500475e-06, + "loss": 0.0168, + "step": 129540 + }, + { + "epoch": 0.957615091215517, + "grad_norm": 0.10568177700042725, + "learning_rate": 2.1271070750237418e-06, + "loss": 0.0168, + "step": 129550 + }, + { + "epoch": 0.9576890097868188, + "grad_norm": 0.05580895394086838, + "learning_rate": 2.123397435897436e-06, + "loss": 0.0152, + "step": 129560 + }, + { + "epoch": 0.9577629283581207, + "grad_norm": 0.07329615205526352, + "learning_rate": 2.11968779677113e-06, + "loss": 0.015, + "step": 129570 + }, + { + "epoch": 0.9578368469294225, + "grad_norm": 0.0981418713927269, + "learning_rate": 2.115978157644824e-06, + "loss": 0.0178, + "step": 129580 + }, + { + "epoch": 0.9579107655007244, + "grad_norm": 0.06811246275901794, + "learning_rate": 2.1122685185185184e-06, + "loss": 0.0156, + "step": 129590 + }, + { + "epoch": 0.9579846840720263, + "grad_norm": 0.08827519416809082, + "learning_rate": 2.108558879392213e-06, + "loss": 0.0151, + "step": 129600 + }, + { + "epoch": 0.9580586026433281, + "grad_norm": 0.06045457348227501, + "learning_rate": 2.104849240265907e-06, + "loss": 0.0192, + "step": 129610 + }, + { + "epoch": 0.95813252121463, + "grad_norm": 0.07158850133419037, + "learning_rate": 2.1011396011396013e-06, + "loss": 0.0175, + "step": 129620 + }, + { + "epoch": 0.9582064397859318, + "grad_norm": 0.11043514311313629, + "learning_rate": 2.0974299620132955e-06, + "loss": 0.0176, + "step": 129630 + }, + { + "epoch": 0.9582803583572337, + "grad_norm": 0.07115330547094345, + "learning_rate": 2.09372032288699e-06, + "loss": 0.0171, + "step": 129640 + }, + { + "epoch": 0.9583542769285355, + "grad_norm": 0.07935353368520737, + "learning_rate": 2.0900106837606837e-06, + "loss": 0.0169, + "step": 129650 + }, + { + "epoch": 0.9584281954998374, + "grad_norm": 0.08272580802440643, + "learning_rate": 2.086301044634378e-06, + "loss": 0.0187, + "step": 129660 + }, + { + "epoch": 0.9585021140711393, + "grad_norm": 0.0728270560503006, + "learning_rate": 2.082591405508072e-06, + "loss": 0.0185, + "step": 129670 + }, + { + "epoch": 0.9585760326424411, + "grad_norm": 0.09612071514129639, + "learning_rate": 2.0788817663817665e-06, + "loss": 0.0191, + "step": 129680 + }, + { + "epoch": 0.958649951213743, + "grad_norm": 0.08082949370145798, + "learning_rate": 2.0751721272554607e-06, + "loss": 0.0177, + "step": 129690 + }, + { + "epoch": 0.9587238697850448, + "grad_norm": 0.09920556843280792, + "learning_rate": 2.071462488129155e-06, + "loss": 0.0161, + "step": 129700 + }, + { + "epoch": 0.9587977883563467, + "grad_norm": 0.056050848215818405, + "learning_rate": 2.0677528490028493e-06, + "loss": 0.0175, + "step": 129710 + }, + { + "epoch": 0.9588717069276484, + "grad_norm": 0.09262334555387497, + "learning_rate": 2.0640432098765436e-06, + "loss": 0.0189, + "step": 129720 + }, + { + "epoch": 0.9589456254989503, + "grad_norm": 0.0797777995467186, + "learning_rate": 2.0603335707502374e-06, + "loss": 0.0151, + "step": 129730 + }, + { + "epoch": 0.9590195440702523, + "grad_norm": 0.07810277491807938, + "learning_rate": 2.0566239316239317e-06, + "loss": 0.0157, + "step": 129740 + }, + { + "epoch": 0.959093462641554, + "grad_norm": 0.088081493973732, + "learning_rate": 2.052914292497626e-06, + "loss": 0.0176, + "step": 129750 + }, + { + "epoch": 0.9591673812128559, + "grad_norm": 0.07194243371486664, + "learning_rate": 2.0492046533713202e-06, + "loss": 0.0153, + "step": 129760 + }, + { + "epoch": 0.9592412997841577, + "grad_norm": 0.05726609006524086, + "learning_rate": 2.045495014245014e-06, + "loss": 
0.0152, + "step": 129770 + }, + { + "epoch": 0.9593152183554596, + "grad_norm": 0.08413214981555939, + "learning_rate": 2.0417853751187084e-06, + "loss": 0.0187, + "step": 129780 + }, + { + "epoch": 0.9593891369267615, + "grad_norm": 0.07600409537553787, + "learning_rate": 2.0380757359924026e-06, + "loss": 0.0155, + "step": 129790 + }, + { + "epoch": 0.9594630554980633, + "grad_norm": 0.11079154163599014, + "learning_rate": 2.034366096866097e-06, + "loss": 0.0184, + "step": 129800 + }, + { + "epoch": 0.9595369740693652, + "grad_norm": 0.07840550690889359, + "learning_rate": 2.030656457739791e-06, + "loss": 0.0155, + "step": 129810 + }, + { + "epoch": 0.959610892640667, + "grad_norm": 0.07995374500751495, + "learning_rate": 2.0269468186134854e-06, + "loss": 0.017, + "step": 129820 + }, + { + "epoch": 0.9596848112119689, + "grad_norm": 0.06168173998594284, + "learning_rate": 2.0232371794871797e-06, + "loss": 0.0154, + "step": 129830 + }, + { + "epoch": 0.9597587297832707, + "grad_norm": 0.057839542627334595, + "learning_rate": 2.0195275403608736e-06, + "loss": 0.0152, + "step": 129840 + }, + { + "epoch": 0.9598326483545726, + "grad_norm": 0.0621907114982605, + "learning_rate": 2.015817901234568e-06, + "loss": 0.017, + "step": 129850 + }, + { + "epoch": 0.9599065669258745, + "grad_norm": 0.07333408296108246, + "learning_rate": 2.012108262108262e-06, + "loss": 0.0181, + "step": 129860 + }, + { + "epoch": 0.9599804854971763, + "grad_norm": 0.09220781177282333, + "learning_rate": 2.0083986229819564e-06, + "loss": 0.0194, + "step": 129870 + }, + { + "epoch": 0.9600544040684782, + "grad_norm": 0.07996411621570587, + "learning_rate": 2.0046889838556507e-06, + "loss": 0.0159, + "step": 129880 + }, + { + "epoch": 0.96012832263978, + "grad_norm": 0.07438327372074127, + "learning_rate": 2.000979344729345e-06, + "loss": 0.0175, + "step": 129890 + }, + { + "epoch": 0.9602022412110819, + "grad_norm": 0.06726469844579697, + "learning_rate": 1.997269705603039e-06, + "loss": 0.0177, + "step": 129900 + }, + { + "epoch": 0.9602761597823837, + "grad_norm": 0.14511814713478088, + "learning_rate": 1.9935600664767335e-06, + "loss": 0.0167, + "step": 129910 + }, + { + "epoch": 0.9603500783536856, + "grad_norm": 0.07616829872131348, + "learning_rate": 1.9898504273504273e-06, + "loss": 0.0181, + "step": 129920 + }, + { + "epoch": 0.9604239969249875, + "grad_norm": 0.07339037954807281, + "learning_rate": 1.9861407882241216e-06, + "loss": 0.0173, + "step": 129930 + }, + { + "epoch": 0.9604979154962893, + "grad_norm": 0.07685656100511551, + "learning_rate": 1.982431149097816e-06, + "loss": 0.0177, + "step": 129940 + }, + { + "epoch": 0.9605718340675912, + "grad_norm": 0.06435885280370712, + "learning_rate": 1.97872150997151e-06, + "loss": 0.019, + "step": 129950 + }, + { + "epoch": 0.960645752638893, + "grad_norm": 0.07235367596149445, + "learning_rate": 1.975011870845204e-06, + "loss": 0.0158, + "step": 129960 + }, + { + "epoch": 0.9607196712101949, + "grad_norm": 0.10989423841238022, + "learning_rate": 1.9713022317188983e-06, + "loss": 0.0161, + "step": 129970 + }, + { + "epoch": 0.9607935897814966, + "grad_norm": 0.08029764890670776, + "learning_rate": 1.9675925925925925e-06, + "loss": 0.0154, + "step": 129980 + }, + { + "epoch": 0.9608675083527985, + "grad_norm": 0.06860236823558807, + "learning_rate": 1.9638829534662872e-06, + "loss": 0.0179, + "step": 129990 + }, + { + "epoch": 0.9609414269241005, + "grad_norm": 0.07281980663537979, + "learning_rate": 1.960173314339981e-06, + "loss": 0.0179, + "step": 130000 + }, 
+ { + "epoch": 0.9609414269241005, + "eval_f1": 0.63648960197002, + "eval_loss": 0.01656239666044712, + "eval_precision": 0.5083805166236832, + "eval_recall": 0.8509156514550863, + "eval_runtime": 2907.7863, + "eval_samples_per_second": 186.098, + "eval_steps_per_second": 2.908, + "step": 130000 + } + ], + "logging_steps": 10, + "max_steps": 135284, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7620045586432e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}
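
Note (not part of the checkpoint itself): the JSON above is the standard Hugging Face Trainer state, so its "log_history" list mixes training entries (carrying "loss", "grad_norm", "learning_rate", "step") with evaluation entries (carrying "eval_f1", "eval_loss", etc.). The following is a minimal sketch of how one might load this file and summarise the logged curve; the file path and the use of matplotlib are assumptions for illustration, not anything recorded in the checkpoint.

# Minimal sketch: inspect trainer_state.json from this checkpoint.
# Assumes the file lives at ./results/checkpoint-130000/trainer_state.json
# and that matplotlib is available (both are assumptions).
import json

import matplotlib.pyplot as plt

with open("./results/checkpoint-130000/trainer_state.json") as f:
    state = json.load(f)

# Training-loss entries carry "loss"; evaluation entries carry "eval_*" keys.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_f1" in e]

print("best F1:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
print("last eval:", {k: v for k, v in eval_log[-1].items() if k.startswith("eval_")})

# Plot the training loss with evaluation loss overlaid at the eval steps.
plt.plot([e["step"] for e in train_log], [e["loss"] for e in train_log], label="train loss")
plt.scatter([e["step"] for e in eval_log], [e["eval_loss"] for e in eval_log],
            color="red", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")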