diff --git "a/checkpoint-36500/trainer_state.json" "b/checkpoint-36500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-36500/trainer_state.json" @@ -0,0 +1,25584 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9980722166773766, + "eval_steps": 500, + "global_step": 36500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027344444292530865, + "grad_norm": 1896.0, + "learning_rate": 9.000000000000001e-07, + "loss": 6.5599, + "step": 10 + }, + { + "epoch": 0.0005468888858506173, + "grad_norm": 27.0, + "learning_rate": 1.9000000000000002e-06, + "loss": 3.6252, + "step": 20 + }, + { + "epoch": 0.000820333328775926, + "grad_norm": 25.875, + "learning_rate": 2.9e-06, + "loss": 3.3786, + "step": 30 + }, + { + "epoch": 0.0010937777717012346, + "grad_norm": 138.0, + "learning_rate": 3.900000000000001e-06, + "loss": 2.8689, + "step": 40 + }, + { + "epoch": 0.0013672222146265433, + "grad_norm": 8.0, + "learning_rate": 4.9000000000000005e-06, + "loss": 2.112, + "step": 50 + }, + { + "epoch": 0.001640666657551852, + "grad_norm": 9.8125, + "learning_rate": 5.9e-06, + "loss": 1.4067, + "step": 60 + }, + { + "epoch": 0.0019141111004771605, + "grad_norm": 26.75, + "learning_rate": 6.9e-06, + "loss": 1.2991, + "step": 70 + }, + { + "epoch": 0.002187555543402469, + "grad_norm": 5280.0, + "learning_rate": 7.9e-06, + "loss": 2.1783, + "step": 80 + }, + { + "epoch": 0.002460999986327778, + "grad_norm": 4.34375, + "learning_rate": 8.900000000000001e-06, + "loss": 1.1253, + "step": 90 + }, + { + "epoch": 0.0027344444292530866, + "grad_norm": 3.578125, + "learning_rate": 9.9e-06, + "loss": 1.1952, + "step": 100 + }, + { + "epoch": 0.0030078888721783953, + "grad_norm": 3.09375, + "learning_rate": 9.997532285925805e-06, + "loss": 1.0657, + "step": 110 + }, + { + "epoch": 0.003281333315103704, + "grad_norm": 3.9375, + "learning_rate": 9.99479038139892e-06, + "loss": 1.0124, + "step": 120 + }, + { + "epoch": 0.0035547777580290122, + "grad_norm": 12.5625, + "learning_rate": 9.992048476872036e-06, + "loss": 0.9081, + "step": 130 + }, + { + "epoch": 0.003828222200954321, + "grad_norm": 3.359375, + "learning_rate": 9.989306572345152e-06, + "loss": 0.9172, + "step": 140 + }, + { + "epoch": 0.00410166664387963, + "grad_norm": 3.171875, + "learning_rate": 9.986564667818267e-06, + "loss": 0.8917, + "step": 150 + }, + { + "epoch": 0.004375111086804938, + "grad_norm": 3.03125, + "learning_rate": 9.983822763291383e-06, + "loss": 0.8512, + "step": 160 + }, + { + "epoch": 0.004648555529730247, + "grad_norm": 3.0625, + "learning_rate": 9.9810808587645e-06, + "loss": 0.898, + "step": 170 + }, + { + "epoch": 0.004921999972655556, + "grad_norm": 2.71875, + "learning_rate": 9.978338954237614e-06, + "loss": 0.8238, + "step": 180 + }, + { + "epoch": 0.0051954444155808644, + "grad_norm": 2.96875, + "learning_rate": 9.97559704971073e-06, + "loss": 0.8266, + "step": 190 + }, + { + "epoch": 0.005468888858506173, + "grad_norm": 3.15625, + "learning_rate": 9.972855145183845e-06, + "loss": 0.8465, + "step": 200 + }, + { + "epoch": 0.005742333301431482, + "grad_norm": 3.375, + "learning_rate": 9.970113240656962e-06, + "loss": 0.924, + "step": 210 + }, + { + "epoch": 0.0060157777443567905, + "grad_norm": 2.703125, + "learning_rate": 9.967371336130078e-06, + "loss": 0.7541, + "step": 220 + }, + { + "epoch": 0.006289222187282099, + "grad_norm": 2.71875, + "learning_rate": 9.964629431603192e-06, + "loss": 0.8921, + "step": 230 + }, + { + "epoch": 0.006562666630207408, + "grad_norm": 2.84375, + "learning_rate": 9.961887527076309e-06, + "loss": 0.8874, + "step": 240 + }, + { + "epoch": 0.006836111073132717, + "grad_norm": 2.578125, + "learning_rate": 9.959145622549423e-06, + "loss": 0.792, + "step": 250 + }, + { + "epoch": 0.0071095555160580245, + "grad_norm": 3.015625, + "learning_rate": 9.95640371802254e-06, + "loss": 0.8166, + "step": 260 + }, + { + "epoch": 0.007382999958983333, + "grad_norm": 2.828125, + "learning_rate": 9.953661813495656e-06, + "loss": 0.8695, + "step": 270 + }, + { + "epoch": 0.007656444401908642, + "grad_norm": 2.375, + "learning_rate": 9.95091990896877e-06, + "loss": 0.7685, + "step": 280 + }, + { + "epoch": 0.00792988884483395, + "grad_norm": 2.671875, + "learning_rate": 9.948178004441887e-06, + "loss": 0.8273, + "step": 290 + }, + { + "epoch": 0.00820333328775926, + "grad_norm": 2.6875, + "learning_rate": 9.945436099915002e-06, + "loss": 0.8486, + "step": 300 + }, + { + "epoch": 0.008476777730684568, + "grad_norm": 2.453125, + "learning_rate": 9.942694195388118e-06, + "loss": 0.7712, + "step": 310 + }, + { + "epoch": 0.008750222173609877, + "grad_norm": 2.484375, + "learning_rate": 9.939952290861233e-06, + "loss": 0.8415, + "step": 320 + }, + { + "epoch": 0.009023666616535185, + "grad_norm": 2.8125, + "learning_rate": 9.937210386334349e-06, + "loss": 0.8591, + "step": 330 + }, + { + "epoch": 0.009297111059460494, + "grad_norm": 2.515625, + "learning_rate": 9.934468481807464e-06, + "loss": 0.8108, + "step": 340 + }, + { + "epoch": 0.009570555502385803, + "grad_norm": 2.484375, + "learning_rate": 9.93172657728058e-06, + "loss": 0.8237, + "step": 350 + }, + { + "epoch": 0.009843999945311111, + "grad_norm": 2.3125, + "learning_rate": 9.928984672753695e-06, + "loss": 0.7216, + "step": 360 + }, + { + "epoch": 0.01011744438823642, + "grad_norm": 2.328125, + "learning_rate": 9.926242768226811e-06, + "loss": 0.7826, + "step": 370 + }, + { + "epoch": 0.010390888831161729, + "grad_norm": 2.5625, + "learning_rate": 9.923500863699926e-06, + "loss": 0.8139, + "step": 380 + }, + { + "epoch": 0.010664333274087038, + "grad_norm": 2.5, + "learning_rate": 9.920758959173042e-06, + "loss": 0.8307, + "step": 390 + }, + { + "epoch": 0.010937777717012346, + "grad_norm": 2.5625, + "learning_rate": 9.918017054646158e-06, + "loss": 0.8096, + "step": 400 + }, + { + "epoch": 0.011211222159937655, + "grad_norm": 2.96875, + "learning_rate": 9.915275150119273e-06, + "loss": 0.7875, + "step": 410 + }, + { + "epoch": 0.011484666602862964, + "grad_norm": 2.796875, + "learning_rate": 9.91253324559239e-06, + "loss": 0.8298, + "step": 420 + }, + { + "epoch": 0.011758111045788272, + "grad_norm": 2.515625, + "learning_rate": 9.909791341065504e-06, + "loss": 0.8296, + "step": 430 + }, + { + "epoch": 0.012031555488713581, + "grad_norm": 2.5, + "learning_rate": 9.90704943653862e-06, + "loss": 0.7895, + "step": 440 + }, + { + "epoch": 0.01230499993163889, + "grad_norm": 2.640625, + "learning_rate": 9.904307532011736e-06, + "loss": 0.724, + "step": 450 + }, + { + "epoch": 0.012578444374564198, + "grad_norm": 2.34375, + "learning_rate": 9.901565627484851e-06, + "loss": 0.7024, + "step": 460 + }, + { + "epoch": 0.012851888817489507, + "grad_norm": 2.703125, + "learning_rate": 9.898823722957967e-06, + "loss": 0.7299, + "step": 470 + }, + { + "epoch": 0.013125333260414816, + "grad_norm": 2.625, + "learning_rate": 9.896081818431082e-06, + "loss": 0.8051, + "step": 480 + }, + { + "epoch": 0.013398777703340125, + "grad_norm": 2.859375, + "learning_rate": 9.893339913904198e-06, + "loss": 0.7479, + "step": 490 + }, + { + "epoch": 0.013672222146265433, + "grad_norm": 2.859375, + "learning_rate": 9.890598009377315e-06, + "loss": 0.7858, + "step": 500 + }, + { + "epoch": 0.013945666589190742, + "grad_norm": 2.203125, + "learning_rate": 9.88785610485043e-06, + "loss": 0.7261, + "step": 510 + }, + { + "epoch": 0.014219111032116049, + "grad_norm": 2.984375, + "learning_rate": 9.885114200323546e-06, + "loss": 0.8463, + "step": 520 + }, + { + "epoch": 0.014492555475041358, + "grad_norm": 2.359375, + "learning_rate": 9.882372295796662e-06, + "loss": 0.7567, + "step": 530 + }, + { + "epoch": 0.014765999917966666, + "grad_norm": 2.484375, + "learning_rate": 9.879630391269777e-06, + "loss": 0.7418, + "step": 540 + }, + { + "epoch": 0.015039444360891975, + "grad_norm": 2.15625, + "learning_rate": 9.876888486742893e-06, + "loss": 0.7041, + "step": 550 + }, + { + "epoch": 0.015312888803817284, + "grad_norm": 2.609375, + "learning_rate": 9.874146582216008e-06, + "loss": 0.7952, + "step": 560 + }, + { + "epoch": 0.015586333246742592, + "grad_norm": 2.5625, + "learning_rate": 9.871404677689124e-06, + "loss": 0.6998, + "step": 570 + }, + { + "epoch": 0.0158597776896679, + "grad_norm": 2.953125, + "learning_rate": 9.86866277316224e-06, + "loss": 0.8426, + "step": 580 + }, + { + "epoch": 0.01613322213259321, + "grad_norm": 2.671875, + "learning_rate": 9.865920868635355e-06, + "loss": 0.7063, + "step": 590 + }, + { + "epoch": 0.01640666657551852, + "grad_norm": 2.59375, + "learning_rate": 9.863178964108471e-06, + "loss": 0.8102, + "step": 600 + }, + { + "epoch": 0.016680111018443827, + "grad_norm": 2.546875, + "learning_rate": 9.860437059581586e-06, + "loss": 0.8037, + "step": 610 + }, + { + "epoch": 0.016953555461369136, + "grad_norm": 2.5, + "learning_rate": 9.857695155054702e-06, + "loss": 0.7553, + "step": 620 + }, + { + "epoch": 0.017226999904294445, + "grad_norm": 2.4375, + "learning_rate": 9.854953250527819e-06, + "loss": 0.8293, + "step": 630 + }, + { + "epoch": 0.017500444347219753, + "grad_norm": 2.640625, + "learning_rate": 9.852211346000933e-06, + "loss": 0.7451, + "step": 640 + }, + { + "epoch": 0.017773888790145062, + "grad_norm": 2.28125, + "learning_rate": 9.84946944147405e-06, + "loss": 0.7486, + "step": 650 + }, + { + "epoch": 0.01804733323307037, + "grad_norm": 2.671875, + "learning_rate": 9.846727536947164e-06, + "loss": 0.6981, + "step": 660 + }, + { + "epoch": 0.01832077767599568, + "grad_norm": 2.890625, + "learning_rate": 9.84398563242028e-06, + "loss": 0.7658, + "step": 670 + }, + { + "epoch": 0.018594222118920988, + "grad_norm": 2.59375, + "learning_rate": 9.841243727893395e-06, + "loss": 0.7807, + "step": 680 + }, + { + "epoch": 0.018867666561846297, + "grad_norm": 2.515625, + "learning_rate": 9.838501823366511e-06, + "loss": 0.7659, + "step": 690 + }, + { + "epoch": 0.019141111004771606, + "grad_norm": 2.5625, + "learning_rate": 9.835759918839626e-06, + "loss": 0.7199, + "step": 700 + }, + { + "epoch": 0.019414555447696914, + "grad_norm": 2.609375, + "learning_rate": 9.833018014312742e-06, + "loss": 0.7504, + "step": 710 + }, + { + "epoch": 0.019687999890622223, + "grad_norm": 2.546875, + "learning_rate": 9.830276109785859e-06, + "loss": 0.7001, + "step": 720 + }, + { + "epoch": 0.01996144433354753, + "grad_norm": 2.46875, + "learning_rate": 9.827534205258973e-06, + "loss": 0.7776, + "step": 730 + }, + { + "epoch": 0.02023488877647284, + "grad_norm": 2.484375, + "learning_rate": 9.82479230073209e-06, + "loss": 0.6961, + "step": 740 + }, + { + "epoch": 0.02050833321939815, + "grad_norm": 2.421875, + "learning_rate": 9.822050396205204e-06, + "loss": 0.7371, + "step": 750 + }, + { + "epoch": 0.020781777662323458, + "grad_norm": 2.15625, + "learning_rate": 9.81930849167832e-06, + "loss": 0.7172, + "step": 760 + }, + { + "epoch": 0.021055222105248766, + "grad_norm": 2.265625, + "learning_rate": 9.816566587151435e-06, + "loss": 0.7591, + "step": 770 + }, + { + "epoch": 0.021328666548174075, + "grad_norm": 2.4375, + "learning_rate": 9.813824682624552e-06, + "loss": 0.7655, + "step": 780 + }, + { + "epoch": 0.021602110991099384, + "grad_norm": 2.34375, + "learning_rate": 9.811082778097666e-06, + "loss": 0.771, + "step": 790 + }, + { + "epoch": 0.021875555434024693, + "grad_norm": 2.390625, + "learning_rate": 9.808340873570783e-06, + "loss": 0.7777, + "step": 800 + }, + { + "epoch": 0.02214899987695, + "grad_norm": 2.234375, + "learning_rate": 9.805598969043899e-06, + "loss": 0.7443, + "step": 810 + }, + { + "epoch": 0.02242244431987531, + "grad_norm": 2.6875, + "learning_rate": 9.802857064517014e-06, + "loss": 0.7241, + "step": 820 + }, + { + "epoch": 0.02269588876280062, + "grad_norm": 2.515625, + "learning_rate": 9.80011515999013e-06, + "loss": 0.7555, + "step": 830 + }, + { + "epoch": 0.022969333205725927, + "grad_norm": 2.515625, + "learning_rate": 9.797373255463245e-06, + "loss": 0.7858, + "step": 840 + }, + { + "epoch": 0.023242777648651236, + "grad_norm": 2.4375, + "learning_rate": 9.79463135093636e-06, + "loss": 0.7149, + "step": 850 + }, + { + "epoch": 0.023516222091576545, + "grad_norm": 2.390625, + "learning_rate": 9.791889446409477e-06, + "loss": 0.7042, + "step": 860 + }, + { + "epoch": 0.023789666534501853, + "grad_norm": 2.59375, + "learning_rate": 9.789147541882592e-06, + "loss": 0.8091, + "step": 870 + }, + { + "epoch": 0.024063110977427162, + "grad_norm": 2.28125, + "learning_rate": 9.786405637355708e-06, + "loss": 0.7842, + "step": 880 + }, + { + "epoch": 0.02433655542035247, + "grad_norm": 2.859375, + "learning_rate": 9.783663732828824e-06, + "loss": 0.7569, + "step": 890 + }, + { + "epoch": 0.02460999986327778, + "grad_norm": 2.53125, + "learning_rate": 9.780921828301939e-06, + "loss": 0.7228, + "step": 900 + }, + { + "epoch": 0.02488344430620309, + "grad_norm": 2.796875, + "learning_rate": 9.778179923775055e-06, + "loss": 0.8507, + "step": 910 + }, + { + "epoch": 0.025156888749128397, + "grad_norm": 2.296875, + "learning_rate": 9.77543801924817e-06, + "loss": 0.814, + "step": 920 + }, + { + "epoch": 0.025430333192053706, + "grad_norm": 2.515625, + "learning_rate": 9.772696114721286e-06, + "loss": 0.7462, + "step": 930 + }, + { + "epoch": 0.025703777634979014, + "grad_norm": 2.421875, + "learning_rate": 9.769954210194403e-06, + "loss": 0.7244, + "step": 940 + }, + { + "epoch": 0.025977222077904323, + "grad_norm": 2.3125, + "learning_rate": 9.767212305667517e-06, + "loss": 0.711, + "step": 950 + }, + { + "epoch": 0.026250666520829632, + "grad_norm": 2.5, + "learning_rate": 9.764470401140634e-06, + "loss": 0.8858, + "step": 960 + }, + { + "epoch": 0.02652411096375494, + "grad_norm": 2.640625, + "learning_rate": 9.761728496613748e-06, + "loss": 0.7479, + "step": 970 + }, + { + "epoch": 0.02679755540668025, + "grad_norm": 2.578125, + "learning_rate": 9.758986592086865e-06, + "loss": 0.7941, + "step": 980 + }, + { + "epoch": 0.027070999849605558, + "grad_norm": 2.765625, + "learning_rate": 9.756244687559981e-06, + "loss": 0.8549, + "step": 990 + }, + { + "epoch": 0.027344444292530867, + "grad_norm": 2.609375, + "learning_rate": 9.753502783033096e-06, + "loss": 0.7651, + "step": 1000 + }, + { + "epoch": 0.027617888735456175, + "grad_norm": 2.21875, + "learning_rate": 9.750760878506212e-06, + "loss": 0.8144, + "step": 1010 + }, + { + "epoch": 0.027891333178381484, + "grad_norm": 2.328125, + "learning_rate": 9.748018973979327e-06, + "loss": 0.7076, + "step": 1020 + }, + { + "epoch": 0.028164777621306793, + "grad_norm": 2.734375, + "learning_rate": 9.745277069452443e-06, + "loss": 0.6917, + "step": 1030 + }, + { + "epoch": 0.028438222064232098, + "grad_norm": 2.5625, + "learning_rate": 9.74253516492556e-06, + "loss": 0.7972, + "step": 1040 + }, + { + "epoch": 0.028711666507157407, + "grad_norm": 2.59375, + "learning_rate": 9.739793260398674e-06, + "loss": 0.7161, + "step": 1050 + }, + { + "epoch": 0.028985110950082715, + "grad_norm": 2.59375, + "learning_rate": 9.73705135587179e-06, + "loss": 0.752, + "step": 1060 + }, + { + "epoch": 0.029258555393008024, + "grad_norm": 2.5, + "learning_rate": 9.734309451344905e-06, + "loss": 0.8185, + "step": 1070 + }, + { + "epoch": 0.029531999835933333, + "grad_norm": 2.5625, + "learning_rate": 9.731567546818021e-06, + "loss": 0.7669, + "step": 1080 + }, + { + "epoch": 0.02980544427885864, + "grad_norm": 2.71875, + "learning_rate": 9.728825642291136e-06, + "loss": 0.7679, + "step": 1090 + }, + { + "epoch": 0.03007888872178395, + "grad_norm": 2.65625, + "learning_rate": 9.726083737764252e-06, + "loss": 0.7009, + "step": 1100 + }, + { + "epoch": 0.03035233316470926, + "grad_norm": 2.640625, + "learning_rate": 9.723341833237367e-06, + "loss": 0.7807, + "step": 1110 + }, + { + "epoch": 0.030625777607634568, + "grad_norm": 2.78125, + "learning_rate": 9.720599928710483e-06, + "loss": 0.7801, + "step": 1120 + }, + { + "epoch": 0.030899222050559876, + "grad_norm": 2.171875, + "learning_rate": 9.717858024183598e-06, + "loss": 0.7226, + "step": 1130 + }, + { + "epoch": 0.031172666493485185, + "grad_norm": 2.78125, + "learning_rate": 9.715116119656714e-06, + "loss": 0.7538, + "step": 1140 + }, + { + "epoch": 0.0314461109364105, + "grad_norm": 2.796875, + "learning_rate": 9.712374215129829e-06, + "loss": 0.8462, + "step": 1150 + }, + { + "epoch": 0.0317195553793358, + "grad_norm": 2.359375, + "learning_rate": 9.709632310602945e-06, + "loss": 0.6615, + "step": 1160 + }, + { + "epoch": 0.031992999822261114, + "grad_norm": 2.78125, + "learning_rate": 9.706890406076061e-06, + "loss": 0.8396, + "step": 1170 + }, + { + "epoch": 0.03226644426518642, + "grad_norm": 2.484375, + "learning_rate": 9.704148501549176e-06, + "loss": 0.7388, + "step": 1180 + }, + { + "epoch": 0.03253988870811173, + "grad_norm": 2.84375, + "learning_rate": 9.701406597022292e-06, + "loss": 0.802, + "step": 1190 + }, + { + "epoch": 0.03281333315103704, + "grad_norm": 2.265625, + "learning_rate": 9.698664692495409e-06, + "loss": 0.6973, + "step": 1200 + }, + { + "epoch": 0.03308677759396235, + "grad_norm": 2.6875, + "learning_rate": 9.695922787968523e-06, + "loss": 0.8428, + "step": 1210 + }, + { + "epoch": 0.033360222036887655, + "grad_norm": 2.203125, + "learning_rate": 9.69318088344164e-06, + "loss": 0.7894, + "step": 1220 + }, + { + "epoch": 0.03363366647981297, + "grad_norm": 2.546875, + "learning_rate": 9.690438978914754e-06, + "loss": 0.7225, + "step": 1230 + }, + { + "epoch": 0.03390711092273827, + "grad_norm": 2.828125, + "learning_rate": 9.68769707438787e-06, + "loss": 0.8022, + "step": 1240 + }, + { + "epoch": 0.034180555365663584, + "grad_norm": 2.359375, + "learning_rate": 9.684955169860987e-06, + "loss": 0.7958, + "step": 1250 + }, + { + "epoch": 0.03445399980858889, + "grad_norm": 2.6875, + "learning_rate": 9.682213265334102e-06, + "loss": 0.7053, + "step": 1260 + }, + { + "epoch": 0.0347274442515142, + "grad_norm": 2.875, + "learning_rate": 9.679471360807218e-06, + "loss": 0.8154, + "step": 1270 + }, + { + "epoch": 0.03500088869443951, + "grad_norm": 2.359375, + "learning_rate": 9.676729456280332e-06, + "loss": 0.7323, + "step": 1280 + }, + { + "epoch": 0.03527433313736482, + "grad_norm": 2.5625, + "learning_rate": 9.673987551753449e-06, + "loss": 0.7683, + "step": 1290 + }, + { + "epoch": 0.035547777580290124, + "grad_norm": 2.78125, + "learning_rate": 9.671245647226565e-06, + "loss": 0.7345, + "step": 1300 + }, + { + "epoch": 0.035821222023215436, + "grad_norm": 2.28125, + "learning_rate": 9.66850374269968e-06, + "loss": 0.7208, + "step": 1310 + }, + { + "epoch": 0.03609466646614074, + "grad_norm": 2.953125, + "learning_rate": 9.665761838172796e-06, + "loss": 0.827, + "step": 1320 + }, + { + "epoch": 0.036368110909066054, + "grad_norm": 2.609375, + "learning_rate": 9.66301993364591e-06, + "loss": 0.8135, + "step": 1330 + }, + { + "epoch": 0.03664155535199136, + "grad_norm": 2.640625, + "learning_rate": 9.660278029119027e-06, + "loss": 0.6895, + "step": 1340 + }, + { + "epoch": 0.03691499979491667, + "grad_norm": 2.859375, + "learning_rate": 9.657536124592143e-06, + "loss": 0.7645, + "step": 1350 + }, + { + "epoch": 0.037188444237841976, + "grad_norm": 2.4375, + "learning_rate": 9.654794220065258e-06, + "loss": 0.7629, + "step": 1360 + }, + { + "epoch": 0.03746188868076729, + "grad_norm": 2.171875, + "learning_rate": 9.652052315538374e-06, + "loss": 0.7334, + "step": 1370 + }, + { + "epoch": 0.037735333123692594, + "grad_norm": 2.828125, + "learning_rate": 9.64931041101149e-06, + "loss": 0.769, + "step": 1380 + }, + { + "epoch": 0.0380087775666179, + "grad_norm": 2.375, + "learning_rate": 9.646568506484605e-06, + "loss": 0.7722, + "step": 1390 + }, + { + "epoch": 0.03828222200954321, + "grad_norm": 2.421875, + "learning_rate": 9.643826601957722e-06, + "loss": 0.7187, + "step": 1400 + }, + { + "epoch": 0.038555666452468516, + "grad_norm": 2.40625, + "learning_rate": 9.641084697430836e-06, + "loss": 0.7417, + "step": 1410 + }, + { + "epoch": 0.03882911089539383, + "grad_norm": 2.53125, + "learning_rate": 9.638342792903953e-06, + "loss": 0.7457, + "step": 1420 + }, + { + "epoch": 0.039102555338319134, + "grad_norm": 2.078125, + "learning_rate": 9.635600888377067e-06, + "loss": 0.7441, + "step": 1430 + }, + { + "epoch": 0.039375999781244446, + "grad_norm": 2.390625, + "learning_rate": 9.632858983850184e-06, + "loss": 0.6988, + "step": 1440 + }, + { + "epoch": 0.03964944422416975, + "grad_norm": 2.78125, + "learning_rate": 9.630117079323298e-06, + "loss": 0.7586, + "step": 1450 + }, + { + "epoch": 0.03992288866709506, + "grad_norm": 2.609375, + "learning_rate": 9.627375174796415e-06, + "loss": 0.7791, + "step": 1460 + }, + { + "epoch": 0.04019633311002037, + "grad_norm": 2.6875, + "learning_rate": 9.62463327026953e-06, + "loss": 0.784, + "step": 1470 + }, + { + "epoch": 0.04046977755294568, + "grad_norm": 2.546875, + "learning_rate": 9.621891365742646e-06, + "loss": 0.6897, + "step": 1480 + }, + { + "epoch": 0.040743221995870986, + "grad_norm": 2.84375, + "learning_rate": 9.61914946121576e-06, + "loss": 0.7845, + "step": 1490 + }, + { + "epoch": 0.0410166664387963, + "grad_norm": 2.6875, + "learning_rate": 9.616407556688876e-06, + "loss": 0.7592, + "step": 1500 + }, + { + "epoch": 0.0412901108817216, + "grad_norm": 2.390625, + "learning_rate": 9.613665652161993e-06, + "loss": 0.6765, + "step": 1510 + }, + { + "epoch": 0.041563555324646916, + "grad_norm": 2.609375, + "learning_rate": 9.610923747635107e-06, + "loss": 0.7223, + "step": 1520 + }, + { + "epoch": 0.04183699976757222, + "grad_norm": 2.828125, + "learning_rate": 9.608181843108224e-06, + "loss": 0.6825, + "step": 1530 + }, + { + "epoch": 0.04211044421049753, + "grad_norm": 2.28125, + "learning_rate": 9.605439938581338e-06, + "loss": 0.7194, + "step": 1540 + }, + { + "epoch": 0.04238388865342284, + "grad_norm": 2.03125, + "learning_rate": 9.602698034054455e-06, + "loss": 0.6613, + "step": 1550 + }, + { + "epoch": 0.04265733309634815, + "grad_norm": 2.59375, + "learning_rate": 9.599956129527571e-06, + "loss": 0.8118, + "step": 1560 + }, + { + "epoch": 0.042930777539273456, + "grad_norm": 2.359375, + "learning_rate": 9.597214225000686e-06, + "loss": 0.7024, + "step": 1570 + }, + { + "epoch": 0.04320422198219877, + "grad_norm": 2.609375, + "learning_rate": 9.594472320473802e-06, + "loss": 0.7343, + "step": 1580 + }, + { + "epoch": 0.04347766642512407, + "grad_norm": 2.5625, + "learning_rate": 9.591730415946917e-06, + "loss": 0.6862, + "step": 1590 + }, + { + "epoch": 0.043751110868049385, + "grad_norm": 2.671875, + "learning_rate": 9.588988511420033e-06, + "loss": 0.6886, + "step": 1600 + }, + { + "epoch": 0.04402455531097469, + "grad_norm": 2.609375, + "learning_rate": 9.58624660689315e-06, + "loss": 0.7629, + "step": 1610 + }, + { + "epoch": 0.0442979997539, + "grad_norm": 2.515625, + "learning_rate": 9.583504702366264e-06, + "loss": 0.7699, + "step": 1620 + }, + { + "epoch": 0.04457144419682531, + "grad_norm": 2.421875, + "learning_rate": 9.58076279783938e-06, + "loss": 0.7388, + "step": 1630 + }, + { + "epoch": 0.04484488863975062, + "grad_norm": 2.609375, + "learning_rate": 9.578020893312495e-06, + "loss": 0.7558, + "step": 1640 + }, + { + "epoch": 0.045118333082675925, + "grad_norm": 2.609375, + "learning_rate": 9.575278988785611e-06, + "loss": 0.8286, + "step": 1650 + }, + { + "epoch": 0.04539177752560124, + "grad_norm": 2.78125, + "learning_rate": 9.572537084258728e-06, + "loss": 0.7431, + "step": 1660 + }, + { + "epoch": 0.04566522196852654, + "grad_norm": 2.8125, + "learning_rate": 9.569795179731842e-06, + "loss": 0.7432, + "step": 1670 + }, + { + "epoch": 0.045938666411451855, + "grad_norm": 2.375, + "learning_rate": 9.567053275204959e-06, + "loss": 0.7249, + "step": 1680 + }, + { + "epoch": 0.04621211085437716, + "grad_norm": 2.59375, + "learning_rate": 9.564311370678073e-06, + "loss": 0.7563, + "step": 1690 + }, + { + "epoch": 0.04648555529730247, + "grad_norm": 2.71875, + "learning_rate": 9.56156946615119e-06, + "loss": 0.7134, + "step": 1700 + }, + { + "epoch": 0.04675899974022778, + "grad_norm": 2.421875, + "learning_rate": 9.558827561624306e-06, + "loss": 0.7157, + "step": 1710 + }, + { + "epoch": 0.04703244418315309, + "grad_norm": 2.359375, + "learning_rate": 9.55608565709742e-06, + "loss": 0.711, + "step": 1720 + }, + { + "epoch": 0.047305888626078395, + "grad_norm": 2.625, + "learning_rate": 9.553343752570537e-06, + "loss": 0.7702, + "step": 1730 + }, + { + "epoch": 0.04757933306900371, + "grad_norm": 2.703125, + "learning_rate": 9.550601848043653e-06, + "loss": 0.786, + "step": 1740 + }, + { + "epoch": 0.04785277751192901, + "grad_norm": 2.625, + "learning_rate": 9.547859943516768e-06, + "loss": 0.6589, + "step": 1750 + }, + { + "epoch": 0.048126221954854324, + "grad_norm": 2.265625, + "learning_rate": 9.545118038989884e-06, + "loss": 0.6113, + "step": 1760 + }, + { + "epoch": 0.04839966639777963, + "grad_norm": 2.546875, + "learning_rate": 9.542376134462999e-06, + "loss": 0.7724, + "step": 1770 + }, + { + "epoch": 0.04867311084070494, + "grad_norm": 2.6875, + "learning_rate": 9.539634229936115e-06, + "loss": 0.7697, + "step": 1780 + }, + { + "epoch": 0.04894655528363025, + "grad_norm": 2.6875, + "learning_rate": 9.53689232540923e-06, + "loss": 0.7129, + "step": 1790 + }, + { + "epoch": 0.04921999972655556, + "grad_norm": 2.828125, + "learning_rate": 9.534150420882346e-06, + "loss": 0.7778, + "step": 1800 + }, + { + "epoch": 0.049493444169480864, + "grad_norm": 2.609375, + "learning_rate": 9.53140851635546e-06, + "loss": 0.6428, + "step": 1810 + }, + { + "epoch": 0.04976688861240618, + "grad_norm": 2.5625, + "learning_rate": 9.528666611828577e-06, + "loss": 0.7865, + "step": 1820 + }, + { + "epoch": 0.05004033305533148, + "grad_norm": 2.640625, + "learning_rate": 9.525924707301693e-06, + "loss": 0.8194, + "step": 1830 + }, + { + "epoch": 0.050313777498256794, + "grad_norm": 2.515625, + "learning_rate": 9.523182802774808e-06, + "loss": 0.7181, + "step": 1840 + }, + { + "epoch": 0.0505872219411821, + "grad_norm": 2.59375, + "learning_rate": 9.520440898247924e-06, + "loss": 0.7142, + "step": 1850 + }, + { + "epoch": 0.05086066638410741, + "grad_norm": 2.65625, + "learning_rate": 9.517698993721039e-06, + "loss": 0.7482, + "step": 1860 + }, + { + "epoch": 0.05113411082703272, + "grad_norm": 2.8125, + "learning_rate": 9.514957089194155e-06, + "loss": 0.7046, + "step": 1870 + }, + { + "epoch": 0.05140755526995803, + "grad_norm": 3.0, + "learning_rate": 9.51221518466727e-06, + "loss": 0.7585, + "step": 1880 + }, + { + "epoch": 0.051680999712883334, + "grad_norm": 2.703125, + "learning_rate": 9.509473280140386e-06, + "loss": 0.7295, + "step": 1890 + }, + { + "epoch": 0.051954444155808646, + "grad_norm": 2.3125, + "learning_rate": 9.5067313756135e-06, + "loss": 0.7939, + "step": 1900 + }, + { + "epoch": 0.05222788859873395, + "grad_norm": 2.578125, + "learning_rate": 9.503989471086617e-06, + "loss": 0.7067, + "step": 1910 + }, + { + "epoch": 0.052501333041659264, + "grad_norm": 2.5, + "learning_rate": 9.501247566559733e-06, + "loss": 0.7374, + "step": 1920 + }, + { + "epoch": 0.05277477748458457, + "grad_norm": 3.015625, + "learning_rate": 9.498505662032848e-06, + "loss": 0.798, + "step": 1930 + }, + { + "epoch": 0.05304822192750988, + "grad_norm": 3.046875, + "learning_rate": 9.495763757505964e-06, + "loss": 0.7785, + "step": 1940 + }, + { + "epoch": 0.053321666370435186, + "grad_norm": 2.484375, + "learning_rate": 9.493021852979079e-06, + "loss": 0.7982, + "step": 1950 + }, + { + "epoch": 0.0535951108133605, + "grad_norm": 2.421875, + "learning_rate": 9.490279948452195e-06, + "loss": 0.7191, + "step": 1960 + }, + { + "epoch": 0.053868555256285804, + "grad_norm": 2.34375, + "learning_rate": 9.487538043925312e-06, + "loss": 0.6877, + "step": 1970 + }, + { + "epoch": 0.054141999699211116, + "grad_norm": 2.421875, + "learning_rate": 9.484796139398426e-06, + "loss": 0.7858, + "step": 1980 + }, + { + "epoch": 0.05441544414213642, + "grad_norm": 2.6875, + "learning_rate": 9.482054234871543e-06, + "loss": 0.7308, + "step": 1990 + }, + { + "epoch": 0.05468888858506173, + "grad_norm": 2.4375, + "learning_rate": 9.479312330344657e-06, + "loss": 0.6782, + "step": 2000 + }, + { + "epoch": 0.05496233302798704, + "grad_norm": 2.78125, + "learning_rate": 9.476570425817774e-06, + "loss": 0.7048, + "step": 2010 + }, + { + "epoch": 0.05523577747091235, + "grad_norm": 2.375, + "learning_rate": 9.47382852129089e-06, + "loss": 0.7187, + "step": 2020 + }, + { + "epoch": 0.055509221913837656, + "grad_norm": 2.234375, + "learning_rate": 9.471086616764005e-06, + "loss": 0.7757, + "step": 2030 + }, + { + "epoch": 0.05578266635676297, + "grad_norm": 2.75, + "learning_rate": 9.468344712237121e-06, + "loss": 0.6249, + "step": 2040 + }, + { + "epoch": 0.05605611079968827, + "grad_norm": 2.90625, + "learning_rate": 9.465602807710236e-06, + "loss": 0.6795, + "step": 2050 + }, + { + "epoch": 0.056329555242613585, + "grad_norm": 2.65625, + "learning_rate": 9.462860903183352e-06, + "loss": 0.7831, + "step": 2060 + }, + { + "epoch": 0.05660299968553889, + "grad_norm": 2.546875, + "learning_rate": 9.460118998656468e-06, + "loss": 0.7694, + "step": 2070 + }, + { + "epoch": 0.056876444128464196, + "grad_norm": 2.5, + "learning_rate": 9.457377094129583e-06, + "loss": 0.7548, + "step": 2080 + }, + { + "epoch": 0.05714988857138951, + "grad_norm": 2.46875, + "learning_rate": 9.4546351896027e-06, + "loss": 0.7172, + "step": 2090 + }, + { + "epoch": 0.05742333301431481, + "grad_norm": 2.796875, + "learning_rate": 9.451893285075816e-06, + "loss": 0.6696, + "step": 2100 + }, + { + "epoch": 0.057696777457240125, + "grad_norm": 2.375, + "learning_rate": 9.44915138054893e-06, + "loss": 0.6847, + "step": 2110 + }, + { + "epoch": 0.05797022190016543, + "grad_norm": 2.453125, + "learning_rate": 9.446409476022047e-06, + "loss": 0.6531, + "step": 2120 + }, + { + "epoch": 0.05824366634309074, + "grad_norm": 2.46875, + "learning_rate": 9.443667571495161e-06, + "loss": 0.7827, + "step": 2130 + }, + { + "epoch": 0.05851711078601605, + "grad_norm": 2.828125, + "learning_rate": 9.440925666968277e-06, + "loss": 0.7604, + "step": 2140 + }, + { + "epoch": 0.05879055522894136, + "grad_norm": 2.734375, + "learning_rate": 9.438183762441394e-06, + "loss": 0.7345, + "step": 2150 + }, + { + "epoch": 0.059063999671866665, + "grad_norm": 2.640625, + "learning_rate": 9.435441857914508e-06, + "loss": 0.7002, + "step": 2160 + }, + { + "epoch": 0.05933744411479198, + "grad_norm": 2.796875, + "learning_rate": 9.432699953387625e-06, + "loss": 0.7382, + "step": 2170 + }, + { + "epoch": 0.05961088855771728, + "grad_norm": 2.6875, + "learning_rate": 9.42995804886074e-06, + "loss": 0.7264, + "step": 2180 + }, + { + "epoch": 0.059884333000642595, + "grad_norm": 2.859375, + "learning_rate": 9.427216144333856e-06, + "loss": 0.7551, + "step": 2190 + }, + { + "epoch": 0.0601577774435679, + "grad_norm": 2.609375, + "learning_rate": 9.42447423980697e-06, + "loss": 0.6975, + "step": 2200 + }, + { + "epoch": 0.06043122188649321, + "grad_norm": 2.203125, + "learning_rate": 9.421732335280087e-06, + "loss": 0.6908, + "step": 2210 + }, + { + "epoch": 0.06070466632941852, + "grad_norm": 2.328125, + "learning_rate": 9.418990430753201e-06, + "loss": 0.7643, + "step": 2220 + }, + { + "epoch": 0.06097811077234383, + "grad_norm": 2.6875, + "learning_rate": 9.416248526226318e-06, + "loss": 0.7596, + "step": 2230 + }, + { + "epoch": 0.061251555215269135, + "grad_norm": 2.46875, + "learning_rate": 9.413506621699432e-06, + "loss": 0.758, + "step": 2240 + }, + { + "epoch": 0.06152499965819445, + "grad_norm": 2.703125, + "learning_rate": 9.410764717172549e-06, + "loss": 0.6851, + "step": 2250 + }, + { + "epoch": 0.06179844410111975, + "grad_norm": 2.828125, + "learning_rate": 9.408022812645663e-06, + "loss": 0.7515, + "step": 2260 + }, + { + "epoch": 0.062071888544045065, + "grad_norm": 2.609375, + "learning_rate": 9.40528090811878e-06, + "loss": 0.7356, + "step": 2270 + }, + { + "epoch": 0.06234533298697037, + "grad_norm": 2.484375, + "learning_rate": 9.402539003591896e-06, + "loss": 0.6987, + "step": 2280 + }, + { + "epoch": 0.06261877742989568, + "grad_norm": 2.953125, + "learning_rate": 9.39979709906501e-06, + "loss": 0.7255, + "step": 2290 + }, + { + "epoch": 0.062892221872821, + "grad_norm": 1.8828125, + "learning_rate": 9.397055194538127e-06, + "loss": 0.6714, + "step": 2300 + }, + { + "epoch": 0.0631656663157463, + "grad_norm": 2.375, + "learning_rate": 9.394313290011242e-06, + "loss": 0.7164, + "step": 2310 + }, + { + "epoch": 0.0634391107586716, + "grad_norm": 2.671875, + "learning_rate": 9.391571385484358e-06, + "loss": 0.7498, + "step": 2320 + }, + { + "epoch": 0.06371255520159691, + "grad_norm": 2.65625, + "learning_rate": 9.388829480957474e-06, + "loss": 0.6796, + "step": 2330 + }, + { + "epoch": 0.06398599964452223, + "grad_norm": 2.890625, + "learning_rate": 9.386087576430589e-06, + "loss": 0.7478, + "step": 2340 + }, + { + "epoch": 0.06425944408744753, + "grad_norm": 2.5625, + "learning_rate": 9.383345671903705e-06, + "loss": 0.7363, + "step": 2350 + }, + { + "epoch": 0.06453288853037284, + "grad_norm": 2.53125, + "learning_rate": 9.38060376737682e-06, + "loss": 0.6932, + "step": 2360 + }, + { + "epoch": 0.06480633297329814, + "grad_norm": 2.75, + "learning_rate": 9.377861862849936e-06, + "loss": 0.7147, + "step": 2370 + }, + { + "epoch": 0.06507977741622346, + "grad_norm": 2.5625, + "learning_rate": 9.375119958323052e-06, + "loss": 0.7282, + "step": 2380 + }, + { + "epoch": 0.06535322185914877, + "grad_norm": 2.515625, + "learning_rate": 9.372378053796167e-06, + "loss": 0.6753, + "step": 2390 + }, + { + "epoch": 0.06562666630207407, + "grad_norm": 2.21875, + "learning_rate": 9.369636149269283e-06, + "loss": 0.6923, + "step": 2400 + }, + { + "epoch": 0.06590011074499938, + "grad_norm": 2.65625, + "learning_rate": 9.3668942447424e-06, + "loss": 0.6947, + "step": 2410 + }, + { + "epoch": 0.0661735551879247, + "grad_norm": 2.953125, + "learning_rate": 9.364152340215514e-06, + "loss": 0.6729, + "step": 2420 + }, + { + "epoch": 0.06644699963085, + "grad_norm": 2.515625, + "learning_rate": 9.36141043568863e-06, + "loss": 0.7164, + "step": 2430 + }, + { + "epoch": 0.06672044407377531, + "grad_norm": 2.515625, + "learning_rate": 9.358668531161745e-06, + "loss": 0.7092, + "step": 2440 + }, + { + "epoch": 0.06699388851670061, + "grad_norm": 2.78125, + "learning_rate": 9.355926626634862e-06, + "loss": 0.791, + "step": 2450 + }, + { + "epoch": 0.06726733295962593, + "grad_norm": 2.890625, + "learning_rate": 9.353184722107978e-06, + "loss": 0.7566, + "step": 2460 + }, + { + "epoch": 0.06754077740255124, + "grad_norm": 2.59375, + "learning_rate": 9.350442817581093e-06, + "loss": 0.6943, + "step": 2470 + }, + { + "epoch": 0.06781422184547654, + "grad_norm": 2.375, + "learning_rate": 9.347700913054209e-06, + "loss": 0.7462, + "step": 2480 + }, + { + "epoch": 0.06808766628840185, + "grad_norm": 2.359375, + "learning_rate": 9.344959008527324e-06, + "loss": 0.6632, + "step": 2490 + }, + { + "epoch": 0.06836111073132717, + "grad_norm": 2.40625, + "learning_rate": 9.34221710400044e-06, + "loss": 0.7105, + "step": 2500 + }, + { + "epoch": 0.06863455517425247, + "grad_norm": 2.96875, + "learning_rate": 9.339475199473556e-06, + "loss": 0.7333, + "step": 2510 + }, + { + "epoch": 0.06890799961717778, + "grad_norm": 2.8125, + "learning_rate": 9.336733294946671e-06, + "loss": 0.7125, + "step": 2520 + }, + { + "epoch": 0.06918144406010308, + "grad_norm": 2.515625, + "learning_rate": 9.333991390419787e-06, + "loss": 0.7751, + "step": 2530 + }, + { + "epoch": 0.0694548885030284, + "grad_norm": 2.46875, + "learning_rate": 9.331249485892902e-06, + "loss": 0.7913, + "step": 2540 + }, + { + "epoch": 0.06972833294595371, + "grad_norm": 2.734375, + "learning_rate": 9.328507581366018e-06, + "loss": 0.7526, + "step": 2550 + }, + { + "epoch": 0.07000177738887901, + "grad_norm": 1.7734375, + "learning_rate": 9.325765676839133e-06, + "loss": 0.6748, + "step": 2560 + }, + { + "epoch": 0.07027522183180432, + "grad_norm": 2.515625, + "learning_rate": 9.323023772312249e-06, + "loss": 0.7181, + "step": 2570 + }, + { + "epoch": 0.07054866627472964, + "grad_norm": 2.484375, + "learning_rate": 9.320281867785364e-06, + "loss": 0.7296, + "step": 2580 + }, + { + "epoch": 0.07082211071765494, + "grad_norm": 1.984375, + "learning_rate": 9.31753996325848e-06, + "loss": 0.6338, + "step": 2590 + }, + { + "epoch": 0.07109555516058025, + "grad_norm": 2.609375, + "learning_rate": 9.314798058731596e-06, + "loss": 0.7439, + "step": 2600 + }, + { + "epoch": 0.07136899960350555, + "grad_norm": 2.953125, + "learning_rate": 9.312056154204711e-06, + "loss": 0.7339, + "step": 2610 + }, + { + "epoch": 0.07164244404643087, + "grad_norm": 3.078125, + "learning_rate": 9.309314249677827e-06, + "loss": 0.7345, + "step": 2620 + }, + { + "epoch": 0.07191588848935618, + "grad_norm": 2.875, + "learning_rate": 9.306572345150942e-06, + "loss": 0.7684, + "step": 2630 + }, + { + "epoch": 0.07218933293228148, + "grad_norm": 2.3125, + "learning_rate": 9.303830440624058e-06, + "loss": 0.6764, + "step": 2640 + }, + { + "epoch": 0.07246277737520679, + "grad_norm": 2.703125, + "learning_rate": 9.301088536097173e-06, + "loss": 0.698, + "step": 2650 + }, + { + "epoch": 0.07273622181813211, + "grad_norm": 2.21875, + "learning_rate": 9.29834663157029e-06, + "loss": 0.7214, + "step": 2660 + }, + { + "epoch": 0.07300966626105741, + "grad_norm": 2.734375, + "learning_rate": 9.295604727043404e-06, + "loss": 0.7003, + "step": 2670 + }, + { + "epoch": 0.07328311070398272, + "grad_norm": 2.34375, + "learning_rate": 9.29286282251652e-06, + "loss": 0.7278, + "step": 2680 + }, + { + "epoch": 0.07355655514690802, + "grad_norm": 2.546875, + "learning_rate": 9.290120917989637e-06, + "loss": 0.7587, + "step": 2690 + }, + { + "epoch": 0.07382999958983334, + "grad_norm": 2.453125, + "learning_rate": 9.287379013462751e-06, + "loss": 0.6787, + "step": 2700 + }, + { + "epoch": 0.07410344403275865, + "grad_norm": 2.828125, + "learning_rate": 9.284637108935868e-06, + "loss": 0.7689, + "step": 2710 + }, + { + "epoch": 0.07437688847568395, + "grad_norm": 2.734375, + "learning_rate": 9.281895204408982e-06, + "loss": 0.7211, + "step": 2720 + }, + { + "epoch": 0.07465033291860926, + "grad_norm": 2.734375, + "learning_rate": 9.279153299882099e-06, + "loss": 0.7231, + "step": 2730 + }, + { + "epoch": 0.07492377736153458, + "grad_norm": 2.6875, + "learning_rate": 9.276411395355215e-06, + "loss": 0.7228, + "step": 2740 + }, + { + "epoch": 0.07519722180445988, + "grad_norm": 2.5, + "learning_rate": 9.27366949082833e-06, + "loss": 0.6982, + "step": 2750 + }, + { + "epoch": 0.07547066624738519, + "grad_norm": 2.5, + "learning_rate": 9.270927586301446e-06, + "loss": 0.7375, + "step": 2760 + }, + { + "epoch": 0.07574411069031049, + "grad_norm": 2.078125, + "learning_rate": 9.268185681774562e-06, + "loss": 0.7524, + "step": 2770 + }, + { + "epoch": 0.0760175551332358, + "grad_norm": 3.109375, + "learning_rate": 9.265443777247677e-06, + "loss": 0.7594, + "step": 2780 + }, + { + "epoch": 0.07629099957616112, + "grad_norm": 2.875, + "learning_rate": 9.262701872720793e-06, + "loss": 0.7392, + "step": 2790 + }, + { + "epoch": 0.07656444401908642, + "grad_norm": 2.59375, + "learning_rate": 9.259959968193908e-06, + "loss": 0.6191, + "step": 2800 + }, + { + "epoch": 0.07683788846201173, + "grad_norm": 2.203125, + "learning_rate": 9.257218063667024e-06, + "loss": 0.694, + "step": 2810 + }, + { + "epoch": 0.07711133290493703, + "grad_norm": 2.453125, + "learning_rate": 9.25447615914014e-06, + "loss": 0.7963, + "step": 2820 + }, + { + "epoch": 0.07738477734786235, + "grad_norm": 2.46875, + "learning_rate": 9.251734254613255e-06, + "loss": 0.7302, + "step": 2830 + }, + { + "epoch": 0.07765822179078766, + "grad_norm": 2.609375, + "learning_rate": 9.248992350086371e-06, + "loss": 0.7355, + "step": 2840 + }, + { + "epoch": 0.07793166623371296, + "grad_norm": 3.109375, + "learning_rate": 9.246250445559486e-06, + "loss": 0.702, + "step": 2850 + }, + { + "epoch": 0.07820511067663827, + "grad_norm": 2.140625, + "learning_rate": 9.243508541032602e-06, + "loss": 0.6293, + "step": 2860 + }, + { + "epoch": 0.07847855511956359, + "grad_norm": 2.84375, + "learning_rate": 9.240766636505719e-06, + "loss": 0.733, + "step": 2870 + }, + { + "epoch": 0.07875199956248889, + "grad_norm": 2.734375, + "learning_rate": 9.238024731978833e-06, + "loss": 0.7017, + "step": 2880 + }, + { + "epoch": 0.0790254440054142, + "grad_norm": 2.3125, + "learning_rate": 9.23528282745195e-06, + "loss": 0.7087, + "step": 2890 + }, + { + "epoch": 0.0792988884483395, + "grad_norm": 2.578125, + "learning_rate": 9.232540922925064e-06, + "loss": 0.7148, + "step": 2900 + }, + { + "epoch": 0.07957233289126482, + "grad_norm": 2.515625, + "learning_rate": 9.22979901839818e-06, + "loss": 0.6705, + "step": 2910 + }, + { + "epoch": 0.07984577733419013, + "grad_norm": 2.75, + "learning_rate": 9.227057113871297e-06, + "loss": 0.8337, + "step": 2920 + }, + { + "epoch": 0.08011922177711543, + "grad_norm": 2.484375, + "learning_rate": 9.224315209344412e-06, + "loss": 0.7417, + "step": 2930 + }, + { + "epoch": 0.08039266622004074, + "grad_norm": 2.734375, + "learning_rate": 9.221573304817528e-06, + "loss": 0.6853, + "step": 2940 + }, + { + "epoch": 0.08066611066296606, + "grad_norm": 2.390625, + "learning_rate": 9.218831400290643e-06, + "loss": 0.7064, + "step": 2950 + }, + { + "epoch": 0.08093955510589136, + "grad_norm": 2.359375, + "learning_rate": 9.216089495763759e-06, + "loss": 0.7303, + "step": 2960 + }, + { + "epoch": 0.08121299954881667, + "grad_norm": 2.4375, + "learning_rate": 9.213347591236874e-06, + "loss": 0.7357, + "step": 2970 + }, + { + "epoch": 0.08148644399174197, + "grad_norm": 2.25, + "learning_rate": 9.21060568670999e-06, + "loss": 0.7279, + "step": 2980 + }, + { + "epoch": 0.08175988843466729, + "grad_norm": 2.890625, + "learning_rate": 9.207863782183104e-06, + "loss": 0.7148, + "step": 2990 + }, + { + "epoch": 0.0820333328775926, + "grad_norm": 2.984375, + "learning_rate": 9.20512187765622e-06, + "loss": 0.7621, + "step": 3000 + }, + { + "epoch": 0.0823067773205179, + "grad_norm": 2.296875, + "learning_rate": 9.202379973129335e-06, + "loss": 0.7009, + "step": 3010 + }, + { + "epoch": 0.0825802217634432, + "grad_norm": 2.421875, + "learning_rate": 9.199638068602452e-06, + "loss": 0.6917, + "step": 3020 + }, + { + "epoch": 0.08285366620636853, + "grad_norm": 2.515625, + "learning_rate": 9.196896164075566e-06, + "loss": 0.6573, + "step": 3030 + }, + { + "epoch": 0.08312711064929383, + "grad_norm": 2.53125, + "learning_rate": 9.194154259548683e-06, + "loss": 0.7536, + "step": 3040 + }, + { + "epoch": 0.08340055509221914, + "grad_norm": 2.609375, + "learning_rate": 9.191412355021799e-06, + "loss": 0.7113, + "step": 3050 + }, + { + "epoch": 0.08367399953514444, + "grad_norm": 2.484375, + "learning_rate": 9.188670450494914e-06, + "loss": 0.6865, + "step": 3060 + }, + { + "epoch": 0.08394744397806976, + "grad_norm": 2.71875, + "learning_rate": 9.18592854596803e-06, + "loss": 0.6777, + "step": 3070 + }, + { + "epoch": 0.08422088842099507, + "grad_norm": 2.453125, + "learning_rate": 9.183186641441145e-06, + "loss": 0.7431, + "step": 3080 + }, + { + "epoch": 0.08449433286392037, + "grad_norm": 2.859375, + "learning_rate": 9.180444736914261e-06, + "loss": 0.8253, + "step": 3090 + }, + { + "epoch": 0.08476777730684568, + "grad_norm": 2.96875, + "learning_rate": 9.177702832387377e-06, + "loss": 0.6511, + "step": 3100 + }, + { + "epoch": 0.085041221749771, + "grad_norm": 2.625, + "learning_rate": 9.174960927860492e-06, + "loss": 0.6804, + "step": 3110 + }, + { + "epoch": 0.0853146661926963, + "grad_norm": 2.8125, + "learning_rate": 9.172219023333608e-06, + "loss": 0.7164, + "step": 3120 + }, + { + "epoch": 0.0855881106356216, + "grad_norm": 2.0, + "learning_rate": 9.169477118806725e-06, + "loss": 0.6317, + "step": 3130 + }, + { + "epoch": 0.08586155507854691, + "grad_norm": 2.375, + "learning_rate": 9.16673521427984e-06, + "loss": 0.6372, + "step": 3140 + }, + { + "epoch": 0.08613499952147223, + "grad_norm": 2.53125, + "learning_rate": 9.163993309752956e-06, + "loss": 0.8034, + "step": 3150 + }, + { + "epoch": 0.08640844396439754, + "grad_norm": 2.796875, + "learning_rate": 9.16125140522607e-06, + "loss": 0.7131, + "step": 3160 + }, + { + "epoch": 0.08668188840732284, + "grad_norm": 2.140625, + "learning_rate": 9.158509500699187e-06, + "loss": 0.7006, + "step": 3170 + }, + { + "epoch": 0.08695533285024815, + "grad_norm": 2.28125, + "learning_rate": 9.155767596172303e-06, + "loss": 0.6829, + "step": 3180 + }, + { + "epoch": 0.08722877729317347, + "grad_norm": 2.765625, + "learning_rate": 9.153025691645417e-06, + "loss": 0.7358, + "step": 3190 + }, + { + "epoch": 0.08750222173609877, + "grad_norm": 2.8125, + "learning_rate": 9.150283787118534e-06, + "loss": 0.7103, + "step": 3200 + }, + { + "epoch": 0.08777566617902408, + "grad_norm": 3.015625, + "learning_rate": 9.147541882591648e-06, + "loss": 0.7994, + "step": 3210 + }, + { + "epoch": 0.08804911062194938, + "grad_norm": 2.65625, + "learning_rate": 9.144799978064765e-06, + "loss": 0.73, + "step": 3220 + }, + { + "epoch": 0.0883225550648747, + "grad_norm": 2.796875, + "learning_rate": 9.142058073537881e-06, + "loss": 0.7445, + "step": 3230 + }, + { + "epoch": 0.0885959995078, + "grad_norm": 2.921875, + "learning_rate": 9.139316169010996e-06, + "loss": 0.7588, + "step": 3240 + }, + { + "epoch": 0.08886944395072531, + "grad_norm": 2.765625, + "learning_rate": 9.136574264484112e-06, + "loss": 0.7597, + "step": 3250 + }, + { + "epoch": 0.08914288839365062, + "grad_norm": 2.484375, + "learning_rate": 9.133832359957227e-06, + "loss": 0.6559, + "step": 3260 + }, + { + "epoch": 0.08941633283657593, + "grad_norm": 2.734375, + "learning_rate": 9.131090455430343e-06, + "loss": 0.7017, + "step": 3270 + }, + { + "epoch": 0.08968977727950124, + "grad_norm": 2.953125, + "learning_rate": 9.12834855090346e-06, + "loss": 0.7672, + "step": 3280 + }, + { + "epoch": 0.08996322172242655, + "grad_norm": 2.75, + "learning_rate": 9.125606646376574e-06, + "loss": 0.7921, + "step": 3290 + }, + { + "epoch": 0.09023666616535185, + "grad_norm": 2.125, + "learning_rate": 9.12286474184969e-06, + "loss": 0.7452, + "step": 3300 + }, + { + "epoch": 0.09051011060827717, + "grad_norm": 2.015625, + "learning_rate": 9.120122837322805e-06, + "loss": 0.6165, + "step": 3310 + }, + { + "epoch": 0.09078355505120247, + "grad_norm": 2.078125, + "learning_rate": 9.117380932795921e-06, + "loss": 0.7387, + "step": 3320 + }, + { + "epoch": 0.09105699949412778, + "grad_norm": 2.375, + "learning_rate": 9.114639028269036e-06, + "loss": 0.7043, + "step": 3330 + }, + { + "epoch": 0.09133044393705309, + "grad_norm": 2.796875, + "learning_rate": 9.111897123742152e-06, + "loss": 0.755, + "step": 3340 + }, + { + "epoch": 0.0916038883799784, + "grad_norm": 2.3125, + "learning_rate": 9.109155219215267e-06, + "loss": 0.6737, + "step": 3350 + }, + { + "epoch": 0.09187733282290371, + "grad_norm": 2.546875, + "learning_rate": 9.106413314688383e-06, + "loss": 0.7364, + "step": 3360 + }, + { + "epoch": 0.09215077726582901, + "grad_norm": 2.140625, + "learning_rate": 9.103671410161498e-06, + "loss": 0.6957, + "step": 3370 + }, + { + "epoch": 0.09242422170875432, + "grad_norm": 2.484375, + "learning_rate": 9.100929505634614e-06, + "loss": 0.747, + "step": 3380 + }, + { + "epoch": 0.09269766615167964, + "grad_norm": 2.6875, + "learning_rate": 9.09818760110773e-06, + "loss": 0.7225, + "step": 3390 + }, + { + "epoch": 0.09297111059460494, + "grad_norm": 2.734375, + "learning_rate": 9.095445696580845e-06, + "loss": 0.6379, + "step": 3400 + }, + { + "epoch": 0.09324455503753025, + "grad_norm": 2.484375, + "learning_rate": 9.092703792053961e-06, + "loss": 0.7161, + "step": 3410 + }, + { + "epoch": 0.09351799948045555, + "grad_norm": 2.484375, + "learning_rate": 9.089961887527076e-06, + "loss": 0.7085, + "step": 3420 + }, + { + "epoch": 0.09379144392338087, + "grad_norm": 2.4375, + "learning_rate": 9.087219983000192e-06, + "loss": 0.7941, + "step": 3430 + }, + { + "epoch": 0.09406488836630618, + "grad_norm": 2.28125, + "learning_rate": 9.084478078473307e-06, + "loss": 0.6901, + "step": 3440 + }, + { + "epoch": 0.09433833280923148, + "grad_norm": 2.5625, + "learning_rate": 9.081736173946423e-06, + "loss": 0.7395, + "step": 3450 + }, + { + "epoch": 0.09461177725215679, + "grad_norm": 2.46875, + "learning_rate": 9.07899426941954e-06, + "loss": 0.6675, + "step": 3460 + }, + { + "epoch": 0.0948852216950821, + "grad_norm": 2.203125, + "learning_rate": 9.076252364892654e-06, + "loss": 0.6616, + "step": 3470 + }, + { + "epoch": 0.09515866613800741, + "grad_norm": 3.21875, + "learning_rate": 9.07351046036577e-06, + "loss": 0.7121, + "step": 3480 + }, + { + "epoch": 0.09543211058093272, + "grad_norm": 2.90625, + "learning_rate": 9.070768555838887e-06, + "loss": 0.6962, + "step": 3490 + }, + { + "epoch": 0.09570555502385802, + "grad_norm": 2.4375, + "learning_rate": 9.068026651312002e-06, + "loss": 0.6692, + "step": 3500 + }, + { + "epoch": 0.09597899946678333, + "grad_norm": 2.734375, + "learning_rate": 9.065284746785118e-06, + "loss": 0.7412, + "step": 3510 + }, + { + "epoch": 0.09625244390970865, + "grad_norm": 2.828125, + "learning_rate": 9.062542842258233e-06, + "loss": 0.7908, + "step": 3520 + }, + { + "epoch": 0.09652588835263395, + "grad_norm": 2.5, + "learning_rate": 9.059800937731349e-06, + "loss": 0.7482, + "step": 3530 + }, + { + "epoch": 0.09679933279555926, + "grad_norm": 2.703125, + "learning_rate": 9.057059033204465e-06, + "loss": 0.7422, + "step": 3540 + }, + { + "epoch": 0.09707277723848456, + "grad_norm": 2.53125, + "learning_rate": 9.05431712867758e-06, + "loss": 0.7405, + "step": 3550 + }, + { + "epoch": 0.09734622168140988, + "grad_norm": 2.71875, + "learning_rate": 9.051575224150696e-06, + "loss": 0.7859, + "step": 3560 + }, + { + "epoch": 0.09761966612433519, + "grad_norm": 2.46875, + "learning_rate": 9.048833319623811e-06, + "loss": 0.684, + "step": 3570 + }, + { + "epoch": 0.0978931105672605, + "grad_norm": 2.796875, + "learning_rate": 9.046091415096927e-06, + "loss": 0.7065, + "step": 3580 + }, + { + "epoch": 0.0981665550101858, + "grad_norm": 2.015625, + "learning_rate": 9.043349510570044e-06, + "loss": 0.7461, + "step": 3590 + }, + { + "epoch": 0.09843999945311112, + "grad_norm": 2.921875, + "learning_rate": 9.040607606043158e-06, + "loss": 0.7034, + "step": 3600 + }, + { + "epoch": 0.09871344389603642, + "grad_norm": 2.484375, + "learning_rate": 9.037865701516275e-06, + "loss": 0.7345, + "step": 3610 + }, + { + "epoch": 0.09898688833896173, + "grad_norm": 2.9375, + "learning_rate": 9.035123796989389e-06, + "loss": 0.6923, + "step": 3620 + }, + { + "epoch": 0.09926033278188703, + "grad_norm": 2.390625, + "learning_rate": 9.032381892462505e-06, + "loss": 0.6892, + "step": 3630 + }, + { + "epoch": 0.09953377722481235, + "grad_norm": 2.328125, + "learning_rate": 9.029639987935622e-06, + "loss": 0.6434, + "step": 3640 + }, + { + "epoch": 0.09980722166773766, + "grad_norm": 2.703125, + "learning_rate": 9.026898083408736e-06, + "loss": 0.6552, + "step": 3650 + }, + { + "epoch": 0.10008066611066296, + "grad_norm": 2.359375, + "learning_rate": 9.024156178881853e-06, + "loss": 0.7417, + "step": 3660 + }, + { + "epoch": 0.10035411055358827, + "grad_norm": 3.265625, + "learning_rate": 9.021414274354967e-06, + "loss": 0.6971, + "step": 3670 + }, + { + "epoch": 0.10062755499651359, + "grad_norm": 2.890625, + "learning_rate": 9.018672369828084e-06, + "loss": 0.772, + "step": 3680 + }, + { + "epoch": 0.1009009994394389, + "grad_norm": 2.78125, + "learning_rate": 9.015930465301198e-06, + "loss": 0.6931, + "step": 3690 + }, + { + "epoch": 0.1011744438823642, + "grad_norm": 2.328125, + "learning_rate": 9.013188560774315e-06, + "loss": 0.7003, + "step": 3700 + }, + { + "epoch": 0.1014478883252895, + "grad_norm": 2.84375, + "learning_rate": 9.010446656247431e-06, + "loss": 0.7358, + "step": 3710 + }, + { + "epoch": 0.10172133276821482, + "grad_norm": 2.71875, + "learning_rate": 9.007704751720546e-06, + "loss": 0.6683, + "step": 3720 + }, + { + "epoch": 0.10199477721114013, + "grad_norm": 2.59375, + "learning_rate": 9.004962847193662e-06, + "loss": 0.7268, + "step": 3730 + }, + { + "epoch": 0.10226822165406543, + "grad_norm": 2.53125, + "learning_rate": 9.002220942666777e-06, + "loss": 0.7729, + "step": 3740 + }, + { + "epoch": 0.10254166609699074, + "grad_norm": 2.375, + "learning_rate": 8.999479038139893e-06, + "loss": 0.6931, + "step": 3750 + }, + { + "epoch": 0.10281511053991606, + "grad_norm": 2.421875, + "learning_rate": 8.996737133613008e-06, + "loss": 0.6683, + "step": 3760 + }, + { + "epoch": 0.10308855498284136, + "grad_norm": 2.765625, + "learning_rate": 8.993995229086124e-06, + "loss": 0.7224, + "step": 3770 + }, + { + "epoch": 0.10336199942576667, + "grad_norm": 2.96875, + "learning_rate": 8.991253324559239e-06, + "loss": 0.6687, + "step": 3780 + }, + { + "epoch": 0.10363544386869197, + "grad_norm": 3.046875, + "learning_rate": 8.988511420032355e-06, + "loss": 0.7198, + "step": 3790 + }, + { + "epoch": 0.10390888831161729, + "grad_norm": 2.3125, + "learning_rate": 8.985769515505471e-06, + "loss": 0.7144, + "step": 3800 + }, + { + "epoch": 0.1041823327545426, + "grad_norm": 2.703125, + "learning_rate": 8.983027610978586e-06, + "loss": 0.687, + "step": 3810 + }, + { + "epoch": 0.1044557771974679, + "grad_norm": 2.875, + "learning_rate": 8.980285706451702e-06, + "loss": 0.6806, + "step": 3820 + }, + { + "epoch": 0.10472922164039321, + "grad_norm": 2.8125, + "learning_rate": 8.977543801924817e-06, + "loss": 0.7434, + "step": 3830 + }, + { + "epoch": 0.10500266608331853, + "grad_norm": 2.578125, + "learning_rate": 8.974801897397933e-06, + "loss": 0.6502, + "step": 3840 + }, + { + "epoch": 0.10527611052624383, + "grad_norm": 2.46875, + "learning_rate": 8.97205999287105e-06, + "loss": 0.7147, + "step": 3850 + }, + { + "epoch": 0.10554955496916914, + "grad_norm": 2.28125, + "learning_rate": 8.969318088344164e-06, + "loss": 0.7403, + "step": 3860 + }, + { + "epoch": 0.10582299941209444, + "grad_norm": 3.953125, + "learning_rate": 8.96657618381728e-06, + "loss": 0.6041, + "step": 3870 + }, + { + "epoch": 0.10609644385501976, + "grad_norm": 2.53125, + "learning_rate": 8.963834279290395e-06, + "loss": 0.7531, + "step": 3880 + }, + { + "epoch": 0.10636988829794507, + "grad_norm": 2.4375, + "learning_rate": 8.961092374763511e-06, + "loss": 0.6987, + "step": 3890 + }, + { + "epoch": 0.10664333274087037, + "grad_norm": 2.703125, + "learning_rate": 8.958350470236628e-06, + "loss": 0.7257, + "step": 3900 + }, + { + "epoch": 0.10691677718379568, + "grad_norm": 2.625, + "learning_rate": 8.955608565709742e-06, + "loss": 0.7666, + "step": 3910 + }, + { + "epoch": 0.107190221626721, + "grad_norm": 2.75, + "learning_rate": 8.952866661182859e-06, + "loss": 0.7229, + "step": 3920 + }, + { + "epoch": 0.1074636660696463, + "grad_norm": 2.703125, + "learning_rate": 8.950124756655973e-06, + "loss": 0.6536, + "step": 3930 + }, + { + "epoch": 0.10773711051257161, + "grad_norm": 2.484375, + "learning_rate": 8.94738285212909e-06, + "loss": 0.7457, + "step": 3940 + }, + { + "epoch": 0.10801055495549691, + "grad_norm": 2.390625, + "learning_rate": 8.944640947602206e-06, + "loss": 0.6894, + "step": 3950 + }, + { + "epoch": 0.10828399939842223, + "grad_norm": 2.515625, + "learning_rate": 8.94189904307532e-06, + "loss": 0.6934, + "step": 3960 + }, + { + "epoch": 0.10855744384134754, + "grad_norm": 2.609375, + "learning_rate": 8.939157138548437e-06, + "loss": 0.7197, + "step": 3970 + }, + { + "epoch": 0.10883088828427284, + "grad_norm": 2.609375, + "learning_rate": 8.936415234021553e-06, + "loss": 0.6729, + "step": 3980 + }, + { + "epoch": 0.10910433272719815, + "grad_norm": 2.953125, + "learning_rate": 8.933673329494668e-06, + "loss": 0.7135, + "step": 3990 + }, + { + "epoch": 0.10937777717012347, + "grad_norm": 2.515625, + "learning_rate": 8.930931424967784e-06, + "loss": 0.7233, + "step": 4000 + }, + { + "epoch": 0.10965122161304877, + "grad_norm": 2.796875, + "learning_rate": 8.928189520440899e-06, + "loss": 0.7577, + "step": 4010 + }, + { + "epoch": 0.10992466605597408, + "grad_norm": 2.84375, + "learning_rate": 8.925447615914015e-06, + "loss": 0.7103, + "step": 4020 + }, + { + "epoch": 0.11019811049889938, + "grad_norm": 2.78125, + "learning_rate": 8.922705711387132e-06, + "loss": 0.7714, + "step": 4030 + }, + { + "epoch": 0.1104715549418247, + "grad_norm": 2.53125, + "learning_rate": 8.919963806860246e-06, + "loss": 0.7234, + "step": 4040 + }, + { + "epoch": 0.11074499938475, + "grad_norm": 2.25, + "learning_rate": 8.917221902333362e-06, + "loss": 0.6417, + "step": 4050 + }, + { + "epoch": 0.11101844382767531, + "grad_norm": 2.375, + "learning_rate": 8.914479997806477e-06, + "loss": 0.7264, + "step": 4060 + }, + { + "epoch": 0.11129188827060062, + "grad_norm": 2.71875, + "learning_rate": 8.911738093279593e-06, + "loss": 0.6653, + "step": 4070 + }, + { + "epoch": 0.11156533271352594, + "grad_norm": 2.96875, + "learning_rate": 8.908996188752708e-06, + "loss": 0.6545, + "step": 4080 + }, + { + "epoch": 0.11183877715645124, + "grad_norm": 2.484375, + "learning_rate": 8.906254284225824e-06, + "loss": 0.7, + "step": 4090 + }, + { + "epoch": 0.11211222159937655, + "grad_norm": 2.8125, + "learning_rate": 8.903512379698939e-06, + "loss": 0.638, + "step": 4100 + }, + { + "epoch": 0.11238566604230185, + "grad_norm": 2.5, + "learning_rate": 8.900770475172055e-06, + "loss": 0.7191, + "step": 4110 + }, + { + "epoch": 0.11265911048522717, + "grad_norm": 2.546875, + "learning_rate": 8.89802857064517e-06, + "loss": 0.7591, + "step": 4120 + }, + { + "epoch": 0.11293255492815248, + "grad_norm": 2.671875, + "learning_rate": 8.895286666118286e-06, + "loss": 0.7115, + "step": 4130 + }, + { + "epoch": 0.11320599937107778, + "grad_norm": 2.828125, + "learning_rate": 8.892544761591401e-06, + "loss": 0.7683, + "step": 4140 + }, + { + "epoch": 0.11347944381400309, + "grad_norm": 2.640625, + "learning_rate": 8.889802857064517e-06, + "loss": 0.6935, + "step": 4150 + }, + { + "epoch": 0.11375288825692839, + "grad_norm": 2.421875, + "learning_rate": 8.887060952537634e-06, + "loss": 0.7217, + "step": 4160 + }, + { + "epoch": 0.11402633269985371, + "grad_norm": 2.203125, + "learning_rate": 8.884319048010748e-06, + "loss": 0.7102, + "step": 4170 + }, + { + "epoch": 0.11429977714277902, + "grad_norm": 2.6875, + "learning_rate": 8.881577143483865e-06, + "loss": 0.7157, + "step": 4180 + }, + { + "epoch": 0.11457322158570432, + "grad_norm": 2.515625, + "learning_rate": 8.87883523895698e-06, + "loss": 0.8331, + "step": 4190 + }, + { + "epoch": 0.11484666602862963, + "grad_norm": 2.546875, + "learning_rate": 8.876093334430096e-06, + "loss": 0.6918, + "step": 4200 + }, + { + "epoch": 0.11512011047155495, + "grad_norm": 2.640625, + "learning_rate": 8.873351429903212e-06, + "loss": 0.7676, + "step": 4210 + }, + { + "epoch": 0.11539355491448025, + "grad_norm": 2.828125, + "learning_rate": 8.870609525376327e-06, + "loss": 0.7235, + "step": 4220 + }, + { + "epoch": 0.11566699935740556, + "grad_norm": 2.5, + "learning_rate": 8.867867620849443e-06, + "loss": 0.6853, + "step": 4230 + }, + { + "epoch": 0.11594044380033086, + "grad_norm": 2.625, + "learning_rate": 8.865125716322558e-06, + "loss": 0.6729, + "step": 4240 + }, + { + "epoch": 0.11621388824325618, + "grad_norm": 2.90625, + "learning_rate": 8.862383811795674e-06, + "loss": 0.7693, + "step": 4250 + }, + { + "epoch": 0.11648733268618149, + "grad_norm": 2.109375, + "learning_rate": 8.85964190726879e-06, + "loss": 0.7707, + "step": 4260 + }, + { + "epoch": 0.11676077712910679, + "grad_norm": 2.421875, + "learning_rate": 8.856900002741905e-06, + "loss": 0.7274, + "step": 4270 + }, + { + "epoch": 0.1170342215720321, + "grad_norm": 2.609375, + "learning_rate": 8.854158098215021e-06, + "loss": 0.6655, + "step": 4280 + }, + { + "epoch": 0.11730766601495742, + "grad_norm": 2.84375, + "learning_rate": 8.851416193688136e-06, + "loss": 0.7099, + "step": 4290 + }, + { + "epoch": 0.11758111045788272, + "grad_norm": 2.71875, + "learning_rate": 8.848674289161252e-06, + "loss": 0.7477, + "step": 4300 + }, + { + "epoch": 0.11785455490080803, + "grad_norm": 2.265625, + "learning_rate": 8.845932384634368e-06, + "loss": 0.6887, + "step": 4310 + }, + { + "epoch": 0.11812799934373333, + "grad_norm": 2.453125, + "learning_rate": 8.843190480107483e-06, + "loss": 0.6919, + "step": 4320 + }, + { + "epoch": 0.11840144378665865, + "grad_norm": 2.25, + "learning_rate": 8.8404485755806e-06, + "loss": 0.6508, + "step": 4330 + }, + { + "epoch": 0.11867488822958396, + "grad_norm": 2.390625, + "learning_rate": 8.837706671053716e-06, + "loss": 0.6674, + "step": 4340 + }, + { + "epoch": 0.11894833267250926, + "grad_norm": 2.734375, + "learning_rate": 8.83496476652683e-06, + "loss": 0.7467, + "step": 4350 + }, + { + "epoch": 0.11922177711543457, + "grad_norm": 2.8125, + "learning_rate": 8.832222861999947e-06, + "loss": 0.7059, + "step": 4360 + }, + { + "epoch": 0.11949522155835988, + "grad_norm": 2.5, + "learning_rate": 8.829480957473061e-06, + "loss": 0.6914, + "step": 4370 + }, + { + "epoch": 0.11976866600128519, + "grad_norm": 2.6875, + "learning_rate": 8.826739052946178e-06, + "loss": 0.7489, + "step": 4380 + }, + { + "epoch": 0.1200421104442105, + "grad_norm": 2.703125, + "learning_rate": 8.823997148419294e-06, + "loss": 0.6841, + "step": 4390 + }, + { + "epoch": 0.1203155548871358, + "grad_norm": 2.265625, + "learning_rate": 8.821255243892409e-06, + "loss": 0.6628, + "step": 4400 + }, + { + "epoch": 0.12058899933006112, + "grad_norm": 2.796875, + "learning_rate": 8.818513339365525e-06, + "loss": 0.7411, + "step": 4410 + }, + { + "epoch": 0.12086244377298642, + "grad_norm": 2.09375, + "learning_rate": 8.81577143483864e-06, + "loss": 0.6633, + "step": 4420 + }, + { + "epoch": 0.12113588821591173, + "grad_norm": 2.765625, + "learning_rate": 8.813029530311756e-06, + "loss": 0.7844, + "step": 4430 + }, + { + "epoch": 0.12140933265883704, + "grad_norm": 2.453125, + "learning_rate": 8.81028762578487e-06, + "loss": 0.7023, + "step": 4440 + }, + { + "epoch": 0.12168277710176235, + "grad_norm": 2.78125, + "learning_rate": 8.807545721257987e-06, + "loss": 0.6457, + "step": 4450 + }, + { + "epoch": 0.12195622154468766, + "grad_norm": 2.625, + "learning_rate": 8.804803816731101e-06, + "loss": 0.725, + "step": 4460 + }, + { + "epoch": 0.12222966598761296, + "grad_norm": 2.953125, + "learning_rate": 8.802061912204218e-06, + "loss": 0.7752, + "step": 4470 + }, + { + "epoch": 0.12250311043053827, + "grad_norm": 2.90625, + "learning_rate": 8.799320007677332e-06, + "loss": 0.7277, + "step": 4480 + }, + { + "epoch": 0.12277655487346359, + "grad_norm": 3.015625, + "learning_rate": 8.796578103150449e-06, + "loss": 0.6553, + "step": 4490 + }, + { + "epoch": 0.1230499993163889, + "grad_norm": 2.609375, + "learning_rate": 8.793836198623565e-06, + "loss": 0.7371, + "step": 4500 + }, + { + "epoch": 0.1233234437593142, + "grad_norm": 2.5, + "learning_rate": 8.79109429409668e-06, + "loss": 0.6542, + "step": 4510 + }, + { + "epoch": 0.1235968882022395, + "grad_norm": 2.46875, + "learning_rate": 8.788352389569796e-06, + "loss": 0.7508, + "step": 4520 + }, + { + "epoch": 0.12387033264516482, + "grad_norm": 2.6875, + "learning_rate": 8.78561048504291e-06, + "loss": 0.7154, + "step": 4530 + }, + { + "epoch": 0.12414377708809013, + "grad_norm": 2.390625, + "learning_rate": 8.782868580516027e-06, + "loss": 0.6304, + "step": 4540 + }, + { + "epoch": 0.12441722153101543, + "grad_norm": 3.03125, + "learning_rate": 8.780126675989142e-06, + "loss": 0.7055, + "step": 4550 + }, + { + "epoch": 0.12469066597394074, + "grad_norm": 2.234375, + "learning_rate": 8.777384771462258e-06, + "loss": 0.7053, + "step": 4560 + }, + { + "epoch": 0.12496411041686606, + "grad_norm": 2.828125, + "learning_rate": 8.774642866935374e-06, + "loss": 0.6757, + "step": 4570 + }, + { + "epoch": 0.12523755485979135, + "grad_norm": 3.15625, + "learning_rate": 8.771900962408489e-06, + "loss": 0.659, + "step": 4580 + }, + { + "epoch": 0.12551099930271667, + "grad_norm": 2.96875, + "learning_rate": 8.769159057881605e-06, + "loss": 0.6657, + "step": 4590 + }, + { + "epoch": 0.125784443745642, + "grad_norm": 2.4375, + "learning_rate": 8.76641715335472e-06, + "loss": 0.6331, + "step": 4600 + }, + { + "epoch": 0.12605788818856728, + "grad_norm": 2.609375, + "learning_rate": 8.763675248827836e-06, + "loss": 0.7044, + "step": 4610 + }, + { + "epoch": 0.1263313326314926, + "grad_norm": 2.71875, + "learning_rate": 8.760933344300953e-06, + "loss": 0.6441, + "step": 4620 + }, + { + "epoch": 0.12660477707441792, + "grad_norm": 2.453125, + "learning_rate": 8.758191439774067e-06, + "loss": 0.6773, + "step": 4630 + }, + { + "epoch": 0.1268782215173432, + "grad_norm": 3.03125, + "learning_rate": 8.755449535247184e-06, + "loss": 0.7616, + "step": 4640 + }, + { + "epoch": 0.12715166596026853, + "grad_norm": 3.125, + "learning_rate": 8.752707630720298e-06, + "loss": 0.7642, + "step": 4650 + }, + { + "epoch": 0.12742511040319382, + "grad_norm": 2.953125, + "learning_rate": 8.749965726193415e-06, + "loss": 0.7509, + "step": 4660 + }, + { + "epoch": 0.12769855484611914, + "grad_norm": 2.5625, + "learning_rate": 8.747223821666531e-06, + "loss": 0.6931, + "step": 4670 + }, + { + "epoch": 0.12797199928904446, + "grad_norm": 2.578125, + "learning_rate": 8.744481917139645e-06, + "loss": 0.7423, + "step": 4680 + }, + { + "epoch": 0.12824544373196975, + "grad_norm": 2.53125, + "learning_rate": 8.741740012612762e-06, + "loss": 0.7211, + "step": 4690 + }, + { + "epoch": 0.12851888817489507, + "grad_norm": 2.625, + "learning_rate": 8.738998108085878e-06, + "loss": 0.6715, + "step": 4700 + }, + { + "epoch": 0.1287923326178204, + "grad_norm": 2.625, + "learning_rate": 8.736256203558993e-06, + "loss": 0.6229, + "step": 4710 + }, + { + "epoch": 0.12906577706074568, + "grad_norm": 2.625, + "learning_rate": 8.733514299032109e-06, + "loss": 0.6676, + "step": 4720 + }, + { + "epoch": 0.129339221503671, + "grad_norm": 2.6875, + "learning_rate": 8.730772394505224e-06, + "loss": 0.7498, + "step": 4730 + }, + { + "epoch": 0.1296126659465963, + "grad_norm": 2.6875, + "learning_rate": 8.72803048997834e-06, + "loss": 0.7396, + "step": 4740 + }, + { + "epoch": 0.1298861103895216, + "grad_norm": 2.328125, + "learning_rate": 8.725288585451456e-06, + "loss": 0.7093, + "step": 4750 + }, + { + "epoch": 0.13015955483244693, + "grad_norm": 3.0, + "learning_rate": 8.722546680924571e-06, + "loss": 0.8104, + "step": 4760 + }, + { + "epoch": 0.13043299927537222, + "grad_norm": 2.234375, + "learning_rate": 8.719804776397687e-06, + "loss": 0.6919, + "step": 4770 + }, + { + "epoch": 0.13070644371829754, + "grad_norm": 2.5625, + "learning_rate": 8.717062871870802e-06, + "loss": 0.7879, + "step": 4780 + }, + { + "epoch": 0.13097988816122286, + "grad_norm": 2.671875, + "learning_rate": 8.714320967343918e-06, + "loss": 0.7091, + "step": 4790 + }, + { + "epoch": 0.13125333260414815, + "grad_norm": 3.15625, + "learning_rate": 8.711579062817033e-06, + "loss": 0.7491, + "step": 4800 + }, + { + "epoch": 0.13152677704707347, + "grad_norm": 3.109375, + "learning_rate": 8.70883715829015e-06, + "loss": 0.6757, + "step": 4810 + }, + { + "epoch": 0.13180022148999876, + "grad_norm": 2.421875, + "learning_rate": 8.706095253763266e-06, + "loss": 0.7679, + "step": 4820 + }, + { + "epoch": 0.13207366593292408, + "grad_norm": 2.984375, + "learning_rate": 8.70335334923638e-06, + "loss": 0.7233, + "step": 4830 + }, + { + "epoch": 0.1323471103758494, + "grad_norm": 2.46875, + "learning_rate": 8.700611444709497e-06, + "loss": 0.6943, + "step": 4840 + }, + { + "epoch": 0.1326205548187747, + "grad_norm": 2.765625, + "learning_rate": 8.697869540182611e-06, + "loss": 0.6535, + "step": 4850 + }, + { + "epoch": 0.1328939992617, + "grad_norm": 2.875, + "learning_rate": 8.695127635655728e-06, + "loss": 0.7324, + "step": 4860 + }, + { + "epoch": 0.1331674437046253, + "grad_norm": 2.875, + "learning_rate": 8.692385731128842e-06, + "loss": 0.7232, + "step": 4870 + }, + { + "epoch": 0.13344088814755062, + "grad_norm": 2.640625, + "learning_rate": 8.689643826601959e-06, + "loss": 0.6952, + "step": 4880 + }, + { + "epoch": 0.13371433259047594, + "grad_norm": 2.84375, + "learning_rate": 8.686901922075073e-06, + "loss": 0.7213, + "step": 4890 + }, + { + "epoch": 0.13398777703340123, + "grad_norm": 3.15625, + "learning_rate": 8.68416001754819e-06, + "loss": 0.7087, + "step": 4900 + }, + { + "epoch": 0.13426122147632655, + "grad_norm": 2.59375, + "learning_rate": 8.681418113021304e-06, + "loss": 0.6853, + "step": 4910 + }, + { + "epoch": 0.13453466591925187, + "grad_norm": 2.765625, + "learning_rate": 8.67867620849442e-06, + "loss": 0.6944, + "step": 4920 + }, + { + "epoch": 0.13480811036217716, + "grad_norm": 2.1875, + "learning_rate": 8.675934303967537e-06, + "loss": 0.719, + "step": 4930 + }, + { + "epoch": 0.13508155480510248, + "grad_norm": 2.28125, + "learning_rate": 8.673192399440651e-06, + "loss": 0.7218, + "step": 4940 + }, + { + "epoch": 0.13535499924802777, + "grad_norm": 2.71875, + "learning_rate": 8.670450494913768e-06, + "loss": 0.7447, + "step": 4950 + }, + { + "epoch": 0.1356284436909531, + "grad_norm": 2.640625, + "learning_rate": 8.667708590386882e-06, + "loss": 0.7047, + "step": 4960 + }, + { + "epoch": 0.1359018881338784, + "grad_norm": 2.46875, + "learning_rate": 8.664966685859999e-06, + "loss": 0.6646, + "step": 4970 + }, + { + "epoch": 0.1361753325768037, + "grad_norm": 2.5625, + "learning_rate": 8.662224781333115e-06, + "loss": 0.7471, + "step": 4980 + }, + { + "epoch": 0.13644877701972902, + "grad_norm": 2.84375, + "learning_rate": 8.65948287680623e-06, + "loss": 0.7759, + "step": 4990 + }, + { + "epoch": 0.13672222146265434, + "grad_norm": 2.671875, + "learning_rate": 8.656740972279346e-06, + "loss": 0.7191, + "step": 5000 + }, + { + "epoch": 0.13699566590557963, + "grad_norm": 2.59375, + "learning_rate": 8.65399906775246e-06, + "loss": 0.7392, + "step": 5010 + }, + { + "epoch": 0.13726911034850495, + "grad_norm": 2.53125, + "learning_rate": 8.651257163225577e-06, + "loss": 0.7191, + "step": 5020 + }, + { + "epoch": 0.13754255479143024, + "grad_norm": 2.859375, + "learning_rate": 8.648515258698693e-06, + "loss": 0.7622, + "step": 5030 + }, + { + "epoch": 0.13781599923435556, + "grad_norm": 2.75, + "learning_rate": 8.645773354171808e-06, + "loss": 0.7585, + "step": 5040 + }, + { + "epoch": 0.13808944367728088, + "grad_norm": 2.484375, + "learning_rate": 8.643031449644924e-06, + "loss": 0.6846, + "step": 5050 + }, + { + "epoch": 0.13836288812020617, + "grad_norm": 2.515625, + "learning_rate": 8.64028954511804e-06, + "loss": 0.7397, + "step": 5060 + }, + { + "epoch": 0.1386363325631315, + "grad_norm": 2.875, + "learning_rate": 8.637547640591155e-06, + "loss": 0.6789, + "step": 5070 + }, + { + "epoch": 0.1389097770060568, + "grad_norm": 2.484375, + "learning_rate": 8.634805736064272e-06, + "loss": 0.7214, + "step": 5080 + }, + { + "epoch": 0.1391832214489821, + "grad_norm": 2.625, + "learning_rate": 8.632063831537386e-06, + "loss": 0.7123, + "step": 5090 + }, + { + "epoch": 0.13945666589190742, + "grad_norm": 2.84375, + "learning_rate": 8.629321927010503e-06, + "loss": 0.6605, + "step": 5100 + }, + { + "epoch": 0.1397301103348327, + "grad_norm": 2.6875, + "learning_rate": 8.626580022483619e-06, + "loss": 0.7382, + "step": 5110 + }, + { + "epoch": 0.14000355477775803, + "grad_norm": 2.703125, + "learning_rate": 8.623838117956733e-06, + "loss": 0.7103, + "step": 5120 + }, + { + "epoch": 0.14027699922068335, + "grad_norm": 2.390625, + "learning_rate": 8.62109621342985e-06, + "loss": 0.6646, + "step": 5130 + }, + { + "epoch": 0.14055044366360864, + "grad_norm": 2.734375, + "learning_rate": 8.618354308902964e-06, + "loss": 0.6872, + "step": 5140 + }, + { + "epoch": 0.14082388810653396, + "grad_norm": 3.078125, + "learning_rate": 8.61561240437608e-06, + "loss": 0.708, + "step": 5150 + }, + { + "epoch": 0.14109733254945928, + "grad_norm": 2.75, + "learning_rate": 8.612870499849197e-06, + "loss": 0.6966, + "step": 5160 + }, + { + "epoch": 0.14137077699238457, + "grad_norm": 3.25, + "learning_rate": 8.610128595322312e-06, + "loss": 0.7286, + "step": 5170 + }, + { + "epoch": 0.14164422143530989, + "grad_norm": 2.640625, + "learning_rate": 8.607386690795428e-06, + "loss": 0.6637, + "step": 5180 + }, + { + "epoch": 0.14191766587823518, + "grad_norm": 2.40625, + "learning_rate": 8.604644786268543e-06, + "loss": 0.7338, + "step": 5190 + }, + { + "epoch": 0.1421911103211605, + "grad_norm": 3.34375, + "learning_rate": 8.601902881741659e-06, + "loss": 0.7104, + "step": 5200 + }, + { + "epoch": 0.14246455476408582, + "grad_norm": 2.953125, + "learning_rate": 8.599160977214774e-06, + "loss": 0.7038, + "step": 5210 + }, + { + "epoch": 0.1427379992070111, + "grad_norm": 2.84375, + "learning_rate": 8.59641907268789e-06, + "loss": 0.7704, + "step": 5220 + }, + { + "epoch": 0.14301144364993643, + "grad_norm": 2.53125, + "learning_rate": 8.593677168161005e-06, + "loss": 0.7014, + "step": 5230 + }, + { + "epoch": 0.14328488809286175, + "grad_norm": 2.5, + "learning_rate": 8.590935263634121e-06, + "loss": 0.7237, + "step": 5240 + }, + { + "epoch": 0.14355833253578704, + "grad_norm": 2.953125, + "learning_rate": 8.588193359107236e-06, + "loss": 0.7022, + "step": 5250 + }, + { + "epoch": 0.14383177697871236, + "grad_norm": 2.90625, + "learning_rate": 8.585451454580352e-06, + "loss": 0.694, + "step": 5260 + }, + { + "epoch": 0.14410522142163765, + "grad_norm": 3.046875, + "learning_rate": 8.582709550053468e-06, + "loss": 0.7499, + "step": 5270 + }, + { + "epoch": 0.14437866586456297, + "grad_norm": 2.796875, + "learning_rate": 8.579967645526583e-06, + "loss": 0.6762, + "step": 5280 + }, + { + "epoch": 0.14465211030748829, + "grad_norm": 2.53125, + "learning_rate": 8.5772257409997e-06, + "loss": 0.7721, + "step": 5290 + }, + { + "epoch": 0.14492555475041358, + "grad_norm": 2.625, + "learning_rate": 8.574483836472814e-06, + "loss": 0.7793, + "step": 5300 + }, + { + "epoch": 0.1451989991933389, + "grad_norm": 2.734375, + "learning_rate": 8.57174193194593e-06, + "loss": 0.6799, + "step": 5310 + }, + { + "epoch": 0.14547244363626421, + "grad_norm": 2.828125, + "learning_rate": 8.569000027419045e-06, + "loss": 0.6888, + "step": 5320 + }, + { + "epoch": 0.1457458880791895, + "grad_norm": 2.703125, + "learning_rate": 8.566258122892161e-06, + "loss": 0.7632, + "step": 5330 + }, + { + "epoch": 0.14601933252211483, + "grad_norm": 2.78125, + "learning_rate": 8.563516218365277e-06, + "loss": 0.675, + "step": 5340 + }, + { + "epoch": 0.14629277696504012, + "grad_norm": 2.046875, + "learning_rate": 8.560774313838392e-06, + "loss": 0.6194, + "step": 5350 + }, + { + "epoch": 0.14656622140796544, + "grad_norm": 2.75, + "learning_rate": 8.558032409311508e-06, + "loss": 0.7368, + "step": 5360 + }, + { + "epoch": 0.14683966585089075, + "grad_norm": 2.53125, + "learning_rate": 8.555290504784625e-06, + "loss": 0.619, + "step": 5370 + }, + { + "epoch": 0.14711311029381605, + "grad_norm": 2.625, + "learning_rate": 8.55254860025774e-06, + "loss": 0.6897, + "step": 5380 + }, + { + "epoch": 0.14738655473674137, + "grad_norm": 2.390625, + "learning_rate": 8.549806695730856e-06, + "loss": 0.6628, + "step": 5390 + }, + { + "epoch": 0.14765999917966668, + "grad_norm": 2.53125, + "learning_rate": 8.54706479120397e-06, + "loss": 0.66, + "step": 5400 + }, + { + "epoch": 0.14793344362259198, + "grad_norm": 2.890625, + "learning_rate": 8.544322886677087e-06, + "loss": 0.7292, + "step": 5410 + }, + { + "epoch": 0.1482068880655173, + "grad_norm": 2.53125, + "learning_rate": 8.541580982150203e-06, + "loss": 0.6785, + "step": 5420 + }, + { + "epoch": 0.1484803325084426, + "grad_norm": 2.375, + "learning_rate": 8.538839077623318e-06, + "loss": 0.7086, + "step": 5430 + }, + { + "epoch": 0.1487537769513679, + "grad_norm": 2.609375, + "learning_rate": 8.536097173096434e-06, + "loss": 0.6942, + "step": 5440 + }, + { + "epoch": 0.14902722139429322, + "grad_norm": 2.453125, + "learning_rate": 8.533355268569549e-06, + "loss": 0.6452, + "step": 5450 + }, + { + "epoch": 0.14930066583721852, + "grad_norm": 2.828125, + "learning_rate": 8.530613364042665e-06, + "loss": 0.6932, + "step": 5460 + }, + { + "epoch": 0.14957411028014383, + "grad_norm": 2.453125, + "learning_rate": 8.527871459515781e-06, + "loss": 0.6652, + "step": 5470 + }, + { + "epoch": 0.14984755472306915, + "grad_norm": 2.59375, + "learning_rate": 8.525129554988896e-06, + "loss": 0.7413, + "step": 5480 + }, + { + "epoch": 0.15012099916599445, + "grad_norm": 2.640625, + "learning_rate": 8.522387650462012e-06, + "loss": 0.7156, + "step": 5490 + }, + { + "epoch": 0.15039444360891976, + "grad_norm": 2.65625, + "learning_rate": 8.519645745935127e-06, + "loss": 0.694, + "step": 5500 + }, + { + "epoch": 0.15066788805184506, + "grad_norm": 3.125, + "learning_rate": 8.516903841408243e-06, + "loss": 0.7845, + "step": 5510 + }, + { + "epoch": 0.15094133249477037, + "grad_norm": 2.328125, + "learning_rate": 8.51416193688136e-06, + "loss": 0.6495, + "step": 5520 + }, + { + "epoch": 0.1512147769376957, + "grad_norm": 3.015625, + "learning_rate": 8.511420032354474e-06, + "loss": 0.7663, + "step": 5530 + }, + { + "epoch": 0.15148822138062099, + "grad_norm": 2.625, + "learning_rate": 8.50867812782759e-06, + "loss": 0.6831, + "step": 5540 + }, + { + "epoch": 0.1517616658235463, + "grad_norm": 2.21875, + "learning_rate": 8.505936223300705e-06, + "loss": 0.6493, + "step": 5550 + }, + { + "epoch": 0.1520351102664716, + "grad_norm": 2.578125, + "learning_rate": 8.503194318773821e-06, + "loss": 0.6528, + "step": 5560 + }, + { + "epoch": 0.15230855470939691, + "grad_norm": 3.234375, + "learning_rate": 8.500452414246936e-06, + "loss": 0.7426, + "step": 5570 + }, + { + "epoch": 0.15258199915232223, + "grad_norm": 2.765625, + "learning_rate": 8.497710509720052e-06, + "loss": 0.715, + "step": 5580 + }, + { + "epoch": 0.15285544359524753, + "grad_norm": 2.828125, + "learning_rate": 8.494968605193169e-06, + "loss": 0.712, + "step": 5590 + }, + { + "epoch": 0.15312888803817284, + "grad_norm": 2.578125, + "learning_rate": 8.492226700666283e-06, + "loss": 0.6856, + "step": 5600 + }, + { + "epoch": 0.15340233248109816, + "grad_norm": 2.453125, + "learning_rate": 8.4894847961394e-06, + "loss": 0.6684, + "step": 5610 + }, + { + "epoch": 0.15367577692402346, + "grad_norm": 2.328125, + "learning_rate": 8.486742891612514e-06, + "loss": 0.7006, + "step": 5620 + }, + { + "epoch": 0.15394922136694877, + "grad_norm": 2.6875, + "learning_rate": 8.48400098708563e-06, + "loss": 0.6484, + "step": 5630 + }, + { + "epoch": 0.15422266580987407, + "grad_norm": 2.703125, + "learning_rate": 8.481259082558745e-06, + "loss": 0.6751, + "step": 5640 + }, + { + "epoch": 0.15449611025279938, + "grad_norm": 2.21875, + "learning_rate": 8.478517178031862e-06, + "loss": 0.6373, + "step": 5650 + }, + { + "epoch": 0.1547695546957247, + "grad_norm": 2.640625, + "learning_rate": 8.475775273504976e-06, + "loss": 0.7138, + "step": 5660 + }, + { + "epoch": 0.15504299913865, + "grad_norm": 2.828125, + "learning_rate": 8.473033368978093e-06, + "loss": 0.7317, + "step": 5670 + }, + { + "epoch": 0.15531644358157531, + "grad_norm": 2.5625, + "learning_rate": 8.470291464451207e-06, + "loss": 0.7045, + "step": 5680 + }, + { + "epoch": 0.15558988802450063, + "grad_norm": 2.53125, + "learning_rate": 8.467549559924324e-06, + "loss": 0.7913, + "step": 5690 + }, + { + "epoch": 0.15586333246742592, + "grad_norm": 2.6875, + "learning_rate": 8.46480765539744e-06, + "loss": 0.726, + "step": 5700 + }, + { + "epoch": 0.15613677691035124, + "grad_norm": 2.203125, + "learning_rate": 8.462065750870555e-06, + "loss": 0.7121, + "step": 5710 + }, + { + "epoch": 0.15641022135327654, + "grad_norm": 2.9375, + "learning_rate": 8.459323846343671e-06, + "loss": 0.7186, + "step": 5720 + }, + { + "epoch": 0.15668366579620185, + "grad_norm": 2.609375, + "learning_rate": 8.456581941816787e-06, + "loss": 0.6472, + "step": 5730 + }, + { + "epoch": 0.15695711023912717, + "grad_norm": 2.703125, + "learning_rate": 8.453840037289902e-06, + "loss": 0.7427, + "step": 5740 + }, + { + "epoch": 0.15723055468205246, + "grad_norm": 2.84375, + "learning_rate": 8.451098132763018e-06, + "loss": 0.6463, + "step": 5750 + }, + { + "epoch": 0.15750399912497778, + "grad_norm": 2.359375, + "learning_rate": 8.448356228236133e-06, + "loss": 0.7166, + "step": 5760 + }, + { + "epoch": 0.1577774435679031, + "grad_norm": 2.578125, + "learning_rate": 8.445614323709249e-06, + "loss": 0.7253, + "step": 5770 + }, + { + "epoch": 0.1580508880108284, + "grad_norm": 2.796875, + "learning_rate": 8.442872419182365e-06, + "loss": 0.7311, + "step": 5780 + }, + { + "epoch": 0.1583243324537537, + "grad_norm": 2.390625, + "learning_rate": 8.44013051465548e-06, + "loss": 0.7085, + "step": 5790 + }, + { + "epoch": 0.158597776896679, + "grad_norm": 2.734375, + "learning_rate": 8.437388610128596e-06, + "loss": 0.6869, + "step": 5800 + }, + { + "epoch": 0.15887122133960432, + "grad_norm": 2.53125, + "learning_rate": 8.434646705601711e-06, + "loss": 0.7748, + "step": 5810 + }, + { + "epoch": 0.15914466578252964, + "grad_norm": 2.390625, + "learning_rate": 8.431904801074827e-06, + "loss": 0.741, + "step": 5820 + }, + { + "epoch": 0.15941811022545493, + "grad_norm": 2.5625, + "learning_rate": 8.429162896547944e-06, + "loss": 0.6964, + "step": 5830 + }, + { + "epoch": 0.15969155466838025, + "grad_norm": 3.0, + "learning_rate": 8.426420992021058e-06, + "loss": 0.7194, + "step": 5840 + }, + { + "epoch": 0.15996499911130557, + "grad_norm": 2.875, + "learning_rate": 8.423679087494175e-06, + "loss": 0.7095, + "step": 5850 + }, + { + "epoch": 0.16023844355423086, + "grad_norm": 2.921875, + "learning_rate": 8.42093718296729e-06, + "loss": 0.7354, + "step": 5860 + }, + { + "epoch": 0.16051188799715618, + "grad_norm": 2.34375, + "learning_rate": 8.418195278440406e-06, + "loss": 0.6978, + "step": 5870 + }, + { + "epoch": 0.16078533244008147, + "grad_norm": 2.625, + "learning_rate": 8.415453373913522e-06, + "loss": 0.6818, + "step": 5880 + }, + { + "epoch": 0.1610587768830068, + "grad_norm": 2.8125, + "learning_rate": 8.412711469386637e-06, + "loss": 0.6714, + "step": 5890 + }, + { + "epoch": 0.1613322213259321, + "grad_norm": 2.46875, + "learning_rate": 8.409969564859753e-06, + "loss": 0.6876, + "step": 5900 + }, + { + "epoch": 0.1616056657688574, + "grad_norm": 2.875, + "learning_rate": 8.40722766033287e-06, + "loss": 0.7288, + "step": 5910 + }, + { + "epoch": 0.16187911021178272, + "grad_norm": 2.9375, + "learning_rate": 8.404485755805984e-06, + "loss": 0.752, + "step": 5920 + }, + { + "epoch": 0.16215255465470804, + "grad_norm": 2.859375, + "learning_rate": 8.4017438512791e-06, + "loss": 0.6885, + "step": 5930 + }, + { + "epoch": 0.16242599909763333, + "grad_norm": 2.21875, + "learning_rate": 8.399001946752215e-06, + "loss": 0.6508, + "step": 5940 + }, + { + "epoch": 0.16269944354055865, + "grad_norm": 2.3125, + "learning_rate": 8.396260042225331e-06, + "loss": 0.7862, + "step": 5950 + }, + { + "epoch": 0.16297288798348394, + "grad_norm": 2.46875, + "learning_rate": 8.393518137698446e-06, + "loss": 0.7009, + "step": 5960 + }, + { + "epoch": 0.16324633242640926, + "grad_norm": 2.703125, + "learning_rate": 8.390776233171562e-06, + "loss": 0.7271, + "step": 5970 + }, + { + "epoch": 0.16351977686933458, + "grad_norm": 2.625, + "learning_rate": 8.388034328644677e-06, + "loss": 0.6894, + "step": 5980 + }, + { + "epoch": 0.16379322131225987, + "grad_norm": 2.921875, + "learning_rate": 8.385292424117793e-06, + "loss": 0.7306, + "step": 5990 + }, + { + "epoch": 0.1640666657551852, + "grad_norm": 2.875, + "learning_rate": 8.382550519590908e-06, + "loss": 0.8003, + "step": 6000 + }, + { + "epoch": 0.1643401101981105, + "grad_norm": 2.953125, + "learning_rate": 8.379808615064024e-06, + "loss": 0.7013, + "step": 6010 + }, + { + "epoch": 0.1646135546410358, + "grad_norm": 2.5, + "learning_rate": 8.377066710537139e-06, + "loss": 0.6948, + "step": 6020 + }, + { + "epoch": 0.16488699908396112, + "grad_norm": 2.1875, + "learning_rate": 8.374324806010255e-06, + "loss": 0.6641, + "step": 6030 + }, + { + "epoch": 0.1651604435268864, + "grad_norm": 2.875, + "learning_rate": 8.37158290148337e-06, + "loss": 0.73, + "step": 6040 + }, + { + "epoch": 0.16543388796981173, + "grad_norm": 2.421875, + "learning_rate": 8.368840996956486e-06, + "loss": 0.6809, + "step": 6050 + }, + { + "epoch": 0.16570733241273705, + "grad_norm": 2.828125, + "learning_rate": 8.366099092429602e-06, + "loss": 0.6309, + "step": 6060 + }, + { + "epoch": 0.16598077685566234, + "grad_norm": 2.65625, + "learning_rate": 8.363357187902717e-06, + "loss": 0.6947, + "step": 6070 + }, + { + "epoch": 0.16625422129858766, + "grad_norm": 2.6875, + "learning_rate": 8.360615283375833e-06, + "loss": 0.648, + "step": 6080 + }, + { + "epoch": 0.16652766574151298, + "grad_norm": 2.390625, + "learning_rate": 8.35787337884895e-06, + "loss": 0.7386, + "step": 6090 + }, + { + "epoch": 0.16680111018443827, + "grad_norm": 2.375, + "learning_rate": 8.355131474322064e-06, + "loss": 0.7219, + "step": 6100 + }, + { + "epoch": 0.1670745546273636, + "grad_norm": 2.890625, + "learning_rate": 8.35238956979518e-06, + "loss": 0.7803, + "step": 6110 + }, + { + "epoch": 0.16734799907028888, + "grad_norm": 2.71875, + "learning_rate": 8.349647665268295e-06, + "loss": 0.7205, + "step": 6120 + }, + { + "epoch": 0.1676214435132142, + "grad_norm": 2.40625, + "learning_rate": 8.346905760741412e-06, + "loss": 0.6415, + "step": 6130 + }, + { + "epoch": 0.16789488795613952, + "grad_norm": 2.765625, + "learning_rate": 8.344163856214528e-06, + "loss": 0.692, + "step": 6140 + }, + { + "epoch": 0.1681683323990648, + "grad_norm": 2.40625, + "learning_rate": 8.341421951687643e-06, + "loss": 0.6347, + "step": 6150 + }, + { + "epoch": 0.16844177684199013, + "grad_norm": 2.625, + "learning_rate": 8.338680047160759e-06, + "loss": 0.7345, + "step": 6160 + }, + { + "epoch": 0.16871522128491545, + "grad_norm": 2.6875, + "learning_rate": 8.335938142633873e-06, + "loss": 0.6842, + "step": 6170 + }, + { + "epoch": 0.16898866572784074, + "grad_norm": 2.65625, + "learning_rate": 8.33319623810699e-06, + "loss": 0.66, + "step": 6180 + }, + { + "epoch": 0.16926211017076606, + "grad_norm": 2.640625, + "learning_rate": 8.330454333580106e-06, + "loss": 0.717, + "step": 6190 + }, + { + "epoch": 0.16953555461369135, + "grad_norm": 2.71875, + "learning_rate": 8.32771242905322e-06, + "loss": 0.7247, + "step": 6200 + }, + { + "epoch": 0.16980899905661667, + "grad_norm": 2.78125, + "learning_rate": 8.324970524526337e-06, + "loss": 0.7192, + "step": 6210 + }, + { + "epoch": 0.170082443499542, + "grad_norm": 2.734375, + "learning_rate": 8.322228619999452e-06, + "loss": 0.6518, + "step": 6220 + }, + { + "epoch": 0.17035588794246728, + "grad_norm": 2.890625, + "learning_rate": 8.319486715472568e-06, + "loss": 0.7406, + "step": 6230 + }, + { + "epoch": 0.1706293323853926, + "grad_norm": 2.5625, + "learning_rate": 8.316744810945684e-06, + "loss": 0.7224, + "step": 6240 + }, + { + "epoch": 0.1709027768283179, + "grad_norm": 2.734375, + "learning_rate": 8.314002906418799e-06, + "loss": 0.8149, + "step": 6250 + }, + { + "epoch": 0.1711762212712432, + "grad_norm": 2.515625, + "learning_rate": 8.311261001891915e-06, + "loss": 0.7296, + "step": 6260 + }, + { + "epoch": 0.17144966571416853, + "grad_norm": 2.46875, + "learning_rate": 8.308519097365032e-06, + "loss": 0.7115, + "step": 6270 + }, + { + "epoch": 0.17172311015709382, + "grad_norm": 2.9375, + "learning_rate": 8.305777192838146e-06, + "loss": 0.7331, + "step": 6280 + }, + { + "epoch": 0.17199655460001914, + "grad_norm": 2.40625, + "learning_rate": 8.303035288311263e-06, + "loss": 0.6872, + "step": 6290 + }, + { + "epoch": 0.17226999904294446, + "grad_norm": 2.484375, + "learning_rate": 8.300293383784377e-06, + "loss": 0.6936, + "step": 6300 + }, + { + "epoch": 0.17254344348586975, + "grad_norm": 2.296875, + "learning_rate": 8.297551479257494e-06, + "loss": 0.6532, + "step": 6310 + }, + { + "epoch": 0.17281688792879507, + "grad_norm": 2.90625, + "learning_rate": 8.294809574730608e-06, + "loss": 0.764, + "step": 6320 + }, + { + "epoch": 0.17309033237172036, + "grad_norm": 2.5625, + "learning_rate": 8.292067670203725e-06, + "loss": 0.6219, + "step": 6330 + }, + { + "epoch": 0.17336377681464568, + "grad_norm": 2.75, + "learning_rate": 8.28932576567684e-06, + "loss": 0.7028, + "step": 6340 + }, + { + "epoch": 0.173637221257571, + "grad_norm": 2.40625, + "learning_rate": 8.286583861149956e-06, + "loss": 0.6943, + "step": 6350 + }, + { + "epoch": 0.1739106657004963, + "grad_norm": 2.625, + "learning_rate": 8.28384195662307e-06, + "loss": 0.6714, + "step": 6360 + }, + { + "epoch": 0.1741841101434216, + "grad_norm": 2.53125, + "learning_rate": 8.281100052096187e-06, + "loss": 0.7102, + "step": 6370 + }, + { + "epoch": 0.17445755458634693, + "grad_norm": 2.859375, + "learning_rate": 8.278358147569303e-06, + "loss": 0.6785, + "step": 6380 + }, + { + "epoch": 0.17473099902927222, + "grad_norm": 2.6875, + "learning_rate": 8.275616243042417e-06, + "loss": 0.7144, + "step": 6390 + }, + { + "epoch": 0.17500444347219754, + "grad_norm": 2.484375, + "learning_rate": 8.272874338515534e-06, + "loss": 0.7297, + "step": 6400 + }, + { + "epoch": 0.17527788791512283, + "grad_norm": 2.265625, + "learning_rate": 8.270132433988648e-06, + "loss": 0.6153, + "step": 6410 + }, + { + "epoch": 0.17555133235804815, + "grad_norm": 2.796875, + "learning_rate": 8.267390529461765e-06, + "loss": 0.7173, + "step": 6420 + }, + { + "epoch": 0.17582477680097347, + "grad_norm": 2.6875, + "learning_rate": 8.26464862493488e-06, + "loss": 0.6497, + "step": 6430 + }, + { + "epoch": 0.17609822124389876, + "grad_norm": 2.75, + "learning_rate": 8.261906720407996e-06, + "loss": 0.7468, + "step": 6440 + }, + { + "epoch": 0.17637166568682408, + "grad_norm": 3.0, + "learning_rate": 8.259164815881112e-06, + "loss": 0.7303, + "step": 6450 + }, + { + "epoch": 0.1766451101297494, + "grad_norm": 3.515625, + "learning_rate": 8.256422911354227e-06, + "loss": 0.674, + "step": 6460 + }, + { + "epoch": 0.1769185545726747, + "grad_norm": 2.53125, + "learning_rate": 8.253681006827343e-06, + "loss": 0.7902, + "step": 6470 + }, + { + "epoch": 0.1771919990156, + "grad_norm": 2.75, + "learning_rate": 8.250939102300458e-06, + "loss": 0.7388, + "step": 6480 + }, + { + "epoch": 0.1774654434585253, + "grad_norm": 3.03125, + "learning_rate": 8.248197197773574e-06, + "loss": 0.7135, + "step": 6490 + }, + { + "epoch": 0.17773888790145062, + "grad_norm": 2.390625, + "learning_rate": 8.24545529324669e-06, + "loss": 0.678, + "step": 6500 + }, + { + "epoch": 0.17801233234437594, + "grad_norm": 2.890625, + "learning_rate": 8.242713388719805e-06, + "loss": 0.6821, + "step": 6510 + }, + { + "epoch": 0.17828577678730123, + "grad_norm": 2.703125, + "learning_rate": 8.239971484192921e-06, + "loss": 0.618, + "step": 6520 + }, + { + "epoch": 0.17855922123022655, + "grad_norm": 2.125, + "learning_rate": 8.237229579666036e-06, + "loss": 0.6572, + "step": 6530 + }, + { + "epoch": 0.17883266567315187, + "grad_norm": 2.703125, + "learning_rate": 8.234487675139152e-06, + "loss": 0.7001, + "step": 6540 + }, + { + "epoch": 0.17910611011607716, + "grad_norm": 2.875, + "learning_rate": 8.231745770612269e-06, + "loss": 0.6963, + "step": 6550 + }, + { + "epoch": 0.17937955455900248, + "grad_norm": 2.546875, + "learning_rate": 8.229003866085383e-06, + "loss": 0.6611, + "step": 6560 + }, + { + "epoch": 0.17965299900192777, + "grad_norm": 2.78125, + "learning_rate": 8.2262619615585e-06, + "loss": 0.7419, + "step": 6570 + }, + { + "epoch": 0.1799264434448531, + "grad_norm": 3.0, + "learning_rate": 8.223520057031616e-06, + "loss": 0.7737, + "step": 6580 + }, + { + "epoch": 0.1801998878877784, + "grad_norm": 2.5625, + "learning_rate": 8.22077815250473e-06, + "loss": 0.6804, + "step": 6590 + }, + { + "epoch": 0.1804733323307037, + "grad_norm": 2.734375, + "learning_rate": 8.218036247977847e-06, + "loss": 0.6785, + "step": 6600 + }, + { + "epoch": 0.18074677677362902, + "grad_norm": 2.625, + "learning_rate": 8.215294343450961e-06, + "loss": 0.717, + "step": 6610 + }, + { + "epoch": 0.18102022121655434, + "grad_norm": 2.71875, + "learning_rate": 8.212552438924078e-06, + "loss": 0.7222, + "step": 6620 + }, + { + "epoch": 0.18129366565947963, + "grad_norm": 2.96875, + "learning_rate": 8.209810534397194e-06, + "loss": 0.6882, + "step": 6630 + }, + { + "epoch": 0.18156711010240495, + "grad_norm": 2.546875, + "learning_rate": 8.207068629870309e-06, + "loss": 0.6836, + "step": 6640 + }, + { + "epoch": 0.18184055454533024, + "grad_norm": 2.71875, + "learning_rate": 8.204326725343425e-06, + "loss": 0.7369, + "step": 6650 + }, + { + "epoch": 0.18211399898825556, + "grad_norm": 2.3125, + "learning_rate": 8.20158482081654e-06, + "loss": 0.7083, + "step": 6660 + }, + { + "epoch": 0.18238744343118088, + "grad_norm": 2.5625, + "learning_rate": 8.198842916289656e-06, + "loss": 0.75, + "step": 6670 + }, + { + "epoch": 0.18266088787410617, + "grad_norm": 2.65625, + "learning_rate": 8.19610101176277e-06, + "loss": 0.7765, + "step": 6680 + }, + { + "epoch": 0.1829343323170315, + "grad_norm": 2.859375, + "learning_rate": 8.193359107235887e-06, + "loss": 0.7226, + "step": 6690 + }, + { + "epoch": 0.1832077767599568, + "grad_norm": 3.640625, + "learning_rate": 8.190617202709003e-06, + "loss": 0.6855, + "step": 6700 + }, + { + "epoch": 0.1834812212028821, + "grad_norm": 2.796875, + "learning_rate": 8.187875298182118e-06, + "loss": 0.663, + "step": 6710 + }, + { + "epoch": 0.18375466564580742, + "grad_norm": 2.984375, + "learning_rate": 8.185133393655234e-06, + "loss": 0.7216, + "step": 6720 + }, + { + "epoch": 0.1840281100887327, + "grad_norm": 2.78125, + "learning_rate": 8.182391489128349e-06, + "loss": 0.6865, + "step": 6730 + }, + { + "epoch": 0.18430155453165803, + "grad_norm": 2.625, + "learning_rate": 8.179649584601465e-06, + "loss": 0.7234, + "step": 6740 + }, + { + "epoch": 0.18457499897458335, + "grad_norm": 2.828125, + "learning_rate": 8.17690768007458e-06, + "loss": 0.6552, + "step": 6750 + }, + { + "epoch": 0.18484844341750864, + "grad_norm": 2.734375, + "learning_rate": 8.174165775547696e-06, + "loss": 0.6976, + "step": 6760 + }, + { + "epoch": 0.18512188786043396, + "grad_norm": 2.953125, + "learning_rate": 8.171423871020811e-06, + "loss": 0.6946, + "step": 6770 + }, + { + "epoch": 0.18539533230335928, + "grad_norm": 2.75, + "learning_rate": 8.168681966493927e-06, + "loss": 0.7791, + "step": 6780 + }, + { + "epoch": 0.18566877674628457, + "grad_norm": 2.578125, + "learning_rate": 8.165940061967042e-06, + "loss": 0.707, + "step": 6790 + }, + { + "epoch": 0.1859422211892099, + "grad_norm": 2.65625, + "learning_rate": 8.163198157440158e-06, + "loss": 0.776, + "step": 6800 + }, + { + "epoch": 0.18621566563213518, + "grad_norm": 3.09375, + "learning_rate": 8.160456252913274e-06, + "loss": 0.6908, + "step": 6810 + }, + { + "epoch": 0.1864891100750605, + "grad_norm": 2.3125, + "learning_rate": 8.157714348386389e-06, + "loss": 0.6502, + "step": 6820 + }, + { + "epoch": 0.18676255451798582, + "grad_norm": 2.359375, + "learning_rate": 8.154972443859505e-06, + "loss": 0.6662, + "step": 6830 + }, + { + "epoch": 0.1870359989609111, + "grad_norm": 2.25, + "learning_rate": 8.15223053933262e-06, + "loss": 0.6613, + "step": 6840 + }, + { + "epoch": 0.18730944340383643, + "grad_norm": 2.609375, + "learning_rate": 8.149488634805736e-06, + "loss": 0.679, + "step": 6850 + }, + { + "epoch": 0.18758288784676175, + "grad_norm": 3.359375, + "learning_rate": 8.146746730278853e-06, + "loss": 0.7829, + "step": 6860 + }, + { + "epoch": 0.18785633228968704, + "grad_norm": 2.859375, + "learning_rate": 8.144004825751967e-06, + "loss": 0.7227, + "step": 6870 + }, + { + "epoch": 0.18812977673261236, + "grad_norm": 2.59375, + "learning_rate": 8.141262921225084e-06, + "loss": 0.7565, + "step": 6880 + }, + { + "epoch": 0.18840322117553765, + "grad_norm": 2.65625, + "learning_rate": 8.138521016698198e-06, + "loss": 0.6529, + "step": 6890 + }, + { + "epoch": 0.18867666561846297, + "grad_norm": 2.734375, + "learning_rate": 8.135779112171315e-06, + "loss": 0.7092, + "step": 6900 + }, + { + "epoch": 0.1889501100613883, + "grad_norm": 2.734375, + "learning_rate": 8.133037207644431e-06, + "loss": 0.6895, + "step": 6910 + }, + { + "epoch": 0.18922355450431358, + "grad_norm": 2.703125, + "learning_rate": 8.130295303117546e-06, + "loss": 0.7286, + "step": 6920 + }, + { + "epoch": 0.1894969989472389, + "grad_norm": 2.671875, + "learning_rate": 8.127553398590662e-06, + "loss": 0.6628, + "step": 6930 + }, + { + "epoch": 0.1897704433901642, + "grad_norm": 2.796875, + "learning_rate": 8.124811494063778e-06, + "loss": 0.6949, + "step": 6940 + }, + { + "epoch": 0.1900438878330895, + "grad_norm": 2.25, + "learning_rate": 8.122069589536893e-06, + "loss": 0.6493, + "step": 6950 + }, + { + "epoch": 0.19031733227601483, + "grad_norm": 2.953125, + "learning_rate": 8.11932768501001e-06, + "loss": 0.7083, + "step": 6960 + }, + { + "epoch": 0.19059077671894012, + "grad_norm": 2.75, + "learning_rate": 8.116585780483124e-06, + "loss": 0.6602, + "step": 6970 + }, + { + "epoch": 0.19086422116186544, + "grad_norm": 2.234375, + "learning_rate": 8.11384387595624e-06, + "loss": 0.68, + "step": 6980 + }, + { + "epoch": 0.19113766560479076, + "grad_norm": 2.6875, + "learning_rate": 8.111101971429357e-06, + "loss": 0.8084, + "step": 6990 + }, + { + "epoch": 0.19141111004771605, + "grad_norm": 2.875, + "learning_rate": 8.108360066902471e-06, + "loss": 0.7732, + "step": 7000 + }, + { + "epoch": 0.19168455449064137, + "grad_norm": 3.015625, + "learning_rate": 8.105618162375588e-06, + "loss": 0.6623, + "step": 7010 + }, + { + "epoch": 0.19195799893356666, + "grad_norm": 2.890625, + "learning_rate": 8.102876257848702e-06, + "loss": 0.7723, + "step": 7020 + }, + { + "epoch": 0.19223144337649198, + "grad_norm": 2.765625, + "learning_rate": 8.100134353321818e-06, + "loss": 0.6913, + "step": 7030 + }, + { + "epoch": 0.1925048878194173, + "grad_norm": 2.96875, + "learning_rate": 8.097392448794935e-06, + "loss": 0.7974, + "step": 7040 + }, + { + "epoch": 0.1927783322623426, + "grad_norm": 2.828125, + "learning_rate": 8.09465054426805e-06, + "loss": 0.7618, + "step": 7050 + }, + { + "epoch": 0.1930517767052679, + "grad_norm": 2.6875, + "learning_rate": 8.091908639741166e-06, + "loss": 0.7155, + "step": 7060 + }, + { + "epoch": 0.19332522114819323, + "grad_norm": 2.640625, + "learning_rate": 8.08916673521428e-06, + "loss": 0.7114, + "step": 7070 + }, + { + "epoch": 0.19359866559111852, + "grad_norm": 2.640625, + "learning_rate": 8.086424830687397e-06, + "loss": 0.7111, + "step": 7080 + }, + { + "epoch": 0.19387211003404384, + "grad_norm": 3.125, + "learning_rate": 8.083682926160511e-06, + "loss": 0.6835, + "step": 7090 + }, + { + "epoch": 0.19414555447696913, + "grad_norm": 2.5625, + "learning_rate": 8.080941021633628e-06, + "loss": 0.7392, + "step": 7100 + }, + { + "epoch": 0.19441899891989445, + "grad_norm": 2.546875, + "learning_rate": 8.078199117106742e-06, + "loss": 0.7701, + "step": 7110 + }, + { + "epoch": 0.19469244336281977, + "grad_norm": 2.609375, + "learning_rate": 8.075457212579859e-06, + "loss": 0.6767, + "step": 7120 + }, + { + "epoch": 0.19496588780574506, + "grad_norm": 3.1875, + "learning_rate": 8.072715308052973e-06, + "loss": 0.7154, + "step": 7130 + }, + { + "epoch": 0.19523933224867038, + "grad_norm": 2.875, + "learning_rate": 8.06997340352609e-06, + "loss": 0.6923, + "step": 7140 + }, + { + "epoch": 0.1955127766915957, + "grad_norm": 2.4375, + "learning_rate": 8.067231498999204e-06, + "loss": 0.7875, + "step": 7150 + }, + { + "epoch": 0.195786221134521, + "grad_norm": 2.671875, + "learning_rate": 8.06448959447232e-06, + "loss": 0.7205, + "step": 7160 + }, + { + "epoch": 0.1960596655774463, + "grad_norm": 2.734375, + "learning_rate": 8.061747689945437e-06, + "loss": 0.6838, + "step": 7170 + }, + { + "epoch": 0.1963331100203716, + "grad_norm": 2.828125, + "learning_rate": 8.059005785418552e-06, + "loss": 0.7421, + "step": 7180 + }, + { + "epoch": 0.19660655446329692, + "grad_norm": 3.03125, + "learning_rate": 8.056263880891668e-06, + "loss": 0.7385, + "step": 7190 + }, + { + "epoch": 0.19687999890622224, + "grad_norm": 2.96875, + "learning_rate": 8.053521976364783e-06, + "loss": 0.8019, + "step": 7200 + }, + { + "epoch": 0.19715344334914753, + "grad_norm": 2.546875, + "learning_rate": 8.050780071837899e-06, + "loss": 0.6608, + "step": 7210 + }, + { + "epoch": 0.19742688779207285, + "grad_norm": 2.46875, + "learning_rate": 8.048038167311015e-06, + "loss": 0.6893, + "step": 7220 + }, + { + "epoch": 0.19770033223499817, + "grad_norm": 2.796875, + "learning_rate": 8.04529626278413e-06, + "loss": 0.6946, + "step": 7230 + }, + { + "epoch": 0.19797377667792346, + "grad_norm": 2.484375, + "learning_rate": 8.042554358257246e-06, + "loss": 0.6856, + "step": 7240 + }, + { + "epoch": 0.19824722112084878, + "grad_norm": 2.953125, + "learning_rate": 8.03981245373036e-06, + "loss": 0.7387, + "step": 7250 + }, + { + "epoch": 0.19852066556377407, + "grad_norm": 2.703125, + "learning_rate": 8.037070549203477e-06, + "loss": 0.7115, + "step": 7260 + }, + { + "epoch": 0.1987941100066994, + "grad_norm": 2.921875, + "learning_rate": 8.034328644676593e-06, + "loss": 0.7336, + "step": 7270 + }, + { + "epoch": 0.1990675544496247, + "grad_norm": 2.5625, + "learning_rate": 8.031586740149708e-06, + "loss": 0.6799, + "step": 7280 + }, + { + "epoch": 0.19934099889255, + "grad_norm": 2.78125, + "learning_rate": 8.028844835622824e-06, + "loss": 0.6311, + "step": 7290 + }, + { + "epoch": 0.19961444333547532, + "grad_norm": 2.65625, + "learning_rate": 8.02610293109594e-06, + "loss": 0.7292, + "step": 7300 + }, + { + "epoch": 0.19988788777840064, + "grad_norm": 3.046875, + "learning_rate": 8.023361026569055e-06, + "loss": 0.677, + "step": 7310 + }, + { + "epoch": 0.20016133222132593, + "grad_norm": 2.5, + "learning_rate": 8.020619122042172e-06, + "loss": 0.7558, + "step": 7320 + }, + { + "epoch": 0.20043477666425125, + "grad_norm": 2.6875, + "learning_rate": 8.017877217515286e-06, + "loss": 0.6776, + "step": 7330 + }, + { + "epoch": 0.20070822110717654, + "grad_norm": 2.8125, + "learning_rate": 8.015135312988403e-06, + "loss": 0.7339, + "step": 7340 + }, + { + "epoch": 0.20098166555010186, + "grad_norm": 2.703125, + "learning_rate": 8.012393408461519e-06, + "loss": 0.7231, + "step": 7350 + }, + { + "epoch": 0.20125510999302718, + "grad_norm": 2.484375, + "learning_rate": 8.009651503934634e-06, + "loss": 0.7257, + "step": 7360 + }, + { + "epoch": 0.20152855443595247, + "grad_norm": 2.421875, + "learning_rate": 8.00690959940775e-06, + "loss": 0.6538, + "step": 7370 + }, + { + "epoch": 0.2018019988788778, + "grad_norm": 2.546875, + "learning_rate": 8.004167694880865e-06, + "loss": 0.6581, + "step": 7380 + }, + { + "epoch": 0.2020754433218031, + "grad_norm": 2.921875, + "learning_rate": 8.001425790353981e-06, + "loss": 0.7096, + "step": 7390 + }, + { + "epoch": 0.2023488877647284, + "grad_norm": 2.421875, + "learning_rate": 7.998683885827097e-06, + "loss": 0.6994, + "step": 7400 + }, + { + "epoch": 0.20262233220765372, + "grad_norm": 2.8125, + "learning_rate": 7.995941981300212e-06, + "loss": 0.7282, + "step": 7410 + }, + { + "epoch": 0.202895776650579, + "grad_norm": 2.859375, + "learning_rate": 7.993200076773328e-06, + "loss": 0.767, + "step": 7420 + }, + { + "epoch": 0.20316922109350433, + "grad_norm": 2.1875, + "learning_rate": 7.990458172246443e-06, + "loss": 0.6625, + "step": 7430 + }, + { + "epoch": 0.20344266553642965, + "grad_norm": 2.75, + "learning_rate": 7.98771626771956e-06, + "loss": 0.6955, + "step": 7440 + }, + { + "epoch": 0.20371610997935494, + "grad_norm": 2.75, + "learning_rate": 7.984974363192674e-06, + "loss": 0.6762, + "step": 7450 + }, + { + "epoch": 0.20398955442228026, + "grad_norm": 2.65625, + "learning_rate": 7.98223245866579e-06, + "loss": 0.6661, + "step": 7460 + }, + { + "epoch": 0.20426299886520558, + "grad_norm": 3.03125, + "learning_rate": 7.979490554138905e-06, + "loss": 0.7025, + "step": 7470 + }, + { + "epoch": 0.20453644330813087, + "grad_norm": 2.859375, + "learning_rate": 7.976748649612021e-06, + "loss": 0.6563, + "step": 7480 + }, + { + "epoch": 0.20480988775105619, + "grad_norm": 2.53125, + "learning_rate": 7.974006745085137e-06, + "loss": 0.8056, + "step": 7490 + }, + { + "epoch": 0.20508333219398148, + "grad_norm": 2.828125, + "learning_rate": 7.971264840558252e-06, + "loss": 0.7226, + "step": 7500 + }, + { + "epoch": 0.2053567766369068, + "grad_norm": 2.21875, + "learning_rate": 7.968522936031368e-06, + "loss": 0.6216, + "step": 7510 + }, + { + "epoch": 0.20563022107983212, + "grad_norm": 2.734375, + "learning_rate": 7.965781031504483e-06, + "loss": 0.7541, + "step": 7520 + }, + { + "epoch": 0.2059036655227574, + "grad_norm": 2.6875, + "learning_rate": 7.9630391269776e-06, + "loss": 0.7987, + "step": 7530 + }, + { + "epoch": 0.20617710996568273, + "grad_norm": 2.828125, + "learning_rate": 7.960297222450714e-06, + "loss": 0.7113, + "step": 7540 + }, + { + "epoch": 0.20645055440860804, + "grad_norm": 2.328125, + "learning_rate": 7.95755531792383e-06, + "loss": 0.6564, + "step": 7550 + }, + { + "epoch": 0.20672399885153334, + "grad_norm": 2.5625, + "learning_rate": 7.954813413396945e-06, + "loss": 0.6892, + "step": 7560 + }, + { + "epoch": 0.20699744329445866, + "grad_norm": 2.953125, + "learning_rate": 7.952071508870061e-06, + "loss": 0.7113, + "step": 7570 + }, + { + "epoch": 0.20727088773738395, + "grad_norm": 2.578125, + "learning_rate": 7.949329604343178e-06, + "loss": 0.6635, + "step": 7580 + }, + { + "epoch": 0.20754433218030927, + "grad_norm": 3.21875, + "learning_rate": 7.946587699816292e-06, + "loss": 0.7383, + "step": 7590 + }, + { + "epoch": 0.20781777662323458, + "grad_norm": 2.515625, + "learning_rate": 7.943845795289409e-06, + "loss": 0.6727, + "step": 7600 + }, + { + "epoch": 0.20809122106615988, + "grad_norm": 2.5625, + "learning_rate": 7.941103890762523e-06, + "loss": 0.7019, + "step": 7610 + }, + { + "epoch": 0.2083646655090852, + "grad_norm": 2.921875, + "learning_rate": 7.93836198623564e-06, + "loss": 0.6564, + "step": 7620 + }, + { + "epoch": 0.2086381099520105, + "grad_norm": 2.5625, + "learning_rate": 7.935620081708756e-06, + "loss": 0.7474, + "step": 7630 + }, + { + "epoch": 0.2089115543949358, + "grad_norm": 2.734375, + "learning_rate": 7.93287817718187e-06, + "loss": 0.6763, + "step": 7640 + }, + { + "epoch": 0.20918499883786112, + "grad_norm": 3.109375, + "learning_rate": 7.930136272654987e-06, + "loss": 0.7594, + "step": 7650 + }, + { + "epoch": 0.20945844328078642, + "grad_norm": 2.84375, + "learning_rate": 7.927394368128103e-06, + "loss": 0.7537, + "step": 7660 + }, + { + "epoch": 0.20973188772371174, + "grad_norm": 3.140625, + "learning_rate": 7.924652463601218e-06, + "loss": 0.7038, + "step": 7670 + }, + { + "epoch": 0.21000533216663705, + "grad_norm": 2.203125, + "learning_rate": 7.921910559074334e-06, + "loss": 0.6148, + "step": 7680 + }, + { + "epoch": 0.21027877660956235, + "grad_norm": 2.609375, + "learning_rate": 7.919168654547449e-06, + "loss": 0.7059, + "step": 7690 + }, + { + "epoch": 0.21055222105248766, + "grad_norm": 2.609375, + "learning_rate": 7.916426750020565e-06, + "loss": 0.8172, + "step": 7700 + }, + { + "epoch": 0.21082566549541296, + "grad_norm": 2.640625, + "learning_rate": 7.913684845493681e-06, + "loss": 0.6022, + "step": 7710 + }, + { + "epoch": 0.21109910993833828, + "grad_norm": 3.15625, + "learning_rate": 7.910942940966796e-06, + "loss": 0.7819, + "step": 7720 + }, + { + "epoch": 0.2113725543812636, + "grad_norm": 2.5, + "learning_rate": 7.908201036439912e-06, + "loss": 0.6228, + "step": 7730 + }, + { + "epoch": 0.21164599882418889, + "grad_norm": 2.578125, + "learning_rate": 7.905459131913027e-06, + "loss": 0.7004, + "step": 7740 + }, + { + "epoch": 0.2119194432671142, + "grad_norm": 2.375, + "learning_rate": 7.902717227386143e-06, + "loss": 0.7204, + "step": 7750 + }, + { + "epoch": 0.21219288771003952, + "grad_norm": 2.828125, + "learning_rate": 7.89997532285926e-06, + "loss": 0.7008, + "step": 7760 + }, + { + "epoch": 0.21246633215296482, + "grad_norm": 2.8125, + "learning_rate": 7.897233418332374e-06, + "loss": 0.7058, + "step": 7770 + }, + { + "epoch": 0.21273977659589013, + "grad_norm": 2.5, + "learning_rate": 7.89449151380549e-06, + "loss": 0.6728, + "step": 7780 + }, + { + "epoch": 0.21301322103881543, + "grad_norm": 2.359375, + "learning_rate": 7.891749609278605e-06, + "loss": 0.6555, + "step": 7790 + }, + { + "epoch": 0.21328666548174074, + "grad_norm": 3.046875, + "learning_rate": 7.889007704751722e-06, + "loss": 0.6773, + "step": 7800 + }, + { + "epoch": 0.21356010992466606, + "grad_norm": 2.703125, + "learning_rate": 7.886265800224838e-06, + "loss": 0.7249, + "step": 7810 + }, + { + "epoch": 0.21383355436759136, + "grad_norm": 2.84375, + "learning_rate": 7.883523895697953e-06, + "loss": 0.683, + "step": 7820 + }, + { + "epoch": 0.21410699881051667, + "grad_norm": 2.828125, + "learning_rate": 7.880781991171069e-06, + "loss": 0.7468, + "step": 7830 + }, + { + "epoch": 0.214380443253442, + "grad_norm": 2.4375, + "learning_rate": 7.878040086644184e-06, + "loss": 0.671, + "step": 7840 + }, + { + "epoch": 0.21465388769636728, + "grad_norm": 2.859375, + "learning_rate": 7.8752981821173e-06, + "loss": 0.6613, + "step": 7850 + }, + { + "epoch": 0.2149273321392926, + "grad_norm": 2.984375, + "learning_rate": 7.872556277590414e-06, + "loss": 0.7463, + "step": 7860 + }, + { + "epoch": 0.2152007765822179, + "grad_norm": 2.5625, + "learning_rate": 7.86981437306353e-06, + "loss": 0.6532, + "step": 7870 + }, + { + "epoch": 0.21547422102514321, + "grad_norm": 2.515625, + "learning_rate": 7.867072468536645e-06, + "loss": 0.6985, + "step": 7880 + }, + { + "epoch": 0.21574766546806853, + "grad_norm": 2.578125, + "learning_rate": 7.864330564009762e-06, + "loss": 0.6782, + "step": 7890 + }, + { + "epoch": 0.21602110991099382, + "grad_norm": 2.34375, + "learning_rate": 7.861588659482876e-06, + "loss": 0.7212, + "step": 7900 + }, + { + "epoch": 0.21629455435391914, + "grad_norm": 2.9375, + "learning_rate": 7.858846754955993e-06, + "loss": 0.6803, + "step": 7910 + }, + { + "epoch": 0.21656799879684446, + "grad_norm": 2.609375, + "learning_rate": 7.856104850429107e-06, + "loss": 0.7446, + "step": 7920 + }, + { + "epoch": 0.21684144323976975, + "grad_norm": 2.359375, + "learning_rate": 7.853362945902224e-06, + "loss": 0.6921, + "step": 7930 + }, + { + "epoch": 0.21711488768269507, + "grad_norm": 2.484375, + "learning_rate": 7.85062104137534e-06, + "loss": 0.7634, + "step": 7940 + }, + { + "epoch": 0.21738833212562036, + "grad_norm": 2.65625, + "learning_rate": 7.847879136848455e-06, + "loss": 0.7258, + "step": 7950 + }, + { + "epoch": 0.21766177656854568, + "grad_norm": 2.828125, + "learning_rate": 7.845137232321571e-06, + "loss": 0.7002, + "step": 7960 + }, + { + "epoch": 0.217935221011471, + "grad_norm": 2.40625, + "learning_rate": 7.842395327794687e-06, + "loss": 0.7136, + "step": 7970 + }, + { + "epoch": 0.2182086654543963, + "grad_norm": 2.703125, + "learning_rate": 7.839653423267802e-06, + "loss": 0.6587, + "step": 7980 + }, + { + "epoch": 0.2184821098973216, + "grad_norm": 2.734375, + "learning_rate": 7.836911518740918e-06, + "loss": 0.7183, + "step": 7990 + }, + { + "epoch": 0.21875555434024693, + "grad_norm": 2.59375, + "learning_rate": 7.834169614214033e-06, + "loss": 0.7181, + "step": 8000 + }, + { + "epoch": 0.21902899878317222, + "grad_norm": 2.703125, + "learning_rate": 7.83142770968715e-06, + "loss": 0.7664, + "step": 8010 + }, + { + "epoch": 0.21930244322609754, + "grad_norm": 2.40625, + "learning_rate": 7.828685805160266e-06, + "loss": 0.6806, + "step": 8020 + }, + { + "epoch": 0.21957588766902283, + "grad_norm": 3.140625, + "learning_rate": 7.82594390063338e-06, + "loss": 0.7141, + "step": 8030 + }, + { + "epoch": 0.21984933211194815, + "grad_norm": 3.015625, + "learning_rate": 7.823201996106497e-06, + "loss": 0.7348, + "step": 8040 + }, + { + "epoch": 0.22012277655487347, + "grad_norm": 2.578125, + "learning_rate": 7.820460091579611e-06, + "loss": 0.6858, + "step": 8050 + }, + { + "epoch": 0.22039622099779876, + "grad_norm": 2.328125, + "learning_rate": 7.817718187052728e-06, + "loss": 0.707, + "step": 8060 + }, + { + "epoch": 0.22066966544072408, + "grad_norm": 3.0, + "learning_rate": 7.814976282525844e-06, + "loss": 0.7124, + "step": 8070 + }, + { + "epoch": 0.2209431098836494, + "grad_norm": 2.8125, + "learning_rate": 7.812234377998958e-06, + "loss": 0.6937, + "step": 8080 + }, + { + "epoch": 0.2212165543265747, + "grad_norm": 2.390625, + "learning_rate": 7.809492473472075e-06, + "loss": 0.6625, + "step": 8090 + }, + { + "epoch": 0.2214899987695, + "grad_norm": 2.53125, + "learning_rate": 7.80675056894519e-06, + "loss": 0.6625, + "step": 8100 + }, + { + "epoch": 0.2217634432124253, + "grad_norm": 3.15625, + "learning_rate": 7.804008664418306e-06, + "loss": 0.7438, + "step": 8110 + }, + { + "epoch": 0.22203688765535062, + "grad_norm": 2.875, + "learning_rate": 7.801266759891422e-06, + "loss": 0.7229, + "step": 8120 + }, + { + "epoch": 0.22231033209827594, + "grad_norm": 2.234375, + "learning_rate": 7.798524855364537e-06, + "loss": 0.7284, + "step": 8130 + }, + { + "epoch": 0.22258377654120123, + "grad_norm": 2.5625, + "learning_rate": 7.795782950837653e-06, + "loss": 0.7385, + "step": 8140 + }, + { + "epoch": 0.22285722098412655, + "grad_norm": 2.609375, + "learning_rate": 7.79304104631077e-06, + "loss": 0.6546, + "step": 8150 + }, + { + "epoch": 0.22313066542705187, + "grad_norm": 2.609375, + "learning_rate": 7.790299141783884e-06, + "loss": 0.7147, + "step": 8160 + }, + { + "epoch": 0.22340410986997716, + "grad_norm": 2.71875, + "learning_rate": 7.787557237257e-06, + "loss": 0.678, + "step": 8170 + }, + { + "epoch": 0.22367755431290248, + "grad_norm": 2.71875, + "learning_rate": 7.784815332730115e-06, + "loss": 0.7315, + "step": 8180 + }, + { + "epoch": 0.22395099875582777, + "grad_norm": 2.65625, + "learning_rate": 7.782073428203231e-06, + "loss": 0.7517, + "step": 8190 + }, + { + "epoch": 0.2242244431987531, + "grad_norm": 2.421875, + "learning_rate": 7.779331523676346e-06, + "loss": 0.678, + "step": 8200 + }, + { + "epoch": 0.2244978876416784, + "grad_norm": 2.90625, + "learning_rate": 7.776589619149462e-06, + "loss": 0.69, + "step": 8210 + }, + { + "epoch": 0.2247713320846037, + "grad_norm": 2.921875, + "learning_rate": 7.773847714622577e-06, + "loss": 0.6988, + "step": 8220 + }, + { + "epoch": 0.22504477652752902, + "grad_norm": 2.609375, + "learning_rate": 7.771105810095693e-06, + "loss": 0.7531, + "step": 8230 + }, + { + "epoch": 0.22531822097045434, + "grad_norm": 2.546875, + "learning_rate": 7.768363905568808e-06, + "loss": 0.6979, + "step": 8240 + }, + { + "epoch": 0.22559166541337963, + "grad_norm": 2.59375, + "learning_rate": 7.765622001041924e-06, + "loss": 0.6515, + "step": 8250 + }, + { + "epoch": 0.22586510985630495, + "grad_norm": 2.484375, + "learning_rate": 7.76288009651504e-06, + "loss": 0.5952, + "step": 8260 + }, + { + "epoch": 0.22613855429923024, + "grad_norm": 2.96875, + "learning_rate": 7.760138191988155e-06, + "loss": 0.7456, + "step": 8270 + }, + { + "epoch": 0.22641199874215556, + "grad_norm": 2.46875, + "learning_rate": 7.757396287461272e-06, + "loss": 0.6701, + "step": 8280 + }, + { + "epoch": 0.22668544318508088, + "grad_norm": 2.859375, + "learning_rate": 7.754654382934386e-06, + "loss": 0.7564, + "step": 8290 + }, + { + "epoch": 0.22695888762800617, + "grad_norm": 2.78125, + "learning_rate": 7.751912478407502e-06, + "loss": 0.7033, + "step": 8300 + }, + { + "epoch": 0.2272323320709315, + "grad_norm": 3.21875, + "learning_rate": 7.749170573880617e-06, + "loss": 0.7193, + "step": 8310 + }, + { + "epoch": 0.22750577651385678, + "grad_norm": 2.59375, + "learning_rate": 7.746428669353733e-06, + "loss": 0.7145, + "step": 8320 + }, + { + "epoch": 0.2277792209567821, + "grad_norm": 2.625, + "learning_rate": 7.74368676482685e-06, + "loss": 0.638, + "step": 8330 + }, + { + "epoch": 0.22805266539970742, + "grad_norm": 2.4375, + "learning_rate": 7.740944860299964e-06, + "loss": 0.7182, + "step": 8340 + }, + { + "epoch": 0.2283261098426327, + "grad_norm": 2.5625, + "learning_rate": 7.73820295577308e-06, + "loss": 0.641, + "step": 8350 + }, + { + "epoch": 0.22859955428555803, + "grad_norm": 2.453125, + "learning_rate": 7.735461051246195e-06, + "loss": 0.6775, + "step": 8360 + }, + { + "epoch": 0.22887299872848335, + "grad_norm": 2.828125, + "learning_rate": 7.732719146719312e-06, + "loss": 0.6925, + "step": 8370 + }, + { + "epoch": 0.22914644317140864, + "grad_norm": 2.53125, + "learning_rate": 7.729977242192428e-06, + "loss": 0.6075, + "step": 8380 + }, + { + "epoch": 0.22941988761433396, + "grad_norm": 2.75, + "learning_rate": 7.727235337665543e-06, + "loss": 0.7305, + "step": 8390 + }, + { + "epoch": 0.22969333205725925, + "grad_norm": 2.5, + "learning_rate": 7.724493433138659e-06, + "loss": 0.675, + "step": 8400 + }, + { + "epoch": 0.22996677650018457, + "grad_norm": 5.75, + "learning_rate": 7.721751528611774e-06, + "loss": 0.6105, + "step": 8410 + }, + { + "epoch": 0.2302402209431099, + "grad_norm": 3.640625, + "learning_rate": 7.71900962408489e-06, + "loss": 0.7066, + "step": 8420 + }, + { + "epoch": 0.23051366538603518, + "grad_norm": 2.734375, + "learning_rate": 7.716267719558006e-06, + "loss": 0.6597, + "step": 8430 + }, + { + "epoch": 0.2307871098289605, + "grad_norm": 2.796875, + "learning_rate": 7.713525815031121e-06, + "loss": 0.6767, + "step": 8440 + }, + { + "epoch": 0.23106055427188582, + "grad_norm": 3.234375, + "learning_rate": 7.710783910504237e-06, + "loss": 0.6576, + "step": 8450 + }, + { + "epoch": 0.2313339987148111, + "grad_norm": 2.859375, + "learning_rate": 7.708042005977352e-06, + "loss": 0.7109, + "step": 8460 + }, + { + "epoch": 0.23160744315773643, + "grad_norm": 2.765625, + "learning_rate": 7.705300101450468e-06, + "loss": 0.7459, + "step": 8470 + }, + { + "epoch": 0.23188088760066172, + "grad_norm": 3.46875, + "learning_rate": 7.702558196923585e-06, + "loss": 0.6968, + "step": 8480 + }, + { + "epoch": 0.23215433204358704, + "grad_norm": 2.90625, + "learning_rate": 7.6998162923967e-06, + "loss": 0.6928, + "step": 8490 + }, + { + "epoch": 0.23242777648651236, + "grad_norm": 2.265625, + "learning_rate": 7.697074387869816e-06, + "loss": 0.695, + "step": 8500 + }, + { + "epoch": 0.23270122092943765, + "grad_norm": 3.09375, + "learning_rate": 7.694332483342932e-06, + "loss": 0.7941, + "step": 8510 + }, + { + "epoch": 0.23297466537236297, + "grad_norm": 2.671875, + "learning_rate": 7.691590578816046e-06, + "loss": 0.6616, + "step": 8520 + }, + { + "epoch": 0.2332481098152883, + "grad_norm": 2.859375, + "learning_rate": 7.688848674289163e-06, + "loss": 0.6886, + "step": 8530 + }, + { + "epoch": 0.23352155425821358, + "grad_norm": 2.640625, + "learning_rate": 7.686106769762277e-06, + "loss": 0.7339, + "step": 8540 + }, + { + "epoch": 0.2337949987011389, + "grad_norm": 2.421875, + "learning_rate": 7.683364865235394e-06, + "loss": 0.7151, + "step": 8550 + }, + { + "epoch": 0.2340684431440642, + "grad_norm": 2.421875, + "learning_rate": 7.680622960708508e-06, + "loss": 0.65, + "step": 8560 + }, + { + "epoch": 0.2343418875869895, + "grad_norm": 2.984375, + "learning_rate": 7.677881056181625e-06, + "loss": 0.722, + "step": 8570 + }, + { + "epoch": 0.23461533202991483, + "grad_norm": 2.0625, + "learning_rate": 7.675139151654741e-06, + "loss": 0.7052, + "step": 8580 + }, + { + "epoch": 0.23488877647284012, + "grad_norm": 2.9375, + "learning_rate": 7.672397247127856e-06, + "loss": 0.6782, + "step": 8590 + }, + { + "epoch": 0.23516222091576544, + "grad_norm": 2.6875, + "learning_rate": 7.669655342600972e-06, + "loss": 0.7057, + "step": 8600 + }, + { + "epoch": 0.23543566535869076, + "grad_norm": 2.609375, + "learning_rate": 7.666913438074087e-06, + "loss": 0.7619, + "step": 8610 + }, + { + "epoch": 0.23570910980161605, + "grad_norm": 2.65625, + "learning_rate": 7.664171533547203e-06, + "loss": 0.6647, + "step": 8620 + }, + { + "epoch": 0.23598255424454137, + "grad_norm": 2.671875, + "learning_rate": 7.661429629020318e-06, + "loss": 0.6891, + "step": 8630 + }, + { + "epoch": 0.23625599868746666, + "grad_norm": 2.546875, + "learning_rate": 7.658687724493434e-06, + "loss": 0.7388, + "step": 8640 + }, + { + "epoch": 0.23652944313039198, + "grad_norm": 2.890625, + "learning_rate": 7.655945819966549e-06, + "loss": 0.7183, + "step": 8650 + }, + { + "epoch": 0.2368028875733173, + "grad_norm": 2.640625, + "learning_rate": 7.653203915439665e-06, + "loss": 0.6682, + "step": 8660 + }, + { + "epoch": 0.2370763320162426, + "grad_norm": 3.1875, + "learning_rate": 7.65046201091278e-06, + "loss": 0.7204, + "step": 8670 + }, + { + "epoch": 0.2373497764591679, + "grad_norm": 2.78125, + "learning_rate": 7.647720106385896e-06, + "loss": 0.7241, + "step": 8680 + }, + { + "epoch": 0.23762322090209323, + "grad_norm": 3.078125, + "learning_rate": 7.644978201859012e-06, + "loss": 0.7432, + "step": 8690 + }, + { + "epoch": 0.23789666534501852, + "grad_norm": 2.671875, + "learning_rate": 7.642236297332127e-06, + "loss": 0.8173, + "step": 8700 + }, + { + "epoch": 0.23817010978794384, + "grad_norm": 3.03125, + "learning_rate": 7.639494392805243e-06, + "loss": 0.6875, + "step": 8710 + }, + { + "epoch": 0.23844355423086913, + "grad_norm": 2.921875, + "learning_rate": 7.636752488278358e-06, + "loss": 0.69, + "step": 8720 + }, + { + "epoch": 0.23871699867379445, + "grad_norm": 3.0625, + "learning_rate": 7.634010583751474e-06, + "loss": 0.6373, + "step": 8730 + }, + { + "epoch": 0.23899044311671977, + "grad_norm": 2.953125, + "learning_rate": 7.63126867922459e-06, + "loss": 0.7567, + "step": 8740 + }, + { + "epoch": 0.23926388755964506, + "grad_norm": 2.6875, + "learning_rate": 7.628526774697705e-06, + "loss": 0.6443, + "step": 8750 + }, + { + "epoch": 0.23953733200257038, + "grad_norm": 2.71875, + "learning_rate": 7.625784870170821e-06, + "loss": 0.6936, + "step": 8760 + }, + { + "epoch": 0.2398107764454957, + "grad_norm": 3.1875, + "learning_rate": 7.623042965643936e-06, + "loss": 0.7132, + "step": 8770 + }, + { + "epoch": 0.240084220888421, + "grad_norm": 3.015625, + "learning_rate": 7.620301061117052e-06, + "loss": 0.6631, + "step": 8780 + }, + { + "epoch": 0.2403576653313463, + "grad_norm": 2.453125, + "learning_rate": 7.617559156590169e-06, + "loss": 0.6444, + "step": 8790 + }, + { + "epoch": 0.2406311097742716, + "grad_norm": 2.6875, + "learning_rate": 7.614817252063283e-06, + "loss": 0.7201, + "step": 8800 + }, + { + "epoch": 0.24090455421719692, + "grad_norm": 2.75, + "learning_rate": 7.6120753475364e-06, + "loss": 0.6874, + "step": 8810 + }, + { + "epoch": 0.24117799866012224, + "grad_norm": 2.75, + "learning_rate": 7.609333443009514e-06, + "loss": 0.6698, + "step": 8820 + }, + { + "epoch": 0.24145144310304753, + "grad_norm": 2.234375, + "learning_rate": 7.606591538482631e-06, + "loss": 0.6676, + "step": 8830 + }, + { + "epoch": 0.24172488754597285, + "grad_norm": 2.265625, + "learning_rate": 7.603849633955747e-06, + "loss": 0.76, + "step": 8840 + }, + { + "epoch": 0.24199833198889817, + "grad_norm": 2.640625, + "learning_rate": 7.601107729428862e-06, + "loss": 0.6583, + "step": 8850 + }, + { + "epoch": 0.24227177643182346, + "grad_norm": 2.515625, + "learning_rate": 7.598365824901978e-06, + "loss": 0.6999, + "step": 8860 + }, + { + "epoch": 0.24254522087474878, + "grad_norm": 2.484375, + "learning_rate": 7.595623920375093e-06, + "loss": 0.717, + "step": 8870 + }, + { + "epoch": 0.24281866531767407, + "grad_norm": 3.234375, + "learning_rate": 7.592882015848209e-06, + "loss": 0.75, + "step": 8880 + }, + { + "epoch": 0.2430921097605994, + "grad_norm": 2.96875, + "learning_rate": 7.590140111321324e-06, + "loss": 0.6548, + "step": 8890 + }, + { + "epoch": 0.2433655542035247, + "grad_norm": 2.640625, + "learning_rate": 7.58739820679444e-06, + "loss": 0.6962, + "step": 8900 + }, + { + "epoch": 0.24363899864645, + "grad_norm": 2.546875, + "learning_rate": 7.584656302267555e-06, + "loss": 0.6997, + "step": 8910 + }, + { + "epoch": 0.24391244308937532, + "grad_norm": 3.0625, + "learning_rate": 7.581914397740672e-06, + "loss": 0.7096, + "step": 8920 + }, + { + "epoch": 0.24418588753230064, + "grad_norm": 2.453125, + "learning_rate": 7.579172493213786e-06, + "loss": 0.6557, + "step": 8930 + }, + { + "epoch": 0.24445933197522593, + "grad_norm": 2.84375, + "learning_rate": 7.576430588686903e-06, + "loss": 0.6859, + "step": 8940 + }, + { + "epoch": 0.24473277641815125, + "grad_norm": 2.625, + "learning_rate": 7.573688684160017e-06, + "loss": 0.6274, + "step": 8950 + }, + { + "epoch": 0.24500622086107654, + "grad_norm": 2.34375, + "learning_rate": 7.570946779633134e-06, + "loss": 0.6992, + "step": 8960 + }, + { + "epoch": 0.24527966530400186, + "grad_norm": 2.75, + "learning_rate": 7.56820487510625e-06, + "loss": 0.7478, + "step": 8970 + }, + { + "epoch": 0.24555310974692718, + "grad_norm": 2.703125, + "learning_rate": 7.5654629705793646e-06, + "loss": 0.7116, + "step": 8980 + }, + { + "epoch": 0.24582655418985247, + "grad_norm": 2.5, + "learning_rate": 7.562721066052481e-06, + "loss": 0.6652, + "step": 8990 + }, + { + "epoch": 0.2460999986327778, + "grad_norm": 3.0, + "learning_rate": 7.5599791615255955e-06, + "loss": 0.7693, + "step": 9000 + }, + { + "epoch": 0.24637344307570308, + "grad_norm": 2.78125, + "learning_rate": 7.557237256998712e-06, + "loss": 0.6308, + "step": 9010 + }, + { + "epoch": 0.2466468875186284, + "grad_norm": 2.515625, + "learning_rate": 7.554495352471828e-06, + "loss": 0.6975, + "step": 9020 + }, + { + "epoch": 0.24692033196155372, + "grad_norm": 2.59375, + "learning_rate": 7.551753447944943e-06, + "loss": 0.6442, + "step": 9030 + }, + { + "epoch": 0.247193776404479, + "grad_norm": 2.1875, + "learning_rate": 7.549011543418059e-06, + "loss": 0.6081, + "step": 9040 + }, + { + "epoch": 0.24746722084740433, + "grad_norm": 2.859375, + "learning_rate": 7.546269638891175e-06, + "loss": 0.7126, + "step": 9050 + }, + { + "epoch": 0.24774066529032965, + "grad_norm": 2.625, + "learning_rate": 7.54352773436429e-06, + "loss": 0.6811, + "step": 9060 + }, + { + "epoch": 0.24801410973325494, + "grad_norm": 2.515625, + "learning_rate": 7.540785829837406e-06, + "loss": 0.7447, + "step": 9070 + }, + { + "epoch": 0.24828755417618026, + "grad_norm": 2.4375, + "learning_rate": 7.538043925310521e-06, + "loss": 0.7173, + "step": 9080 + }, + { + "epoch": 0.24856099861910555, + "grad_norm": 2.65625, + "learning_rate": 7.5353020207836366e-06, + "loss": 0.7225, + "step": 9090 + }, + { + "epoch": 0.24883444306203087, + "grad_norm": 2.359375, + "learning_rate": 7.532560116256753e-06, + "loss": 0.6711, + "step": 9100 + }, + { + "epoch": 0.2491078875049562, + "grad_norm": 2.71875, + "learning_rate": 7.5298182117298675e-06, + "loss": 0.6839, + "step": 9110 + }, + { + "epoch": 0.24938133194788148, + "grad_norm": 2.734375, + "learning_rate": 7.527076307202984e-06, + "loss": 0.6785, + "step": 9120 + }, + { + "epoch": 0.2496547763908068, + "grad_norm": 2.6875, + "learning_rate": 7.524334402676099e-06, + "loss": 0.7844, + "step": 9130 + }, + { + "epoch": 0.24992822083373212, + "grad_norm": 2.84375, + "learning_rate": 7.521592498149215e-06, + "loss": 0.6824, + "step": 9140 + }, + { + "epoch": 0.25020166527665744, + "grad_norm": 2.703125, + "learning_rate": 7.518850593622331e-06, + "loss": 0.7132, + "step": 9150 + }, + { + "epoch": 0.2504751097195827, + "grad_norm": 2.34375, + "learning_rate": 7.516108689095446e-06, + "loss": 0.7171, + "step": 9160 + }, + { + "epoch": 0.250748554162508, + "grad_norm": 2.546875, + "learning_rate": 7.513366784568562e-06, + "loss": 0.7262, + "step": 9170 + }, + { + "epoch": 0.25102199860543334, + "grad_norm": 2.609375, + "learning_rate": 7.5106248800416784e-06, + "loss": 0.647, + "step": 9180 + }, + { + "epoch": 0.25129544304835866, + "grad_norm": 2.5, + "learning_rate": 7.507882975514793e-06, + "loss": 0.693, + "step": 9190 + }, + { + "epoch": 0.251568887491284, + "grad_norm": 2.46875, + "learning_rate": 7.505141070987909e-06, + "loss": 0.6096, + "step": 9200 + }, + { + "epoch": 0.25184233193420924, + "grad_norm": 2.59375, + "learning_rate": 7.502399166461024e-06, + "loss": 0.7002, + "step": 9210 + }, + { + "epoch": 0.25211577637713456, + "grad_norm": 2.71875, + "learning_rate": 7.49965726193414e-06, + "loss": 0.6868, + "step": 9220 + }, + { + "epoch": 0.2523892208200599, + "grad_norm": 3.09375, + "learning_rate": 7.496915357407256e-06, + "loss": 0.7755, + "step": 9230 + }, + { + "epoch": 0.2526626652629852, + "grad_norm": 2.296875, + "learning_rate": 7.494173452880371e-06, + "loss": 0.68, + "step": 9240 + }, + { + "epoch": 0.2529361097059105, + "grad_norm": 2.71875, + "learning_rate": 7.491431548353487e-06, + "loss": 0.6615, + "step": 9250 + }, + { + "epoch": 0.25320955414883584, + "grad_norm": 2.859375, + "learning_rate": 7.488689643826602e-06, + "loss": 0.7068, + "step": 9260 + }, + { + "epoch": 0.2534829985917611, + "grad_norm": 2.265625, + "learning_rate": 7.485947739299718e-06, + "loss": 0.6525, + "step": 9270 + }, + { + "epoch": 0.2537564430346864, + "grad_norm": 2.609375, + "learning_rate": 7.483205834772834e-06, + "loss": 0.7144, + "step": 9280 + }, + { + "epoch": 0.25402988747761174, + "grad_norm": 3.140625, + "learning_rate": 7.48046393024595e-06, + "loss": 0.7296, + "step": 9290 + }, + { + "epoch": 0.25430333192053706, + "grad_norm": 2.625, + "learning_rate": 7.477722025719065e-06, + "loss": 0.7118, + "step": 9300 + }, + { + "epoch": 0.2545767763634624, + "grad_norm": 2.984375, + "learning_rate": 7.4749801211921806e-06, + "loss": 0.6641, + "step": 9310 + }, + { + "epoch": 0.25485022080638764, + "grad_norm": 2.9375, + "learning_rate": 7.472238216665296e-06, + "loss": 0.6263, + "step": 9320 + }, + { + "epoch": 0.25512366524931296, + "grad_norm": 2.59375, + "learning_rate": 7.469496312138412e-06, + "loss": 0.677, + "step": 9330 + }, + { + "epoch": 0.2553971096922383, + "grad_norm": 2.40625, + "learning_rate": 7.466754407611527e-06, + "loss": 0.6567, + "step": 9340 + }, + { + "epoch": 0.2556705541351636, + "grad_norm": 2.890625, + "learning_rate": 7.464012503084643e-06, + "loss": 0.7111, + "step": 9350 + }, + { + "epoch": 0.2559439985780889, + "grad_norm": 2.390625, + "learning_rate": 7.46127059855776e-06, + "loss": 0.6513, + "step": 9360 + }, + { + "epoch": 0.2562174430210142, + "grad_norm": 2.609375, + "learning_rate": 7.458528694030874e-06, + "loss": 0.7197, + "step": 9370 + }, + { + "epoch": 0.2564908874639395, + "grad_norm": 2.78125, + "learning_rate": 7.455786789503991e-06, + "loss": 0.7078, + "step": 9380 + }, + { + "epoch": 0.2567643319068648, + "grad_norm": 2.75, + "learning_rate": 7.453044884977105e-06, + "loss": 0.7895, + "step": 9390 + }, + { + "epoch": 0.25703777634979014, + "grad_norm": 2.71875, + "learning_rate": 7.450302980450222e-06, + "loss": 0.5916, + "step": 9400 + }, + { + "epoch": 0.25731122079271546, + "grad_norm": 2.453125, + "learning_rate": 7.447561075923337e-06, + "loss": 0.7061, + "step": 9410 + }, + { + "epoch": 0.2575846652356408, + "grad_norm": 2.921875, + "learning_rate": 7.4448191713964525e-06, + "loss": 0.7298, + "step": 9420 + }, + { + "epoch": 0.25785810967856604, + "grad_norm": 2.8125, + "learning_rate": 7.442077266869568e-06, + "loss": 0.629, + "step": 9430 + }, + { + "epoch": 0.25813155412149136, + "grad_norm": 2.40625, + "learning_rate": 7.4393353623426835e-06, + "loss": 0.5903, + "step": 9440 + }, + { + "epoch": 0.2584049985644167, + "grad_norm": 2.75, + "learning_rate": 7.4365934578158e-06, + "loss": 0.7129, + "step": 9450 + }, + { + "epoch": 0.258678443007342, + "grad_norm": 3.453125, + "learning_rate": 7.433851553288915e-06, + "loss": 0.693, + "step": 9460 + }, + { + "epoch": 0.2589518874502673, + "grad_norm": 2.59375, + "learning_rate": 7.431109648762031e-06, + "loss": 0.687, + "step": 9470 + }, + { + "epoch": 0.2592253318931926, + "grad_norm": 2.6875, + "learning_rate": 7.428367744235146e-06, + "loss": 0.6796, + "step": 9480 + }, + { + "epoch": 0.2594987763361179, + "grad_norm": 2.46875, + "learning_rate": 7.425625839708262e-06, + "loss": 0.6469, + "step": 9490 + }, + { + "epoch": 0.2597722207790432, + "grad_norm": 2.53125, + "learning_rate": 7.422883935181377e-06, + "loss": 0.6474, + "step": 9500 + }, + { + "epoch": 0.26004566522196854, + "grad_norm": 2.859375, + "learning_rate": 7.420142030654494e-06, + "loss": 0.7923, + "step": 9510 + }, + { + "epoch": 0.26031910966489386, + "grad_norm": 2.546875, + "learning_rate": 7.417400126127608e-06, + "loss": 0.6564, + "step": 9520 + }, + { + "epoch": 0.2605925541078191, + "grad_norm": 2.515625, + "learning_rate": 7.4146582216007245e-06, + "loss": 0.7091, + "step": 9530 + }, + { + "epoch": 0.26086599855074444, + "grad_norm": 2.828125, + "learning_rate": 7.411916317073841e-06, + "loss": 0.6955, + "step": 9540 + }, + { + "epoch": 0.26113944299366976, + "grad_norm": 2.4375, + "learning_rate": 7.4091744125469555e-06, + "loss": 0.7045, + "step": 9550 + }, + { + "epoch": 0.2614128874365951, + "grad_norm": 2.15625, + "learning_rate": 7.406432508020072e-06, + "loss": 0.6304, + "step": 9560 + }, + { + "epoch": 0.2616863318795204, + "grad_norm": 2.75, + "learning_rate": 7.4036906034931865e-06, + "loss": 0.6762, + "step": 9570 + }, + { + "epoch": 0.2619597763224457, + "grad_norm": 2.515625, + "learning_rate": 7.400948698966303e-06, + "loss": 0.6756, + "step": 9580 + }, + { + "epoch": 0.262233220765371, + "grad_norm": 2.859375, + "learning_rate": 7.398206794439418e-06, + "loss": 0.7616, + "step": 9590 + }, + { + "epoch": 0.2625066652082963, + "grad_norm": 2.703125, + "learning_rate": 7.395464889912534e-06, + "loss": 0.7114, + "step": 9600 + }, + { + "epoch": 0.2627801096512216, + "grad_norm": 2.734375, + "learning_rate": 7.39272298538565e-06, + "loss": 0.7287, + "step": 9610 + }, + { + "epoch": 0.26305355409414694, + "grad_norm": 3.046875, + "learning_rate": 7.389981080858765e-06, + "loss": 0.7074, + "step": 9620 + }, + { + "epoch": 0.26332699853707225, + "grad_norm": 2.890625, + "learning_rate": 7.387239176331881e-06, + "loss": 0.6533, + "step": 9630 + }, + { + "epoch": 0.2636004429799975, + "grad_norm": 2.5625, + "learning_rate": 7.3844972718049965e-06, + "loss": 0.7169, + "step": 9640 + }, + { + "epoch": 0.26387388742292284, + "grad_norm": 2.84375, + "learning_rate": 7.381755367278112e-06, + "loss": 0.6494, + "step": 9650 + }, + { + "epoch": 0.26414733186584816, + "grad_norm": 2.46875, + "learning_rate": 7.3790134627512275e-06, + "loss": 0.6044, + "step": 9660 + }, + { + "epoch": 0.2644207763087735, + "grad_norm": 2.75, + "learning_rate": 7.376271558224343e-06, + "loss": 0.7373, + "step": 9670 + }, + { + "epoch": 0.2646942207516988, + "grad_norm": 2.78125, + "learning_rate": 7.3735296536974585e-06, + "loss": 0.7098, + "step": 9680 + }, + { + "epoch": 0.26496766519462406, + "grad_norm": 2.96875, + "learning_rate": 7.370787749170575e-06, + "loss": 0.7358, + "step": 9690 + }, + { + "epoch": 0.2652411096375494, + "grad_norm": 2.828125, + "learning_rate": 7.3680458446436894e-06, + "loss": 0.6778, + "step": 9700 + }, + { + "epoch": 0.2655145540804747, + "grad_norm": 2.53125, + "learning_rate": 7.365303940116806e-06, + "loss": 0.7236, + "step": 9710 + }, + { + "epoch": 0.2657879985234, + "grad_norm": 2.75, + "learning_rate": 7.362562035589922e-06, + "loss": 0.7346, + "step": 9720 + }, + { + "epoch": 0.26606144296632533, + "grad_norm": 2.8125, + "learning_rate": 7.359820131063037e-06, + "loss": 0.6637, + "step": 9730 + }, + { + "epoch": 0.2663348874092506, + "grad_norm": 2.984375, + "learning_rate": 7.357078226536153e-06, + "loss": 0.7364, + "step": 9740 + }, + { + "epoch": 0.2666083318521759, + "grad_norm": 3.59375, + "learning_rate": 7.354336322009268e-06, + "loss": 0.7688, + "step": 9750 + }, + { + "epoch": 0.26688177629510124, + "grad_norm": 3.296875, + "learning_rate": 7.351594417482384e-06, + "loss": 0.7163, + "step": 9760 + }, + { + "epoch": 0.26715522073802656, + "grad_norm": 2.609375, + "learning_rate": 7.3488525129555e-06, + "loss": 0.7108, + "step": 9770 + }, + { + "epoch": 0.2674286651809519, + "grad_norm": 2.46875, + "learning_rate": 7.346110608428615e-06, + "loss": 0.7109, + "step": 9780 + }, + { + "epoch": 0.2677021096238772, + "grad_norm": 2.8125, + "learning_rate": 7.343368703901731e-06, + "loss": 0.7241, + "step": 9790 + }, + { + "epoch": 0.26797555406680246, + "grad_norm": 2.890625, + "learning_rate": 7.340626799374846e-06, + "loss": 0.7336, + "step": 9800 + }, + { + "epoch": 0.2682489985097278, + "grad_norm": 2.9375, + "learning_rate": 7.337884894847962e-06, + "loss": 0.7293, + "step": 9810 + }, + { + "epoch": 0.2685224429526531, + "grad_norm": 2.953125, + "learning_rate": 7.335142990321078e-06, + "loss": 0.7154, + "step": 9820 + }, + { + "epoch": 0.2687958873955784, + "grad_norm": 2.53125, + "learning_rate": 7.332401085794193e-06, + "loss": 0.6306, + "step": 9830 + }, + { + "epoch": 0.26906933183850373, + "grad_norm": 2.828125, + "learning_rate": 7.329659181267309e-06, + "loss": 0.6722, + "step": 9840 + }, + { + "epoch": 0.269342776281429, + "grad_norm": 2.9375, + "learning_rate": 7.326917276740424e-06, + "loss": 0.7869, + "step": 9850 + }, + { + "epoch": 0.2696162207243543, + "grad_norm": 3.1875, + "learning_rate": 7.32417537221354e-06, + "loss": 0.8045, + "step": 9860 + }, + { + "epoch": 0.26988966516727964, + "grad_norm": 2.859375, + "learning_rate": 7.321433467686656e-06, + "loss": 0.8061, + "step": 9870 + }, + { + "epoch": 0.27016310961020495, + "grad_norm": 2.828125, + "learning_rate": 7.318691563159771e-06, + "loss": 0.6681, + "step": 9880 + }, + { + "epoch": 0.2704365540531303, + "grad_norm": 3.03125, + "learning_rate": 7.315949658632887e-06, + "loss": 0.6793, + "step": 9890 + }, + { + "epoch": 0.27070999849605554, + "grad_norm": 2.46875, + "learning_rate": 7.313207754106003e-06, + "loss": 0.7424, + "step": 9900 + }, + { + "epoch": 0.27098344293898086, + "grad_norm": 2.796875, + "learning_rate": 7.310465849579118e-06, + "loss": 0.7068, + "step": 9910 + }, + { + "epoch": 0.2712568873819062, + "grad_norm": 2.828125, + "learning_rate": 7.307723945052234e-06, + "loss": 0.7346, + "step": 9920 + }, + { + "epoch": 0.2715303318248315, + "grad_norm": 2.46875, + "learning_rate": 7.304982040525349e-06, + "loss": 0.711, + "step": 9930 + }, + { + "epoch": 0.2718037762677568, + "grad_norm": 2.796875, + "learning_rate": 7.302240135998465e-06, + "loss": 0.702, + "step": 9940 + }, + { + "epoch": 0.27207722071068213, + "grad_norm": 2.53125, + "learning_rate": 7.2994982314715816e-06, + "loss": 0.6556, + "step": 9950 + }, + { + "epoch": 0.2723506651536074, + "grad_norm": 2.921875, + "learning_rate": 7.296756326944696e-06, + "loss": 0.7083, + "step": 9960 + }, + { + "epoch": 0.2726241095965327, + "grad_norm": 2.484375, + "learning_rate": 7.2940144224178125e-06, + "loss": 0.7038, + "step": 9970 + }, + { + "epoch": 0.27289755403945803, + "grad_norm": 2.609375, + "learning_rate": 7.291272517890927e-06, + "loss": 0.5966, + "step": 9980 + }, + { + "epoch": 0.27317099848238335, + "grad_norm": 2.671875, + "learning_rate": 7.2885306133640435e-06, + "loss": 0.73, + "step": 9990 + }, + { + "epoch": 0.2734444429253087, + "grad_norm": 2.578125, + "learning_rate": 7.285788708837159e-06, + "loss": 0.6524, + "step": 10000 + }, + { + "epoch": 0.27371788736823394, + "grad_norm": 2.296875, + "learning_rate": 7.2830468043102745e-06, + "loss": 0.6547, + "step": 10010 + }, + { + "epoch": 0.27399133181115926, + "grad_norm": 3.078125, + "learning_rate": 7.28030489978339e-06, + "loss": 0.7095, + "step": 10020 + }, + { + "epoch": 0.2742647762540846, + "grad_norm": 2.96875, + "learning_rate": 7.277562995256505e-06, + "loss": 0.7374, + "step": 10030 + }, + { + "epoch": 0.2745382206970099, + "grad_norm": 3.015625, + "learning_rate": 7.274821090729621e-06, + "loss": 0.7398, + "step": 10040 + }, + { + "epoch": 0.2748116651399352, + "grad_norm": 2.09375, + "learning_rate": 7.272079186202737e-06, + "loss": 0.6579, + "step": 10050 + }, + { + "epoch": 0.2750851095828605, + "grad_norm": 2.65625, + "learning_rate": 7.269337281675853e-06, + "loss": 0.7178, + "step": 10060 + }, + { + "epoch": 0.2753585540257858, + "grad_norm": 3.0, + "learning_rate": 7.266595377148968e-06, + "loss": 0.6846, + "step": 10070 + }, + { + "epoch": 0.2756319984687111, + "grad_norm": 2.46875, + "learning_rate": 7.2638534726220845e-06, + "loss": 0.712, + "step": 10080 + }, + { + "epoch": 0.27590544291163643, + "grad_norm": 2.703125, + "learning_rate": 7.261111568095199e-06, + "loss": 0.6423, + "step": 10090 + }, + { + "epoch": 0.27617888735456175, + "grad_norm": 2.625, + "learning_rate": 7.2583696635683155e-06, + "loss": 0.7389, + "step": 10100 + }, + { + "epoch": 0.27645233179748707, + "grad_norm": 3.109375, + "learning_rate": 7.25562775904143e-06, + "loss": 0.6908, + "step": 10110 + }, + { + "epoch": 0.27672577624041234, + "grad_norm": 2.65625, + "learning_rate": 7.2528858545145464e-06, + "loss": 0.6775, + "step": 10120 + }, + { + "epoch": 0.27699922068333765, + "grad_norm": 2.90625, + "learning_rate": 7.250143949987663e-06, + "loss": 0.7069, + "step": 10130 + }, + { + "epoch": 0.277272665126263, + "grad_norm": 2.765625, + "learning_rate": 7.247402045460777e-06, + "loss": 0.765, + "step": 10140 + }, + { + "epoch": 0.2775461095691883, + "grad_norm": 2.765625, + "learning_rate": 7.244660140933894e-06, + "loss": 0.6856, + "step": 10150 + }, + { + "epoch": 0.2778195540121136, + "grad_norm": 2.671875, + "learning_rate": 7.241918236407008e-06, + "loss": 0.7105, + "step": 10160 + }, + { + "epoch": 0.2780929984550389, + "grad_norm": 2.203125, + "learning_rate": 7.239176331880125e-06, + "loss": 0.638, + "step": 10170 + }, + { + "epoch": 0.2783664428979642, + "grad_norm": 2.34375, + "learning_rate": 7.23643442735324e-06, + "loss": 0.7125, + "step": 10180 + }, + { + "epoch": 0.2786398873408895, + "grad_norm": 3.375, + "learning_rate": 7.233692522826356e-06, + "loss": 0.6691, + "step": 10190 + }, + { + "epoch": 0.27891333178381483, + "grad_norm": 2.859375, + "learning_rate": 7.230950618299471e-06, + "loss": 0.6366, + "step": 10200 + }, + { + "epoch": 0.27918677622674015, + "grad_norm": 3.09375, + "learning_rate": 7.228208713772587e-06, + "loss": 0.6189, + "step": 10210 + }, + { + "epoch": 0.2794602206696654, + "grad_norm": 2.734375, + "learning_rate": 7.225466809245703e-06, + "loss": 0.7729, + "step": 10220 + }, + { + "epoch": 0.27973366511259073, + "grad_norm": 2.796875, + "learning_rate": 7.2227249047188184e-06, + "loss": 0.7949, + "step": 10230 + }, + { + "epoch": 0.28000710955551605, + "grad_norm": 2.625, + "learning_rate": 7.219983000191934e-06, + "loss": 0.6995, + "step": 10240 + }, + { + "epoch": 0.2802805539984414, + "grad_norm": 2.625, + "learning_rate": 7.217241095665049e-06, + "loss": 0.7009, + "step": 10250 + }, + { + "epoch": 0.2805539984413667, + "grad_norm": 2.4375, + "learning_rate": 7.214499191138166e-06, + "loss": 0.6849, + "step": 10260 + }, + { + "epoch": 0.280827442884292, + "grad_norm": 3.34375, + "learning_rate": 7.21175728661128e-06, + "loss": 0.7117, + "step": 10270 + }, + { + "epoch": 0.2811008873272173, + "grad_norm": 2.28125, + "learning_rate": 7.209015382084397e-06, + "loss": 0.7217, + "step": 10280 + }, + { + "epoch": 0.2813743317701426, + "grad_norm": 3.546875, + "learning_rate": 7.206273477557511e-06, + "loss": 0.687, + "step": 10290 + }, + { + "epoch": 0.2816477762130679, + "grad_norm": 2.671875, + "learning_rate": 7.203531573030628e-06, + "loss": 0.7166, + "step": 10300 + }, + { + "epoch": 0.28192122065599323, + "grad_norm": 3.078125, + "learning_rate": 7.200789668503744e-06, + "loss": 0.6192, + "step": 10310 + }, + { + "epoch": 0.28219466509891855, + "grad_norm": 2.53125, + "learning_rate": 7.198047763976859e-06, + "loss": 0.683, + "step": 10320 + }, + { + "epoch": 0.2824681095418438, + "grad_norm": 2.71875, + "learning_rate": 7.195305859449975e-06, + "loss": 0.7471, + "step": 10330 + }, + { + "epoch": 0.28274155398476913, + "grad_norm": 2.921875, + "learning_rate": 7.19256395492309e-06, + "loss": 0.7108, + "step": 10340 + }, + { + "epoch": 0.28301499842769445, + "grad_norm": 3.0625, + "learning_rate": 7.189822050396206e-06, + "loss": 0.6842, + "step": 10350 + }, + { + "epoch": 0.28328844287061977, + "grad_norm": 3.046875, + "learning_rate": 7.187080145869321e-06, + "loss": 0.6432, + "step": 10360 + }, + { + "epoch": 0.2835618873135451, + "grad_norm": 2.75, + "learning_rate": 7.184338241342437e-06, + "loss": 0.7452, + "step": 10370 + }, + { + "epoch": 0.28383533175647035, + "grad_norm": 2.828125, + "learning_rate": 7.181596336815553e-06, + "loss": 0.6307, + "step": 10380 + }, + { + "epoch": 0.2841087761993957, + "grad_norm": 2.671875, + "learning_rate": 7.178854432288669e-06, + "loss": 0.6685, + "step": 10390 + }, + { + "epoch": 0.284382220642321, + "grad_norm": 2.71875, + "learning_rate": 7.176112527761784e-06, + "loss": 0.6847, + "step": 10400 + }, + { + "epoch": 0.2846556650852463, + "grad_norm": 3.046875, + "learning_rate": 7.1733706232349e-06, + "loss": 0.7554, + "step": 10410 + }, + { + "epoch": 0.28492910952817163, + "grad_norm": 2.921875, + "learning_rate": 7.170628718708015e-06, + "loss": 0.7918, + "step": 10420 + }, + { + "epoch": 0.2852025539710969, + "grad_norm": 2.3125, + "learning_rate": 7.167886814181131e-06, + "loss": 0.6851, + "step": 10430 + }, + { + "epoch": 0.2854759984140222, + "grad_norm": 2.8125, + "learning_rate": 7.165144909654247e-06, + "loss": 0.7382, + "step": 10440 + }, + { + "epoch": 0.28574944285694753, + "grad_norm": 2.65625, + "learning_rate": 7.162403005127362e-06, + "loss": 0.7446, + "step": 10450 + }, + { + "epoch": 0.28602288729987285, + "grad_norm": 2.796875, + "learning_rate": 7.159661100600478e-06, + "loss": 0.6616, + "step": 10460 + }, + { + "epoch": 0.28629633174279817, + "grad_norm": 2.296875, + "learning_rate": 7.1569191960735926e-06, + "loss": 0.6103, + "step": 10470 + }, + { + "epoch": 0.2865697761857235, + "grad_norm": 2.8125, + "learning_rate": 7.154177291546709e-06, + "loss": 0.6605, + "step": 10480 + }, + { + "epoch": 0.28684322062864875, + "grad_norm": 3.171875, + "learning_rate": 7.151435387019825e-06, + "loss": 0.6494, + "step": 10490 + }, + { + "epoch": 0.2871166650715741, + "grad_norm": 2.328125, + "learning_rate": 7.14869348249294e-06, + "loss": 0.7084, + "step": 10500 + }, + { + "epoch": 0.2873901095144994, + "grad_norm": 2.8125, + "learning_rate": 7.145951577966056e-06, + "loss": 0.6977, + "step": 10510 + }, + { + "epoch": 0.2876635539574247, + "grad_norm": 2.75, + "learning_rate": 7.143209673439171e-06, + "loss": 0.6618, + "step": 10520 + }, + { + "epoch": 0.28793699840035003, + "grad_norm": 2.65625, + "learning_rate": 7.140467768912287e-06, + "loss": 0.7375, + "step": 10530 + }, + { + "epoch": 0.2882104428432753, + "grad_norm": 2.734375, + "learning_rate": 7.1377258643854035e-06, + "loss": 0.5987, + "step": 10540 + }, + { + "epoch": 0.2884838872862006, + "grad_norm": 2.703125, + "learning_rate": 7.134983959858518e-06, + "loss": 0.6533, + "step": 10550 + }, + { + "epoch": 0.28875733172912593, + "grad_norm": 2.953125, + "learning_rate": 7.1322420553316344e-06, + "loss": 0.6927, + "step": 10560 + }, + { + "epoch": 0.28903077617205125, + "grad_norm": 3.25, + "learning_rate": 7.12950015080475e-06, + "loss": 0.7638, + "step": 10570 + }, + { + "epoch": 0.28930422061497657, + "grad_norm": 2.484375, + "learning_rate": 7.126758246277865e-06, + "loss": 0.6918, + "step": 10580 + }, + { + "epoch": 0.28957766505790183, + "grad_norm": 2.96875, + "learning_rate": 7.124016341750981e-06, + "loss": 0.7578, + "step": 10590 + }, + { + "epoch": 0.28985110950082715, + "grad_norm": 2.28125, + "learning_rate": 7.121274437224096e-06, + "loss": 0.6612, + "step": 10600 + }, + { + "epoch": 0.2901245539437525, + "grad_norm": 2.515625, + "learning_rate": 7.118532532697212e-06, + "loss": 0.6589, + "step": 10610 + }, + { + "epoch": 0.2903979983866778, + "grad_norm": 2.953125, + "learning_rate": 7.115790628170328e-06, + "loss": 0.691, + "step": 10620 + }, + { + "epoch": 0.2906714428296031, + "grad_norm": 3.0625, + "learning_rate": 7.113048723643443e-06, + "loss": 0.7327, + "step": 10630 + }, + { + "epoch": 0.29094488727252843, + "grad_norm": 3.0, + "learning_rate": 7.110306819116559e-06, + "loss": 0.7485, + "step": 10640 + }, + { + "epoch": 0.2912183317154537, + "grad_norm": 2.75, + "learning_rate": 7.107564914589674e-06, + "loss": 0.676, + "step": 10650 + }, + { + "epoch": 0.291491776158379, + "grad_norm": 2.796875, + "learning_rate": 7.10482301006279e-06, + "loss": 0.7206, + "step": 10660 + }, + { + "epoch": 0.29176522060130433, + "grad_norm": 2.625, + "learning_rate": 7.1020811055359064e-06, + "loss": 0.6657, + "step": 10670 + }, + { + "epoch": 0.29203866504422965, + "grad_norm": 2.609375, + "learning_rate": 7.099339201009021e-06, + "loss": 0.6122, + "step": 10680 + }, + { + "epoch": 0.29231210948715497, + "grad_norm": 2.46875, + "learning_rate": 7.096597296482137e-06, + "loss": 0.6871, + "step": 10690 + }, + { + "epoch": 0.29258555393008023, + "grad_norm": 2.640625, + "learning_rate": 7.093855391955252e-06, + "loss": 0.664, + "step": 10700 + }, + { + "epoch": 0.29285899837300555, + "grad_norm": 4.125, + "learning_rate": 7.091113487428368e-06, + "loss": 0.746, + "step": 10710 + }, + { + "epoch": 0.29313244281593087, + "grad_norm": 2.921875, + "learning_rate": 7.088371582901485e-06, + "loss": 0.7636, + "step": 10720 + }, + { + "epoch": 0.2934058872588562, + "grad_norm": 2.21875, + "learning_rate": 7.085629678374599e-06, + "loss": 0.6021, + "step": 10730 + }, + { + "epoch": 0.2936793317017815, + "grad_norm": 3.75, + "learning_rate": 7.082887773847716e-06, + "loss": 0.6665, + "step": 10740 + }, + { + "epoch": 0.2939527761447068, + "grad_norm": 2.40625, + "learning_rate": 7.080145869320831e-06, + "loss": 0.6754, + "step": 10750 + }, + { + "epoch": 0.2942262205876321, + "grad_norm": 2.8125, + "learning_rate": 7.077403964793947e-06, + "loss": 0.761, + "step": 10760 + }, + { + "epoch": 0.2944996650305574, + "grad_norm": 2.875, + "learning_rate": 7.074662060267062e-06, + "loss": 0.7016, + "step": 10770 + }, + { + "epoch": 0.29477310947348273, + "grad_norm": 2.859375, + "learning_rate": 7.0719201557401776e-06, + "loss": 0.7447, + "step": 10780 + }, + { + "epoch": 0.29504655391640805, + "grad_norm": 2.375, + "learning_rate": 7.069178251213293e-06, + "loss": 0.6428, + "step": 10790 + }, + { + "epoch": 0.29531999835933337, + "grad_norm": 2.515625, + "learning_rate": 7.066436346686409e-06, + "loss": 0.6487, + "step": 10800 + }, + { + "epoch": 0.29559344280225863, + "grad_norm": 2.921875, + "learning_rate": 7.063694442159524e-06, + "loss": 0.7809, + "step": 10810 + }, + { + "epoch": 0.29586688724518395, + "grad_norm": 2.8125, + "learning_rate": 7.06095253763264e-06, + "loss": 0.7382, + "step": 10820 + }, + { + "epoch": 0.29614033168810927, + "grad_norm": 2.703125, + "learning_rate": 7.058210633105755e-06, + "loss": 0.71, + "step": 10830 + }, + { + "epoch": 0.2964137761310346, + "grad_norm": 2.78125, + "learning_rate": 7.055468728578871e-06, + "loss": 0.6666, + "step": 10840 + }, + { + "epoch": 0.2966872205739599, + "grad_norm": 2.78125, + "learning_rate": 7.052726824051988e-06, + "loss": 0.728, + "step": 10850 + }, + { + "epoch": 0.2969606650168852, + "grad_norm": 3.34375, + "learning_rate": 7.049984919525102e-06, + "loss": 0.695, + "step": 10860 + }, + { + "epoch": 0.2972341094598105, + "grad_norm": 2.796875, + "learning_rate": 7.047243014998219e-06, + "loss": 0.6523, + "step": 10870 + }, + { + "epoch": 0.2975075539027358, + "grad_norm": 2.9375, + "learning_rate": 7.044501110471333e-06, + "loss": 0.7291, + "step": 10880 + }, + { + "epoch": 0.29778099834566113, + "grad_norm": 2.578125, + "learning_rate": 7.0417592059444496e-06, + "loss": 0.6904, + "step": 10890 + }, + { + "epoch": 0.29805444278858645, + "grad_norm": 2.703125, + "learning_rate": 7.039017301417566e-06, + "loss": 0.7389, + "step": 10900 + }, + { + "epoch": 0.2983278872315117, + "grad_norm": 2.53125, + "learning_rate": 7.0362753968906805e-06, + "loss": 0.6369, + "step": 10910 + }, + { + "epoch": 0.29860133167443703, + "grad_norm": 2.734375, + "learning_rate": 7.033533492363797e-06, + "loss": 0.6776, + "step": 10920 + }, + { + "epoch": 0.29887477611736235, + "grad_norm": 2.953125, + "learning_rate": 7.030791587836912e-06, + "loss": 0.6531, + "step": 10930 + }, + { + "epoch": 0.29914822056028767, + "grad_norm": 2.8125, + "learning_rate": 7.028049683310028e-06, + "loss": 0.7285, + "step": 10940 + }, + { + "epoch": 0.299421665003213, + "grad_norm": 2.40625, + "learning_rate": 7.025307778783143e-06, + "loss": 0.6325, + "step": 10950 + }, + { + "epoch": 0.2996951094461383, + "grad_norm": 2.546875, + "learning_rate": 7.022565874256259e-06, + "loss": 0.6957, + "step": 10960 + }, + { + "epoch": 0.29996855388906357, + "grad_norm": 3.171875, + "learning_rate": 7.019823969729374e-06, + "loss": 0.7209, + "step": 10970 + }, + { + "epoch": 0.3002419983319889, + "grad_norm": 2.4375, + "learning_rate": 7.017082065202491e-06, + "loss": 0.7059, + "step": 10980 + }, + { + "epoch": 0.3005154427749142, + "grad_norm": 2.828125, + "learning_rate": 7.014340160675605e-06, + "loss": 0.7217, + "step": 10990 + }, + { + "epoch": 0.30078888721783953, + "grad_norm": 2.53125, + "learning_rate": 7.0115982561487216e-06, + "loss": 0.7045, + "step": 11000 + }, + { + "epoch": 0.30106233166076485, + "grad_norm": 3.09375, + "learning_rate": 7.008856351621837e-06, + "loss": 0.6908, + "step": 11010 + }, + { + "epoch": 0.3013357761036901, + "grad_norm": 2.390625, + "learning_rate": 7.0061144470949525e-06, + "loss": 0.7213, + "step": 11020 + }, + { + "epoch": 0.30160922054661543, + "grad_norm": 2.59375, + "learning_rate": 7.003372542568069e-06, + "loss": 0.7418, + "step": 11030 + }, + { + "epoch": 0.30188266498954075, + "grad_norm": 2.921875, + "learning_rate": 7.0006306380411835e-06, + "loss": 0.6833, + "step": 11040 + }, + { + "epoch": 0.30215610943246607, + "grad_norm": 2.578125, + "learning_rate": 6.9978887335143e-06, + "loss": 0.7172, + "step": 11050 + }, + { + "epoch": 0.3024295538753914, + "grad_norm": 2.875, + "learning_rate": 6.9951468289874145e-06, + "loss": 0.7148, + "step": 11060 + }, + { + "epoch": 0.30270299831831665, + "grad_norm": 2.78125, + "learning_rate": 6.992404924460531e-06, + "loss": 0.7217, + "step": 11070 + }, + { + "epoch": 0.30297644276124197, + "grad_norm": 2.984375, + "learning_rate": 6.989663019933647e-06, + "loss": 0.6593, + "step": 11080 + }, + { + "epoch": 0.3032498872041673, + "grad_norm": 2.828125, + "learning_rate": 6.986921115406762e-06, + "loss": 0.7034, + "step": 11090 + }, + { + "epoch": 0.3035233316470926, + "grad_norm": 2.578125, + "learning_rate": 6.984179210879878e-06, + "loss": 0.7046, + "step": 11100 + }, + { + "epoch": 0.30379677609001793, + "grad_norm": 2.953125, + "learning_rate": 6.9814373063529936e-06, + "loss": 0.6705, + "step": 11110 + }, + { + "epoch": 0.3040702205329432, + "grad_norm": 2.296875, + "learning_rate": 6.978695401826109e-06, + "loss": 0.7415, + "step": 11120 + }, + { + "epoch": 0.3043436649758685, + "grad_norm": 2.875, + "learning_rate": 6.9759534972992245e-06, + "loss": 0.6866, + "step": 11130 + }, + { + "epoch": 0.30461710941879383, + "grad_norm": 3.21875, + "learning_rate": 6.97321159277234e-06, + "loss": 0.729, + "step": 11140 + }, + { + "epoch": 0.30489055386171915, + "grad_norm": 2.4375, + "learning_rate": 6.9704696882454555e-06, + "loss": 0.681, + "step": 11150 + }, + { + "epoch": 0.30516399830464447, + "grad_norm": 2.390625, + "learning_rate": 6.967727783718572e-06, + "loss": 0.6561, + "step": 11160 + }, + { + "epoch": 0.3054374427475698, + "grad_norm": 2.890625, + "learning_rate": 6.964985879191687e-06, + "loss": 0.6162, + "step": 11170 + }, + { + "epoch": 0.30571088719049505, + "grad_norm": 2.625, + "learning_rate": 6.962243974664803e-06, + "loss": 0.6543, + "step": 11180 + }, + { + "epoch": 0.30598433163342037, + "grad_norm": 3.0625, + "learning_rate": 6.959502070137918e-06, + "loss": 0.7339, + "step": 11190 + }, + { + "epoch": 0.3062577760763457, + "grad_norm": 3.078125, + "learning_rate": 6.956760165611034e-06, + "loss": 0.6975, + "step": 11200 + }, + { + "epoch": 0.306531220519271, + "grad_norm": 3.265625, + "learning_rate": 6.95401826108415e-06, + "loss": 0.6994, + "step": 11210 + }, + { + "epoch": 0.3068046649621963, + "grad_norm": 2.96875, + "learning_rate": 6.951276356557265e-06, + "loss": 0.6846, + "step": 11220 + }, + { + "epoch": 0.3070781094051216, + "grad_norm": 2.203125, + "learning_rate": 6.948534452030381e-06, + "loss": 0.6327, + "step": 11230 + }, + { + "epoch": 0.3073515538480469, + "grad_norm": 2.53125, + "learning_rate": 6.945792547503496e-06, + "loss": 0.6395, + "step": 11240 + }, + { + "epoch": 0.30762499829097223, + "grad_norm": 2.734375, + "learning_rate": 6.943050642976612e-06, + "loss": 0.6184, + "step": 11250 + }, + { + "epoch": 0.30789844273389755, + "grad_norm": 2.640625, + "learning_rate": 6.940308738449728e-06, + "loss": 0.7189, + "step": 11260 + }, + { + "epoch": 0.30817188717682287, + "grad_norm": 2.9375, + "learning_rate": 6.937566833922843e-06, + "loss": 0.6948, + "step": 11270 + }, + { + "epoch": 0.30844533161974813, + "grad_norm": 3.046875, + "learning_rate": 6.934824929395959e-06, + "loss": 0.7341, + "step": 11280 + }, + { + "epoch": 0.30871877606267345, + "grad_norm": 2.875, + "learning_rate": 6.932083024869075e-06, + "loss": 0.7089, + "step": 11290 + }, + { + "epoch": 0.30899222050559877, + "grad_norm": 2.96875, + "learning_rate": 6.92934112034219e-06, + "loss": 0.7374, + "step": 11300 + }, + { + "epoch": 0.3092656649485241, + "grad_norm": 2.40625, + "learning_rate": 6.926599215815306e-06, + "loss": 0.7596, + "step": 11310 + }, + { + "epoch": 0.3095391093914494, + "grad_norm": 2.84375, + "learning_rate": 6.923857311288421e-06, + "loss": 0.6692, + "step": 11320 + }, + { + "epoch": 0.3098125538343747, + "grad_norm": 2.703125, + "learning_rate": 6.9211154067615376e-06, + "loss": 0.7374, + "step": 11330 + }, + { + "epoch": 0.3100859982773, + "grad_norm": 2.671875, + "learning_rate": 6.918373502234653e-06, + "loss": 0.6903, + "step": 11340 + }, + { + "epoch": 0.3103594427202253, + "grad_norm": 3.28125, + "learning_rate": 6.9156315977077685e-06, + "loss": 0.7144, + "step": 11350 + }, + { + "epoch": 0.31063288716315063, + "grad_norm": 2.890625, + "learning_rate": 6.912889693180884e-06, + "loss": 0.6638, + "step": 11360 + }, + { + "epoch": 0.31090633160607595, + "grad_norm": 2.765625, + "learning_rate": 6.9101477886539995e-06, + "loss": 0.7098, + "step": 11370 + }, + { + "epoch": 0.31117977604900127, + "grad_norm": 2.453125, + "learning_rate": 6.907405884127115e-06, + "loss": 0.6232, + "step": 11380 + }, + { + "epoch": 0.31145322049192653, + "grad_norm": 2.640625, + "learning_rate": 6.904663979600231e-06, + "loss": 0.7413, + "step": 11390 + }, + { + "epoch": 0.31172666493485185, + "grad_norm": 2.765625, + "learning_rate": 6.901922075073346e-06, + "loss": 0.7506, + "step": 11400 + }, + { + "epoch": 0.31200010937777717, + "grad_norm": 3.234375, + "learning_rate": 6.899180170546462e-06, + "loss": 0.6956, + "step": 11410 + }, + { + "epoch": 0.3122735538207025, + "grad_norm": 2.953125, + "learning_rate": 6.896438266019577e-06, + "loss": 0.8072, + "step": 11420 + }, + { + "epoch": 0.3125469982636278, + "grad_norm": 2.78125, + "learning_rate": 6.893696361492693e-06, + "loss": 0.755, + "step": 11430 + }, + { + "epoch": 0.31282044270655307, + "grad_norm": 2.453125, + "learning_rate": 6.8909544569658095e-06, + "loss": 0.7009, + "step": 11440 + }, + { + "epoch": 0.3130938871494784, + "grad_norm": 2.796875, + "learning_rate": 6.888212552438924e-06, + "loss": 0.644, + "step": 11450 + }, + { + "epoch": 0.3133673315924037, + "grad_norm": 2.953125, + "learning_rate": 6.8854706479120405e-06, + "loss": 0.6101, + "step": 11460 + }, + { + "epoch": 0.313640776035329, + "grad_norm": 2.75, + "learning_rate": 6.882728743385156e-06, + "loss": 0.7123, + "step": 11470 + }, + { + "epoch": 0.31391422047825435, + "grad_norm": 2.6875, + "learning_rate": 6.8799868388582715e-06, + "loss": 0.5996, + "step": 11480 + }, + { + "epoch": 0.31418766492117967, + "grad_norm": 2.90625, + "learning_rate": 6.877244934331388e-06, + "loss": 0.7435, + "step": 11490 + }, + { + "epoch": 0.31446110936410493, + "grad_norm": 3.484375, + "learning_rate": 6.8745030298045024e-06, + "loss": 0.7611, + "step": 11500 + }, + { + "epoch": 0.31473455380703025, + "grad_norm": 2.4375, + "learning_rate": 6.871761125277619e-06, + "loss": 0.6308, + "step": 11510 + }, + { + "epoch": 0.31500799824995557, + "grad_norm": 2.640625, + "learning_rate": 6.869019220750734e-06, + "loss": 0.655, + "step": 11520 + }, + { + "epoch": 0.3152814426928809, + "grad_norm": 2.890625, + "learning_rate": 6.86627731622385e-06, + "loss": 0.7073, + "step": 11530 + }, + { + "epoch": 0.3155548871358062, + "grad_norm": 3.03125, + "learning_rate": 6.863535411696965e-06, + "loss": 0.6477, + "step": 11540 + }, + { + "epoch": 0.31582833157873147, + "grad_norm": 2.78125, + "learning_rate": 6.860793507170081e-06, + "loss": 0.7341, + "step": 11550 + }, + { + "epoch": 0.3161017760216568, + "grad_norm": 2.5625, + "learning_rate": 6.858051602643196e-06, + "loss": 0.7324, + "step": 11560 + }, + { + "epoch": 0.3163752204645821, + "grad_norm": 2.6875, + "learning_rate": 6.8553096981163125e-06, + "loss": 0.6826, + "step": 11570 + }, + { + "epoch": 0.3166486649075074, + "grad_norm": 2.296875, + "learning_rate": 6.852567793589427e-06, + "loss": 0.687, + "step": 11580 + }, + { + "epoch": 0.31692210935043275, + "grad_norm": 3.140625, + "learning_rate": 6.8498258890625435e-06, + "loss": 0.7307, + "step": 11590 + }, + { + "epoch": 0.317195553793358, + "grad_norm": 2.5625, + "learning_rate": 6.847083984535658e-06, + "loss": 0.6419, + "step": 11600 + }, + { + "epoch": 0.31746899823628333, + "grad_norm": 2.796875, + "learning_rate": 6.8443420800087744e-06, + "loss": 0.6693, + "step": 11610 + }, + { + "epoch": 0.31774244267920865, + "grad_norm": 2.5, + "learning_rate": 6.841600175481891e-06, + "loss": 0.6882, + "step": 11620 + }, + { + "epoch": 0.31801588712213397, + "grad_norm": 2.703125, + "learning_rate": 6.838858270955005e-06, + "loss": 0.6351, + "step": 11630 + }, + { + "epoch": 0.3182893315650593, + "grad_norm": 2.515625, + "learning_rate": 6.836116366428122e-06, + "loss": 0.767, + "step": 11640 + }, + { + "epoch": 0.3185627760079846, + "grad_norm": 2.765625, + "learning_rate": 6.833374461901238e-06, + "loss": 0.72, + "step": 11650 + }, + { + "epoch": 0.31883622045090987, + "grad_norm": 2.75, + "learning_rate": 6.830632557374353e-06, + "loss": 0.653, + "step": 11660 + }, + { + "epoch": 0.3191096648938352, + "grad_norm": 2.734375, + "learning_rate": 6.827890652847469e-06, + "loss": 0.6554, + "step": 11670 + }, + { + "epoch": 0.3193831093367605, + "grad_norm": 2.375, + "learning_rate": 6.825148748320584e-06, + "loss": 0.5828, + "step": 11680 + }, + { + "epoch": 0.3196565537796858, + "grad_norm": 2.984375, + "learning_rate": 6.8224068437937e-06, + "loss": 0.7171, + "step": 11690 + }, + { + "epoch": 0.31992999822261114, + "grad_norm": 2.65625, + "learning_rate": 6.8196649392668155e-06, + "loss": 0.6954, + "step": 11700 + }, + { + "epoch": 0.3202034426655364, + "grad_norm": 2.890625, + "learning_rate": 6.816923034739931e-06, + "loss": 0.7713, + "step": 11710 + }, + { + "epoch": 0.3204768871084617, + "grad_norm": 2.78125, + "learning_rate": 6.8141811302130464e-06, + "loss": 0.728, + "step": 11720 + }, + { + "epoch": 0.32075033155138705, + "grad_norm": 2.84375, + "learning_rate": 6.811439225686162e-06, + "loss": 0.7415, + "step": 11730 + }, + { + "epoch": 0.32102377599431237, + "grad_norm": 2.5, + "learning_rate": 6.808697321159277e-06, + "loss": 0.6014, + "step": 11740 + }, + { + "epoch": 0.3212972204372377, + "grad_norm": 2.328125, + "learning_rate": 6.805955416632394e-06, + "loss": 0.6663, + "step": 11750 + }, + { + "epoch": 0.32157066488016295, + "grad_norm": 2.703125, + "learning_rate": 6.803213512105508e-06, + "loss": 0.7049, + "step": 11760 + }, + { + "epoch": 0.32184410932308827, + "grad_norm": 2.9375, + "learning_rate": 6.800471607578625e-06, + "loss": 0.6885, + "step": 11770 + }, + { + "epoch": 0.3221175537660136, + "grad_norm": 2.640625, + "learning_rate": 6.797729703051741e-06, + "loss": 0.6875, + "step": 11780 + }, + { + "epoch": 0.3223909982089389, + "grad_norm": 3.078125, + "learning_rate": 6.794987798524856e-06, + "loss": 0.7229, + "step": 11790 + }, + { + "epoch": 0.3226644426518642, + "grad_norm": 2.953125, + "learning_rate": 6.792245893997972e-06, + "loss": 0.7529, + "step": 11800 + }, + { + "epoch": 0.3229378870947895, + "grad_norm": 2.4375, + "learning_rate": 6.789503989471087e-06, + "loss": 0.7187, + "step": 11810 + }, + { + "epoch": 0.3232113315377148, + "grad_norm": 2.890625, + "learning_rate": 6.786762084944203e-06, + "loss": 0.7406, + "step": 11820 + }, + { + "epoch": 0.3234847759806401, + "grad_norm": 3.046875, + "learning_rate": 6.784020180417319e-06, + "loss": 0.7044, + "step": 11830 + }, + { + "epoch": 0.32375822042356545, + "grad_norm": 2.59375, + "learning_rate": 6.781278275890434e-06, + "loss": 0.6815, + "step": 11840 + }, + { + "epoch": 0.32403166486649077, + "grad_norm": 2.546875, + "learning_rate": 6.77853637136355e-06, + "loss": 0.6475, + "step": 11850 + }, + { + "epoch": 0.3243051093094161, + "grad_norm": 2.546875, + "learning_rate": 6.775794466836665e-06, + "loss": 0.6687, + "step": 11860 + }, + { + "epoch": 0.32457855375234135, + "grad_norm": 2.28125, + "learning_rate": 6.773052562309781e-06, + "loss": 0.6364, + "step": 11870 + }, + { + "epoch": 0.32485199819526667, + "grad_norm": 2.921875, + "learning_rate": 6.770310657782897e-06, + "loss": 0.6746, + "step": 11880 + }, + { + "epoch": 0.325125442638192, + "grad_norm": 2.796875, + "learning_rate": 6.767568753256012e-06, + "loss": 0.6738, + "step": 11890 + }, + { + "epoch": 0.3253988870811173, + "grad_norm": 2.875, + "learning_rate": 6.764826848729128e-06, + "loss": 0.5849, + "step": 11900 + }, + { + "epoch": 0.3256723315240426, + "grad_norm": 2.65625, + "learning_rate": 6.762084944202243e-06, + "loss": 0.7135, + "step": 11910 + }, + { + "epoch": 0.3259457759669679, + "grad_norm": 2.625, + "learning_rate": 6.759343039675359e-06, + "loss": 0.7883, + "step": 11920 + }, + { + "epoch": 0.3262192204098932, + "grad_norm": 2.734375, + "learning_rate": 6.756601135148475e-06, + "loss": 0.6686, + "step": 11930 + }, + { + "epoch": 0.3264926648528185, + "grad_norm": 2.703125, + "learning_rate": 6.75385923062159e-06, + "loss": 0.7054, + "step": 11940 + }, + { + "epoch": 0.32676610929574385, + "grad_norm": 2.765625, + "learning_rate": 6.751117326094706e-06, + "loss": 0.6563, + "step": 11950 + }, + { + "epoch": 0.32703955373866916, + "grad_norm": 2.6875, + "learning_rate": 6.748375421567822e-06, + "loss": 0.7064, + "step": 11960 + }, + { + "epoch": 0.32731299818159443, + "grad_norm": 2.71875, + "learning_rate": 6.745633517040937e-06, + "loss": 0.6405, + "step": 11970 + }, + { + "epoch": 0.32758644262451975, + "grad_norm": 2.015625, + "learning_rate": 6.742891612514053e-06, + "loss": 0.6231, + "step": 11980 + }, + { + "epoch": 0.32785988706744507, + "grad_norm": 2.578125, + "learning_rate": 6.740149707987168e-06, + "loss": 0.6941, + "step": 11990 + }, + { + "epoch": 0.3281333315103704, + "grad_norm": 3.0, + "learning_rate": 6.737407803460284e-06, + "loss": 0.6867, + "step": 12000 + }, + { + "epoch": 0.3284067759532957, + "grad_norm": 2.078125, + "learning_rate": 6.7346658989334005e-06, + "loss": 0.6154, + "step": 12010 + }, + { + "epoch": 0.328680220396221, + "grad_norm": 3.3125, + "learning_rate": 6.731923994406515e-06, + "loss": 0.6894, + "step": 12020 + }, + { + "epoch": 0.3289536648391463, + "grad_norm": 2.484375, + "learning_rate": 6.7291820898796315e-06, + "loss": 0.663, + "step": 12030 + }, + { + "epoch": 0.3292271092820716, + "grad_norm": 2.484375, + "learning_rate": 6.726440185352746e-06, + "loss": 0.7265, + "step": 12040 + }, + { + "epoch": 0.3295005537249969, + "grad_norm": 2.734375, + "learning_rate": 6.723698280825862e-06, + "loss": 0.7118, + "step": 12050 + }, + { + "epoch": 0.32977399816792224, + "grad_norm": 2.71875, + "learning_rate": 6.720956376298978e-06, + "loss": 0.7145, + "step": 12060 + }, + { + "epoch": 0.33004744261084756, + "grad_norm": 3.21875, + "learning_rate": 6.718214471772093e-06, + "loss": 0.7637, + "step": 12070 + }, + { + "epoch": 0.3303208870537728, + "grad_norm": 2.6875, + "learning_rate": 6.715472567245209e-06, + "loss": 0.6981, + "step": 12080 + }, + { + "epoch": 0.33059433149669815, + "grad_norm": 2.921875, + "learning_rate": 6.712730662718324e-06, + "loss": 0.7494, + "step": 12090 + }, + { + "epoch": 0.33086777593962347, + "grad_norm": 3.1875, + "learning_rate": 6.70998875819144e-06, + "loss": 0.6865, + "step": 12100 + }, + { + "epoch": 0.3311412203825488, + "grad_norm": 2.65625, + "learning_rate": 6.707246853664556e-06, + "loss": 0.7279, + "step": 12110 + }, + { + "epoch": 0.3314146648254741, + "grad_norm": 2.59375, + "learning_rate": 6.704504949137672e-06, + "loss": 0.7386, + "step": 12120 + }, + { + "epoch": 0.33168810926839937, + "grad_norm": 2.78125, + "learning_rate": 6.701763044610787e-06, + "loss": 0.6342, + "step": 12130 + }, + { + "epoch": 0.3319615537113247, + "grad_norm": 2.546875, + "learning_rate": 6.6990211400839035e-06, + "loss": 0.6769, + "step": 12140 + }, + { + "epoch": 0.33223499815425, + "grad_norm": 2.734375, + "learning_rate": 6.696279235557018e-06, + "loss": 0.6702, + "step": 12150 + }, + { + "epoch": 0.3325084425971753, + "grad_norm": 3.140625, + "learning_rate": 6.693537331030134e-06, + "loss": 0.7738, + "step": 12160 + }, + { + "epoch": 0.33278188704010064, + "grad_norm": 3.0625, + "learning_rate": 6.690795426503249e-06, + "loss": 0.7227, + "step": 12170 + }, + { + "epoch": 0.33305533148302596, + "grad_norm": 2.828125, + "learning_rate": 6.688053521976365e-06, + "loss": 0.7018, + "step": 12180 + }, + { + "epoch": 0.3333287759259512, + "grad_norm": 2.921875, + "learning_rate": 6.685311617449482e-06, + "loss": 0.6944, + "step": 12190 + }, + { + "epoch": 0.33360222036887655, + "grad_norm": 2.921875, + "learning_rate": 6.682569712922596e-06, + "loss": 0.6985, + "step": 12200 + }, + { + "epoch": 0.33387566481180186, + "grad_norm": 2.890625, + "learning_rate": 6.679827808395713e-06, + "loss": 0.685, + "step": 12210 + }, + { + "epoch": 0.3341491092547272, + "grad_norm": 2.78125, + "learning_rate": 6.677085903868827e-06, + "loss": 0.7549, + "step": 12220 + }, + { + "epoch": 0.3344225536976525, + "grad_norm": 3.109375, + "learning_rate": 6.674343999341944e-06, + "loss": 0.705, + "step": 12230 + }, + { + "epoch": 0.33469599814057777, + "grad_norm": 2.9375, + "learning_rate": 6.671602094815059e-06, + "loss": 0.825, + "step": 12240 + }, + { + "epoch": 0.3349694425835031, + "grad_norm": 2.375, + "learning_rate": 6.668860190288175e-06, + "loss": 0.7136, + "step": 12250 + }, + { + "epoch": 0.3352428870264284, + "grad_norm": 2.625, + "learning_rate": 6.66611828576129e-06, + "loss": 0.6553, + "step": 12260 + }, + { + "epoch": 0.3355163314693537, + "grad_norm": 2.640625, + "learning_rate": 6.6633763812344056e-06, + "loss": 0.6817, + "step": 12270 + }, + { + "epoch": 0.33578977591227904, + "grad_norm": 3.0, + "learning_rate": 6.660634476707522e-06, + "loss": 0.7475, + "step": 12280 + }, + { + "epoch": 0.3360632203552043, + "grad_norm": 2.8125, + "learning_rate": 6.657892572180637e-06, + "loss": 0.7105, + "step": 12290 + }, + { + "epoch": 0.3363366647981296, + "grad_norm": 2.828125, + "learning_rate": 6.655150667653753e-06, + "loss": 0.6866, + "step": 12300 + }, + { + "epoch": 0.33661010924105494, + "grad_norm": 2.890625, + "learning_rate": 6.652408763126868e-06, + "loss": 0.7252, + "step": 12310 + }, + { + "epoch": 0.33688355368398026, + "grad_norm": 2.921875, + "learning_rate": 6.649666858599985e-06, + "loss": 0.683, + "step": 12320 + }, + { + "epoch": 0.3371569981269056, + "grad_norm": 2.546875, + "learning_rate": 6.646924954073099e-06, + "loss": 0.6286, + "step": 12330 + }, + { + "epoch": 0.3374304425698309, + "grad_norm": 2.53125, + "learning_rate": 6.644183049546216e-06, + "loss": 0.7069, + "step": 12340 + }, + { + "epoch": 0.33770388701275617, + "grad_norm": 2.640625, + "learning_rate": 6.64144114501933e-06, + "loss": 0.725, + "step": 12350 + }, + { + "epoch": 0.3379773314556815, + "grad_norm": 2.78125, + "learning_rate": 6.638699240492447e-06, + "loss": 0.7511, + "step": 12360 + }, + { + "epoch": 0.3382507758986068, + "grad_norm": 2.78125, + "learning_rate": 6.635957335965563e-06, + "loss": 0.6582, + "step": 12370 + }, + { + "epoch": 0.3385242203415321, + "grad_norm": 2.859375, + "learning_rate": 6.6332154314386776e-06, + "loss": 0.6827, + "step": 12380 + }, + { + "epoch": 0.33879766478445744, + "grad_norm": 2.609375, + "learning_rate": 6.630473526911794e-06, + "loss": 0.6445, + "step": 12390 + }, + { + "epoch": 0.3390711092273827, + "grad_norm": 2.859375, + "learning_rate": 6.6277316223849085e-06, + "loss": 0.6738, + "step": 12400 + }, + { + "epoch": 0.339344553670308, + "grad_norm": 2.96875, + "learning_rate": 6.624989717858025e-06, + "loss": 0.6651, + "step": 12410 + }, + { + "epoch": 0.33961799811323334, + "grad_norm": 2.609375, + "learning_rate": 6.62224781333114e-06, + "loss": 0.6363, + "step": 12420 + }, + { + "epoch": 0.33989144255615866, + "grad_norm": 2.5, + "learning_rate": 6.619505908804256e-06, + "loss": 0.7364, + "step": 12430 + }, + { + "epoch": 0.340164886999084, + "grad_norm": 2.859375, + "learning_rate": 6.616764004277372e-06, + "loss": 0.6807, + "step": 12440 + }, + { + "epoch": 0.34043833144200925, + "grad_norm": 2.546875, + "learning_rate": 6.614022099750487e-06, + "loss": 0.7298, + "step": 12450 + }, + { + "epoch": 0.34071177588493456, + "grad_norm": 2.703125, + "learning_rate": 6.611280195223603e-06, + "loss": 0.7365, + "step": 12460 + }, + { + "epoch": 0.3409852203278599, + "grad_norm": 3.125, + "learning_rate": 6.608538290696719e-06, + "loss": 0.7325, + "step": 12470 + }, + { + "epoch": 0.3412586647707852, + "grad_norm": 2.71875, + "learning_rate": 6.605796386169834e-06, + "loss": 0.6979, + "step": 12480 + }, + { + "epoch": 0.3415321092137105, + "grad_norm": 2.734375, + "learning_rate": 6.6030544816429496e-06, + "loss": 0.6499, + "step": 12490 + }, + { + "epoch": 0.3418055536566358, + "grad_norm": 2.90625, + "learning_rate": 6.600312577116066e-06, + "loss": 0.6311, + "step": 12500 + }, + { + "epoch": 0.3420789980995611, + "grad_norm": 3.046875, + "learning_rate": 6.5975706725891805e-06, + "loss": 0.7192, + "step": 12510 + }, + { + "epoch": 0.3423524425424864, + "grad_norm": 2.3125, + "learning_rate": 6.594828768062297e-06, + "loss": 0.7258, + "step": 12520 + }, + { + "epoch": 0.34262588698541174, + "grad_norm": 3.34375, + "learning_rate": 6.5920868635354115e-06, + "loss": 0.6524, + "step": 12530 + }, + { + "epoch": 0.34289933142833706, + "grad_norm": 2.421875, + "learning_rate": 6.589344959008528e-06, + "loss": 0.6693, + "step": 12540 + }, + { + "epoch": 0.3431727758712624, + "grad_norm": 3.046875, + "learning_rate": 6.586603054481644e-06, + "loss": 0.7093, + "step": 12550 + }, + { + "epoch": 0.34344622031418764, + "grad_norm": 2.515625, + "learning_rate": 6.583861149954759e-06, + "loss": 0.7526, + "step": 12560 + }, + { + "epoch": 0.34371966475711296, + "grad_norm": 2.71875, + "learning_rate": 6.581119245427875e-06, + "loss": 0.6705, + "step": 12570 + }, + { + "epoch": 0.3439931092000383, + "grad_norm": 2.4375, + "learning_rate": 6.57837734090099e-06, + "loss": 0.6943, + "step": 12580 + }, + { + "epoch": 0.3442665536429636, + "grad_norm": 2.546875, + "learning_rate": 6.575635436374106e-06, + "loss": 0.7304, + "step": 12590 + }, + { + "epoch": 0.3445399980858889, + "grad_norm": 2.859375, + "learning_rate": 6.572893531847222e-06, + "loss": 0.7203, + "step": 12600 + }, + { + "epoch": 0.3448134425288142, + "grad_norm": 2.6875, + "learning_rate": 6.570151627320337e-06, + "loss": 0.6275, + "step": 12610 + }, + { + "epoch": 0.3450868869717395, + "grad_norm": 2.4375, + "learning_rate": 6.567409722793453e-06, + "loss": 0.6899, + "step": 12620 + }, + { + "epoch": 0.3453603314146648, + "grad_norm": 2.65625, + "learning_rate": 6.564667818266568e-06, + "loss": 0.7792, + "step": 12630 + }, + { + "epoch": 0.34563377585759014, + "grad_norm": 2.671875, + "learning_rate": 6.561925913739684e-06, + "loss": 0.6737, + "step": 12640 + }, + { + "epoch": 0.34590722030051546, + "grad_norm": 2.6875, + "learning_rate": 6.5591840092128e-06, + "loss": 0.7049, + "step": 12650 + }, + { + "epoch": 0.3461806647434407, + "grad_norm": 2.828125, + "learning_rate": 6.556442104685915e-06, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.34645410918636604, + "grad_norm": 2.453125, + "learning_rate": 6.553700200159031e-06, + "loss": 0.6805, + "step": 12670 + }, + { + "epoch": 0.34672755362929136, + "grad_norm": 3.53125, + "learning_rate": 6.550958295632147e-06, + "loss": 0.7152, + "step": 12680 + }, + { + "epoch": 0.3470009980722167, + "grad_norm": 2.953125, + "learning_rate": 6.548216391105262e-06, + "loss": 0.7106, + "step": 12690 + }, + { + "epoch": 0.347274442515142, + "grad_norm": 2.640625, + "learning_rate": 6.545474486578378e-06, + "loss": 0.6086, + "step": 12700 + }, + { + "epoch": 0.3475478869580673, + "grad_norm": 3.0, + "learning_rate": 6.542732582051493e-06, + "loss": 0.6425, + "step": 12710 + }, + { + "epoch": 0.3478213314009926, + "grad_norm": 3.171875, + "learning_rate": 6.539990677524609e-06, + "loss": 0.7287, + "step": 12720 + }, + { + "epoch": 0.3480947758439179, + "grad_norm": 3.03125, + "learning_rate": 6.537248772997725e-06, + "loss": 0.7655, + "step": 12730 + }, + { + "epoch": 0.3483682202868432, + "grad_norm": 3.28125, + "learning_rate": 6.53450686847084e-06, + "loss": 0.7742, + "step": 12740 + }, + { + "epoch": 0.34864166472976854, + "grad_norm": 2.625, + "learning_rate": 6.531764963943956e-06, + "loss": 0.678, + "step": 12750 + }, + { + "epoch": 0.34891510917269386, + "grad_norm": 2.75, + "learning_rate": 6.529023059417071e-06, + "loss": 0.7807, + "step": 12760 + }, + { + "epoch": 0.3491885536156191, + "grad_norm": 2.421875, + "learning_rate": 6.526281154890187e-06, + "loss": 0.6971, + "step": 12770 + }, + { + "epoch": 0.34946199805854444, + "grad_norm": 2.5625, + "learning_rate": 6.523539250363304e-06, + "loss": 0.6924, + "step": 12780 + }, + { + "epoch": 0.34973544250146976, + "grad_norm": 2.875, + "learning_rate": 6.520797345836418e-06, + "loss": 0.7426, + "step": 12790 + }, + { + "epoch": 0.3500088869443951, + "grad_norm": 2.625, + "learning_rate": 6.518055441309535e-06, + "loss": 0.754, + "step": 12800 + }, + { + "epoch": 0.3502823313873204, + "grad_norm": 2.75, + "learning_rate": 6.515313536782649e-06, + "loss": 0.7247, + "step": 12810 + }, + { + "epoch": 0.35055577583024566, + "grad_norm": 2.78125, + "learning_rate": 6.5125716322557655e-06, + "loss": 0.6583, + "step": 12820 + }, + { + "epoch": 0.350829220273171, + "grad_norm": 2.609375, + "learning_rate": 6.509829727728881e-06, + "loss": 0.6691, + "step": 12830 + }, + { + "epoch": 0.3511026647160963, + "grad_norm": 3.265625, + "learning_rate": 6.5070878232019965e-06, + "loss": 0.6095, + "step": 12840 + }, + { + "epoch": 0.3513761091590216, + "grad_norm": 2.484375, + "learning_rate": 6.504345918675112e-06, + "loss": 0.7547, + "step": 12850 + }, + { + "epoch": 0.35164955360194694, + "grad_norm": 3.0, + "learning_rate": 6.501604014148228e-06, + "loss": 0.7327, + "step": 12860 + }, + { + "epoch": 0.35192299804487226, + "grad_norm": 2.40625, + "learning_rate": 6.498862109621343e-06, + "loss": 0.6493, + "step": 12870 + }, + { + "epoch": 0.3521964424877975, + "grad_norm": 2.8125, + "learning_rate": 6.496120205094459e-06, + "loss": 0.6474, + "step": 12880 + }, + { + "epoch": 0.35246988693072284, + "grad_norm": 2.5625, + "learning_rate": 6.493378300567575e-06, + "loss": 0.66, + "step": 12890 + }, + { + "epoch": 0.35274333137364816, + "grad_norm": 2.34375, + "learning_rate": 6.49063639604069e-06, + "loss": 0.6339, + "step": 12900 + }, + { + "epoch": 0.3530167758165735, + "grad_norm": 2.640625, + "learning_rate": 6.4878944915138066e-06, + "loss": 0.6468, + "step": 12910 + }, + { + "epoch": 0.3532902202594988, + "grad_norm": 2.75, + "learning_rate": 6.485152586986921e-06, + "loss": 0.6921, + "step": 12920 + }, + { + "epoch": 0.35356366470242406, + "grad_norm": 3.015625, + "learning_rate": 6.4824106824600375e-06, + "loss": 0.7402, + "step": 12930 + }, + { + "epoch": 0.3538371091453494, + "grad_norm": 3.015625, + "learning_rate": 6.479668777933152e-06, + "loss": 0.7112, + "step": 12940 + }, + { + "epoch": 0.3541105535882747, + "grad_norm": 2.71875, + "learning_rate": 6.4769268734062685e-06, + "loss": 0.6417, + "step": 12950 + }, + { + "epoch": 0.3543839980312, + "grad_norm": 2.609375, + "learning_rate": 6.474184968879385e-06, + "loss": 0.6665, + "step": 12960 + }, + { + "epoch": 0.35465744247412534, + "grad_norm": 3.09375, + "learning_rate": 6.4714430643524995e-06, + "loss": 0.6761, + "step": 12970 + }, + { + "epoch": 0.3549308869170506, + "grad_norm": 3.109375, + "learning_rate": 6.468701159825616e-06, + "loss": 0.8327, + "step": 12980 + }, + { + "epoch": 0.3552043313599759, + "grad_norm": 2.984375, + "learning_rate": 6.4659592552987304e-06, + "loss": 0.695, + "step": 12990 + }, + { + "epoch": 0.35547777580290124, + "grad_norm": 2.375, + "learning_rate": 6.463217350771847e-06, + "loss": 0.6741, + "step": 13000 + }, + { + "epoch": 0.35575122024582656, + "grad_norm": 3.375, + "learning_rate": 6.460475446244962e-06, + "loss": 0.7361, + "step": 13010 + }, + { + "epoch": 0.3560246646887519, + "grad_norm": 3.3125, + "learning_rate": 6.457733541718078e-06, + "loss": 0.7786, + "step": 13020 + }, + { + "epoch": 0.3562981091316772, + "grad_norm": 2.359375, + "learning_rate": 6.454991637191193e-06, + "loss": 0.7425, + "step": 13030 + }, + { + "epoch": 0.35657155357460246, + "grad_norm": 2.671875, + "learning_rate": 6.4522497326643095e-06, + "loss": 0.6, + "step": 13040 + }, + { + "epoch": 0.3568449980175278, + "grad_norm": 2.671875, + "learning_rate": 6.449507828137425e-06, + "loss": 0.643, + "step": 13050 + }, + { + "epoch": 0.3571184424604531, + "grad_norm": 2.53125, + "learning_rate": 6.4467659236105405e-06, + "loss": 0.6522, + "step": 13060 + }, + { + "epoch": 0.3573918869033784, + "grad_norm": 3.0625, + "learning_rate": 6.444024019083656e-06, + "loss": 0.6744, + "step": 13070 + }, + { + "epoch": 0.35766533134630374, + "grad_norm": 2.734375, + "learning_rate": 6.4412821145567715e-06, + "loss": 0.6567, + "step": 13080 + }, + { + "epoch": 0.357938775789229, + "grad_norm": 2.359375, + "learning_rate": 6.438540210029888e-06, + "loss": 0.6331, + "step": 13090 + }, + { + "epoch": 0.3582122202321543, + "grad_norm": 2.734375, + "learning_rate": 6.4357983055030024e-06, + "loss": 0.7281, + "step": 13100 + }, + { + "epoch": 0.35848566467507964, + "grad_norm": 2.578125, + "learning_rate": 6.433056400976119e-06, + "loss": 0.6825, + "step": 13110 + }, + { + "epoch": 0.35875910911800496, + "grad_norm": 2.546875, + "learning_rate": 6.430314496449233e-06, + "loss": 0.6924, + "step": 13120 + }, + { + "epoch": 0.3590325535609303, + "grad_norm": 2.65625, + "learning_rate": 6.42757259192235e-06, + "loss": 0.6499, + "step": 13130 + }, + { + "epoch": 0.35930599800385554, + "grad_norm": 2.625, + "learning_rate": 6.424830687395466e-06, + "loss": 0.6913, + "step": 13140 + }, + { + "epoch": 0.35957944244678086, + "grad_norm": 2.1875, + "learning_rate": 6.422088782868581e-06, + "loss": 0.708, + "step": 13150 + }, + { + "epoch": 0.3598528868897062, + "grad_norm": 2.796875, + "learning_rate": 6.419346878341697e-06, + "loss": 0.7335, + "step": 13160 + }, + { + "epoch": 0.3601263313326315, + "grad_norm": 2.796875, + "learning_rate": 6.4166049738148125e-06, + "loss": 0.7103, + "step": 13170 + }, + { + "epoch": 0.3603997757755568, + "grad_norm": 2.09375, + "learning_rate": 6.413863069287928e-06, + "loss": 0.6485, + "step": 13180 + }, + { + "epoch": 0.3606732202184821, + "grad_norm": 3.0, + "learning_rate": 6.4111211647610435e-06, + "loss": 0.6755, + "step": 13190 + }, + { + "epoch": 0.3609466646614074, + "grad_norm": 2.734375, + "learning_rate": 6.408379260234159e-06, + "loss": 0.6169, + "step": 13200 + }, + { + "epoch": 0.3612201091043327, + "grad_norm": 3.03125, + "learning_rate": 6.405637355707275e-06, + "loss": 0.7475, + "step": 13210 + }, + { + "epoch": 0.36149355354725804, + "grad_norm": 2.609375, + "learning_rate": 6.402895451180391e-06, + "loss": 0.7178, + "step": 13220 + }, + { + "epoch": 0.36176699799018336, + "grad_norm": 2.59375, + "learning_rate": 6.400153546653506e-06, + "loss": 0.7109, + "step": 13230 + }, + { + "epoch": 0.3620404424331087, + "grad_norm": 2.71875, + "learning_rate": 6.397411642126622e-06, + "loss": 0.7612, + "step": 13240 + }, + { + "epoch": 0.36231388687603394, + "grad_norm": 2.78125, + "learning_rate": 6.394669737599737e-06, + "loss": 0.6037, + "step": 13250 + }, + { + "epoch": 0.36258733131895926, + "grad_norm": 2.71875, + "learning_rate": 6.391927833072853e-06, + "loss": 0.7557, + "step": 13260 + }, + { + "epoch": 0.3628607757618846, + "grad_norm": 2.796875, + "learning_rate": 6.389185928545969e-06, + "loss": 0.679, + "step": 13270 + }, + { + "epoch": 0.3631342202048099, + "grad_norm": 2.984375, + "learning_rate": 6.386444024019084e-06, + "loss": 0.718, + "step": 13280 + }, + { + "epoch": 0.3634076646477352, + "grad_norm": 2.828125, + "learning_rate": 6.3837021194922e-06, + "loss": 0.6763, + "step": 13290 + }, + { + "epoch": 0.3636811090906605, + "grad_norm": 2.859375, + "learning_rate": 6.380960214965315e-06, + "loss": 0.6543, + "step": 13300 + }, + { + "epoch": 0.3639545535335858, + "grad_norm": 2.765625, + "learning_rate": 6.378218310438431e-06, + "loss": 0.6607, + "step": 13310 + }, + { + "epoch": 0.3642279979765111, + "grad_norm": 2.46875, + "learning_rate": 6.375476405911547e-06, + "loss": 0.7173, + "step": 13320 + }, + { + "epoch": 0.36450144241943644, + "grad_norm": 2.84375, + "learning_rate": 6.372734501384662e-06, + "loss": 0.6754, + "step": 13330 + }, + { + "epoch": 0.36477488686236176, + "grad_norm": 2.171875, + "learning_rate": 6.369992596857778e-06, + "loss": 0.6021, + "step": 13340 + }, + { + "epoch": 0.365048331305287, + "grad_norm": 3.03125, + "learning_rate": 6.367250692330894e-06, + "loss": 0.7117, + "step": 13350 + }, + { + "epoch": 0.36532177574821234, + "grad_norm": 2.8125, + "learning_rate": 6.364508787804009e-06, + "loss": 0.6762, + "step": 13360 + }, + { + "epoch": 0.36559522019113766, + "grad_norm": 3.078125, + "learning_rate": 6.3617668832771255e-06, + "loss": 0.788, + "step": 13370 + }, + { + "epoch": 0.365868664634063, + "grad_norm": 2.53125, + "learning_rate": 6.35902497875024e-06, + "loss": 0.7206, + "step": 13380 + }, + { + "epoch": 0.3661421090769883, + "grad_norm": 2.53125, + "learning_rate": 6.3562830742233565e-06, + "loss": 0.6886, + "step": 13390 + }, + { + "epoch": 0.3664155535199136, + "grad_norm": 2.546875, + "learning_rate": 6.353541169696472e-06, + "loss": 0.664, + "step": 13400 + }, + { + "epoch": 0.3666889979628389, + "grad_norm": 2.65625, + "learning_rate": 6.3507992651695874e-06, + "loss": 0.7328, + "step": 13410 + }, + { + "epoch": 0.3669624424057642, + "grad_norm": 2.328125, + "learning_rate": 6.348057360642703e-06, + "loss": 0.6768, + "step": 13420 + }, + { + "epoch": 0.3672358868486895, + "grad_norm": 2.765625, + "learning_rate": 6.345315456115818e-06, + "loss": 0.71, + "step": 13430 + }, + { + "epoch": 0.36750933129161484, + "grad_norm": 2.390625, + "learning_rate": 6.342573551588934e-06, + "loss": 0.6394, + "step": 13440 + }, + { + "epoch": 0.36778277573454016, + "grad_norm": 2.671875, + "learning_rate": 6.33983164706205e-06, + "loss": 0.684, + "step": 13450 + }, + { + "epoch": 0.3680562201774654, + "grad_norm": 2.03125, + "learning_rate": 6.337089742535165e-06, + "loss": 0.6506, + "step": 13460 + }, + { + "epoch": 0.36832966462039074, + "grad_norm": 2.5, + "learning_rate": 6.334347838008281e-06, + "loss": 0.7561, + "step": 13470 + }, + { + "epoch": 0.36860310906331606, + "grad_norm": 2.75, + "learning_rate": 6.331605933481396e-06, + "loss": 0.6451, + "step": 13480 + }, + { + "epoch": 0.3688765535062414, + "grad_norm": 2.875, + "learning_rate": 6.328864028954512e-06, + "loss": 0.7024, + "step": 13490 + }, + { + "epoch": 0.3691499979491667, + "grad_norm": 3.0, + "learning_rate": 6.3261221244276285e-06, + "loss": 0.7214, + "step": 13500 + }, + { + "epoch": 0.36942344239209196, + "grad_norm": 2.734375, + "learning_rate": 6.323380219900743e-06, + "loss": 0.6148, + "step": 13510 + }, + { + "epoch": 0.3696968868350173, + "grad_norm": 2.421875, + "learning_rate": 6.3206383153738594e-06, + "loss": 0.7339, + "step": 13520 + }, + { + "epoch": 0.3699703312779426, + "grad_norm": 3.0, + "learning_rate": 6.317896410846976e-06, + "loss": 0.7109, + "step": 13530 + }, + { + "epoch": 0.3702437757208679, + "grad_norm": 2.984375, + "learning_rate": 6.31515450632009e-06, + "loss": 0.7465, + "step": 13540 + }, + { + "epoch": 0.37051722016379324, + "grad_norm": 2.78125, + "learning_rate": 6.312412601793207e-06, + "loss": 0.7331, + "step": 13550 + }, + { + "epoch": 0.37079066460671856, + "grad_norm": 2.59375, + "learning_rate": 6.309670697266321e-06, + "loss": 0.6578, + "step": 13560 + }, + { + "epoch": 0.3710641090496438, + "grad_norm": 2.875, + "learning_rate": 6.306928792739438e-06, + "loss": 0.6959, + "step": 13570 + }, + { + "epoch": 0.37133755349256914, + "grad_norm": 2.734375, + "learning_rate": 6.304186888212553e-06, + "loss": 0.7239, + "step": 13580 + }, + { + "epoch": 0.37161099793549446, + "grad_norm": 2.859375, + "learning_rate": 6.301444983685669e-06, + "loss": 0.7013, + "step": 13590 + }, + { + "epoch": 0.3718844423784198, + "grad_norm": 2.953125, + "learning_rate": 6.298703079158784e-06, + "loss": 0.7123, + "step": 13600 + }, + { + "epoch": 0.3721578868213451, + "grad_norm": 2.78125, + "learning_rate": 6.2959611746319e-06, + "loss": 0.6951, + "step": 13610 + }, + { + "epoch": 0.37243133126427036, + "grad_norm": 2.53125, + "learning_rate": 6.293219270105015e-06, + "loss": 0.7117, + "step": 13620 + }, + { + "epoch": 0.3727047757071957, + "grad_norm": 2.96875, + "learning_rate": 6.2904773655781314e-06, + "loss": 0.7431, + "step": 13630 + }, + { + "epoch": 0.372978220150121, + "grad_norm": 2.640625, + "learning_rate": 6.287735461051246e-06, + "loss": 0.8444, + "step": 13640 + }, + { + "epoch": 0.3732516645930463, + "grad_norm": 2.578125, + "learning_rate": 6.284993556524362e-06, + "loss": 0.7474, + "step": 13650 + }, + { + "epoch": 0.37352510903597164, + "grad_norm": 2.5, + "learning_rate": 6.282251651997477e-06, + "loss": 0.7127, + "step": 13660 + }, + { + "epoch": 0.3737985534788969, + "grad_norm": 3.109375, + "learning_rate": 6.279509747470593e-06, + "loss": 0.7059, + "step": 13670 + }, + { + "epoch": 0.3740719979218222, + "grad_norm": 2.421875, + "learning_rate": 6.27676784294371e-06, + "loss": 0.6749, + "step": 13680 + }, + { + "epoch": 0.37434544236474754, + "grad_norm": 2.734375, + "learning_rate": 6.274025938416824e-06, + "loss": 0.6664, + "step": 13690 + }, + { + "epoch": 0.37461888680767286, + "grad_norm": 2.875, + "learning_rate": 6.271284033889941e-06, + "loss": 0.7008, + "step": 13700 + }, + { + "epoch": 0.3748923312505982, + "grad_norm": 3.40625, + "learning_rate": 6.268542129363057e-06, + "loss": 0.6752, + "step": 13710 + }, + { + "epoch": 0.3751657756935235, + "grad_norm": 3.171875, + "learning_rate": 6.265800224836172e-06, + "loss": 0.7562, + "step": 13720 + }, + { + "epoch": 0.37543922013644876, + "grad_norm": 2.515625, + "learning_rate": 6.263058320309288e-06, + "loss": 0.6944, + "step": 13730 + }, + { + "epoch": 0.3757126645793741, + "grad_norm": 3.03125, + "learning_rate": 6.260316415782403e-06, + "loss": 0.7294, + "step": 13740 + }, + { + "epoch": 0.3759861090222994, + "grad_norm": 2.6875, + "learning_rate": 6.257574511255519e-06, + "loss": 0.7171, + "step": 13750 + }, + { + "epoch": 0.3762595534652247, + "grad_norm": 2.765625, + "learning_rate": 6.254832606728634e-06, + "loss": 0.6723, + "step": 13760 + }, + { + "epoch": 0.37653299790815004, + "grad_norm": 2.6875, + "learning_rate": 6.25209070220175e-06, + "loss": 0.6431, + "step": 13770 + }, + { + "epoch": 0.3768064423510753, + "grad_norm": 2.34375, + "learning_rate": 6.249348797674865e-06, + "loss": 0.6719, + "step": 13780 + }, + { + "epoch": 0.3770798867940006, + "grad_norm": 2.421875, + "learning_rate": 6.246606893147981e-06, + "loss": 0.6918, + "step": 13790 + }, + { + "epoch": 0.37735333123692594, + "grad_norm": 2.625, + "learning_rate": 6.243864988621096e-06, + "loss": 0.6617, + "step": 13800 + }, + { + "epoch": 0.37762677567985126, + "grad_norm": 2.671875, + "learning_rate": 6.241123084094213e-06, + "loss": 0.7124, + "step": 13810 + }, + { + "epoch": 0.3779002201227766, + "grad_norm": 2.390625, + "learning_rate": 6.238381179567327e-06, + "loss": 0.5449, + "step": 13820 + }, + { + "epoch": 0.37817366456570184, + "grad_norm": 2.59375, + "learning_rate": 6.235639275040444e-06, + "loss": 0.7055, + "step": 13830 + }, + { + "epoch": 0.37844710900862716, + "grad_norm": 3.140625, + "learning_rate": 6.232897370513559e-06, + "loss": 0.6791, + "step": 13840 + }, + { + "epoch": 0.3787205534515525, + "grad_norm": 3.09375, + "learning_rate": 6.230155465986675e-06, + "loss": 0.7459, + "step": 13850 + }, + { + "epoch": 0.3789939978944778, + "grad_norm": 2.640625, + "learning_rate": 6.227413561459791e-06, + "loss": 0.7265, + "step": 13860 + }, + { + "epoch": 0.3792674423374031, + "grad_norm": 2.59375, + "learning_rate": 6.2246716569329055e-06, + "loss": 0.6804, + "step": 13870 + }, + { + "epoch": 0.3795408867803284, + "grad_norm": 2.453125, + "learning_rate": 6.221929752406022e-06, + "loss": 0.6714, + "step": 13880 + }, + { + "epoch": 0.3798143312232537, + "grad_norm": 3.328125, + "learning_rate": 6.219187847879138e-06, + "loss": 0.7088, + "step": 13890 + }, + { + "epoch": 0.380087775666179, + "grad_norm": 2.4375, + "learning_rate": 6.216445943352253e-06, + "loss": 0.6568, + "step": 13900 + }, + { + "epoch": 0.38036122010910434, + "grad_norm": 2.8125, + "learning_rate": 6.213704038825369e-06, + "loss": 0.7247, + "step": 13910 + }, + { + "epoch": 0.38063466455202966, + "grad_norm": 2.9375, + "learning_rate": 6.210962134298484e-06, + "loss": 0.6368, + "step": 13920 + }, + { + "epoch": 0.380908108994955, + "grad_norm": 2.421875, + "learning_rate": 6.2082202297716e-06, + "loss": 0.6921, + "step": 13930 + }, + { + "epoch": 0.38118155343788024, + "grad_norm": 2.65625, + "learning_rate": 6.205478325244716e-06, + "loss": 0.6863, + "step": 13940 + }, + { + "epoch": 0.38145499788080556, + "grad_norm": 2.8125, + "learning_rate": 6.202736420717831e-06, + "loss": 0.6825, + "step": 13950 + }, + { + "epoch": 0.3817284423237309, + "grad_norm": 2.546875, + "learning_rate": 6.199994516190947e-06, + "loss": 0.7176, + "step": 13960 + }, + { + "epoch": 0.3820018867666562, + "grad_norm": 3.09375, + "learning_rate": 6.197252611664062e-06, + "loss": 0.6778, + "step": 13970 + }, + { + "epoch": 0.3822753312095815, + "grad_norm": 3.171875, + "learning_rate": 6.1945107071371775e-06, + "loss": 0.7316, + "step": 13980 + }, + { + "epoch": 0.3825487756525068, + "grad_norm": 3.1875, + "learning_rate": 6.191768802610294e-06, + "loss": 0.7122, + "step": 13990 + }, + { + "epoch": 0.3828222200954321, + "grad_norm": 2.640625, + "learning_rate": 6.189026898083409e-06, + "loss": 0.6813, + "step": 14000 + }, + { + "epoch": 0.3830956645383574, + "grad_norm": 2.84375, + "learning_rate": 6.186284993556525e-06, + "loss": 0.7347, + "step": 14010 + }, + { + "epoch": 0.38336910898128274, + "grad_norm": 2.921875, + "learning_rate": 6.18354308902964e-06, + "loss": 0.7135, + "step": 14020 + }, + { + "epoch": 0.38364255342420805, + "grad_norm": 3.046875, + "learning_rate": 6.180801184502756e-06, + "loss": 0.7529, + "step": 14030 + }, + { + "epoch": 0.3839159978671333, + "grad_norm": 2.90625, + "learning_rate": 6.178059279975872e-06, + "loss": 0.7285, + "step": 14040 + }, + { + "epoch": 0.38418944231005864, + "grad_norm": 2.484375, + "learning_rate": 6.175317375448987e-06, + "loss": 0.7314, + "step": 14050 + }, + { + "epoch": 0.38446288675298396, + "grad_norm": 2.953125, + "learning_rate": 6.172575470922103e-06, + "loss": 0.7224, + "step": 14060 + }, + { + "epoch": 0.3847363311959093, + "grad_norm": 2.953125, + "learning_rate": 6.169833566395219e-06, + "loss": 0.6825, + "step": 14070 + }, + { + "epoch": 0.3850097756388346, + "grad_norm": 2.96875, + "learning_rate": 6.167091661868334e-06, + "loss": 0.7531, + "step": 14080 + }, + { + "epoch": 0.3852832200817599, + "grad_norm": 2.859375, + "learning_rate": 6.16434975734145e-06, + "loss": 0.6188, + "step": 14090 + }, + { + "epoch": 0.3855566645246852, + "grad_norm": 2.796875, + "learning_rate": 6.161607852814565e-06, + "loss": 0.7209, + "step": 14100 + }, + { + "epoch": 0.3858301089676105, + "grad_norm": 3.0625, + "learning_rate": 6.158865948287681e-06, + "loss": 0.7137, + "step": 14110 + }, + { + "epoch": 0.3861035534105358, + "grad_norm": 3.265625, + "learning_rate": 6.156124043760797e-06, + "loss": 0.7105, + "step": 14120 + }, + { + "epoch": 0.38637699785346113, + "grad_norm": 2.25, + "learning_rate": 6.153382139233912e-06, + "loss": 0.6481, + "step": 14130 + }, + { + "epoch": 0.38665044229638645, + "grad_norm": 2.6875, + "learning_rate": 6.150640234707028e-06, + "loss": 0.647, + "step": 14140 + }, + { + "epoch": 0.3869238867393117, + "grad_norm": 2.609375, + "learning_rate": 6.147898330180143e-06, + "loss": 0.7331, + "step": 14150 + }, + { + "epoch": 0.38719733118223704, + "grad_norm": 2.6875, + "learning_rate": 6.14515642565326e-06, + "loss": 0.6945, + "step": 14160 + }, + { + "epoch": 0.38747077562516236, + "grad_norm": 2.9375, + "learning_rate": 6.142414521126375e-06, + "loss": 0.7456, + "step": 14170 + }, + { + "epoch": 0.3877442200680877, + "grad_norm": 2.515625, + "learning_rate": 6.1396726165994906e-06, + "loss": 0.6746, + "step": 14180 + }, + { + "epoch": 0.388017664511013, + "grad_norm": 2.40625, + "learning_rate": 6.136930712072606e-06, + "loss": 0.6863, + "step": 14190 + }, + { + "epoch": 0.38829110895393826, + "grad_norm": 2.890625, + "learning_rate": 6.1341888075457215e-06, + "loss": 0.779, + "step": 14200 + }, + { + "epoch": 0.3885645533968636, + "grad_norm": 2.453125, + "learning_rate": 6.131446903018837e-06, + "loss": 0.6763, + "step": 14210 + }, + { + "epoch": 0.3888379978397889, + "grad_norm": 2.640625, + "learning_rate": 6.128704998491953e-06, + "loss": 0.6615, + "step": 14220 + }, + { + "epoch": 0.3891114422827142, + "grad_norm": 2.640625, + "learning_rate": 6.125963093965068e-06, + "loss": 0.6847, + "step": 14230 + }, + { + "epoch": 0.38938488672563953, + "grad_norm": 2.90625, + "learning_rate": 6.123221189438184e-06, + "loss": 0.6729, + "step": 14240 + }, + { + "epoch": 0.38965833116856485, + "grad_norm": 2.390625, + "learning_rate": 6.120479284911301e-06, + "loss": 0.6582, + "step": 14250 + }, + { + "epoch": 0.3899317756114901, + "grad_norm": 2.796875, + "learning_rate": 6.117737380384415e-06, + "loss": 0.7595, + "step": 14260 + }, + { + "epoch": 0.39020522005441544, + "grad_norm": 2.625, + "learning_rate": 6.114995475857532e-06, + "loss": 0.6754, + "step": 14270 + }, + { + "epoch": 0.39047866449734076, + "grad_norm": 2.40625, + "learning_rate": 6.112253571330646e-06, + "loss": 0.7241, + "step": 14280 + }, + { + "epoch": 0.3907521089402661, + "grad_norm": 2.796875, + "learning_rate": 6.1095116668037626e-06, + "loss": 0.6959, + "step": 14290 + }, + { + "epoch": 0.3910255533831914, + "grad_norm": 2.96875, + "learning_rate": 6.106769762276878e-06, + "loss": 0.7203, + "step": 14300 + }, + { + "epoch": 0.39129899782611666, + "grad_norm": 2.71875, + "learning_rate": 6.1040278577499935e-06, + "loss": 0.725, + "step": 14310 + }, + { + "epoch": 0.391572442269042, + "grad_norm": 2.78125, + "learning_rate": 6.10128595322311e-06, + "loss": 0.7359, + "step": 14320 + }, + { + "epoch": 0.3918458867119673, + "grad_norm": 2.265625, + "learning_rate": 6.0985440486962245e-06, + "loss": 0.6918, + "step": 14330 + }, + { + "epoch": 0.3921193311548926, + "grad_norm": 2.5625, + "learning_rate": 6.095802144169341e-06, + "loss": 0.6649, + "step": 14340 + }, + { + "epoch": 0.39239277559781793, + "grad_norm": 3.640625, + "learning_rate": 6.093060239642456e-06, + "loss": 0.7633, + "step": 14350 + }, + { + "epoch": 0.3926662200407432, + "grad_norm": 3.140625, + "learning_rate": 6.090318335115572e-06, + "loss": 0.6541, + "step": 14360 + }, + { + "epoch": 0.3929396644836685, + "grad_norm": 2.9375, + "learning_rate": 6.087576430588687e-06, + "loss": 0.6348, + "step": 14370 + }, + { + "epoch": 0.39321310892659384, + "grad_norm": 2.625, + "learning_rate": 6.084834526061803e-06, + "loss": 0.6468, + "step": 14380 + }, + { + "epoch": 0.39348655336951915, + "grad_norm": 2.71875, + "learning_rate": 6.082092621534918e-06, + "loss": 0.7333, + "step": 14390 + }, + { + "epoch": 0.3937599978124445, + "grad_norm": 2.953125, + "learning_rate": 6.0793507170080346e-06, + "loss": 0.7096, + "step": 14400 + }, + { + "epoch": 0.3940334422553698, + "grad_norm": 2.90625, + "learning_rate": 6.076608812481149e-06, + "loss": 0.7081, + "step": 14410 + }, + { + "epoch": 0.39430688669829506, + "grad_norm": 3.15625, + "learning_rate": 6.0738669079542655e-06, + "loss": 0.7049, + "step": 14420 + }, + { + "epoch": 0.3945803311412204, + "grad_norm": 2.9375, + "learning_rate": 6.071125003427382e-06, + "loss": 0.6019, + "step": 14430 + }, + { + "epoch": 0.3948537755841457, + "grad_norm": 2.875, + "learning_rate": 6.0683830989004965e-06, + "loss": 0.6275, + "step": 14440 + }, + { + "epoch": 0.395127220027071, + "grad_norm": 3.171875, + "learning_rate": 6.065641194373613e-06, + "loss": 0.7434, + "step": 14450 + }, + { + "epoch": 0.39540066446999633, + "grad_norm": 2.609375, + "learning_rate": 6.0628992898467275e-06, + "loss": 0.6777, + "step": 14460 + }, + { + "epoch": 0.3956741089129216, + "grad_norm": 2.75, + "learning_rate": 6.060157385319844e-06, + "loss": 0.6286, + "step": 14470 + }, + { + "epoch": 0.3959475533558469, + "grad_norm": 2.40625, + "learning_rate": 6.05741548079296e-06, + "loss": 0.6864, + "step": 14480 + }, + { + "epoch": 0.39622099779877223, + "grad_norm": 2.75, + "learning_rate": 6.054673576266075e-06, + "loss": 0.7356, + "step": 14490 + }, + { + "epoch": 0.39649444224169755, + "grad_norm": 2.859375, + "learning_rate": 6.051931671739191e-06, + "loss": 0.7372, + "step": 14500 + }, + { + "epoch": 0.3967678866846229, + "grad_norm": 3.03125, + "learning_rate": 6.049189767212306e-06, + "loss": 0.6744, + "step": 14510 + }, + { + "epoch": 0.39704133112754814, + "grad_norm": 3.140625, + "learning_rate": 6.046447862685422e-06, + "loss": 0.6878, + "step": 14520 + }, + { + "epoch": 0.39731477557047346, + "grad_norm": 2.609375, + "learning_rate": 6.0437059581585375e-06, + "loss": 0.6424, + "step": 14530 + }, + { + "epoch": 0.3975882200133988, + "grad_norm": 2.4375, + "learning_rate": 6.040964053631653e-06, + "loss": 0.6246, + "step": 14540 + }, + { + "epoch": 0.3978616644563241, + "grad_norm": 2.6875, + "learning_rate": 6.0382221491047685e-06, + "loss": 0.6473, + "step": 14550 + }, + { + "epoch": 0.3981351088992494, + "grad_norm": 2.640625, + "learning_rate": 6.035480244577885e-06, + "loss": 0.7126, + "step": 14560 + }, + { + "epoch": 0.3984085533421747, + "grad_norm": 2.34375, + "learning_rate": 6.0327383400509995e-06, + "loss": 0.6815, + "step": 14570 + }, + { + "epoch": 0.3986819977851, + "grad_norm": 2.96875, + "learning_rate": 6.029996435524116e-06, + "loss": 0.727, + "step": 14580 + }, + { + "epoch": 0.3989554422280253, + "grad_norm": 2.640625, + "learning_rate": 6.02725453099723e-06, + "loss": 0.6589, + "step": 14590 + }, + { + "epoch": 0.39922888667095063, + "grad_norm": 3.390625, + "learning_rate": 6.024512626470347e-06, + "loss": 0.7331, + "step": 14600 + }, + { + "epoch": 0.39950233111387595, + "grad_norm": 2.453125, + "learning_rate": 6.021770721943463e-06, + "loss": 0.639, + "step": 14610 + }, + { + "epoch": 0.39977577555680127, + "grad_norm": 2.140625, + "learning_rate": 6.019028817416578e-06, + "loss": 0.7135, + "step": 14620 + }, + { + "epoch": 0.40004921999972654, + "grad_norm": 2.484375, + "learning_rate": 6.016286912889694e-06, + "loss": 0.6038, + "step": 14630 + }, + { + "epoch": 0.40032266444265185, + "grad_norm": 2.734375, + "learning_rate": 6.013545008362809e-06, + "loss": 0.6952, + "step": 14640 + }, + { + "epoch": 0.4005961088855772, + "grad_norm": 2.65625, + "learning_rate": 6.010803103835925e-06, + "loss": 0.6579, + "step": 14650 + }, + { + "epoch": 0.4008695533285025, + "grad_norm": 2.6875, + "learning_rate": 6.008061199309041e-06, + "loss": 0.7139, + "step": 14660 + }, + { + "epoch": 0.4011429977714278, + "grad_norm": 2.890625, + "learning_rate": 6.005319294782156e-06, + "loss": 0.636, + "step": 14670 + }, + { + "epoch": 0.4014164422143531, + "grad_norm": 2.453125, + "learning_rate": 6.002577390255272e-06, + "loss": 0.595, + "step": 14680 + }, + { + "epoch": 0.4016898866572784, + "grad_norm": 2.671875, + "learning_rate": 5.999835485728387e-06, + "loss": 0.7053, + "step": 14690 + }, + { + "epoch": 0.4019633311002037, + "grad_norm": 2.71875, + "learning_rate": 5.997093581201503e-06, + "loss": 0.7116, + "step": 14700 + }, + { + "epoch": 0.40223677554312903, + "grad_norm": 2.65625, + "learning_rate": 5.994351676674619e-06, + "loss": 0.6998, + "step": 14710 + }, + { + "epoch": 0.40251021998605435, + "grad_norm": 2.703125, + "learning_rate": 5.991609772147734e-06, + "loss": 0.6792, + "step": 14720 + }, + { + "epoch": 0.4027836644289796, + "grad_norm": 3.359375, + "learning_rate": 5.98886786762085e-06, + "loss": 0.7268, + "step": 14730 + }, + { + "epoch": 0.40305710887190493, + "grad_norm": 3.0625, + "learning_rate": 5.986125963093966e-06, + "loss": 0.7736, + "step": 14740 + }, + { + "epoch": 0.40333055331483025, + "grad_norm": 2.3125, + "learning_rate": 5.983384058567081e-06, + "loss": 0.6195, + "step": 14750 + }, + { + "epoch": 0.4036039977577556, + "grad_norm": 2.8125, + "learning_rate": 5.980642154040197e-06, + "loss": 0.6881, + "step": 14760 + }, + { + "epoch": 0.4038774422006809, + "grad_norm": 3.03125, + "learning_rate": 5.977900249513312e-06, + "loss": 0.7166, + "step": 14770 + }, + { + "epoch": 0.4041508866436062, + "grad_norm": 2.96875, + "learning_rate": 5.975158344986428e-06, + "loss": 0.7411, + "step": 14780 + }, + { + "epoch": 0.4044243310865315, + "grad_norm": 2.859375, + "learning_rate": 5.972416440459544e-06, + "loss": 0.6692, + "step": 14790 + }, + { + "epoch": 0.4046977755294568, + "grad_norm": 2.5, + "learning_rate": 5.969674535932659e-06, + "loss": 0.6725, + "step": 14800 + }, + { + "epoch": 0.4049712199723821, + "grad_norm": 2.671875, + "learning_rate": 5.966932631405775e-06, + "loss": 0.6516, + "step": 14810 + }, + { + "epoch": 0.40524466441530743, + "grad_norm": 2.890625, + "learning_rate": 5.96419072687889e-06, + "loss": 0.6855, + "step": 14820 + }, + { + "epoch": 0.40551810885823275, + "grad_norm": 2.609375, + "learning_rate": 5.961448822352006e-06, + "loss": 0.6546, + "step": 14830 + }, + { + "epoch": 0.405791553301158, + "grad_norm": 2.90625, + "learning_rate": 5.9587069178251225e-06, + "loss": 0.7487, + "step": 14840 + }, + { + "epoch": 0.40606499774408333, + "grad_norm": 2.90625, + "learning_rate": 5.955965013298237e-06, + "loss": 0.7097, + "step": 14850 + }, + { + "epoch": 0.40633844218700865, + "grad_norm": 3.5, + "learning_rate": 5.9532231087713535e-06, + "loss": 0.7235, + "step": 14860 + }, + { + "epoch": 0.40661188662993397, + "grad_norm": 2.734375, + "learning_rate": 5.950481204244468e-06, + "loss": 0.6973, + "step": 14870 + }, + { + "epoch": 0.4068853310728593, + "grad_norm": 2.796875, + "learning_rate": 5.9477392997175845e-06, + "loss": 0.768, + "step": 14880 + }, + { + "epoch": 0.40715877551578455, + "grad_norm": 2.421875, + "learning_rate": 5.9449973951907e-06, + "loss": 0.6414, + "step": 14890 + }, + { + "epoch": 0.4074322199587099, + "grad_norm": 2.53125, + "learning_rate": 5.9422554906638154e-06, + "loss": 0.6941, + "step": 14900 + }, + { + "epoch": 0.4077056644016352, + "grad_norm": 2.984375, + "learning_rate": 5.939513586136931e-06, + "loss": 0.825, + "step": 14910 + }, + { + "epoch": 0.4079791088445605, + "grad_norm": 2.703125, + "learning_rate": 5.936771681610047e-06, + "loss": 0.6626, + "step": 14920 + }, + { + "epoch": 0.40825255328748583, + "grad_norm": 2.9375, + "learning_rate": 5.934029777083162e-06, + "loss": 0.7031, + "step": 14930 + }, + { + "epoch": 0.40852599773041115, + "grad_norm": 2.671875, + "learning_rate": 5.931287872556278e-06, + "loss": 0.7209, + "step": 14940 + }, + { + "epoch": 0.4087994421733364, + "grad_norm": 3.34375, + "learning_rate": 5.928545968029394e-06, + "loss": 0.5933, + "step": 14950 + }, + { + "epoch": 0.40907288661626173, + "grad_norm": 2.421875, + "learning_rate": 5.925804063502509e-06, + "loss": 0.7659, + "step": 14960 + }, + { + "epoch": 0.40934633105918705, + "grad_norm": 2.96875, + "learning_rate": 5.9230621589756255e-06, + "loss": 0.5858, + "step": 14970 + }, + { + "epoch": 0.40961977550211237, + "grad_norm": 2.84375, + "learning_rate": 5.92032025444874e-06, + "loss": 0.7104, + "step": 14980 + }, + { + "epoch": 0.4098932199450377, + "grad_norm": 3.421875, + "learning_rate": 5.9175783499218565e-06, + "loss": 0.7616, + "step": 14990 + }, + { + "epoch": 0.41016666438796295, + "grad_norm": 2.84375, + "learning_rate": 5.914836445394971e-06, + "loss": 0.6702, + "step": 15000 + }, + { + "epoch": 0.4104401088308883, + "grad_norm": 2.625, + "learning_rate": 5.9120945408680874e-06, + "loss": 0.6992, + "step": 15010 + }, + { + "epoch": 0.4107135532738136, + "grad_norm": 2.9375, + "learning_rate": 5.909352636341204e-06, + "loss": 0.7395, + "step": 15020 + }, + { + "epoch": 0.4109869977167389, + "grad_norm": 3.03125, + "learning_rate": 5.906610731814318e-06, + "loss": 0.7298, + "step": 15030 + }, + { + "epoch": 0.41126044215966423, + "grad_norm": 2.9375, + "learning_rate": 5.903868827287435e-06, + "loss": 0.7603, + "step": 15040 + }, + { + "epoch": 0.4115338866025895, + "grad_norm": 2.6875, + "learning_rate": 5.901126922760549e-06, + "loss": 0.7144, + "step": 15050 + }, + { + "epoch": 0.4118073310455148, + "grad_norm": 2.828125, + "learning_rate": 5.898385018233666e-06, + "loss": 0.7735, + "step": 15060 + }, + { + "epoch": 0.41208077548844013, + "grad_norm": 2.890625, + "learning_rate": 5.895643113706781e-06, + "loss": 0.6999, + "step": 15070 + }, + { + "epoch": 0.41235421993136545, + "grad_norm": 2.4375, + "learning_rate": 5.892901209179897e-06, + "loss": 0.726, + "step": 15080 + }, + { + "epoch": 0.41262766437429077, + "grad_norm": 2.9375, + "learning_rate": 5.890159304653012e-06, + "loss": 0.5768, + "step": 15090 + }, + { + "epoch": 0.4129011088172161, + "grad_norm": 2.453125, + "learning_rate": 5.8874174001261285e-06, + "loss": 0.7366, + "step": 15100 + }, + { + "epoch": 0.41317455326014135, + "grad_norm": 2.921875, + "learning_rate": 5.884675495599244e-06, + "loss": 0.6978, + "step": 15110 + }, + { + "epoch": 0.41344799770306667, + "grad_norm": 2.328125, + "learning_rate": 5.8819335910723594e-06, + "loss": 0.6102, + "step": 15120 + }, + { + "epoch": 0.413721442145992, + "grad_norm": 2.96875, + "learning_rate": 5.879191686545475e-06, + "loss": 0.7044, + "step": 15130 + }, + { + "epoch": 0.4139948865889173, + "grad_norm": 2.515625, + "learning_rate": 5.87644978201859e-06, + "loss": 0.6246, + "step": 15140 + }, + { + "epoch": 0.41426833103184263, + "grad_norm": 2.53125, + "learning_rate": 5.873707877491707e-06, + "loss": 0.6282, + "step": 15150 + }, + { + "epoch": 0.4145417754747679, + "grad_norm": 2.8125, + "learning_rate": 5.870965972964821e-06, + "loss": 0.7281, + "step": 15160 + }, + { + "epoch": 0.4148152199176932, + "grad_norm": 2.640625, + "learning_rate": 5.868224068437938e-06, + "loss": 0.6464, + "step": 15170 + }, + { + "epoch": 0.41508866436061853, + "grad_norm": 2.5625, + "learning_rate": 5.865482163911052e-06, + "loss": 0.7392, + "step": 15180 + }, + { + "epoch": 0.41536210880354385, + "grad_norm": 3.25, + "learning_rate": 5.862740259384169e-06, + "loss": 0.724, + "step": 15190 + }, + { + "epoch": 0.41563555324646917, + "grad_norm": 2.453125, + "learning_rate": 5.859998354857285e-06, + "loss": 0.66, + "step": 15200 + }, + { + "epoch": 0.41590899768939443, + "grad_norm": 3.21875, + "learning_rate": 5.8572564503304e-06, + "loss": 0.7156, + "step": 15210 + }, + { + "epoch": 0.41618244213231975, + "grad_norm": 3.0, + "learning_rate": 5.854514545803516e-06, + "loss": 0.6355, + "step": 15220 + }, + { + "epoch": 0.41645588657524507, + "grad_norm": 2.21875, + "learning_rate": 5.851772641276631e-06, + "loss": 0.7075, + "step": 15230 + }, + { + "epoch": 0.4167293310181704, + "grad_norm": 2.25, + "learning_rate": 5.849030736749747e-06, + "loss": 0.6611, + "step": 15240 + }, + { + "epoch": 0.4170027754610957, + "grad_norm": 3.09375, + "learning_rate": 5.846288832222862e-06, + "loss": 0.7198, + "step": 15250 + }, + { + "epoch": 0.417276219904021, + "grad_norm": 2.90625, + "learning_rate": 5.843546927695978e-06, + "loss": 0.7012, + "step": 15260 + }, + { + "epoch": 0.4175496643469463, + "grad_norm": 2.625, + "learning_rate": 5.840805023169094e-06, + "loss": 0.6907, + "step": 15270 + }, + { + "epoch": 0.4178231087898716, + "grad_norm": 3.078125, + "learning_rate": 5.83806311864221e-06, + "loss": 0.6457, + "step": 15280 + }, + { + "epoch": 0.41809655323279693, + "grad_norm": 2.84375, + "learning_rate": 5.835321214115325e-06, + "loss": 0.7494, + "step": 15290 + }, + { + "epoch": 0.41836999767572225, + "grad_norm": 2.8125, + "learning_rate": 5.832579309588441e-06, + "loss": 0.7189, + "step": 15300 + }, + { + "epoch": 0.41864344211864757, + "grad_norm": 2.546875, + "learning_rate": 5.829837405061556e-06, + "loss": 0.6462, + "step": 15310 + }, + { + "epoch": 0.41891688656157283, + "grad_norm": 2.65625, + "learning_rate": 5.827095500534672e-06, + "loss": 0.6437, + "step": 15320 + }, + { + "epoch": 0.41919033100449815, + "grad_norm": 3.1875, + "learning_rate": 5.824353596007788e-06, + "loss": 0.7495, + "step": 15330 + }, + { + "epoch": 0.41946377544742347, + "grad_norm": 3.0625, + "learning_rate": 5.8216116914809026e-06, + "loss": 0.7473, + "step": 15340 + }, + { + "epoch": 0.4197372198903488, + "grad_norm": 2.796875, + "learning_rate": 5.818869786954019e-06, + "loss": 0.6484, + "step": 15350 + }, + { + "epoch": 0.4200106643332741, + "grad_norm": 2.984375, + "learning_rate": 5.8161278824271335e-06, + "loss": 0.7195, + "step": 15360 + }, + { + "epoch": 0.42028410877619937, + "grad_norm": 2.765625, + "learning_rate": 5.81338597790025e-06, + "loss": 0.7102, + "step": 15370 + }, + { + "epoch": 0.4205575532191247, + "grad_norm": 3.046875, + "learning_rate": 5.810644073373366e-06, + "loss": 0.6974, + "step": 15380 + }, + { + "epoch": 0.42083099766205, + "grad_norm": 2.796875, + "learning_rate": 5.807902168846481e-06, + "loss": 0.7082, + "step": 15390 + }, + { + "epoch": 0.42110444210497533, + "grad_norm": 2.875, + "learning_rate": 5.805160264319597e-06, + "loss": 0.744, + "step": 15400 + }, + { + "epoch": 0.42137788654790065, + "grad_norm": 2.890625, + "learning_rate": 5.802418359792712e-06, + "loss": 0.7514, + "step": 15410 + }, + { + "epoch": 0.4216513309908259, + "grad_norm": 2.703125, + "learning_rate": 5.799676455265828e-06, + "loss": 0.6245, + "step": 15420 + }, + { + "epoch": 0.42192477543375123, + "grad_norm": 2.875, + "learning_rate": 5.7969345507389445e-06, + "loss": 0.6849, + "step": 15430 + }, + { + "epoch": 0.42219821987667655, + "grad_norm": 2.75, + "learning_rate": 5.794192646212059e-06, + "loss": 0.6904, + "step": 15440 + }, + { + "epoch": 0.42247166431960187, + "grad_norm": 2.765625, + "learning_rate": 5.791450741685175e-06, + "loss": 0.6454, + "step": 15450 + }, + { + "epoch": 0.4227451087625272, + "grad_norm": 3.046875, + "learning_rate": 5.788708837158291e-06, + "loss": 0.6852, + "step": 15460 + }, + { + "epoch": 0.4230185532054525, + "grad_norm": 2.75, + "learning_rate": 5.785966932631406e-06, + "loss": 0.6395, + "step": 15470 + }, + { + "epoch": 0.42329199764837777, + "grad_norm": 3.34375, + "learning_rate": 5.783225028104522e-06, + "loss": 0.6912, + "step": 15480 + }, + { + "epoch": 0.4235654420913031, + "grad_norm": 2.890625, + "learning_rate": 5.780483123577637e-06, + "loss": 0.6453, + "step": 15490 + }, + { + "epoch": 0.4238388865342284, + "grad_norm": 2.5, + "learning_rate": 5.777741219050753e-06, + "loss": 0.6702, + "step": 15500 + }, + { + "epoch": 0.42411233097715373, + "grad_norm": 2.859375, + "learning_rate": 5.774999314523869e-06, + "loss": 0.7166, + "step": 15510 + }, + { + "epoch": 0.42438577542007905, + "grad_norm": 2.40625, + "learning_rate": 5.772257409996984e-06, + "loss": 0.7389, + "step": 15520 + }, + { + "epoch": 0.4246592198630043, + "grad_norm": 2.65625, + "learning_rate": 5.7695155054701e-06, + "loss": 0.65, + "step": 15530 + }, + { + "epoch": 0.42493266430592963, + "grad_norm": 2.859375, + "learning_rate": 5.766773600943215e-06, + "loss": 0.7133, + "step": 15540 + }, + { + "epoch": 0.42520610874885495, + "grad_norm": 3.171875, + "learning_rate": 5.764031696416331e-06, + "loss": 0.6706, + "step": 15550 + }, + { + "epoch": 0.42547955319178027, + "grad_norm": 2.59375, + "learning_rate": 5.761289791889447e-06, + "loss": 0.7512, + "step": 15560 + }, + { + "epoch": 0.4257529976347056, + "grad_norm": 2.796875, + "learning_rate": 5.758547887362562e-06, + "loss": 0.6969, + "step": 15570 + }, + { + "epoch": 0.42602644207763085, + "grad_norm": 3.359375, + "learning_rate": 5.755805982835678e-06, + "loss": 0.7272, + "step": 15580 + }, + { + "epoch": 0.42629988652055617, + "grad_norm": 2.515625, + "learning_rate": 5.753064078308793e-06, + "loss": 0.703, + "step": 15590 + }, + { + "epoch": 0.4265733309634815, + "grad_norm": 2.984375, + "learning_rate": 5.750322173781909e-06, + "loss": 0.6982, + "step": 15600 + }, + { + "epoch": 0.4268467754064068, + "grad_norm": 2.765625, + "learning_rate": 5.747580269255026e-06, + "loss": 0.7823, + "step": 15610 + }, + { + "epoch": 0.42712021984933213, + "grad_norm": 2.8125, + "learning_rate": 5.74483836472814e-06, + "loss": 0.7053, + "step": 15620 + }, + { + "epoch": 0.42739366429225745, + "grad_norm": 2.609375, + "learning_rate": 5.742096460201257e-06, + "loss": 0.6289, + "step": 15630 + }, + { + "epoch": 0.4276671087351827, + "grad_norm": 2.8125, + "learning_rate": 5.739354555674372e-06, + "loss": 0.7626, + "step": 15640 + }, + { + "epoch": 0.42794055317810803, + "grad_norm": 2.40625, + "learning_rate": 5.736612651147488e-06, + "loss": 0.6943, + "step": 15650 + }, + { + "epoch": 0.42821399762103335, + "grad_norm": 2.984375, + "learning_rate": 5.733870746620603e-06, + "loss": 0.6606, + "step": 15660 + }, + { + "epoch": 0.42848744206395867, + "grad_norm": 3.21875, + "learning_rate": 5.7311288420937186e-06, + "loss": 0.7333, + "step": 15670 + }, + { + "epoch": 0.428760886506884, + "grad_norm": 3.234375, + "learning_rate": 5.728386937566834e-06, + "loss": 0.7562, + "step": 15680 + }, + { + "epoch": 0.42903433094980925, + "grad_norm": 2.859375, + "learning_rate": 5.72564503303995e-06, + "loss": 0.6959, + "step": 15690 + }, + { + "epoch": 0.42930777539273457, + "grad_norm": 2.78125, + "learning_rate": 5.722903128513065e-06, + "loss": 0.64, + "step": 15700 + }, + { + "epoch": 0.4295812198356599, + "grad_norm": 2.828125, + "learning_rate": 5.720161223986181e-06, + "loss": 0.7431, + "step": 15710 + }, + { + "epoch": 0.4298546642785852, + "grad_norm": 2.171875, + "learning_rate": 5.717419319459297e-06, + "loss": 0.659, + "step": 15720 + }, + { + "epoch": 0.4301281087215105, + "grad_norm": 2.578125, + "learning_rate": 5.714677414932412e-06, + "loss": 0.6002, + "step": 15730 + }, + { + "epoch": 0.4304015531644358, + "grad_norm": 2.453125, + "learning_rate": 5.711935510405529e-06, + "loss": 0.6399, + "step": 15740 + }, + { + "epoch": 0.4306749976073611, + "grad_norm": 2.609375, + "learning_rate": 5.709193605878643e-06, + "loss": 0.6559, + "step": 15750 + }, + { + "epoch": 0.43094844205028643, + "grad_norm": 2.71875, + "learning_rate": 5.70645170135176e-06, + "loss": 0.7101, + "step": 15760 + }, + { + "epoch": 0.43122188649321175, + "grad_norm": 2.296875, + "learning_rate": 5.703709796824876e-06, + "loss": 0.7159, + "step": 15770 + }, + { + "epoch": 0.43149533093613707, + "grad_norm": 2.59375, + "learning_rate": 5.7009678922979906e-06, + "loss": 0.6853, + "step": 15780 + }, + { + "epoch": 0.4317687753790624, + "grad_norm": 2.71875, + "learning_rate": 5.698225987771107e-06, + "loss": 0.7091, + "step": 15790 + }, + { + "epoch": 0.43204221982198765, + "grad_norm": 2.78125, + "learning_rate": 5.6954840832442215e-06, + "loss": 0.7035, + "step": 15800 + }, + { + "epoch": 0.43231566426491297, + "grad_norm": 2.625, + "learning_rate": 5.692742178717338e-06, + "loss": 0.6755, + "step": 15810 + }, + { + "epoch": 0.4325891087078383, + "grad_norm": 2.921875, + "learning_rate": 5.690000274190453e-06, + "loss": 0.6707, + "step": 15820 + }, + { + "epoch": 0.4328625531507636, + "grad_norm": 2.921875, + "learning_rate": 5.687258369663569e-06, + "loss": 0.7217, + "step": 15830 + }, + { + "epoch": 0.4331359975936889, + "grad_norm": 2.84375, + "learning_rate": 5.684516465136684e-06, + "loss": 0.695, + "step": 15840 + }, + { + "epoch": 0.4334094420366142, + "grad_norm": 2.953125, + "learning_rate": 5.6817745606098e-06, + "loss": 0.7308, + "step": 15850 + }, + { + "epoch": 0.4336828864795395, + "grad_norm": 2.625, + "learning_rate": 5.679032656082915e-06, + "loss": 0.6607, + "step": 15860 + }, + { + "epoch": 0.43395633092246483, + "grad_norm": 2.796875, + "learning_rate": 5.676290751556032e-06, + "loss": 0.7026, + "step": 15870 + }, + { + "epoch": 0.43422977536539015, + "grad_norm": 2.328125, + "learning_rate": 5.673548847029147e-06, + "loss": 0.6638, + "step": 15880 + }, + { + "epoch": 0.43450321980831547, + "grad_norm": 3.03125, + "learning_rate": 5.6708069425022626e-06, + "loss": 0.634, + "step": 15890 + }, + { + "epoch": 0.43477666425124073, + "grad_norm": 3.40625, + "learning_rate": 5.668065037975378e-06, + "loss": 0.8034, + "step": 15900 + }, + { + "epoch": 0.43505010869416605, + "grad_norm": 2.921875, + "learning_rate": 5.6653231334484935e-06, + "loss": 0.6354, + "step": 15910 + }, + { + "epoch": 0.43532355313709137, + "grad_norm": 3.0625, + "learning_rate": 5.66258122892161e-06, + "loss": 0.6598, + "step": 15920 + }, + { + "epoch": 0.4355969975800167, + "grad_norm": 2.75, + "learning_rate": 5.6598393243947245e-06, + "loss": 0.6071, + "step": 15930 + }, + { + "epoch": 0.435870442022942, + "grad_norm": 2.703125, + "learning_rate": 5.657097419867841e-06, + "loss": 0.6799, + "step": 15940 + }, + { + "epoch": 0.43614388646586727, + "grad_norm": 2.546875, + "learning_rate": 5.654355515340957e-06, + "loss": 0.7165, + "step": 15950 + }, + { + "epoch": 0.4364173309087926, + "grad_norm": 2.96875, + "learning_rate": 5.651613610814072e-06, + "loss": 0.704, + "step": 15960 + }, + { + "epoch": 0.4366907753517179, + "grad_norm": 2.875, + "learning_rate": 5.648871706287188e-06, + "loss": 0.7388, + "step": 15970 + }, + { + "epoch": 0.4369642197946432, + "grad_norm": 2.328125, + "learning_rate": 5.646129801760303e-06, + "loss": 0.6585, + "step": 15980 + }, + { + "epoch": 0.43723766423756855, + "grad_norm": 2.8125, + "learning_rate": 5.643387897233419e-06, + "loss": 0.5972, + "step": 15990 + }, + { + "epoch": 0.43751110868049387, + "grad_norm": 2.59375, + "learning_rate": 5.6406459927065345e-06, + "loss": 0.6451, + "step": 16000 + }, + { + "epoch": 0.43778455312341913, + "grad_norm": 2.875, + "learning_rate": 5.63790408817965e-06, + "loss": 0.6892, + "step": 16010 + }, + { + "epoch": 0.43805799756634445, + "grad_norm": 3.078125, + "learning_rate": 5.6351621836527655e-06, + "loss": 0.7897, + "step": 16020 + }, + { + "epoch": 0.43833144200926977, + "grad_norm": 2.75, + "learning_rate": 5.632420279125881e-06, + "loss": 0.624, + "step": 16030 + }, + { + "epoch": 0.4386048864521951, + "grad_norm": 2.578125, + "learning_rate": 5.629678374598997e-06, + "loss": 0.7036, + "step": 16040 + }, + { + "epoch": 0.4388783308951204, + "grad_norm": 2.65625, + "learning_rate": 5.626936470072113e-06, + "loss": 0.6196, + "step": 16050 + }, + { + "epoch": 0.43915177533804567, + "grad_norm": 2.515625, + "learning_rate": 5.624194565545228e-06, + "loss": 0.6539, + "step": 16060 + }, + { + "epoch": 0.439425219780971, + "grad_norm": 2.90625, + "learning_rate": 5.621452661018344e-06, + "loss": 0.7136, + "step": 16070 + }, + { + "epoch": 0.4396986642238963, + "grad_norm": 2.859375, + "learning_rate": 5.618710756491459e-06, + "loss": 0.7133, + "step": 16080 + }, + { + "epoch": 0.4399721086668216, + "grad_norm": 2.578125, + "learning_rate": 5.615968851964575e-06, + "loss": 0.6938, + "step": 16090 + }, + { + "epoch": 0.44024555310974695, + "grad_norm": 2.59375, + "learning_rate": 5.613226947437691e-06, + "loss": 0.6777, + "step": 16100 + }, + { + "epoch": 0.4405189975526722, + "grad_norm": 2.265625, + "learning_rate": 5.610485042910806e-06, + "loss": 0.6498, + "step": 16110 + }, + { + "epoch": 0.44079244199559753, + "grad_norm": 2.515625, + "learning_rate": 5.607743138383922e-06, + "loss": 0.6162, + "step": 16120 + }, + { + "epoch": 0.44106588643852285, + "grad_norm": 2.90625, + "learning_rate": 5.605001233857038e-06, + "loss": 0.6843, + "step": 16130 + }, + { + "epoch": 0.44133933088144817, + "grad_norm": 2.375, + "learning_rate": 5.602259329330153e-06, + "loss": 0.6719, + "step": 16140 + }, + { + "epoch": 0.4416127753243735, + "grad_norm": 2.875, + "learning_rate": 5.599517424803269e-06, + "loss": 0.6539, + "step": 16150 + }, + { + "epoch": 0.4418862197672988, + "grad_norm": 3.125, + "learning_rate": 5.596775520276384e-06, + "loss": 0.6956, + "step": 16160 + }, + { + "epoch": 0.44215966421022407, + "grad_norm": 3.046875, + "learning_rate": 5.5940336157495e-06, + "loss": 0.6771, + "step": 16170 + }, + { + "epoch": 0.4424331086531494, + "grad_norm": 2.921875, + "learning_rate": 5.591291711222616e-06, + "loss": 0.6583, + "step": 16180 + }, + { + "epoch": 0.4427065530960747, + "grad_norm": 2.859375, + "learning_rate": 5.588549806695731e-06, + "loss": 0.7375, + "step": 16190 + }, + { + "epoch": 0.442979997539, + "grad_norm": 2.859375, + "learning_rate": 5.5858079021688476e-06, + "loss": 0.7138, + "step": 16200 + }, + { + "epoch": 0.44325344198192534, + "grad_norm": 2.265625, + "learning_rate": 5.583065997641962e-06, + "loss": 0.6597, + "step": 16210 + }, + { + "epoch": 0.4435268864248506, + "grad_norm": 2.53125, + "learning_rate": 5.5803240931150785e-06, + "loss": 0.6809, + "step": 16220 + }, + { + "epoch": 0.4438003308677759, + "grad_norm": 2.828125, + "learning_rate": 5.577582188588194e-06, + "loss": 0.694, + "step": 16230 + }, + { + "epoch": 0.44407377531070125, + "grad_norm": 2.859375, + "learning_rate": 5.5748402840613095e-06, + "loss": 0.6712, + "step": 16240 + }, + { + "epoch": 0.44434721975362657, + "grad_norm": 3.046875, + "learning_rate": 5.572098379534425e-06, + "loss": 0.7146, + "step": 16250 + }, + { + "epoch": 0.4446206641965519, + "grad_norm": 3.0625, + "learning_rate": 5.5693564750075405e-06, + "loss": 0.767, + "step": 16260 + }, + { + "epoch": 0.44489410863947715, + "grad_norm": 3.125, + "learning_rate": 5.566614570480656e-06, + "loss": 0.6735, + "step": 16270 + }, + { + "epoch": 0.44516755308240247, + "grad_norm": 3.125, + "learning_rate": 5.563872665953772e-06, + "loss": 0.6784, + "step": 16280 + }, + { + "epoch": 0.4454409975253278, + "grad_norm": 2.890625, + "learning_rate": 5.561130761426887e-06, + "loss": 0.7688, + "step": 16290 + }, + { + "epoch": 0.4457144419682531, + "grad_norm": 2.78125, + "learning_rate": 5.558388856900003e-06, + "loss": 0.7248, + "step": 16300 + }, + { + "epoch": 0.4459878864111784, + "grad_norm": 2.546875, + "learning_rate": 5.5556469523731196e-06, + "loss": 0.7285, + "step": 16310 + }, + { + "epoch": 0.44626133085410374, + "grad_norm": 2.84375, + "learning_rate": 5.552905047846234e-06, + "loss": 0.7176, + "step": 16320 + }, + { + "epoch": 0.446534775297029, + "grad_norm": 2.578125, + "learning_rate": 5.5501631433193505e-06, + "loss": 0.7016, + "step": 16330 + }, + { + "epoch": 0.4468082197399543, + "grad_norm": 2.453125, + "learning_rate": 5.547421238792465e-06, + "loss": 0.6685, + "step": 16340 + }, + { + "epoch": 0.44708166418287965, + "grad_norm": 3.078125, + "learning_rate": 5.5446793342655815e-06, + "loss": 0.7495, + "step": 16350 + }, + { + "epoch": 0.44735510862580496, + "grad_norm": 2.921875, + "learning_rate": 5.541937429738698e-06, + "loss": 0.6678, + "step": 16360 + }, + { + "epoch": 0.4476285530687303, + "grad_norm": 2.859375, + "learning_rate": 5.5391955252118125e-06, + "loss": 0.7367, + "step": 16370 + }, + { + "epoch": 0.44790199751165555, + "grad_norm": 2.828125, + "learning_rate": 5.536453620684929e-06, + "loss": 0.6578, + "step": 16380 + }, + { + "epoch": 0.44817544195458087, + "grad_norm": 2.796875, + "learning_rate": 5.5337117161580434e-06, + "loss": 0.6856, + "step": 16390 + }, + { + "epoch": 0.4484488863975062, + "grad_norm": 2.65625, + "learning_rate": 5.53096981163116e-06, + "loss": 0.7409, + "step": 16400 + }, + { + "epoch": 0.4487223308404315, + "grad_norm": 3.125, + "learning_rate": 5.528227907104275e-06, + "loss": 0.6849, + "step": 16410 + }, + { + "epoch": 0.4489957752833568, + "grad_norm": 3.375, + "learning_rate": 5.525486002577391e-06, + "loss": 0.8204, + "step": 16420 + }, + { + "epoch": 0.4492692197262821, + "grad_norm": 2.359375, + "learning_rate": 5.522744098050506e-06, + "loss": 0.6096, + "step": 16430 + }, + { + "epoch": 0.4495426641692074, + "grad_norm": 3.015625, + "learning_rate": 5.520002193523622e-06, + "loss": 0.7223, + "step": 16440 + }, + { + "epoch": 0.4498161086121327, + "grad_norm": 2.671875, + "learning_rate": 5.517260288996737e-06, + "loss": 0.7118, + "step": 16450 + }, + { + "epoch": 0.45008955305505804, + "grad_norm": 2.921875, + "learning_rate": 5.5145183844698535e-06, + "loss": 0.685, + "step": 16460 + }, + { + "epoch": 0.45036299749798336, + "grad_norm": 2.84375, + "learning_rate": 5.511776479942968e-06, + "loss": 0.757, + "step": 16470 + }, + { + "epoch": 0.4506364419409087, + "grad_norm": 2.59375, + "learning_rate": 5.5090345754160845e-06, + "loss": 0.6923, + "step": 16480 + }, + { + "epoch": 0.45090988638383395, + "grad_norm": 2.796875, + "learning_rate": 5.506292670889201e-06, + "loss": 0.7471, + "step": 16490 + }, + { + "epoch": 0.45118333082675927, + "grad_norm": 2.46875, + "learning_rate": 5.503550766362315e-06, + "loss": 0.6723, + "step": 16500 + }, + { + "epoch": 0.4514567752696846, + "grad_norm": 2.953125, + "learning_rate": 5.500808861835432e-06, + "loss": 0.6824, + "step": 16510 + }, + { + "epoch": 0.4517302197126099, + "grad_norm": 2.84375, + "learning_rate": 5.498066957308546e-06, + "loss": 0.6889, + "step": 16520 + }, + { + "epoch": 0.4520036641555352, + "grad_norm": 2.59375, + "learning_rate": 5.495325052781663e-06, + "loss": 0.6781, + "step": 16530 + }, + { + "epoch": 0.4522771085984605, + "grad_norm": 2.625, + "learning_rate": 5.492583148254779e-06, + "loss": 0.7044, + "step": 16540 + }, + { + "epoch": 0.4525505530413858, + "grad_norm": 2.796875, + "learning_rate": 5.489841243727894e-06, + "loss": 0.6045, + "step": 16550 + }, + { + "epoch": 0.4528239974843111, + "grad_norm": 3.125, + "learning_rate": 5.48709933920101e-06, + "loss": 0.7622, + "step": 16560 + }, + { + "epoch": 0.45309744192723644, + "grad_norm": 2.421875, + "learning_rate": 5.484357434674125e-06, + "loss": 0.705, + "step": 16570 + }, + { + "epoch": 0.45337088637016176, + "grad_norm": 3.171875, + "learning_rate": 5.481615530147241e-06, + "loss": 0.7218, + "step": 16580 + }, + { + "epoch": 0.453644330813087, + "grad_norm": 2.234375, + "learning_rate": 5.4788736256203565e-06, + "loss": 0.6365, + "step": 16590 + }, + { + "epoch": 0.45391777525601235, + "grad_norm": 3.484375, + "learning_rate": 5.476131721093472e-06, + "loss": 0.6723, + "step": 16600 + }, + { + "epoch": 0.45419121969893766, + "grad_norm": 2.9375, + "learning_rate": 5.473389816566587e-06, + "loss": 0.7121, + "step": 16610 + }, + { + "epoch": 0.454464664141863, + "grad_norm": 2.296875, + "learning_rate": 5.470647912039703e-06, + "loss": 0.6517, + "step": 16620 + }, + { + "epoch": 0.4547381085847883, + "grad_norm": 2.78125, + "learning_rate": 5.467906007512818e-06, + "loss": 0.6533, + "step": 16630 + }, + { + "epoch": 0.45501155302771357, + "grad_norm": 2.984375, + "learning_rate": 5.465164102985935e-06, + "loss": 0.7379, + "step": 16640 + }, + { + "epoch": 0.4552849974706389, + "grad_norm": 2.90625, + "learning_rate": 5.462422198459049e-06, + "loss": 0.7042, + "step": 16650 + }, + { + "epoch": 0.4555584419135642, + "grad_norm": 2.6875, + "learning_rate": 5.459680293932166e-06, + "loss": 0.7023, + "step": 16660 + }, + { + "epoch": 0.4558318863564895, + "grad_norm": 2.25, + "learning_rate": 5.456938389405282e-06, + "loss": 0.6324, + "step": 16670 + }, + { + "epoch": 0.45610533079941484, + "grad_norm": 2.90625, + "learning_rate": 5.454196484878397e-06, + "loss": 0.6947, + "step": 16680 + }, + { + "epoch": 0.45637877524234016, + "grad_norm": 3.65625, + "learning_rate": 5.451454580351513e-06, + "loss": 0.7733, + "step": 16690 + }, + { + "epoch": 0.4566522196852654, + "grad_norm": 2.75, + "learning_rate": 5.448712675824628e-06, + "loss": 0.7789, + "step": 16700 + }, + { + "epoch": 0.45692566412819074, + "grad_norm": 2.828125, + "learning_rate": 5.445970771297744e-06, + "loss": 0.6233, + "step": 16710 + }, + { + "epoch": 0.45719910857111606, + "grad_norm": 2.453125, + "learning_rate": 5.44322886677086e-06, + "loss": 0.6149, + "step": 16720 + }, + { + "epoch": 0.4574725530140414, + "grad_norm": 2.796875, + "learning_rate": 5.440486962243975e-06, + "loss": 0.635, + "step": 16730 + }, + { + "epoch": 0.4577459974569667, + "grad_norm": 2.59375, + "learning_rate": 5.437745057717091e-06, + "loss": 0.7475, + "step": 16740 + }, + { + "epoch": 0.45801944189989197, + "grad_norm": 2.40625, + "learning_rate": 5.435003153190206e-06, + "loss": 0.5786, + "step": 16750 + }, + { + "epoch": 0.4582928863428173, + "grad_norm": 2.625, + "learning_rate": 5.432261248663322e-06, + "loss": 0.6695, + "step": 16760 + }, + { + "epoch": 0.4585663307857426, + "grad_norm": 2.765625, + "learning_rate": 5.429519344136438e-06, + "loss": 0.7091, + "step": 16770 + }, + { + "epoch": 0.4588397752286679, + "grad_norm": 2.828125, + "learning_rate": 5.426777439609553e-06, + "loss": 0.7884, + "step": 16780 + }, + { + "epoch": 0.45911321967159324, + "grad_norm": 2.84375, + "learning_rate": 5.424035535082669e-06, + "loss": 0.7404, + "step": 16790 + }, + { + "epoch": 0.4593866641145185, + "grad_norm": 2.453125, + "learning_rate": 5.421293630555784e-06, + "loss": 0.6936, + "step": 16800 + }, + { + "epoch": 0.4596601085574438, + "grad_norm": 2.984375, + "learning_rate": 5.4185517260289e-06, + "loss": 0.7395, + "step": 16810 + }, + { + "epoch": 0.45993355300036914, + "grad_norm": 2.46875, + "learning_rate": 5.415809821502016e-06, + "loss": 0.6455, + "step": 16820 + }, + { + "epoch": 0.46020699744329446, + "grad_norm": 2.640625, + "learning_rate": 5.413067916975131e-06, + "loss": 0.7302, + "step": 16830 + }, + { + "epoch": 0.4604804418862198, + "grad_norm": 2.96875, + "learning_rate": 5.410326012448247e-06, + "loss": 0.777, + "step": 16840 + }, + { + "epoch": 0.4607538863291451, + "grad_norm": 2.78125, + "learning_rate": 5.407584107921363e-06, + "loss": 0.7175, + "step": 16850 + }, + { + "epoch": 0.46102733077207037, + "grad_norm": 2.53125, + "learning_rate": 5.404842203394478e-06, + "loss": 0.6448, + "step": 16860 + }, + { + "epoch": 0.4613007752149957, + "grad_norm": 2.890625, + "learning_rate": 5.402100298867594e-06, + "loss": 0.6362, + "step": 16870 + }, + { + "epoch": 0.461574219657921, + "grad_norm": 2.59375, + "learning_rate": 5.399358394340709e-06, + "loss": 0.6346, + "step": 16880 + }, + { + "epoch": 0.4618476641008463, + "grad_norm": 3.296875, + "learning_rate": 5.396616489813825e-06, + "loss": 0.6802, + "step": 16890 + }, + { + "epoch": 0.46212110854377164, + "grad_norm": 2.765625, + "learning_rate": 5.3938745852869415e-06, + "loss": 0.633, + "step": 16900 + }, + { + "epoch": 0.4623945529866969, + "grad_norm": 2.796875, + "learning_rate": 5.391132680760056e-06, + "loss": 0.6539, + "step": 16910 + }, + { + "epoch": 0.4626679974296222, + "grad_norm": 2.40625, + "learning_rate": 5.3883907762331724e-06, + "loss": 0.675, + "step": 16920 + }, + { + "epoch": 0.46294144187254754, + "grad_norm": 3.40625, + "learning_rate": 5.385648871706287e-06, + "loss": 0.7065, + "step": 16930 + }, + { + "epoch": 0.46321488631547286, + "grad_norm": 2.6875, + "learning_rate": 5.382906967179403e-06, + "loss": 0.7009, + "step": 16940 + }, + { + "epoch": 0.4634883307583982, + "grad_norm": 2.625, + "learning_rate": 5.380165062652519e-06, + "loss": 0.7301, + "step": 16950 + }, + { + "epoch": 0.46376177520132345, + "grad_norm": 3.0625, + "learning_rate": 5.377423158125634e-06, + "loss": 0.683, + "step": 16960 + }, + { + "epoch": 0.46403521964424876, + "grad_norm": 3.171875, + "learning_rate": 5.37468125359875e-06, + "loss": 0.7845, + "step": 16970 + }, + { + "epoch": 0.4643086640871741, + "grad_norm": 3.234375, + "learning_rate": 5.371939349071865e-06, + "loss": 0.8183, + "step": 16980 + }, + { + "epoch": 0.4645821085300994, + "grad_norm": 3.15625, + "learning_rate": 5.369197444544982e-06, + "loss": 0.6598, + "step": 16990 + }, + { + "epoch": 0.4648555529730247, + "grad_norm": 2.59375, + "learning_rate": 5.366455540018097e-06, + "loss": 0.673, + "step": 17000 + }, + { + "epoch": 0.46512899741595004, + "grad_norm": 2.71875, + "learning_rate": 5.363713635491213e-06, + "loss": 0.7532, + "step": 17010 + }, + { + "epoch": 0.4654024418588753, + "grad_norm": 2.703125, + "learning_rate": 5.360971730964328e-06, + "loss": 0.6747, + "step": 17020 + }, + { + "epoch": 0.4656758863018006, + "grad_norm": 2.796875, + "learning_rate": 5.3582298264374444e-06, + "loss": 0.6817, + "step": 17030 + }, + { + "epoch": 0.46594933074472594, + "grad_norm": 3.15625, + "learning_rate": 5.355487921910559e-06, + "loss": 0.6773, + "step": 17040 + }, + { + "epoch": 0.46622277518765126, + "grad_norm": 3.15625, + "learning_rate": 5.352746017383675e-06, + "loss": 0.7744, + "step": 17050 + }, + { + "epoch": 0.4664962196305766, + "grad_norm": 2.734375, + "learning_rate": 5.35000411285679e-06, + "loss": 0.6683, + "step": 17060 + }, + { + "epoch": 0.46676966407350184, + "grad_norm": 3.015625, + "learning_rate": 5.347262208329906e-06, + "loss": 0.7371, + "step": 17070 + }, + { + "epoch": 0.46704310851642716, + "grad_norm": 2.875, + "learning_rate": 5.344520303803023e-06, + "loss": 0.7589, + "step": 17080 + }, + { + "epoch": 0.4673165529593525, + "grad_norm": 3.203125, + "learning_rate": 5.341778399276137e-06, + "loss": 0.7275, + "step": 17090 + }, + { + "epoch": 0.4675899974022778, + "grad_norm": 3.078125, + "learning_rate": 5.339036494749254e-06, + "loss": 0.7315, + "step": 17100 + }, + { + "epoch": 0.4678634418452031, + "grad_norm": 2.890625, + "learning_rate": 5.336294590222368e-06, + "loss": 0.6282, + "step": 17110 + }, + { + "epoch": 0.4681368862881284, + "grad_norm": 2.78125, + "learning_rate": 5.333552685695485e-06, + "loss": 0.6921, + "step": 17120 + }, + { + "epoch": 0.4684103307310537, + "grad_norm": 3.46875, + "learning_rate": 5.3308107811686e-06, + "loss": 0.7187, + "step": 17130 + }, + { + "epoch": 0.468683775173979, + "grad_norm": 2.953125, + "learning_rate": 5.328068876641716e-06, + "loss": 0.7035, + "step": 17140 + }, + { + "epoch": 0.46895721961690434, + "grad_norm": 2.484375, + "learning_rate": 5.325326972114832e-06, + "loss": 0.7026, + "step": 17150 + }, + { + "epoch": 0.46923066405982966, + "grad_norm": 2.859375, + "learning_rate": 5.322585067587947e-06, + "loss": 0.735, + "step": 17160 + }, + { + "epoch": 0.469504108502755, + "grad_norm": 2.578125, + "learning_rate": 5.319843163061063e-06, + "loss": 0.7784, + "step": 17170 + }, + { + "epoch": 0.46977755294568024, + "grad_norm": 2.953125, + "learning_rate": 5.317101258534178e-06, + "loss": 0.7082, + "step": 17180 + }, + { + "epoch": 0.47005099738860556, + "grad_norm": 3.3125, + "learning_rate": 5.314359354007294e-06, + "loss": 0.7477, + "step": 17190 + }, + { + "epoch": 0.4703244418315309, + "grad_norm": 3.0, + "learning_rate": 5.311617449480409e-06, + "loss": 0.7412, + "step": 17200 + }, + { + "epoch": 0.4705978862744562, + "grad_norm": 2.453125, + "learning_rate": 5.308875544953526e-06, + "loss": 0.7294, + "step": 17210 + }, + { + "epoch": 0.4708713307173815, + "grad_norm": 2.875, + "learning_rate": 5.30613364042664e-06, + "loss": 0.6907, + "step": 17220 + }, + { + "epoch": 0.4711447751603068, + "grad_norm": 2.75, + "learning_rate": 5.303391735899757e-06, + "loss": 0.6658, + "step": 17230 + }, + { + "epoch": 0.4714182196032321, + "grad_norm": 2.859375, + "learning_rate": 5.300649831372871e-06, + "loss": 0.6401, + "step": 17240 + }, + { + "epoch": 0.4716916640461574, + "grad_norm": 3.015625, + "learning_rate": 5.297907926845988e-06, + "loss": 0.695, + "step": 17250 + }, + { + "epoch": 0.47196510848908274, + "grad_norm": 2.859375, + "learning_rate": 5.295166022319104e-06, + "loss": 0.6576, + "step": 17260 + }, + { + "epoch": 0.47223855293200806, + "grad_norm": 2.75, + "learning_rate": 5.2924241177922185e-06, + "loss": 0.7626, + "step": 17270 + }, + { + "epoch": 0.4725119973749333, + "grad_norm": 2.734375, + "learning_rate": 5.289682213265335e-06, + "loss": 0.706, + "step": 17280 + }, + { + "epoch": 0.47278544181785864, + "grad_norm": 2.578125, + "learning_rate": 5.2869403087384495e-06, + "loss": 0.7233, + "step": 17290 + }, + { + "epoch": 0.47305888626078396, + "grad_norm": 2.765625, + "learning_rate": 5.284198404211566e-06, + "loss": 0.6719, + "step": 17300 + }, + { + "epoch": 0.4733323307037093, + "grad_norm": 2.6875, + "learning_rate": 5.281456499684682e-06, + "loss": 0.7049, + "step": 17310 + }, + { + "epoch": 0.4736057751466346, + "grad_norm": 2.34375, + "learning_rate": 5.278714595157797e-06, + "loss": 0.691, + "step": 17320 + }, + { + "epoch": 0.47387921958955986, + "grad_norm": 2.734375, + "learning_rate": 5.275972690630913e-06, + "loss": 0.7139, + "step": 17330 + }, + { + "epoch": 0.4741526640324852, + "grad_norm": 2.703125, + "learning_rate": 5.273230786104029e-06, + "loss": 0.6278, + "step": 17340 + }, + { + "epoch": 0.4744261084754105, + "grad_norm": 2.703125, + "learning_rate": 5.270488881577144e-06, + "loss": 0.5927, + "step": 17350 + }, + { + "epoch": 0.4746995529183358, + "grad_norm": 3.015625, + "learning_rate": 5.26774697705026e-06, + "loss": 0.6849, + "step": 17360 + }, + { + "epoch": 0.47497299736126114, + "grad_norm": 2.28125, + "learning_rate": 5.265005072523375e-06, + "loss": 0.7456, + "step": 17370 + }, + { + "epoch": 0.47524644180418646, + "grad_norm": 2.46875, + "learning_rate": 5.2622631679964905e-06, + "loss": 0.6039, + "step": 17380 + }, + { + "epoch": 0.4755198862471117, + "grad_norm": 2.890625, + "learning_rate": 5.259521263469607e-06, + "loss": 0.7238, + "step": 17390 + }, + { + "epoch": 0.47579333069003704, + "grad_norm": 2.5625, + "learning_rate": 5.2567793589427215e-06, + "loss": 0.7005, + "step": 17400 + }, + { + "epoch": 0.47606677513296236, + "grad_norm": 2.953125, + "learning_rate": 5.254037454415838e-06, + "loss": 0.7948, + "step": 17410 + }, + { + "epoch": 0.4763402195758877, + "grad_norm": 3.078125, + "learning_rate": 5.2512955498889525e-06, + "loss": 0.6585, + "step": 17420 + }, + { + "epoch": 0.476613664018813, + "grad_norm": 2.859375, + "learning_rate": 5.248553645362069e-06, + "loss": 0.7341, + "step": 17430 + }, + { + "epoch": 0.47688710846173826, + "grad_norm": 2.734375, + "learning_rate": 5.245811740835185e-06, + "loss": 0.6731, + "step": 17440 + }, + { + "epoch": 0.4771605529046636, + "grad_norm": 3.0, + "learning_rate": 5.2430698363083e-06, + "loss": 0.7003, + "step": 17450 + }, + { + "epoch": 0.4774339973475889, + "grad_norm": 2.84375, + "learning_rate": 5.240327931781416e-06, + "loss": 0.6762, + "step": 17460 + }, + { + "epoch": 0.4777074417905142, + "grad_norm": 2.65625, + "learning_rate": 5.237586027254531e-06, + "loss": 0.678, + "step": 17470 + }, + { + "epoch": 0.47798088623343954, + "grad_norm": 2.4375, + "learning_rate": 5.234844122727647e-06, + "loss": 0.7466, + "step": 17480 + }, + { + "epoch": 0.4782543306763648, + "grad_norm": 2.953125, + "learning_rate": 5.232102218200763e-06, + "loss": 0.6788, + "step": 17490 + }, + { + "epoch": 0.4785277751192901, + "grad_norm": 2.421875, + "learning_rate": 5.229360313673878e-06, + "loss": 0.6385, + "step": 17500 + }, + { + "epoch": 0.47880121956221544, + "grad_norm": 3.578125, + "learning_rate": 5.226618409146994e-06, + "loss": 0.6834, + "step": 17510 + }, + { + "epoch": 0.47907466400514076, + "grad_norm": 2.765625, + "learning_rate": 5.22387650462011e-06, + "loss": 0.6528, + "step": 17520 + }, + { + "epoch": 0.4793481084480661, + "grad_norm": 2.03125, + "learning_rate": 5.221134600093225e-06, + "loss": 0.6356, + "step": 17530 + }, + { + "epoch": 0.4796215528909914, + "grad_norm": 2.671875, + "learning_rate": 5.218392695566341e-06, + "loss": 0.6317, + "step": 17540 + }, + { + "epoch": 0.47989499733391666, + "grad_norm": 2.515625, + "learning_rate": 5.215650791039456e-06, + "loss": 0.6527, + "step": 17550 + }, + { + "epoch": 0.480168441776842, + "grad_norm": 2.328125, + "learning_rate": 5.212908886512572e-06, + "loss": 0.6464, + "step": 17560 + }, + { + "epoch": 0.4804418862197673, + "grad_norm": 3.03125, + "learning_rate": 5.210166981985688e-06, + "loss": 0.641, + "step": 17570 + }, + { + "epoch": 0.4807153306626926, + "grad_norm": 3.046875, + "learning_rate": 5.207425077458803e-06, + "loss": 0.7125, + "step": 17580 + }, + { + "epoch": 0.48098877510561794, + "grad_norm": 2.734375, + "learning_rate": 5.204683172931919e-06, + "loss": 0.6778, + "step": 17590 + }, + { + "epoch": 0.4812622195485432, + "grad_norm": 2.859375, + "learning_rate": 5.201941268405034e-06, + "loss": 0.7131, + "step": 17600 + }, + { + "epoch": 0.4815356639914685, + "grad_norm": 2.65625, + "learning_rate": 5.19919936387815e-06, + "loss": 0.7327, + "step": 17610 + }, + { + "epoch": 0.48180910843439384, + "grad_norm": 3.125, + "learning_rate": 5.196457459351266e-06, + "loss": 0.6985, + "step": 17620 + }, + { + "epoch": 0.48208255287731916, + "grad_norm": 2.78125, + "learning_rate": 5.193715554824381e-06, + "loss": 0.7478, + "step": 17630 + }, + { + "epoch": 0.4823559973202445, + "grad_norm": 2.59375, + "learning_rate": 5.190973650297497e-06, + "loss": 0.7122, + "step": 17640 + }, + { + "epoch": 0.48262944176316974, + "grad_norm": 2.390625, + "learning_rate": 5.188231745770612e-06, + "loss": 0.684, + "step": 17650 + }, + { + "epoch": 0.48290288620609506, + "grad_norm": 2.796875, + "learning_rate": 5.185489841243728e-06, + "loss": 0.711, + "step": 17660 + }, + { + "epoch": 0.4831763306490204, + "grad_norm": 2.8125, + "learning_rate": 5.182747936716845e-06, + "loss": 0.6777, + "step": 17670 + }, + { + "epoch": 0.4834497750919457, + "grad_norm": 2.671875, + "learning_rate": 5.180006032189959e-06, + "loss": 0.7224, + "step": 17680 + }, + { + "epoch": 0.483723219534871, + "grad_norm": 2.96875, + "learning_rate": 5.1772641276630756e-06, + "loss": 0.7227, + "step": 17690 + }, + { + "epoch": 0.48399666397779634, + "grad_norm": 2.921875, + "learning_rate": 5.174522223136191e-06, + "loss": 0.7225, + "step": 17700 + }, + { + "epoch": 0.4842701084207216, + "grad_norm": 3.3125, + "learning_rate": 5.1717803186093065e-06, + "loss": 0.6859, + "step": 17710 + }, + { + "epoch": 0.4845435528636469, + "grad_norm": 2.8125, + "learning_rate": 5.169038414082422e-06, + "loss": 0.6958, + "step": 17720 + }, + { + "epoch": 0.48481699730657224, + "grad_norm": 2.921875, + "learning_rate": 5.1662965095555375e-06, + "loss": 0.7592, + "step": 17730 + }, + { + "epoch": 0.48509044174949756, + "grad_norm": 2.625, + "learning_rate": 5.163554605028653e-06, + "loss": 0.63, + "step": 17740 + }, + { + "epoch": 0.4853638861924229, + "grad_norm": 3.140625, + "learning_rate": 5.160812700501769e-06, + "loss": 0.7445, + "step": 17750 + }, + { + "epoch": 0.48563733063534814, + "grad_norm": 3.40625, + "learning_rate": 5.158070795974884e-06, + "loss": 0.6839, + "step": 17760 + }, + { + "epoch": 0.48591077507827346, + "grad_norm": 3.28125, + "learning_rate": 5.155328891448e-06, + "loss": 0.6781, + "step": 17770 + }, + { + "epoch": 0.4861842195211988, + "grad_norm": 3.484375, + "learning_rate": 5.152586986921116e-06, + "loss": 0.753, + "step": 17780 + }, + { + "epoch": 0.4864576639641241, + "grad_norm": 2.40625, + "learning_rate": 5.149845082394231e-06, + "loss": 0.7041, + "step": 17790 + }, + { + "epoch": 0.4867311084070494, + "grad_norm": 3.265625, + "learning_rate": 5.1471031778673476e-06, + "loss": 0.6776, + "step": 17800 + }, + { + "epoch": 0.4870045528499747, + "grad_norm": 3.03125, + "learning_rate": 5.144361273340462e-06, + "loss": 0.7598, + "step": 17810 + }, + { + "epoch": 0.4872779972929, + "grad_norm": 2.671875, + "learning_rate": 5.1416193688135785e-06, + "loss": 0.7051, + "step": 17820 + }, + { + "epoch": 0.4875514417358253, + "grad_norm": 2.4375, + "learning_rate": 5.138877464286693e-06, + "loss": 0.615, + "step": 17830 + }, + { + "epoch": 0.48782488617875064, + "grad_norm": 2.9375, + "learning_rate": 5.1361355597598095e-06, + "loss": 0.6564, + "step": 17840 + }, + { + "epoch": 0.48809833062167596, + "grad_norm": 2.859375, + "learning_rate": 5.133393655232926e-06, + "loss": 0.7284, + "step": 17850 + }, + { + "epoch": 0.4883717750646013, + "grad_norm": 2.90625, + "learning_rate": 5.1306517507060405e-06, + "loss": 0.6361, + "step": 17860 + }, + { + "epoch": 0.48864521950752654, + "grad_norm": 2.640625, + "learning_rate": 5.127909846179157e-06, + "loss": 0.6992, + "step": 17870 + }, + { + "epoch": 0.48891866395045186, + "grad_norm": 2.515625, + "learning_rate": 5.125167941652272e-06, + "loss": 0.6776, + "step": 17880 + }, + { + "epoch": 0.4891921083933772, + "grad_norm": 2.828125, + "learning_rate": 5.122426037125388e-06, + "loss": 0.5967, + "step": 17890 + }, + { + "epoch": 0.4894655528363025, + "grad_norm": 2.765625, + "learning_rate": 5.119684132598503e-06, + "loss": 0.6571, + "step": 17900 + }, + { + "epoch": 0.4897389972792278, + "grad_norm": 2.78125, + "learning_rate": 5.116942228071619e-06, + "loss": 0.7763, + "step": 17910 + }, + { + "epoch": 0.4900124417221531, + "grad_norm": 2.53125, + "learning_rate": 5.114200323544734e-06, + "loss": 0.6491, + "step": 17920 + }, + { + "epoch": 0.4902858861650784, + "grad_norm": 2.8125, + "learning_rate": 5.1114584190178505e-06, + "loss": 0.6642, + "step": 17930 + }, + { + "epoch": 0.4905593306080037, + "grad_norm": 2.90625, + "learning_rate": 5.108716514490966e-06, + "loss": 0.7604, + "step": 17940 + }, + { + "epoch": 0.49083277505092904, + "grad_norm": 2.84375, + "learning_rate": 5.1059746099640815e-06, + "loss": 0.6204, + "step": 17950 + }, + { + "epoch": 0.49110621949385436, + "grad_norm": 2.96875, + "learning_rate": 5.103232705437197e-06, + "loss": 0.7275, + "step": 17960 + }, + { + "epoch": 0.4913796639367796, + "grad_norm": 2.765625, + "learning_rate": 5.1004908009103124e-06, + "loss": 0.7532, + "step": 17970 + }, + { + "epoch": 0.49165310837970494, + "grad_norm": 2.890625, + "learning_rate": 5.097748896383429e-06, + "loss": 0.6374, + "step": 17980 + }, + { + "epoch": 0.49192655282263026, + "grad_norm": 2.3125, + "learning_rate": 5.095006991856543e-06, + "loss": 0.7289, + "step": 17990 + }, + { + "epoch": 0.4921999972655556, + "grad_norm": 3.015625, + "learning_rate": 5.09226508732966e-06, + "loss": 0.7089, + "step": 18000 + }, + { + "epoch": 0.4924734417084809, + "grad_norm": 2.484375, + "learning_rate": 5.089523182802774e-06, + "loss": 0.6699, + "step": 18010 + }, + { + "epoch": 0.49274688615140616, + "grad_norm": 2.765625, + "learning_rate": 5.086781278275891e-06, + "loss": 0.6772, + "step": 18020 + }, + { + "epoch": 0.4930203305943315, + "grad_norm": 2.984375, + "learning_rate": 5.084039373749007e-06, + "loss": 0.6919, + "step": 18030 + }, + { + "epoch": 0.4932937750372568, + "grad_norm": 2.5625, + "learning_rate": 5.081297469222122e-06, + "loss": 0.6288, + "step": 18040 + }, + { + "epoch": 0.4935672194801821, + "grad_norm": 2.34375, + "learning_rate": 5.078555564695238e-06, + "loss": 0.7086, + "step": 18050 + }, + { + "epoch": 0.49384066392310744, + "grad_norm": 2.65625, + "learning_rate": 5.0758136601683535e-06, + "loss": 0.6064, + "step": 18060 + }, + { + "epoch": 0.49411410836603276, + "grad_norm": 2.96875, + "learning_rate": 5.073071755641469e-06, + "loss": 0.6964, + "step": 18070 + }, + { + "epoch": 0.494387552808958, + "grad_norm": 2.515625, + "learning_rate": 5.070329851114585e-06, + "loss": 0.6649, + "step": 18080 + }, + { + "epoch": 0.49466099725188334, + "grad_norm": 3.171875, + "learning_rate": 5.0675879465877e-06, + "loss": 0.6752, + "step": 18090 + }, + { + "epoch": 0.49493444169480866, + "grad_norm": 3.390625, + "learning_rate": 5.064846042060816e-06, + "loss": 0.7428, + "step": 18100 + }, + { + "epoch": 0.495207886137734, + "grad_norm": 2.578125, + "learning_rate": 5.062104137533932e-06, + "loss": 0.6359, + "step": 18110 + }, + { + "epoch": 0.4954813305806593, + "grad_norm": 2.703125, + "learning_rate": 5.059362233007047e-06, + "loss": 0.6202, + "step": 18120 + }, + { + "epoch": 0.49575477502358456, + "grad_norm": 2.640625, + "learning_rate": 5.056620328480163e-06, + "loss": 0.7494, + "step": 18130 + }, + { + "epoch": 0.4960282194665099, + "grad_norm": 2.375, + "learning_rate": 5.053878423953278e-06, + "loss": 0.7305, + "step": 18140 + }, + { + "epoch": 0.4963016639094352, + "grad_norm": 2.640625, + "learning_rate": 5.051136519426394e-06, + "loss": 0.7331, + "step": 18150 + }, + { + "epoch": 0.4965751083523605, + "grad_norm": 3.28125, + "learning_rate": 5.04839461489951e-06, + "loss": 0.7122, + "step": 18160 + }, + { + "epoch": 0.49684855279528584, + "grad_norm": 2.859375, + "learning_rate": 5.045652710372625e-06, + "loss": 0.6936, + "step": 18170 + }, + { + "epoch": 0.4971219972382111, + "grad_norm": 2.828125, + "learning_rate": 5.042910805845741e-06, + "loss": 0.6649, + "step": 18180 + }, + { + "epoch": 0.4973954416811364, + "grad_norm": 2.421875, + "learning_rate": 5.040168901318856e-06, + "loss": 0.6629, + "step": 18190 + }, + { + "epoch": 0.49766888612406174, + "grad_norm": 2.796875, + "learning_rate": 5.037426996791972e-06, + "loss": 0.6654, + "step": 18200 + }, + { + "epoch": 0.49794233056698706, + "grad_norm": 2.734375, + "learning_rate": 5.034685092265088e-06, + "loss": 0.6939, + "step": 18210 + }, + { + "epoch": 0.4982157750099124, + "grad_norm": 2.78125, + "learning_rate": 5.031943187738203e-06, + "loss": 0.6656, + "step": 18220 + }, + { + "epoch": 0.4984892194528377, + "grad_norm": 2.5625, + "learning_rate": 5.029201283211319e-06, + "loss": 0.7188, + "step": 18230 + }, + { + "epoch": 0.49876266389576296, + "grad_norm": 2.46875, + "learning_rate": 5.0264593786844355e-06, + "loss": 0.6231, + "step": 18240 + }, + { + "epoch": 0.4990361083386883, + "grad_norm": 2.65625, + "learning_rate": 5.02371747415755e-06, + "loss": 0.6916, + "step": 18250 + }, + { + "epoch": 0.4993095527816136, + "grad_norm": 2.78125, + "learning_rate": 5.0209755696306665e-06, + "loss": 0.6731, + "step": 18260 + }, + { + "epoch": 0.4995829972245389, + "grad_norm": 2.640625, + "learning_rate": 5.018233665103781e-06, + "loss": 0.645, + "step": 18270 + }, + { + "epoch": 0.49985644166746424, + "grad_norm": 2.9375, + "learning_rate": 5.0154917605768975e-06, + "loss": 0.7123, + "step": 18280 + }, + { + "epoch": 0.5001298861103896, + "grad_norm": 2.765625, + "learning_rate": 5.012749856050013e-06, + "loss": 0.6978, + "step": 18290 + }, + { + "epoch": 0.5004033305533149, + "grad_norm": 2.625, + "learning_rate": 5.0100079515231284e-06, + "loss": 0.6407, + "step": 18300 + }, + { + "epoch": 0.5006767749962402, + "grad_norm": 2.9375, + "learning_rate": 5.007266046996244e-06, + "loss": 0.7159, + "step": 18310 + }, + { + "epoch": 0.5009502194391654, + "grad_norm": 2.34375, + "learning_rate": 5.004524142469359e-06, + "loss": 0.6525, + "step": 18320 + }, + { + "epoch": 0.5012236638820907, + "grad_norm": 2.9375, + "learning_rate": 5.001782237942475e-06, + "loss": 0.7188, + "step": 18330 + }, + { + "epoch": 0.501497108325016, + "grad_norm": 2.640625, + "learning_rate": 4.99904033341559e-06, + "loss": 0.6594, + "step": 18340 + }, + { + "epoch": 0.5017705527679414, + "grad_norm": 2.75, + "learning_rate": 4.996298428888707e-06, + "loss": 0.6811, + "step": 18350 + }, + { + "epoch": 0.5020439972108667, + "grad_norm": 2.90625, + "learning_rate": 4.993556524361822e-06, + "loss": 0.7097, + "step": 18360 + }, + { + "epoch": 0.502317441653792, + "grad_norm": 2.4375, + "learning_rate": 4.990814619834938e-06, + "loss": 0.6983, + "step": 18370 + }, + { + "epoch": 0.5025908860967173, + "grad_norm": 2.671875, + "learning_rate": 4.988072715308053e-06, + "loss": 0.6817, + "step": 18380 + }, + { + "epoch": 0.5028643305396426, + "grad_norm": 2.921875, + "learning_rate": 4.985330810781169e-06, + "loss": 0.7247, + "step": 18390 + }, + { + "epoch": 0.503137774982568, + "grad_norm": 2.78125, + "learning_rate": 4.982588906254285e-06, + "loss": 0.6772, + "step": 18400 + }, + { + "epoch": 0.5034112194254933, + "grad_norm": 2.90625, + "learning_rate": 4.9798470017274004e-06, + "loss": 0.7281, + "step": 18410 + }, + { + "epoch": 0.5036846638684185, + "grad_norm": 2.765625, + "learning_rate": 4.977105097200516e-06, + "loss": 0.717, + "step": 18420 + }, + { + "epoch": 0.5039581083113438, + "grad_norm": 2.84375, + "learning_rate": 4.974363192673631e-06, + "loss": 0.6924, + "step": 18430 + }, + { + "epoch": 0.5042315527542691, + "grad_norm": 2.625, + "learning_rate": 4.971621288146748e-06, + "loss": 0.752, + "step": 18440 + }, + { + "epoch": 0.5045049971971944, + "grad_norm": 2.96875, + "learning_rate": 4.968879383619863e-06, + "loss": 0.7681, + "step": 18450 + }, + { + "epoch": 0.5047784416401198, + "grad_norm": 2.796875, + "learning_rate": 4.966137479092979e-06, + "loss": 0.6581, + "step": 18460 + }, + { + "epoch": 0.5050518860830451, + "grad_norm": 3.0, + "learning_rate": 4.963395574566094e-06, + "loss": 0.7711, + "step": 18470 + }, + { + "epoch": 0.5053253305259704, + "grad_norm": 2.640625, + "learning_rate": 4.96065367003921e-06, + "loss": 0.6294, + "step": 18480 + }, + { + "epoch": 0.5055987749688957, + "grad_norm": 2.75, + "learning_rate": 4.957911765512325e-06, + "loss": 0.71, + "step": 18490 + }, + { + "epoch": 0.505872219411821, + "grad_norm": 2.453125, + "learning_rate": 4.955169860985441e-06, + "loss": 0.6588, + "step": 18500 + }, + { + "epoch": 0.5061456638547464, + "grad_norm": 3.75, + "learning_rate": 4.952427956458556e-06, + "loss": 0.7516, + "step": 18510 + }, + { + "epoch": 0.5064191082976717, + "grad_norm": 2.765625, + "learning_rate": 4.949686051931672e-06, + "loss": 0.6872, + "step": 18520 + }, + { + "epoch": 0.5066925527405969, + "grad_norm": 2.578125, + "learning_rate": 4.946944147404788e-06, + "loss": 0.6495, + "step": 18530 + }, + { + "epoch": 0.5069659971835222, + "grad_norm": 2.796875, + "learning_rate": 4.944202242877903e-06, + "loss": 0.7672, + "step": 18540 + }, + { + "epoch": 0.5072394416264475, + "grad_norm": 2.890625, + "learning_rate": 4.941460338351019e-06, + "loss": 0.7386, + "step": 18550 + }, + { + "epoch": 0.5075128860693728, + "grad_norm": 2.984375, + "learning_rate": 4.938718433824134e-06, + "loss": 0.7186, + "step": 18560 + }, + { + "epoch": 0.5077863305122982, + "grad_norm": 2.984375, + "learning_rate": 4.93597652929725e-06, + "loss": 0.7155, + "step": 18570 + }, + { + "epoch": 0.5080597749552235, + "grad_norm": 2.640625, + "learning_rate": 4.933234624770366e-06, + "loss": 0.6686, + "step": 18580 + }, + { + "epoch": 0.5083332193981488, + "grad_norm": 2.390625, + "learning_rate": 4.930492720243482e-06, + "loss": 0.682, + "step": 18590 + }, + { + "epoch": 0.5086066638410741, + "grad_norm": 3.0, + "learning_rate": 4.927750815716597e-06, + "loss": 0.6497, + "step": 18600 + }, + { + "epoch": 0.5088801082839994, + "grad_norm": 2.296875, + "learning_rate": 4.925008911189713e-06, + "loss": 0.6369, + "step": 18610 + }, + { + "epoch": 0.5091535527269248, + "grad_norm": 2.78125, + "learning_rate": 4.922267006662829e-06, + "loss": 0.6718, + "step": 18620 + }, + { + "epoch": 0.5094269971698501, + "grad_norm": 3.171875, + "learning_rate": 4.919525102135944e-06, + "loss": 0.6862, + "step": 18630 + }, + { + "epoch": 0.5097004416127753, + "grad_norm": 2.953125, + "learning_rate": 4.91678319760906e-06, + "loss": 0.6695, + "step": 18640 + }, + { + "epoch": 0.5099738860557006, + "grad_norm": 2.703125, + "learning_rate": 4.914041293082175e-06, + "loss": 0.7417, + "step": 18650 + }, + { + "epoch": 0.5102473304986259, + "grad_norm": 2.765625, + "learning_rate": 4.911299388555291e-06, + "loss": 0.7005, + "step": 18660 + }, + { + "epoch": 0.5105207749415512, + "grad_norm": 2.578125, + "learning_rate": 4.908557484028406e-06, + "loss": 0.7466, + "step": 18670 + }, + { + "epoch": 0.5107942193844766, + "grad_norm": 2.796875, + "learning_rate": 4.905815579501522e-06, + "loss": 0.6777, + "step": 18680 + }, + { + "epoch": 0.5110676638274019, + "grad_norm": 3.28125, + "learning_rate": 4.903073674974637e-06, + "loss": 0.6993, + "step": 18690 + }, + { + "epoch": 0.5113411082703272, + "grad_norm": 2.53125, + "learning_rate": 4.900331770447753e-06, + "loss": 0.6274, + "step": 18700 + }, + { + "epoch": 0.5116145527132525, + "grad_norm": 2.578125, + "learning_rate": 4.897589865920869e-06, + "loss": 0.7551, + "step": 18710 + }, + { + "epoch": 0.5118879971561778, + "grad_norm": 3.328125, + "learning_rate": 4.894847961393985e-06, + "loss": 0.6499, + "step": 18720 + }, + { + "epoch": 0.5121614415991032, + "grad_norm": 3.03125, + "learning_rate": 4.8921060568671e-06, + "loss": 0.6998, + "step": 18730 + }, + { + "epoch": 0.5124348860420284, + "grad_norm": 2.34375, + "learning_rate": 4.8893641523402156e-06, + "loss": 0.5908, + "step": 18740 + }, + { + "epoch": 0.5127083304849537, + "grad_norm": 3.453125, + "learning_rate": 4.886622247813331e-06, + "loss": 0.7284, + "step": 18750 + }, + { + "epoch": 0.512981774927879, + "grad_norm": 2.8125, + "learning_rate": 4.883880343286447e-06, + "loss": 0.6817, + "step": 18760 + }, + { + "epoch": 0.5132552193708043, + "grad_norm": 2.765625, + "learning_rate": 4.881138438759563e-06, + "loss": 0.6707, + "step": 18770 + }, + { + "epoch": 0.5135286638137296, + "grad_norm": 2.890625, + "learning_rate": 4.878396534232678e-06, + "loss": 0.7267, + "step": 18780 + }, + { + "epoch": 0.513802108256655, + "grad_norm": 2.703125, + "learning_rate": 4.875654629705794e-06, + "loss": 0.7265, + "step": 18790 + }, + { + "epoch": 0.5140755526995803, + "grad_norm": 2.28125, + "learning_rate": 4.87291272517891e-06, + "loss": 0.6385, + "step": 18800 + }, + { + "epoch": 0.5143489971425056, + "grad_norm": 3.125, + "learning_rate": 4.870170820652026e-06, + "loss": 0.7145, + "step": 18810 + }, + { + "epoch": 0.5146224415854309, + "grad_norm": 3.46875, + "learning_rate": 4.867428916125141e-06, + "loss": 0.6973, + "step": 18820 + }, + { + "epoch": 0.5148958860283562, + "grad_norm": 2.6875, + "learning_rate": 4.864687011598257e-06, + "loss": 0.7059, + "step": 18830 + }, + { + "epoch": 0.5151693304712816, + "grad_norm": 3.390625, + "learning_rate": 4.861945107071372e-06, + "loss": 0.6898, + "step": 18840 + }, + { + "epoch": 0.5154427749142068, + "grad_norm": 3.0, + "learning_rate": 4.8592032025444876e-06, + "loss": 0.7593, + "step": 18850 + }, + { + "epoch": 0.5157162193571321, + "grad_norm": 2.71875, + "learning_rate": 4.856461298017603e-06, + "loss": 0.7073, + "step": 18860 + }, + { + "epoch": 0.5159896638000574, + "grad_norm": 2.90625, + "learning_rate": 4.853719393490719e-06, + "loss": 0.7068, + "step": 18870 + }, + { + "epoch": 0.5162631082429827, + "grad_norm": 3.046875, + "learning_rate": 4.850977488963835e-06, + "loss": 0.6895, + "step": 18880 + }, + { + "epoch": 0.516536552685908, + "grad_norm": 3.34375, + "learning_rate": 4.84823558443695e-06, + "loss": 0.6914, + "step": 18890 + }, + { + "epoch": 0.5168099971288334, + "grad_norm": 2.828125, + "learning_rate": 4.845493679910066e-06, + "loss": 0.6822, + "step": 18900 + }, + { + "epoch": 0.5170834415717587, + "grad_norm": 3.25, + "learning_rate": 4.842751775383181e-06, + "loss": 0.654, + "step": 18910 + }, + { + "epoch": 0.517356886014684, + "grad_norm": 2.546875, + "learning_rate": 4.840009870856297e-06, + "loss": 0.6853, + "step": 18920 + }, + { + "epoch": 0.5176303304576093, + "grad_norm": 2.4375, + "learning_rate": 4.837267966329412e-06, + "loss": 0.6625, + "step": 18930 + }, + { + "epoch": 0.5179037749005346, + "grad_norm": 2.671875, + "learning_rate": 4.834526061802529e-06, + "loss": 0.6396, + "step": 18940 + }, + { + "epoch": 0.5181772193434598, + "grad_norm": 3.203125, + "learning_rate": 4.831784157275644e-06, + "loss": 0.7489, + "step": 18950 + }, + { + "epoch": 0.5184506637863852, + "grad_norm": 2.484375, + "learning_rate": 4.8290422527487596e-06, + "loss": 0.7637, + "step": 18960 + }, + { + "epoch": 0.5187241082293105, + "grad_norm": 2.796875, + "learning_rate": 4.826300348221875e-06, + "loss": 0.7256, + "step": 18970 + }, + { + "epoch": 0.5189975526722358, + "grad_norm": 3.359375, + "learning_rate": 4.823558443694991e-06, + "loss": 0.6722, + "step": 18980 + }, + { + "epoch": 0.5192709971151611, + "grad_norm": 2.375, + "learning_rate": 4.820816539168107e-06, + "loss": 0.7188, + "step": 18990 + }, + { + "epoch": 0.5195444415580864, + "grad_norm": 2.53125, + "learning_rate": 4.818074634641222e-06, + "loss": 0.6522, + "step": 19000 + }, + { + "epoch": 0.5198178860010118, + "grad_norm": 2.6875, + "learning_rate": 4.815332730114338e-06, + "loss": 0.6654, + "step": 19010 + }, + { + "epoch": 0.5200913304439371, + "grad_norm": 2.90625, + "learning_rate": 4.812590825587453e-06, + "loss": 0.7387, + "step": 19020 + }, + { + "epoch": 0.5203647748868624, + "grad_norm": 3.09375, + "learning_rate": 4.80984892106057e-06, + "loss": 0.6441, + "step": 19030 + }, + { + "epoch": 0.5206382193297877, + "grad_norm": 2.546875, + "learning_rate": 4.807107016533685e-06, + "loss": 0.6674, + "step": 19040 + }, + { + "epoch": 0.520911663772713, + "grad_norm": 2.953125, + "learning_rate": 4.804365112006801e-06, + "loss": 0.7107, + "step": 19050 + }, + { + "epoch": 0.5211851082156382, + "grad_norm": 2.984375, + "learning_rate": 4.801623207479916e-06, + "loss": 0.7812, + "step": 19060 + }, + { + "epoch": 0.5214585526585636, + "grad_norm": 2.484375, + "learning_rate": 4.7988813029530316e-06, + "loss": 0.6002, + "step": 19070 + }, + { + "epoch": 0.5217319971014889, + "grad_norm": 2.40625, + "learning_rate": 4.796139398426147e-06, + "loss": 0.6217, + "step": 19080 + }, + { + "epoch": 0.5220054415444142, + "grad_norm": 2.703125, + "learning_rate": 4.7933974938992625e-06, + "loss": 0.6772, + "step": 19090 + }, + { + "epoch": 0.5222788859873395, + "grad_norm": 2.734375, + "learning_rate": 4.790655589372378e-06, + "loss": 0.5856, + "step": 19100 + }, + { + "epoch": 0.5225523304302648, + "grad_norm": 3.125, + "learning_rate": 4.787913684845494e-06, + "loss": 0.6684, + "step": 19110 + }, + { + "epoch": 0.5228257748731902, + "grad_norm": 3.171875, + "learning_rate": 4.78517178031861e-06, + "loss": 0.6603, + "step": 19120 + }, + { + "epoch": 0.5230992193161155, + "grad_norm": 2.546875, + "learning_rate": 4.782429875791725e-06, + "loss": 0.6754, + "step": 19130 + }, + { + "epoch": 0.5233726637590408, + "grad_norm": 2.703125, + "learning_rate": 4.779687971264841e-06, + "loss": 0.6245, + "step": 19140 + }, + { + "epoch": 0.5236461082019661, + "grad_norm": 2.703125, + "learning_rate": 4.776946066737956e-06, + "loss": 0.7126, + "step": 19150 + }, + { + "epoch": 0.5239195526448914, + "grad_norm": 3.578125, + "learning_rate": 4.774204162211073e-06, + "loss": 0.668, + "step": 19160 + }, + { + "epoch": 0.5241929970878166, + "grad_norm": 3.21875, + "learning_rate": 4.771462257684188e-06, + "loss": 0.7577, + "step": 19170 + }, + { + "epoch": 0.524466441530742, + "grad_norm": 2.5, + "learning_rate": 4.7687203531573036e-06, + "loss": 0.7556, + "step": 19180 + }, + { + "epoch": 0.5247398859736673, + "grad_norm": 2.765625, + "learning_rate": 4.765978448630419e-06, + "loss": 0.7141, + "step": 19190 + }, + { + "epoch": 0.5250133304165926, + "grad_norm": 2.734375, + "learning_rate": 4.763236544103535e-06, + "loss": 0.6281, + "step": 19200 + }, + { + "epoch": 0.5252867748595179, + "grad_norm": 2.75, + "learning_rate": 4.760494639576651e-06, + "loss": 0.7107, + "step": 19210 + }, + { + "epoch": 0.5255602193024432, + "grad_norm": 2.484375, + "learning_rate": 4.757752735049766e-06, + "loss": 0.726, + "step": 19220 + }, + { + "epoch": 0.5258336637453686, + "grad_norm": 2.609375, + "learning_rate": 4.755010830522882e-06, + "loss": 0.6732, + "step": 19230 + }, + { + "epoch": 0.5261071081882939, + "grad_norm": 3.015625, + "learning_rate": 4.752268925995997e-06, + "loss": 0.7208, + "step": 19240 + }, + { + "epoch": 0.5263805526312192, + "grad_norm": 2.8125, + "learning_rate": 4.749527021469113e-06, + "loss": 0.6911, + "step": 19250 + }, + { + "epoch": 0.5266539970741445, + "grad_norm": 3.0, + "learning_rate": 4.746785116942228e-06, + "loss": 0.7489, + "step": 19260 + }, + { + "epoch": 0.5269274415170697, + "grad_norm": 2.546875, + "learning_rate": 4.744043212415344e-06, + "loss": 0.6225, + "step": 19270 + }, + { + "epoch": 0.527200885959995, + "grad_norm": 2.21875, + "learning_rate": 4.741301307888459e-06, + "loss": 0.6847, + "step": 19280 + }, + { + "epoch": 0.5274743304029204, + "grad_norm": 2.703125, + "learning_rate": 4.7385594033615755e-06, + "loss": 0.7516, + "step": 19290 + }, + { + "epoch": 0.5277477748458457, + "grad_norm": 2.859375, + "learning_rate": 4.735817498834691e-06, + "loss": 0.7448, + "step": 19300 + }, + { + "epoch": 0.528021219288771, + "grad_norm": 2.796875, + "learning_rate": 4.7330755943078065e-06, + "loss": 0.7301, + "step": 19310 + }, + { + "epoch": 0.5282946637316963, + "grad_norm": 3.125, + "learning_rate": 4.730333689780922e-06, + "loss": 0.6996, + "step": 19320 + }, + { + "epoch": 0.5285681081746216, + "grad_norm": 2.640625, + "learning_rate": 4.7275917852540375e-06, + "loss": 0.6248, + "step": 19330 + }, + { + "epoch": 0.528841552617547, + "grad_norm": 3.140625, + "learning_rate": 4.724849880727154e-06, + "loss": 0.7418, + "step": 19340 + }, + { + "epoch": 0.5291149970604723, + "grad_norm": 2.609375, + "learning_rate": 4.722107976200269e-06, + "loss": 0.6927, + "step": 19350 + }, + { + "epoch": 0.5293884415033976, + "grad_norm": 2.734375, + "learning_rate": 4.719366071673385e-06, + "loss": 0.7105, + "step": 19360 + }, + { + "epoch": 0.5296618859463229, + "grad_norm": 2.34375, + "learning_rate": 4.7166241671465e-06, + "loss": 0.6705, + "step": 19370 + }, + { + "epoch": 0.5299353303892481, + "grad_norm": 2.8125, + "learning_rate": 4.713882262619617e-06, + "loss": 0.6294, + "step": 19380 + }, + { + "epoch": 0.5302087748321734, + "grad_norm": 3.171875, + "learning_rate": 4.711140358092732e-06, + "loss": 0.6245, + "step": 19390 + }, + { + "epoch": 0.5304822192750988, + "grad_norm": 2.390625, + "learning_rate": 4.7083984535658475e-06, + "loss": 0.7179, + "step": 19400 + }, + { + "epoch": 0.5307556637180241, + "grad_norm": 2.5, + "learning_rate": 4.705656549038963e-06, + "loss": 0.6198, + "step": 19410 + }, + { + "epoch": 0.5310291081609494, + "grad_norm": 2.703125, + "learning_rate": 4.7029146445120785e-06, + "loss": 0.7195, + "step": 19420 + }, + { + "epoch": 0.5313025526038747, + "grad_norm": 2.6875, + "learning_rate": 4.700172739985194e-06, + "loss": 0.6926, + "step": 19430 + }, + { + "epoch": 0.5315759970468, + "grad_norm": 2.78125, + "learning_rate": 4.6974308354583095e-06, + "loss": 0.6234, + "step": 19440 + }, + { + "epoch": 0.5318494414897253, + "grad_norm": 2.828125, + "learning_rate": 4.694688930931425e-06, + "loss": 0.6912, + "step": 19450 + }, + { + "epoch": 0.5321228859326507, + "grad_norm": 2.78125, + "learning_rate": 4.6919470264045404e-06, + "loss": 0.6584, + "step": 19460 + }, + { + "epoch": 0.532396330375576, + "grad_norm": 2.46875, + "learning_rate": 4.689205121877657e-06, + "loss": 0.7239, + "step": 19470 + }, + { + "epoch": 0.5326697748185012, + "grad_norm": 2.984375, + "learning_rate": 4.686463217350772e-06, + "loss": 0.7124, + "step": 19480 + }, + { + "epoch": 0.5329432192614265, + "grad_norm": 2.890625, + "learning_rate": 4.683721312823888e-06, + "loss": 0.6204, + "step": 19490 + }, + { + "epoch": 0.5332166637043518, + "grad_norm": 3.390625, + "learning_rate": 4.680979408297003e-06, + "loss": 0.7051, + "step": 19500 + }, + { + "epoch": 0.5334901081472772, + "grad_norm": 3.09375, + "learning_rate": 4.678237503770119e-06, + "loss": 0.7524, + "step": 19510 + }, + { + "epoch": 0.5337635525902025, + "grad_norm": 3.140625, + "learning_rate": 4.675495599243235e-06, + "loss": 0.679, + "step": 19520 + }, + { + "epoch": 0.5340369970331278, + "grad_norm": 2.65625, + "learning_rate": 4.6727536947163505e-06, + "loss": 0.7024, + "step": 19530 + }, + { + "epoch": 0.5343104414760531, + "grad_norm": 2.9375, + "learning_rate": 4.670011790189466e-06, + "loss": 0.6619, + "step": 19540 + }, + { + "epoch": 0.5345838859189784, + "grad_norm": 2.859375, + "learning_rate": 4.6672698856625815e-06, + "loss": 0.6081, + "step": 19550 + }, + { + "epoch": 0.5348573303619037, + "grad_norm": 2.9375, + "learning_rate": 4.664527981135698e-06, + "loss": 0.6983, + "step": 19560 + }, + { + "epoch": 0.5351307748048291, + "grad_norm": 3.171875, + "learning_rate": 4.661786076608813e-06, + "loss": 0.6668, + "step": 19570 + }, + { + "epoch": 0.5354042192477544, + "grad_norm": 2.625, + "learning_rate": 4.659044172081929e-06, + "loss": 0.6601, + "step": 19580 + }, + { + "epoch": 0.5356776636906796, + "grad_norm": 2.8125, + "learning_rate": 4.656302267555044e-06, + "loss": 0.7087, + "step": 19590 + }, + { + "epoch": 0.5359511081336049, + "grad_norm": 3.078125, + "learning_rate": 4.65356036302816e-06, + "loss": 0.7753, + "step": 19600 + }, + { + "epoch": 0.5362245525765302, + "grad_norm": 2.90625, + "learning_rate": 4.650818458501275e-06, + "loss": 0.7579, + "step": 19610 + }, + { + "epoch": 0.5364979970194556, + "grad_norm": 2.875, + "learning_rate": 4.648076553974391e-06, + "loss": 0.7217, + "step": 19620 + }, + { + "epoch": 0.5367714414623809, + "grad_norm": 3.03125, + "learning_rate": 4.645334649447506e-06, + "loss": 0.7523, + "step": 19630 + }, + { + "epoch": 0.5370448859053062, + "grad_norm": 3.359375, + "learning_rate": 4.642592744920622e-06, + "loss": 0.6836, + "step": 19640 + }, + { + "epoch": 0.5373183303482315, + "grad_norm": 3.1875, + "learning_rate": 4.639850840393738e-06, + "loss": 0.7166, + "step": 19650 + }, + { + "epoch": 0.5375917747911568, + "grad_norm": 3.109375, + "learning_rate": 4.6371089358668535e-06, + "loss": 0.7123, + "step": 19660 + }, + { + "epoch": 0.5378652192340821, + "grad_norm": 2.703125, + "learning_rate": 4.634367031339969e-06, + "loss": 0.7586, + "step": 19670 + }, + { + "epoch": 0.5381386636770075, + "grad_norm": 2.75, + "learning_rate": 4.6316251268130844e-06, + "loss": 0.6322, + "step": 19680 + }, + { + "epoch": 0.5384121081199328, + "grad_norm": 2.640625, + "learning_rate": 4.6288832222862e-06, + "loss": 0.6921, + "step": 19690 + }, + { + "epoch": 0.538685552562858, + "grad_norm": 2.453125, + "learning_rate": 4.626141317759316e-06, + "loss": 0.664, + "step": 19700 + }, + { + "epoch": 0.5389589970057833, + "grad_norm": 3.71875, + "learning_rate": 4.623399413232432e-06, + "loss": 0.7034, + "step": 19710 + }, + { + "epoch": 0.5392324414487086, + "grad_norm": 2.8125, + "learning_rate": 4.620657508705547e-06, + "loss": 0.6747, + "step": 19720 + }, + { + "epoch": 0.539505885891634, + "grad_norm": 2.640625, + "learning_rate": 4.617915604178663e-06, + "loss": 0.6734, + "step": 19730 + }, + { + "epoch": 0.5397793303345593, + "grad_norm": 3.0625, + "learning_rate": 4.615173699651779e-06, + "loss": 0.6742, + "step": 19740 + }, + { + "epoch": 0.5400527747774846, + "grad_norm": 2.734375, + "learning_rate": 4.6124317951248945e-06, + "loss": 0.7365, + "step": 19750 + }, + { + "epoch": 0.5403262192204099, + "grad_norm": 3.015625, + "learning_rate": 4.60968989059801e-06, + "loss": 0.7404, + "step": 19760 + }, + { + "epoch": 0.5405996636633352, + "grad_norm": 2.8125, + "learning_rate": 4.6069479860711255e-06, + "loss": 0.6442, + "step": 19770 + }, + { + "epoch": 0.5408731081062605, + "grad_norm": 2.765625, + "learning_rate": 4.604206081544241e-06, + "loss": 0.6004, + "step": 19780 + }, + { + "epoch": 0.5411465525491859, + "grad_norm": 2.828125, + "learning_rate": 4.601464177017356e-06, + "loss": 0.8411, + "step": 19790 + }, + { + "epoch": 0.5414199969921111, + "grad_norm": 3.03125, + "learning_rate": 4.598722272490472e-06, + "loss": 0.7472, + "step": 19800 + }, + { + "epoch": 0.5416934414350364, + "grad_norm": 2.359375, + "learning_rate": 4.595980367963588e-06, + "loss": 0.639, + "step": 19810 + }, + { + "epoch": 0.5419668858779617, + "grad_norm": 3.046875, + "learning_rate": 4.593238463436704e-06, + "loss": 0.7474, + "step": 19820 + }, + { + "epoch": 0.542240330320887, + "grad_norm": 3.171875, + "learning_rate": 4.590496558909819e-06, + "loss": 0.7224, + "step": 19830 + }, + { + "epoch": 0.5425137747638124, + "grad_norm": 2.78125, + "learning_rate": 4.587754654382935e-06, + "loss": 0.7052, + "step": 19840 + }, + { + "epoch": 0.5427872192067377, + "grad_norm": 3.265625, + "learning_rate": 4.58501274985605e-06, + "loss": 0.7569, + "step": 19850 + }, + { + "epoch": 0.543060663649663, + "grad_norm": 2.8125, + "learning_rate": 4.582270845329166e-06, + "loss": 0.7447, + "step": 19860 + }, + { + "epoch": 0.5433341080925883, + "grad_norm": 2.703125, + "learning_rate": 4.579528940802281e-06, + "loss": 0.6715, + "step": 19870 + }, + { + "epoch": 0.5436075525355136, + "grad_norm": 2.640625, + "learning_rate": 4.5767870362753975e-06, + "loss": 0.6665, + "step": 19880 + }, + { + "epoch": 0.543880996978439, + "grad_norm": 2.796875, + "learning_rate": 4.574045131748513e-06, + "loss": 0.6988, + "step": 19890 + }, + { + "epoch": 0.5441544414213643, + "grad_norm": 2.53125, + "learning_rate": 4.571303227221628e-06, + "loss": 0.6768, + "step": 19900 + }, + { + "epoch": 0.5444278858642895, + "grad_norm": 2.484375, + "learning_rate": 4.568561322694744e-06, + "loss": 0.649, + "step": 19910 + }, + { + "epoch": 0.5447013303072148, + "grad_norm": 3.484375, + "learning_rate": 4.56581941816786e-06, + "loss": 0.6657, + "step": 19920 + }, + { + "epoch": 0.5449747747501401, + "grad_norm": 3.140625, + "learning_rate": 4.563077513640976e-06, + "loss": 0.6554, + "step": 19930 + }, + { + "epoch": 0.5452482191930654, + "grad_norm": 2.671875, + "learning_rate": 4.560335609114091e-06, + "loss": 0.6416, + "step": 19940 + }, + { + "epoch": 0.5455216636359908, + "grad_norm": 3.296875, + "learning_rate": 4.557593704587207e-06, + "loss": 0.6985, + "step": 19950 + }, + { + "epoch": 0.5457951080789161, + "grad_norm": 3.34375, + "learning_rate": 4.554851800060322e-06, + "loss": 0.7019, + "step": 19960 + }, + { + "epoch": 0.5460685525218414, + "grad_norm": 2.84375, + "learning_rate": 4.5521098955334385e-06, + "loss": 0.6751, + "step": 19970 + }, + { + "epoch": 0.5463419969647667, + "grad_norm": 3.078125, + "learning_rate": 4.549367991006554e-06, + "loss": 0.65, + "step": 19980 + }, + { + "epoch": 0.546615441407692, + "grad_norm": 2.828125, + "learning_rate": 4.5466260864796695e-06, + "loss": 0.591, + "step": 19990 + }, + { + "epoch": 0.5468888858506173, + "grad_norm": 2.703125, + "learning_rate": 4.543884181952785e-06, + "loss": 0.6397, + "step": 20000 + }, + { + "epoch": 0.5471623302935427, + "grad_norm": 3.03125, + "learning_rate": 4.5411422774259e-06, + "loss": 0.7258, + "step": 20010 + }, + { + "epoch": 0.5474357747364679, + "grad_norm": 2.765625, + "learning_rate": 4.538400372899016e-06, + "loss": 0.7103, + "step": 20020 + }, + { + "epoch": 0.5477092191793932, + "grad_norm": 2.609375, + "learning_rate": 4.535658468372131e-06, + "loss": 0.7195, + "step": 20030 + }, + { + "epoch": 0.5479826636223185, + "grad_norm": 2.5, + "learning_rate": 4.532916563845247e-06, + "loss": 0.6332, + "step": 20040 + }, + { + "epoch": 0.5482561080652438, + "grad_norm": 2.984375, + "learning_rate": 4.530174659318362e-06, + "loss": 0.7484, + "step": 20050 + }, + { + "epoch": 0.5485295525081691, + "grad_norm": 2.328125, + "learning_rate": 4.527432754791479e-06, + "loss": 0.6646, + "step": 20060 + }, + { + "epoch": 0.5488029969510945, + "grad_norm": 2.84375, + "learning_rate": 4.524690850264594e-06, + "loss": 0.6387, + "step": 20070 + }, + { + "epoch": 0.5490764413940198, + "grad_norm": 2.703125, + "learning_rate": 4.52194894573771e-06, + "loss": 0.7343, + "step": 20080 + }, + { + "epoch": 0.5493498858369451, + "grad_norm": 2.53125, + "learning_rate": 4.519207041210825e-06, + "loss": 0.7343, + "step": 20090 + }, + { + "epoch": 0.5496233302798704, + "grad_norm": 2.9375, + "learning_rate": 4.5164651366839414e-06, + "loss": 0.7184, + "step": 20100 + }, + { + "epoch": 0.5498967747227957, + "grad_norm": 2.5, + "learning_rate": 4.513723232157057e-06, + "loss": 0.6625, + "step": 20110 + }, + { + "epoch": 0.550170219165721, + "grad_norm": 2.78125, + "learning_rate": 4.510981327630172e-06, + "loss": 0.6399, + "step": 20120 + }, + { + "epoch": 0.5504436636086463, + "grad_norm": 2.625, + "learning_rate": 4.508239423103288e-06, + "loss": 0.7136, + "step": 20130 + }, + { + "epoch": 0.5507171080515716, + "grad_norm": 3.234375, + "learning_rate": 4.505497518576403e-06, + "loss": 0.7013, + "step": 20140 + }, + { + "epoch": 0.5509905524944969, + "grad_norm": 3.109375, + "learning_rate": 4.50275561404952e-06, + "loss": 0.655, + "step": 20150 + }, + { + "epoch": 0.5512639969374222, + "grad_norm": 2.71875, + "learning_rate": 4.500013709522635e-06, + "loss": 0.7694, + "step": 20160 + }, + { + "epoch": 0.5515374413803475, + "grad_norm": 3.15625, + "learning_rate": 4.497271804995751e-06, + "loss": 0.7054, + "step": 20170 + }, + { + "epoch": 0.5518108858232729, + "grad_norm": 2.734375, + "learning_rate": 4.494529900468866e-06, + "loss": 0.6754, + "step": 20180 + }, + { + "epoch": 0.5520843302661982, + "grad_norm": 2.375, + "learning_rate": 4.491787995941982e-06, + "loss": 0.7457, + "step": 20190 + }, + { + "epoch": 0.5523577747091235, + "grad_norm": 2.4375, + "learning_rate": 4.489046091415097e-06, + "loss": 0.6481, + "step": 20200 + }, + { + "epoch": 0.5526312191520488, + "grad_norm": 2.78125, + "learning_rate": 4.486304186888213e-06, + "loss": 0.6407, + "step": 20210 + }, + { + "epoch": 0.5529046635949741, + "grad_norm": 2.640625, + "learning_rate": 4.483562282361328e-06, + "loss": 0.6862, + "step": 20220 + }, + { + "epoch": 0.5531781080378994, + "grad_norm": 2.828125, + "learning_rate": 4.4808203778344436e-06, + "loss": 0.6917, + "step": 20230 + }, + { + "epoch": 0.5534515524808247, + "grad_norm": 2.75, + "learning_rate": 4.47807847330756e-06, + "loss": 0.6999, + "step": 20240 + }, + { + "epoch": 0.55372499692375, + "grad_norm": 2.203125, + "learning_rate": 4.475336568780675e-06, + "loss": 0.6537, + "step": 20250 + }, + { + "epoch": 0.5539984413666753, + "grad_norm": 2.359375, + "learning_rate": 4.472594664253791e-06, + "loss": 0.659, + "step": 20260 + }, + { + "epoch": 0.5542718858096006, + "grad_norm": 2.40625, + "learning_rate": 4.469852759726906e-06, + "loss": 0.6239, + "step": 20270 + }, + { + "epoch": 0.554545330252526, + "grad_norm": 2.171875, + "learning_rate": 4.467110855200023e-06, + "loss": 0.6467, + "step": 20280 + }, + { + "epoch": 0.5548187746954513, + "grad_norm": 2.890625, + "learning_rate": 4.464368950673138e-06, + "loss": 0.6364, + "step": 20290 + }, + { + "epoch": 0.5550922191383766, + "grad_norm": 2.703125, + "learning_rate": 4.461627046146254e-06, + "loss": 0.7023, + "step": 20300 + }, + { + "epoch": 0.5553656635813019, + "grad_norm": 2.796875, + "learning_rate": 4.458885141619369e-06, + "loss": 0.7158, + "step": 20310 + }, + { + "epoch": 0.5556391080242272, + "grad_norm": 2.96875, + "learning_rate": 4.456143237092485e-06, + "loss": 0.6542, + "step": 20320 + }, + { + "epoch": 0.5559125524671524, + "grad_norm": 3.0625, + "learning_rate": 4.453401332565601e-06, + "loss": 0.7751, + "step": 20330 + }, + { + "epoch": 0.5561859969100778, + "grad_norm": 2.796875, + "learning_rate": 4.450659428038716e-06, + "loss": 0.7021, + "step": 20340 + }, + { + "epoch": 0.5564594413530031, + "grad_norm": 3.140625, + "learning_rate": 4.447917523511832e-06, + "loss": 0.7211, + "step": 20350 + }, + { + "epoch": 0.5567328857959284, + "grad_norm": 2.734375, + "learning_rate": 4.445175618984947e-06, + "loss": 0.6943, + "step": 20360 + }, + { + "epoch": 0.5570063302388537, + "grad_norm": 2.671875, + "learning_rate": 4.442433714458063e-06, + "loss": 0.748, + "step": 20370 + }, + { + "epoch": 0.557279774681779, + "grad_norm": 2.25, + "learning_rate": 4.439691809931178e-06, + "loss": 0.6461, + "step": 20380 + }, + { + "epoch": 0.5575532191247043, + "grad_norm": 2.84375, + "learning_rate": 4.436949905404294e-06, + "loss": 0.7508, + "step": 20390 + }, + { + "epoch": 0.5578266635676297, + "grad_norm": 3.0, + "learning_rate": 4.434208000877409e-06, + "loss": 0.6852, + "step": 20400 + }, + { + "epoch": 0.558100108010555, + "grad_norm": 2.625, + "learning_rate": 4.431466096350526e-06, + "loss": 0.6415, + "step": 20410 + }, + { + "epoch": 0.5583735524534803, + "grad_norm": 3.609375, + "learning_rate": 4.428724191823641e-06, + "loss": 0.746, + "step": 20420 + }, + { + "epoch": 0.5586469968964056, + "grad_norm": 2.46875, + "learning_rate": 4.425982287296757e-06, + "loss": 0.622, + "step": 20430 + }, + { + "epoch": 0.5589204413393308, + "grad_norm": 2.890625, + "learning_rate": 4.423240382769872e-06, + "loss": 0.7444, + "step": 20440 + }, + { + "epoch": 0.5591938857822562, + "grad_norm": 3.171875, + "learning_rate": 4.4204984782429875e-06, + "loss": 0.7601, + "step": 20450 + }, + { + "epoch": 0.5594673302251815, + "grad_norm": 2.28125, + "learning_rate": 4.417756573716104e-06, + "loss": 0.6773, + "step": 20460 + }, + { + "epoch": 0.5597407746681068, + "grad_norm": 2.90625, + "learning_rate": 4.415014669189219e-06, + "loss": 0.6696, + "step": 20470 + }, + { + "epoch": 0.5600142191110321, + "grad_norm": 2.796875, + "learning_rate": 4.412272764662335e-06, + "loss": 0.7401, + "step": 20480 + }, + { + "epoch": 0.5602876635539574, + "grad_norm": 2.78125, + "learning_rate": 4.40953086013545e-06, + "loss": 0.7083, + "step": 20490 + }, + { + "epoch": 0.5605611079968827, + "grad_norm": 3.109375, + "learning_rate": 4.406788955608567e-06, + "loss": 0.6916, + "step": 20500 + }, + { + "epoch": 0.5608345524398081, + "grad_norm": 3.71875, + "learning_rate": 4.404047051081682e-06, + "loss": 0.7645, + "step": 20510 + }, + { + "epoch": 0.5611079968827334, + "grad_norm": 2.96875, + "learning_rate": 4.401305146554798e-06, + "loss": 0.6941, + "step": 20520 + }, + { + "epoch": 0.5613814413256587, + "grad_norm": 3.546875, + "learning_rate": 4.398563242027913e-06, + "loss": 0.7158, + "step": 20530 + }, + { + "epoch": 0.561654885768584, + "grad_norm": 2.859375, + "learning_rate": 4.395821337501029e-06, + "loss": 0.6711, + "step": 20540 + }, + { + "epoch": 0.5619283302115092, + "grad_norm": 2.6875, + "learning_rate": 4.393079432974144e-06, + "loss": 0.6702, + "step": 20550 + }, + { + "epoch": 0.5622017746544345, + "grad_norm": 3.109375, + "learning_rate": 4.3903375284472595e-06, + "loss": 0.6488, + "step": 20560 + }, + { + "epoch": 0.5624752190973599, + "grad_norm": 2.859375, + "learning_rate": 4.387595623920375e-06, + "loss": 0.6939, + "step": 20570 + }, + { + "epoch": 0.5627486635402852, + "grad_norm": 2.71875, + "learning_rate": 4.3848537193934905e-06, + "loss": 0.6448, + "step": 20580 + }, + { + "epoch": 0.5630221079832105, + "grad_norm": 2.765625, + "learning_rate": 4.382111814866607e-06, + "loss": 0.6943, + "step": 20590 + }, + { + "epoch": 0.5632955524261358, + "grad_norm": 2.640625, + "learning_rate": 4.379369910339722e-06, + "loss": 0.7163, + "step": 20600 + }, + { + "epoch": 0.5635689968690611, + "grad_norm": 3.015625, + "learning_rate": 4.376628005812838e-06, + "loss": 0.6511, + "step": 20610 + }, + { + "epoch": 0.5638424413119865, + "grad_norm": 2.515625, + "learning_rate": 4.373886101285953e-06, + "loss": 0.628, + "step": 20620 + }, + { + "epoch": 0.5641158857549118, + "grad_norm": 2.703125, + "learning_rate": 4.371144196759069e-06, + "loss": 0.6502, + "step": 20630 + }, + { + "epoch": 0.5643893301978371, + "grad_norm": 3.234375, + "learning_rate": 4.368402292232185e-06, + "loss": 0.7693, + "step": 20640 + }, + { + "epoch": 0.5646627746407623, + "grad_norm": 3.046875, + "learning_rate": 4.365660387705301e-06, + "loss": 0.6744, + "step": 20650 + }, + { + "epoch": 0.5649362190836876, + "grad_norm": 2.953125, + "learning_rate": 4.362918483178416e-06, + "loss": 0.6402, + "step": 20660 + }, + { + "epoch": 0.565209663526613, + "grad_norm": 2.625, + "learning_rate": 4.3601765786515315e-06, + "loss": 0.686, + "step": 20670 + }, + { + "epoch": 0.5654831079695383, + "grad_norm": 3.0, + "learning_rate": 4.357434674124648e-06, + "loss": 0.6589, + "step": 20680 + }, + { + "epoch": 0.5657565524124636, + "grad_norm": 2.65625, + "learning_rate": 4.354692769597763e-06, + "loss": 0.6699, + "step": 20690 + }, + { + "epoch": 0.5660299968553889, + "grad_norm": 3.140625, + "learning_rate": 4.351950865070879e-06, + "loss": 0.7462, + "step": 20700 + }, + { + "epoch": 0.5663034412983142, + "grad_norm": 3.0625, + "learning_rate": 4.349208960543994e-06, + "loss": 0.747, + "step": 20710 + }, + { + "epoch": 0.5665768857412395, + "grad_norm": 2.171875, + "learning_rate": 4.34646705601711e-06, + "loss": 0.6914, + "step": 20720 + }, + { + "epoch": 0.5668503301841649, + "grad_norm": 3.109375, + "learning_rate": 4.343725151490225e-06, + "loss": 0.6306, + "step": 20730 + }, + { + "epoch": 0.5671237746270902, + "grad_norm": 2.984375, + "learning_rate": 4.340983246963341e-06, + "loss": 0.6822, + "step": 20740 + }, + { + "epoch": 0.5673972190700155, + "grad_norm": 2.515625, + "learning_rate": 4.338241342436457e-06, + "loss": 0.6707, + "step": 20750 + }, + { + "epoch": 0.5676706635129407, + "grad_norm": 2.5, + "learning_rate": 4.3354994379095726e-06, + "loss": 0.6745, + "step": 20760 + }, + { + "epoch": 0.567944107955866, + "grad_norm": 2.921875, + "learning_rate": 4.332757533382688e-06, + "loss": 0.6819, + "step": 20770 + }, + { + "epoch": 0.5682175523987913, + "grad_norm": 2.78125, + "learning_rate": 4.3300156288558035e-06, + "loss": 0.702, + "step": 20780 + }, + { + "epoch": 0.5684909968417167, + "grad_norm": 2.984375, + "learning_rate": 4.327273724328919e-06, + "loss": 0.6993, + "step": 20790 + }, + { + "epoch": 0.568764441284642, + "grad_norm": 2.515625, + "learning_rate": 4.3245318198020345e-06, + "loss": 0.6789, + "step": 20800 + }, + { + "epoch": 0.5690378857275673, + "grad_norm": 2.421875, + "learning_rate": 4.32178991527515e-06, + "loss": 0.6539, + "step": 20810 + }, + { + "epoch": 0.5693113301704926, + "grad_norm": 2.78125, + "learning_rate": 4.319048010748266e-06, + "loss": 0.7513, + "step": 20820 + }, + { + "epoch": 0.5695847746134179, + "grad_norm": 2.796875, + "learning_rate": 4.316306106221382e-06, + "loss": 0.6603, + "step": 20830 + }, + { + "epoch": 0.5698582190563433, + "grad_norm": 2.6875, + "learning_rate": 4.313564201694497e-06, + "loss": 0.6716, + "step": 20840 + }, + { + "epoch": 0.5701316634992686, + "grad_norm": 2.640625, + "learning_rate": 4.310822297167613e-06, + "loss": 0.6961, + "step": 20850 + }, + { + "epoch": 0.5704051079421938, + "grad_norm": 2.421875, + "learning_rate": 4.308080392640729e-06, + "loss": 0.653, + "step": 20860 + }, + { + "epoch": 0.5706785523851191, + "grad_norm": 2.625, + "learning_rate": 4.3053384881138446e-06, + "loss": 0.7163, + "step": 20870 + }, + { + "epoch": 0.5709519968280444, + "grad_norm": 3.15625, + "learning_rate": 4.30259658358696e-06, + "loss": 0.7386, + "step": 20880 + }, + { + "epoch": 0.5712254412709697, + "grad_norm": 2.4375, + "learning_rate": 4.2998546790600755e-06, + "loss": 0.6396, + "step": 20890 + }, + { + "epoch": 0.5714988857138951, + "grad_norm": 2.140625, + "learning_rate": 4.297112774533191e-06, + "loss": 0.7141, + "step": 20900 + }, + { + "epoch": 0.5717723301568204, + "grad_norm": 2.875, + "learning_rate": 4.294370870006307e-06, + "loss": 0.6805, + "step": 20910 + }, + { + "epoch": 0.5720457745997457, + "grad_norm": 2.21875, + "learning_rate": 4.291628965479423e-06, + "loss": 0.7246, + "step": 20920 + }, + { + "epoch": 0.572319219042671, + "grad_norm": 3.0, + "learning_rate": 4.288887060952538e-06, + "loss": 0.6721, + "step": 20930 + }, + { + "epoch": 0.5725926634855963, + "grad_norm": 2.65625, + "learning_rate": 4.286145156425654e-06, + "loss": 0.6246, + "step": 20940 + }, + { + "epoch": 0.5728661079285217, + "grad_norm": 2.5625, + "learning_rate": 4.283403251898769e-06, + "loss": 0.6247, + "step": 20950 + }, + { + "epoch": 0.573139552371447, + "grad_norm": 2.796875, + "learning_rate": 4.280661347371885e-06, + "loss": 0.7205, + "step": 20960 + }, + { + "epoch": 0.5734129968143722, + "grad_norm": 2.640625, + "learning_rate": 4.277919442845e-06, + "loss": 0.6722, + "step": 20970 + }, + { + "epoch": 0.5736864412572975, + "grad_norm": 2.625, + "learning_rate": 4.275177538318116e-06, + "loss": 0.6769, + "step": 20980 + }, + { + "epoch": 0.5739598857002228, + "grad_norm": 2.640625, + "learning_rate": 4.272435633791231e-06, + "loss": 0.8026, + "step": 20990 + }, + { + "epoch": 0.5742333301431481, + "grad_norm": 2.59375, + "learning_rate": 4.2696937292643475e-06, + "loss": 0.6418, + "step": 21000 + }, + { + "epoch": 0.5745067745860735, + "grad_norm": 2.640625, + "learning_rate": 4.266951824737463e-06, + "loss": 0.7091, + "step": 21010 + }, + { + "epoch": 0.5747802190289988, + "grad_norm": 2.96875, + "learning_rate": 4.2642099202105785e-06, + "loss": 0.7538, + "step": 21020 + }, + { + "epoch": 0.5750536634719241, + "grad_norm": 2.390625, + "learning_rate": 4.261468015683694e-06, + "loss": 0.6391, + "step": 21030 + }, + { + "epoch": 0.5753271079148494, + "grad_norm": 2.875, + "learning_rate": 4.25872611115681e-06, + "loss": 0.7346, + "step": 21040 + }, + { + "epoch": 0.5756005523577747, + "grad_norm": 2.40625, + "learning_rate": 4.255984206629926e-06, + "loss": 0.7234, + "step": 21050 + }, + { + "epoch": 0.5758739968007001, + "grad_norm": 2.5625, + "learning_rate": 4.253242302103041e-06, + "loss": 0.6384, + "step": 21060 + }, + { + "epoch": 0.5761474412436254, + "grad_norm": 2.6875, + "learning_rate": 4.250500397576157e-06, + "loss": 0.7078, + "step": 21070 + }, + { + "epoch": 0.5764208856865506, + "grad_norm": 2.96875, + "learning_rate": 4.247758493049272e-06, + "loss": 0.7438, + "step": 21080 + }, + { + "epoch": 0.5766943301294759, + "grad_norm": 2.578125, + "learning_rate": 4.2450165885223886e-06, + "loss": 0.6692, + "step": 21090 + }, + { + "epoch": 0.5769677745724012, + "grad_norm": 2.953125, + "learning_rate": 4.242274683995504e-06, + "loss": 0.6652, + "step": 21100 + }, + { + "epoch": 0.5772412190153265, + "grad_norm": 2.5625, + "learning_rate": 4.2395327794686195e-06, + "loss": 0.6446, + "step": 21110 + }, + { + "epoch": 0.5775146634582519, + "grad_norm": 2.59375, + "learning_rate": 4.236790874941735e-06, + "loss": 0.6881, + "step": 21120 + }, + { + "epoch": 0.5777881079011772, + "grad_norm": 2.90625, + "learning_rate": 4.2340489704148505e-06, + "loss": 0.6918, + "step": 21130 + }, + { + "epoch": 0.5780615523441025, + "grad_norm": 2.671875, + "learning_rate": 4.231307065887966e-06, + "loss": 0.6761, + "step": 21140 + }, + { + "epoch": 0.5783349967870278, + "grad_norm": 2.703125, + "learning_rate": 4.2285651613610815e-06, + "loss": 0.7023, + "step": 21150 + }, + { + "epoch": 0.5786084412299531, + "grad_norm": 2.71875, + "learning_rate": 4.225823256834197e-06, + "loss": 0.7169, + "step": 21160 + }, + { + "epoch": 0.5788818856728785, + "grad_norm": 2.703125, + "learning_rate": 4.223081352307312e-06, + "loss": 0.7241, + "step": 21170 + }, + { + "epoch": 0.5791553301158037, + "grad_norm": 2.84375, + "learning_rate": 4.220339447780429e-06, + "loss": 0.6779, + "step": 21180 + }, + { + "epoch": 0.579428774558729, + "grad_norm": 3.265625, + "learning_rate": 4.217597543253544e-06, + "loss": 0.6845, + "step": 21190 + }, + { + "epoch": 0.5797022190016543, + "grad_norm": 2.671875, + "learning_rate": 4.21485563872666e-06, + "loss": 0.6401, + "step": 21200 + }, + { + "epoch": 0.5799756634445796, + "grad_norm": 2.796875, + "learning_rate": 4.212113734199775e-06, + "loss": 0.6713, + "step": 21210 + }, + { + "epoch": 0.580249107887505, + "grad_norm": 2.765625, + "learning_rate": 4.2093718296728915e-06, + "loss": 0.6344, + "step": 21220 + }, + { + "epoch": 0.5805225523304303, + "grad_norm": 2.5, + "learning_rate": 4.206629925146007e-06, + "loss": 0.7343, + "step": 21230 + }, + { + "epoch": 0.5807959967733556, + "grad_norm": 3.4375, + "learning_rate": 4.2038880206191225e-06, + "loss": 0.7955, + "step": 21240 + }, + { + "epoch": 0.5810694412162809, + "grad_norm": 3.109375, + "learning_rate": 4.201146116092238e-06, + "loss": 0.7284, + "step": 21250 + }, + { + "epoch": 0.5813428856592062, + "grad_norm": 2.59375, + "learning_rate": 4.1984042115653534e-06, + "loss": 0.7046, + "step": 21260 + }, + { + "epoch": 0.5816163301021315, + "grad_norm": 2.734375, + "learning_rate": 4.19566230703847e-06, + "loss": 0.782, + "step": 21270 + }, + { + "epoch": 0.5818897745450569, + "grad_norm": 2.828125, + "learning_rate": 4.192920402511585e-06, + "loss": 0.7421, + "step": 21280 + }, + { + "epoch": 0.5821632189879821, + "grad_norm": 2.390625, + "learning_rate": 4.190178497984701e-06, + "loss": 0.6807, + "step": 21290 + }, + { + "epoch": 0.5824366634309074, + "grad_norm": 3.046875, + "learning_rate": 4.187436593457816e-06, + "loss": 0.7554, + "step": 21300 + }, + { + "epoch": 0.5827101078738327, + "grad_norm": 2.46875, + "learning_rate": 4.184694688930932e-06, + "loss": 0.605, + "step": 21310 + }, + { + "epoch": 0.582983552316758, + "grad_norm": 2.53125, + "learning_rate": 4.181952784404047e-06, + "loss": 0.7022, + "step": 21320 + }, + { + "epoch": 0.5832569967596833, + "grad_norm": 2.828125, + "learning_rate": 4.179210879877163e-06, + "loss": 0.717, + "step": 21330 + }, + { + "epoch": 0.5835304412026087, + "grad_norm": 2.515625, + "learning_rate": 4.176468975350278e-06, + "loss": 0.737, + "step": 21340 + }, + { + "epoch": 0.583803885645534, + "grad_norm": 3.296875, + "learning_rate": 4.173727070823394e-06, + "loss": 0.674, + "step": 21350 + }, + { + "epoch": 0.5840773300884593, + "grad_norm": 2.71875, + "learning_rate": 4.17098516629651e-06, + "loss": 0.7124, + "step": 21360 + }, + { + "epoch": 0.5843507745313846, + "grad_norm": 2.734375, + "learning_rate": 4.1682432617696254e-06, + "loss": 0.6624, + "step": 21370 + }, + { + "epoch": 0.5846242189743099, + "grad_norm": 2.453125, + "learning_rate": 4.165501357242741e-06, + "loss": 0.6499, + "step": 21380 + }, + { + "epoch": 0.5848976634172353, + "grad_norm": 2.390625, + "learning_rate": 4.162759452715856e-06, + "loss": 0.6917, + "step": 21390 + }, + { + "epoch": 0.5851711078601605, + "grad_norm": 2.71875, + "learning_rate": 4.160017548188973e-06, + "loss": 0.7452, + "step": 21400 + }, + { + "epoch": 0.5854445523030858, + "grad_norm": 2.703125, + "learning_rate": 4.157275643662088e-06, + "loss": 0.6747, + "step": 21410 + }, + { + "epoch": 0.5857179967460111, + "grad_norm": 2.5, + "learning_rate": 4.154533739135204e-06, + "loss": 0.6442, + "step": 21420 + }, + { + "epoch": 0.5859914411889364, + "grad_norm": 2.59375, + "learning_rate": 4.151791834608319e-06, + "loss": 0.7073, + "step": 21430 + }, + { + "epoch": 0.5862648856318617, + "grad_norm": 2.515625, + "learning_rate": 4.149049930081435e-06, + "loss": 0.7086, + "step": 21440 + }, + { + "epoch": 0.5865383300747871, + "grad_norm": 2.9375, + "learning_rate": 4.146308025554551e-06, + "loss": 0.6645, + "step": 21450 + }, + { + "epoch": 0.5868117745177124, + "grad_norm": 2.78125, + "learning_rate": 4.1435661210276665e-06, + "loss": 0.7121, + "step": 21460 + }, + { + "epoch": 0.5870852189606377, + "grad_norm": 3.015625, + "learning_rate": 4.140824216500782e-06, + "loss": 0.7212, + "step": 21470 + }, + { + "epoch": 0.587358663403563, + "grad_norm": 2.828125, + "learning_rate": 4.1380823119738974e-06, + "loss": 0.6601, + "step": 21480 + }, + { + "epoch": 0.5876321078464883, + "grad_norm": 2.921875, + "learning_rate": 4.135340407447013e-06, + "loss": 0.7365, + "step": 21490 + }, + { + "epoch": 0.5879055522894135, + "grad_norm": 2.703125, + "learning_rate": 4.132598502920128e-06, + "loss": 0.5913, + "step": 21500 + }, + { + "epoch": 0.5881789967323389, + "grad_norm": 3.546875, + "learning_rate": 4.129856598393244e-06, + "loss": 0.6874, + "step": 21510 + }, + { + "epoch": 0.5884524411752642, + "grad_norm": 3.109375, + "learning_rate": 4.127114693866359e-06, + "loss": 0.7438, + "step": 21520 + }, + { + "epoch": 0.5887258856181895, + "grad_norm": 2.828125, + "learning_rate": 4.124372789339475e-06, + "loss": 0.6046, + "step": 21530 + }, + { + "epoch": 0.5889993300611148, + "grad_norm": 3.046875, + "learning_rate": 4.121630884812591e-06, + "loss": 0.6267, + "step": 21540 + }, + { + "epoch": 0.5892727745040401, + "grad_norm": 3.171875, + "learning_rate": 4.118888980285707e-06, + "loss": 0.7884, + "step": 21550 + }, + { + "epoch": 0.5895462189469655, + "grad_norm": 2.703125, + "learning_rate": 4.116147075758822e-06, + "loss": 0.6817, + "step": 21560 + }, + { + "epoch": 0.5898196633898908, + "grad_norm": 2.859375, + "learning_rate": 4.113405171231938e-06, + "loss": 0.774, + "step": 21570 + }, + { + "epoch": 0.5900931078328161, + "grad_norm": 2.734375, + "learning_rate": 4.110663266705054e-06, + "loss": 0.8388, + "step": 21580 + }, + { + "epoch": 0.5903665522757414, + "grad_norm": 2.890625, + "learning_rate": 4.1079213621781694e-06, + "loss": 0.7202, + "step": 21590 + }, + { + "epoch": 0.5906399967186667, + "grad_norm": 3.546875, + "learning_rate": 4.105179457651285e-06, + "loss": 0.7443, + "step": 21600 + }, + { + "epoch": 0.590913441161592, + "grad_norm": 2.90625, + "learning_rate": 4.1024375531244e-06, + "loss": 0.6917, + "step": 21610 + }, + { + "epoch": 0.5911868856045173, + "grad_norm": 2.53125, + "learning_rate": 4.099695648597516e-06, + "loss": 0.6694, + "step": 21620 + }, + { + "epoch": 0.5914603300474426, + "grad_norm": 2.5625, + "learning_rate": 4.096953744070632e-06, + "loss": 0.735, + "step": 21630 + }, + { + "epoch": 0.5917337744903679, + "grad_norm": 2.40625, + "learning_rate": 4.094211839543748e-06, + "loss": 0.681, + "step": 21640 + }, + { + "epoch": 0.5920072189332932, + "grad_norm": 3.125, + "learning_rate": 4.091469935016863e-06, + "loss": 0.6902, + "step": 21650 + }, + { + "epoch": 0.5922806633762185, + "grad_norm": 3.1875, + "learning_rate": 4.088728030489979e-06, + "loss": 0.6081, + "step": 21660 + }, + { + "epoch": 0.5925541078191439, + "grad_norm": 2.890625, + "learning_rate": 4.085986125963094e-06, + "loss": 0.7042, + "step": 21670 + }, + { + "epoch": 0.5928275522620692, + "grad_norm": 2.515625, + "learning_rate": 4.08324422143621e-06, + "loss": 0.7268, + "step": 21680 + }, + { + "epoch": 0.5931009967049945, + "grad_norm": 2.640625, + "learning_rate": 4.080502316909325e-06, + "loss": 0.7241, + "step": 21690 + }, + { + "epoch": 0.5933744411479198, + "grad_norm": 2.65625, + "learning_rate": 4.0777604123824414e-06, + "loss": 0.7943, + "step": 21700 + }, + { + "epoch": 0.593647885590845, + "grad_norm": 3.65625, + "learning_rate": 4.075018507855557e-06, + "loss": 0.7843, + "step": 21710 + }, + { + "epoch": 0.5939213300337703, + "grad_norm": 3.375, + "learning_rate": 4.072276603328672e-06, + "loss": 0.7699, + "step": 21720 + }, + { + "epoch": 0.5941947744766957, + "grad_norm": 2.734375, + "learning_rate": 4.069534698801788e-06, + "loss": 0.6464, + "step": 21730 + }, + { + "epoch": 0.594468218919621, + "grad_norm": 2.546875, + "learning_rate": 4.066792794274903e-06, + "loss": 0.6841, + "step": 21740 + }, + { + "epoch": 0.5947416633625463, + "grad_norm": 2.96875, + "learning_rate": 4.064050889748019e-06, + "loss": 0.7134, + "step": 21750 + }, + { + "epoch": 0.5950151078054716, + "grad_norm": 2.984375, + "learning_rate": 4.061308985221135e-06, + "loss": 0.7008, + "step": 21760 + }, + { + "epoch": 0.5952885522483969, + "grad_norm": 2.90625, + "learning_rate": 4.058567080694251e-06, + "loss": 0.7227, + "step": 21770 + }, + { + "epoch": 0.5955619966913223, + "grad_norm": 2.703125, + "learning_rate": 4.055825176167366e-06, + "loss": 0.6985, + "step": 21780 + }, + { + "epoch": 0.5958354411342476, + "grad_norm": 2.9375, + "learning_rate": 4.053083271640482e-06, + "loss": 0.7251, + "step": 21790 + }, + { + "epoch": 0.5961088855771729, + "grad_norm": 3.078125, + "learning_rate": 4.050341367113598e-06, + "loss": 0.6982, + "step": 21800 + }, + { + "epoch": 0.5963823300200982, + "grad_norm": 3.046875, + "learning_rate": 4.0475994625867134e-06, + "loss": 0.7228, + "step": 21810 + }, + { + "epoch": 0.5966557744630234, + "grad_norm": 2.921875, + "learning_rate": 4.044857558059829e-06, + "loss": 0.7625, + "step": 21820 + }, + { + "epoch": 0.5969292189059487, + "grad_norm": 3.25, + "learning_rate": 4.042115653532944e-06, + "loss": 0.6867, + "step": 21830 + }, + { + "epoch": 0.5972026633488741, + "grad_norm": 2.875, + "learning_rate": 4.03937374900606e-06, + "loss": 0.7144, + "step": 21840 + }, + { + "epoch": 0.5974761077917994, + "grad_norm": 2.671875, + "learning_rate": 4.036631844479175e-06, + "loss": 0.6852, + "step": 21850 + }, + { + "epoch": 0.5977495522347247, + "grad_norm": 2.859375, + "learning_rate": 4.033889939952292e-06, + "loss": 0.7112, + "step": 21860 + }, + { + "epoch": 0.59802299667765, + "grad_norm": 2.90625, + "learning_rate": 4.031148035425407e-06, + "loss": 0.7088, + "step": 21870 + }, + { + "epoch": 0.5982964411205753, + "grad_norm": 3.0, + "learning_rate": 4.028406130898523e-06, + "loss": 0.7043, + "step": 21880 + }, + { + "epoch": 0.5985698855635007, + "grad_norm": 2.625, + "learning_rate": 4.025664226371638e-06, + "loss": 0.6846, + "step": 21890 + }, + { + "epoch": 0.598843330006426, + "grad_norm": 2.828125, + "learning_rate": 4.022922321844754e-06, + "loss": 0.7059, + "step": 21900 + }, + { + "epoch": 0.5991167744493513, + "grad_norm": 2.453125, + "learning_rate": 4.020180417317869e-06, + "loss": 0.6306, + "step": 21910 + }, + { + "epoch": 0.5993902188922766, + "grad_norm": 2.859375, + "learning_rate": 4.0174385127909846e-06, + "loss": 0.7424, + "step": 21920 + }, + { + "epoch": 0.5996636633352018, + "grad_norm": 2.796875, + "learning_rate": 4.0146966082641e-06, + "loss": 0.6839, + "step": 21930 + }, + { + "epoch": 0.5999371077781271, + "grad_norm": 2.671875, + "learning_rate": 4.011954703737216e-06, + "loss": 0.6605, + "step": 21940 + }, + { + "epoch": 0.6002105522210525, + "grad_norm": 2.921875, + "learning_rate": 4.009212799210332e-06, + "loss": 0.6775, + "step": 21950 + }, + { + "epoch": 0.6004839966639778, + "grad_norm": 2.859375, + "learning_rate": 4.006470894683447e-06, + "loss": 0.7025, + "step": 21960 + }, + { + "epoch": 0.6007574411069031, + "grad_norm": 2.625, + "learning_rate": 4.003728990156563e-06, + "loss": 0.7513, + "step": 21970 + }, + { + "epoch": 0.6010308855498284, + "grad_norm": 2.796875, + "learning_rate": 4.000987085629679e-06, + "loss": 0.7388, + "step": 21980 + }, + { + "epoch": 0.6013043299927537, + "grad_norm": 2.640625, + "learning_rate": 3.998245181102795e-06, + "loss": 0.6949, + "step": 21990 + }, + { + "epoch": 0.6015777744356791, + "grad_norm": 3.140625, + "learning_rate": 3.99550327657591e-06, + "loss": 0.7143, + "step": 22000 + }, + { + "epoch": 0.6018512188786044, + "grad_norm": 2.703125, + "learning_rate": 3.992761372049026e-06, + "loss": 0.6596, + "step": 22010 + }, + { + "epoch": 0.6021246633215297, + "grad_norm": 3.03125, + "learning_rate": 3.990019467522141e-06, + "loss": 0.7035, + "step": 22020 + }, + { + "epoch": 0.6023981077644549, + "grad_norm": 2.890625, + "learning_rate": 3.987277562995257e-06, + "loss": 0.7225, + "step": 22030 + }, + { + "epoch": 0.6026715522073802, + "grad_norm": 3.015625, + "learning_rate": 3.984535658468373e-06, + "loss": 0.6985, + "step": 22040 + }, + { + "epoch": 0.6029449966503055, + "grad_norm": 2.78125, + "learning_rate": 3.981793753941488e-06, + "loss": 0.6738, + "step": 22050 + }, + { + "epoch": 0.6032184410932309, + "grad_norm": 2.890625, + "learning_rate": 3.979051849414604e-06, + "loss": 0.6349, + "step": 22060 + }, + { + "epoch": 0.6034918855361562, + "grad_norm": 2.546875, + "learning_rate": 3.976309944887719e-06, + "loss": 0.7237, + "step": 22070 + }, + { + "epoch": 0.6037653299790815, + "grad_norm": 2.671875, + "learning_rate": 3.973568040360835e-06, + "loss": 0.6546, + "step": 22080 + }, + { + "epoch": 0.6040387744220068, + "grad_norm": 2.625, + "learning_rate": 3.97082613583395e-06, + "loss": 0.7055, + "step": 22090 + }, + { + "epoch": 0.6043122188649321, + "grad_norm": 2.65625, + "learning_rate": 3.968084231307066e-06, + "loss": 0.6938, + "step": 22100 + }, + { + "epoch": 0.6045856633078575, + "grad_norm": 2.8125, + "learning_rate": 3.965342326780181e-06, + "loss": 0.7283, + "step": 22110 + }, + { + "epoch": 0.6048591077507828, + "grad_norm": 2.546875, + "learning_rate": 3.962600422253298e-06, + "loss": 0.7663, + "step": 22120 + }, + { + "epoch": 0.6051325521937081, + "grad_norm": 2.515625, + "learning_rate": 3.959858517726413e-06, + "loss": 0.6901, + "step": 22130 + }, + { + "epoch": 0.6054059966366333, + "grad_norm": 3.03125, + "learning_rate": 3.9571166131995286e-06, + "loss": 0.6238, + "step": 22140 + }, + { + "epoch": 0.6056794410795586, + "grad_norm": 2.8125, + "learning_rate": 3.954374708672644e-06, + "loss": 0.6461, + "step": 22150 + }, + { + "epoch": 0.6059528855224839, + "grad_norm": 2.421875, + "learning_rate": 3.95163280414576e-06, + "loss": 0.626, + "step": 22160 + }, + { + "epoch": 0.6062263299654093, + "grad_norm": 2.875, + "learning_rate": 3.948890899618876e-06, + "loss": 0.7233, + "step": 22170 + }, + { + "epoch": 0.6064997744083346, + "grad_norm": 2.875, + "learning_rate": 3.946148995091991e-06, + "loss": 0.6262, + "step": 22180 + }, + { + "epoch": 0.6067732188512599, + "grad_norm": 2.6875, + "learning_rate": 3.943407090565107e-06, + "loss": 0.7057, + "step": 22190 + }, + { + "epoch": 0.6070466632941852, + "grad_norm": 3.125, + "learning_rate": 3.940665186038222e-06, + "loss": 0.7417, + "step": 22200 + }, + { + "epoch": 0.6073201077371105, + "grad_norm": 2.796875, + "learning_rate": 3.937923281511339e-06, + "loss": 0.6634, + "step": 22210 + }, + { + "epoch": 0.6075935521800359, + "grad_norm": 2.875, + "learning_rate": 3.935181376984454e-06, + "loss": 0.7111, + "step": 22220 + }, + { + "epoch": 0.6078669966229612, + "grad_norm": 3.125, + "learning_rate": 3.93243947245757e-06, + "loss": 0.7821, + "step": 22230 + }, + { + "epoch": 0.6081404410658864, + "grad_norm": 2.546875, + "learning_rate": 3.929697567930685e-06, + "loss": 0.645, + "step": 22240 + }, + { + "epoch": 0.6084138855088117, + "grad_norm": 2.609375, + "learning_rate": 3.9269556634038006e-06, + "loss": 0.6601, + "step": 22250 + }, + { + "epoch": 0.608687329951737, + "grad_norm": 3.0, + "learning_rate": 3.924213758876916e-06, + "loss": 0.6969, + "step": 22260 + }, + { + "epoch": 0.6089607743946623, + "grad_norm": 2.59375, + "learning_rate": 3.9214718543500315e-06, + "loss": 0.7205, + "step": 22270 + }, + { + "epoch": 0.6092342188375877, + "grad_norm": 3.25, + "learning_rate": 3.918729949823147e-06, + "loss": 0.6232, + "step": 22280 + }, + { + "epoch": 0.609507663280513, + "grad_norm": 3.015625, + "learning_rate": 3.9159880452962625e-06, + "loss": 0.7378, + "step": 22290 + }, + { + "epoch": 0.6097811077234383, + "grad_norm": 2.6875, + "learning_rate": 3.913246140769379e-06, + "loss": 0.6416, + "step": 22300 + }, + { + "epoch": 0.6100545521663636, + "grad_norm": 2.625, + "learning_rate": 3.910504236242494e-06, + "loss": 0.6597, + "step": 22310 + }, + { + "epoch": 0.6103279966092889, + "grad_norm": 2.96875, + "learning_rate": 3.90776233171561e-06, + "loss": 0.699, + "step": 22320 + }, + { + "epoch": 0.6106014410522143, + "grad_norm": 2.515625, + "learning_rate": 3.905020427188725e-06, + "loss": 0.6792, + "step": 22330 + }, + { + "epoch": 0.6108748854951396, + "grad_norm": 2.75, + "learning_rate": 3.902278522661842e-06, + "loss": 0.7344, + "step": 22340 + }, + { + "epoch": 0.6111483299380648, + "grad_norm": 2.640625, + "learning_rate": 3.899536618134957e-06, + "loss": 0.6968, + "step": 22350 + }, + { + "epoch": 0.6114217743809901, + "grad_norm": 3.015625, + "learning_rate": 3.8967947136080726e-06, + "loss": 0.6598, + "step": 22360 + }, + { + "epoch": 0.6116952188239154, + "grad_norm": 3.03125, + "learning_rate": 3.894052809081188e-06, + "loss": 0.6956, + "step": 22370 + }, + { + "epoch": 0.6119686632668407, + "grad_norm": 3.03125, + "learning_rate": 3.8913109045543035e-06, + "loss": 0.6943, + "step": 22380 + }, + { + "epoch": 0.6122421077097661, + "grad_norm": 2.984375, + "learning_rate": 3.88856900002742e-06, + "loss": 0.658, + "step": 22390 + }, + { + "epoch": 0.6125155521526914, + "grad_norm": 2.703125, + "learning_rate": 3.885827095500535e-06, + "loss": 0.6777, + "step": 22400 + }, + { + "epoch": 0.6127889965956167, + "grad_norm": 2.703125, + "learning_rate": 3.883085190973651e-06, + "loss": 0.6925, + "step": 22410 + }, + { + "epoch": 0.613062441038542, + "grad_norm": 3.0625, + "learning_rate": 3.880343286446766e-06, + "loss": 0.6553, + "step": 22420 + }, + { + "epoch": 0.6133358854814673, + "grad_norm": 3.21875, + "learning_rate": 3.877601381919882e-06, + "loss": 0.7011, + "step": 22430 + }, + { + "epoch": 0.6136093299243927, + "grad_norm": 2.625, + "learning_rate": 3.874859477392997e-06, + "loss": 0.688, + "step": 22440 + }, + { + "epoch": 0.613882774367318, + "grad_norm": 2.0625, + "learning_rate": 3.872117572866113e-06, + "loss": 0.618, + "step": 22450 + }, + { + "epoch": 0.6141562188102432, + "grad_norm": 2.875, + "learning_rate": 3.869375668339228e-06, + "loss": 0.6841, + "step": 22460 + }, + { + "epoch": 0.6144296632531685, + "grad_norm": 2.375, + "learning_rate": 3.866633763812344e-06, + "loss": 0.6258, + "step": 22470 + }, + { + "epoch": 0.6147031076960938, + "grad_norm": 2.90625, + "learning_rate": 3.86389185928546e-06, + "loss": 0.6363, + "step": 22480 + }, + { + "epoch": 0.6149765521390191, + "grad_norm": 2.78125, + "learning_rate": 3.8611499547585755e-06, + "loss": 0.7634, + "step": 22490 + }, + { + "epoch": 0.6152499965819445, + "grad_norm": 3.015625, + "learning_rate": 3.858408050231691e-06, + "loss": 0.7057, + "step": 22500 + }, + { + "epoch": 0.6155234410248698, + "grad_norm": 3.09375, + "learning_rate": 3.8556661457048065e-06, + "loss": 0.6706, + "step": 22510 + }, + { + "epoch": 0.6157968854677951, + "grad_norm": 2.59375, + "learning_rate": 3.852924241177923e-06, + "loss": 0.6494, + "step": 22520 + }, + { + "epoch": 0.6160703299107204, + "grad_norm": 2.578125, + "learning_rate": 3.850182336651038e-06, + "loss": 0.7279, + "step": 22530 + }, + { + "epoch": 0.6163437743536457, + "grad_norm": 2.65625, + "learning_rate": 3.847440432124154e-06, + "loss": 0.669, + "step": 22540 + }, + { + "epoch": 0.616617218796571, + "grad_norm": 2.609375, + "learning_rate": 3.844698527597269e-06, + "loss": 0.6135, + "step": 22550 + }, + { + "epoch": 0.6168906632394963, + "grad_norm": 2.703125, + "learning_rate": 3.841956623070385e-06, + "loss": 0.719, + "step": 22560 + }, + { + "epoch": 0.6171641076824216, + "grad_norm": 2.453125, + "learning_rate": 3.839214718543501e-06, + "loss": 0.6439, + "step": 22570 + }, + { + "epoch": 0.6174375521253469, + "grad_norm": 2.953125, + "learning_rate": 3.8364728140166165e-06, + "loss": 0.6999, + "step": 22580 + }, + { + "epoch": 0.6177109965682722, + "grad_norm": 2.765625, + "learning_rate": 3.833730909489732e-06, + "loss": 0.6648, + "step": 22590 + }, + { + "epoch": 0.6179844410111975, + "grad_norm": 3.078125, + "learning_rate": 3.8309890049628475e-06, + "loss": 0.6728, + "step": 22600 + }, + { + "epoch": 0.6182578854541229, + "grad_norm": 2.578125, + "learning_rate": 3.828247100435963e-06, + "loss": 0.6993, + "step": 22610 + }, + { + "epoch": 0.6185313298970482, + "grad_norm": 3.234375, + "learning_rate": 3.8255051959090785e-06, + "loss": 0.6778, + "step": 22620 + }, + { + "epoch": 0.6188047743399735, + "grad_norm": 2.640625, + "learning_rate": 3.822763291382194e-06, + "loss": 0.6924, + "step": 22630 + }, + { + "epoch": 0.6190782187828988, + "grad_norm": 3.359375, + "learning_rate": 3.82002138685531e-06, + "loss": 0.7111, + "step": 22640 + }, + { + "epoch": 0.6193516632258241, + "grad_norm": 2.859375, + "learning_rate": 3.817279482328426e-06, + "loss": 0.6996, + "step": 22650 + }, + { + "epoch": 0.6196251076687495, + "grad_norm": 2.28125, + "learning_rate": 3.8145375778015413e-06, + "loss": 0.6589, + "step": 22660 + }, + { + "epoch": 0.6198985521116747, + "grad_norm": 2.96875, + "learning_rate": 3.8117956732746567e-06, + "loss": 0.6479, + "step": 22670 + }, + { + "epoch": 0.6201719965546, + "grad_norm": 2.921875, + "learning_rate": 3.8090537687477726e-06, + "loss": 0.7018, + "step": 22680 + }, + { + "epoch": 0.6204454409975253, + "grad_norm": 2.859375, + "learning_rate": 3.806311864220888e-06, + "loss": 0.6675, + "step": 22690 + }, + { + "epoch": 0.6207188854404506, + "grad_norm": 2.625, + "learning_rate": 3.803569959694004e-06, + "loss": 0.7075, + "step": 22700 + }, + { + "epoch": 0.6209923298833759, + "grad_norm": 2.6875, + "learning_rate": 3.8008280551671195e-06, + "loss": 0.6355, + "step": 22710 + }, + { + "epoch": 0.6212657743263013, + "grad_norm": 2.859375, + "learning_rate": 3.798086150640235e-06, + "loss": 0.7422, + "step": 22720 + }, + { + "epoch": 0.6215392187692266, + "grad_norm": 2.4375, + "learning_rate": 3.7953442461133505e-06, + "loss": 0.6899, + "step": 22730 + }, + { + "epoch": 0.6218126632121519, + "grad_norm": 2.46875, + "learning_rate": 3.792602341586466e-06, + "loss": 0.6843, + "step": 22740 + }, + { + "epoch": 0.6220861076550772, + "grad_norm": 2.546875, + "learning_rate": 3.789860437059582e-06, + "loss": 0.7052, + "step": 22750 + }, + { + "epoch": 0.6223595520980025, + "grad_norm": 2.953125, + "learning_rate": 3.7871185325326978e-06, + "loss": 0.6484, + "step": 22760 + }, + { + "epoch": 0.6226329965409279, + "grad_norm": 2.8125, + "learning_rate": 3.7843766280058132e-06, + "loss": 0.6369, + "step": 22770 + }, + { + "epoch": 0.6229064409838531, + "grad_norm": 2.28125, + "learning_rate": 3.7816347234789287e-06, + "loss": 0.6478, + "step": 22780 + }, + { + "epoch": 0.6231798854267784, + "grad_norm": 2.953125, + "learning_rate": 3.7788928189520446e-06, + "loss": 0.6345, + "step": 22790 + }, + { + "epoch": 0.6234533298697037, + "grad_norm": 2.546875, + "learning_rate": 3.77615091442516e-06, + "loss": 0.6116, + "step": 22800 + }, + { + "epoch": 0.623726774312629, + "grad_norm": 2.78125, + "learning_rate": 3.7734090098982756e-06, + "loss": 0.7744, + "step": 22810 + }, + { + "epoch": 0.6240002187555543, + "grad_norm": 2.8125, + "learning_rate": 3.770667105371391e-06, + "loss": 0.6581, + "step": 22820 + }, + { + "epoch": 0.6242736631984797, + "grad_norm": 3.140625, + "learning_rate": 3.7679252008445066e-06, + "loss": 0.6244, + "step": 22830 + }, + { + "epoch": 0.624547107641405, + "grad_norm": 2.65625, + "learning_rate": 3.765183296317623e-06, + "loss": 0.6519, + "step": 22840 + }, + { + "epoch": 0.6248205520843303, + "grad_norm": 3.015625, + "learning_rate": 3.7624413917907384e-06, + "loss": 0.6796, + "step": 22850 + }, + { + "epoch": 0.6250939965272556, + "grad_norm": 3.03125, + "learning_rate": 3.759699487263854e-06, + "loss": 0.6712, + "step": 22860 + }, + { + "epoch": 0.6253674409701809, + "grad_norm": 2.59375, + "learning_rate": 3.7569575827369693e-06, + "loss": 0.7151, + "step": 22870 + }, + { + "epoch": 0.6256408854131061, + "grad_norm": 3.109375, + "learning_rate": 3.7542156782100852e-06, + "loss": 0.7102, + "step": 22880 + }, + { + "epoch": 0.6259143298560315, + "grad_norm": 2.671875, + "learning_rate": 3.7514737736832007e-06, + "loss": 0.6646, + "step": 22890 + }, + { + "epoch": 0.6261877742989568, + "grad_norm": 3.046875, + "learning_rate": 3.748731869156316e-06, + "loss": 0.6981, + "step": 22900 + }, + { + "epoch": 0.6264612187418821, + "grad_norm": 3.328125, + "learning_rate": 3.7459899646294317e-06, + "loss": 0.668, + "step": 22910 + }, + { + "epoch": 0.6267346631848074, + "grad_norm": 2.703125, + "learning_rate": 3.743248060102547e-06, + "loss": 0.7357, + "step": 22920 + }, + { + "epoch": 0.6270081076277327, + "grad_norm": 2.640625, + "learning_rate": 3.7405061555756635e-06, + "loss": 0.6612, + "step": 22930 + }, + { + "epoch": 0.627281552070658, + "grad_norm": 2.546875, + "learning_rate": 3.737764251048779e-06, + "loss": 0.6727, + "step": 22940 + }, + { + "epoch": 0.6275549965135834, + "grad_norm": 2.625, + "learning_rate": 3.7350223465218945e-06, + "loss": 0.7183, + "step": 22950 + }, + { + "epoch": 0.6278284409565087, + "grad_norm": 2.90625, + "learning_rate": 3.73228044199501e-06, + "loss": 0.7308, + "step": 22960 + }, + { + "epoch": 0.628101885399434, + "grad_norm": 2.578125, + "learning_rate": 3.729538537468126e-06, + "loss": 0.6869, + "step": 22970 + }, + { + "epoch": 0.6283753298423593, + "grad_norm": 3.296875, + "learning_rate": 3.7267966329412413e-06, + "loss": 0.7522, + "step": 22980 + }, + { + "epoch": 0.6286487742852845, + "grad_norm": 2.84375, + "learning_rate": 3.724054728414357e-06, + "loss": 0.6818, + "step": 22990 + }, + { + "epoch": 0.6289222187282099, + "grad_norm": 2.828125, + "learning_rate": 3.7213128238874723e-06, + "loss": 0.6618, + "step": 23000 + }, + { + "epoch": 0.6291956631711352, + "grad_norm": 2.84375, + "learning_rate": 3.7185709193605878e-06, + "loss": 0.7353, + "step": 23010 + }, + { + "epoch": 0.6294691076140605, + "grad_norm": 2.65625, + "learning_rate": 3.715829014833704e-06, + "loss": 0.6523, + "step": 23020 + }, + { + "epoch": 0.6297425520569858, + "grad_norm": 2.859375, + "learning_rate": 3.7130871103068196e-06, + "loss": 0.6072, + "step": 23030 + }, + { + "epoch": 0.6300159964999111, + "grad_norm": 2.734375, + "learning_rate": 3.710345205779935e-06, + "loss": 0.7253, + "step": 23040 + }, + { + "epoch": 0.6302894409428365, + "grad_norm": 2.53125, + "learning_rate": 3.7076033012530506e-06, + "loss": 0.7064, + "step": 23050 + }, + { + "epoch": 0.6305628853857618, + "grad_norm": 3.0, + "learning_rate": 3.7048613967261665e-06, + "loss": 0.7274, + "step": 23060 + }, + { + "epoch": 0.6308363298286871, + "grad_norm": 2.625, + "learning_rate": 3.702119492199282e-06, + "loss": 0.796, + "step": 23070 + }, + { + "epoch": 0.6311097742716124, + "grad_norm": 2.90625, + "learning_rate": 3.6993775876723974e-06, + "loss": 0.7455, + "step": 23080 + }, + { + "epoch": 0.6313832187145376, + "grad_norm": 2.8125, + "learning_rate": 3.696635683145513e-06, + "loss": 0.6813, + "step": 23090 + }, + { + "epoch": 0.6316566631574629, + "grad_norm": 2.453125, + "learning_rate": 3.6938937786186292e-06, + "loss": 0.6736, + "step": 23100 + }, + { + "epoch": 0.6319301076003883, + "grad_norm": 2.703125, + "learning_rate": 3.6911518740917447e-06, + "loss": 0.7878, + "step": 23110 + }, + { + "epoch": 0.6322035520433136, + "grad_norm": 2.78125, + "learning_rate": 3.68840996956486e-06, + "loss": 0.6719, + "step": 23120 + }, + { + "epoch": 0.6324769964862389, + "grad_norm": 2.53125, + "learning_rate": 3.6856680650379757e-06, + "loss": 0.6668, + "step": 23130 + }, + { + "epoch": 0.6327504409291642, + "grad_norm": 3.015625, + "learning_rate": 3.682926160511091e-06, + "loss": 0.6749, + "step": 23140 + }, + { + "epoch": 0.6330238853720895, + "grad_norm": 3.0625, + "learning_rate": 3.680184255984207e-06, + "loss": 0.6816, + "step": 23150 + }, + { + "epoch": 0.6332973298150149, + "grad_norm": 2.765625, + "learning_rate": 3.6774423514573225e-06, + "loss": 0.6832, + "step": 23160 + }, + { + "epoch": 0.6335707742579402, + "grad_norm": 2.4375, + "learning_rate": 3.674700446930438e-06, + "loss": 0.6274, + "step": 23170 + }, + { + "epoch": 0.6338442187008655, + "grad_norm": 2.40625, + "learning_rate": 3.6719585424035535e-06, + "loss": 0.7433, + "step": 23180 + }, + { + "epoch": 0.6341176631437908, + "grad_norm": 3.359375, + "learning_rate": 3.66921663787667e-06, + "loss": 0.6965, + "step": 23190 + }, + { + "epoch": 0.634391107586716, + "grad_norm": 2.96875, + "learning_rate": 3.6664747333497853e-06, + "loss": 0.6988, + "step": 23200 + }, + { + "epoch": 0.6346645520296413, + "grad_norm": 2.796875, + "learning_rate": 3.663732828822901e-06, + "loss": 0.7373, + "step": 23210 + }, + { + "epoch": 0.6349379964725667, + "grad_norm": 2.984375, + "learning_rate": 3.6609909242960163e-06, + "loss": 0.7152, + "step": 23220 + }, + { + "epoch": 0.635211440915492, + "grad_norm": 2.59375, + "learning_rate": 3.6582490197691318e-06, + "loss": 0.7013, + "step": 23230 + }, + { + "epoch": 0.6354848853584173, + "grad_norm": 3.03125, + "learning_rate": 3.6555071152422477e-06, + "loss": 0.7327, + "step": 23240 + }, + { + "epoch": 0.6357583298013426, + "grad_norm": 3.171875, + "learning_rate": 3.652765210715363e-06, + "loss": 0.6706, + "step": 23250 + }, + { + "epoch": 0.6360317742442679, + "grad_norm": 2.734375, + "learning_rate": 3.6500233061884786e-06, + "loss": 0.638, + "step": 23260 + }, + { + "epoch": 0.6363052186871933, + "grad_norm": 2.5625, + "learning_rate": 3.647281401661594e-06, + "loss": 0.5764, + "step": 23270 + }, + { + "epoch": 0.6365786631301186, + "grad_norm": 2.53125, + "learning_rate": 3.6445394971347105e-06, + "loss": 0.702, + "step": 23280 + }, + { + "epoch": 0.6368521075730439, + "grad_norm": 2.71875, + "learning_rate": 3.641797592607826e-06, + "loss": 0.664, + "step": 23290 + }, + { + "epoch": 0.6371255520159692, + "grad_norm": 3.0, + "learning_rate": 3.6390556880809414e-06, + "loss": 0.7008, + "step": 23300 + }, + { + "epoch": 0.6373989964588944, + "grad_norm": 3.0, + "learning_rate": 3.636313783554057e-06, + "loss": 0.6567, + "step": 23310 + }, + { + "epoch": 0.6376724409018197, + "grad_norm": 3.21875, + "learning_rate": 3.6335718790271724e-06, + "loss": 0.709, + "step": 23320 + }, + { + "epoch": 0.6379458853447451, + "grad_norm": 2.828125, + "learning_rate": 3.6308299745002883e-06, + "loss": 0.7129, + "step": 23330 + }, + { + "epoch": 0.6382193297876704, + "grad_norm": 3.015625, + "learning_rate": 3.6280880699734038e-06, + "loss": 0.6956, + "step": 23340 + }, + { + "epoch": 0.6384927742305957, + "grad_norm": 2.96875, + "learning_rate": 3.6253461654465192e-06, + "loss": 0.6821, + "step": 23350 + }, + { + "epoch": 0.638766218673521, + "grad_norm": 3.03125, + "learning_rate": 3.6226042609196347e-06, + "loss": 0.658, + "step": 23360 + }, + { + "epoch": 0.6390396631164463, + "grad_norm": 2.515625, + "learning_rate": 3.619862356392751e-06, + "loss": 0.6904, + "step": 23370 + }, + { + "epoch": 0.6393131075593717, + "grad_norm": 2.90625, + "learning_rate": 3.6171204518658665e-06, + "loss": 0.6809, + "step": 23380 + }, + { + "epoch": 0.639586552002297, + "grad_norm": 2.609375, + "learning_rate": 3.614378547338982e-06, + "loss": 0.6679, + "step": 23390 + }, + { + "epoch": 0.6398599964452223, + "grad_norm": 2.015625, + "learning_rate": 3.6116366428120975e-06, + "loss": 0.58, + "step": 23400 + }, + { + "epoch": 0.6401334408881475, + "grad_norm": 2.75, + "learning_rate": 3.608894738285213e-06, + "loss": 0.6336, + "step": 23410 + }, + { + "epoch": 0.6404068853310728, + "grad_norm": 2.515625, + "learning_rate": 3.606152833758329e-06, + "loss": 0.6392, + "step": 23420 + }, + { + "epoch": 0.6406803297739981, + "grad_norm": 3.234375, + "learning_rate": 3.6034109292314444e-06, + "loss": 0.7383, + "step": 23430 + }, + { + "epoch": 0.6409537742169235, + "grad_norm": 2.890625, + "learning_rate": 3.60066902470456e-06, + "loss": 0.7506, + "step": 23440 + }, + { + "epoch": 0.6412272186598488, + "grad_norm": 2.765625, + "learning_rate": 3.5979271201776753e-06, + "loss": 0.7083, + "step": 23450 + }, + { + "epoch": 0.6415006631027741, + "grad_norm": 2.953125, + "learning_rate": 3.5951852156507917e-06, + "loss": 0.6915, + "step": 23460 + }, + { + "epoch": 0.6417741075456994, + "grad_norm": 3.296875, + "learning_rate": 3.592443311123907e-06, + "loss": 0.6901, + "step": 23470 + }, + { + "epoch": 0.6420475519886247, + "grad_norm": 2.609375, + "learning_rate": 3.5897014065970226e-06, + "loss": 0.6709, + "step": 23480 + }, + { + "epoch": 0.64232099643155, + "grad_norm": 2.671875, + "learning_rate": 3.586959502070138e-06, + "loss": 0.7622, + "step": 23490 + }, + { + "epoch": 0.6425944408744754, + "grad_norm": 2.96875, + "learning_rate": 3.5842175975432536e-06, + "loss": 0.7031, + "step": 23500 + }, + { + "epoch": 0.6428678853174007, + "grad_norm": 3.421875, + "learning_rate": 3.5814756930163695e-06, + "loss": 0.8086, + "step": 23510 + }, + { + "epoch": 0.6431413297603259, + "grad_norm": 2.65625, + "learning_rate": 3.578733788489485e-06, + "loss": 0.7047, + "step": 23520 + }, + { + "epoch": 0.6434147742032512, + "grad_norm": 2.546875, + "learning_rate": 3.5759918839626005e-06, + "loss": 0.6328, + "step": 23530 + }, + { + "epoch": 0.6436882186461765, + "grad_norm": 3.28125, + "learning_rate": 3.5732499794357164e-06, + "loss": 0.6807, + "step": 23540 + }, + { + "epoch": 0.6439616630891019, + "grad_norm": 2.765625, + "learning_rate": 3.5705080749088323e-06, + "loss": 0.7675, + "step": 23550 + }, + { + "epoch": 0.6442351075320272, + "grad_norm": 2.796875, + "learning_rate": 3.5677661703819478e-06, + "loss": 0.7683, + "step": 23560 + }, + { + "epoch": 0.6445085519749525, + "grad_norm": 2.90625, + "learning_rate": 3.5650242658550632e-06, + "loss": 0.7118, + "step": 23570 + }, + { + "epoch": 0.6447819964178778, + "grad_norm": 3.015625, + "learning_rate": 3.5622823613281787e-06, + "loss": 0.733, + "step": 23580 + }, + { + "epoch": 0.6450554408608031, + "grad_norm": 2.703125, + "learning_rate": 3.559540456801294e-06, + "loss": 0.7149, + "step": 23590 + }, + { + "epoch": 0.6453288853037285, + "grad_norm": 2.375, + "learning_rate": 3.55679855227441e-06, + "loss": 0.718, + "step": 23600 + }, + { + "epoch": 0.6456023297466538, + "grad_norm": 2.890625, + "learning_rate": 3.5540566477475256e-06, + "loss": 0.6758, + "step": 23610 + }, + { + "epoch": 0.645875774189579, + "grad_norm": 2.78125, + "learning_rate": 3.5513147432206415e-06, + "loss": 0.745, + "step": 23620 + }, + { + "epoch": 0.6461492186325043, + "grad_norm": 3.0, + "learning_rate": 3.548572838693757e-06, + "loss": 0.6931, + "step": 23630 + }, + { + "epoch": 0.6464226630754296, + "grad_norm": 3.25, + "learning_rate": 3.545830934166873e-06, + "loss": 0.7065, + "step": 23640 + }, + { + "epoch": 0.6466961075183549, + "grad_norm": 2.8125, + "learning_rate": 3.5430890296399884e-06, + "loss": 0.7518, + "step": 23650 + }, + { + "epoch": 0.6469695519612803, + "grad_norm": 2.359375, + "learning_rate": 3.540347125113104e-06, + "loss": 0.7066, + "step": 23660 + }, + { + "epoch": 0.6472429964042056, + "grad_norm": 2.640625, + "learning_rate": 3.5376052205862193e-06, + "loss": 0.6887, + "step": 23670 + }, + { + "epoch": 0.6475164408471309, + "grad_norm": 2.703125, + "learning_rate": 3.534863316059335e-06, + "loss": 0.6952, + "step": 23680 + }, + { + "epoch": 0.6477898852900562, + "grad_norm": 2.625, + "learning_rate": 3.5321214115324507e-06, + "loss": 0.6967, + "step": 23690 + }, + { + "epoch": 0.6480633297329815, + "grad_norm": 2.625, + "learning_rate": 3.5293795070055666e-06, + "loss": 0.6667, + "step": 23700 + }, + { + "epoch": 0.6483367741759068, + "grad_norm": 2.671875, + "learning_rate": 3.526637602478682e-06, + "loss": 0.6994, + "step": 23710 + }, + { + "epoch": 0.6486102186188322, + "grad_norm": 3.25, + "learning_rate": 3.5238956979517976e-06, + "loss": 0.7374, + "step": 23720 + }, + { + "epoch": 0.6488836630617574, + "grad_norm": 3.234375, + "learning_rate": 3.5211537934249135e-06, + "loss": 0.6126, + "step": 23730 + }, + { + "epoch": 0.6491571075046827, + "grad_norm": 3.078125, + "learning_rate": 3.518411888898029e-06, + "loss": 0.7083, + "step": 23740 + }, + { + "epoch": 0.649430551947608, + "grad_norm": 3.125, + "learning_rate": 3.5156699843711445e-06, + "loss": 0.7015, + "step": 23750 + }, + { + "epoch": 0.6497039963905333, + "grad_norm": 2.546875, + "learning_rate": 3.51292807984426e-06, + "loss": 0.6931, + "step": 23760 + }, + { + "epoch": 0.6499774408334587, + "grad_norm": 2.46875, + "learning_rate": 3.5101861753173754e-06, + "loss": 0.7422, + "step": 23770 + }, + { + "epoch": 0.650250885276384, + "grad_norm": 3.109375, + "learning_rate": 3.5074442707904917e-06, + "loss": 0.6667, + "step": 23780 + }, + { + "epoch": 0.6505243297193093, + "grad_norm": 2.890625, + "learning_rate": 3.5047023662636072e-06, + "loss": 0.7285, + "step": 23790 + }, + { + "epoch": 0.6507977741622346, + "grad_norm": 2.84375, + "learning_rate": 3.5019604617367227e-06, + "loss": 0.6477, + "step": 23800 + }, + { + "epoch": 0.6510712186051599, + "grad_norm": 3.3125, + "learning_rate": 3.499218557209838e-06, + "loss": 0.7161, + "step": 23810 + }, + { + "epoch": 0.6513446630480852, + "grad_norm": 2.59375, + "learning_rate": 3.496476652682954e-06, + "loss": 0.7416, + "step": 23820 + }, + { + "epoch": 0.6516181074910106, + "grad_norm": 2.9375, + "learning_rate": 3.4937347481560696e-06, + "loss": 0.6478, + "step": 23830 + }, + { + "epoch": 0.6518915519339358, + "grad_norm": 3.15625, + "learning_rate": 3.490992843629185e-06, + "loss": 0.7007, + "step": 23840 + }, + { + "epoch": 0.6521649963768611, + "grad_norm": 2.75, + "learning_rate": 3.4882509391023005e-06, + "loss": 0.7133, + "step": 23850 + }, + { + "epoch": 0.6524384408197864, + "grad_norm": 3.390625, + "learning_rate": 3.485509034575416e-06, + "loss": 0.7781, + "step": 23860 + }, + { + "epoch": 0.6527118852627117, + "grad_norm": 3.0625, + "learning_rate": 3.4827671300485324e-06, + "loss": 0.7424, + "step": 23870 + }, + { + "epoch": 0.652985329705637, + "grad_norm": 3.3125, + "learning_rate": 3.480025225521648e-06, + "loss": 0.7535, + "step": 23880 + }, + { + "epoch": 0.6532587741485624, + "grad_norm": 2.8125, + "learning_rate": 3.4772833209947633e-06, + "loss": 0.6907, + "step": 23890 + }, + { + "epoch": 0.6535322185914877, + "grad_norm": 2.515625, + "learning_rate": 3.474541416467879e-06, + "loss": 0.7119, + "step": 23900 + }, + { + "epoch": 0.653805663034413, + "grad_norm": 2.609375, + "learning_rate": 3.4717995119409947e-06, + "loss": 0.7625, + "step": 23910 + }, + { + "epoch": 0.6540791074773383, + "grad_norm": 2.8125, + "learning_rate": 3.46905760741411e-06, + "loss": 0.7243, + "step": 23920 + }, + { + "epoch": 0.6543525519202636, + "grad_norm": 2.75, + "learning_rate": 3.4663157028872257e-06, + "loss": 0.6832, + "step": 23930 + }, + { + "epoch": 0.6546259963631889, + "grad_norm": 3.0, + "learning_rate": 3.463573798360341e-06, + "loss": 0.73, + "step": 23940 + }, + { + "epoch": 0.6548994408061142, + "grad_norm": 2.265625, + "learning_rate": 3.4608318938334566e-06, + "loss": 0.6125, + "step": 23950 + }, + { + "epoch": 0.6551728852490395, + "grad_norm": 2.671875, + "learning_rate": 3.458089989306573e-06, + "loss": 0.7195, + "step": 23960 + }, + { + "epoch": 0.6554463296919648, + "grad_norm": 2.65625, + "learning_rate": 3.4553480847796884e-06, + "loss": 0.6719, + "step": 23970 + }, + { + "epoch": 0.6557197741348901, + "grad_norm": 2.828125, + "learning_rate": 3.452606180252804e-06, + "loss": 0.7712, + "step": 23980 + }, + { + "epoch": 0.6559932185778155, + "grad_norm": 2.90625, + "learning_rate": 3.4498642757259194e-06, + "loss": 0.7246, + "step": 23990 + }, + { + "epoch": 0.6562666630207408, + "grad_norm": 2.6875, + "learning_rate": 3.4471223711990353e-06, + "loss": 0.734, + "step": 24000 + }, + { + "epoch": 0.6565401074636661, + "grad_norm": 2.15625, + "learning_rate": 3.444380466672151e-06, + "loss": 0.6182, + "step": 24010 + }, + { + "epoch": 0.6568135519065914, + "grad_norm": 2.765625, + "learning_rate": 3.4416385621452663e-06, + "loss": 0.6635, + "step": 24020 + }, + { + "epoch": 0.6570869963495167, + "grad_norm": 2.34375, + "learning_rate": 3.4388966576183818e-06, + "loss": 0.7769, + "step": 24030 + }, + { + "epoch": 0.657360440792442, + "grad_norm": 2.640625, + "learning_rate": 3.4361547530914972e-06, + "loss": 0.7696, + "step": 24040 + }, + { + "epoch": 0.6576338852353673, + "grad_norm": 2.546875, + "learning_rate": 3.4334128485646136e-06, + "loss": 0.6662, + "step": 24050 + }, + { + "epoch": 0.6579073296782926, + "grad_norm": 2.953125, + "learning_rate": 3.430670944037729e-06, + "loss": 0.6823, + "step": 24060 + }, + { + "epoch": 0.6581807741212179, + "grad_norm": 2.75, + "learning_rate": 3.4279290395108445e-06, + "loss": 0.7321, + "step": 24070 + }, + { + "epoch": 0.6584542185641432, + "grad_norm": 2.640625, + "learning_rate": 3.42518713498396e-06, + "loss": 0.6752, + "step": 24080 + }, + { + "epoch": 0.6587276630070685, + "grad_norm": 2.765625, + "learning_rate": 3.422445230457076e-06, + "loss": 0.6701, + "step": 24090 + }, + { + "epoch": 0.6590011074499939, + "grad_norm": 3.53125, + "learning_rate": 3.4197033259301914e-06, + "loss": 0.7158, + "step": 24100 + }, + { + "epoch": 0.6592745518929192, + "grad_norm": 3.03125, + "learning_rate": 3.416961421403307e-06, + "loss": 0.728, + "step": 24110 + }, + { + "epoch": 0.6595479963358445, + "grad_norm": 2.921875, + "learning_rate": 3.4142195168764224e-06, + "loss": 0.7289, + "step": 24120 + }, + { + "epoch": 0.6598214407787698, + "grad_norm": 2.953125, + "learning_rate": 3.411477612349538e-06, + "loss": 0.7723, + "step": 24130 + }, + { + "epoch": 0.6600948852216951, + "grad_norm": 2.671875, + "learning_rate": 3.408735707822654e-06, + "loss": 0.8101, + "step": 24140 + }, + { + "epoch": 0.6603683296646204, + "grad_norm": 2.6875, + "learning_rate": 3.4059938032957697e-06, + "loss": 0.6973, + "step": 24150 + }, + { + "epoch": 0.6606417741075457, + "grad_norm": 2.921875, + "learning_rate": 3.403251898768885e-06, + "loss": 0.6472, + "step": 24160 + }, + { + "epoch": 0.660915218550471, + "grad_norm": 2.59375, + "learning_rate": 3.4005099942420006e-06, + "loss": 0.7256, + "step": 24170 + }, + { + "epoch": 0.6611886629933963, + "grad_norm": 2.921875, + "learning_rate": 3.3977680897151165e-06, + "loss": 0.7473, + "step": 24180 + }, + { + "epoch": 0.6614621074363216, + "grad_norm": 3.359375, + "learning_rate": 3.395026185188232e-06, + "loss": 0.7221, + "step": 24190 + }, + { + "epoch": 0.6617355518792469, + "grad_norm": 3.078125, + "learning_rate": 3.3922842806613475e-06, + "loss": 0.732, + "step": 24200 + }, + { + "epoch": 0.6620089963221722, + "grad_norm": 2.8125, + "learning_rate": 3.389542376134463e-06, + "loss": 0.7411, + "step": 24210 + }, + { + "epoch": 0.6622824407650976, + "grad_norm": 3.015625, + "learning_rate": 3.3868004716075785e-06, + "loss": 0.639, + "step": 24220 + }, + { + "epoch": 0.6625558852080229, + "grad_norm": 2.78125, + "learning_rate": 3.384058567080695e-06, + "loss": 0.6492, + "step": 24230 + }, + { + "epoch": 0.6628293296509482, + "grad_norm": 2.640625, + "learning_rate": 3.3813166625538103e-06, + "loss": 0.6646, + "step": 24240 + }, + { + "epoch": 0.6631027740938735, + "grad_norm": 2.5, + "learning_rate": 3.3785747580269258e-06, + "loss": 0.6614, + "step": 24250 + }, + { + "epoch": 0.6633762185367987, + "grad_norm": 2.828125, + "learning_rate": 3.3758328535000412e-06, + "loss": 0.6843, + "step": 24260 + }, + { + "epoch": 0.663649662979724, + "grad_norm": 2.828125, + "learning_rate": 3.373090948973157e-06, + "loss": 0.6717, + "step": 24270 + }, + { + "epoch": 0.6639231074226494, + "grad_norm": 3.203125, + "learning_rate": 3.3703490444462726e-06, + "loss": 0.7756, + "step": 24280 + }, + { + "epoch": 0.6641965518655747, + "grad_norm": 3.09375, + "learning_rate": 3.367607139919388e-06, + "loss": 0.6996, + "step": 24290 + }, + { + "epoch": 0.6644699963085, + "grad_norm": 3.296875, + "learning_rate": 3.3648652353925036e-06, + "loss": 0.7749, + "step": 24300 + }, + { + "epoch": 0.6647434407514253, + "grad_norm": 2.546875, + "learning_rate": 3.362123330865619e-06, + "loss": 0.6611, + "step": 24310 + }, + { + "epoch": 0.6650168851943506, + "grad_norm": 2.6875, + "learning_rate": 3.3593814263387354e-06, + "loss": 0.734, + "step": 24320 + }, + { + "epoch": 0.665290329637276, + "grad_norm": 3.03125, + "learning_rate": 3.356639521811851e-06, + "loss": 0.678, + "step": 24330 + }, + { + "epoch": 0.6655637740802013, + "grad_norm": 2.65625, + "learning_rate": 3.3538976172849664e-06, + "loss": 0.7496, + "step": 24340 + }, + { + "epoch": 0.6658372185231266, + "grad_norm": 2.59375, + "learning_rate": 3.351155712758082e-06, + "loss": 0.6766, + "step": 24350 + }, + { + "epoch": 0.6661106629660519, + "grad_norm": 2.296875, + "learning_rate": 3.3484138082311977e-06, + "loss": 0.6793, + "step": 24360 + }, + { + "epoch": 0.6663841074089771, + "grad_norm": 3.09375, + "learning_rate": 3.3456719037043132e-06, + "loss": 0.7391, + "step": 24370 + }, + { + "epoch": 0.6666575518519025, + "grad_norm": 9.0, + "learning_rate": 3.3429299991774287e-06, + "loss": 0.6607, + "step": 24380 + }, + { + "epoch": 0.6669309962948278, + "grad_norm": 2.890625, + "learning_rate": 3.340188094650544e-06, + "loss": 0.7463, + "step": 24390 + }, + { + "epoch": 0.6672044407377531, + "grad_norm": 3.453125, + "learning_rate": 3.33744619012366e-06, + "loss": 0.7348, + "step": 24400 + }, + { + "epoch": 0.6674778851806784, + "grad_norm": 3.078125, + "learning_rate": 3.334704285596776e-06, + "loss": 0.7383, + "step": 24410 + }, + { + "epoch": 0.6677513296236037, + "grad_norm": 2.671875, + "learning_rate": 3.3319623810698915e-06, + "loss": 0.7071, + "step": 24420 + }, + { + "epoch": 0.668024774066529, + "grad_norm": 2.828125, + "learning_rate": 3.329220476543007e-06, + "loss": 0.6869, + "step": 24430 + }, + { + "epoch": 0.6682982185094544, + "grad_norm": 2.65625, + "learning_rate": 3.3264785720161225e-06, + "loss": 0.7147, + "step": 24440 + }, + { + "epoch": 0.6685716629523797, + "grad_norm": 2.8125, + "learning_rate": 3.3237366674892384e-06, + "loss": 0.7004, + "step": 24450 + }, + { + "epoch": 0.668845107395305, + "grad_norm": 2.453125, + "learning_rate": 3.320994762962354e-06, + "loss": 0.6808, + "step": 24460 + }, + { + "epoch": 0.6691185518382302, + "grad_norm": 3.359375, + "learning_rate": 3.3182528584354693e-06, + "loss": 0.7618, + "step": 24470 + }, + { + "epoch": 0.6693919962811555, + "grad_norm": 2.75, + "learning_rate": 3.3155109539085852e-06, + "loss": 0.7601, + "step": 24480 + }, + { + "epoch": 0.6696654407240809, + "grad_norm": 2.78125, + "learning_rate": 3.312769049381701e-06, + "loss": 0.7842, + "step": 24490 + }, + { + "epoch": 0.6699388851670062, + "grad_norm": 2.578125, + "learning_rate": 3.3100271448548166e-06, + "loss": 0.6621, + "step": 24500 + }, + { + "epoch": 0.6702123296099315, + "grad_norm": 2.734375, + "learning_rate": 3.307285240327932e-06, + "loss": 0.7139, + "step": 24510 + }, + { + "epoch": 0.6704857740528568, + "grad_norm": 2.609375, + "learning_rate": 3.3045433358010476e-06, + "loss": 0.6101, + "step": 24520 + }, + { + "epoch": 0.6707592184957821, + "grad_norm": 2.375, + "learning_rate": 3.301801431274163e-06, + "loss": 0.6756, + "step": 24530 + }, + { + "epoch": 0.6710326629387074, + "grad_norm": 2.671875, + "learning_rate": 3.299059526747279e-06, + "loss": 0.6592, + "step": 24540 + }, + { + "epoch": 0.6713061073816328, + "grad_norm": 2.890625, + "learning_rate": 3.2963176222203944e-06, + "loss": 0.7061, + "step": 24550 + }, + { + "epoch": 0.6715795518245581, + "grad_norm": 2.84375, + "learning_rate": 3.2935757176935104e-06, + "loss": 0.6514, + "step": 24560 + }, + { + "epoch": 0.6718529962674834, + "grad_norm": 2.640625, + "learning_rate": 3.290833813166626e-06, + "loss": 0.7673, + "step": 24570 + }, + { + "epoch": 0.6721264407104086, + "grad_norm": 2.34375, + "learning_rate": 3.2880919086397417e-06, + "loss": 0.6626, + "step": 24580 + }, + { + "epoch": 0.6723998851533339, + "grad_norm": 3.078125, + "learning_rate": 3.2853500041128572e-06, + "loss": 0.6736, + "step": 24590 + }, + { + "epoch": 0.6726733295962593, + "grad_norm": 2.765625, + "learning_rate": 3.2826080995859727e-06, + "loss": 0.7229, + "step": 24600 + }, + { + "epoch": 0.6729467740391846, + "grad_norm": 2.84375, + "learning_rate": 3.279866195059088e-06, + "loss": 0.6412, + "step": 24610 + }, + { + "epoch": 0.6732202184821099, + "grad_norm": 2.96875, + "learning_rate": 3.2771242905322037e-06, + "loss": 0.631, + "step": 24620 + }, + { + "epoch": 0.6734936629250352, + "grad_norm": 2.921875, + "learning_rate": 3.2743823860053196e-06, + "loss": 0.7024, + "step": 24630 + }, + { + "epoch": 0.6737671073679605, + "grad_norm": 2.96875, + "learning_rate": 3.2716404814784355e-06, + "loss": 0.698, + "step": 24640 + }, + { + "epoch": 0.6740405518108858, + "grad_norm": 2.75, + "learning_rate": 3.268898576951551e-06, + "loss": 0.7555, + "step": 24650 + }, + { + "epoch": 0.6743139962538112, + "grad_norm": 2.6875, + "learning_rate": 3.2661566724246664e-06, + "loss": 0.6749, + "step": 24660 + }, + { + "epoch": 0.6745874406967365, + "grad_norm": 2.75, + "learning_rate": 3.2634147678977823e-06, + "loss": 0.68, + "step": 24670 + }, + { + "epoch": 0.6748608851396618, + "grad_norm": 2.84375, + "learning_rate": 3.260672863370898e-06, + "loss": 0.7131, + "step": 24680 + }, + { + "epoch": 0.675134329582587, + "grad_norm": 3.046875, + "learning_rate": 3.2579309588440133e-06, + "loss": 0.7191, + "step": 24690 + }, + { + "epoch": 0.6754077740255123, + "grad_norm": 2.734375, + "learning_rate": 3.255189054317129e-06, + "loss": 0.7465, + "step": 24700 + }, + { + "epoch": 0.6756812184684376, + "grad_norm": 2.828125, + "learning_rate": 3.2524471497902443e-06, + "loss": 0.6861, + "step": 24710 + }, + { + "epoch": 0.675954662911363, + "grad_norm": 3.078125, + "learning_rate": 3.2497052452633606e-06, + "loss": 0.6912, + "step": 24720 + }, + { + "epoch": 0.6762281073542883, + "grad_norm": 2.671875, + "learning_rate": 3.246963340736476e-06, + "loss": 0.7229, + "step": 24730 + }, + { + "epoch": 0.6765015517972136, + "grad_norm": 3.0625, + "learning_rate": 3.2442214362095916e-06, + "loss": 0.7592, + "step": 24740 + }, + { + "epoch": 0.6767749962401389, + "grad_norm": 3.015625, + "learning_rate": 3.241479531682707e-06, + "loss": 0.712, + "step": 24750 + }, + { + "epoch": 0.6770484406830642, + "grad_norm": 2.546875, + "learning_rate": 3.238737627155823e-06, + "loss": 0.6363, + "step": 24760 + }, + { + "epoch": 0.6773218851259896, + "grad_norm": 3.21875, + "learning_rate": 3.2359957226289384e-06, + "loss": 0.7003, + "step": 24770 + }, + { + "epoch": 0.6775953295689149, + "grad_norm": 2.953125, + "learning_rate": 3.233253818102054e-06, + "loss": 0.7144, + "step": 24780 + }, + { + "epoch": 0.6778687740118401, + "grad_norm": 2.765625, + "learning_rate": 3.2305119135751694e-06, + "loss": 0.7053, + "step": 24790 + }, + { + "epoch": 0.6781422184547654, + "grad_norm": 2.9375, + "learning_rate": 3.227770009048285e-06, + "loss": 0.62, + "step": 24800 + }, + { + "epoch": 0.6784156628976907, + "grad_norm": 2.34375, + "learning_rate": 3.2250281045214012e-06, + "loss": 0.6657, + "step": 24810 + }, + { + "epoch": 0.678689107340616, + "grad_norm": 2.859375, + "learning_rate": 3.2222861999945167e-06, + "loss": 0.7047, + "step": 24820 + }, + { + "epoch": 0.6789625517835414, + "grad_norm": 2.703125, + "learning_rate": 3.219544295467632e-06, + "loss": 0.6404, + "step": 24830 + }, + { + "epoch": 0.6792359962264667, + "grad_norm": 2.953125, + "learning_rate": 3.2168023909407477e-06, + "loss": 0.6494, + "step": 24840 + }, + { + "epoch": 0.679509440669392, + "grad_norm": 2.875, + "learning_rate": 3.2140604864138636e-06, + "loss": 0.6778, + "step": 24850 + }, + { + "epoch": 0.6797828851123173, + "grad_norm": 3.046875, + "learning_rate": 3.211318581886979e-06, + "loss": 0.7099, + "step": 24860 + }, + { + "epoch": 0.6800563295552426, + "grad_norm": 2.71875, + "learning_rate": 3.2085766773600945e-06, + "loss": 0.7202, + "step": 24870 + }, + { + "epoch": 0.680329773998168, + "grad_norm": 2.703125, + "learning_rate": 3.20583477283321e-06, + "loss": 0.7683, + "step": 24880 + }, + { + "epoch": 0.6806032184410933, + "grad_norm": 2.671875, + "learning_rate": 3.2030928683063255e-06, + "loss": 0.6545, + "step": 24890 + }, + { + "epoch": 0.6808766628840185, + "grad_norm": 2.46875, + "learning_rate": 3.200350963779442e-06, + "loss": 0.7011, + "step": 24900 + }, + { + "epoch": 0.6811501073269438, + "grad_norm": 2.546875, + "learning_rate": 3.1976090592525573e-06, + "loss": 0.6686, + "step": 24910 + }, + { + "epoch": 0.6814235517698691, + "grad_norm": 2.8125, + "learning_rate": 3.1948671547256728e-06, + "loss": 0.6578, + "step": 24920 + }, + { + "epoch": 0.6816969962127944, + "grad_norm": 2.546875, + "learning_rate": 3.1921252501987883e-06, + "loss": 0.6826, + "step": 24930 + }, + { + "epoch": 0.6819704406557198, + "grad_norm": 2.875, + "learning_rate": 3.189383345671904e-06, + "loss": 0.7305, + "step": 24940 + }, + { + "epoch": 0.6822438850986451, + "grad_norm": 3.375, + "learning_rate": 3.1866414411450197e-06, + "loss": 0.781, + "step": 24950 + }, + { + "epoch": 0.6825173295415704, + "grad_norm": 2.5625, + "learning_rate": 3.183899536618135e-06, + "loss": 0.7408, + "step": 24960 + }, + { + "epoch": 0.6827907739844957, + "grad_norm": 2.671875, + "learning_rate": 3.1811576320912506e-06, + "loss": 0.6196, + "step": 24970 + }, + { + "epoch": 0.683064218427421, + "grad_norm": 2.96875, + "learning_rate": 3.178415727564366e-06, + "loss": 0.7858, + "step": 24980 + }, + { + "epoch": 0.6833376628703464, + "grad_norm": 2.8125, + "learning_rate": 3.1756738230374824e-06, + "loss": 0.7179, + "step": 24990 + }, + { + "epoch": 0.6836111073132716, + "grad_norm": 2.609375, + "learning_rate": 3.172931918510598e-06, + "loss": 0.6825, + "step": 25000 + }, + { + "epoch": 0.6838845517561969, + "grad_norm": 2.59375, + "learning_rate": 3.1701900139837134e-06, + "loss": 0.6931, + "step": 25010 + }, + { + "epoch": 0.6841579961991222, + "grad_norm": 2.890625, + "learning_rate": 3.167448109456829e-06, + "loss": 0.6828, + "step": 25020 + }, + { + "epoch": 0.6844314406420475, + "grad_norm": 2.90625, + "learning_rate": 3.1647062049299448e-06, + "loss": 0.7768, + "step": 25030 + }, + { + "epoch": 0.6847048850849728, + "grad_norm": 2.96875, + "learning_rate": 3.1619643004030603e-06, + "loss": 0.7115, + "step": 25040 + }, + { + "epoch": 0.6849783295278982, + "grad_norm": 2.53125, + "learning_rate": 3.1592223958761757e-06, + "loss": 0.702, + "step": 25050 + }, + { + "epoch": 0.6852517739708235, + "grad_norm": 2.484375, + "learning_rate": 3.1564804913492912e-06, + "loss": 0.6722, + "step": 25060 + }, + { + "epoch": 0.6855252184137488, + "grad_norm": 2.53125, + "learning_rate": 3.1537385868224067e-06, + "loss": 0.7131, + "step": 25070 + }, + { + "epoch": 0.6857986628566741, + "grad_norm": 3.125, + "learning_rate": 3.150996682295523e-06, + "loss": 0.6656, + "step": 25080 + }, + { + "epoch": 0.6860721072995994, + "grad_norm": 2.40625, + "learning_rate": 3.1482547777686385e-06, + "loss": 0.6308, + "step": 25090 + }, + { + "epoch": 0.6863455517425248, + "grad_norm": 3.15625, + "learning_rate": 3.145512873241754e-06, + "loss": 0.6882, + "step": 25100 + }, + { + "epoch": 0.68661899618545, + "grad_norm": 2.84375, + "learning_rate": 3.1427709687148695e-06, + "loss": 0.6682, + "step": 25110 + }, + { + "epoch": 0.6868924406283753, + "grad_norm": 2.6875, + "learning_rate": 3.1400290641879854e-06, + "loss": 0.7355, + "step": 25120 + }, + { + "epoch": 0.6871658850713006, + "grad_norm": 2.90625, + "learning_rate": 3.137287159661101e-06, + "loss": 0.7702, + "step": 25130 + }, + { + "epoch": 0.6874393295142259, + "grad_norm": 2.421875, + "learning_rate": 3.1345452551342164e-06, + "loss": 0.7137, + "step": 25140 + }, + { + "epoch": 0.6877127739571512, + "grad_norm": 2.484375, + "learning_rate": 3.131803350607332e-06, + "loss": 0.5851, + "step": 25150 + }, + { + "epoch": 0.6879862184000766, + "grad_norm": 2.6875, + "learning_rate": 3.1290614460804473e-06, + "loss": 0.6763, + "step": 25160 + }, + { + "epoch": 0.6882596628430019, + "grad_norm": 2.515625, + "learning_rate": 3.1263195415535636e-06, + "loss": 0.6181, + "step": 25170 + }, + { + "epoch": 0.6885331072859272, + "grad_norm": 2.90625, + "learning_rate": 3.123577637026679e-06, + "loss": 0.6899, + "step": 25180 + }, + { + "epoch": 0.6888065517288525, + "grad_norm": 2.59375, + "learning_rate": 3.1208357324997946e-06, + "loss": 0.6408, + "step": 25190 + }, + { + "epoch": 0.6890799961717778, + "grad_norm": 2.75, + "learning_rate": 3.11809382797291e-06, + "loss": 0.763, + "step": 25200 + }, + { + "epoch": 0.6893534406147032, + "grad_norm": 2.75, + "learning_rate": 3.115351923446026e-06, + "loss": 0.6963, + "step": 25210 + }, + { + "epoch": 0.6896268850576284, + "grad_norm": 2.703125, + "learning_rate": 3.1126100189191415e-06, + "loss": 0.695, + "step": 25220 + }, + { + "epoch": 0.6899003295005537, + "grad_norm": 2.734375, + "learning_rate": 3.109868114392257e-06, + "loss": 0.7055, + "step": 25230 + }, + { + "epoch": 0.690173773943479, + "grad_norm": 3.546875, + "learning_rate": 3.1071262098653724e-06, + "loss": 0.7645, + "step": 25240 + }, + { + "epoch": 0.6904472183864043, + "grad_norm": 2.609375, + "learning_rate": 3.104384305338488e-06, + "loss": 0.7013, + "step": 25250 + }, + { + "epoch": 0.6907206628293296, + "grad_norm": 2.859375, + "learning_rate": 3.1016424008116043e-06, + "loss": 0.7244, + "step": 25260 + }, + { + "epoch": 0.690994107272255, + "grad_norm": 2.921875, + "learning_rate": 3.0989004962847197e-06, + "loss": 0.6992, + "step": 25270 + }, + { + "epoch": 0.6912675517151803, + "grad_norm": 2.765625, + "learning_rate": 3.0961585917578352e-06, + "loss": 0.7223, + "step": 25280 + }, + { + "epoch": 0.6915409961581056, + "grad_norm": 2.90625, + "learning_rate": 3.0934166872309507e-06, + "loss": 0.6981, + "step": 25290 + }, + { + "epoch": 0.6918144406010309, + "grad_norm": 2.984375, + "learning_rate": 3.0906747827040666e-06, + "loss": 0.7413, + "step": 25300 + }, + { + "epoch": 0.6920878850439562, + "grad_norm": 2.84375, + "learning_rate": 3.087932878177182e-06, + "loss": 0.7292, + "step": 25310 + }, + { + "epoch": 0.6923613294868814, + "grad_norm": 2.890625, + "learning_rate": 3.0851909736502976e-06, + "loss": 0.7068, + "step": 25320 + }, + { + "epoch": 0.6926347739298068, + "grad_norm": 3.296875, + "learning_rate": 3.082449069123413e-06, + "loss": 0.7829, + "step": 25330 + }, + { + "epoch": 0.6929082183727321, + "grad_norm": 3.203125, + "learning_rate": 3.0797071645965285e-06, + "loss": 0.761, + "step": 25340 + }, + { + "epoch": 0.6931816628156574, + "grad_norm": 2.640625, + "learning_rate": 3.076965260069645e-06, + "loss": 0.7247, + "step": 25350 + }, + { + "epoch": 0.6934551072585827, + "grad_norm": 2.625, + "learning_rate": 3.0742233555427603e-06, + "loss": 0.7451, + "step": 25360 + }, + { + "epoch": 0.693728551701508, + "grad_norm": 2.765625, + "learning_rate": 3.071481451015876e-06, + "loss": 0.6505, + "step": 25370 + }, + { + "epoch": 0.6940019961444334, + "grad_norm": 2.703125, + "learning_rate": 3.0687395464889913e-06, + "loss": 0.6476, + "step": 25380 + }, + { + "epoch": 0.6942754405873587, + "grad_norm": 2.96875, + "learning_rate": 3.0659976419621072e-06, + "loss": 0.6482, + "step": 25390 + }, + { + "epoch": 0.694548885030284, + "grad_norm": 2.875, + "learning_rate": 3.0632557374352227e-06, + "loss": 0.8068, + "step": 25400 + }, + { + "epoch": 0.6948223294732093, + "grad_norm": 2.625, + "learning_rate": 3.060513832908338e-06, + "loss": 0.6539, + "step": 25410 + }, + { + "epoch": 0.6950957739161346, + "grad_norm": 2.921875, + "learning_rate": 3.0577719283814537e-06, + "loss": 0.6979, + "step": 25420 + }, + { + "epoch": 0.6953692183590598, + "grad_norm": 3.109375, + "learning_rate": 3.0550300238545696e-06, + "loss": 0.6004, + "step": 25430 + }, + { + "epoch": 0.6956426628019852, + "grad_norm": 2.640625, + "learning_rate": 3.0522881193276855e-06, + "loss": 0.7148, + "step": 25440 + }, + { + "epoch": 0.6959161072449105, + "grad_norm": 2.8125, + "learning_rate": 3.049546214800801e-06, + "loss": 0.7252, + "step": 25450 + }, + { + "epoch": 0.6961895516878358, + "grad_norm": 2.671875, + "learning_rate": 3.0468043102739164e-06, + "loss": 0.7433, + "step": 25460 + }, + { + "epoch": 0.6964629961307611, + "grad_norm": 2.96875, + "learning_rate": 3.044062405747032e-06, + "loss": 0.6297, + "step": 25470 + }, + { + "epoch": 0.6967364405736864, + "grad_norm": 2.609375, + "learning_rate": 3.041320501220148e-06, + "loss": 0.6938, + "step": 25480 + }, + { + "epoch": 0.6970098850166118, + "grad_norm": 3.125, + "learning_rate": 3.0385785966932633e-06, + "loss": 0.6621, + "step": 25490 + }, + { + "epoch": 0.6972833294595371, + "grad_norm": 2.921875, + "learning_rate": 3.0358366921663788e-06, + "loss": 0.6341, + "step": 25500 + }, + { + "epoch": 0.6975567739024624, + "grad_norm": 2.578125, + "learning_rate": 3.0330947876394947e-06, + "loss": 0.6799, + "step": 25510 + }, + { + "epoch": 0.6978302183453877, + "grad_norm": 2.5, + "learning_rate": 3.03035288311261e-06, + "loss": 0.7273, + "step": 25520 + }, + { + "epoch": 0.698103662788313, + "grad_norm": 3.015625, + "learning_rate": 3.027610978585726e-06, + "loss": 0.6838, + "step": 25530 + }, + { + "epoch": 0.6983771072312382, + "grad_norm": 2.734375, + "learning_rate": 3.0248690740588416e-06, + "loss": 0.6762, + "step": 25540 + }, + { + "epoch": 0.6986505516741636, + "grad_norm": 2.125, + "learning_rate": 3.022127169531957e-06, + "loss": 0.6554, + "step": 25550 + }, + { + "epoch": 0.6989239961170889, + "grad_norm": 2.59375, + "learning_rate": 3.0193852650050725e-06, + "loss": 0.665, + "step": 25560 + }, + { + "epoch": 0.6991974405600142, + "grad_norm": 2.734375, + "learning_rate": 3.0166433604781884e-06, + "loss": 0.6735, + "step": 25570 + }, + { + "epoch": 0.6994708850029395, + "grad_norm": 2.734375, + "learning_rate": 3.013901455951304e-06, + "loss": 0.7068, + "step": 25580 + }, + { + "epoch": 0.6997443294458648, + "grad_norm": 2.78125, + "learning_rate": 3.01115955142442e-06, + "loss": 0.6789, + "step": 25590 + }, + { + "epoch": 0.7000177738887902, + "grad_norm": 3.140625, + "learning_rate": 3.0084176468975353e-06, + "loss": 0.7074, + "step": 25600 + }, + { + "epoch": 0.7002912183317155, + "grad_norm": 3.015625, + "learning_rate": 3.0056757423706508e-06, + "loss": 0.6785, + "step": 25610 + }, + { + "epoch": 0.7005646627746408, + "grad_norm": 2.84375, + "learning_rate": 3.0029338378437667e-06, + "loss": 0.6724, + "step": 25620 + }, + { + "epoch": 0.7008381072175661, + "grad_norm": 2.953125, + "learning_rate": 3.000191933316882e-06, + "loss": 0.687, + "step": 25630 + }, + { + "epoch": 0.7011115516604913, + "grad_norm": 2.765625, + "learning_rate": 2.9974500287899977e-06, + "loss": 0.7426, + "step": 25640 + }, + { + "epoch": 0.7013849961034166, + "grad_norm": 2.8125, + "learning_rate": 2.994708124263113e-06, + "loss": 0.6782, + "step": 25650 + }, + { + "epoch": 0.701658440546342, + "grad_norm": 2.859375, + "learning_rate": 2.991966219736229e-06, + "loss": 0.6829, + "step": 25660 + }, + { + "epoch": 0.7019318849892673, + "grad_norm": 2.59375, + "learning_rate": 2.989224315209345e-06, + "loss": 0.6529, + "step": 25670 + }, + { + "epoch": 0.7022053294321926, + "grad_norm": 2.90625, + "learning_rate": 2.9864824106824604e-06, + "loss": 0.6697, + "step": 25680 + }, + { + "epoch": 0.7024787738751179, + "grad_norm": 3.03125, + "learning_rate": 2.983740506155576e-06, + "loss": 0.6845, + "step": 25690 + }, + { + "epoch": 0.7027522183180432, + "grad_norm": 2.890625, + "learning_rate": 2.9809986016286914e-06, + "loss": 0.7535, + "step": 25700 + }, + { + "epoch": 0.7030256627609686, + "grad_norm": 2.609375, + "learning_rate": 2.9782566971018073e-06, + "loss": 0.6802, + "step": 25710 + }, + { + "epoch": 0.7032991072038939, + "grad_norm": 3.125, + "learning_rate": 2.9755147925749228e-06, + "loss": 0.7259, + "step": 25720 + }, + { + "epoch": 0.7035725516468192, + "grad_norm": 2.9375, + "learning_rate": 2.9727728880480383e-06, + "loss": 0.6913, + "step": 25730 + }, + { + "epoch": 0.7038459960897445, + "grad_norm": 3.34375, + "learning_rate": 2.9700309835211537e-06, + "loss": 0.6758, + "step": 25740 + }, + { + "epoch": 0.7041194405326697, + "grad_norm": 3.15625, + "learning_rate": 2.96728907899427e-06, + "loss": 0.7054, + "step": 25750 + }, + { + "epoch": 0.704392884975595, + "grad_norm": 2.4375, + "learning_rate": 2.9645471744673856e-06, + "loss": 0.6368, + "step": 25760 + }, + { + "epoch": 0.7046663294185204, + "grad_norm": 3.28125, + "learning_rate": 2.961805269940501e-06, + "loss": 0.6907, + "step": 25770 + }, + { + "epoch": 0.7049397738614457, + "grad_norm": 2.546875, + "learning_rate": 2.9590633654136165e-06, + "loss": 0.6433, + "step": 25780 + }, + { + "epoch": 0.705213218304371, + "grad_norm": 3.140625, + "learning_rate": 2.9563214608867324e-06, + "loss": 0.7706, + "step": 25790 + }, + { + "epoch": 0.7054866627472963, + "grad_norm": 2.890625, + "learning_rate": 2.953579556359848e-06, + "loss": 0.6803, + "step": 25800 + }, + { + "epoch": 0.7057601071902216, + "grad_norm": 3.15625, + "learning_rate": 2.9508376518329634e-06, + "loss": 0.7153, + "step": 25810 + }, + { + "epoch": 0.706033551633147, + "grad_norm": 2.96875, + "learning_rate": 2.948095747306079e-06, + "loss": 0.6899, + "step": 25820 + }, + { + "epoch": 0.7063069960760723, + "grad_norm": 3.234375, + "learning_rate": 2.9453538427791944e-06, + "loss": 0.7005, + "step": 25830 + }, + { + "epoch": 0.7065804405189976, + "grad_norm": 2.71875, + "learning_rate": 2.9426119382523107e-06, + "loss": 0.7019, + "step": 25840 + }, + { + "epoch": 0.7068538849619228, + "grad_norm": 2.796875, + "learning_rate": 2.939870033725426e-06, + "loss": 0.7224, + "step": 25850 + }, + { + "epoch": 0.7071273294048481, + "grad_norm": 2.875, + "learning_rate": 2.9371281291985416e-06, + "loss": 0.6899, + "step": 25860 + }, + { + "epoch": 0.7074007738477734, + "grad_norm": 2.65625, + "learning_rate": 2.934386224671657e-06, + "loss": 0.6789, + "step": 25870 + }, + { + "epoch": 0.7076742182906988, + "grad_norm": 2.5625, + "learning_rate": 2.931644320144773e-06, + "loss": 0.6677, + "step": 25880 + }, + { + "epoch": 0.7079476627336241, + "grad_norm": 2.921875, + "learning_rate": 2.9289024156178885e-06, + "loss": 0.6555, + "step": 25890 + }, + { + "epoch": 0.7082211071765494, + "grad_norm": 2.953125, + "learning_rate": 2.926160511091004e-06, + "loss": 0.7629, + "step": 25900 + }, + { + "epoch": 0.7084945516194747, + "grad_norm": 3.921875, + "learning_rate": 2.9234186065641195e-06, + "loss": 0.6896, + "step": 25910 + }, + { + "epoch": 0.7087679960624, + "grad_norm": 2.359375, + "learning_rate": 2.920676702037235e-06, + "loss": 0.6545, + "step": 25920 + }, + { + "epoch": 0.7090414405053254, + "grad_norm": 2.890625, + "learning_rate": 2.9179347975103513e-06, + "loss": 0.6048, + "step": 25930 + }, + { + "epoch": 0.7093148849482507, + "grad_norm": 2.40625, + "learning_rate": 2.9151928929834668e-06, + "loss": 0.7316, + "step": 25940 + }, + { + "epoch": 0.709588329391176, + "grad_norm": 2.4375, + "learning_rate": 2.9124509884565823e-06, + "loss": 0.7251, + "step": 25950 + }, + { + "epoch": 0.7098617738341012, + "grad_norm": 3.015625, + "learning_rate": 2.9097090839296977e-06, + "loss": 0.7502, + "step": 25960 + }, + { + "epoch": 0.7101352182770265, + "grad_norm": 2.46875, + "learning_rate": 2.9069671794028136e-06, + "loss": 0.6837, + "step": 25970 + }, + { + "epoch": 0.7104086627199518, + "grad_norm": 2.765625, + "learning_rate": 2.904225274875929e-06, + "loss": 0.6053, + "step": 25980 + }, + { + "epoch": 0.7106821071628772, + "grad_norm": 2.78125, + "learning_rate": 2.9014833703490446e-06, + "loss": 0.7697, + "step": 25990 + }, + { + "epoch": 0.7109555516058025, + "grad_norm": 2.921875, + "learning_rate": 2.89874146582216e-06, + "loss": 0.6528, + "step": 26000 + }, + { + "epoch": 0.7112289960487278, + "grad_norm": 2.703125, + "learning_rate": 2.8959995612952756e-06, + "loss": 0.7213, + "step": 26010 + }, + { + "epoch": 0.7115024404916531, + "grad_norm": 3.046875, + "learning_rate": 2.893257656768392e-06, + "loss": 0.7147, + "step": 26020 + }, + { + "epoch": 0.7117758849345784, + "grad_norm": 2.796875, + "learning_rate": 2.8905157522415074e-06, + "loss": 0.7372, + "step": 26030 + }, + { + "epoch": 0.7120493293775038, + "grad_norm": 2.578125, + "learning_rate": 2.887773847714623e-06, + "loss": 0.6489, + "step": 26040 + }, + { + "epoch": 0.7123227738204291, + "grad_norm": 2.921875, + "learning_rate": 2.8850319431877383e-06, + "loss": 0.7039, + "step": 26050 + }, + { + "epoch": 0.7125962182633544, + "grad_norm": 2.8125, + "learning_rate": 2.8822900386608542e-06, + "loss": 0.7271, + "step": 26060 + }, + { + "epoch": 0.7128696627062796, + "grad_norm": 3.4375, + "learning_rate": 2.8795481341339697e-06, + "loss": 0.6807, + "step": 26070 + }, + { + "epoch": 0.7131431071492049, + "grad_norm": 2.53125, + "learning_rate": 2.876806229607085e-06, + "loss": 0.668, + "step": 26080 + }, + { + "epoch": 0.7134165515921302, + "grad_norm": 2.21875, + "learning_rate": 2.8740643250802007e-06, + "loss": 0.7168, + "step": 26090 + }, + { + "epoch": 0.7136899960350556, + "grad_norm": 2.890625, + "learning_rate": 2.871322420553316e-06, + "loss": 0.7389, + "step": 26100 + }, + { + "epoch": 0.7139634404779809, + "grad_norm": 2.578125, + "learning_rate": 2.8685805160264325e-06, + "loss": 0.5996, + "step": 26110 + }, + { + "epoch": 0.7142368849209062, + "grad_norm": 2.65625, + "learning_rate": 2.865838611499548e-06, + "loss": 0.7028, + "step": 26120 + }, + { + "epoch": 0.7145103293638315, + "grad_norm": 2.625, + "learning_rate": 2.8630967069726635e-06, + "loss": 0.6481, + "step": 26130 + }, + { + "epoch": 0.7147837738067568, + "grad_norm": 3.109375, + "learning_rate": 2.860354802445779e-06, + "loss": 0.7945, + "step": 26140 + }, + { + "epoch": 0.7150572182496822, + "grad_norm": 2.546875, + "learning_rate": 2.857612897918895e-06, + "loss": 0.6749, + "step": 26150 + }, + { + "epoch": 0.7153306626926075, + "grad_norm": 2.78125, + "learning_rate": 2.8548709933920103e-06, + "loss": 0.6878, + "step": 26160 + }, + { + "epoch": 0.7156041071355327, + "grad_norm": 2.609375, + "learning_rate": 2.852129088865126e-06, + "loss": 0.6528, + "step": 26170 + }, + { + "epoch": 0.715877551578458, + "grad_norm": 2.671875, + "learning_rate": 2.8493871843382413e-06, + "loss": 0.7319, + "step": 26180 + }, + { + "epoch": 0.7161509960213833, + "grad_norm": 2.59375, + "learning_rate": 2.8466452798113568e-06, + "loss": 0.6188, + "step": 26190 + }, + { + "epoch": 0.7164244404643086, + "grad_norm": 2.921875, + "learning_rate": 2.843903375284473e-06, + "loss": 0.7615, + "step": 26200 + }, + { + "epoch": 0.716697884907234, + "grad_norm": 2.765625, + "learning_rate": 2.8411614707575886e-06, + "loss": 0.65, + "step": 26210 + }, + { + "epoch": 0.7169713293501593, + "grad_norm": 2.984375, + "learning_rate": 2.838419566230704e-06, + "loss": 0.7356, + "step": 26220 + }, + { + "epoch": 0.7172447737930846, + "grad_norm": 2.859375, + "learning_rate": 2.8356776617038196e-06, + "loss": 0.6764, + "step": 26230 + }, + { + "epoch": 0.7175182182360099, + "grad_norm": 2.765625, + "learning_rate": 2.8329357571769355e-06, + "loss": 0.6949, + "step": 26240 + }, + { + "epoch": 0.7177916626789352, + "grad_norm": 2.453125, + "learning_rate": 2.830193852650051e-06, + "loss": 0.6178, + "step": 26250 + }, + { + "epoch": 0.7180651071218606, + "grad_norm": 2.53125, + "learning_rate": 2.8274519481231664e-06, + "loss": 0.6146, + "step": 26260 + }, + { + "epoch": 0.7183385515647859, + "grad_norm": 2.875, + "learning_rate": 2.824710043596282e-06, + "loss": 0.7565, + "step": 26270 + }, + { + "epoch": 0.7186119960077111, + "grad_norm": 2.34375, + "learning_rate": 2.8219681390693974e-06, + "loss": 0.6574, + "step": 26280 + }, + { + "epoch": 0.7188854404506364, + "grad_norm": 2.75, + "learning_rate": 2.8192262345425137e-06, + "loss": 0.6707, + "step": 26290 + }, + { + "epoch": 0.7191588848935617, + "grad_norm": 2.53125, + "learning_rate": 2.816484330015629e-06, + "loss": 0.7539, + "step": 26300 + }, + { + "epoch": 0.719432329336487, + "grad_norm": 2.359375, + "learning_rate": 2.8137424254887447e-06, + "loss": 0.6744, + "step": 26310 + }, + { + "epoch": 0.7197057737794124, + "grad_norm": 2.9375, + "learning_rate": 2.81100052096186e-06, + "loss": 0.6788, + "step": 26320 + }, + { + "epoch": 0.7199792182223377, + "grad_norm": 2.625, + "learning_rate": 2.808258616434976e-06, + "loss": 0.6764, + "step": 26330 + }, + { + "epoch": 0.720252662665263, + "grad_norm": 3.0, + "learning_rate": 2.8055167119080916e-06, + "loss": 0.7155, + "step": 26340 + }, + { + "epoch": 0.7205261071081883, + "grad_norm": 2.875, + "learning_rate": 2.802774807381207e-06, + "loss": 0.733, + "step": 26350 + }, + { + "epoch": 0.7207995515511136, + "grad_norm": 2.5, + "learning_rate": 2.8000329028543225e-06, + "loss": 0.6468, + "step": 26360 + }, + { + "epoch": 0.721072995994039, + "grad_norm": 3.0, + "learning_rate": 2.7972909983274384e-06, + "loss": 0.6206, + "step": 26370 + }, + { + "epoch": 0.7213464404369642, + "grad_norm": 3.21875, + "learning_rate": 2.7945490938005543e-06, + "loss": 0.7199, + "step": 26380 + }, + { + "epoch": 0.7216198848798895, + "grad_norm": 2.359375, + "learning_rate": 2.79180718927367e-06, + "loss": 0.6472, + "step": 26390 + }, + { + "epoch": 0.7218933293228148, + "grad_norm": 3.296875, + "learning_rate": 2.7890652847467853e-06, + "loss": 0.7543, + "step": 26400 + }, + { + "epoch": 0.7221667737657401, + "grad_norm": 3.328125, + "learning_rate": 2.7863233802199008e-06, + "loss": 0.7146, + "step": 26410 + }, + { + "epoch": 0.7224402182086654, + "grad_norm": 2.21875, + "learning_rate": 2.7835814756930167e-06, + "loss": 0.6151, + "step": 26420 + }, + { + "epoch": 0.7227136626515908, + "grad_norm": 3.359375, + "learning_rate": 2.780839571166132e-06, + "loss": 0.7219, + "step": 26430 + }, + { + "epoch": 0.7229871070945161, + "grad_norm": 2.6875, + "learning_rate": 2.7780976666392476e-06, + "loss": 0.75, + "step": 26440 + }, + { + "epoch": 0.7232605515374414, + "grad_norm": 2.5625, + "learning_rate": 2.7753557621123635e-06, + "loss": 0.7328, + "step": 26450 + }, + { + "epoch": 0.7235339959803667, + "grad_norm": 2.6875, + "learning_rate": 2.772613857585479e-06, + "loss": 0.6746, + "step": 26460 + }, + { + "epoch": 0.723807440423292, + "grad_norm": 2.40625, + "learning_rate": 2.769871953058595e-06, + "loss": 0.6206, + "step": 26470 + }, + { + "epoch": 0.7240808848662174, + "grad_norm": 2.703125, + "learning_rate": 2.7671300485317104e-06, + "loss": 0.7274, + "step": 26480 + }, + { + "epoch": 0.7243543293091426, + "grad_norm": 3.171875, + "learning_rate": 2.764388144004826e-06, + "loss": 0.7153, + "step": 26490 + }, + { + "epoch": 0.7246277737520679, + "grad_norm": 3.015625, + "learning_rate": 2.7616462394779414e-06, + "loss": 0.7339, + "step": 26500 + }, + { + "epoch": 0.7249012181949932, + "grad_norm": 2.484375, + "learning_rate": 2.7589043349510573e-06, + "loss": 0.6517, + "step": 26510 + }, + { + "epoch": 0.7251746626379185, + "grad_norm": 2.734375, + "learning_rate": 2.7561624304241728e-06, + "loss": 0.7495, + "step": 26520 + }, + { + "epoch": 0.7254481070808438, + "grad_norm": 3.46875, + "learning_rate": 2.7534205258972887e-06, + "loss": 0.755, + "step": 26530 + }, + { + "epoch": 0.7257215515237692, + "grad_norm": 3.9375, + "learning_rate": 2.750678621370404e-06, + "loss": 0.7211, + "step": 26540 + }, + { + "epoch": 0.7259949959666945, + "grad_norm": 3.15625, + "learning_rate": 2.7479367168435196e-06, + "loss": 0.707, + "step": 26550 + }, + { + "epoch": 0.7262684404096198, + "grad_norm": 2.90625, + "learning_rate": 2.7451948123166355e-06, + "loss": 0.6846, + "step": 26560 + }, + { + "epoch": 0.7265418848525451, + "grad_norm": 2.65625, + "learning_rate": 2.742452907789751e-06, + "loss": 0.6856, + "step": 26570 + }, + { + "epoch": 0.7268153292954704, + "grad_norm": 2.921875, + "learning_rate": 2.7397110032628665e-06, + "loss": 0.6944, + "step": 26580 + }, + { + "epoch": 0.7270887737383958, + "grad_norm": 2.765625, + "learning_rate": 2.736969098735982e-06, + "loss": 0.6306, + "step": 26590 + }, + { + "epoch": 0.727362218181321, + "grad_norm": 2.671875, + "learning_rate": 2.734227194209098e-06, + "loss": 0.6281, + "step": 26600 + }, + { + "epoch": 0.7276356626242463, + "grad_norm": 2.5, + "learning_rate": 2.731485289682214e-06, + "loss": 0.6322, + "step": 26610 + }, + { + "epoch": 0.7279091070671716, + "grad_norm": 2.859375, + "learning_rate": 2.7287433851553293e-06, + "loss": 0.621, + "step": 26620 + }, + { + "epoch": 0.7281825515100969, + "grad_norm": 2.796875, + "learning_rate": 2.7260014806284448e-06, + "loss": 0.6693, + "step": 26630 + }, + { + "epoch": 0.7284559959530222, + "grad_norm": 2.53125, + "learning_rate": 2.7232595761015602e-06, + "loss": 0.6598, + "step": 26640 + }, + { + "epoch": 0.7287294403959476, + "grad_norm": 2.734375, + "learning_rate": 2.720517671574676e-06, + "loss": 0.6647, + "step": 26650 + }, + { + "epoch": 0.7290028848388729, + "grad_norm": 2.53125, + "learning_rate": 2.7177757670477916e-06, + "loss": 0.6328, + "step": 26660 + }, + { + "epoch": 0.7292763292817982, + "grad_norm": 2.984375, + "learning_rate": 2.715033862520907e-06, + "loss": 0.7776, + "step": 26670 + }, + { + "epoch": 0.7295497737247235, + "grad_norm": 2.921875, + "learning_rate": 2.7122919579940226e-06, + "loss": 0.6284, + "step": 26680 + }, + { + "epoch": 0.7298232181676488, + "grad_norm": 2.875, + "learning_rate": 2.709550053467139e-06, + "loss": 0.6576, + "step": 26690 + }, + { + "epoch": 0.730096662610574, + "grad_norm": 2.703125, + "learning_rate": 2.7068081489402544e-06, + "loss": 0.7837, + "step": 26700 + }, + { + "epoch": 0.7303701070534994, + "grad_norm": 2.328125, + "learning_rate": 2.70406624441337e-06, + "loss": 0.7047, + "step": 26710 + }, + { + "epoch": 0.7306435514964247, + "grad_norm": 2.84375, + "learning_rate": 2.7013243398864854e-06, + "loss": 0.753, + "step": 26720 + }, + { + "epoch": 0.73091699593935, + "grad_norm": 2.671875, + "learning_rate": 2.698582435359601e-06, + "loss": 0.7367, + "step": 26730 + }, + { + "epoch": 0.7311904403822753, + "grad_norm": 2.6875, + "learning_rate": 2.6958405308327168e-06, + "loss": 0.6614, + "step": 26740 + }, + { + "epoch": 0.7314638848252006, + "grad_norm": 2.375, + "learning_rate": 2.6930986263058322e-06, + "loss": 0.7485, + "step": 26750 + }, + { + "epoch": 0.731737329268126, + "grad_norm": 2.734375, + "learning_rate": 2.6903567217789477e-06, + "loss": 0.6772, + "step": 26760 + }, + { + "epoch": 0.7320107737110513, + "grad_norm": 2.75, + "learning_rate": 2.687614817252063e-06, + "loss": 0.6751, + "step": 26770 + }, + { + "epoch": 0.7322842181539766, + "grad_norm": 3.34375, + "learning_rate": 2.6848729127251795e-06, + "loss": 0.711, + "step": 26780 + }, + { + "epoch": 0.7325576625969019, + "grad_norm": 2.40625, + "learning_rate": 2.682131008198295e-06, + "loss": 0.6291, + "step": 26790 + }, + { + "epoch": 0.7328311070398272, + "grad_norm": 2.953125, + "learning_rate": 2.6793891036714105e-06, + "loss": 0.6424, + "step": 26800 + }, + { + "epoch": 0.7331045514827524, + "grad_norm": 2.90625, + "learning_rate": 2.676647199144526e-06, + "loss": 0.6943, + "step": 26810 + }, + { + "epoch": 0.7333779959256778, + "grad_norm": 3.015625, + "learning_rate": 2.6739052946176415e-06, + "loss": 0.7476, + "step": 26820 + }, + { + "epoch": 0.7336514403686031, + "grad_norm": 2.421875, + "learning_rate": 2.6711633900907574e-06, + "loss": 0.6419, + "step": 26830 + }, + { + "epoch": 0.7339248848115284, + "grad_norm": 3.09375, + "learning_rate": 2.668421485563873e-06, + "loss": 0.6848, + "step": 26840 + }, + { + "epoch": 0.7341983292544537, + "grad_norm": 2.625, + "learning_rate": 2.6656795810369883e-06, + "loss": 0.6847, + "step": 26850 + }, + { + "epoch": 0.734471773697379, + "grad_norm": 2.703125, + "learning_rate": 2.662937676510104e-06, + "loss": 0.7222, + "step": 26860 + }, + { + "epoch": 0.7347452181403044, + "grad_norm": 3.25, + "learning_rate": 2.66019577198322e-06, + "loss": 0.6762, + "step": 26870 + }, + { + "epoch": 0.7350186625832297, + "grad_norm": 2.625, + "learning_rate": 2.6574538674563356e-06, + "loss": 0.6395, + "step": 26880 + }, + { + "epoch": 0.735292107026155, + "grad_norm": 2.796875, + "learning_rate": 2.654711962929451e-06, + "loss": 0.6612, + "step": 26890 + }, + { + "epoch": 0.7355655514690803, + "grad_norm": 3.21875, + "learning_rate": 2.6519700584025666e-06, + "loss": 0.6933, + "step": 26900 + }, + { + "epoch": 0.7358389959120056, + "grad_norm": 3.171875, + "learning_rate": 2.649228153875682e-06, + "loss": 0.7057, + "step": 26910 + }, + { + "epoch": 0.7361124403549308, + "grad_norm": 2.875, + "learning_rate": 2.646486249348798e-06, + "loss": 0.6481, + "step": 26920 + }, + { + "epoch": 0.7363858847978562, + "grad_norm": 2.625, + "learning_rate": 2.6437443448219135e-06, + "loss": 0.6462, + "step": 26930 + }, + { + "epoch": 0.7366593292407815, + "grad_norm": 2.71875, + "learning_rate": 2.641002440295029e-06, + "loss": 0.6532, + "step": 26940 + }, + { + "epoch": 0.7369327736837068, + "grad_norm": 2.6875, + "learning_rate": 2.6382605357681444e-06, + "loss": 0.679, + "step": 26950 + }, + { + "epoch": 0.7372062181266321, + "grad_norm": 3.578125, + "learning_rate": 2.6355186312412608e-06, + "loss": 0.7106, + "step": 26960 + }, + { + "epoch": 0.7374796625695574, + "grad_norm": 2.953125, + "learning_rate": 2.6327767267143762e-06, + "loss": 0.6571, + "step": 26970 + }, + { + "epoch": 0.7377531070124828, + "grad_norm": 2.5, + "learning_rate": 2.6300348221874917e-06, + "loss": 0.6749, + "step": 26980 + }, + { + "epoch": 0.7380265514554081, + "grad_norm": 2.625, + "learning_rate": 2.627292917660607e-06, + "loss": 0.7128, + "step": 26990 + }, + { + "epoch": 0.7382999958983334, + "grad_norm": 2.46875, + "learning_rate": 2.6245510131337227e-06, + "loss": 0.6555, + "step": 27000 + }, + { + "epoch": 0.7385734403412587, + "grad_norm": 2.734375, + "learning_rate": 2.6218091086068386e-06, + "loss": 0.6867, + "step": 27010 + }, + { + "epoch": 0.7388468847841839, + "grad_norm": 3.671875, + "learning_rate": 2.619067204079954e-06, + "loss": 0.7057, + "step": 27020 + }, + { + "epoch": 0.7391203292271092, + "grad_norm": 2.453125, + "learning_rate": 2.6163252995530696e-06, + "loss": 0.6199, + "step": 27030 + }, + { + "epoch": 0.7393937736700346, + "grad_norm": 2.609375, + "learning_rate": 2.613583395026185e-06, + "loss": 0.684, + "step": 27040 + }, + { + "epoch": 0.7396672181129599, + "grad_norm": 2.453125, + "learning_rate": 2.6108414904993014e-06, + "loss": 0.7214, + "step": 27050 + }, + { + "epoch": 0.7399406625558852, + "grad_norm": 3.1875, + "learning_rate": 2.608099585972417e-06, + "loss": 0.7273, + "step": 27060 + }, + { + "epoch": 0.7402141069988105, + "grad_norm": 2.828125, + "learning_rate": 2.6053576814455323e-06, + "loss": 0.7282, + "step": 27070 + }, + { + "epoch": 0.7404875514417358, + "grad_norm": 2.78125, + "learning_rate": 2.602615776918648e-06, + "loss": 0.7235, + "step": 27080 + }, + { + "epoch": 0.7407609958846612, + "grad_norm": 2.609375, + "learning_rate": 2.5998738723917637e-06, + "loss": 0.673, + "step": 27090 + }, + { + "epoch": 0.7410344403275865, + "grad_norm": 2.671875, + "learning_rate": 2.597131967864879e-06, + "loss": 0.6798, + "step": 27100 + }, + { + "epoch": 0.7413078847705118, + "grad_norm": 2.984375, + "learning_rate": 2.5943900633379947e-06, + "loss": 0.7259, + "step": 27110 + }, + { + "epoch": 0.7415813292134371, + "grad_norm": 2.671875, + "learning_rate": 2.59164815881111e-06, + "loss": 0.6767, + "step": 27120 + }, + { + "epoch": 0.7418547736563623, + "grad_norm": 3.015625, + "learning_rate": 2.5889062542842256e-06, + "loss": 0.6648, + "step": 27130 + }, + { + "epoch": 0.7421282180992876, + "grad_norm": 2.4375, + "learning_rate": 2.586164349757342e-06, + "loss": 0.6945, + "step": 27140 + }, + { + "epoch": 0.742401662542213, + "grad_norm": 2.5625, + "learning_rate": 2.5834224452304575e-06, + "loss": 0.6622, + "step": 27150 + }, + { + "epoch": 0.7426751069851383, + "grad_norm": 2.46875, + "learning_rate": 2.580680540703573e-06, + "loss": 0.6715, + "step": 27160 + }, + { + "epoch": 0.7429485514280636, + "grad_norm": 2.84375, + "learning_rate": 2.5779386361766884e-06, + "loss": 0.6882, + "step": 27170 + }, + { + "epoch": 0.7432219958709889, + "grad_norm": 3.21875, + "learning_rate": 2.5751967316498043e-06, + "loss": 0.6983, + "step": 27180 + }, + { + "epoch": 0.7434954403139142, + "grad_norm": 2.53125, + "learning_rate": 2.57245482712292e-06, + "loss": 0.6911, + "step": 27190 + }, + { + "epoch": 0.7437688847568396, + "grad_norm": 2.796875, + "learning_rate": 2.5697129225960353e-06, + "loss": 0.7332, + "step": 27200 + }, + { + "epoch": 0.7440423291997649, + "grad_norm": 3.203125, + "learning_rate": 2.5669710180691508e-06, + "loss": 0.6356, + "step": 27210 + }, + { + "epoch": 0.7443157736426902, + "grad_norm": 2.78125, + "learning_rate": 2.5642291135422662e-06, + "loss": 0.6714, + "step": 27220 + }, + { + "epoch": 0.7445892180856154, + "grad_norm": 2.8125, + "learning_rate": 2.5614872090153826e-06, + "loss": 0.6495, + "step": 27230 + }, + { + "epoch": 0.7448626625285407, + "grad_norm": 2.78125, + "learning_rate": 2.558745304488498e-06, + "loss": 0.7544, + "step": 27240 + }, + { + "epoch": 0.745136106971466, + "grad_norm": 3.59375, + "learning_rate": 2.5560033999616135e-06, + "loss": 0.7132, + "step": 27250 + }, + { + "epoch": 0.7454095514143914, + "grad_norm": 2.828125, + "learning_rate": 2.553261495434729e-06, + "loss": 0.7109, + "step": 27260 + }, + { + "epoch": 0.7456829958573167, + "grad_norm": 3.109375, + "learning_rate": 2.550519590907845e-06, + "loss": 0.6141, + "step": 27270 + }, + { + "epoch": 0.745956440300242, + "grad_norm": 2.5, + "learning_rate": 2.5477776863809604e-06, + "loss": 0.7022, + "step": 27280 + }, + { + "epoch": 0.7462298847431673, + "grad_norm": 2.8125, + "learning_rate": 2.545035781854076e-06, + "loss": 0.691, + "step": 27290 + }, + { + "epoch": 0.7465033291860926, + "grad_norm": 2.578125, + "learning_rate": 2.5422938773271914e-06, + "loss": 0.7147, + "step": 27300 + }, + { + "epoch": 0.746776773629018, + "grad_norm": 3.046875, + "learning_rate": 2.5395519728003073e-06, + "loss": 0.6676, + "step": 27310 + }, + { + "epoch": 0.7470502180719433, + "grad_norm": 2.796875, + "learning_rate": 2.536810068273423e-06, + "loss": 0.6751, + "step": 27320 + }, + { + "epoch": 0.7473236625148686, + "grad_norm": 2.84375, + "learning_rate": 2.5340681637465387e-06, + "loss": 0.7644, + "step": 27330 + }, + { + "epoch": 0.7475971069577938, + "grad_norm": 3.09375, + "learning_rate": 2.531326259219654e-06, + "loss": 0.7566, + "step": 27340 + }, + { + "epoch": 0.7478705514007191, + "grad_norm": 2.703125, + "learning_rate": 2.5285843546927696e-06, + "loss": 0.6961, + "step": 27350 + }, + { + "epoch": 0.7481439958436444, + "grad_norm": 3.4375, + "learning_rate": 2.5258424501658855e-06, + "loss": 0.7344, + "step": 27360 + }, + { + "epoch": 0.7484174402865698, + "grad_norm": 3.0625, + "learning_rate": 2.523100545639001e-06, + "loss": 0.6898, + "step": 27370 + }, + { + "epoch": 0.7486908847294951, + "grad_norm": 2.921875, + "learning_rate": 2.5203586411121165e-06, + "loss": 0.6549, + "step": 27380 + }, + { + "epoch": 0.7489643291724204, + "grad_norm": 2.8125, + "learning_rate": 2.5176167365852324e-06, + "loss": 0.6599, + "step": 27390 + }, + { + "epoch": 0.7492377736153457, + "grad_norm": 2.953125, + "learning_rate": 2.514874832058348e-06, + "loss": 0.7372, + "step": 27400 + }, + { + "epoch": 0.749511218058271, + "grad_norm": 2.75, + "learning_rate": 2.512132927531464e-06, + "loss": 0.6892, + "step": 27410 + }, + { + "epoch": 0.7497846625011964, + "grad_norm": 3.125, + "learning_rate": 2.5093910230045793e-06, + "loss": 0.7421, + "step": 27420 + }, + { + "epoch": 0.7500581069441217, + "grad_norm": 2.609375, + "learning_rate": 2.5066491184776948e-06, + "loss": 0.6903, + "step": 27430 + }, + { + "epoch": 0.750331551387047, + "grad_norm": 2.625, + "learning_rate": 2.5039072139508102e-06, + "loss": 0.657, + "step": 27440 + }, + { + "epoch": 0.7506049958299722, + "grad_norm": 2.6875, + "learning_rate": 2.501165309423926e-06, + "loss": 0.6459, + "step": 27450 + }, + { + "epoch": 0.7508784402728975, + "grad_norm": 3.234375, + "learning_rate": 2.4984234048970416e-06, + "loss": 0.7908, + "step": 27460 + }, + { + "epoch": 0.7511518847158228, + "grad_norm": 2.9375, + "learning_rate": 2.4956815003701575e-06, + "loss": 0.7309, + "step": 27470 + }, + { + "epoch": 0.7514253291587482, + "grad_norm": 2.53125, + "learning_rate": 2.492939595843273e-06, + "loss": 0.676, + "step": 27480 + }, + { + "epoch": 0.7516987736016735, + "grad_norm": 2.78125, + "learning_rate": 2.4901976913163885e-06, + "loss": 0.6583, + "step": 27490 + }, + { + "epoch": 0.7519722180445988, + "grad_norm": 2.578125, + "learning_rate": 2.487455786789504e-06, + "loss": 0.6944, + "step": 27500 + }, + { + "epoch": 0.7522456624875241, + "grad_norm": 3.171875, + "learning_rate": 2.48471388226262e-06, + "loss": 0.7356, + "step": 27510 + }, + { + "epoch": 0.7525191069304494, + "grad_norm": 3.265625, + "learning_rate": 2.4819719777357354e-06, + "loss": 0.7559, + "step": 27520 + }, + { + "epoch": 0.7527925513733748, + "grad_norm": 2.421875, + "learning_rate": 2.4792300732088513e-06, + "loss": 0.6029, + "step": 27530 + }, + { + "epoch": 0.7530659958163001, + "grad_norm": 2.84375, + "learning_rate": 2.4764881686819668e-06, + "loss": 0.7059, + "step": 27540 + }, + { + "epoch": 0.7533394402592253, + "grad_norm": 2.9375, + "learning_rate": 2.4737462641550827e-06, + "loss": 0.6691, + "step": 27550 + }, + { + "epoch": 0.7536128847021506, + "grad_norm": 2.671875, + "learning_rate": 2.471004359628198e-06, + "loss": 0.6429, + "step": 27560 + }, + { + "epoch": 0.7538863291450759, + "grad_norm": 3.046875, + "learning_rate": 2.4682624551013136e-06, + "loss": 0.669, + "step": 27570 + }, + { + "epoch": 0.7541597735880012, + "grad_norm": 2.53125, + "learning_rate": 2.465520550574429e-06, + "loss": 0.6678, + "step": 27580 + }, + { + "epoch": 0.7544332180309266, + "grad_norm": 3.734375, + "learning_rate": 2.4627786460475446e-06, + "loss": 0.7395, + "step": 27590 + }, + { + "epoch": 0.7547066624738519, + "grad_norm": 2.46875, + "learning_rate": 2.4600367415206605e-06, + "loss": 0.6557, + "step": 27600 + }, + { + "epoch": 0.7549801069167772, + "grad_norm": 2.734375, + "learning_rate": 2.457294836993776e-06, + "loss": 0.7038, + "step": 27610 + }, + { + "epoch": 0.7552535513597025, + "grad_norm": 2.734375, + "learning_rate": 2.454552932466892e-06, + "loss": 0.7161, + "step": 27620 + }, + { + "epoch": 0.7555269958026278, + "grad_norm": 3.09375, + "learning_rate": 2.4518110279400074e-06, + "loss": 0.7045, + "step": 27630 + }, + { + "epoch": 0.7558004402455532, + "grad_norm": 2.484375, + "learning_rate": 2.4490691234131233e-06, + "loss": 0.7158, + "step": 27640 + }, + { + "epoch": 0.7560738846884785, + "grad_norm": 2.546875, + "learning_rate": 2.4463272188862387e-06, + "loss": 0.6648, + "step": 27650 + }, + { + "epoch": 0.7563473291314037, + "grad_norm": 2.65625, + "learning_rate": 2.4435853143593542e-06, + "loss": 0.7042, + "step": 27660 + }, + { + "epoch": 0.756620773574329, + "grad_norm": 2.328125, + "learning_rate": 2.4408434098324697e-06, + "loss": 0.7044, + "step": 27670 + }, + { + "epoch": 0.7568942180172543, + "grad_norm": 2.609375, + "learning_rate": 2.438101505305585e-06, + "loss": 0.6179, + "step": 27680 + }, + { + "epoch": 0.7571676624601796, + "grad_norm": 2.890625, + "learning_rate": 2.435359600778701e-06, + "loss": 0.6136, + "step": 27690 + }, + { + "epoch": 0.757441106903105, + "grad_norm": 2.65625, + "learning_rate": 2.4326176962518166e-06, + "loss": 0.7061, + "step": 27700 + }, + { + "epoch": 0.7577145513460303, + "grad_norm": 3.375, + "learning_rate": 2.4298757917249325e-06, + "loss": 0.6911, + "step": 27710 + }, + { + "epoch": 0.7579879957889556, + "grad_norm": 2.9375, + "learning_rate": 2.427133887198048e-06, + "loss": 0.6592, + "step": 27720 + }, + { + "epoch": 0.7582614402318809, + "grad_norm": 3.140625, + "learning_rate": 2.424391982671164e-06, + "loss": 0.6529, + "step": 27730 + }, + { + "epoch": 0.7585348846748062, + "grad_norm": 2.90625, + "learning_rate": 2.4216500781442794e-06, + "loss": 0.7146, + "step": 27740 + }, + { + "epoch": 0.7588083291177316, + "grad_norm": 2.328125, + "learning_rate": 2.418908173617395e-06, + "loss": 0.6487, + "step": 27750 + }, + { + "epoch": 0.7590817735606568, + "grad_norm": 3.0625, + "learning_rate": 2.4161662690905103e-06, + "loss": 0.6496, + "step": 27760 + }, + { + "epoch": 0.7593552180035821, + "grad_norm": 2.59375, + "learning_rate": 2.413424364563626e-06, + "loss": 0.6842, + "step": 27770 + }, + { + "epoch": 0.7596286624465074, + "grad_norm": 2.796875, + "learning_rate": 2.4106824600367417e-06, + "loss": 0.6868, + "step": 27780 + }, + { + "epoch": 0.7599021068894327, + "grad_norm": 2.9375, + "learning_rate": 2.407940555509857e-06, + "loss": 0.6998, + "step": 27790 + }, + { + "epoch": 0.760175551332358, + "grad_norm": 2.96875, + "learning_rate": 2.405198650982973e-06, + "loss": 0.7066, + "step": 27800 + }, + { + "epoch": 0.7604489957752834, + "grad_norm": 2.859375, + "learning_rate": 2.4024567464560886e-06, + "loss": 0.6898, + "step": 27810 + }, + { + "epoch": 0.7607224402182087, + "grad_norm": 2.5, + "learning_rate": 2.3997148419292045e-06, + "loss": 0.6877, + "step": 27820 + }, + { + "epoch": 0.760995884661134, + "grad_norm": 2.890625, + "learning_rate": 2.39697293740232e-06, + "loss": 0.6975, + "step": 27830 + }, + { + "epoch": 0.7612693291040593, + "grad_norm": 3.0625, + "learning_rate": 2.3942310328754354e-06, + "loss": 0.665, + "step": 27840 + }, + { + "epoch": 0.7615427735469846, + "grad_norm": 3.125, + "learning_rate": 2.391489128348551e-06, + "loss": 0.7079, + "step": 27850 + }, + { + "epoch": 0.76181621798991, + "grad_norm": 2.828125, + "learning_rate": 2.388747223821667e-06, + "loss": 0.6674, + "step": 27860 + }, + { + "epoch": 0.7620896624328352, + "grad_norm": 3.234375, + "learning_rate": 2.3860053192947823e-06, + "loss": 0.6587, + "step": 27870 + }, + { + "epoch": 0.7623631068757605, + "grad_norm": 2.828125, + "learning_rate": 2.383263414767898e-06, + "loss": 0.7108, + "step": 27880 + }, + { + "epoch": 0.7626365513186858, + "grad_norm": 2.8125, + "learning_rate": 2.3805215102410137e-06, + "loss": 0.6927, + "step": 27890 + }, + { + "epoch": 0.7629099957616111, + "grad_norm": 2.84375, + "learning_rate": 2.377779605714129e-06, + "loss": 0.6825, + "step": 27900 + }, + { + "epoch": 0.7631834402045364, + "grad_norm": 2.421875, + "learning_rate": 2.375037701187245e-06, + "loss": 0.6959, + "step": 27910 + }, + { + "epoch": 0.7634568846474618, + "grad_norm": 2.8125, + "learning_rate": 2.3722957966603606e-06, + "loss": 0.8114, + "step": 27920 + }, + { + "epoch": 0.7637303290903871, + "grad_norm": 2.625, + "learning_rate": 2.369553892133476e-06, + "loss": 0.6609, + "step": 27930 + }, + { + "epoch": 0.7640037735333124, + "grad_norm": 2.84375, + "learning_rate": 2.366811987606592e-06, + "loss": 0.6765, + "step": 27940 + }, + { + "epoch": 0.7642772179762377, + "grad_norm": 2.703125, + "learning_rate": 2.3640700830797074e-06, + "loss": 0.6795, + "step": 27950 + }, + { + "epoch": 0.764550662419163, + "grad_norm": 2.984375, + "learning_rate": 2.361328178552823e-06, + "loss": 0.6434, + "step": 27960 + }, + { + "epoch": 0.7648241068620883, + "grad_norm": 2.75, + "learning_rate": 2.3585862740259384e-06, + "loss": 0.678, + "step": 27970 + }, + { + "epoch": 0.7650975513050136, + "grad_norm": 2.703125, + "learning_rate": 2.3558443694990543e-06, + "loss": 0.6875, + "step": 27980 + }, + { + "epoch": 0.7653709957479389, + "grad_norm": 2.859375, + "learning_rate": 2.35310246497217e-06, + "loss": 0.7308, + "step": 27990 + }, + { + "epoch": 0.7656444401908642, + "grad_norm": 2.78125, + "learning_rate": 2.3503605604452857e-06, + "loss": 0.6993, + "step": 28000 + }, + { + "epoch": 0.7659178846337895, + "grad_norm": 2.734375, + "learning_rate": 2.347618655918401e-06, + "loss": 0.7111, + "step": 28010 + }, + { + "epoch": 0.7661913290767148, + "grad_norm": 2.796875, + "learning_rate": 2.3448767513915167e-06, + "loss": 0.6926, + "step": 28020 + }, + { + "epoch": 0.7664647735196402, + "grad_norm": 2.6875, + "learning_rate": 2.3421348468646326e-06, + "loss": 0.6278, + "step": 28030 + }, + { + "epoch": 0.7667382179625655, + "grad_norm": 2.765625, + "learning_rate": 2.339392942337748e-06, + "loss": 0.636, + "step": 28040 + }, + { + "epoch": 0.7670116624054908, + "grad_norm": 2.78125, + "learning_rate": 2.3366510378108635e-06, + "loss": 0.6023, + "step": 28050 + }, + { + "epoch": 0.7672851068484161, + "grad_norm": 2.890625, + "learning_rate": 2.333909133283979e-06, + "loss": 0.6754, + "step": 28060 + }, + { + "epoch": 0.7675585512913414, + "grad_norm": 2.6875, + "learning_rate": 2.331167228757095e-06, + "loss": 0.6969, + "step": 28070 + }, + { + "epoch": 0.7678319957342666, + "grad_norm": 2.890625, + "learning_rate": 2.3284253242302104e-06, + "loss": 0.7725, + "step": 28080 + }, + { + "epoch": 0.768105440177192, + "grad_norm": 2.5625, + "learning_rate": 2.3256834197033263e-06, + "loss": 0.6004, + "step": 28090 + }, + { + "epoch": 0.7683788846201173, + "grad_norm": 3.015625, + "learning_rate": 2.322941515176442e-06, + "loss": 0.7091, + "step": 28100 + }, + { + "epoch": 0.7686523290630426, + "grad_norm": 2.921875, + "learning_rate": 2.3201996106495577e-06, + "loss": 0.6678, + "step": 28110 + }, + { + "epoch": 0.7689257735059679, + "grad_norm": 3.140625, + "learning_rate": 2.317457706122673e-06, + "loss": 0.6659, + "step": 28120 + }, + { + "epoch": 0.7691992179488932, + "grad_norm": 3.125, + "learning_rate": 2.3147158015957887e-06, + "loss": 0.623, + "step": 28130 + }, + { + "epoch": 0.7694726623918186, + "grad_norm": 2.734375, + "learning_rate": 2.311973897068904e-06, + "loss": 0.595, + "step": 28140 + }, + { + "epoch": 0.7697461068347439, + "grad_norm": 3.03125, + "learning_rate": 2.3092319925420196e-06, + "loss": 0.7364, + "step": 28150 + }, + { + "epoch": 0.7700195512776692, + "grad_norm": 2.734375, + "learning_rate": 2.3064900880151355e-06, + "loss": 0.6746, + "step": 28160 + }, + { + "epoch": 0.7702929957205945, + "grad_norm": 2.53125, + "learning_rate": 2.303748183488251e-06, + "loss": 0.6785, + "step": 28170 + }, + { + "epoch": 0.7705664401635198, + "grad_norm": 2.328125, + "learning_rate": 2.301006278961367e-06, + "loss": 0.6948, + "step": 28180 + }, + { + "epoch": 0.770839884606445, + "grad_norm": 2.96875, + "learning_rate": 2.2982643744344824e-06, + "loss": 0.7128, + "step": 28190 + }, + { + "epoch": 0.7711133290493704, + "grad_norm": 3.0, + "learning_rate": 2.2955224699075983e-06, + "loss": 0.7539, + "step": 28200 + }, + { + "epoch": 0.7713867734922957, + "grad_norm": 2.703125, + "learning_rate": 2.2927805653807138e-06, + "loss": 0.6673, + "step": 28210 + }, + { + "epoch": 0.771660217935221, + "grad_norm": 2.671875, + "learning_rate": 2.2900386608538293e-06, + "loss": 0.7276, + "step": 28220 + }, + { + "epoch": 0.7719336623781463, + "grad_norm": 2.59375, + "learning_rate": 2.2872967563269448e-06, + "loss": 0.609, + "step": 28230 + }, + { + "epoch": 0.7722071068210716, + "grad_norm": 3.078125, + "learning_rate": 2.2845548518000602e-06, + "loss": 0.7507, + "step": 28240 + }, + { + "epoch": 0.772480551263997, + "grad_norm": 2.6875, + "learning_rate": 2.281812947273176e-06, + "loss": 0.6425, + "step": 28250 + }, + { + "epoch": 0.7727539957069223, + "grad_norm": 3.15625, + "learning_rate": 2.2790710427462916e-06, + "loss": 0.7171, + "step": 28260 + }, + { + "epoch": 0.7730274401498476, + "grad_norm": 2.609375, + "learning_rate": 2.2763291382194075e-06, + "loss": 0.652, + "step": 28270 + }, + { + "epoch": 0.7733008845927729, + "grad_norm": 2.640625, + "learning_rate": 2.273587233692523e-06, + "loss": 0.6733, + "step": 28280 + }, + { + "epoch": 0.7735743290356982, + "grad_norm": 2.640625, + "learning_rate": 2.270845329165639e-06, + "loss": 0.6628, + "step": 28290 + }, + { + "epoch": 0.7738477734786234, + "grad_norm": 3.046875, + "learning_rate": 2.2681034246387544e-06, + "loss": 0.7505, + "step": 28300 + }, + { + "epoch": 0.7741212179215488, + "grad_norm": 2.6875, + "learning_rate": 2.26536152011187e-06, + "loss": 0.731, + "step": 28310 + }, + { + "epoch": 0.7743946623644741, + "grad_norm": 2.765625, + "learning_rate": 2.2626196155849854e-06, + "loss": 0.655, + "step": 28320 + }, + { + "epoch": 0.7746681068073994, + "grad_norm": 2.875, + "learning_rate": 2.2598777110581013e-06, + "loss": 0.7311, + "step": 28330 + }, + { + "epoch": 0.7749415512503247, + "grad_norm": 3.265625, + "learning_rate": 2.2571358065312167e-06, + "loss": 0.7112, + "step": 28340 + }, + { + "epoch": 0.77521499569325, + "grad_norm": 2.828125, + "learning_rate": 2.2543939020043322e-06, + "loss": 0.6663, + "step": 28350 + }, + { + "epoch": 0.7754884401361753, + "grad_norm": 3.1875, + "learning_rate": 2.251651997477448e-06, + "loss": 0.6556, + "step": 28360 + }, + { + "epoch": 0.7757618845791007, + "grad_norm": 2.625, + "learning_rate": 2.2489100929505636e-06, + "loss": 0.6644, + "step": 28370 + }, + { + "epoch": 0.776035329022026, + "grad_norm": 2.78125, + "learning_rate": 2.2461681884236795e-06, + "loss": 0.7091, + "step": 28380 + }, + { + "epoch": 0.7763087734649513, + "grad_norm": 2.78125, + "learning_rate": 2.243426283896795e-06, + "loss": 0.6637, + "step": 28390 + }, + { + "epoch": 0.7765822179078765, + "grad_norm": 2.515625, + "learning_rate": 2.2406843793699105e-06, + "loss": 0.7008, + "step": 28400 + }, + { + "epoch": 0.7768556623508018, + "grad_norm": 3.03125, + "learning_rate": 2.2379424748430264e-06, + "loss": 0.725, + "step": 28410 + }, + { + "epoch": 0.7771291067937272, + "grad_norm": 2.53125, + "learning_rate": 2.235200570316142e-06, + "loss": 0.7132, + "step": 28420 + }, + { + "epoch": 0.7774025512366525, + "grad_norm": 3.109375, + "learning_rate": 2.2324586657892574e-06, + "loss": 0.7478, + "step": 28430 + }, + { + "epoch": 0.7776759956795778, + "grad_norm": 3.046875, + "learning_rate": 2.229716761262373e-06, + "loss": 0.6303, + "step": 28440 + }, + { + "epoch": 0.7779494401225031, + "grad_norm": 3.171875, + "learning_rate": 2.2269748567354887e-06, + "loss": 0.6339, + "step": 28450 + }, + { + "epoch": 0.7782228845654284, + "grad_norm": 2.71875, + "learning_rate": 2.2242329522086042e-06, + "loss": 0.6326, + "step": 28460 + }, + { + "epoch": 0.7784963290083537, + "grad_norm": 2.984375, + "learning_rate": 2.22149104768172e-06, + "loss": 0.6357, + "step": 28470 + }, + { + "epoch": 0.7787697734512791, + "grad_norm": 2.953125, + "learning_rate": 2.2187491431548356e-06, + "loss": 0.6949, + "step": 28480 + }, + { + "epoch": 0.7790432178942044, + "grad_norm": 3.140625, + "learning_rate": 2.216007238627951e-06, + "loss": 0.6841, + "step": 28490 + }, + { + "epoch": 0.7793166623371297, + "grad_norm": 3.015625, + "learning_rate": 2.213265334101067e-06, + "loss": 0.6528, + "step": 28500 + }, + { + "epoch": 0.7795901067800549, + "grad_norm": 2.359375, + "learning_rate": 2.2105234295741825e-06, + "loss": 0.7035, + "step": 28510 + }, + { + "epoch": 0.7798635512229802, + "grad_norm": 2.65625, + "learning_rate": 2.207781525047298e-06, + "loss": 0.7023, + "step": 28520 + }, + { + "epoch": 0.7801369956659056, + "grad_norm": 2.828125, + "learning_rate": 2.2050396205204134e-06, + "loss": 0.7383, + "step": 28530 + }, + { + "epoch": 0.7804104401088309, + "grad_norm": 2.703125, + "learning_rate": 2.2022977159935294e-06, + "loss": 0.6561, + "step": 28540 + }, + { + "epoch": 0.7806838845517562, + "grad_norm": 2.796875, + "learning_rate": 2.199555811466645e-06, + "loss": 0.6811, + "step": 28550 + }, + { + "epoch": 0.7809573289946815, + "grad_norm": 3.0625, + "learning_rate": 2.1968139069397607e-06, + "loss": 0.6894, + "step": 28560 + }, + { + "epoch": 0.7812307734376068, + "grad_norm": 2.578125, + "learning_rate": 2.1940720024128762e-06, + "loss": 0.7475, + "step": 28570 + }, + { + "epoch": 0.7815042178805321, + "grad_norm": 2.90625, + "learning_rate": 2.1913300978859917e-06, + "loss": 0.6895, + "step": 28580 + }, + { + "epoch": 0.7817776623234575, + "grad_norm": 2.578125, + "learning_rate": 2.1885881933591076e-06, + "loss": 0.6484, + "step": 28590 + }, + { + "epoch": 0.7820511067663828, + "grad_norm": 2.734375, + "learning_rate": 2.185846288832223e-06, + "loss": 0.7171, + "step": 28600 + }, + { + "epoch": 0.782324551209308, + "grad_norm": 3.140625, + "learning_rate": 2.1831043843053386e-06, + "loss": 0.686, + "step": 28610 + }, + { + "epoch": 0.7825979956522333, + "grad_norm": 2.59375, + "learning_rate": 2.180362479778454e-06, + "loss": 0.681, + "step": 28620 + }, + { + "epoch": 0.7828714400951586, + "grad_norm": 2.203125, + "learning_rate": 2.17762057525157e-06, + "loss": 0.6183, + "step": 28630 + }, + { + "epoch": 0.783144884538084, + "grad_norm": 2.46875, + "learning_rate": 2.1748786707246854e-06, + "loss": 0.6628, + "step": 28640 + }, + { + "epoch": 0.7834183289810093, + "grad_norm": 2.828125, + "learning_rate": 2.1721367661978013e-06, + "loss": 0.7031, + "step": 28650 + }, + { + "epoch": 0.7836917734239346, + "grad_norm": 2.25, + "learning_rate": 2.169394861670917e-06, + "loss": 0.5876, + "step": 28660 + }, + { + "epoch": 0.7839652178668599, + "grad_norm": 3.265625, + "learning_rate": 2.1666529571440323e-06, + "loss": 0.7004, + "step": 28670 + }, + { + "epoch": 0.7842386623097852, + "grad_norm": 2.734375, + "learning_rate": 2.1639110526171482e-06, + "loss": 0.6449, + "step": 28680 + }, + { + "epoch": 0.7845121067527105, + "grad_norm": 2.984375, + "learning_rate": 2.1611691480902637e-06, + "loss": 0.6954, + "step": 28690 + }, + { + "epoch": 0.7847855511956359, + "grad_norm": 2.75, + "learning_rate": 2.158427243563379e-06, + "loss": 0.6598, + "step": 28700 + }, + { + "epoch": 0.7850589956385612, + "grad_norm": 2.859375, + "learning_rate": 2.1556853390364947e-06, + "loss": 0.717, + "step": 28710 + }, + { + "epoch": 0.7853324400814864, + "grad_norm": 2.59375, + "learning_rate": 2.1529434345096106e-06, + "loss": 0.616, + "step": 28720 + }, + { + "epoch": 0.7856058845244117, + "grad_norm": 2.40625, + "learning_rate": 2.150201529982726e-06, + "loss": 0.6828, + "step": 28730 + }, + { + "epoch": 0.785879328967337, + "grad_norm": 2.375, + "learning_rate": 2.147459625455842e-06, + "loss": 0.742, + "step": 28740 + }, + { + "epoch": 0.7861527734102624, + "grad_norm": 3.203125, + "learning_rate": 2.1447177209289574e-06, + "loss": 0.63, + "step": 28750 + }, + { + "epoch": 0.7864262178531877, + "grad_norm": 2.78125, + "learning_rate": 2.1419758164020733e-06, + "loss": 0.6715, + "step": 28760 + }, + { + "epoch": 0.786699662296113, + "grad_norm": 2.65625, + "learning_rate": 2.139233911875189e-06, + "loss": 0.6895, + "step": 28770 + }, + { + "epoch": 0.7869731067390383, + "grad_norm": 2.9375, + "learning_rate": 2.1364920073483043e-06, + "loss": 0.6485, + "step": 28780 + }, + { + "epoch": 0.7872465511819636, + "grad_norm": 3.046875, + "learning_rate": 2.1337501028214198e-06, + "loss": 0.7096, + "step": 28790 + }, + { + "epoch": 0.787519995624889, + "grad_norm": 2.75, + "learning_rate": 2.1310081982945353e-06, + "loss": 0.6793, + "step": 28800 + }, + { + "epoch": 0.7877934400678143, + "grad_norm": 2.90625, + "learning_rate": 2.128266293767651e-06, + "loss": 0.676, + "step": 28810 + }, + { + "epoch": 0.7880668845107396, + "grad_norm": 2.65625, + "learning_rate": 2.1255243892407667e-06, + "loss": 0.6814, + "step": 28820 + }, + { + "epoch": 0.7883403289536648, + "grad_norm": 2.84375, + "learning_rate": 2.1227824847138826e-06, + "loss": 0.7144, + "step": 28830 + }, + { + "epoch": 0.7886137733965901, + "grad_norm": 2.8125, + "learning_rate": 2.120040580186998e-06, + "loss": 0.7356, + "step": 28840 + }, + { + "epoch": 0.7888872178395154, + "grad_norm": 2.96875, + "learning_rate": 2.117298675660114e-06, + "loss": 0.664, + "step": 28850 + }, + { + "epoch": 0.7891606622824408, + "grad_norm": 2.765625, + "learning_rate": 2.1145567711332294e-06, + "loss": 0.706, + "step": 28860 + }, + { + "epoch": 0.7894341067253661, + "grad_norm": 2.9375, + "learning_rate": 2.111814866606345e-06, + "loss": 0.6952, + "step": 28870 + }, + { + "epoch": 0.7897075511682914, + "grad_norm": 2.859375, + "learning_rate": 2.1090729620794604e-06, + "loss": 0.6487, + "step": 28880 + }, + { + "epoch": 0.7899809956112167, + "grad_norm": 2.765625, + "learning_rate": 2.1063310575525763e-06, + "loss": 0.6579, + "step": 28890 + }, + { + "epoch": 0.790254440054142, + "grad_norm": 2.71875, + "learning_rate": 2.1035891530256918e-06, + "loss": 0.7273, + "step": 28900 + }, + { + "epoch": 0.7905278844970673, + "grad_norm": 2.8125, + "learning_rate": 2.1008472484988073e-06, + "loss": 0.5908, + "step": 28910 + }, + { + "epoch": 0.7908013289399927, + "grad_norm": 2.671875, + "learning_rate": 2.098105343971923e-06, + "loss": 0.6259, + "step": 28920 + }, + { + "epoch": 0.7910747733829179, + "grad_norm": 2.96875, + "learning_rate": 2.0953634394450387e-06, + "loss": 0.6401, + "step": 28930 + }, + { + "epoch": 0.7913482178258432, + "grad_norm": 3.140625, + "learning_rate": 2.0926215349181546e-06, + "loss": 0.706, + "step": 28940 + }, + { + "epoch": 0.7916216622687685, + "grad_norm": 3.046875, + "learning_rate": 2.08987963039127e-06, + "loss": 0.6401, + "step": 28950 + }, + { + "epoch": 0.7918951067116938, + "grad_norm": 2.828125, + "learning_rate": 2.0871377258643855e-06, + "loss": 0.6159, + "step": 28960 + }, + { + "epoch": 0.7921685511546191, + "grad_norm": 2.828125, + "learning_rate": 2.0843958213375014e-06, + "loss": 0.6776, + "step": 28970 + }, + { + "epoch": 0.7924419955975445, + "grad_norm": 2.96875, + "learning_rate": 2.081653916810617e-06, + "loss": 0.6629, + "step": 28980 + }, + { + "epoch": 0.7927154400404698, + "grad_norm": 2.8125, + "learning_rate": 2.0789120122837324e-06, + "loss": 0.647, + "step": 28990 + }, + { + "epoch": 0.7929888844833951, + "grad_norm": 2.984375, + "learning_rate": 2.076170107756848e-06, + "loss": 0.6613, + "step": 29000 + }, + { + "epoch": 0.7932623289263204, + "grad_norm": 2.640625, + "learning_rate": 2.0734282032299638e-06, + "loss": 0.6359, + "step": 29010 + }, + { + "epoch": 0.7935357733692457, + "grad_norm": 2.90625, + "learning_rate": 2.0706862987030793e-06, + "loss": 0.7339, + "step": 29020 + }, + { + "epoch": 0.7938092178121711, + "grad_norm": 2.65625, + "learning_rate": 2.067944394176195e-06, + "loss": 0.7012, + "step": 29030 + }, + { + "epoch": 0.7940826622550963, + "grad_norm": 2.359375, + "learning_rate": 2.0652024896493106e-06, + "loss": 0.6424, + "step": 29040 + }, + { + "epoch": 0.7943561066980216, + "grad_norm": 3.09375, + "learning_rate": 2.062460585122426e-06, + "loss": 0.7177, + "step": 29050 + }, + { + "epoch": 0.7946295511409469, + "grad_norm": 2.921875, + "learning_rate": 2.059718680595542e-06, + "loss": 0.6562, + "step": 29060 + }, + { + "epoch": 0.7949029955838722, + "grad_norm": 3.484375, + "learning_rate": 2.0569767760686575e-06, + "loss": 0.7826, + "step": 29070 + }, + { + "epoch": 0.7951764400267975, + "grad_norm": 3.078125, + "learning_rate": 2.054234871541773e-06, + "loss": 0.6957, + "step": 29080 + }, + { + "epoch": 0.7954498844697229, + "grad_norm": 2.609375, + "learning_rate": 2.0514929670148885e-06, + "loss": 0.7241, + "step": 29090 + }, + { + "epoch": 0.7957233289126482, + "grad_norm": 3.0, + "learning_rate": 2.0487510624880044e-06, + "loss": 0.741, + "step": 29100 + }, + { + "epoch": 0.7959967733555735, + "grad_norm": 2.65625, + "learning_rate": 2.04600915796112e-06, + "loss": 0.7424, + "step": 29110 + }, + { + "epoch": 0.7962702177984988, + "grad_norm": 2.5, + "learning_rate": 2.0432672534342358e-06, + "loss": 0.6503, + "step": 29120 + }, + { + "epoch": 0.7965436622414241, + "grad_norm": 2.125, + "learning_rate": 2.0405253489073513e-06, + "loss": 0.6394, + "step": 29130 + }, + { + "epoch": 0.7968171066843494, + "grad_norm": 2.375, + "learning_rate": 2.0377834443804667e-06, + "loss": 0.6781, + "step": 29140 + }, + { + "epoch": 0.7970905511272747, + "grad_norm": 3.109375, + "learning_rate": 2.0350415398535826e-06, + "loss": 0.7336, + "step": 29150 + }, + { + "epoch": 0.7973639955702, + "grad_norm": 2.96875, + "learning_rate": 2.032299635326698e-06, + "loss": 0.7016, + "step": 29160 + }, + { + "epoch": 0.7976374400131253, + "grad_norm": 2.75, + "learning_rate": 2.0295577307998136e-06, + "loss": 0.6697, + "step": 29170 + }, + { + "epoch": 0.7979108844560506, + "grad_norm": 2.640625, + "learning_rate": 2.026815826272929e-06, + "loss": 0.6629, + "step": 29180 + }, + { + "epoch": 0.798184328898976, + "grad_norm": 2.671875, + "learning_rate": 2.024073921746045e-06, + "loss": 0.6533, + "step": 29190 + }, + { + "epoch": 0.7984577733419013, + "grad_norm": 3.09375, + "learning_rate": 2.0213320172191605e-06, + "loss": 0.6641, + "step": 29200 + }, + { + "epoch": 0.7987312177848266, + "grad_norm": 2.765625, + "learning_rate": 2.0185901126922764e-06, + "loss": 0.6704, + "step": 29210 + }, + { + "epoch": 0.7990046622277519, + "grad_norm": 2.640625, + "learning_rate": 2.015848208165392e-06, + "loss": 0.7237, + "step": 29220 + }, + { + "epoch": 0.7992781066706772, + "grad_norm": 2.890625, + "learning_rate": 2.0131063036385073e-06, + "loss": 0.6812, + "step": 29230 + }, + { + "epoch": 0.7995515511136025, + "grad_norm": 2.875, + "learning_rate": 2.0103643991116233e-06, + "loss": 0.7029, + "step": 29240 + }, + { + "epoch": 0.7998249955565278, + "grad_norm": 2.71875, + "learning_rate": 2.0076224945847387e-06, + "loss": 0.6595, + "step": 29250 + }, + { + "epoch": 0.8000984399994531, + "grad_norm": 2.921875, + "learning_rate": 2.0048805900578542e-06, + "loss": 0.6282, + "step": 29260 + }, + { + "epoch": 0.8003718844423784, + "grad_norm": 2.546875, + "learning_rate": 2.0021386855309697e-06, + "loss": 0.7454, + "step": 29270 + }, + { + "epoch": 0.8006453288853037, + "grad_norm": 2.703125, + "learning_rate": 1.9993967810040856e-06, + "loss": 0.6795, + "step": 29280 + }, + { + "epoch": 0.800918773328229, + "grad_norm": 3.015625, + "learning_rate": 1.996654876477201e-06, + "loss": 0.6849, + "step": 29290 + }, + { + "epoch": 0.8011922177711543, + "grad_norm": 3.328125, + "learning_rate": 1.993912971950317e-06, + "loss": 0.7439, + "step": 29300 + }, + { + "epoch": 0.8014656622140797, + "grad_norm": 2.734375, + "learning_rate": 1.9911710674234325e-06, + "loss": 0.6776, + "step": 29310 + }, + { + "epoch": 0.801739106657005, + "grad_norm": 2.796875, + "learning_rate": 1.988429162896548e-06, + "loss": 0.7098, + "step": 29320 + }, + { + "epoch": 0.8020125510999303, + "grad_norm": 2.578125, + "learning_rate": 1.985687258369664e-06, + "loss": 0.6648, + "step": 29330 + }, + { + "epoch": 0.8022859955428556, + "grad_norm": 3.0625, + "learning_rate": 1.9829453538427793e-06, + "loss": 0.7303, + "step": 29340 + }, + { + "epoch": 0.8025594399857809, + "grad_norm": 2.8125, + "learning_rate": 1.980203449315895e-06, + "loss": 0.7445, + "step": 29350 + }, + { + "epoch": 0.8028328844287062, + "grad_norm": 2.65625, + "learning_rate": 1.9774615447890107e-06, + "loss": 0.7238, + "step": 29360 + }, + { + "epoch": 0.8031063288716315, + "grad_norm": 2.90625, + "learning_rate": 1.974719640262126e-06, + "loss": 0.6743, + "step": 29370 + }, + { + "epoch": 0.8033797733145568, + "grad_norm": 2.8125, + "learning_rate": 1.9719777357352417e-06, + "loss": 0.6651, + "step": 29380 + }, + { + "epoch": 0.8036532177574821, + "grad_norm": 2.390625, + "learning_rate": 1.9692358312083576e-06, + "loss": 0.7182, + "step": 29390 + }, + { + "epoch": 0.8039266622004074, + "grad_norm": 3.03125, + "learning_rate": 1.966493926681473e-06, + "loss": 0.7179, + "step": 29400 + }, + { + "epoch": 0.8042001066433327, + "grad_norm": 3.203125, + "learning_rate": 1.9637520221545886e-06, + "loss": 0.6637, + "step": 29410 + }, + { + "epoch": 0.8044735510862581, + "grad_norm": 2.921875, + "learning_rate": 1.9610101176277045e-06, + "loss": 0.6719, + "step": 29420 + }, + { + "epoch": 0.8047469955291834, + "grad_norm": 2.921875, + "learning_rate": 1.95826821310082e-06, + "loss": 0.7492, + "step": 29430 + }, + { + "epoch": 0.8050204399721087, + "grad_norm": 2.65625, + "learning_rate": 1.955526308573936e-06, + "loss": 0.6767, + "step": 29440 + }, + { + "epoch": 0.805293884415034, + "grad_norm": 2.609375, + "learning_rate": 1.9527844040470513e-06, + "loss": 0.6514, + "step": 29450 + }, + { + "epoch": 0.8055673288579592, + "grad_norm": 3.046875, + "learning_rate": 1.950042499520167e-06, + "loss": 0.6393, + "step": 29460 + }, + { + "epoch": 0.8058407733008845, + "grad_norm": 2.765625, + "learning_rate": 1.9473005949932823e-06, + "loss": 0.8214, + "step": 29470 + }, + { + "epoch": 0.8061142177438099, + "grad_norm": 2.59375, + "learning_rate": 1.944558690466398e-06, + "loss": 0.6726, + "step": 29480 + }, + { + "epoch": 0.8063876621867352, + "grad_norm": 2.515625, + "learning_rate": 1.9418167859395137e-06, + "loss": 0.731, + "step": 29490 + }, + { + "epoch": 0.8066611066296605, + "grad_norm": 2.984375, + "learning_rate": 1.9390748814126296e-06, + "loss": 0.7451, + "step": 29500 + }, + { + "epoch": 0.8069345510725858, + "grad_norm": 2.671875, + "learning_rate": 1.936332976885745e-06, + "loss": 0.6598, + "step": 29510 + }, + { + "epoch": 0.8072079955155111, + "grad_norm": 2.96875, + "learning_rate": 1.9335910723588606e-06, + "loss": 0.6708, + "step": 29520 + }, + { + "epoch": 0.8074814399584365, + "grad_norm": 2.9375, + "learning_rate": 1.9308491678319765e-06, + "loss": 0.6284, + "step": 29530 + }, + { + "epoch": 0.8077548844013618, + "grad_norm": 3.171875, + "learning_rate": 1.928107263305092e-06, + "loss": 0.6692, + "step": 29540 + }, + { + "epoch": 0.8080283288442871, + "grad_norm": 2.515625, + "learning_rate": 1.9253653587782074e-06, + "loss": 0.6918, + "step": 29550 + }, + { + "epoch": 0.8083017732872124, + "grad_norm": 2.984375, + "learning_rate": 1.922623454251323e-06, + "loss": 0.6994, + "step": 29560 + }, + { + "epoch": 0.8085752177301376, + "grad_norm": 2.421875, + "learning_rate": 1.919881549724439e-06, + "loss": 0.7276, + "step": 29570 + }, + { + "epoch": 0.808848662173063, + "grad_norm": 2.671875, + "learning_rate": 1.9171396451975543e-06, + "loss": 0.6658, + "step": 29580 + }, + { + "epoch": 0.8091221066159883, + "grad_norm": 2.671875, + "learning_rate": 1.91439774067067e-06, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 0.8093955510589136, + "grad_norm": 2.625, + "learning_rate": 1.9116558361437857e-06, + "loss": 0.6513, + "step": 29600 + }, + { + "epoch": 0.8096689955018389, + "grad_norm": 2.71875, + "learning_rate": 1.908913931616901e-06, + "loss": 0.7674, + "step": 29610 + }, + { + "epoch": 0.8099424399447642, + "grad_norm": 2.984375, + "learning_rate": 1.9061720270900169e-06, + "loss": 0.6799, + "step": 29620 + }, + { + "epoch": 0.8102158843876895, + "grad_norm": 2.9375, + "learning_rate": 1.9034301225631323e-06, + "loss": 0.644, + "step": 29630 + }, + { + "epoch": 0.8104893288306149, + "grad_norm": 3.78125, + "learning_rate": 1.9006882180362482e-06, + "loss": 0.6633, + "step": 29640 + }, + { + "epoch": 0.8107627732735402, + "grad_norm": 2.71875, + "learning_rate": 1.8979463135093637e-06, + "loss": 0.7557, + "step": 29650 + }, + { + "epoch": 0.8110362177164655, + "grad_norm": 2.984375, + "learning_rate": 1.8952044089824794e-06, + "loss": 0.6416, + "step": 29660 + }, + { + "epoch": 0.8113096621593908, + "grad_norm": 3.21875, + "learning_rate": 1.892462504455595e-06, + "loss": 0.7213, + "step": 29670 + }, + { + "epoch": 0.811583106602316, + "grad_norm": 2.84375, + "learning_rate": 1.8897205999287108e-06, + "loss": 0.634, + "step": 29680 + }, + { + "epoch": 0.8118565510452413, + "grad_norm": 2.671875, + "learning_rate": 1.8869786954018263e-06, + "loss": 0.6895, + "step": 29690 + }, + { + "epoch": 0.8121299954881667, + "grad_norm": 2.140625, + "learning_rate": 1.8842367908749418e-06, + "loss": 0.6604, + "step": 29700 + }, + { + "epoch": 0.812403439931092, + "grad_norm": 2.78125, + "learning_rate": 1.8814948863480575e-06, + "loss": 0.684, + "step": 29710 + }, + { + "epoch": 0.8126768843740173, + "grad_norm": 2.703125, + "learning_rate": 1.878752981821173e-06, + "loss": 0.6527, + "step": 29720 + }, + { + "epoch": 0.8129503288169426, + "grad_norm": 3.1875, + "learning_rate": 1.8760110772942889e-06, + "loss": 0.6745, + "step": 29730 + }, + { + "epoch": 0.8132237732598679, + "grad_norm": 3.421875, + "learning_rate": 1.8732691727674043e-06, + "loss": 0.763, + "step": 29740 + }, + { + "epoch": 0.8134972177027933, + "grad_norm": 2.4375, + "learning_rate": 1.87052726824052e-06, + "loss": 0.6093, + "step": 29750 + }, + { + "epoch": 0.8137706621457186, + "grad_norm": 2.28125, + "learning_rate": 1.8677853637136355e-06, + "loss": 0.6659, + "step": 29760 + }, + { + "epoch": 0.8140441065886439, + "grad_norm": 3.28125, + "learning_rate": 1.8650434591867514e-06, + "loss": 0.7346, + "step": 29770 + }, + { + "epoch": 0.8143175510315691, + "grad_norm": 3.296875, + "learning_rate": 1.862301554659867e-06, + "loss": 0.6485, + "step": 29780 + }, + { + "epoch": 0.8145909954744944, + "grad_norm": 2.546875, + "learning_rate": 1.8595596501329824e-06, + "loss": 0.6405, + "step": 29790 + }, + { + "epoch": 0.8148644399174197, + "grad_norm": 2.65625, + "learning_rate": 1.856817745606098e-06, + "loss": 0.6973, + "step": 29800 + }, + { + "epoch": 0.8151378843603451, + "grad_norm": 2.84375, + "learning_rate": 1.8540758410792138e-06, + "loss": 0.6907, + "step": 29810 + }, + { + "epoch": 0.8154113288032704, + "grad_norm": 2.875, + "learning_rate": 1.8513339365523295e-06, + "loss": 0.6947, + "step": 29820 + }, + { + "epoch": 0.8156847732461957, + "grad_norm": 2.578125, + "learning_rate": 1.848592032025445e-06, + "loss": 0.6937, + "step": 29830 + }, + { + "epoch": 0.815958217689121, + "grad_norm": 2.78125, + "learning_rate": 1.8458501274985606e-06, + "loss": 0.6427, + "step": 29840 + }, + { + "epoch": 0.8162316621320463, + "grad_norm": 2.765625, + "learning_rate": 1.8431082229716763e-06, + "loss": 0.676, + "step": 29850 + }, + { + "epoch": 0.8165051065749717, + "grad_norm": 2.375, + "learning_rate": 1.840366318444792e-06, + "loss": 0.7, + "step": 29860 + }, + { + "epoch": 0.816778551017897, + "grad_norm": 2.34375, + "learning_rate": 1.8376244139179075e-06, + "loss": 0.6519, + "step": 29870 + }, + { + "epoch": 0.8170519954608223, + "grad_norm": 2.625, + "learning_rate": 1.834882509391023e-06, + "loss": 0.7279, + "step": 29880 + }, + { + "epoch": 0.8173254399037475, + "grad_norm": 3.03125, + "learning_rate": 1.832140604864139e-06, + "loss": 0.7716, + "step": 29890 + }, + { + "epoch": 0.8175988843466728, + "grad_norm": 2.703125, + "learning_rate": 1.8293987003372544e-06, + "loss": 0.6372, + "step": 29900 + }, + { + "epoch": 0.8178723287895981, + "grad_norm": 2.9375, + "learning_rate": 1.82665679581037e-06, + "loss": 0.7262, + "step": 29910 + }, + { + "epoch": 0.8181457732325235, + "grad_norm": 3.9375, + "learning_rate": 1.8239148912834856e-06, + "loss": 0.6635, + "step": 29920 + }, + { + "epoch": 0.8184192176754488, + "grad_norm": 2.828125, + "learning_rate": 1.8211729867566015e-06, + "loss": 0.6248, + "step": 29930 + }, + { + "epoch": 0.8186926621183741, + "grad_norm": 3.234375, + "learning_rate": 1.818431082229717e-06, + "loss": 0.7272, + "step": 29940 + }, + { + "epoch": 0.8189661065612994, + "grad_norm": 3.1875, + "learning_rate": 1.8156891777028326e-06, + "loss": 0.7776, + "step": 29950 + }, + { + "epoch": 0.8192395510042247, + "grad_norm": 2.796875, + "learning_rate": 1.8129472731759481e-06, + "loss": 0.6713, + "step": 29960 + }, + { + "epoch": 0.8195129954471501, + "grad_norm": 2.953125, + "learning_rate": 1.8102053686490636e-06, + "loss": 0.7753, + "step": 29970 + }, + { + "epoch": 0.8197864398900754, + "grad_norm": 2.609375, + "learning_rate": 1.8074634641221795e-06, + "loss": 0.6909, + "step": 29980 + }, + { + "epoch": 0.8200598843330006, + "grad_norm": 3.1875, + "learning_rate": 1.804721559595295e-06, + "loss": 0.7323, + "step": 29990 + }, + { + "epoch": 0.8203333287759259, + "grad_norm": 2.4375, + "learning_rate": 1.8019796550684107e-06, + "loss": 0.747, + "step": 30000 + }, + { + "epoch": 0.8206067732188512, + "grad_norm": 2.9375, + "learning_rate": 1.7992377505415262e-06, + "loss": 0.6893, + "step": 30010 + }, + { + "epoch": 0.8208802176617765, + "grad_norm": 3.09375, + "learning_rate": 1.796495846014642e-06, + "loss": 0.6364, + "step": 30020 + }, + { + "epoch": 0.8211536621047019, + "grad_norm": 3.25, + "learning_rate": 1.7937539414877576e-06, + "loss": 0.6876, + "step": 30030 + }, + { + "epoch": 0.8214271065476272, + "grad_norm": 2.921875, + "learning_rate": 1.7910120369608732e-06, + "loss": 0.7402, + "step": 30040 + }, + { + "epoch": 0.8217005509905525, + "grad_norm": 2.890625, + "learning_rate": 1.7882701324339887e-06, + "loss": 0.6785, + "step": 30050 + }, + { + "epoch": 0.8219739954334778, + "grad_norm": 2.9375, + "learning_rate": 1.7855282279071042e-06, + "loss": 0.7366, + "step": 30060 + }, + { + "epoch": 0.8222474398764031, + "grad_norm": 2.953125, + "learning_rate": 1.7827863233802201e-06, + "loss": 0.7553, + "step": 30070 + }, + { + "epoch": 0.8225208843193285, + "grad_norm": 2.578125, + "learning_rate": 1.7800444188533356e-06, + "loss": 0.6867, + "step": 30080 + }, + { + "epoch": 0.8227943287622538, + "grad_norm": 3.15625, + "learning_rate": 1.7773025143264513e-06, + "loss": 0.6681, + "step": 30090 + }, + { + "epoch": 0.823067773205179, + "grad_norm": 2.90625, + "learning_rate": 1.7745606097995668e-06, + "loss": 0.6665, + "step": 30100 + }, + { + "epoch": 0.8233412176481043, + "grad_norm": 2.703125, + "learning_rate": 1.7718187052726827e-06, + "loss": 0.6859, + "step": 30110 + }, + { + "epoch": 0.8236146620910296, + "grad_norm": 2.515625, + "learning_rate": 1.7690768007457982e-06, + "loss": 0.6791, + "step": 30120 + }, + { + "epoch": 0.823888106533955, + "grad_norm": 2.890625, + "learning_rate": 1.7663348962189139e-06, + "loss": 0.6777, + "step": 30130 + }, + { + "epoch": 0.8241615509768803, + "grad_norm": 2.4375, + "learning_rate": 1.7635929916920293e-06, + "loss": 0.6707, + "step": 30140 + }, + { + "epoch": 0.8244349954198056, + "grad_norm": 3.125, + "learning_rate": 1.7608510871651452e-06, + "loss": 0.6685, + "step": 30150 + }, + { + "epoch": 0.8247084398627309, + "grad_norm": 2.46875, + "learning_rate": 1.7581091826382607e-06, + "loss": 0.7005, + "step": 30160 + }, + { + "epoch": 0.8249818843056562, + "grad_norm": 2.6875, + "learning_rate": 1.7553672781113762e-06, + "loss": 0.6219, + "step": 30170 + }, + { + "epoch": 0.8252553287485815, + "grad_norm": 3.3125, + "learning_rate": 1.752625373584492e-06, + "loss": 0.6975, + "step": 30180 + }, + { + "epoch": 0.8255287731915069, + "grad_norm": 2.6875, + "learning_rate": 1.7498834690576074e-06, + "loss": 0.7406, + "step": 30190 + }, + { + "epoch": 0.8258022176344322, + "grad_norm": 2.703125, + "learning_rate": 1.7471415645307233e-06, + "loss": 0.7332, + "step": 30200 + }, + { + "epoch": 0.8260756620773574, + "grad_norm": 3.046875, + "learning_rate": 1.7443996600038388e-06, + "loss": 0.6965, + "step": 30210 + }, + { + "epoch": 0.8263491065202827, + "grad_norm": 2.359375, + "learning_rate": 1.7416577554769545e-06, + "loss": 0.6646, + "step": 30220 + }, + { + "epoch": 0.826622550963208, + "grad_norm": 2.75, + "learning_rate": 1.73891585095007e-06, + "loss": 0.6773, + "step": 30230 + }, + { + "epoch": 0.8268959954061333, + "grad_norm": 2.40625, + "learning_rate": 1.7361739464231858e-06, + "loss": 0.6588, + "step": 30240 + }, + { + "epoch": 0.8271694398490587, + "grad_norm": 2.921875, + "learning_rate": 1.7334320418963013e-06, + "loss": 0.7025, + "step": 30250 + }, + { + "epoch": 0.827442884291984, + "grad_norm": 3.0, + "learning_rate": 1.7306901373694168e-06, + "loss": 0.6728, + "step": 30260 + }, + { + "epoch": 0.8277163287349093, + "grad_norm": 2.578125, + "learning_rate": 1.7279482328425325e-06, + "loss": 0.7441, + "step": 30270 + }, + { + "epoch": 0.8279897731778346, + "grad_norm": 2.484375, + "learning_rate": 1.7252063283156482e-06, + "loss": 0.6255, + "step": 30280 + }, + { + "epoch": 0.8282632176207599, + "grad_norm": 2.703125, + "learning_rate": 1.7224644237887639e-06, + "loss": 0.7182, + "step": 30290 + }, + { + "epoch": 0.8285366620636853, + "grad_norm": 2.296875, + "learning_rate": 1.7197225192618794e-06, + "loss": 0.6508, + "step": 30300 + }, + { + "epoch": 0.8288101065066105, + "grad_norm": 2.828125, + "learning_rate": 1.716980614734995e-06, + "loss": 0.7083, + "step": 30310 + }, + { + "epoch": 0.8290835509495358, + "grad_norm": 2.8125, + "learning_rate": 1.7142387102081108e-06, + "loss": 0.6775, + "step": 30320 + }, + { + "epoch": 0.8293569953924611, + "grad_norm": 2.734375, + "learning_rate": 1.7114968056812265e-06, + "loss": 0.6648, + "step": 30330 + }, + { + "epoch": 0.8296304398353864, + "grad_norm": 2.84375, + "learning_rate": 1.708754901154342e-06, + "loss": 0.6443, + "step": 30340 + }, + { + "epoch": 0.8299038842783117, + "grad_norm": 2.953125, + "learning_rate": 1.7060129966274574e-06, + "loss": 0.6466, + "step": 30350 + }, + { + "epoch": 0.8301773287212371, + "grad_norm": 2.390625, + "learning_rate": 1.7032710921005733e-06, + "loss": 0.6562, + "step": 30360 + }, + { + "epoch": 0.8304507731641624, + "grad_norm": 2.484375, + "learning_rate": 1.7005291875736888e-06, + "loss": 0.7088, + "step": 30370 + }, + { + "epoch": 0.8307242176070877, + "grad_norm": 2.59375, + "learning_rate": 1.6977872830468045e-06, + "loss": 0.7132, + "step": 30380 + }, + { + "epoch": 0.830997662050013, + "grad_norm": 2.890625, + "learning_rate": 1.69504537851992e-06, + "loss": 0.7735, + "step": 30390 + }, + { + "epoch": 0.8312711064929383, + "grad_norm": 2.46875, + "learning_rate": 1.6923034739930359e-06, + "loss": 0.6987, + "step": 30400 + }, + { + "epoch": 0.8315445509358637, + "grad_norm": 3.15625, + "learning_rate": 1.6895615694661514e-06, + "loss": 0.6989, + "step": 30410 + }, + { + "epoch": 0.8318179953787889, + "grad_norm": 2.53125, + "learning_rate": 1.686819664939267e-06, + "loss": 0.6231, + "step": 30420 + }, + { + "epoch": 0.8320914398217142, + "grad_norm": 2.953125, + "learning_rate": 1.6840777604123825e-06, + "loss": 0.7272, + "step": 30430 + }, + { + "epoch": 0.8323648842646395, + "grad_norm": 3.390625, + "learning_rate": 1.681335855885498e-06, + "loss": 0.6673, + "step": 30440 + }, + { + "epoch": 0.8326383287075648, + "grad_norm": 2.84375, + "learning_rate": 1.678593951358614e-06, + "loss": 0.6205, + "step": 30450 + }, + { + "epoch": 0.8329117731504901, + "grad_norm": 3.203125, + "learning_rate": 1.6758520468317294e-06, + "loss": 0.6708, + "step": 30460 + }, + { + "epoch": 0.8331852175934155, + "grad_norm": 2.859375, + "learning_rate": 1.6731101423048451e-06, + "loss": 0.602, + "step": 30470 + }, + { + "epoch": 0.8334586620363408, + "grad_norm": 3.234375, + "learning_rate": 1.6703682377779606e-06, + "loss": 0.6627, + "step": 30480 + }, + { + "epoch": 0.8337321064792661, + "grad_norm": 2.546875, + "learning_rate": 1.6676263332510765e-06, + "loss": 0.6648, + "step": 30490 + }, + { + "epoch": 0.8340055509221914, + "grad_norm": 3.15625, + "learning_rate": 1.664884428724192e-06, + "loss": 0.6699, + "step": 30500 + }, + { + "epoch": 0.8342789953651167, + "grad_norm": 3.375, + "learning_rate": 1.6621425241973077e-06, + "loss": 0.7066, + "step": 30510 + }, + { + "epoch": 0.834552439808042, + "grad_norm": 2.953125, + "learning_rate": 1.6594006196704232e-06, + "loss": 0.6297, + "step": 30520 + }, + { + "epoch": 0.8348258842509673, + "grad_norm": 2.75, + "learning_rate": 1.6566587151435386e-06, + "loss": 0.6894, + "step": 30530 + }, + { + "epoch": 0.8350993286938926, + "grad_norm": 3.0625, + "learning_rate": 1.6539168106166545e-06, + "loss": 0.6887, + "step": 30540 + }, + { + "epoch": 0.8353727731368179, + "grad_norm": 2.84375, + "learning_rate": 1.65117490608977e-06, + "loss": 0.6956, + "step": 30550 + }, + { + "epoch": 0.8356462175797432, + "grad_norm": 3.0625, + "learning_rate": 1.6484330015628857e-06, + "loss": 0.6743, + "step": 30560 + }, + { + "epoch": 0.8359196620226685, + "grad_norm": 3.34375, + "learning_rate": 1.6456910970360012e-06, + "loss": 0.7092, + "step": 30570 + }, + { + "epoch": 0.8361931064655939, + "grad_norm": 2.609375, + "learning_rate": 1.642949192509117e-06, + "loss": 0.5998, + "step": 30580 + }, + { + "epoch": 0.8364665509085192, + "grad_norm": 3.203125, + "learning_rate": 1.6402072879822326e-06, + "loss": 0.7203, + "step": 30590 + }, + { + "epoch": 0.8367399953514445, + "grad_norm": 2.6875, + "learning_rate": 1.6374653834553483e-06, + "loss": 0.7093, + "step": 30600 + }, + { + "epoch": 0.8370134397943698, + "grad_norm": 2.53125, + "learning_rate": 1.6347234789284638e-06, + "loss": 0.7403, + "step": 30610 + }, + { + "epoch": 0.8372868842372951, + "grad_norm": 2.828125, + "learning_rate": 1.6319815744015792e-06, + "loss": 0.6797, + "step": 30620 + }, + { + "epoch": 0.8375603286802203, + "grad_norm": 2.703125, + "learning_rate": 1.6292396698746952e-06, + "loss": 0.6737, + "step": 30630 + }, + { + "epoch": 0.8378337731231457, + "grad_norm": 2.640625, + "learning_rate": 1.6264977653478106e-06, + "loss": 0.7157, + "step": 30640 + }, + { + "epoch": 0.838107217566071, + "grad_norm": 2.984375, + "learning_rate": 1.6237558608209263e-06, + "loss": 0.7188, + "step": 30650 + }, + { + "epoch": 0.8383806620089963, + "grad_norm": 2.828125, + "learning_rate": 1.6210139562940418e-06, + "loss": 0.7357, + "step": 30660 + }, + { + "epoch": 0.8386541064519216, + "grad_norm": 2.875, + "learning_rate": 1.6182720517671577e-06, + "loss": 0.7572, + "step": 30670 + }, + { + "epoch": 0.8389275508948469, + "grad_norm": 2.796875, + "learning_rate": 1.6155301472402732e-06, + "loss": 0.7094, + "step": 30680 + }, + { + "epoch": 0.8392009953377723, + "grad_norm": 3.0, + "learning_rate": 1.6127882427133889e-06, + "loss": 0.7043, + "step": 30690 + }, + { + "epoch": 0.8394744397806976, + "grad_norm": 2.609375, + "learning_rate": 1.6100463381865044e-06, + "loss": 0.6885, + "step": 30700 + }, + { + "epoch": 0.8397478842236229, + "grad_norm": 3.046875, + "learning_rate": 1.60730443365962e-06, + "loss": 0.7758, + "step": 30710 + }, + { + "epoch": 0.8400213286665482, + "grad_norm": 2.71875, + "learning_rate": 1.6045625291327358e-06, + "loss": 0.7035, + "step": 30720 + }, + { + "epoch": 0.8402947731094735, + "grad_norm": 2.796875, + "learning_rate": 1.6018206246058512e-06, + "loss": 0.7599, + "step": 30730 + }, + { + "epoch": 0.8405682175523987, + "grad_norm": 3.1875, + "learning_rate": 1.599078720078967e-06, + "loss": 0.7077, + "step": 30740 + }, + { + "epoch": 0.8408416619953241, + "grad_norm": 3.0, + "learning_rate": 1.5963368155520826e-06, + "loss": 0.6576, + "step": 30750 + }, + { + "epoch": 0.8411151064382494, + "grad_norm": 2.34375, + "learning_rate": 1.5935949110251983e-06, + "loss": 0.7599, + "step": 30760 + }, + { + "epoch": 0.8413885508811747, + "grad_norm": 2.75, + "learning_rate": 1.5908530064983138e-06, + "loss": 0.6395, + "step": 30770 + }, + { + "epoch": 0.8416619953241, + "grad_norm": 2.890625, + "learning_rate": 1.5881111019714295e-06, + "loss": 0.7263, + "step": 30780 + }, + { + "epoch": 0.8419354397670253, + "grad_norm": 3.203125, + "learning_rate": 1.5853691974445452e-06, + "loss": 0.6954, + "step": 30790 + }, + { + "epoch": 0.8422088842099507, + "grad_norm": 2.90625, + "learning_rate": 1.5826272929176609e-06, + "loss": 0.7124, + "step": 30800 + }, + { + "epoch": 0.842482328652876, + "grad_norm": 3.09375, + "learning_rate": 1.5798853883907764e-06, + "loss": 0.6956, + "step": 30810 + }, + { + "epoch": 0.8427557730958013, + "grad_norm": 3.265625, + "learning_rate": 1.5771434838638918e-06, + "loss": 0.7108, + "step": 30820 + }, + { + "epoch": 0.8430292175387266, + "grad_norm": 2.796875, + "learning_rate": 1.5744015793370078e-06, + "loss": 0.6347, + "step": 30830 + }, + { + "epoch": 0.8433026619816518, + "grad_norm": 2.90625, + "learning_rate": 1.5716596748101232e-06, + "loss": 0.6504, + "step": 30840 + }, + { + "epoch": 0.8435761064245771, + "grad_norm": 3.140625, + "learning_rate": 1.568917770283239e-06, + "loss": 0.7289, + "step": 30850 + }, + { + "epoch": 0.8438495508675025, + "grad_norm": 2.84375, + "learning_rate": 1.5661758657563544e-06, + "loss": 0.6362, + "step": 30860 + }, + { + "epoch": 0.8441229953104278, + "grad_norm": 2.734375, + "learning_rate": 1.5634339612294703e-06, + "loss": 0.707, + "step": 30870 + }, + { + "epoch": 0.8443964397533531, + "grad_norm": 2.734375, + "learning_rate": 1.5606920567025858e-06, + "loss": 0.7408, + "step": 30880 + }, + { + "epoch": 0.8446698841962784, + "grad_norm": 2.421875, + "learning_rate": 1.5579501521757015e-06, + "loss": 0.6686, + "step": 30890 + }, + { + "epoch": 0.8449433286392037, + "grad_norm": 2.859375, + "learning_rate": 1.555208247648817e-06, + "loss": 0.7821, + "step": 30900 + }, + { + "epoch": 0.8452167730821291, + "grad_norm": 2.453125, + "learning_rate": 1.5524663431219325e-06, + "loss": 0.6423, + "step": 30910 + }, + { + "epoch": 0.8454902175250544, + "grad_norm": 3.046875, + "learning_rate": 1.5497244385950484e-06, + "loss": 0.7212, + "step": 30920 + }, + { + "epoch": 0.8457636619679797, + "grad_norm": 3.15625, + "learning_rate": 1.5469825340681638e-06, + "loss": 0.7124, + "step": 30930 + }, + { + "epoch": 0.846037106410905, + "grad_norm": 2.640625, + "learning_rate": 1.5442406295412795e-06, + "loss": 0.7046, + "step": 30940 + }, + { + "epoch": 0.8463105508538302, + "grad_norm": 2.578125, + "learning_rate": 1.541498725014395e-06, + "loss": 0.6397, + "step": 30950 + }, + { + "epoch": 0.8465839952967555, + "grad_norm": 3.03125, + "learning_rate": 1.538756820487511e-06, + "loss": 0.6764, + "step": 30960 + }, + { + "epoch": 0.8468574397396809, + "grad_norm": 2.71875, + "learning_rate": 1.5360149159606264e-06, + "loss": 0.6684, + "step": 30970 + }, + { + "epoch": 0.8471308841826062, + "grad_norm": 2.875, + "learning_rate": 1.533273011433742e-06, + "loss": 0.7348, + "step": 30980 + }, + { + "epoch": 0.8474043286255315, + "grad_norm": 3.15625, + "learning_rate": 1.5305311069068576e-06, + "loss": 0.7689, + "step": 30990 + }, + { + "epoch": 0.8476777730684568, + "grad_norm": 2.625, + "learning_rate": 1.527789202379973e-06, + "loss": 0.7193, + "step": 31000 + }, + { + "epoch": 0.8479512175113821, + "grad_norm": 2.921875, + "learning_rate": 1.525047297853089e-06, + "loss": 0.751, + "step": 31010 + }, + { + "epoch": 0.8482246619543075, + "grad_norm": 2.46875, + "learning_rate": 1.5223053933262045e-06, + "loss": 0.6922, + "step": 31020 + }, + { + "epoch": 0.8484981063972328, + "grad_norm": 3.0625, + "learning_rate": 1.5195634887993201e-06, + "loss": 0.6646, + "step": 31030 + }, + { + "epoch": 0.8487715508401581, + "grad_norm": 2.578125, + "learning_rate": 1.5168215842724356e-06, + "loss": 0.7276, + "step": 31040 + }, + { + "epoch": 0.8490449952830834, + "grad_norm": 2.734375, + "learning_rate": 1.5140796797455515e-06, + "loss": 0.7017, + "step": 31050 + }, + { + "epoch": 0.8493184397260086, + "grad_norm": 2.40625, + "learning_rate": 1.511337775218667e-06, + "loss": 0.6766, + "step": 31060 + }, + { + "epoch": 0.8495918841689339, + "grad_norm": 2.328125, + "learning_rate": 1.5085958706917827e-06, + "loss": 0.6522, + "step": 31070 + }, + { + "epoch": 0.8498653286118593, + "grad_norm": 2.6875, + "learning_rate": 1.5058539661648982e-06, + "loss": 0.5558, + "step": 31080 + }, + { + "epoch": 0.8501387730547846, + "grad_norm": 2.546875, + "learning_rate": 1.5031120616380137e-06, + "loss": 0.7203, + "step": 31090 + }, + { + "epoch": 0.8504122174977099, + "grad_norm": 2.5, + "learning_rate": 1.5003701571111296e-06, + "loss": 0.699, + "step": 31100 + }, + { + "epoch": 0.8506856619406352, + "grad_norm": 3.234375, + "learning_rate": 1.497628252584245e-06, + "loss": 0.6766, + "step": 31110 + }, + { + "epoch": 0.8509591063835605, + "grad_norm": 2.65625, + "learning_rate": 1.4948863480573608e-06, + "loss": 0.6129, + "step": 31120 + }, + { + "epoch": 0.8512325508264859, + "grad_norm": 2.8125, + "learning_rate": 1.4921444435304762e-06, + "loss": 0.6463, + "step": 31130 + }, + { + "epoch": 0.8515059952694112, + "grad_norm": 2.640625, + "learning_rate": 1.4894025390035921e-06, + "loss": 0.7253, + "step": 31140 + }, + { + "epoch": 0.8517794397123365, + "grad_norm": 2.9375, + "learning_rate": 1.4866606344767076e-06, + "loss": 0.6665, + "step": 31150 + }, + { + "epoch": 0.8520528841552617, + "grad_norm": 2.34375, + "learning_rate": 1.4839187299498233e-06, + "loss": 0.6528, + "step": 31160 + }, + { + "epoch": 0.852326328598187, + "grad_norm": 2.359375, + "learning_rate": 1.4811768254229388e-06, + "loss": 0.6397, + "step": 31170 + }, + { + "epoch": 0.8525997730411123, + "grad_norm": 2.671875, + "learning_rate": 1.4784349208960545e-06, + "loss": 0.684, + "step": 31180 + }, + { + "epoch": 0.8528732174840377, + "grad_norm": 2.484375, + "learning_rate": 1.4756930163691702e-06, + "loss": 0.6894, + "step": 31190 + }, + { + "epoch": 0.853146661926963, + "grad_norm": 2.90625, + "learning_rate": 1.4729511118422857e-06, + "loss": 0.7683, + "step": 31200 + }, + { + "epoch": 0.8534201063698883, + "grad_norm": 2.703125, + "learning_rate": 1.4702092073154014e-06, + "loss": 0.6482, + "step": 31210 + }, + { + "epoch": 0.8536935508128136, + "grad_norm": 2.34375, + "learning_rate": 1.467467302788517e-06, + "loss": 0.6837, + "step": 31220 + }, + { + "epoch": 0.8539669952557389, + "grad_norm": 2.359375, + "learning_rate": 1.4647253982616328e-06, + "loss": 0.6255, + "step": 31230 + }, + { + "epoch": 0.8542404396986643, + "grad_norm": 2.984375, + "learning_rate": 1.4619834937347482e-06, + "loss": 0.6877, + "step": 31240 + }, + { + "epoch": 0.8545138841415896, + "grad_norm": 3.3125, + "learning_rate": 1.459241589207864e-06, + "loss": 0.736, + "step": 31250 + }, + { + "epoch": 0.8547873285845149, + "grad_norm": 3.078125, + "learning_rate": 1.4564996846809796e-06, + "loss": 0.7007, + "step": 31260 + }, + { + "epoch": 0.8550607730274401, + "grad_norm": 3.046875, + "learning_rate": 1.453757780154095e-06, + "loss": 0.7401, + "step": 31270 + }, + { + "epoch": 0.8553342174703654, + "grad_norm": 2.65625, + "learning_rate": 1.4510158756272108e-06, + "loss": 0.7197, + "step": 31280 + }, + { + "epoch": 0.8556076619132907, + "grad_norm": 2.75, + "learning_rate": 1.4482739711003263e-06, + "loss": 0.6267, + "step": 31290 + }, + { + "epoch": 0.8558811063562161, + "grad_norm": 2.640625, + "learning_rate": 1.4455320665734422e-06, + "loss": 0.6282, + "step": 31300 + }, + { + "epoch": 0.8561545507991414, + "grad_norm": 2.765625, + "learning_rate": 1.4427901620465577e-06, + "loss": 0.6542, + "step": 31310 + }, + { + "epoch": 0.8564279952420667, + "grad_norm": 2.609375, + "learning_rate": 1.4400482575196734e-06, + "loss": 0.6859, + "step": 31320 + }, + { + "epoch": 0.856701439684992, + "grad_norm": 2.71875, + "learning_rate": 1.4373063529927888e-06, + "loss": 0.666, + "step": 31330 + }, + { + "epoch": 0.8569748841279173, + "grad_norm": 3.09375, + "learning_rate": 1.4345644484659047e-06, + "loss": 0.6532, + "step": 31340 + }, + { + "epoch": 0.8572483285708427, + "grad_norm": 2.671875, + "learning_rate": 1.4318225439390202e-06, + "loss": 0.6707, + "step": 31350 + }, + { + "epoch": 0.857521773013768, + "grad_norm": 2.609375, + "learning_rate": 1.4290806394121357e-06, + "loss": 0.6709, + "step": 31360 + }, + { + "epoch": 0.8577952174566932, + "grad_norm": 3.328125, + "learning_rate": 1.4263387348852514e-06, + "loss": 0.6642, + "step": 31370 + }, + { + "epoch": 0.8580686618996185, + "grad_norm": 2.59375, + "learning_rate": 1.4235968303583669e-06, + "loss": 0.6578, + "step": 31380 + }, + { + "epoch": 0.8583421063425438, + "grad_norm": 2.71875, + "learning_rate": 1.4208549258314828e-06, + "loss": 0.6275, + "step": 31390 + }, + { + "epoch": 0.8586155507854691, + "grad_norm": 3.15625, + "learning_rate": 1.4181130213045983e-06, + "loss": 0.6046, + "step": 31400 + }, + { + "epoch": 0.8588889952283945, + "grad_norm": 2.828125, + "learning_rate": 1.415371116777714e-06, + "loss": 0.7618, + "step": 31410 + }, + { + "epoch": 0.8591624396713198, + "grad_norm": 2.875, + "learning_rate": 1.4126292122508294e-06, + "loss": 0.6928, + "step": 31420 + }, + { + "epoch": 0.8594358841142451, + "grad_norm": 2.546875, + "learning_rate": 1.4098873077239454e-06, + "loss": 0.7199, + "step": 31430 + }, + { + "epoch": 0.8597093285571704, + "grad_norm": 2.703125, + "learning_rate": 1.4071454031970608e-06, + "loss": 0.7412, + "step": 31440 + }, + { + "epoch": 0.8599827730000957, + "grad_norm": 2.828125, + "learning_rate": 1.4044034986701765e-06, + "loss": 0.7258, + "step": 31450 + }, + { + "epoch": 0.860256217443021, + "grad_norm": 3.4375, + "learning_rate": 1.401661594143292e-06, + "loss": 0.7338, + "step": 31460 + }, + { + "epoch": 0.8605296618859464, + "grad_norm": 2.90625, + "learning_rate": 1.3989196896164075e-06, + "loss": 0.6748, + "step": 31470 + }, + { + "epoch": 0.8608031063288716, + "grad_norm": 3.140625, + "learning_rate": 1.3961777850895234e-06, + "loss": 0.6383, + "step": 31480 + }, + { + "epoch": 0.8610765507717969, + "grad_norm": 2.921875, + "learning_rate": 1.3934358805626389e-06, + "loss": 0.6804, + "step": 31490 + }, + { + "epoch": 0.8613499952147222, + "grad_norm": 2.484375, + "learning_rate": 1.3906939760357546e-06, + "loss": 0.7167, + "step": 31500 + }, + { + "epoch": 0.8616234396576475, + "grad_norm": 2.890625, + "learning_rate": 1.38795207150887e-06, + "loss": 0.7357, + "step": 31510 + }, + { + "epoch": 0.8618968841005729, + "grad_norm": 2.84375, + "learning_rate": 1.385210166981986e-06, + "loss": 0.6955, + "step": 31520 + }, + { + "epoch": 0.8621703285434982, + "grad_norm": 2.59375, + "learning_rate": 1.3824682624551014e-06, + "loss": 0.6083, + "step": 31530 + }, + { + "epoch": 0.8624437729864235, + "grad_norm": 2.796875, + "learning_rate": 1.3797263579282171e-06, + "loss": 0.6843, + "step": 31540 + }, + { + "epoch": 0.8627172174293488, + "grad_norm": 2.703125, + "learning_rate": 1.3769844534013326e-06, + "loss": 0.6709, + "step": 31550 + }, + { + "epoch": 0.8629906618722741, + "grad_norm": 2.828125, + "learning_rate": 1.374242548874448e-06, + "loss": 0.6906, + "step": 31560 + }, + { + "epoch": 0.8632641063151995, + "grad_norm": 2.734375, + "learning_rate": 1.371500644347564e-06, + "loss": 0.7971, + "step": 31570 + }, + { + "epoch": 0.8635375507581248, + "grad_norm": 2.796875, + "learning_rate": 1.3687587398206795e-06, + "loss": 0.6573, + "step": 31580 + }, + { + "epoch": 0.86381099520105, + "grad_norm": 2.609375, + "learning_rate": 1.3660168352937952e-06, + "loss": 0.6587, + "step": 31590 + }, + { + "epoch": 0.8640844396439753, + "grad_norm": 2.875, + "learning_rate": 1.3632749307669107e-06, + "loss": 0.7308, + "step": 31600 + }, + { + "epoch": 0.8643578840869006, + "grad_norm": 2.578125, + "learning_rate": 1.3605330262400266e-06, + "loss": 0.6517, + "step": 31610 + }, + { + "epoch": 0.8646313285298259, + "grad_norm": 2.828125, + "learning_rate": 1.357791121713142e-06, + "loss": 0.7239, + "step": 31620 + }, + { + "epoch": 0.8649047729727513, + "grad_norm": 2.96875, + "learning_rate": 1.3550492171862577e-06, + "loss": 0.6933, + "step": 31630 + }, + { + "epoch": 0.8651782174156766, + "grad_norm": 2.5, + "learning_rate": 1.3523073126593732e-06, + "loss": 0.6891, + "step": 31640 + }, + { + "epoch": 0.8654516618586019, + "grad_norm": 3.015625, + "learning_rate": 1.349565408132489e-06, + "loss": 0.7191, + "step": 31650 + }, + { + "epoch": 0.8657251063015272, + "grad_norm": 2.65625, + "learning_rate": 1.3468235036056046e-06, + "loss": 0.7518, + "step": 31660 + }, + { + "epoch": 0.8659985507444525, + "grad_norm": 2.90625, + "learning_rate": 1.34408159907872e-06, + "loss": 0.6219, + "step": 31670 + }, + { + "epoch": 0.8662719951873779, + "grad_norm": 3.125, + "learning_rate": 1.3413396945518358e-06, + "loss": 0.7036, + "step": 31680 + }, + { + "epoch": 0.8665454396303031, + "grad_norm": 2.59375, + "learning_rate": 1.3385977900249515e-06, + "loss": 0.5871, + "step": 31690 + }, + { + "epoch": 0.8668188840732284, + "grad_norm": 2.765625, + "learning_rate": 1.3358558854980672e-06, + "loss": 0.6961, + "step": 31700 + }, + { + "epoch": 0.8670923285161537, + "grad_norm": 2.890625, + "learning_rate": 1.3331139809711827e-06, + "loss": 0.6831, + "step": 31710 + }, + { + "epoch": 0.867365772959079, + "grad_norm": 2.546875, + "learning_rate": 1.3303720764442984e-06, + "loss": 0.6597, + "step": 31720 + }, + { + "epoch": 0.8676392174020043, + "grad_norm": 3.296875, + "learning_rate": 1.327630171917414e-06, + "loss": 0.727, + "step": 31730 + }, + { + "epoch": 0.8679126618449297, + "grad_norm": 3.390625, + "learning_rate": 1.3248882673905295e-06, + "loss": 0.6901, + "step": 31740 + }, + { + "epoch": 0.868186106287855, + "grad_norm": 2.8125, + "learning_rate": 1.3221463628636452e-06, + "loss": 0.6138, + "step": 31750 + }, + { + "epoch": 0.8684595507307803, + "grad_norm": 3.125, + "learning_rate": 1.3194044583367607e-06, + "loss": 0.7354, + "step": 31760 + }, + { + "epoch": 0.8687329951737056, + "grad_norm": 3.28125, + "learning_rate": 1.3166625538098766e-06, + "loss": 0.7129, + "step": 31770 + }, + { + "epoch": 0.8690064396166309, + "grad_norm": 2.734375, + "learning_rate": 1.313920649282992e-06, + "loss": 0.6773, + "step": 31780 + }, + { + "epoch": 0.8692798840595563, + "grad_norm": 2.46875, + "learning_rate": 1.3111787447561078e-06, + "loss": 0.6331, + "step": 31790 + }, + { + "epoch": 0.8695533285024815, + "grad_norm": 2.828125, + "learning_rate": 1.3084368402292233e-06, + "loss": 0.6363, + "step": 31800 + }, + { + "epoch": 0.8698267729454068, + "grad_norm": 2.5, + "learning_rate": 1.3056949357023392e-06, + "loss": 0.7217, + "step": 31810 + }, + { + "epoch": 0.8701002173883321, + "grad_norm": 2.8125, + "learning_rate": 1.3029530311754547e-06, + "loss": 0.6198, + "step": 31820 + }, + { + "epoch": 0.8703736618312574, + "grad_norm": 2.78125, + "learning_rate": 1.3002111266485701e-06, + "loss": 0.7566, + "step": 31830 + }, + { + "epoch": 0.8706471062741827, + "grad_norm": 2.765625, + "learning_rate": 1.2974692221216858e-06, + "loss": 0.7346, + "step": 31840 + }, + { + "epoch": 0.870920550717108, + "grad_norm": 2.671875, + "learning_rate": 1.2947273175948013e-06, + "loss": 0.7018, + "step": 31850 + }, + { + "epoch": 0.8711939951600334, + "grad_norm": 2.59375, + "learning_rate": 1.2919854130679172e-06, + "loss": 0.657, + "step": 31860 + }, + { + "epoch": 0.8714674396029587, + "grad_norm": 2.40625, + "learning_rate": 1.2892435085410327e-06, + "loss": 0.6274, + "step": 31870 + }, + { + "epoch": 0.871740884045884, + "grad_norm": 2.859375, + "learning_rate": 1.2865016040141484e-06, + "loss": 0.7189, + "step": 31880 + }, + { + "epoch": 0.8720143284888093, + "grad_norm": 2.890625, + "learning_rate": 1.2837596994872639e-06, + "loss": 0.6678, + "step": 31890 + }, + { + "epoch": 0.8722877729317345, + "grad_norm": 2.890625, + "learning_rate": 1.2810177949603798e-06, + "loss": 0.7377, + "step": 31900 + }, + { + "epoch": 0.8725612173746599, + "grad_norm": 2.71875, + "learning_rate": 1.2782758904334953e-06, + "loss": 0.6595, + "step": 31910 + }, + { + "epoch": 0.8728346618175852, + "grad_norm": 2.953125, + "learning_rate": 1.2755339859066107e-06, + "loss": 0.6391, + "step": 31920 + }, + { + "epoch": 0.8731081062605105, + "grad_norm": 3.375, + "learning_rate": 1.2727920813797264e-06, + "loss": 0.7499, + "step": 31930 + }, + { + "epoch": 0.8733815507034358, + "grad_norm": 3.0, + "learning_rate": 1.270050176852842e-06, + "loss": 0.7519, + "step": 31940 + }, + { + "epoch": 0.8736549951463611, + "grad_norm": 2.6875, + "learning_rate": 1.2673082723259578e-06, + "loss": 0.6991, + "step": 31950 + }, + { + "epoch": 0.8739284395892865, + "grad_norm": 2.953125, + "learning_rate": 1.2645663677990733e-06, + "loss": 0.7351, + "step": 31960 + }, + { + "epoch": 0.8742018840322118, + "grad_norm": 3.4375, + "learning_rate": 1.261824463272189e-06, + "loss": 0.7028, + "step": 31970 + }, + { + "epoch": 0.8744753284751371, + "grad_norm": 2.71875, + "learning_rate": 1.2590825587453045e-06, + "loss": 0.6779, + "step": 31980 + }, + { + "epoch": 0.8747487729180624, + "grad_norm": 2.6875, + "learning_rate": 1.2563406542184204e-06, + "loss": 0.6255, + "step": 31990 + }, + { + "epoch": 0.8750222173609877, + "grad_norm": 2.671875, + "learning_rate": 1.2535987496915359e-06, + "loss": 0.6857, + "step": 32000 + }, + { + "epoch": 0.8752956618039129, + "grad_norm": 2.75, + "learning_rate": 1.2508568451646514e-06, + "loss": 0.631, + "step": 32010 + }, + { + "epoch": 0.8755691062468383, + "grad_norm": 3.046875, + "learning_rate": 1.248114940637767e-06, + "loss": 0.6383, + "step": 32020 + }, + { + "epoch": 0.8758425506897636, + "grad_norm": 2.953125, + "learning_rate": 1.2453730361108827e-06, + "loss": 0.7158, + "step": 32030 + }, + { + "epoch": 0.8761159951326889, + "grad_norm": 2.71875, + "learning_rate": 1.2426311315839984e-06, + "loss": 0.6703, + "step": 32040 + }, + { + "epoch": 0.8763894395756142, + "grad_norm": 2.609375, + "learning_rate": 1.2398892270571141e-06, + "loss": 0.6235, + "step": 32050 + }, + { + "epoch": 0.8766628840185395, + "grad_norm": 2.375, + "learning_rate": 1.2371473225302296e-06, + "loss": 0.6921, + "step": 32060 + }, + { + "epoch": 0.8769363284614649, + "grad_norm": 3.46875, + "learning_rate": 1.234405418003345e-06, + "loss": 0.8106, + "step": 32070 + }, + { + "epoch": 0.8772097729043902, + "grad_norm": 3.703125, + "learning_rate": 1.2316635134764608e-06, + "loss": 0.7295, + "step": 32080 + }, + { + "epoch": 0.8774832173473155, + "grad_norm": 2.625, + "learning_rate": 1.2289216089495765e-06, + "loss": 0.6944, + "step": 32090 + }, + { + "epoch": 0.8777566617902408, + "grad_norm": 2.5625, + "learning_rate": 1.2261797044226922e-06, + "loss": 0.6818, + "step": 32100 + }, + { + "epoch": 0.8780301062331661, + "grad_norm": 2.265625, + "learning_rate": 1.2234377998958077e-06, + "loss": 0.5916, + "step": 32110 + }, + { + "epoch": 0.8783035506760913, + "grad_norm": 2.640625, + "learning_rate": 1.2206958953689234e-06, + "loss": 0.6403, + "step": 32120 + }, + { + "epoch": 0.8785769951190167, + "grad_norm": 3.0, + "learning_rate": 1.217953990842039e-06, + "loss": 0.6867, + "step": 32130 + }, + { + "epoch": 0.878850439561942, + "grad_norm": 3.25, + "learning_rate": 1.2152120863151547e-06, + "loss": 0.8073, + "step": 32140 + }, + { + "epoch": 0.8791238840048673, + "grad_norm": 2.578125, + "learning_rate": 1.2124701817882702e-06, + "loss": 0.6828, + "step": 32150 + }, + { + "epoch": 0.8793973284477926, + "grad_norm": 2.703125, + "learning_rate": 1.209728277261386e-06, + "loss": 0.6987, + "step": 32160 + }, + { + "epoch": 0.8796707728907179, + "grad_norm": 2.9375, + "learning_rate": 1.2069863727345014e-06, + "loss": 0.6322, + "step": 32170 + }, + { + "epoch": 0.8799442173336433, + "grad_norm": 2.546875, + "learning_rate": 1.204244468207617e-06, + "loss": 0.7233, + "step": 32180 + }, + { + "epoch": 0.8802176617765686, + "grad_norm": 2.828125, + "learning_rate": 1.2015025636807328e-06, + "loss": 0.7719, + "step": 32190 + }, + { + "epoch": 0.8804911062194939, + "grad_norm": 2.890625, + "learning_rate": 1.1987606591538485e-06, + "loss": 0.6685, + "step": 32200 + }, + { + "epoch": 0.8807645506624192, + "grad_norm": 3.046875, + "learning_rate": 1.196018754626964e-06, + "loss": 0.7149, + "step": 32210 + }, + { + "epoch": 0.8810379951053444, + "grad_norm": 2.703125, + "learning_rate": 1.1932768501000797e-06, + "loss": 0.6962, + "step": 32220 + }, + { + "epoch": 0.8813114395482697, + "grad_norm": 2.90625, + "learning_rate": 1.1905349455731953e-06, + "loss": 0.6712, + "step": 32230 + }, + { + "epoch": 0.8815848839911951, + "grad_norm": 2.734375, + "learning_rate": 1.1877930410463108e-06, + "loss": 0.7176, + "step": 32240 + }, + { + "epoch": 0.8818583284341204, + "grad_norm": 2.9375, + "learning_rate": 1.1850511365194265e-06, + "loss": 0.6493, + "step": 32250 + }, + { + "epoch": 0.8821317728770457, + "grad_norm": 2.984375, + "learning_rate": 1.182309231992542e-06, + "loss": 0.7206, + "step": 32260 + }, + { + "epoch": 0.882405217319971, + "grad_norm": 2.890625, + "learning_rate": 1.1795673274656577e-06, + "loss": 0.6704, + "step": 32270 + }, + { + "epoch": 0.8826786617628963, + "grad_norm": 2.390625, + "learning_rate": 1.1768254229387734e-06, + "loss": 0.6022, + "step": 32280 + }, + { + "epoch": 0.8829521062058217, + "grad_norm": 2.78125, + "learning_rate": 1.174083518411889e-06, + "loss": 0.7023, + "step": 32290 + }, + { + "epoch": 0.883225550648747, + "grad_norm": 2.328125, + "learning_rate": 1.1713416138850046e-06, + "loss": 0.684, + "step": 32300 + }, + { + "epoch": 0.8834989950916723, + "grad_norm": 2.46875, + "learning_rate": 1.1685997093581203e-06, + "loss": 0.718, + "step": 32310 + }, + { + "epoch": 0.8837724395345976, + "grad_norm": 2.84375, + "learning_rate": 1.165857804831236e-06, + "loss": 0.6877, + "step": 32320 + }, + { + "epoch": 0.8840458839775228, + "grad_norm": 2.796875, + "learning_rate": 1.1631159003043516e-06, + "loss": 0.6762, + "step": 32330 + }, + { + "epoch": 0.8843193284204481, + "grad_norm": 2.4375, + "learning_rate": 1.1603739957774671e-06, + "loss": 0.6931, + "step": 32340 + }, + { + "epoch": 0.8845927728633735, + "grad_norm": 3.34375, + "learning_rate": 1.1576320912505826e-06, + "loss": 0.6549, + "step": 32350 + }, + { + "epoch": 0.8848662173062988, + "grad_norm": 2.609375, + "learning_rate": 1.1548901867236983e-06, + "loss": 0.7437, + "step": 32360 + }, + { + "epoch": 0.8851396617492241, + "grad_norm": 3.109375, + "learning_rate": 1.152148282196814e-06, + "loss": 0.6882, + "step": 32370 + }, + { + "epoch": 0.8854131061921494, + "grad_norm": 2.8125, + "learning_rate": 1.1494063776699297e-06, + "loss": 0.7683, + "step": 32380 + }, + { + "epoch": 0.8856865506350747, + "grad_norm": 3.109375, + "learning_rate": 1.1466644731430452e-06, + "loss": 0.6753, + "step": 32390 + }, + { + "epoch": 0.885959995078, + "grad_norm": 2.625, + "learning_rate": 1.1439225686161609e-06, + "loss": 0.5924, + "step": 32400 + }, + { + "epoch": 0.8862334395209254, + "grad_norm": 2.65625, + "learning_rate": 1.1411806640892766e-06, + "loss": 0.7129, + "step": 32410 + }, + { + "epoch": 0.8865068839638507, + "grad_norm": 2.5, + "learning_rate": 1.1384387595623923e-06, + "loss": 0.7051, + "step": 32420 + }, + { + "epoch": 0.886780328406776, + "grad_norm": 2.859375, + "learning_rate": 1.1356968550355077e-06, + "loss": 0.6628, + "step": 32430 + }, + { + "epoch": 0.8870537728497012, + "grad_norm": 2.484375, + "learning_rate": 1.1329549505086234e-06, + "loss": 0.6724, + "step": 32440 + }, + { + "epoch": 0.8873272172926265, + "grad_norm": 2.84375, + "learning_rate": 1.130213045981739e-06, + "loss": 0.7243, + "step": 32450 + }, + { + "epoch": 0.8876006617355519, + "grad_norm": 2.703125, + "learning_rate": 1.1274711414548546e-06, + "loss": 0.7248, + "step": 32460 + }, + { + "epoch": 0.8878741061784772, + "grad_norm": 2.421875, + "learning_rate": 1.1247292369279703e-06, + "loss": 0.6673, + "step": 32470 + }, + { + "epoch": 0.8881475506214025, + "grad_norm": 3.03125, + "learning_rate": 1.121987332401086e-06, + "loss": 0.7378, + "step": 32480 + }, + { + "epoch": 0.8884209950643278, + "grad_norm": 2.71875, + "learning_rate": 1.1192454278742015e-06, + "loss": 0.667, + "step": 32490 + }, + { + "epoch": 0.8886944395072531, + "grad_norm": 3.21875, + "learning_rate": 1.1165035233473172e-06, + "loss": 0.6808, + "step": 32500 + }, + { + "epoch": 0.8889678839501785, + "grad_norm": 2.53125, + "learning_rate": 1.1137616188204329e-06, + "loss": 0.643, + "step": 32510 + }, + { + "epoch": 0.8892413283931038, + "grad_norm": 3.1875, + "learning_rate": 1.1110197142935483e-06, + "loss": 0.6712, + "step": 32520 + }, + { + "epoch": 0.8895147728360291, + "grad_norm": 2.875, + "learning_rate": 1.108277809766664e-06, + "loss": 0.7407, + "step": 32530 + }, + { + "epoch": 0.8897882172789543, + "grad_norm": 3.125, + "learning_rate": 1.1055359052397795e-06, + "loss": 0.6496, + "step": 32540 + }, + { + "epoch": 0.8900616617218796, + "grad_norm": 3.0, + "learning_rate": 1.1027940007128952e-06, + "loss": 0.7009, + "step": 32550 + }, + { + "epoch": 0.8903351061648049, + "grad_norm": 2.734375, + "learning_rate": 1.100052096186011e-06, + "loss": 0.7813, + "step": 32560 + }, + { + "epoch": 0.8906085506077303, + "grad_norm": 2.890625, + "learning_rate": 1.0973101916591266e-06, + "loss": 0.6824, + "step": 32570 + }, + { + "epoch": 0.8908819950506556, + "grad_norm": 3.53125, + "learning_rate": 1.094568287132242e-06, + "loss": 0.6851, + "step": 32580 + }, + { + "epoch": 0.8911554394935809, + "grad_norm": 2.5, + "learning_rate": 1.0918263826053578e-06, + "loss": 0.7132, + "step": 32590 + }, + { + "epoch": 0.8914288839365062, + "grad_norm": 3.015625, + "learning_rate": 1.0890844780784735e-06, + "loss": 0.6753, + "step": 32600 + }, + { + "epoch": 0.8917023283794315, + "grad_norm": 2.390625, + "learning_rate": 1.0863425735515892e-06, + "loss": 0.7204, + "step": 32610 + }, + { + "epoch": 0.8919757728223568, + "grad_norm": 3.015625, + "learning_rate": 1.0836006690247046e-06, + "loss": 0.677, + "step": 32620 + }, + { + "epoch": 0.8922492172652822, + "grad_norm": 2.859375, + "learning_rate": 1.0808587644978201e-06, + "loss": 0.6179, + "step": 32630 + }, + { + "epoch": 0.8925226617082075, + "grad_norm": 2.75, + "learning_rate": 1.0781168599709358e-06, + "loss": 0.7296, + "step": 32640 + }, + { + "epoch": 0.8927961061511327, + "grad_norm": 3.40625, + "learning_rate": 1.0753749554440515e-06, + "loss": 0.7376, + "step": 32650 + }, + { + "epoch": 0.893069550594058, + "grad_norm": 2.375, + "learning_rate": 1.0726330509171672e-06, + "loss": 0.6736, + "step": 32660 + }, + { + "epoch": 0.8933429950369833, + "grad_norm": 2.9375, + "learning_rate": 1.0698911463902827e-06, + "loss": 0.7426, + "step": 32670 + }, + { + "epoch": 0.8936164394799087, + "grad_norm": 2.609375, + "learning_rate": 1.0671492418633984e-06, + "loss": 0.6668, + "step": 32680 + }, + { + "epoch": 0.893889883922834, + "grad_norm": 2.78125, + "learning_rate": 1.064407337336514e-06, + "loss": 0.7112, + "step": 32690 + }, + { + "epoch": 0.8941633283657593, + "grad_norm": 2.734375, + "learning_rate": 1.0616654328096298e-06, + "loss": 0.7007, + "step": 32700 + }, + { + "epoch": 0.8944367728086846, + "grad_norm": 2.65625, + "learning_rate": 1.0589235282827453e-06, + "loss": 0.7086, + "step": 32710 + }, + { + "epoch": 0.8947102172516099, + "grad_norm": 2.75, + "learning_rate": 1.056181623755861e-06, + "loss": 0.5982, + "step": 32720 + }, + { + "epoch": 0.8949836616945352, + "grad_norm": 3.5, + "learning_rate": 1.0534397192289764e-06, + "loss": 0.6493, + "step": 32730 + }, + { + "epoch": 0.8952571061374606, + "grad_norm": 2.8125, + "learning_rate": 1.0506978147020921e-06, + "loss": 0.7261, + "step": 32740 + }, + { + "epoch": 0.8955305505803858, + "grad_norm": 2.515625, + "learning_rate": 1.0479559101752078e-06, + "loss": 0.6131, + "step": 32750 + }, + { + "epoch": 0.8958039950233111, + "grad_norm": 2.546875, + "learning_rate": 1.0452140056483235e-06, + "loss": 0.6638, + "step": 32760 + }, + { + "epoch": 0.8960774394662364, + "grad_norm": 2.515625, + "learning_rate": 1.042472101121439e-06, + "loss": 0.5816, + "step": 32770 + }, + { + "epoch": 0.8963508839091617, + "grad_norm": 2.9375, + "learning_rate": 1.0397301965945547e-06, + "loss": 0.705, + "step": 32780 + }, + { + "epoch": 0.896624328352087, + "grad_norm": 2.453125, + "learning_rate": 1.0369882920676704e-06, + "loss": 0.6765, + "step": 32790 + }, + { + "epoch": 0.8968977727950124, + "grad_norm": 3.046875, + "learning_rate": 1.0342463875407859e-06, + "loss": 0.7313, + "step": 32800 + }, + { + "epoch": 0.8971712172379377, + "grad_norm": 2.828125, + "learning_rate": 1.0315044830139016e-06, + "loss": 0.7061, + "step": 32810 + }, + { + "epoch": 0.897444661680863, + "grad_norm": 3.171875, + "learning_rate": 1.028762578487017e-06, + "loss": 0.6994, + "step": 32820 + }, + { + "epoch": 0.8977181061237883, + "grad_norm": 2.75, + "learning_rate": 1.0260206739601327e-06, + "loss": 0.7142, + "step": 32830 + }, + { + "epoch": 0.8979915505667136, + "grad_norm": 3.0, + "learning_rate": 1.0232787694332484e-06, + "loss": 0.7611, + "step": 32840 + }, + { + "epoch": 0.898264995009639, + "grad_norm": 2.65625, + "learning_rate": 1.0205368649063641e-06, + "loss": 0.6377, + "step": 32850 + }, + { + "epoch": 0.8985384394525642, + "grad_norm": 3.3125, + "learning_rate": 1.0177949603794796e-06, + "loss": 0.666, + "step": 32860 + }, + { + "epoch": 0.8988118838954895, + "grad_norm": 2.96875, + "learning_rate": 1.0150530558525953e-06, + "loss": 0.6963, + "step": 32870 + }, + { + "epoch": 0.8990853283384148, + "grad_norm": 3.0625, + "learning_rate": 1.012311151325711e-06, + "loss": 0.7274, + "step": 32880 + }, + { + "epoch": 0.8993587727813401, + "grad_norm": 2.84375, + "learning_rate": 1.0095692467988265e-06, + "loss": 0.712, + "step": 32890 + }, + { + "epoch": 0.8996322172242655, + "grad_norm": 2.671875, + "learning_rate": 1.0068273422719422e-06, + "loss": 0.6751, + "step": 32900 + }, + { + "epoch": 0.8999056616671908, + "grad_norm": 2.890625, + "learning_rate": 1.0040854377450579e-06, + "loss": 0.6699, + "step": 32910 + }, + { + "epoch": 0.9001791061101161, + "grad_norm": 3.203125, + "learning_rate": 1.0013435332181733e-06, + "loss": 0.6858, + "step": 32920 + }, + { + "epoch": 0.9004525505530414, + "grad_norm": 2.40625, + "learning_rate": 9.98601628691289e-07, + "loss": 0.7005, + "step": 32930 + }, + { + "epoch": 0.9007259949959667, + "grad_norm": 3.046875, + "learning_rate": 9.958597241644047e-07, + "loss": 0.6379, + "step": 32940 + }, + { + "epoch": 0.900999439438892, + "grad_norm": 3.03125, + "learning_rate": 9.931178196375204e-07, + "loss": 0.6823, + "step": 32950 + }, + { + "epoch": 0.9012728838818174, + "grad_norm": 2.734375, + "learning_rate": 9.90375915110636e-07, + "loss": 0.7349, + "step": 32960 + }, + { + "epoch": 0.9015463283247426, + "grad_norm": 2.796875, + "learning_rate": 9.876340105837516e-07, + "loss": 0.6317, + "step": 32970 + }, + { + "epoch": 0.9018197727676679, + "grad_norm": 2.875, + "learning_rate": 9.848921060568673e-07, + "loss": 0.7727, + "step": 32980 + }, + { + "epoch": 0.9020932172105932, + "grad_norm": 2.5625, + "learning_rate": 9.821502015299828e-07, + "loss": 0.6228, + "step": 32990 + }, + { + "epoch": 0.9023666616535185, + "grad_norm": 3.078125, + "learning_rate": 9.794082970030985e-07, + "loss": 0.656, + "step": 33000 + }, + { + "epoch": 0.9026401060964439, + "grad_norm": 2.5625, + "learning_rate": 9.76666392476214e-07, + "loss": 0.6878, + "step": 33010 + }, + { + "epoch": 0.9029135505393692, + "grad_norm": 2.734375, + "learning_rate": 9.739244879493296e-07, + "loss": 0.705, + "step": 33020 + }, + { + "epoch": 0.9031869949822945, + "grad_norm": 2.578125, + "learning_rate": 9.711825834224453e-07, + "loss": 0.7055, + "step": 33030 + }, + { + "epoch": 0.9034604394252198, + "grad_norm": 2.421875, + "learning_rate": 9.68440678895561e-07, + "loss": 0.6558, + "step": 33040 + }, + { + "epoch": 0.9037338838681451, + "grad_norm": 2.8125, + "learning_rate": 9.656987743686765e-07, + "loss": 0.7187, + "step": 33050 + }, + { + "epoch": 0.9040073283110704, + "grad_norm": 2.9375, + "learning_rate": 9.629568698417922e-07, + "loss": 0.6986, + "step": 33060 + }, + { + "epoch": 0.9042807727539957, + "grad_norm": 2.875, + "learning_rate": 9.60214965314908e-07, + "loss": 0.6936, + "step": 33070 + }, + { + "epoch": 0.904554217196921, + "grad_norm": 3.0, + "learning_rate": 9.574730607880234e-07, + "loss": 0.6345, + "step": 33080 + }, + { + "epoch": 0.9048276616398463, + "grad_norm": 2.953125, + "learning_rate": 9.54731156261139e-07, + "loss": 0.7363, + "step": 33090 + }, + { + "epoch": 0.9051011060827716, + "grad_norm": 2.625, + "learning_rate": 9.519892517342547e-07, + "loss": 0.6912, + "step": 33100 + }, + { + "epoch": 0.9053745505256969, + "grad_norm": 2.484375, + "learning_rate": 9.492473472073703e-07, + "loss": 0.7091, + "step": 33110 + }, + { + "epoch": 0.9056479949686222, + "grad_norm": 3.0, + "learning_rate": 9.46505442680486e-07, + "loss": 0.7431, + "step": 33120 + }, + { + "epoch": 0.9059214394115476, + "grad_norm": 2.59375, + "learning_rate": 9.437635381536015e-07, + "loss": 0.686, + "step": 33130 + }, + { + "epoch": 0.9061948838544729, + "grad_norm": 7.8125, + "learning_rate": 9.410216336267172e-07, + "loss": 0.6041, + "step": 33140 + }, + { + "epoch": 0.9064683282973982, + "grad_norm": 2.671875, + "learning_rate": 9.382797290998328e-07, + "loss": 0.6435, + "step": 33150 + }, + { + "epoch": 0.9067417727403235, + "grad_norm": 2.796875, + "learning_rate": 9.355378245729485e-07, + "loss": 0.637, + "step": 33160 + }, + { + "epoch": 0.9070152171832488, + "grad_norm": 2.40625, + "learning_rate": 9.32795920046064e-07, + "loss": 0.6794, + "step": 33170 + }, + { + "epoch": 0.907288661626174, + "grad_norm": 2.703125, + "learning_rate": 9.300540155191797e-07, + "loss": 0.6549, + "step": 33180 + }, + { + "epoch": 0.9075621060690994, + "grad_norm": 3.328125, + "learning_rate": 9.273121109922953e-07, + "loss": 0.7215, + "step": 33190 + }, + { + "epoch": 0.9078355505120247, + "grad_norm": 3.28125, + "learning_rate": 9.24570206465411e-07, + "loss": 0.7975, + "step": 33200 + }, + { + "epoch": 0.90810899495495, + "grad_norm": 2.421875, + "learning_rate": 9.218283019385266e-07, + "loss": 0.7061, + "step": 33210 + }, + { + "epoch": 0.9083824393978753, + "grad_norm": 3.125, + "learning_rate": 9.190863974116422e-07, + "loss": 0.7391, + "step": 33220 + }, + { + "epoch": 0.9086558838408006, + "grad_norm": 2.703125, + "learning_rate": 9.163444928847578e-07, + "loss": 0.6997, + "step": 33230 + }, + { + "epoch": 0.908929328283726, + "grad_norm": 2.984375, + "learning_rate": 9.136025883578735e-07, + "loss": 0.6592, + "step": 33240 + }, + { + "epoch": 0.9092027727266513, + "grad_norm": 2.890625, + "learning_rate": 9.108606838309891e-07, + "loss": 0.755, + "step": 33250 + }, + { + "epoch": 0.9094762171695766, + "grad_norm": 2.671875, + "learning_rate": 9.081187793041046e-07, + "loss": 0.6718, + "step": 33260 + }, + { + "epoch": 0.9097496616125019, + "grad_norm": 2.890625, + "learning_rate": 9.053768747772203e-07, + "loss": 0.7111, + "step": 33270 + }, + { + "epoch": 0.9100231060554271, + "grad_norm": 3.109375, + "learning_rate": 9.026349702503359e-07, + "loss": 0.7134, + "step": 33280 + }, + { + "epoch": 0.9102965504983525, + "grad_norm": 2.546875, + "learning_rate": 8.998930657234516e-07, + "loss": 0.6471, + "step": 33290 + }, + { + "epoch": 0.9105699949412778, + "grad_norm": 2.59375, + "learning_rate": 8.971511611965672e-07, + "loss": 0.663, + "step": 33300 + }, + { + "epoch": 0.9108434393842031, + "grad_norm": 3.1875, + "learning_rate": 8.944092566696829e-07, + "loss": 0.7223, + "step": 33310 + }, + { + "epoch": 0.9111168838271284, + "grad_norm": 3.171875, + "learning_rate": 8.916673521427984e-07, + "loss": 0.7169, + "step": 33320 + }, + { + "epoch": 0.9113903282700537, + "grad_norm": 3.15625, + "learning_rate": 8.889254476159141e-07, + "loss": 0.8168, + "step": 33330 + }, + { + "epoch": 0.911663772712979, + "grad_norm": 2.53125, + "learning_rate": 8.861835430890297e-07, + "loss": 0.7459, + "step": 33340 + }, + { + "epoch": 0.9119372171559044, + "grad_norm": 2.890625, + "learning_rate": 8.834416385621454e-07, + "loss": 0.6906, + "step": 33350 + }, + { + "epoch": 0.9122106615988297, + "grad_norm": 2.78125, + "learning_rate": 8.806997340352609e-07, + "loss": 0.6756, + "step": 33360 + }, + { + "epoch": 0.912484106041755, + "grad_norm": 2.75, + "learning_rate": 8.779578295083766e-07, + "loss": 0.7747, + "step": 33370 + }, + { + "epoch": 0.9127575504846803, + "grad_norm": 2.828125, + "learning_rate": 8.752159249814922e-07, + "loss": 0.6701, + "step": 33380 + }, + { + "epoch": 0.9130309949276055, + "grad_norm": 2.984375, + "learning_rate": 8.724740204546079e-07, + "loss": 0.6761, + "step": 33390 + }, + { + "epoch": 0.9133044393705309, + "grad_norm": 2.265625, + "learning_rate": 8.697321159277235e-07, + "loss": 0.6425, + "step": 33400 + }, + { + "epoch": 0.9135778838134562, + "grad_norm": 2.546875, + "learning_rate": 8.669902114008392e-07, + "loss": 0.6261, + "step": 33410 + }, + { + "epoch": 0.9138513282563815, + "grad_norm": 2.578125, + "learning_rate": 8.642483068739547e-07, + "loss": 0.6579, + "step": 33420 + }, + { + "epoch": 0.9141247726993068, + "grad_norm": 2.546875, + "learning_rate": 8.615064023470704e-07, + "loss": 0.7225, + "step": 33430 + }, + { + "epoch": 0.9143982171422321, + "grad_norm": 2.59375, + "learning_rate": 8.58764497820186e-07, + "loss": 0.636, + "step": 33440 + }, + { + "epoch": 0.9146716615851574, + "grad_norm": 3.234375, + "learning_rate": 8.560225932933015e-07, + "loss": 0.7805, + "step": 33450 + }, + { + "epoch": 0.9149451060280828, + "grad_norm": 2.90625, + "learning_rate": 8.532806887664172e-07, + "loss": 0.7072, + "step": 33460 + }, + { + "epoch": 0.9152185504710081, + "grad_norm": 2.28125, + "learning_rate": 8.505387842395328e-07, + "loss": 0.6916, + "step": 33470 + }, + { + "epoch": 0.9154919949139334, + "grad_norm": 2.46875, + "learning_rate": 8.477968797126485e-07, + "loss": 0.6482, + "step": 33480 + }, + { + "epoch": 0.9157654393568587, + "grad_norm": 2.59375, + "learning_rate": 8.450549751857641e-07, + "loss": 0.6627, + "step": 33490 + }, + { + "epoch": 0.9160388837997839, + "grad_norm": 2.9375, + "learning_rate": 8.423130706588798e-07, + "loss": 0.6866, + "step": 33500 + }, + { + "epoch": 0.9163123282427093, + "grad_norm": 2.703125, + "learning_rate": 8.395711661319954e-07, + "loss": 0.6808, + "step": 33510 + }, + { + "epoch": 0.9165857726856346, + "grad_norm": 2.796875, + "learning_rate": 8.36829261605111e-07, + "loss": 0.6833, + "step": 33520 + }, + { + "epoch": 0.9168592171285599, + "grad_norm": 3.28125, + "learning_rate": 8.340873570782266e-07, + "loss": 0.7627, + "step": 33530 + }, + { + "epoch": 0.9171326615714852, + "grad_norm": 3.1875, + "learning_rate": 8.313454525513421e-07, + "loss": 0.7339, + "step": 33540 + }, + { + "epoch": 0.9174061060144105, + "grad_norm": 3.671875, + "learning_rate": 8.286035480244578e-07, + "loss": 0.6587, + "step": 33550 + }, + { + "epoch": 0.9176795504573358, + "grad_norm": 3.109375, + "learning_rate": 8.258616434975734e-07, + "loss": 0.5575, + "step": 33560 + }, + { + "epoch": 0.9179529949002612, + "grad_norm": 2.921875, + "learning_rate": 8.231197389706891e-07, + "loss": 0.7591, + "step": 33570 + }, + { + "epoch": 0.9182264393431865, + "grad_norm": 2.671875, + "learning_rate": 8.203778344438047e-07, + "loss": 0.6615, + "step": 33580 + }, + { + "epoch": 0.9184998837861118, + "grad_norm": 2.765625, + "learning_rate": 8.176359299169204e-07, + "loss": 0.697, + "step": 33590 + }, + { + "epoch": 0.918773328229037, + "grad_norm": 3.0625, + "learning_rate": 8.14894025390036e-07, + "loss": 0.6257, + "step": 33600 + }, + { + "epoch": 0.9190467726719623, + "grad_norm": 2.703125, + "learning_rate": 8.121521208631517e-07, + "loss": 0.6974, + "step": 33610 + }, + { + "epoch": 0.9193202171148877, + "grad_norm": 2.15625, + "learning_rate": 8.094102163362672e-07, + "loss": 0.6564, + "step": 33620 + }, + { + "epoch": 0.919593661557813, + "grad_norm": 3.109375, + "learning_rate": 8.066683118093829e-07, + "loss": 0.7374, + "step": 33630 + }, + { + "epoch": 0.9198671060007383, + "grad_norm": 2.375, + "learning_rate": 8.039264072824984e-07, + "loss": 0.6139, + "step": 33640 + }, + { + "epoch": 0.9201405504436636, + "grad_norm": 3.203125, + "learning_rate": 8.011845027556141e-07, + "loss": 0.7125, + "step": 33650 + }, + { + "epoch": 0.9204139948865889, + "grad_norm": 2.921875, + "learning_rate": 7.984425982287297e-07, + "loss": 0.6622, + "step": 33660 + }, + { + "epoch": 0.9206874393295142, + "grad_norm": 3.03125, + "learning_rate": 7.957006937018454e-07, + "loss": 0.6447, + "step": 33670 + }, + { + "epoch": 0.9209608837724396, + "grad_norm": 3.078125, + "learning_rate": 7.92958789174961e-07, + "loss": 0.6918, + "step": 33680 + }, + { + "epoch": 0.9212343282153649, + "grad_norm": 2.984375, + "learning_rate": 7.902168846480767e-07, + "loss": 0.7583, + "step": 33690 + }, + { + "epoch": 0.9215077726582902, + "grad_norm": 3.046875, + "learning_rate": 7.874749801211923e-07, + "loss": 0.7628, + "step": 33700 + }, + { + "epoch": 0.9217812171012154, + "grad_norm": 2.671875, + "learning_rate": 7.84733075594308e-07, + "loss": 0.6778, + "step": 33710 + }, + { + "epoch": 0.9220546615441407, + "grad_norm": 2.703125, + "learning_rate": 7.819911710674235e-07, + "loss": 0.7393, + "step": 33720 + }, + { + "epoch": 0.922328105987066, + "grad_norm": 2.5625, + "learning_rate": 7.79249266540539e-07, + "loss": 0.6959, + "step": 33730 + }, + { + "epoch": 0.9226015504299914, + "grad_norm": 2.984375, + "learning_rate": 7.765073620136547e-07, + "loss": 0.7236, + "step": 33740 + }, + { + "epoch": 0.9228749948729167, + "grad_norm": 2.59375, + "learning_rate": 7.737654574867703e-07, + "loss": 0.6686, + "step": 33750 + }, + { + "epoch": 0.923148439315842, + "grad_norm": 2.734375, + "learning_rate": 7.71023552959886e-07, + "loss": 0.6145, + "step": 33760 + }, + { + "epoch": 0.9234218837587673, + "grad_norm": 2.78125, + "learning_rate": 7.682816484330016e-07, + "loss": 0.6953, + "step": 33770 + }, + { + "epoch": 0.9236953282016926, + "grad_norm": 2.453125, + "learning_rate": 7.655397439061173e-07, + "loss": 0.7735, + "step": 33780 + }, + { + "epoch": 0.923968772644618, + "grad_norm": 2.921875, + "learning_rate": 7.627978393792329e-07, + "loss": 0.6689, + "step": 33790 + }, + { + "epoch": 0.9242422170875433, + "grad_norm": 2.828125, + "learning_rate": 7.600559348523486e-07, + "loss": 0.6683, + "step": 33800 + }, + { + "epoch": 0.9245156615304686, + "grad_norm": 2.6875, + "learning_rate": 7.573140303254642e-07, + "loss": 0.6512, + "step": 33810 + }, + { + "epoch": 0.9247891059733938, + "grad_norm": 3.0, + "learning_rate": 7.545721257985797e-07, + "loss": 0.7211, + "step": 33820 + }, + { + "epoch": 0.9250625504163191, + "grad_norm": 3.015625, + "learning_rate": 7.518302212716953e-07, + "loss": 0.7131, + "step": 33830 + }, + { + "epoch": 0.9253359948592444, + "grad_norm": 2.796875, + "learning_rate": 7.49088316744811e-07, + "loss": 0.6708, + "step": 33840 + }, + { + "epoch": 0.9256094393021698, + "grad_norm": 3.84375, + "learning_rate": 7.463464122179266e-07, + "loss": 0.7524, + "step": 33850 + }, + { + "epoch": 0.9258828837450951, + "grad_norm": 3.15625, + "learning_rate": 7.436045076910423e-07, + "loss": 0.649, + "step": 33860 + }, + { + "epoch": 0.9261563281880204, + "grad_norm": 2.40625, + "learning_rate": 7.408626031641579e-07, + "loss": 0.6876, + "step": 33870 + }, + { + "epoch": 0.9264297726309457, + "grad_norm": 3.046875, + "learning_rate": 7.381206986372736e-07, + "loss": 0.7787, + "step": 33880 + }, + { + "epoch": 0.926703217073871, + "grad_norm": 2.671875, + "learning_rate": 7.353787941103892e-07, + "loss": 0.692, + "step": 33890 + }, + { + "epoch": 0.9269766615167964, + "grad_norm": 2.65625, + "learning_rate": 7.326368895835049e-07, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 0.9272501059597217, + "grad_norm": 2.71875, + "learning_rate": 7.298949850566204e-07, + "loss": 0.7151, + "step": 33910 + }, + { + "epoch": 0.9275235504026469, + "grad_norm": 2.65625, + "learning_rate": 7.271530805297359e-07, + "loss": 0.7019, + "step": 33920 + }, + { + "epoch": 0.9277969948455722, + "grad_norm": 3.0625, + "learning_rate": 7.244111760028516e-07, + "loss": 0.7618, + "step": 33930 + }, + { + "epoch": 0.9280704392884975, + "grad_norm": 3.046875, + "learning_rate": 7.216692714759672e-07, + "loss": 0.7042, + "step": 33940 + }, + { + "epoch": 0.9283438837314228, + "grad_norm": 3.296875, + "learning_rate": 7.189273669490829e-07, + "loss": 0.7013, + "step": 33950 + }, + { + "epoch": 0.9286173281743482, + "grad_norm": 3.046875, + "learning_rate": 7.161854624221985e-07, + "loss": 0.7193, + "step": 33960 + }, + { + "epoch": 0.9288907726172735, + "grad_norm": 2.75, + "learning_rate": 7.134435578953142e-07, + "loss": 0.7073, + "step": 33970 + }, + { + "epoch": 0.9291642170601988, + "grad_norm": 2.859375, + "learning_rate": 7.107016533684298e-07, + "loss": 0.6908, + "step": 33980 + }, + { + "epoch": 0.9294376615031241, + "grad_norm": 3.078125, + "learning_rate": 7.079597488415455e-07, + "loss": 0.7555, + "step": 33990 + }, + { + "epoch": 0.9297111059460494, + "grad_norm": 3.046875, + "learning_rate": 7.052178443146611e-07, + "loss": 0.7599, + "step": 34000 + }, + { + "epoch": 0.9299845503889748, + "grad_norm": 2.734375, + "learning_rate": 7.024759397877765e-07, + "loss": 0.7087, + "step": 34010 + }, + { + "epoch": 0.9302579948319001, + "grad_norm": 2.78125, + "learning_rate": 6.997340352608922e-07, + "loss": 0.6177, + "step": 34020 + }, + { + "epoch": 0.9305314392748253, + "grad_norm": 3.078125, + "learning_rate": 6.969921307340078e-07, + "loss": 0.6574, + "step": 34030 + }, + { + "epoch": 0.9308048837177506, + "grad_norm": 3.3125, + "learning_rate": 6.942502262071235e-07, + "loss": 0.7505, + "step": 34040 + }, + { + "epoch": 0.9310783281606759, + "grad_norm": 2.65625, + "learning_rate": 6.915083216802391e-07, + "loss": 0.6506, + "step": 34050 + }, + { + "epoch": 0.9313517726036012, + "grad_norm": 2.8125, + "learning_rate": 6.887664171533548e-07, + "loss": 0.7158, + "step": 34060 + }, + { + "epoch": 0.9316252170465266, + "grad_norm": 2.71875, + "learning_rate": 6.860245126264704e-07, + "loss": 0.7064, + "step": 34070 + }, + { + "epoch": 0.9318986614894519, + "grad_norm": 2.84375, + "learning_rate": 6.832826080995861e-07, + "loss": 0.6666, + "step": 34080 + }, + { + "epoch": 0.9321721059323772, + "grad_norm": 2.890625, + "learning_rate": 6.805407035727017e-07, + "loss": 0.7401, + "step": 34090 + }, + { + "epoch": 0.9324455503753025, + "grad_norm": 3.203125, + "learning_rate": 6.777987990458173e-07, + "loss": 0.7309, + "step": 34100 + }, + { + "epoch": 0.9327189948182278, + "grad_norm": 2.640625, + "learning_rate": 6.750568945189328e-07, + "loss": 0.5635, + "step": 34110 + }, + { + "epoch": 0.9329924392611532, + "grad_norm": 2.296875, + "learning_rate": 6.723149899920485e-07, + "loss": 0.7179, + "step": 34120 + }, + { + "epoch": 0.9332658837040784, + "grad_norm": 2.234375, + "learning_rate": 6.695730854651641e-07, + "loss": 0.5347, + "step": 34130 + }, + { + "epoch": 0.9335393281470037, + "grad_norm": 2.5, + "learning_rate": 6.668311809382798e-07, + "loss": 0.6241, + "step": 34140 + }, + { + "epoch": 0.933812772589929, + "grad_norm": 2.5, + "learning_rate": 6.640892764113954e-07, + "loss": 0.6597, + "step": 34150 + }, + { + "epoch": 0.9340862170328543, + "grad_norm": 2.390625, + "learning_rate": 6.613473718845111e-07, + "loss": 0.6212, + "step": 34160 + }, + { + "epoch": 0.9343596614757796, + "grad_norm": 2.5625, + "learning_rate": 6.586054673576267e-07, + "loss": 0.7027, + "step": 34170 + }, + { + "epoch": 0.934633105918705, + "grad_norm": 2.890625, + "learning_rate": 6.558635628307424e-07, + "loss": 0.6964, + "step": 34180 + }, + { + "epoch": 0.9349065503616303, + "grad_norm": 2.796875, + "learning_rate": 6.531216583038579e-07, + "loss": 0.7089, + "step": 34190 + }, + { + "epoch": 0.9351799948045556, + "grad_norm": 2.359375, + "learning_rate": 6.503797537769735e-07, + "loss": 0.6494, + "step": 34200 + }, + { + "epoch": 0.9354534392474809, + "grad_norm": 2.90625, + "learning_rate": 6.476378492500892e-07, + "loss": 0.6194, + "step": 34210 + }, + { + "epoch": 0.9357268836904062, + "grad_norm": 3.1875, + "learning_rate": 6.448959447232047e-07, + "loss": 0.6999, + "step": 34220 + }, + { + "epoch": 0.9360003281333316, + "grad_norm": 2.859375, + "learning_rate": 6.421540401963204e-07, + "loss": 0.6851, + "step": 34230 + }, + { + "epoch": 0.9362737725762568, + "grad_norm": 2.84375, + "learning_rate": 6.39412135669436e-07, + "loss": 0.7426, + "step": 34240 + }, + { + "epoch": 0.9365472170191821, + "grad_norm": 2.609375, + "learning_rate": 6.366702311425517e-07, + "loss": 0.628, + "step": 34250 + }, + { + "epoch": 0.9368206614621074, + "grad_norm": 2.28125, + "learning_rate": 6.339283266156673e-07, + "loss": 0.6801, + "step": 34260 + }, + { + "epoch": 0.9370941059050327, + "grad_norm": 2.75, + "learning_rate": 6.31186422088783e-07, + "loss": 0.6944, + "step": 34270 + }, + { + "epoch": 0.937367550347958, + "grad_norm": 2.546875, + "learning_rate": 6.284445175618986e-07, + "loss": 0.7342, + "step": 34280 + }, + { + "epoch": 0.9376409947908834, + "grad_norm": 2.671875, + "learning_rate": 6.257026130350141e-07, + "loss": 0.7032, + "step": 34290 + }, + { + "epoch": 0.9379144392338087, + "grad_norm": 2.796875, + "learning_rate": 6.229607085081299e-07, + "loss": 0.7003, + "step": 34300 + }, + { + "epoch": 0.938187883676734, + "grad_norm": 2.484375, + "learning_rate": 6.202188039812453e-07, + "loss": 0.6951, + "step": 34310 + }, + { + "epoch": 0.9384613281196593, + "grad_norm": 2.96875, + "learning_rate": 6.17476899454361e-07, + "loss": 0.6513, + "step": 34320 + }, + { + "epoch": 0.9387347725625846, + "grad_norm": 2.5625, + "learning_rate": 6.147349949274766e-07, + "loss": 0.7214, + "step": 34330 + }, + { + "epoch": 0.93900821700551, + "grad_norm": 2.71875, + "learning_rate": 6.119930904005923e-07, + "loss": 0.6893, + "step": 34340 + }, + { + "epoch": 0.9392816614484352, + "grad_norm": 2.859375, + "learning_rate": 6.092511858737079e-07, + "loss": 0.6913, + "step": 34350 + }, + { + "epoch": 0.9395551058913605, + "grad_norm": 2.859375, + "learning_rate": 6.065092813468235e-07, + "loss": 0.7542, + "step": 34360 + }, + { + "epoch": 0.9398285503342858, + "grad_norm": 3.0, + "learning_rate": 6.037673768199392e-07, + "loss": 0.7434, + "step": 34370 + }, + { + "epoch": 0.9401019947772111, + "grad_norm": 2.671875, + "learning_rate": 6.010254722930548e-07, + "loss": 0.6511, + "step": 34380 + }, + { + "epoch": 0.9403754392201364, + "grad_norm": 2.84375, + "learning_rate": 5.982835677661705e-07, + "loss": 0.6766, + "step": 34390 + }, + { + "epoch": 0.9406488836630618, + "grad_norm": 3.109375, + "learning_rate": 5.955416632392861e-07, + "loss": 0.6432, + "step": 34400 + }, + { + "epoch": 0.9409223281059871, + "grad_norm": 2.71875, + "learning_rate": 5.927997587124016e-07, + "loss": 0.7139, + "step": 34410 + }, + { + "epoch": 0.9411957725489124, + "grad_norm": 2.953125, + "learning_rate": 5.900578541855173e-07, + "loss": 0.7345, + "step": 34420 + }, + { + "epoch": 0.9414692169918377, + "grad_norm": 2.84375, + "learning_rate": 5.873159496586329e-07, + "loss": 0.6788, + "step": 34430 + }, + { + "epoch": 0.941742661434763, + "grad_norm": 2.609375, + "learning_rate": 5.845740451317486e-07, + "loss": 0.6549, + "step": 34440 + }, + { + "epoch": 0.9420161058776882, + "grad_norm": 2.6875, + "learning_rate": 5.818321406048642e-07, + "loss": 0.7004, + "step": 34450 + }, + { + "epoch": 0.9422895503206136, + "grad_norm": 2.828125, + "learning_rate": 5.790902360779798e-07, + "loss": 0.6704, + "step": 34460 + }, + { + "epoch": 0.9425629947635389, + "grad_norm": 3.15625, + "learning_rate": 5.763483315510955e-07, + "loss": 0.716, + "step": 34470 + }, + { + "epoch": 0.9428364392064642, + "grad_norm": 3.25, + "learning_rate": 5.736064270242111e-07, + "loss": 0.7607, + "step": 34480 + }, + { + "epoch": 0.9431098836493895, + "grad_norm": 3.3125, + "learning_rate": 5.708645224973267e-07, + "loss": 0.7615, + "step": 34490 + }, + { + "epoch": 0.9433833280923148, + "grad_norm": 2.6875, + "learning_rate": 5.681226179704423e-07, + "loss": 0.6662, + "step": 34500 + }, + { + "epoch": 0.9436567725352402, + "grad_norm": 3.0625, + "learning_rate": 5.65380713443558e-07, + "loss": 0.7541, + "step": 34510 + }, + { + "epoch": 0.9439302169781655, + "grad_norm": 2.875, + "learning_rate": 5.626388089166735e-07, + "loss": 0.7429, + "step": 34520 + }, + { + "epoch": 0.9442036614210908, + "grad_norm": 3.046875, + "learning_rate": 5.598969043897892e-07, + "loss": 0.6663, + "step": 34530 + }, + { + "epoch": 0.9444771058640161, + "grad_norm": 2.890625, + "learning_rate": 5.571549998629048e-07, + "loss": 0.7608, + "step": 34540 + }, + { + "epoch": 0.9447505503069414, + "grad_norm": 2.921875, + "learning_rate": 5.544130953360204e-07, + "loss": 0.7172, + "step": 34550 + }, + { + "epoch": 0.9450239947498666, + "grad_norm": 2.9375, + "learning_rate": 5.516711908091361e-07, + "loss": 0.6975, + "step": 34560 + }, + { + "epoch": 0.945297439192792, + "grad_norm": 2.875, + "learning_rate": 5.489292862822517e-07, + "loss": 0.6997, + "step": 34570 + }, + { + "epoch": 0.9455708836357173, + "grad_norm": 2.921875, + "learning_rate": 5.461873817553674e-07, + "loss": 0.6904, + "step": 34580 + }, + { + "epoch": 0.9458443280786426, + "grad_norm": 2.734375, + "learning_rate": 5.43445477228483e-07, + "loss": 0.6558, + "step": 34590 + }, + { + "epoch": 0.9461177725215679, + "grad_norm": 2.953125, + "learning_rate": 5.407035727015986e-07, + "loss": 0.7171, + "step": 34600 + }, + { + "epoch": 0.9463912169644932, + "grad_norm": 2.8125, + "learning_rate": 5.379616681747143e-07, + "loss": 0.7054, + "step": 34610 + }, + { + "epoch": 0.9466646614074186, + "grad_norm": 3.125, + "learning_rate": 5.352197636478298e-07, + "loss": 0.7574, + "step": 34620 + }, + { + "epoch": 0.9469381058503439, + "grad_norm": 2.921875, + "learning_rate": 5.324778591209454e-07, + "loss": 0.6892, + "step": 34630 + }, + { + "epoch": 0.9472115502932692, + "grad_norm": 2.921875, + "learning_rate": 5.29735954594061e-07, + "loss": 0.7158, + "step": 34640 + }, + { + "epoch": 0.9474849947361945, + "grad_norm": 3.09375, + "learning_rate": 5.269940500671767e-07, + "loss": 0.6707, + "step": 34650 + }, + { + "epoch": 0.9477584391791197, + "grad_norm": 2.578125, + "learning_rate": 5.242521455402923e-07, + "loss": 0.5821, + "step": 34660 + }, + { + "epoch": 0.948031883622045, + "grad_norm": 2.484375, + "learning_rate": 5.21510241013408e-07, + "loss": 0.6984, + "step": 34670 + }, + { + "epoch": 0.9483053280649704, + "grad_norm": 3.421875, + "learning_rate": 5.187683364865236e-07, + "loss": 0.7214, + "step": 34680 + }, + { + "epoch": 0.9485787725078957, + "grad_norm": 2.5, + "learning_rate": 5.160264319596392e-07, + "loss": 0.7149, + "step": 34690 + }, + { + "epoch": 0.948852216950821, + "grad_norm": 2.671875, + "learning_rate": 5.132845274327549e-07, + "loss": 0.6753, + "step": 34700 + }, + { + "epoch": 0.9491256613937463, + "grad_norm": 2.65625, + "learning_rate": 5.105426229058704e-07, + "loss": 0.6647, + "step": 34710 + }, + { + "epoch": 0.9493991058366716, + "grad_norm": 2.890625, + "learning_rate": 5.078007183789861e-07, + "loss": 0.6846, + "step": 34720 + }, + { + "epoch": 0.949672550279597, + "grad_norm": 2.765625, + "learning_rate": 5.050588138521017e-07, + "loss": 0.747, + "step": 34730 + }, + { + "epoch": 0.9499459947225223, + "grad_norm": 4.78125, + "learning_rate": 5.023169093252173e-07, + "loss": 0.6727, + "step": 34740 + }, + { + "epoch": 0.9502194391654476, + "grad_norm": 2.453125, + "learning_rate": 4.99575004798333e-07, + "loss": 0.6553, + "step": 34750 + }, + { + "epoch": 0.9504928836083729, + "grad_norm": 2.828125, + "learning_rate": 4.968331002714486e-07, + "loss": 0.7261, + "step": 34760 + }, + { + "epoch": 0.9507663280512981, + "grad_norm": 2.6875, + "learning_rate": 4.940911957445642e-07, + "loss": 0.693, + "step": 34770 + }, + { + "epoch": 0.9510397724942234, + "grad_norm": 3.0, + "learning_rate": 4.913492912176798e-07, + "loss": 0.6444, + "step": 34780 + }, + { + "epoch": 0.9513132169371488, + "grad_norm": 2.8125, + "learning_rate": 4.886073866907955e-07, + "loss": 0.733, + "step": 34790 + }, + { + "epoch": 0.9515866613800741, + "grad_norm": 2.828125, + "learning_rate": 4.858654821639111e-07, + "loss": 0.6775, + "step": 34800 + }, + { + "epoch": 0.9518601058229994, + "grad_norm": 3.4375, + "learning_rate": 4.831235776370268e-07, + "loss": 0.6965, + "step": 34810 + }, + { + "epoch": 0.9521335502659247, + "grad_norm": 3.0, + "learning_rate": 4.803816731101423e-07, + "loss": 0.6736, + "step": 34820 + }, + { + "epoch": 0.95240699470885, + "grad_norm": 3.28125, + "learning_rate": 4.776397685832579e-07, + "loss": 0.7333, + "step": 34830 + }, + { + "epoch": 0.9526804391517754, + "grad_norm": 2.9375, + "learning_rate": 4.748978640563736e-07, + "loss": 0.7, + "step": 34840 + }, + { + "epoch": 0.9529538835947007, + "grad_norm": 2.765625, + "learning_rate": 4.7215595952948926e-07, + "loss": 0.6343, + "step": 34850 + }, + { + "epoch": 0.953227328037626, + "grad_norm": 2.46875, + "learning_rate": 4.694140550026049e-07, + "loss": 0.6286, + "step": 34860 + }, + { + "epoch": 0.9535007724805513, + "grad_norm": 3.046875, + "learning_rate": 4.6667215047572044e-07, + "loss": 0.6234, + "step": 34870 + }, + { + "epoch": 0.9537742169234765, + "grad_norm": 2.921875, + "learning_rate": 4.639302459488361e-07, + "loss": 0.6814, + "step": 34880 + }, + { + "epoch": 0.9540476613664018, + "grad_norm": 3.125, + "learning_rate": 4.611883414219517e-07, + "loss": 0.6411, + "step": 34890 + }, + { + "epoch": 0.9543211058093272, + "grad_norm": 3.109375, + "learning_rate": 4.5844643689506736e-07, + "loss": 0.6912, + "step": 34900 + }, + { + "epoch": 0.9545945502522525, + "grad_norm": 2.625, + "learning_rate": 4.5570453236818295e-07, + "loss": 0.7091, + "step": 34910 + }, + { + "epoch": 0.9548679946951778, + "grad_norm": 2.359375, + "learning_rate": 4.529626278412986e-07, + "loss": 0.6023, + "step": 34920 + }, + { + "epoch": 0.9551414391381031, + "grad_norm": 2.484375, + "learning_rate": 4.5022072331441423e-07, + "loss": 0.6433, + "step": 34930 + }, + { + "epoch": 0.9554148835810284, + "grad_norm": 2.5, + "learning_rate": 4.4747881878752987e-07, + "loss": 0.6087, + "step": 34940 + }, + { + "epoch": 0.9556883280239538, + "grad_norm": 2.984375, + "learning_rate": 4.447369142606455e-07, + "loss": 0.7129, + "step": 34950 + }, + { + "epoch": 0.9559617724668791, + "grad_norm": 2.703125, + "learning_rate": 4.419950097337611e-07, + "loss": 0.668, + "step": 34960 + }, + { + "epoch": 0.9562352169098044, + "grad_norm": 3.40625, + "learning_rate": 4.3925310520687674e-07, + "loss": 0.6594, + "step": 34970 + }, + { + "epoch": 0.9565086613527296, + "grad_norm": 2.5625, + "learning_rate": 4.365112006799924e-07, + "loss": 0.7197, + "step": 34980 + }, + { + "epoch": 0.9567821057956549, + "grad_norm": 2.78125, + "learning_rate": 4.33769296153108e-07, + "loss": 0.7347, + "step": 34990 + }, + { + "epoch": 0.9570555502385802, + "grad_norm": 3.28125, + "learning_rate": 4.3102739162622356e-07, + "loss": 0.6882, + "step": 35000 + }, + { + "epoch": 0.9573289946815056, + "grad_norm": 2.921875, + "learning_rate": 4.282854870993392e-07, + "loss": 0.7005, + "step": 35010 + }, + { + "epoch": 0.9576024391244309, + "grad_norm": 2.8125, + "learning_rate": 4.2554358257245484e-07, + "loss": 0.7138, + "step": 35020 + }, + { + "epoch": 0.9578758835673562, + "grad_norm": 3.59375, + "learning_rate": 4.228016780455705e-07, + "loss": 0.7006, + "step": 35030 + }, + { + "epoch": 0.9581493280102815, + "grad_norm": 2.5, + "learning_rate": 4.200597735186861e-07, + "loss": 0.6811, + "step": 35040 + }, + { + "epoch": 0.9584227724532068, + "grad_norm": 2.875, + "learning_rate": 4.173178689918017e-07, + "loss": 0.7468, + "step": 35050 + }, + { + "epoch": 0.9586962168961322, + "grad_norm": 2.484375, + "learning_rate": 4.1457596446491735e-07, + "loss": 0.7405, + "step": 35060 + }, + { + "epoch": 0.9589696613390575, + "grad_norm": 2.953125, + "learning_rate": 4.11834059938033e-07, + "loss": 0.7261, + "step": 35070 + }, + { + "epoch": 0.9592431057819828, + "grad_norm": 2.953125, + "learning_rate": 4.0909215541114863e-07, + "loss": 0.702, + "step": 35080 + }, + { + "epoch": 0.959516550224908, + "grad_norm": 2.78125, + "learning_rate": 4.0635025088426427e-07, + "loss": 0.6498, + "step": 35090 + }, + { + "epoch": 0.9597899946678333, + "grad_norm": 2.6875, + "learning_rate": 4.0360834635737986e-07, + "loss": 0.6982, + "step": 35100 + }, + { + "epoch": 0.9600634391107586, + "grad_norm": 2.828125, + "learning_rate": 4.008664418304955e-07, + "loss": 0.7046, + "step": 35110 + }, + { + "epoch": 0.960336883553684, + "grad_norm": 2.859375, + "learning_rate": 3.9812453730361114e-07, + "loss": 0.6958, + "step": 35120 + }, + { + "epoch": 0.9606103279966093, + "grad_norm": 2.734375, + "learning_rate": 3.953826327767268e-07, + "loss": 0.7164, + "step": 35130 + }, + { + "epoch": 0.9608837724395346, + "grad_norm": 2.578125, + "learning_rate": 3.926407282498423e-07, + "loss": 0.6473, + "step": 35140 + }, + { + "epoch": 0.9611572168824599, + "grad_norm": 2.640625, + "learning_rate": 3.8989882372295796e-07, + "loss": 0.6317, + "step": 35150 + }, + { + "epoch": 0.9614306613253852, + "grad_norm": 2.78125, + "learning_rate": 3.871569191960736e-07, + "loss": 0.7089, + "step": 35160 + }, + { + "epoch": 0.9617041057683106, + "grad_norm": 2.78125, + "learning_rate": 3.8441501466918924e-07, + "loss": 0.6676, + "step": 35170 + }, + { + "epoch": 0.9619775502112359, + "grad_norm": 2.5, + "learning_rate": 3.816731101423049e-07, + "loss": 0.7055, + "step": 35180 + }, + { + "epoch": 0.9622509946541612, + "grad_norm": 2.5625, + "learning_rate": 3.7893120561542047e-07, + "loss": 0.7123, + "step": 35190 + }, + { + "epoch": 0.9625244390970864, + "grad_norm": 3.078125, + "learning_rate": 3.761893010885361e-07, + "loss": 0.7269, + "step": 35200 + }, + { + "epoch": 0.9627978835400117, + "grad_norm": 2.78125, + "learning_rate": 3.7344739656165175e-07, + "loss": 0.6884, + "step": 35210 + }, + { + "epoch": 0.963071327982937, + "grad_norm": 2.8125, + "learning_rate": 3.707054920347674e-07, + "loss": 0.6583, + "step": 35220 + }, + { + "epoch": 0.9633447724258624, + "grad_norm": 3.0625, + "learning_rate": 3.6796358750788303e-07, + "loss": 0.7579, + "step": 35230 + }, + { + "epoch": 0.9636182168687877, + "grad_norm": 2.53125, + "learning_rate": 3.652216829809986e-07, + "loss": 0.6611, + "step": 35240 + }, + { + "epoch": 0.963891661311713, + "grad_norm": 2.84375, + "learning_rate": 3.6247977845411426e-07, + "loss": 0.6366, + "step": 35250 + }, + { + "epoch": 0.9641651057546383, + "grad_norm": 3.203125, + "learning_rate": 3.597378739272299e-07, + "loss": 0.7212, + "step": 35260 + }, + { + "epoch": 0.9644385501975636, + "grad_norm": 2.703125, + "learning_rate": 3.5699596940034554e-07, + "loss": 0.7117, + "step": 35270 + }, + { + "epoch": 0.964711994640489, + "grad_norm": 3.046875, + "learning_rate": 3.5425406487346113e-07, + "loss": 0.6765, + "step": 35280 + }, + { + "epoch": 0.9649854390834143, + "grad_norm": 2.703125, + "learning_rate": 3.5151216034657677e-07, + "loss": 0.6882, + "step": 35290 + }, + { + "epoch": 0.9652588835263395, + "grad_norm": 2.859375, + "learning_rate": 3.487702558196924e-07, + "loss": 0.7082, + "step": 35300 + }, + { + "epoch": 0.9655323279692648, + "grad_norm": 2.625, + "learning_rate": 3.4602835129280805e-07, + "loss": 0.7668, + "step": 35310 + }, + { + "epoch": 0.9658057724121901, + "grad_norm": 2.65625, + "learning_rate": 3.432864467659237e-07, + "loss": 0.6504, + "step": 35320 + }, + { + "epoch": 0.9660792168551154, + "grad_norm": 2.953125, + "learning_rate": 3.405445422390392e-07, + "loss": 0.6301, + "step": 35330 + }, + { + "epoch": 0.9663526612980408, + "grad_norm": 2.359375, + "learning_rate": 3.3780263771215487e-07, + "loss": 0.6349, + "step": 35340 + }, + { + "epoch": 0.9666261057409661, + "grad_norm": 2.78125, + "learning_rate": 3.350607331852705e-07, + "loss": 0.7404, + "step": 35350 + }, + { + "epoch": 0.9668995501838914, + "grad_norm": 2.65625, + "learning_rate": 3.3231882865838615e-07, + "loss": 0.7021, + "step": 35360 + }, + { + "epoch": 0.9671729946268167, + "grad_norm": 2.875, + "learning_rate": 3.295769241315018e-07, + "loss": 0.7051, + "step": 35370 + }, + { + "epoch": 0.967446439069742, + "grad_norm": 2.71875, + "learning_rate": 3.268350196046174e-07, + "loss": 0.6914, + "step": 35380 + }, + { + "epoch": 0.9677198835126674, + "grad_norm": 2.671875, + "learning_rate": 3.24093115077733e-07, + "loss": 0.7076, + "step": 35390 + }, + { + "epoch": 0.9679933279555927, + "grad_norm": 2.515625, + "learning_rate": 3.2135121055084866e-07, + "loss": 0.7532, + "step": 35400 + }, + { + "epoch": 0.9682667723985179, + "grad_norm": 2.953125, + "learning_rate": 3.186093060239643e-07, + "loss": 0.7188, + "step": 35410 + }, + { + "epoch": 0.9685402168414432, + "grad_norm": 2.453125, + "learning_rate": 3.158674014970799e-07, + "loss": 0.6316, + "step": 35420 + }, + { + "epoch": 0.9688136612843685, + "grad_norm": 2.78125, + "learning_rate": 3.1312549697019553e-07, + "loss": 0.6367, + "step": 35430 + }, + { + "epoch": 0.9690871057272938, + "grad_norm": 2.53125, + "learning_rate": 3.1038359244331117e-07, + "loss": 0.7353, + "step": 35440 + }, + { + "epoch": 0.9693605501702192, + "grad_norm": 3.359375, + "learning_rate": 3.076416879164268e-07, + "loss": 0.667, + "step": 35450 + }, + { + "epoch": 0.9696339946131445, + "grad_norm": 2.828125, + "learning_rate": 3.048997833895424e-07, + "loss": 0.7658, + "step": 35460 + }, + { + "epoch": 0.9699074390560698, + "grad_norm": 2.828125, + "learning_rate": 3.0215787886265804e-07, + "loss": 0.7402, + "step": 35470 + }, + { + "epoch": 0.9701808834989951, + "grad_norm": 2.640625, + "learning_rate": 2.994159743357736e-07, + "loss": 0.6522, + "step": 35480 + }, + { + "epoch": 0.9704543279419204, + "grad_norm": 2.5625, + "learning_rate": 2.9667406980888927e-07, + "loss": 0.6638, + "step": 35490 + }, + { + "epoch": 0.9707277723848458, + "grad_norm": 2.6875, + "learning_rate": 2.939321652820049e-07, + "loss": 0.7189, + "step": 35500 + }, + { + "epoch": 0.971001216827771, + "grad_norm": 2.5, + "learning_rate": 2.9119026075512055e-07, + "loss": 0.7112, + "step": 35510 + }, + { + "epoch": 0.9712746612706963, + "grad_norm": 2.53125, + "learning_rate": 2.884483562282362e-07, + "loss": 0.7113, + "step": 35520 + }, + { + "epoch": 0.9715481057136216, + "grad_norm": 2.890625, + "learning_rate": 2.857064517013518e-07, + "loss": 0.7018, + "step": 35530 + }, + { + "epoch": 0.9718215501565469, + "grad_norm": 2.484375, + "learning_rate": 2.829645471744674e-07, + "loss": 0.6085, + "step": 35540 + }, + { + "epoch": 0.9720949945994722, + "grad_norm": 2.75, + "learning_rate": 2.80222642647583e-07, + "loss": 0.6941, + "step": 35550 + }, + { + "epoch": 0.9723684390423976, + "grad_norm": 2.921875, + "learning_rate": 2.7748073812069865e-07, + "loss": 0.6904, + "step": 35560 + }, + { + "epoch": 0.9726418834853229, + "grad_norm": 2.59375, + "learning_rate": 2.747388335938143e-07, + "loss": 0.5828, + "step": 35570 + }, + { + "epoch": 0.9729153279282482, + "grad_norm": 2.890625, + "learning_rate": 2.7199692906692993e-07, + "loss": 0.7111, + "step": 35580 + }, + { + "epoch": 0.9731887723711735, + "grad_norm": 2.796875, + "learning_rate": 2.6925502454004557e-07, + "loss": 0.7078, + "step": 35590 + }, + { + "epoch": 0.9734622168140988, + "grad_norm": 2.921875, + "learning_rate": 2.6651312001316116e-07, + "loss": 0.6168, + "step": 35600 + }, + { + "epoch": 0.9737356612570242, + "grad_norm": 2.84375, + "learning_rate": 2.637712154862768e-07, + "loss": 0.685, + "step": 35610 + }, + { + "epoch": 0.9740091056999494, + "grad_norm": 2.640625, + "learning_rate": 2.610293109593924e-07, + "loss": 0.6182, + "step": 35620 + }, + { + "epoch": 0.9742825501428747, + "grad_norm": 2.625, + "learning_rate": 2.58287406432508e-07, + "loss": 0.7358, + "step": 35630 + }, + { + "epoch": 0.9745559945858, + "grad_norm": 3.046875, + "learning_rate": 2.5554550190562367e-07, + "loss": 0.7443, + "step": 35640 + }, + { + "epoch": 0.9748294390287253, + "grad_norm": 2.875, + "learning_rate": 2.528035973787393e-07, + "loss": 0.6795, + "step": 35650 + }, + { + "epoch": 0.9751028834716506, + "grad_norm": 2.984375, + "learning_rate": 2.5006169285185495e-07, + "loss": 0.7345, + "step": 35660 + }, + { + "epoch": 0.975376327914576, + "grad_norm": 2.8125, + "learning_rate": 2.4731978832497054e-07, + "loss": 0.6703, + "step": 35670 + }, + { + "epoch": 0.9756497723575013, + "grad_norm": 2.6875, + "learning_rate": 2.445778837980862e-07, + "loss": 0.6477, + "step": 35680 + }, + { + "epoch": 0.9759232168004266, + "grad_norm": 2.625, + "learning_rate": 2.4183597927120176e-07, + "loss": 0.6718, + "step": 35690 + }, + { + "epoch": 0.9761966612433519, + "grad_norm": 2.71875, + "learning_rate": 2.390940747443174e-07, + "loss": 0.7363, + "step": 35700 + }, + { + "epoch": 0.9764701056862772, + "grad_norm": 2.421875, + "learning_rate": 2.3635217021743305e-07, + "loss": 0.7118, + "step": 35710 + }, + { + "epoch": 0.9767435501292026, + "grad_norm": 3.5, + "learning_rate": 2.3361026569054869e-07, + "loss": 0.8073, + "step": 35720 + }, + { + "epoch": 0.9770169945721278, + "grad_norm": 2.453125, + "learning_rate": 2.3086836116366433e-07, + "loss": 0.7192, + "step": 35730 + }, + { + "epoch": 0.9772904390150531, + "grad_norm": 2.859375, + "learning_rate": 2.2812645663677991e-07, + "loss": 0.7012, + "step": 35740 + }, + { + "epoch": 0.9775638834579784, + "grad_norm": 2.875, + "learning_rate": 2.2538455210989556e-07, + "loss": 0.663, + "step": 35750 + }, + { + "epoch": 0.9778373279009037, + "grad_norm": 2.953125, + "learning_rate": 2.2264264758301117e-07, + "loss": 0.6944, + "step": 35760 + }, + { + "epoch": 0.978110772343829, + "grad_norm": 2.796875, + "learning_rate": 2.199007430561268e-07, + "loss": 0.7547, + "step": 35770 + }, + { + "epoch": 0.9783842167867544, + "grad_norm": 3.140625, + "learning_rate": 2.1715883852924243e-07, + "loss": 0.7007, + "step": 35780 + }, + { + "epoch": 0.9786576612296797, + "grad_norm": 2.46875, + "learning_rate": 2.1441693400235807e-07, + "loss": 0.6133, + "step": 35790 + }, + { + "epoch": 0.978931105672605, + "grad_norm": 2.890625, + "learning_rate": 2.1167502947547365e-07, + "loss": 0.7038, + "step": 35800 + }, + { + "epoch": 0.9792045501155303, + "grad_norm": 2.8125, + "learning_rate": 2.089331249485893e-07, + "loss": 0.7063, + "step": 35810 + }, + { + "epoch": 0.9794779945584556, + "grad_norm": 2.984375, + "learning_rate": 2.0619122042170494e-07, + "loss": 0.705, + "step": 35820 + }, + { + "epoch": 0.9797514390013808, + "grad_norm": 3.046875, + "learning_rate": 2.0344931589482055e-07, + "loss": 0.6751, + "step": 35830 + }, + { + "epoch": 0.9800248834443062, + "grad_norm": 2.703125, + "learning_rate": 2.007074113679362e-07, + "loss": 0.7438, + "step": 35840 + }, + { + "epoch": 0.9802983278872315, + "grad_norm": 3.140625, + "learning_rate": 1.979655068410518e-07, + "loss": 0.7276, + "step": 35850 + }, + { + "epoch": 0.9805717723301568, + "grad_norm": 2.65625, + "learning_rate": 1.9522360231416745e-07, + "loss": 0.6596, + "step": 35860 + }, + { + "epoch": 0.9808452167730821, + "grad_norm": 2.625, + "learning_rate": 1.9248169778728303e-07, + "loss": 0.7635, + "step": 35870 + }, + { + "epoch": 0.9811186612160074, + "grad_norm": 3.65625, + "learning_rate": 1.8973979326039867e-07, + "loss": 0.7105, + "step": 35880 + }, + { + "epoch": 0.9813921056589328, + "grad_norm": 2.25, + "learning_rate": 1.8699788873351431e-07, + "loss": 0.6777, + "step": 35890 + }, + { + "epoch": 0.9816655501018581, + "grad_norm": 2.765625, + "learning_rate": 1.8425598420662993e-07, + "loss": 0.7574, + "step": 35900 + }, + { + "epoch": 0.9819389945447834, + "grad_norm": 2.78125, + "learning_rate": 1.8151407967974557e-07, + "loss": 0.7161, + "step": 35910 + }, + { + "epoch": 0.9822124389877087, + "grad_norm": 2.71875, + "learning_rate": 1.7877217515286118e-07, + "loss": 0.7063, + "step": 35920 + }, + { + "epoch": 0.982485883430634, + "grad_norm": 2.71875, + "learning_rate": 1.7603027062597683e-07, + "loss": 0.6843, + "step": 35930 + }, + { + "epoch": 0.9827593278735592, + "grad_norm": 2.734375, + "learning_rate": 1.7328836609909244e-07, + "loss": 0.7016, + "step": 35940 + }, + { + "epoch": 0.9830327723164846, + "grad_norm": 2.859375, + "learning_rate": 1.7054646157220808e-07, + "loss": 0.6936, + "step": 35950 + }, + { + "epoch": 0.9833062167594099, + "grad_norm": 2.796875, + "learning_rate": 1.6780455704532372e-07, + "loss": 0.7624, + "step": 35960 + }, + { + "epoch": 0.9835796612023352, + "grad_norm": 2.59375, + "learning_rate": 1.650626525184393e-07, + "loss": 0.6882, + "step": 35970 + }, + { + "epoch": 0.9838531056452605, + "grad_norm": 2.59375, + "learning_rate": 1.6232074799155495e-07, + "loss": 0.6692, + "step": 35980 + }, + { + "epoch": 0.9841265500881858, + "grad_norm": 2.875, + "learning_rate": 1.5957884346467056e-07, + "loss": 0.7824, + "step": 35990 + }, + { + "epoch": 0.9843999945311112, + "grad_norm": 2.546875, + "learning_rate": 1.568369389377862e-07, + "loss": 0.712, + "step": 36000 + }, + { + "epoch": 0.9846734389740365, + "grad_norm": 2.515625, + "learning_rate": 1.5409503441090182e-07, + "loss": 0.7134, + "step": 36010 + }, + { + "epoch": 0.9849468834169618, + "grad_norm": 2.75, + "learning_rate": 1.5135312988401746e-07, + "loss": 0.6855, + "step": 36020 + }, + { + "epoch": 0.9852203278598871, + "grad_norm": 2.90625, + "learning_rate": 1.4861122535713307e-07, + "loss": 0.615, + "step": 36030 + }, + { + "epoch": 0.9854937723028123, + "grad_norm": 3.03125, + "learning_rate": 1.458693208302487e-07, + "loss": 0.7006, + "step": 36040 + }, + { + "epoch": 0.9857672167457376, + "grad_norm": 2.890625, + "learning_rate": 1.4312741630336433e-07, + "loss": 0.7225, + "step": 36050 + }, + { + "epoch": 0.986040661188663, + "grad_norm": 2.640625, + "learning_rate": 1.4038551177647997e-07, + "loss": 0.6622, + "step": 36060 + }, + { + "epoch": 0.9863141056315883, + "grad_norm": 2.4375, + "learning_rate": 1.3764360724959558e-07, + "loss": 0.652, + "step": 36070 + }, + { + "epoch": 0.9865875500745136, + "grad_norm": 2.8125, + "learning_rate": 1.349017027227112e-07, + "loss": 0.7124, + "step": 36080 + }, + { + "epoch": 0.9868609945174389, + "grad_norm": 3.046875, + "learning_rate": 1.3215979819582684e-07, + "loss": 0.7341, + "step": 36090 + }, + { + "epoch": 0.9871344389603642, + "grad_norm": 2.765625, + "learning_rate": 1.2941789366894245e-07, + "loss": 0.7236, + "step": 36100 + }, + { + "epoch": 0.9874078834032896, + "grad_norm": 2.671875, + "learning_rate": 1.2667598914205807e-07, + "loss": 0.7345, + "step": 36110 + }, + { + "epoch": 0.9876813278462149, + "grad_norm": 3.3125, + "learning_rate": 1.239340846151737e-07, + "loss": 0.7393, + "step": 36120 + }, + { + "epoch": 0.9879547722891402, + "grad_norm": 2.71875, + "learning_rate": 1.2119218008828935e-07, + "loss": 0.6382, + "step": 36130 + }, + { + "epoch": 0.9882282167320655, + "grad_norm": 2.375, + "learning_rate": 1.1845027556140496e-07, + "loss": 0.6763, + "step": 36140 + }, + { + "epoch": 0.9885016611749907, + "grad_norm": 2.71875, + "learning_rate": 1.1570837103452059e-07, + "loss": 0.6947, + "step": 36150 + }, + { + "epoch": 0.988775105617916, + "grad_norm": 2.671875, + "learning_rate": 1.1296646650763622e-07, + "loss": 0.7217, + "step": 36160 + }, + { + "epoch": 0.9890485500608414, + "grad_norm": 2.984375, + "learning_rate": 1.1022456198075183e-07, + "loss": 0.7436, + "step": 36170 + }, + { + "epoch": 0.9893219945037667, + "grad_norm": 2.75, + "learning_rate": 1.0748265745386746e-07, + "loss": 0.6377, + "step": 36180 + }, + { + "epoch": 0.989595438946692, + "grad_norm": 2.65625, + "learning_rate": 1.0474075292698309e-07, + "loss": 0.6429, + "step": 36190 + }, + { + "epoch": 0.9898688833896173, + "grad_norm": 3.484375, + "learning_rate": 1.019988484000987e-07, + "loss": 0.696, + "step": 36200 + }, + { + "epoch": 0.9901423278325426, + "grad_norm": 2.78125, + "learning_rate": 9.925694387321434e-08, + "loss": 0.7024, + "step": 36210 + }, + { + "epoch": 0.990415772275468, + "grad_norm": 3.296875, + "learning_rate": 9.651503934632997e-08, + "loss": 0.6419, + "step": 36220 + }, + { + "epoch": 0.9906892167183933, + "grad_norm": 2.421875, + "learning_rate": 9.37731348194456e-08, + "loss": 0.6579, + "step": 36230 + }, + { + "epoch": 0.9909626611613186, + "grad_norm": 2.71875, + "learning_rate": 9.103123029256123e-08, + "loss": 0.695, + "step": 36240 + }, + { + "epoch": 0.9912361056042439, + "grad_norm": 2.75, + "learning_rate": 8.828932576567684e-08, + "loss": 0.6821, + "step": 36250 + }, + { + "epoch": 0.9915095500471691, + "grad_norm": 3.03125, + "learning_rate": 8.554742123879247e-08, + "loss": 0.7694, + "step": 36260 + }, + { + "epoch": 0.9917829944900944, + "grad_norm": 2.90625, + "learning_rate": 8.28055167119081e-08, + "loss": 0.7164, + "step": 36270 + }, + { + "epoch": 0.9920564389330198, + "grad_norm": 3.375, + "learning_rate": 8.006361218502374e-08, + "loss": 0.7653, + "step": 36280 + }, + { + "epoch": 0.9923298833759451, + "grad_norm": 2.96875, + "learning_rate": 7.732170765813935e-08, + "loss": 0.6463, + "step": 36290 + }, + { + "epoch": 0.9926033278188704, + "grad_norm": 2.453125, + "learning_rate": 7.457980313125498e-08, + "loss": 0.6799, + "step": 36300 + }, + { + "epoch": 0.9928767722617957, + "grad_norm": 2.5625, + "learning_rate": 7.18378986043706e-08, + "loss": 0.7258, + "step": 36310 + }, + { + "epoch": 0.993150216704721, + "grad_norm": 2.84375, + "learning_rate": 6.909599407748622e-08, + "loss": 0.7094, + "step": 36320 + }, + { + "epoch": 0.9934236611476464, + "grad_norm": 2.203125, + "learning_rate": 6.635408955060186e-08, + "loss": 0.654, + "step": 36330 + }, + { + "epoch": 0.9936971055905717, + "grad_norm": 2.90625, + "learning_rate": 6.361218502371748e-08, + "loss": 0.6714, + "step": 36340 + }, + { + "epoch": 0.993970550033497, + "grad_norm": 2.84375, + "learning_rate": 6.08702804968331e-08, + "loss": 0.7055, + "step": 36350 + }, + { + "epoch": 0.9942439944764222, + "grad_norm": 2.375, + "learning_rate": 5.812837596994873e-08, + "loss": 0.6541, + "step": 36360 + }, + { + "epoch": 0.9945174389193475, + "grad_norm": 2.875, + "learning_rate": 5.538647144306436e-08, + "loss": 0.6878, + "step": 36370 + }, + { + "epoch": 0.9947908833622728, + "grad_norm": 2.78125, + "learning_rate": 5.2644566916179985e-08, + "loss": 0.6804, + "step": 36380 + }, + { + "epoch": 0.9950643278051982, + "grad_norm": 2.65625, + "learning_rate": 4.9902662389295606e-08, + "loss": 0.618, + "step": 36390 + }, + { + "epoch": 0.9953377722481235, + "grad_norm": 2.8125, + "learning_rate": 4.7160757862411234e-08, + "loss": 0.7042, + "step": 36400 + }, + { + "epoch": 0.9956112166910488, + "grad_norm": 3.125, + "learning_rate": 4.441885333552686e-08, + "loss": 0.6653, + "step": 36410 + }, + { + "epoch": 0.9958846611339741, + "grad_norm": 3.078125, + "learning_rate": 4.167694880864249e-08, + "loss": 0.6694, + "step": 36420 + }, + { + "epoch": 0.9961581055768994, + "grad_norm": 2.625, + "learning_rate": 3.8935044281758117e-08, + "loss": 0.6993, + "step": 36430 + }, + { + "epoch": 0.9964315500198248, + "grad_norm": 2.671875, + "learning_rate": 3.619313975487374e-08, + "loss": 0.6414, + "step": 36440 + }, + { + "epoch": 0.9967049944627501, + "grad_norm": 2.84375, + "learning_rate": 3.3451235227989365e-08, + "loss": 0.6967, + "step": 36450 + }, + { + "epoch": 0.9969784389056754, + "grad_norm": 2.828125, + "learning_rate": 3.0709330701104986e-08, + "loss": 0.6618, + "step": 36460 + }, + { + "epoch": 0.9972518833486006, + "grad_norm": 2.953125, + "learning_rate": 2.7967426174220617e-08, + "loss": 0.7492, + "step": 36470 + }, + { + "epoch": 0.9975253277915259, + "grad_norm": 2.796875, + "learning_rate": 2.522552164733624e-08, + "loss": 0.6612, + "step": 36480 + }, + { + "epoch": 0.9977987722344512, + "grad_norm": 2.59375, + "learning_rate": 2.248361712045187e-08, + "loss": 0.6469, + "step": 36490 + }, + { + "epoch": 0.9980722166773766, + "grad_norm": 2.921875, + "learning_rate": 1.9741712593567493e-08, + "loss": 0.7214, + "step": 36500 + } + ], + "logging_steps": 10, + "max_steps": 36571, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.3039385772032e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}