diff --git "a/checkpoint-7500/trainer_state.json" "b/checkpoint-7500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-7500/trainer_state.json" @@ -0,0 +1,5284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.16464293064416546, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021952390752555395, + "grad_norm": 129536.0, + "learning_rate": 1.99775e-05, + "loss": 11.4186, + "step": 10 + }, + { + "epoch": 0.0004390478150511079, + "grad_norm": 154.0, + "learning_rate": 1.9952500000000003e-05, + "loss": 6.1068, + "step": 20 + }, + { + "epoch": 0.0006585717225766619, + "grad_norm": 2.1875, + "learning_rate": 1.9927500000000002e-05, + "loss": 4.8048, + "step": 30 + }, + { + "epoch": 0.0008780956301022158, + "grad_norm": 1.640625, + "learning_rate": 1.99025e-05, + "loss": 0.2014, + "step": 40 + }, + { + "epoch": 0.0010976195376277698, + "grad_norm": 1.4921875, + "learning_rate": 1.98775e-05, + "loss": 0.2177, + "step": 50 + }, + { + "epoch": 0.0013171434451533237, + "grad_norm": 1.3125, + "learning_rate": 1.98525e-05, + "loss": 0.2286, + "step": 60 + }, + { + "epoch": 0.0015366673526788777, + "grad_norm": 0.67578125, + "learning_rate": 1.9827500000000003e-05, + "loss": 0.1803, + "step": 70 + }, + { + "epoch": 0.0017561912602044316, + "grad_norm": 1.1328125, + "learning_rate": 1.9802500000000002e-05, + "loss": 0.1814, + "step": 80 + }, + { + "epoch": 0.0019757151677299856, + "grad_norm": 1.1953125, + "learning_rate": 1.97775e-05, + "loss": 0.1881, + "step": 90 + }, + { + "epoch": 0.0021952390752555395, + "grad_norm": 0.74609375, + "learning_rate": 1.97525e-05, + "loss": 0.1877, + "step": 100 + }, + { + "epoch": 0.0024147629827810935, + "grad_norm": 0.98046875, + "learning_rate": 1.97275e-05, + "loss": 0.1892, + "step": 110 + }, + { + "epoch": 0.0026342868903066474, + "grad_norm": 0.92578125, + "learning_rate": 1.9702500000000003e-05, + "loss": 0.1921, + "step": 120 + }, + { + "epoch": 0.0028538107978322014, + "grad_norm": 1.25, + "learning_rate": 1.9677500000000003e-05, + "loss": 0.2261, + "step": 130 + }, + { + "epoch": 0.0030733347053577553, + "grad_norm": 0.82421875, + "learning_rate": 1.9652500000000002e-05, + "loss": 0.2294, + "step": 140 + }, + { + "epoch": 0.0032928586128833093, + "grad_norm": 0.89453125, + "learning_rate": 1.96275e-05, + "loss": 0.1871, + "step": 150 + }, + { + "epoch": 0.0035123825204088632, + "grad_norm": 0.83203125, + "learning_rate": 1.96025e-05, + "loss": 0.1888, + "step": 160 + }, + { + "epoch": 0.003731906427934417, + "grad_norm": 1.0390625, + "learning_rate": 1.9577500000000004e-05, + "loss": 0.1925, + "step": 170 + }, + { + "epoch": 0.003951430335459971, + "grad_norm": 0.95703125, + "learning_rate": 1.95525e-05, + "loss": 0.1974, + "step": 180 + }, + { + "epoch": 0.004170954242985525, + "grad_norm": 1.2421875, + "learning_rate": 1.9527500000000002e-05, + "loss": 0.2109, + "step": 190 + }, + { + "epoch": 0.004390478150511079, + "grad_norm": 0.734375, + "learning_rate": 1.9502500000000002e-05, + "loss": 0.2015, + "step": 200 + }, + { + "epoch": 0.004610002058036633, + "grad_norm": 0.953125, + "learning_rate": 1.94775e-05, + "loss": 0.1631, + "step": 210 + }, + { + "epoch": 0.004829525965562187, + "grad_norm": 0.6484375, + "learning_rate": 1.94525e-05, + "loss": 0.195, + "step": 220 + }, + { + "epoch": 0.005049049873087741, + "grad_norm": 0.95703125, + "learning_rate": 1.94275e-05, + "loss": 0.1882, + "step": 230 + }, + { + "epoch": 0.005268573780613295, + "grad_norm": 1.1875, + "learning_rate": 1.9402500000000003e-05, + "loss": 0.1956, + "step": 240 + }, + { + "epoch": 0.005488097688138849, + "grad_norm": 0.9453125, + "learning_rate": 1.9377500000000002e-05, + "loss": 0.2236, + "step": 250 + }, + { + "epoch": 0.005707621595664403, + "grad_norm": 0.65625, + "learning_rate": 1.93525e-05, + "loss": 0.1906, + "step": 260 + }, + { + "epoch": 0.005927145503189957, + "grad_norm": 0.79296875, + "learning_rate": 1.93275e-05, + "loss": 0.2174, + "step": 270 + }, + { + "epoch": 0.006146669410715511, + "grad_norm": 1.1875, + "learning_rate": 1.93025e-05, + "loss": 0.1898, + "step": 280 + }, + { + "epoch": 0.006366193318241065, + "grad_norm": 0.79296875, + "learning_rate": 1.9277500000000003e-05, + "loss": 0.2021, + "step": 290 + }, + { + "epoch": 0.0065857172257666186, + "grad_norm": 0.8125, + "learning_rate": 1.92525e-05, + "loss": 0.192, + "step": 300 + }, + { + "epoch": 0.0068052411332921725, + "grad_norm": 0.8359375, + "learning_rate": 1.9227500000000002e-05, + "loss": 0.2299, + "step": 310 + }, + { + "epoch": 0.0070247650408177265, + "grad_norm": 0.90234375, + "learning_rate": 1.92025e-05, + "loss": 0.2137, + "step": 320 + }, + { + "epoch": 0.00724428894834328, + "grad_norm": 0.8359375, + "learning_rate": 1.91775e-05, + "loss": 0.1918, + "step": 330 + }, + { + "epoch": 0.007463812855868834, + "grad_norm": 0.796875, + "learning_rate": 1.91525e-05, + "loss": 0.2303, + "step": 340 + }, + { + "epoch": 0.007683336763394388, + "grad_norm": 0.68359375, + "learning_rate": 1.91275e-05, + "loss": 0.1957, + "step": 350 + }, + { + "epoch": 0.007902860670919942, + "grad_norm": 1.1015625, + "learning_rate": 1.9102500000000002e-05, + "loss": 0.2029, + "step": 360 + }, + { + "epoch": 0.008122384578445496, + "grad_norm": 0.87109375, + "learning_rate": 1.90775e-05, + "loss": 0.219, + "step": 370 + }, + { + "epoch": 0.00834190848597105, + "grad_norm": 0.8828125, + "learning_rate": 1.90525e-05, + "loss": 0.2189, + "step": 380 + }, + { + "epoch": 0.008561432393496604, + "grad_norm": 1.125, + "learning_rate": 1.90275e-05, + "loss": 0.2014, + "step": 390 + }, + { + "epoch": 0.008780956301022158, + "grad_norm": 0.69921875, + "learning_rate": 1.9002500000000003e-05, + "loss": 0.1757, + "step": 400 + }, + { + "epoch": 0.009000480208547712, + "grad_norm": 0.6640625, + "learning_rate": 1.8977500000000003e-05, + "loss": 0.2146, + "step": 410 + }, + { + "epoch": 0.009220004116073266, + "grad_norm": 0.7578125, + "learning_rate": 1.8952500000000002e-05, + "loss": 0.1767, + "step": 420 + }, + { + "epoch": 0.00943952802359882, + "grad_norm": 0.9140625, + "learning_rate": 1.89275e-05, + "loss": 0.1913, + "step": 430 + }, + { + "epoch": 0.009659051931124374, + "grad_norm": 0.890625, + "learning_rate": 1.89025e-05, + "loss": 0.1938, + "step": 440 + }, + { + "epoch": 0.009878575838649928, + "grad_norm": 0.9765625, + "learning_rate": 1.8877500000000003e-05, + "loss": 0.1898, + "step": 450 + }, + { + "epoch": 0.010098099746175482, + "grad_norm": 1.0078125, + "learning_rate": 1.8852500000000003e-05, + "loss": 0.214, + "step": 460 + }, + { + "epoch": 0.010317623653701036, + "grad_norm": 0.85546875, + "learning_rate": 1.8827500000000002e-05, + "loss": 0.191, + "step": 470 + }, + { + "epoch": 0.01053714756122659, + "grad_norm": 0.71484375, + "learning_rate": 1.88025e-05, + "loss": 0.207, + "step": 480 + }, + { + "epoch": 0.010756671468752144, + "grad_norm": 1.2734375, + "learning_rate": 1.87775e-05, + "loss": 0.2191, + "step": 490 + }, + { + "epoch": 0.010976195376277698, + "grad_norm": 1.1171875, + "learning_rate": 1.8752500000000004e-05, + "loss": 0.2271, + "step": 500 + }, + { + "epoch": 0.011195719283803252, + "grad_norm": 0.703125, + "learning_rate": 1.87275e-05, + "loss": 0.1823, + "step": 510 + }, + { + "epoch": 0.011415243191328805, + "grad_norm": 1.0859375, + "learning_rate": 1.8702500000000003e-05, + "loss": 0.2054, + "step": 520 + }, + { + "epoch": 0.01163476709885436, + "grad_norm": 0.765625, + "learning_rate": 1.8677500000000002e-05, + "loss": 0.2131, + "step": 530 + }, + { + "epoch": 0.011854291006379913, + "grad_norm": 0.73046875, + "learning_rate": 1.86525e-05, + "loss": 0.1569, + "step": 540 + }, + { + "epoch": 0.012073814913905467, + "grad_norm": 1.0078125, + "learning_rate": 1.86275e-05, + "loss": 0.1877, + "step": 550 + }, + { + "epoch": 0.012293338821431021, + "grad_norm": 0.97265625, + "learning_rate": 1.86025e-05, + "loss": 0.2228, + "step": 560 + }, + { + "epoch": 0.012512862728956575, + "grad_norm": 0.98046875, + "learning_rate": 1.8577500000000003e-05, + "loss": 0.2096, + "step": 570 + }, + { + "epoch": 0.01273238663648213, + "grad_norm": 0.7421875, + "learning_rate": 1.8552500000000002e-05, + "loss": 0.1861, + "step": 580 + }, + { + "epoch": 0.012951910544007683, + "grad_norm": 0.60546875, + "learning_rate": 1.8527500000000002e-05, + "loss": 0.1832, + "step": 590 + }, + { + "epoch": 0.013171434451533237, + "grad_norm": 0.90234375, + "learning_rate": 1.85025e-05, + "loss": 0.2182, + "step": 600 + }, + { + "epoch": 0.013390958359058791, + "grad_norm": 0.828125, + "learning_rate": 1.84775e-05, + "loss": 0.2285, + "step": 610 + }, + { + "epoch": 0.013610482266584345, + "grad_norm": 0.703125, + "learning_rate": 1.8452500000000003e-05, + "loss": 0.2024, + "step": 620 + }, + { + "epoch": 0.013830006174109899, + "grad_norm": 1.3203125, + "learning_rate": 1.8427500000000003e-05, + "loss": 0.2115, + "step": 630 + }, + { + "epoch": 0.014049530081635453, + "grad_norm": 1.03125, + "learning_rate": 1.8402500000000002e-05, + "loss": 0.227, + "step": 640 + }, + { + "epoch": 0.014269053989161007, + "grad_norm": 0.73828125, + "learning_rate": 1.83775e-05, + "loss": 0.185, + "step": 650 + }, + { + "epoch": 0.01448857789668656, + "grad_norm": 0.78125, + "learning_rate": 1.83525e-05, + "loss": 0.1754, + "step": 660 + }, + { + "epoch": 0.014708101804212115, + "grad_norm": 0.7578125, + "learning_rate": 1.8327500000000004e-05, + "loss": 0.2198, + "step": 670 + }, + { + "epoch": 0.014927625711737669, + "grad_norm": 0.8046875, + "learning_rate": 1.83025e-05, + "loss": 0.1966, + "step": 680 + }, + { + "epoch": 0.015147149619263223, + "grad_norm": 0.9140625, + "learning_rate": 1.8277500000000002e-05, + "loss": 0.187, + "step": 690 + }, + { + "epoch": 0.015366673526788777, + "grad_norm": 1.0625, + "learning_rate": 1.8252500000000002e-05, + "loss": 0.2088, + "step": 700 + }, + { + "epoch": 0.01558619743431433, + "grad_norm": 0.85546875, + "learning_rate": 1.82275e-05, + "loss": 0.2132, + "step": 710 + }, + { + "epoch": 0.015805721341839885, + "grad_norm": 0.93359375, + "learning_rate": 1.82025e-05, + "loss": 0.2149, + "step": 720 + }, + { + "epoch": 0.01602524524936544, + "grad_norm": 0.87109375, + "learning_rate": 1.81775e-05, + "loss": 0.1931, + "step": 730 + }, + { + "epoch": 0.016244769156890992, + "grad_norm": 0.6875, + "learning_rate": 1.8152500000000003e-05, + "loss": 0.1721, + "step": 740 + }, + { + "epoch": 0.016464293064416546, + "grad_norm": 0.78125, + "learning_rate": 1.8127500000000002e-05, + "loss": 0.1911, + "step": 750 + }, + { + "epoch": 0.0166838169719421, + "grad_norm": 0.95703125, + "learning_rate": 1.81025e-05, + "loss": 0.2294, + "step": 760 + }, + { + "epoch": 0.016903340879467654, + "grad_norm": 1.0546875, + "learning_rate": 1.80775e-05, + "loss": 0.1867, + "step": 770 + }, + { + "epoch": 0.017122864786993208, + "grad_norm": 0.58984375, + "learning_rate": 1.80525e-05, + "loss": 0.1997, + "step": 780 + }, + { + "epoch": 0.017342388694518762, + "grad_norm": 1.015625, + "learning_rate": 1.8027500000000003e-05, + "loss": 0.2095, + "step": 790 + }, + { + "epoch": 0.017561912602044316, + "grad_norm": 1.0078125, + "learning_rate": 1.80025e-05, + "loss": 0.1862, + "step": 800 + }, + { + "epoch": 0.01778143650956987, + "grad_norm": 0.75, + "learning_rate": 1.7977500000000002e-05, + "loss": 0.1971, + "step": 810 + }, + { + "epoch": 0.018000960417095424, + "grad_norm": 1.046875, + "learning_rate": 1.79525e-05, + "loss": 0.193, + "step": 820 + }, + { + "epoch": 0.018220484324620978, + "grad_norm": 0.65234375, + "learning_rate": 1.79275e-05, + "loss": 0.2195, + "step": 830 + }, + { + "epoch": 0.018440008232146532, + "grad_norm": 0.68359375, + "learning_rate": 1.79025e-05, + "loss": 0.1954, + "step": 840 + }, + { + "epoch": 0.018659532139672086, + "grad_norm": 0.76953125, + "learning_rate": 1.78775e-05, + "loss": 0.1546, + "step": 850 + }, + { + "epoch": 0.01887905604719764, + "grad_norm": 0.80078125, + "learning_rate": 1.7852500000000002e-05, + "loss": 0.188, + "step": 860 + }, + { + "epoch": 0.019098579954723194, + "grad_norm": 0.91796875, + "learning_rate": 1.78275e-05, + "loss": 0.2128, + "step": 870 + }, + { + "epoch": 0.019318103862248748, + "grad_norm": 0.71875, + "learning_rate": 1.78025e-05, + "loss": 0.1846, + "step": 880 + }, + { + "epoch": 0.0195376277697743, + "grad_norm": 0.734375, + "learning_rate": 1.77775e-05, + "loss": 0.1938, + "step": 890 + }, + { + "epoch": 0.019757151677299856, + "grad_norm": 0.66796875, + "learning_rate": 1.77525e-05, + "loss": 0.1781, + "step": 900 + }, + { + "epoch": 0.01997667558482541, + "grad_norm": 0.98046875, + "learning_rate": 1.7727500000000003e-05, + "loss": 0.1848, + "step": 910 + }, + { + "epoch": 0.020196199492350964, + "grad_norm": 0.81640625, + "learning_rate": 1.7702500000000002e-05, + "loss": 0.1739, + "step": 920 + }, + { + "epoch": 0.020415723399876518, + "grad_norm": 1.09375, + "learning_rate": 1.76775e-05, + "loss": 0.1878, + "step": 930 + }, + { + "epoch": 0.02063524730740207, + "grad_norm": 1.078125, + "learning_rate": 1.76525e-05, + "loss": 0.2028, + "step": 940 + }, + { + "epoch": 0.020854771214927625, + "grad_norm": 0.828125, + "learning_rate": 1.76275e-05, + "loss": 0.1897, + "step": 950 + }, + { + "epoch": 0.02107429512245318, + "grad_norm": 0.671875, + "learning_rate": 1.7602500000000003e-05, + "loss": 0.1748, + "step": 960 + }, + { + "epoch": 0.021293819029978733, + "grad_norm": 0.84375, + "learning_rate": 1.75775e-05, + "loss": 0.1997, + "step": 970 + }, + { + "epoch": 0.021513342937504287, + "grad_norm": 0.78125, + "learning_rate": 1.7552500000000002e-05, + "loss": 0.1989, + "step": 980 + }, + { + "epoch": 0.02173286684502984, + "grad_norm": 0.64453125, + "learning_rate": 1.75275e-05, + "loss": 0.1836, + "step": 990 + }, + { + "epoch": 0.021952390752555395, + "grad_norm": 0.91015625, + "learning_rate": 1.7502500000000004e-05, + "loss": 0.2198, + "step": 1000 + }, + { + "epoch": 0.02217191466008095, + "grad_norm": 0.96484375, + "learning_rate": 1.74775e-05, + "loss": 0.1847, + "step": 1010 + }, + { + "epoch": 0.022391438567606503, + "grad_norm": 1.015625, + "learning_rate": 1.7452500000000003e-05, + "loss": 0.1954, + "step": 1020 + }, + { + "epoch": 0.022610962475132057, + "grad_norm": 0.71484375, + "learning_rate": 1.7427500000000002e-05, + "loss": 0.2007, + "step": 1030 + }, + { + "epoch": 0.02283048638265761, + "grad_norm": 0.5546875, + "learning_rate": 1.74025e-05, + "loss": 0.2068, + "step": 1040 + }, + { + "epoch": 0.023050010290183165, + "grad_norm": 0.78125, + "learning_rate": 1.73775e-05, + "loss": 0.1898, + "step": 1050 + }, + { + "epoch": 0.02326953419770872, + "grad_norm": 0.64453125, + "learning_rate": 1.73525e-05, + "loss": 0.1661, + "step": 1060 + }, + { + "epoch": 0.023489058105234273, + "grad_norm": 0.7265625, + "learning_rate": 1.7327500000000003e-05, + "loss": 0.1745, + "step": 1070 + }, + { + "epoch": 0.023708582012759827, + "grad_norm": 0.60546875, + "learning_rate": 1.7302500000000002e-05, + "loss": 0.1691, + "step": 1080 + }, + { + "epoch": 0.02392810592028538, + "grad_norm": 0.94921875, + "learning_rate": 1.7277500000000002e-05, + "loss": 0.204, + "step": 1090 + }, + { + "epoch": 0.024147629827810935, + "grad_norm": 0.69140625, + "learning_rate": 1.72525e-05, + "loss": 0.1775, + "step": 1100 + }, + { + "epoch": 0.02436715373533649, + "grad_norm": 0.8203125, + "learning_rate": 1.72275e-05, + "loss": 0.1953, + "step": 1110 + }, + { + "epoch": 0.024586677642862043, + "grad_norm": 0.78515625, + "learning_rate": 1.7202500000000003e-05, + "loss": 0.1916, + "step": 1120 + }, + { + "epoch": 0.024806201550387597, + "grad_norm": 0.6640625, + "learning_rate": 1.7177500000000003e-05, + "loss": 0.1762, + "step": 1130 + }, + { + "epoch": 0.02502572545791315, + "grad_norm": 1.0078125, + "learning_rate": 1.7152500000000002e-05, + "loss": 0.2267, + "step": 1140 + }, + { + "epoch": 0.025245249365438704, + "grad_norm": 0.84765625, + "learning_rate": 1.71275e-05, + "loss": 0.1924, + "step": 1150 + }, + { + "epoch": 0.02546477327296426, + "grad_norm": 0.91796875, + "learning_rate": 1.71025e-05, + "loss": 0.2081, + "step": 1160 + }, + { + "epoch": 0.025684297180489812, + "grad_norm": 0.734375, + "learning_rate": 1.7077500000000004e-05, + "loss": 0.1693, + "step": 1170 + }, + { + "epoch": 0.025903821088015366, + "grad_norm": 0.8046875, + "learning_rate": 1.70525e-05, + "loss": 0.2278, + "step": 1180 + }, + { + "epoch": 0.02612334499554092, + "grad_norm": 0.7109375, + "learning_rate": 1.7027500000000003e-05, + "loss": 0.1819, + "step": 1190 + }, + { + "epoch": 0.026342868903066474, + "grad_norm": 0.671875, + "learning_rate": 1.7002500000000002e-05, + "loss": 0.147, + "step": 1200 + }, + { + "epoch": 0.026562392810592028, + "grad_norm": 0.8671875, + "learning_rate": 1.69775e-05, + "loss": 0.1857, + "step": 1210 + }, + { + "epoch": 0.026781916718117582, + "grad_norm": 0.7890625, + "learning_rate": 1.69525e-05, + "loss": 0.1665, + "step": 1220 + }, + { + "epoch": 0.027001440625643136, + "grad_norm": 0.8984375, + "learning_rate": 1.69275e-05, + "loss": 0.2032, + "step": 1230 + }, + { + "epoch": 0.02722096453316869, + "grad_norm": 0.6171875, + "learning_rate": 1.6902500000000003e-05, + "loss": 0.182, + "step": 1240 + }, + { + "epoch": 0.027440488440694244, + "grad_norm": 0.65234375, + "learning_rate": 1.6877500000000002e-05, + "loss": 0.1936, + "step": 1250 + }, + { + "epoch": 0.027660012348219798, + "grad_norm": 0.84765625, + "learning_rate": 1.6852500000000002e-05, + "loss": 0.1971, + "step": 1260 + }, + { + "epoch": 0.027879536255745352, + "grad_norm": 0.91015625, + "learning_rate": 1.68275e-05, + "loss": 0.1824, + "step": 1270 + }, + { + "epoch": 0.028099060163270906, + "grad_norm": 0.65234375, + "learning_rate": 1.68025e-05, + "loss": 0.1731, + "step": 1280 + }, + { + "epoch": 0.02831858407079646, + "grad_norm": 0.83203125, + "learning_rate": 1.6777500000000003e-05, + "loss": 0.1818, + "step": 1290 + }, + { + "epoch": 0.028538107978322014, + "grad_norm": 0.95703125, + "learning_rate": 1.67525e-05, + "loss": 0.1959, + "step": 1300 + }, + { + "epoch": 0.028757631885847568, + "grad_norm": 0.96875, + "learning_rate": 1.6727500000000002e-05, + "loss": 0.1777, + "step": 1310 + }, + { + "epoch": 0.02897715579337312, + "grad_norm": 0.7734375, + "learning_rate": 1.67025e-05, + "loss": 0.1719, + "step": 1320 + }, + { + "epoch": 0.029196679700898676, + "grad_norm": 0.64453125, + "learning_rate": 1.66775e-05, + "loss": 0.1985, + "step": 1330 + }, + { + "epoch": 0.02941620360842423, + "grad_norm": 0.96875, + "learning_rate": 1.66525e-05, + "loss": 0.1841, + "step": 1340 + }, + { + "epoch": 0.029635727515949783, + "grad_norm": 0.78125, + "learning_rate": 1.66275e-05, + "loss": 0.2174, + "step": 1350 + }, + { + "epoch": 0.029855251423475337, + "grad_norm": 0.80078125, + "learning_rate": 1.6602500000000002e-05, + "loss": 0.1925, + "step": 1360 + }, + { + "epoch": 0.03007477533100089, + "grad_norm": 0.78515625, + "learning_rate": 1.6577500000000002e-05, + "loss": 0.1686, + "step": 1370 + }, + { + "epoch": 0.030294299238526445, + "grad_norm": 0.765625, + "learning_rate": 1.65525e-05, + "loss": 0.1877, + "step": 1380 + }, + { + "epoch": 0.030513823146052, + "grad_norm": 0.9140625, + "learning_rate": 1.65275e-05, + "loss": 0.2048, + "step": 1390 + }, + { + "epoch": 0.030733347053577553, + "grad_norm": 0.52734375, + "learning_rate": 1.65025e-05, + "loss": 0.2031, + "step": 1400 + }, + { + "epoch": 0.030952870961103107, + "grad_norm": 0.8984375, + "learning_rate": 1.6477500000000003e-05, + "loss": 0.217, + "step": 1410 + }, + { + "epoch": 0.03117239486862866, + "grad_norm": 0.78125, + "learning_rate": 1.6452500000000002e-05, + "loss": 0.1783, + "step": 1420 + }, + { + "epoch": 0.031391918776154215, + "grad_norm": 0.80078125, + "learning_rate": 1.64275e-05, + "loss": 0.1761, + "step": 1430 + }, + { + "epoch": 0.03161144268367977, + "grad_norm": 0.83203125, + "learning_rate": 1.64025e-05, + "loss": 0.182, + "step": 1440 + }, + { + "epoch": 0.03183096659120532, + "grad_norm": 0.48828125, + "learning_rate": 1.63775e-05, + "loss": 0.1732, + "step": 1450 + }, + { + "epoch": 0.03205049049873088, + "grad_norm": 0.87109375, + "learning_rate": 1.6352500000000003e-05, + "loss": 0.1951, + "step": 1460 + }, + { + "epoch": 0.03227001440625643, + "grad_norm": 1.0078125, + "learning_rate": 1.63275e-05, + "loss": 0.2427, + "step": 1470 + }, + { + "epoch": 0.032489538313781985, + "grad_norm": 0.78125, + "learning_rate": 1.6302500000000002e-05, + "loss": 0.1904, + "step": 1480 + }, + { + "epoch": 0.03270906222130754, + "grad_norm": 0.81640625, + "learning_rate": 1.62775e-05, + "loss": 0.1938, + "step": 1490 + }, + { + "epoch": 0.03292858612883309, + "grad_norm": 0.59765625, + "learning_rate": 1.62525e-05, + "loss": 0.1673, + "step": 1500 + }, + { + "epoch": 0.03314811003635865, + "grad_norm": 1.109375, + "learning_rate": 1.62275e-05, + "loss": 0.2362, + "step": 1510 + }, + { + "epoch": 0.0333676339438842, + "grad_norm": 0.85546875, + "learning_rate": 1.62025e-05, + "loss": 0.1716, + "step": 1520 + }, + { + "epoch": 0.033587157851409755, + "grad_norm": 0.9375, + "learning_rate": 1.6177500000000002e-05, + "loss": 0.1872, + "step": 1530 + }, + { + "epoch": 0.03380668175893531, + "grad_norm": 0.95703125, + "learning_rate": 1.61525e-05, + "loss": 0.2068, + "step": 1540 + }, + { + "epoch": 0.03402620566646086, + "grad_norm": 0.78515625, + "learning_rate": 1.61275e-05, + "loss": 0.1902, + "step": 1550 + }, + { + "epoch": 0.034245729573986416, + "grad_norm": 0.72265625, + "learning_rate": 1.61025e-05, + "loss": 0.1866, + "step": 1560 + }, + { + "epoch": 0.03446525348151197, + "grad_norm": 0.703125, + "learning_rate": 1.60775e-05, + "loss": 0.1694, + "step": 1570 + }, + { + "epoch": 0.034684777389037524, + "grad_norm": 0.57421875, + "learning_rate": 1.6052500000000003e-05, + "loss": 0.1915, + "step": 1580 + }, + { + "epoch": 0.03490430129656308, + "grad_norm": 0.671875, + "learning_rate": 1.60275e-05, + "loss": 0.2041, + "step": 1590 + }, + { + "epoch": 0.03512382520408863, + "grad_norm": 1.1484375, + "learning_rate": 1.60025e-05, + "loss": 0.2597, + "step": 1600 + }, + { + "epoch": 0.035343349111614186, + "grad_norm": 0.73046875, + "learning_rate": 1.59775e-05, + "loss": 0.2025, + "step": 1610 + }, + { + "epoch": 0.03556287301913974, + "grad_norm": 0.76171875, + "learning_rate": 1.5952500000000004e-05, + "loss": 0.178, + "step": 1620 + }, + { + "epoch": 0.035782396926665294, + "grad_norm": 1.1015625, + "learning_rate": 1.5927500000000003e-05, + "loss": 0.2145, + "step": 1630 + }, + { + "epoch": 0.03600192083419085, + "grad_norm": 0.625, + "learning_rate": 1.5902500000000002e-05, + "loss": 0.2196, + "step": 1640 + }, + { + "epoch": 0.0362214447417164, + "grad_norm": 0.8125, + "learning_rate": 1.5877500000000002e-05, + "loss": 0.236, + "step": 1650 + }, + { + "epoch": 0.036440968649241956, + "grad_norm": 1.0, + "learning_rate": 1.58525e-05, + "loss": 0.1927, + "step": 1660 + }, + { + "epoch": 0.03666049255676751, + "grad_norm": 0.8203125, + "learning_rate": 1.5827500000000004e-05, + "loss": 0.1991, + "step": 1670 + }, + { + "epoch": 0.036880016464293064, + "grad_norm": 1.171875, + "learning_rate": 1.58025e-05, + "loss": 0.2282, + "step": 1680 + }, + { + "epoch": 0.03709954037181862, + "grad_norm": 0.78515625, + "learning_rate": 1.5777500000000003e-05, + "loss": 0.1799, + "step": 1690 + }, + { + "epoch": 0.03731906427934417, + "grad_norm": 0.81640625, + "learning_rate": 1.5752500000000002e-05, + "loss": 0.1936, + "step": 1700 + }, + { + "epoch": 0.037538588186869726, + "grad_norm": 0.74609375, + "learning_rate": 1.57275e-05, + "loss": 0.1886, + "step": 1710 + }, + { + "epoch": 0.03775811209439528, + "grad_norm": 0.765625, + "learning_rate": 1.57025e-05, + "loss": 0.1771, + "step": 1720 + }, + { + "epoch": 0.037977636001920834, + "grad_norm": 0.84765625, + "learning_rate": 1.56775e-05, + "loss": 0.1853, + "step": 1730 + }, + { + "epoch": 0.03819715990944639, + "grad_norm": 0.90625, + "learning_rate": 1.5652500000000003e-05, + "loss": 0.1987, + "step": 1740 + }, + { + "epoch": 0.03841668381697194, + "grad_norm": 0.734375, + "learning_rate": 1.5627500000000002e-05, + "loss": 0.2049, + "step": 1750 + }, + { + "epoch": 0.038636207724497496, + "grad_norm": 0.69921875, + "learning_rate": 1.5602500000000002e-05, + "loss": 0.1691, + "step": 1760 + }, + { + "epoch": 0.03885573163202305, + "grad_norm": 0.83984375, + "learning_rate": 1.55775e-05, + "loss": 0.1981, + "step": 1770 + }, + { + "epoch": 0.0390752555395486, + "grad_norm": 0.69140625, + "learning_rate": 1.55525e-05, + "loss": 0.1842, + "step": 1780 + }, + { + "epoch": 0.03929477944707416, + "grad_norm": 1.0546875, + "learning_rate": 1.5527500000000003e-05, + "loss": 0.1696, + "step": 1790 + }, + { + "epoch": 0.03951430335459971, + "grad_norm": 1.0234375, + "learning_rate": 1.55025e-05, + "loss": 0.2018, + "step": 1800 + }, + { + "epoch": 0.039733827262125265, + "grad_norm": 0.6953125, + "learning_rate": 1.5477500000000002e-05, + "loss": 0.201, + "step": 1810 + }, + { + "epoch": 0.03995335116965082, + "grad_norm": 0.65234375, + "learning_rate": 1.54525e-05, + "loss": 0.1855, + "step": 1820 + }, + { + "epoch": 0.04017287507717637, + "grad_norm": 0.7578125, + "learning_rate": 1.54275e-05, + "loss": 0.1679, + "step": 1830 + }, + { + "epoch": 0.04039239898470193, + "grad_norm": 0.875, + "learning_rate": 1.54025e-05, + "loss": 0.1916, + "step": 1840 + }, + { + "epoch": 0.04061192289222748, + "grad_norm": 0.75, + "learning_rate": 1.53775e-05, + "loss": 0.1715, + "step": 1850 + }, + { + "epoch": 0.040831446799753035, + "grad_norm": 0.68359375, + "learning_rate": 1.5352500000000003e-05, + "loss": 0.1899, + "step": 1860 + }, + { + "epoch": 0.04105097070727859, + "grad_norm": 0.78515625, + "learning_rate": 1.5327500000000002e-05, + "loss": 0.193, + "step": 1870 + }, + { + "epoch": 0.04127049461480414, + "grad_norm": 0.7109375, + "learning_rate": 1.53025e-05, + "loss": 0.1822, + "step": 1880 + }, + { + "epoch": 0.0414900185223297, + "grad_norm": 0.66015625, + "learning_rate": 1.52775e-05, + "loss": 0.1502, + "step": 1890 + }, + { + "epoch": 0.04170954242985525, + "grad_norm": 0.9140625, + "learning_rate": 1.5252500000000002e-05, + "loss": 0.1909, + "step": 1900 + }, + { + "epoch": 0.041929066337380805, + "grad_norm": 0.73046875, + "learning_rate": 1.5227500000000001e-05, + "loss": 0.1853, + "step": 1910 + }, + { + "epoch": 0.04214859024490636, + "grad_norm": 0.7578125, + "learning_rate": 1.5202500000000002e-05, + "loss": 0.1905, + "step": 1920 + }, + { + "epoch": 0.04236811415243191, + "grad_norm": 0.80078125, + "learning_rate": 1.51775e-05, + "loss": 0.1877, + "step": 1930 + }, + { + "epoch": 0.04258763805995747, + "grad_norm": 0.5859375, + "learning_rate": 1.5152500000000001e-05, + "loss": 0.1947, + "step": 1940 + }, + { + "epoch": 0.04280716196748302, + "grad_norm": 0.76953125, + "learning_rate": 1.5127500000000002e-05, + "loss": 0.1985, + "step": 1950 + }, + { + "epoch": 0.043026685875008575, + "grad_norm": 0.71875, + "learning_rate": 1.5102500000000002e-05, + "loss": 0.1846, + "step": 1960 + }, + { + "epoch": 0.04324620978253413, + "grad_norm": 0.734375, + "learning_rate": 1.5077500000000001e-05, + "loss": 0.2175, + "step": 1970 + }, + { + "epoch": 0.04346573369005968, + "grad_norm": 0.7109375, + "learning_rate": 1.50525e-05, + "loss": 0.203, + "step": 1980 + }, + { + "epoch": 0.043685257597585236, + "grad_norm": 0.609375, + "learning_rate": 1.5027500000000001e-05, + "loss": 0.1688, + "step": 1990 + }, + { + "epoch": 0.04390478150511079, + "grad_norm": 0.9375, + "learning_rate": 1.5002500000000002e-05, + "loss": 0.1951, + "step": 2000 + }, + { + "epoch": 0.044124305412636344, + "grad_norm": 0.91796875, + "learning_rate": 1.49775e-05, + "loss": 0.1931, + "step": 2010 + }, + { + "epoch": 0.0443438293201619, + "grad_norm": 0.671875, + "learning_rate": 1.4952500000000001e-05, + "loss": 0.2017, + "step": 2020 + }, + { + "epoch": 0.04456335322768745, + "grad_norm": 0.98828125, + "learning_rate": 1.49275e-05, + "loss": 0.2095, + "step": 2030 + }, + { + "epoch": 0.044782877135213006, + "grad_norm": 0.7265625, + "learning_rate": 1.4902500000000002e-05, + "loss": 0.1814, + "step": 2040 + }, + { + "epoch": 0.04500240104273856, + "grad_norm": 0.8828125, + "learning_rate": 1.48775e-05, + "loss": 0.1912, + "step": 2050 + }, + { + "epoch": 0.045221924950264114, + "grad_norm": 0.71875, + "learning_rate": 1.48525e-05, + "loss": 0.1799, + "step": 2060 + }, + { + "epoch": 0.04544144885778967, + "grad_norm": 0.64453125, + "learning_rate": 1.4827500000000002e-05, + "loss": 0.1754, + "step": 2070 + }, + { + "epoch": 0.04566097276531522, + "grad_norm": 0.72265625, + "learning_rate": 1.4802500000000003e-05, + "loss": 0.2019, + "step": 2080 + }, + { + "epoch": 0.045880496672840776, + "grad_norm": 0.7109375, + "learning_rate": 1.47775e-05, + "loss": 0.1767, + "step": 2090 + }, + { + "epoch": 0.04610002058036633, + "grad_norm": 0.71875, + "learning_rate": 1.4752500000000001e-05, + "loss": 0.1762, + "step": 2100 + }, + { + "epoch": 0.046319544487891884, + "grad_norm": 0.74609375, + "learning_rate": 1.4727500000000001e-05, + "loss": 0.2086, + "step": 2110 + }, + { + "epoch": 0.04653906839541744, + "grad_norm": 1.03125, + "learning_rate": 1.4702500000000002e-05, + "loss": 0.2083, + "step": 2120 + }, + { + "epoch": 0.04675859230294299, + "grad_norm": 0.80859375, + "learning_rate": 1.4677500000000003e-05, + "loss": 0.1774, + "step": 2130 + }, + { + "epoch": 0.046978116210468546, + "grad_norm": 0.78125, + "learning_rate": 1.46525e-05, + "loss": 0.1908, + "step": 2140 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 0.91015625, + "learning_rate": 1.4627500000000002e-05, + "loss": 0.2223, + "step": 2150 + }, + { + "epoch": 0.047417164025519654, + "grad_norm": 1.046875, + "learning_rate": 1.4602500000000001e-05, + "loss": 0.2084, + "step": 2160 + }, + { + "epoch": 0.04763668793304521, + "grad_norm": 1.0859375, + "learning_rate": 1.4577500000000002e-05, + "loss": 0.2029, + "step": 2170 + }, + { + "epoch": 0.04785621184057076, + "grad_norm": 0.78515625, + "learning_rate": 1.45525e-05, + "loss": 0.1641, + "step": 2180 + }, + { + "epoch": 0.048075735748096315, + "grad_norm": 0.83203125, + "learning_rate": 1.4527500000000001e-05, + "loss": 0.2053, + "step": 2190 + }, + { + "epoch": 0.04829525965562187, + "grad_norm": 0.80859375, + "learning_rate": 1.4502500000000002e-05, + "loss": 0.1783, + "step": 2200 + }, + { + "epoch": 0.04851478356314742, + "grad_norm": 0.984375, + "learning_rate": 1.4477500000000002e-05, + "loss": 0.1733, + "step": 2210 + }, + { + "epoch": 0.04873430747067298, + "grad_norm": 0.80859375, + "learning_rate": 1.4452500000000001e-05, + "loss": 0.1852, + "step": 2220 + }, + { + "epoch": 0.04895383137819853, + "grad_norm": 0.76953125, + "learning_rate": 1.44275e-05, + "loss": 0.1626, + "step": 2230 + }, + { + "epoch": 0.049173355285724085, + "grad_norm": 0.76953125, + "learning_rate": 1.4402500000000001e-05, + "loss": 0.1804, + "step": 2240 + }, + { + "epoch": 0.04939287919324964, + "grad_norm": 0.8203125, + "learning_rate": 1.4377500000000003e-05, + "loss": 0.1895, + "step": 2250 + }, + { + "epoch": 0.04961240310077519, + "grad_norm": 0.8984375, + "learning_rate": 1.43525e-05, + "loss": 0.1911, + "step": 2260 + }, + { + "epoch": 0.04983192700830075, + "grad_norm": 0.9765625, + "learning_rate": 1.4327500000000001e-05, + "loss": 0.1903, + "step": 2270 + }, + { + "epoch": 0.0500514509158263, + "grad_norm": 0.890625, + "learning_rate": 1.43025e-05, + "loss": 0.2024, + "step": 2280 + }, + { + "epoch": 0.050270974823351855, + "grad_norm": 0.6171875, + "learning_rate": 1.4277500000000002e-05, + "loss": 0.1669, + "step": 2290 + }, + { + "epoch": 0.05049049873087741, + "grad_norm": 0.99609375, + "learning_rate": 1.42525e-05, + "loss": 0.1819, + "step": 2300 + }, + { + "epoch": 0.05071002263840296, + "grad_norm": 0.7734375, + "learning_rate": 1.42275e-05, + "loss": 0.1808, + "step": 2310 + }, + { + "epoch": 0.05092954654592852, + "grad_norm": 0.57421875, + "learning_rate": 1.4202500000000002e-05, + "loss": 0.1809, + "step": 2320 + }, + { + "epoch": 0.05114907045345407, + "grad_norm": 0.94921875, + "learning_rate": 1.4177500000000001e-05, + "loss": 0.177, + "step": 2330 + }, + { + "epoch": 0.051368594360979625, + "grad_norm": 0.671875, + "learning_rate": 1.41525e-05, + "loss": 0.1882, + "step": 2340 + }, + { + "epoch": 0.05158811826850518, + "grad_norm": 0.79296875, + "learning_rate": 1.41275e-05, + "loss": 0.1705, + "step": 2350 + }, + { + "epoch": 0.05180764217603073, + "grad_norm": 0.75390625, + "learning_rate": 1.4102500000000001e-05, + "loss": 0.1671, + "step": 2360 + }, + { + "epoch": 0.05202716608355629, + "grad_norm": 0.79296875, + "learning_rate": 1.4077500000000002e-05, + "loss": 0.1622, + "step": 2370 + }, + { + "epoch": 0.05224668999108184, + "grad_norm": 0.64453125, + "learning_rate": 1.4052500000000001e-05, + "loss": 0.1847, + "step": 2380 + }, + { + "epoch": 0.052466213898607394, + "grad_norm": 0.75390625, + "learning_rate": 1.40275e-05, + "loss": 0.1789, + "step": 2390 + }, + { + "epoch": 0.05268573780613295, + "grad_norm": 0.66015625, + "learning_rate": 1.4002500000000002e-05, + "loss": 0.1815, + "step": 2400 + }, + { + "epoch": 0.0529052617136585, + "grad_norm": 0.94921875, + "learning_rate": 1.3977500000000001e-05, + "loss": 0.2352, + "step": 2410 + }, + { + "epoch": 0.053124785621184056, + "grad_norm": 0.69140625, + "learning_rate": 1.3952500000000002e-05, + "loss": 0.1681, + "step": 2420 + }, + { + "epoch": 0.05334430952870961, + "grad_norm": 0.828125, + "learning_rate": 1.39275e-05, + "loss": 0.2157, + "step": 2430 + }, + { + "epoch": 0.053563833436235164, + "grad_norm": 0.76953125, + "learning_rate": 1.3902500000000001e-05, + "loss": 0.1829, + "step": 2440 + }, + { + "epoch": 0.05378335734376072, + "grad_norm": 0.89453125, + "learning_rate": 1.3877500000000002e-05, + "loss": 0.1778, + "step": 2450 + }, + { + "epoch": 0.05400288125128627, + "grad_norm": 0.66015625, + "learning_rate": 1.3852500000000002e-05, + "loss": 0.2, + "step": 2460 + }, + { + "epoch": 0.054222405158811826, + "grad_norm": 0.75, + "learning_rate": 1.3827500000000001e-05, + "loss": 0.1975, + "step": 2470 + }, + { + "epoch": 0.05444192906633738, + "grad_norm": 0.87109375, + "learning_rate": 1.38025e-05, + "loss": 0.1732, + "step": 2480 + }, + { + "epoch": 0.054661452973862934, + "grad_norm": 0.6796875, + "learning_rate": 1.3777500000000001e-05, + "loss": 0.2128, + "step": 2490 + }, + { + "epoch": 0.05488097688138849, + "grad_norm": 0.7578125, + "learning_rate": 1.3752500000000003e-05, + "loss": 0.2146, + "step": 2500 + }, + { + "epoch": 0.05510050078891404, + "grad_norm": 0.73046875, + "learning_rate": 1.37275e-05, + "loss": 0.1893, + "step": 2510 + }, + { + "epoch": 0.055320024696439596, + "grad_norm": 0.76953125, + "learning_rate": 1.3702500000000001e-05, + "loss": 0.2053, + "step": 2520 + }, + { + "epoch": 0.05553954860396515, + "grad_norm": 0.71875, + "learning_rate": 1.36775e-05, + "loss": 0.1867, + "step": 2530 + }, + { + "epoch": 0.055759072511490704, + "grad_norm": 0.83203125, + "learning_rate": 1.3652500000000002e-05, + "loss": 0.2008, + "step": 2540 + }, + { + "epoch": 0.05597859641901626, + "grad_norm": 0.86328125, + "learning_rate": 1.36275e-05, + "loss": 0.1968, + "step": 2550 + }, + { + "epoch": 0.05619812032654181, + "grad_norm": 1.03125, + "learning_rate": 1.36025e-05, + "loss": 0.199, + "step": 2560 + }, + { + "epoch": 0.056417644234067366, + "grad_norm": 0.58984375, + "learning_rate": 1.3577500000000002e-05, + "loss": 0.1708, + "step": 2570 + }, + { + "epoch": 0.05663716814159292, + "grad_norm": 0.83203125, + "learning_rate": 1.3552500000000001e-05, + "loss": 0.1923, + "step": 2580 + }, + { + "epoch": 0.056856692049118474, + "grad_norm": 0.8046875, + "learning_rate": 1.35275e-05, + "loss": 0.2035, + "step": 2590 + }, + { + "epoch": 0.05707621595664403, + "grad_norm": 1.03125, + "learning_rate": 1.35025e-05, + "loss": 0.2349, + "step": 2600 + }, + { + "epoch": 0.05729573986416958, + "grad_norm": 0.74609375, + "learning_rate": 1.3477500000000001e-05, + "loss": 0.1965, + "step": 2610 + }, + { + "epoch": 0.057515263771695135, + "grad_norm": 0.6796875, + "learning_rate": 1.3452500000000002e-05, + "loss": 0.1683, + "step": 2620 + }, + { + "epoch": 0.05773478767922069, + "grad_norm": 1.015625, + "learning_rate": 1.3427500000000001e-05, + "loss": 0.2099, + "step": 2630 + }, + { + "epoch": 0.05795431158674624, + "grad_norm": 0.80859375, + "learning_rate": 1.3402500000000001e-05, + "loss": 0.1802, + "step": 2640 + }, + { + "epoch": 0.0581738354942718, + "grad_norm": 0.88671875, + "learning_rate": 1.33775e-05, + "loss": 0.1875, + "step": 2650 + }, + { + "epoch": 0.05839335940179735, + "grad_norm": 0.7578125, + "learning_rate": 1.3352500000000001e-05, + "loss": 0.1849, + "step": 2660 + }, + { + "epoch": 0.058612883309322905, + "grad_norm": 0.80859375, + "learning_rate": 1.3327500000000002e-05, + "loss": 0.1549, + "step": 2670 + }, + { + "epoch": 0.05883240721684846, + "grad_norm": 0.87109375, + "learning_rate": 1.33025e-05, + "loss": 0.1936, + "step": 2680 + }, + { + "epoch": 0.05905193112437401, + "grad_norm": 0.80078125, + "learning_rate": 1.3277500000000001e-05, + "loss": 0.1891, + "step": 2690 + }, + { + "epoch": 0.05927145503189957, + "grad_norm": 0.75, + "learning_rate": 1.3252500000000002e-05, + "loss": 0.1879, + "step": 2700 + }, + { + "epoch": 0.05949097893942512, + "grad_norm": 0.71484375, + "learning_rate": 1.3227500000000002e-05, + "loss": 0.1838, + "step": 2710 + }, + { + "epoch": 0.059710502846950675, + "grad_norm": 0.67578125, + "learning_rate": 1.3202500000000001e-05, + "loss": 0.1807, + "step": 2720 + }, + { + "epoch": 0.05993002675447623, + "grad_norm": 0.58203125, + "learning_rate": 1.31775e-05, + "loss": 0.1519, + "step": 2730 + }, + { + "epoch": 0.06014955066200178, + "grad_norm": 0.87109375, + "learning_rate": 1.3152500000000002e-05, + "loss": 0.1843, + "step": 2740 + }, + { + "epoch": 0.06036907456952734, + "grad_norm": 0.8515625, + "learning_rate": 1.3127500000000003e-05, + "loss": 0.2063, + "step": 2750 + }, + { + "epoch": 0.06058859847705289, + "grad_norm": 0.7265625, + "learning_rate": 1.31025e-05, + "loss": 0.1785, + "step": 2760 + }, + { + "epoch": 0.060808122384578445, + "grad_norm": 0.71875, + "learning_rate": 1.3077500000000001e-05, + "loss": 0.1707, + "step": 2770 + }, + { + "epoch": 0.061027646292104, + "grad_norm": 1.03125, + "learning_rate": 1.30525e-05, + "loss": 0.1761, + "step": 2780 + }, + { + "epoch": 0.06124717019962955, + "grad_norm": 0.67578125, + "learning_rate": 1.3027500000000002e-05, + "loss": 0.1782, + "step": 2790 + }, + { + "epoch": 0.061466694107155107, + "grad_norm": 0.8671875, + "learning_rate": 1.30025e-05, + "loss": 0.1988, + "step": 2800 + }, + { + "epoch": 0.06168621801468066, + "grad_norm": 0.78125, + "learning_rate": 1.29775e-05, + "loss": 0.1701, + "step": 2810 + }, + { + "epoch": 0.061905741922206214, + "grad_norm": 0.83984375, + "learning_rate": 1.2952500000000002e-05, + "loss": 0.2079, + "step": 2820 + }, + { + "epoch": 0.06212526582973177, + "grad_norm": 0.9609375, + "learning_rate": 1.2927500000000001e-05, + "loss": 0.1982, + "step": 2830 + }, + { + "epoch": 0.06234478973725732, + "grad_norm": 0.68359375, + "learning_rate": 1.29025e-05, + "loss": 0.1798, + "step": 2840 + }, + { + "epoch": 0.06256431364478288, + "grad_norm": 0.953125, + "learning_rate": 1.28775e-05, + "loss": 0.1582, + "step": 2850 + }, + { + "epoch": 0.06278383755230843, + "grad_norm": 0.81640625, + "learning_rate": 1.2852500000000001e-05, + "loss": 0.2233, + "step": 2860 + }, + { + "epoch": 0.06300336145983398, + "grad_norm": 0.875, + "learning_rate": 1.2827500000000002e-05, + "loss": 0.1952, + "step": 2870 + }, + { + "epoch": 0.06322288536735954, + "grad_norm": 0.83984375, + "learning_rate": 1.2802500000000002e-05, + "loss": 0.1854, + "step": 2880 + }, + { + "epoch": 0.06344240927488509, + "grad_norm": 0.671875, + "learning_rate": 1.2777500000000001e-05, + "loss": 0.205, + "step": 2890 + }, + { + "epoch": 0.06366193318241065, + "grad_norm": 0.8125, + "learning_rate": 1.27525e-05, + "loss": 0.1899, + "step": 2900 + }, + { + "epoch": 0.0638814570899362, + "grad_norm": 0.7109375, + "learning_rate": 1.2727500000000001e-05, + "loss": 0.1874, + "step": 2910 + }, + { + "epoch": 0.06410098099746175, + "grad_norm": 0.6875, + "learning_rate": 1.2702500000000002e-05, + "loss": 0.186, + "step": 2920 + }, + { + "epoch": 0.06432050490498731, + "grad_norm": 0.56640625, + "learning_rate": 1.26775e-05, + "loss": 0.1619, + "step": 2930 + }, + { + "epoch": 0.06454002881251286, + "grad_norm": 0.64453125, + "learning_rate": 1.2652500000000001e-05, + "loss": 0.1959, + "step": 2940 + }, + { + "epoch": 0.06475955272003842, + "grad_norm": 0.76953125, + "learning_rate": 1.26275e-05, + "loss": 0.1919, + "step": 2950 + }, + { + "epoch": 0.06497907662756397, + "grad_norm": 1.25, + "learning_rate": 1.2602500000000002e-05, + "loss": 0.1998, + "step": 2960 + }, + { + "epoch": 0.06519860053508952, + "grad_norm": 0.83984375, + "learning_rate": 1.25775e-05, + "loss": 0.2058, + "step": 2970 + }, + { + "epoch": 0.06541812444261508, + "grad_norm": 0.55859375, + "learning_rate": 1.25525e-05, + "loss": 0.1861, + "step": 2980 + }, + { + "epoch": 0.06563764835014063, + "grad_norm": 0.84765625, + "learning_rate": 1.2527500000000002e-05, + "loss": 0.2199, + "step": 2990 + }, + { + "epoch": 0.06585717225766619, + "grad_norm": 0.73046875, + "learning_rate": 1.2502500000000003e-05, + "loss": 0.1804, + "step": 3000 + }, + { + "epoch": 0.06607669616519174, + "grad_norm": 0.5703125, + "learning_rate": 1.24775e-05, + "loss": 0.1742, + "step": 3010 + }, + { + "epoch": 0.0662962200727173, + "grad_norm": 0.73828125, + "learning_rate": 1.2452500000000001e-05, + "loss": 0.1782, + "step": 3020 + }, + { + "epoch": 0.06651574398024285, + "grad_norm": 0.921875, + "learning_rate": 1.2427500000000001e-05, + "loss": 0.1822, + "step": 3030 + }, + { + "epoch": 0.0667352678877684, + "grad_norm": 0.6640625, + "learning_rate": 1.2402500000000002e-05, + "loss": 0.1902, + "step": 3040 + }, + { + "epoch": 0.06695479179529396, + "grad_norm": 2.34375, + "learning_rate": 1.23775e-05, + "loss": 0.1937, + "step": 3050 + }, + { + "epoch": 0.06717431570281951, + "grad_norm": 0.58203125, + "learning_rate": 1.23525e-05, + "loss": 0.1919, + "step": 3060 + }, + { + "epoch": 0.06739383961034506, + "grad_norm": 5.625, + "learning_rate": 1.2327500000000002e-05, + "loss": 0.1763, + "step": 3070 + }, + { + "epoch": 0.06761336351787062, + "grad_norm": 0.7578125, + "learning_rate": 1.2302500000000001e-05, + "loss": 0.1956, + "step": 3080 + }, + { + "epoch": 0.06783288742539617, + "grad_norm": 1.171875, + "learning_rate": 1.22775e-05, + "loss": 0.2159, + "step": 3090 + }, + { + "epoch": 0.06805241133292173, + "grad_norm": 0.76171875, + "learning_rate": 1.22525e-05, + "loss": 0.1775, + "step": 3100 + }, + { + "epoch": 0.06827193524044728, + "grad_norm": 0.71875, + "learning_rate": 1.2227500000000001e-05, + "loss": 0.1617, + "step": 3110 + }, + { + "epoch": 0.06849145914797283, + "grad_norm": 0.81640625, + "learning_rate": 1.2202500000000002e-05, + "loss": 0.1725, + "step": 3120 + }, + { + "epoch": 0.06871098305549839, + "grad_norm": 0.78125, + "learning_rate": 1.2177500000000002e-05, + "loss": 0.2037, + "step": 3130 + }, + { + "epoch": 0.06893050696302394, + "grad_norm": 0.9609375, + "learning_rate": 1.2152500000000001e-05, + "loss": 0.21, + "step": 3140 + }, + { + "epoch": 0.0691500308705495, + "grad_norm": 0.6796875, + "learning_rate": 1.21275e-05, + "loss": 0.1816, + "step": 3150 + }, + { + "epoch": 0.06936955477807505, + "grad_norm": 0.83984375, + "learning_rate": 1.2102500000000001e-05, + "loss": 0.1811, + "step": 3160 + }, + { + "epoch": 0.0695890786856006, + "grad_norm": 0.8828125, + "learning_rate": 1.2077500000000003e-05, + "loss": 0.1659, + "step": 3170 + }, + { + "epoch": 0.06980860259312616, + "grad_norm": 0.83203125, + "learning_rate": 1.20525e-05, + "loss": 0.1924, + "step": 3180 + }, + { + "epoch": 0.07002812650065171, + "grad_norm": 0.796875, + "learning_rate": 1.2027500000000001e-05, + "loss": 0.1891, + "step": 3190 + }, + { + "epoch": 0.07024765040817726, + "grad_norm": 0.82421875, + "learning_rate": 1.20025e-05, + "loss": 0.1836, + "step": 3200 + }, + { + "epoch": 0.07046717431570282, + "grad_norm": 0.71875, + "learning_rate": 1.1977500000000002e-05, + "loss": 0.2071, + "step": 3210 + }, + { + "epoch": 0.07068669822322837, + "grad_norm": 0.80859375, + "learning_rate": 1.19525e-05, + "loss": 0.2144, + "step": 3220 + }, + { + "epoch": 0.07090622213075393, + "grad_norm": 0.74609375, + "learning_rate": 1.19275e-05, + "loss": 0.1881, + "step": 3230 + }, + { + "epoch": 0.07112574603827948, + "grad_norm": 0.7890625, + "learning_rate": 1.1902500000000002e-05, + "loss": 0.1751, + "step": 3240 + }, + { + "epoch": 0.07134526994580503, + "grad_norm": 0.78515625, + "learning_rate": 1.1877500000000001e-05, + "loss": 0.1844, + "step": 3250 + }, + { + "epoch": 0.07156479385333059, + "grad_norm": 1.0, + "learning_rate": 1.18525e-05, + "loss": 0.177, + "step": 3260 + }, + { + "epoch": 0.07178431776085614, + "grad_norm": 0.84375, + "learning_rate": 1.18275e-05, + "loss": 0.1831, + "step": 3270 + }, + { + "epoch": 0.0720038416683817, + "grad_norm": 0.79296875, + "learning_rate": 1.1802500000000001e-05, + "loss": 0.1793, + "step": 3280 + }, + { + "epoch": 0.07222336557590725, + "grad_norm": 0.7578125, + "learning_rate": 1.1777500000000002e-05, + "loss": 0.1824, + "step": 3290 + }, + { + "epoch": 0.0724428894834328, + "grad_norm": 0.875, + "learning_rate": 1.17525e-05, + "loss": 0.2064, + "step": 3300 + }, + { + "epoch": 0.07266241339095836, + "grad_norm": 0.85546875, + "learning_rate": 1.17275e-05, + "loss": 0.1824, + "step": 3310 + }, + { + "epoch": 0.07288193729848391, + "grad_norm": 0.55859375, + "learning_rate": 1.1702500000000002e-05, + "loss": 0.1792, + "step": 3320 + }, + { + "epoch": 0.07310146120600947, + "grad_norm": 1.0703125, + "learning_rate": 1.1677500000000001e-05, + "loss": 0.1882, + "step": 3330 + }, + { + "epoch": 0.07332098511353502, + "grad_norm": 0.73828125, + "learning_rate": 1.16525e-05, + "loss": 0.1945, + "step": 3340 + }, + { + "epoch": 0.07354050902106057, + "grad_norm": 0.82421875, + "learning_rate": 1.16275e-05, + "loss": 0.2059, + "step": 3350 + }, + { + "epoch": 0.07376003292858613, + "grad_norm": 0.8515625, + "learning_rate": 1.1602500000000001e-05, + "loss": 0.2108, + "step": 3360 + }, + { + "epoch": 0.07397955683611168, + "grad_norm": 0.87890625, + "learning_rate": 1.1577500000000002e-05, + "loss": 0.1833, + "step": 3370 + }, + { + "epoch": 0.07419908074363724, + "grad_norm": 0.96484375, + "learning_rate": 1.1552500000000002e-05, + "loss": 0.1719, + "step": 3380 + }, + { + "epoch": 0.07441860465116279, + "grad_norm": 0.7578125, + "learning_rate": 1.1527500000000001e-05, + "loss": 0.191, + "step": 3390 + }, + { + "epoch": 0.07463812855868834, + "grad_norm": 0.67578125, + "learning_rate": 1.15025e-05, + "loss": 0.1933, + "step": 3400 + }, + { + "epoch": 0.0748576524662139, + "grad_norm": 0.73046875, + "learning_rate": 1.1477500000000001e-05, + "loss": 0.2048, + "step": 3410 + }, + { + "epoch": 0.07507717637373945, + "grad_norm": 0.78515625, + "learning_rate": 1.1452500000000003e-05, + "loss": 0.1669, + "step": 3420 + }, + { + "epoch": 0.075296700281265, + "grad_norm": 0.7109375, + "learning_rate": 1.14275e-05, + "loss": 0.1684, + "step": 3430 + }, + { + "epoch": 0.07551622418879056, + "grad_norm": 0.64453125, + "learning_rate": 1.1402500000000001e-05, + "loss": 0.1744, + "step": 3440 + }, + { + "epoch": 0.07573574809631611, + "grad_norm": 0.7578125, + "learning_rate": 1.13775e-05, + "loss": 0.1822, + "step": 3450 + }, + { + "epoch": 0.07595527200384167, + "grad_norm": 0.66015625, + "learning_rate": 1.1352500000000002e-05, + "loss": 0.1808, + "step": 3460 + }, + { + "epoch": 0.07617479591136722, + "grad_norm": 0.65234375, + "learning_rate": 1.13275e-05, + "loss": 0.1872, + "step": 3470 + }, + { + "epoch": 0.07639431981889278, + "grad_norm": 0.74609375, + "learning_rate": 1.13025e-05, + "loss": 0.192, + "step": 3480 + }, + { + "epoch": 0.07661384372641833, + "grad_norm": 0.76171875, + "learning_rate": 1.1277500000000002e-05, + "loss": 0.219, + "step": 3490 + }, + { + "epoch": 0.07683336763394388, + "grad_norm": 0.984375, + "learning_rate": 1.1252500000000001e-05, + "loss": 0.2029, + "step": 3500 + }, + { + "epoch": 0.07705289154146944, + "grad_norm": 1.421875, + "learning_rate": 1.12275e-05, + "loss": 0.1787, + "step": 3510 + }, + { + "epoch": 0.07727241544899499, + "grad_norm": 0.5859375, + "learning_rate": 1.12025e-05, + "loss": 0.1613, + "step": 3520 + }, + { + "epoch": 0.07749193935652054, + "grad_norm": 1.0546875, + "learning_rate": 1.1177500000000001e-05, + "loss": 0.1823, + "step": 3530 + }, + { + "epoch": 0.0777114632640461, + "grad_norm": 0.84375, + "learning_rate": 1.1152500000000002e-05, + "loss": 0.1823, + "step": 3540 + }, + { + "epoch": 0.07793098717157165, + "grad_norm": 0.7421875, + "learning_rate": 1.11275e-05, + "loss": 0.194, + "step": 3550 + }, + { + "epoch": 0.0781505110790972, + "grad_norm": 0.921875, + "learning_rate": 1.1102500000000001e-05, + "loss": 0.202, + "step": 3560 + }, + { + "epoch": 0.07837003498662276, + "grad_norm": 0.69921875, + "learning_rate": 1.10775e-05, + "loss": 0.1599, + "step": 3570 + }, + { + "epoch": 0.07858955889414831, + "grad_norm": 1.1484375, + "learning_rate": 1.1052500000000001e-05, + "loss": 0.183, + "step": 3580 + }, + { + "epoch": 0.07880908280167387, + "grad_norm": 0.8203125, + "learning_rate": 1.1027499999999999e-05, + "loss": 0.1697, + "step": 3590 + }, + { + "epoch": 0.07902860670919942, + "grad_norm": 1.1484375, + "learning_rate": 1.10025e-05, + "loss": 0.1872, + "step": 3600 + }, + { + "epoch": 0.07924813061672498, + "grad_norm": 0.890625, + "learning_rate": 1.0977500000000001e-05, + "loss": 0.2008, + "step": 3610 + }, + { + "epoch": 0.07946765452425053, + "grad_norm": 1.1015625, + "learning_rate": 1.0952500000000002e-05, + "loss": 0.1857, + "step": 3620 + }, + { + "epoch": 0.07968717843177608, + "grad_norm": 0.79296875, + "learning_rate": 1.0927500000000002e-05, + "loss": 0.2013, + "step": 3630 + }, + { + "epoch": 0.07990670233930164, + "grad_norm": 0.73828125, + "learning_rate": 1.0902500000000001e-05, + "loss": 0.1996, + "step": 3640 + }, + { + "epoch": 0.08012622624682719, + "grad_norm": 0.65234375, + "learning_rate": 1.08775e-05, + "loss": 0.1866, + "step": 3650 + }, + { + "epoch": 0.08034575015435275, + "grad_norm": 0.953125, + "learning_rate": 1.0852500000000002e-05, + "loss": 0.1854, + "step": 3660 + }, + { + "epoch": 0.0805652740618783, + "grad_norm": 0.73828125, + "learning_rate": 1.0827500000000003e-05, + "loss": 0.2041, + "step": 3670 + }, + { + "epoch": 0.08078479796940385, + "grad_norm": 1.046875, + "learning_rate": 1.08025e-05, + "loss": 0.2103, + "step": 3680 + }, + { + "epoch": 0.08100432187692941, + "grad_norm": 0.76953125, + "learning_rate": 1.0777500000000001e-05, + "loss": 0.1689, + "step": 3690 + }, + { + "epoch": 0.08122384578445496, + "grad_norm": 0.9140625, + "learning_rate": 1.07525e-05, + "loss": 0.2075, + "step": 3700 + }, + { + "epoch": 0.08144336969198052, + "grad_norm": 1.015625, + "learning_rate": 1.0727500000000002e-05, + "loss": 0.1958, + "step": 3710 + }, + { + "epoch": 0.08166289359950607, + "grad_norm": 0.78515625, + "learning_rate": 1.07025e-05, + "loss": 0.2133, + "step": 3720 + }, + { + "epoch": 0.08188241750703162, + "grad_norm": 0.703125, + "learning_rate": 1.06775e-05, + "loss": 0.2199, + "step": 3730 + }, + { + "epoch": 0.08210194141455718, + "grad_norm": 0.640625, + "learning_rate": 1.0652500000000002e-05, + "loss": 0.1816, + "step": 3740 + }, + { + "epoch": 0.08232146532208273, + "grad_norm": 0.75, + "learning_rate": 1.0627500000000001e-05, + "loss": 0.1618, + "step": 3750 + }, + { + "epoch": 0.08254098922960829, + "grad_norm": 0.6796875, + "learning_rate": 1.06025e-05, + "loss": 0.2193, + "step": 3760 + }, + { + "epoch": 0.08276051313713384, + "grad_norm": 0.71875, + "learning_rate": 1.05775e-05, + "loss": 0.1626, + "step": 3770 + }, + { + "epoch": 0.0829800370446594, + "grad_norm": 1.015625, + "learning_rate": 1.0552500000000001e-05, + "loss": 0.1873, + "step": 3780 + }, + { + "epoch": 0.08319956095218495, + "grad_norm": 0.7578125, + "learning_rate": 1.0527500000000002e-05, + "loss": 0.2129, + "step": 3790 + }, + { + "epoch": 0.0834190848597105, + "grad_norm": 0.921875, + "learning_rate": 1.05025e-05, + "loss": 0.1906, + "step": 3800 + }, + { + "epoch": 0.08363860876723606, + "grad_norm": 0.71484375, + "learning_rate": 1.0477500000000001e-05, + "loss": 0.1711, + "step": 3810 + }, + { + "epoch": 0.08385813267476161, + "grad_norm": 1.1640625, + "learning_rate": 1.04525e-05, + "loss": 0.1813, + "step": 3820 + }, + { + "epoch": 0.08407765658228716, + "grad_norm": 0.8671875, + "learning_rate": 1.0427500000000001e-05, + "loss": 0.1996, + "step": 3830 + }, + { + "epoch": 0.08429718048981272, + "grad_norm": 0.68359375, + "learning_rate": 1.0402499999999999e-05, + "loss": 0.1536, + "step": 3840 + }, + { + "epoch": 0.08451670439733827, + "grad_norm": 0.7265625, + "learning_rate": 1.03775e-05, + "loss": 0.1617, + "step": 3850 + }, + { + "epoch": 0.08473622830486383, + "grad_norm": 0.90234375, + "learning_rate": 1.0352500000000001e-05, + "loss": 0.1874, + "step": 3860 + }, + { + "epoch": 0.08495575221238938, + "grad_norm": 0.79296875, + "learning_rate": 1.03275e-05, + "loss": 0.2133, + "step": 3870 + }, + { + "epoch": 0.08517527611991493, + "grad_norm": 0.9296875, + "learning_rate": 1.0302500000000002e-05, + "loss": 0.1796, + "step": 3880 + }, + { + "epoch": 0.08539480002744049, + "grad_norm": 0.671875, + "learning_rate": 1.02775e-05, + "loss": 0.1698, + "step": 3890 + }, + { + "epoch": 0.08561432393496604, + "grad_norm": 0.765625, + "learning_rate": 1.02525e-05, + "loss": 0.1938, + "step": 3900 + }, + { + "epoch": 0.0858338478424916, + "grad_norm": 0.83203125, + "learning_rate": 1.0227500000000002e-05, + "loss": 0.1816, + "step": 3910 + }, + { + "epoch": 0.08605337175001715, + "grad_norm": 0.7890625, + "learning_rate": 1.0202500000000003e-05, + "loss": 0.1677, + "step": 3920 + }, + { + "epoch": 0.0862728956575427, + "grad_norm": 0.65234375, + "learning_rate": 1.01775e-05, + "loss": 0.1956, + "step": 3930 + }, + { + "epoch": 0.08649241956506826, + "grad_norm": 0.80859375, + "learning_rate": 1.0152500000000001e-05, + "loss": 0.1769, + "step": 3940 + }, + { + "epoch": 0.08671194347259381, + "grad_norm": 0.84765625, + "learning_rate": 1.0127500000000001e-05, + "loss": 0.2014, + "step": 3950 + }, + { + "epoch": 0.08693146738011936, + "grad_norm": 0.6484375, + "learning_rate": 1.0102500000000002e-05, + "loss": 0.2199, + "step": 3960 + }, + { + "epoch": 0.08715099128764492, + "grad_norm": 0.75, + "learning_rate": 1.00775e-05, + "loss": 0.1815, + "step": 3970 + }, + { + "epoch": 0.08737051519517047, + "grad_norm": 0.8828125, + "learning_rate": 1.00525e-05, + "loss": 0.1797, + "step": 3980 + }, + { + "epoch": 0.08759003910269603, + "grad_norm": 0.83984375, + "learning_rate": 1.0027500000000002e-05, + "loss": 0.2142, + "step": 3990 + }, + { + "epoch": 0.08780956301022158, + "grad_norm": 0.8125, + "learning_rate": 1.0002500000000001e-05, + "loss": 0.2015, + "step": 4000 + }, + { + "epoch": 0.08802908691774713, + "grad_norm": 0.703125, + "learning_rate": 9.9775e-06, + "loss": 0.1653, + "step": 4010 + }, + { + "epoch": 0.08824861082527269, + "grad_norm": 0.76171875, + "learning_rate": 9.9525e-06, + "loss": 0.1784, + "step": 4020 + }, + { + "epoch": 0.08846813473279824, + "grad_norm": 1.1015625, + "learning_rate": 9.927500000000001e-06, + "loss": 0.1915, + "step": 4030 + }, + { + "epoch": 0.0886876586403238, + "grad_norm": 0.76953125, + "learning_rate": 9.9025e-06, + "loss": 0.1997, + "step": 4040 + }, + { + "epoch": 0.08890718254784935, + "grad_norm": 0.7265625, + "learning_rate": 9.877500000000002e-06, + "loss": 0.1816, + "step": 4050 + }, + { + "epoch": 0.0891267064553749, + "grad_norm": 0.7734375, + "learning_rate": 9.852500000000001e-06, + "loss": 0.1954, + "step": 4060 + }, + { + "epoch": 0.08934623036290046, + "grad_norm": 0.63671875, + "learning_rate": 9.8275e-06, + "loss": 0.1532, + "step": 4070 + }, + { + "epoch": 0.08956575427042601, + "grad_norm": 0.7265625, + "learning_rate": 9.8025e-06, + "loss": 0.2117, + "step": 4080 + }, + { + "epoch": 0.08978527817795157, + "grad_norm": 0.7265625, + "learning_rate": 9.7775e-06, + "loss": 0.1957, + "step": 4090 + }, + { + "epoch": 0.09000480208547712, + "grad_norm": 1.0546875, + "learning_rate": 9.7525e-06, + "loss": 0.1683, + "step": 4100 + }, + { + "epoch": 0.09022432599300267, + "grad_norm": 0.671875, + "learning_rate": 9.727500000000001e-06, + "loss": 0.202, + "step": 4110 + }, + { + "epoch": 0.09044384990052823, + "grad_norm": 0.92578125, + "learning_rate": 9.7025e-06, + "loss": 0.1806, + "step": 4120 + }, + { + "epoch": 0.09066337380805378, + "grad_norm": 0.671875, + "learning_rate": 9.6775e-06, + "loss": 0.1941, + "step": 4130 + }, + { + "epoch": 0.09088289771557934, + "grad_norm": 0.9453125, + "learning_rate": 9.652500000000001e-06, + "loss": 0.1824, + "step": 4140 + }, + { + "epoch": 0.09110242162310489, + "grad_norm": 0.66015625, + "learning_rate": 9.6275e-06, + "loss": 0.1489, + "step": 4150 + }, + { + "epoch": 0.09132194553063044, + "grad_norm": 0.78515625, + "learning_rate": 9.602500000000002e-06, + "loss": 0.1578, + "step": 4160 + }, + { + "epoch": 0.091541469438156, + "grad_norm": 1.1015625, + "learning_rate": 9.577500000000001e-06, + "loss": 0.1837, + "step": 4170 + }, + { + "epoch": 0.09176099334568155, + "grad_norm": 0.75, + "learning_rate": 9.5525e-06, + "loss": 0.1943, + "step": 4180 + }, + { + "epoch": 0.0919805172532071, + "grad_norm": 0.6875, + "learning_rate": 9.5275e-06, + "loss": 0.1938, + "step": 4190 + }, + { + "epoch": 0.09220004116073266, + "grad_norm": 0.890625, + "learning_rate": 9.502500000000001e-06, + "loss": 0.1722, + "step": 4200 + }, + { + "epoch": 0.09241956506825821, + "grad_norm": 0.72265625, + "learning_rate": 9.4775e-06, + "loss": 0.1893, + "step": 4210 + }, + { + "epoch": 0.09263908897578377, + "grad_norm": 0.8515625, + "learning_rate": 9.452500000000001e-06, + "loss": 0.1843, + "step": 4220 + }, + { + "epoch": 0.09285861288330932, + "grad_norm": 0.75, + "learning_rate": 9.4275e-06, + "loss": 0.2091, + "step": 4230 + }, + { + "epoch": 0.09307813679083488, + "grad_norm": 0.83984375, + "learning_rate": 9.402500000000002e-06, + "loss": 0.1744, + "step": 4240 + }, + { + "epoch": 0.09329766069836043, + "grad_norm": 0.9921875, + "learning_rate": 9.377500000000001e-06, + "loss": 0.1934, + "step": 4250 + }, + { + "epoch": 0.09351718460588598, + "grad_norm": 0.88671875, + "learning_rate": 9.3525e-06, + "loss": 0.1982, + "step": 4260 + }, + { + "epoch": 0.09373670851341154, + "grad_norm": 0.60546875, + "learning_rate": 9.3275e-06, + "loss": 0.1705, + "step": 4270 + }, + { + "epoch": 0.09395623242093709, + "grad_norm": 0.7890625, + "learning_rate": 9.302500000000001e-06, + "loss": 0.1701, + "step": 4280 + }, + { + "epoch": 0.09417575632846265, + "grad_norm": 0.74609375, + "learning_rate": 9.2775e-06, + "loss": 0.1956, + "step": 4290 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 0.6796875, + "learning_rate": 9.252500000000002e-06, + "loss": 0.1792, + "step": 4300 + }, + { + "epoch": 0.09461480414351375, + "grad_norm": 0.90625, + "learning_rate": 9.227500000000001e-06, + "loss": 0.1744, + "step": 4310 + }, + { + "epoch": 0.09483432805103931, + "grad_norm": 0.8203125, + "learning_rate": 9.2025e-06, + "loss": 0.2185, + "step": 4320 + }, + { + "epoch": 0.09505385195856486, + "grad_norm": 0.9453125, + "learning_rate": 9.1775e-06, + "loss": 0.1655, + "step": 4330 + }, + { + "epoch": 0.09527337586609042, + "grad_norm": 0.94921875, + "learning_rate": 9.152500000000001e-06, + "loss": 0.2083, + "step": 4340 + }, + { + "epoch": 0.09549289977361597, + "grad_norm": 1.078125, + "learning_rate": 9.1275e-06, + "loss": 0.2039, + "step": 4350 + }, + { + "epoch": 0.09571242368114152, + "grad_norm": 0.6171875, + "learning_rate": 9.102500000000001e-06, + "loss": 0.1923, + "step": 4360 + }, + { + "epoch": 0.09593194758866708, + "grad_norm": 0.59765625, + "learning_rate": 9.0775e-06, + "loss": 0.2013, + "step": 4370 + }, + { + "epoch": 0.09615147149619263, + "grad_norm": 0.6328125, + "learning_rate": 9.0525e-06, + "loss": 0.1856, + "step": 4380 + }, + { + "epoch": 0.09637099540371818, + "grad_norm": 0.75, + "learning_rate": 9.027500000000001e-06, + "loss": 0.2029, + "step": 4390 + }, + { + "epoch": 0.09659051931124374, + "grad_norm": 0.77734375, + "learning_rate": 9.0025e-06, + "loss": 0.1753, + "step": 4400 + }, + { + "epoch": 0.09681004321876929, + "grad_norm": 0.62109375, + "learning_rate": 8.977500000000002e-06, + "loss": 0.1767, + "step": 4410 + }, + { + "epoch": 0.09702956712629485, + "grad_norm": 0.64453125, + "learning_rate": 8.952500000000001e-06, + "loss": 0.1727, + "step": 4420 + }, + { + "epoch": 0.0972490910338204, + "grad_norm": 0.71484375, + "learning_rate": 8.9275e-06, + "loss": 0.1802, + "step": 4430 + }, + { + "epoch": 0.09746861494134595, + "grad_norm": 1.015625, + "learning_rate": 8.9025e-06, + "loss": 0.217, + "step": 4440 + }, + { + "epoch": 0.09768813884887151, + "grad_norm": 0.79296875, + "learning_rate": 8.877500000000001e-06, + "loss": 0.1644, + "step": 4450 + }, + { + "epoch": 0.09790766275639706, + "grad_norm": 0.7890625, + "learning_rate": 8.8525e-06, + "loss": 0.195, + "step": 4460 + }, + { + "epoch": 0.09812718666392262, + "grad_norm": 0.68359375, + "learning_rate": 8.827500000000001e-06, + "loss": 0.1872, + "step": 4470 + }, + { + "epoch": 0.09834671057144817, + "grad_norm": 0.83203125, + "learning_rate": 8.802500000000001e-06, + "loss": 0.196, + "step": 4480 + }, + { + "epoch": 0.09856623447897372, + "grad_norm": 0.59765625, + "learning_rate": 8.7775e-06, + "loss": 0.186, + "step": 4490 + }, + { + "epoch": 0.09878575838649928, + "grad_norm": 0.76171875, + "learning_rate": 8.7525e-06, + "loss": 0.1871, + "step": 4500 + }, + { + "epoch": 0.09900528229402483, + "grad_norm": 0.84765625, + "learning_rate": 8.7275e-06, + "loss": 0.189, + "step": 4510 + }, + { + "epoch": 0.09922480620155039, + "grad_norm": 0.6875, + "learning_rate": 8.7025e-06, + "loss": 0.1596, + "step": 4520 + }, + { + "epoch": 0.09944433010907594, + "grad_norm": 0.96484375, + "learning_rate": 8.677500000000001e-06, + "loss": 0.1781, + "step": 4530 + }, + { + "epoch": 0.0996638540166015, + "grad_norm": 0.93359375, + "learning_rate": 8.6525e-06, + "loss": 0.1879, + "step": 4540 + }, + { + "epoch": 0.09988337792412705, + "grad_norm": 0.9765625, + "learning_rate": 8.627500000000002e-06, + "loss": 0.2002, + "step": 4550 + }, + { + "epoch": 0.1001029018316526, + "grad_norm": 1.0078125, + "learning_rate": 8.602500000000001e-06, + "loss": 0.2004, + "step": 4560 + }, + { + "epoch": 0.10032242573917816, + "grad_norm": 0.6328125, + "learning_rate": 8.5775e-06, + "loss": 0.1844, + "step": 4570 + }, + { + "epoch": 0.10054194964670371, + "grad_norm": 0.56640625, + "learning_rate": 8.5525e-06, + "loss": 0.1701, + "step": 4580 + }, + { + "epoch": 0.10076147355422926, + "grad_norm": 0.66015625, + "learning_rate": 8.527500000000001e-06, + "loss": 0.2176, + "step": 4590 + }, + { + "epoch": 0.10098099746175482, + "grad_norm": 0.6796875, + "learning_rate": 8.5025e-06, + "loss": 0.1736, + "step": 4600 + }, + { + "epoch": 0.10120052136928037, + "grad_norm": 0.6796875, + "learning_rate": 8.477500000000001e-06, + "loss": 0.1813, + "step": 4610 + }, + { + "epoch": 0.10142004527680593, + "grad_norm": 0.96484375, + "learning_rate": 8.4525e-06, + "loss": 0.1878, + "step": 4620 + }, + { + "epoch": 0.10163956918433148, + "grad_norm": 0.85546875, + "learning_rate": 8.4275e-06, + "loss": 0.1796, + "step": 4630 + }, + { + "epoch": 0.10185909309185703, + "grad_norm": 0.8359375, + "learning_rate": 8.402500000000001e-06, + "loss": 0.1702, + "step": 4640 + }, + { + "epoch": 0.10207861699938259, + "grad_norm": 0.7109375, + "learning_rate": 8.3775e-06, + "loss": 0.1689, + "step": 4650 + }, + { + "epoch": 0.10229814090690814, + "grad_norm": 0.86328125, + "learning_rate": 8.352500000000002e-06, + "loss": 0.1922, + "step": 4660 + }, + { + "epoch": 0.1025176648144337, + "grad_norm": 0.77734375, + "learning_rate": 8.327500000000001e-06, + "loss": 0.1813, + "step": 4670 + }, + { + "epoch": 0.10273718872195925, + "grad_norm": 0.65625, + "learning_rate": 8.3025e-06, + "loss": 0.1486, + "step": 4680 + }, + { + "epoch": 0.1029567126294848, + "grad_norm": 0.734375, + "learning_rate": 8.2775e-06, + "loss": 0.1606, + "step": 4690 + }, + { + "epoch": 0.10317623653701036, + "grad_norm": 0.7890625, + "learning_rate": 8.252500000000001e-06, + "loss": 0.2014, + "step": 4700 + }, + { + "epoch": 0.10339576044453591, + "grad_norm": 0.6015625, + "learning_rate": 8.2275e-06, + "loss": 0.1757, + "step": 4710 + }, + { + "epoch": 0.10361528435206147, + "grad_norm": 0.9140625, + "learning_rate": 8.202500000000002e-06, + "loss": 0.1895, + "step": 4720 + }, + { + "epoch": 0.10383480825958702, + "grad_norm": 0.6484375, + "learning_rate": 8.177500000000001e-06, + "loss": 0.1753, + "step": 4730 + }, + { + "epoch": 0.10405433216711257, + "grad_norm": 0.75, + "learning_rate": 8.1525e-06, + "loss": 0.1705, + "step": 4740 + }, + { + "epoch": 0.10427385607463813, + "grad_norm": 0.796875, + "learning_rate": 8.1275e-06, + "loss": 0.217, + "step": 4750 + }, + { + "epoch": 0.10449337998216368, + "grad_norm": 0.671875, + "learning_rate": 8.1025e-06, + "loss": 0.2155, + "step": 4760 + }, + { + "epoch": 0.10471290388968924, + "grad_norm": 0.8671875, + "learning_rate": 8.0775e-06, + "loss": 0.1972, + "step": 4770 + }, + { + "epoch": 0.10493242779721479, + "grad_norm": 0.859375, + "learning_rate": 8.052500000000001e-06, + "loss": 0.1927, + "step": 4780 + }, + { + "epoch": 0.10515195170474034, + "grad_norm": 1.078125, + "learning_rate": 8.0275e-06, + "loss": 0.2067, + "step": 4790 + }, + { + "epoch": 0.1053714756122659, + "grad_norm": 0.70703125, + "learning_rate": 8.0025e-06, + "loss": 0.1802, + "step": 4800 + }, + { + "epoch": 0.10559099951979145, + "grad_norm": 0.76171875, + "learning_rate": 7.9775e-06, + "loss": 0.1808, + "step": 4810 + }, + { + "epoch": 0.105810523427317, + "grad_norm": 0.75390625, + "learning_rate": 7.9525e-06, + "loss": 0.1641, + "step": 4820 + }, + { + "epoch": 0.10603004733484256, + "grad_norm": 0.84375, + "learning_rate": 7.9275e-06, + "loss": 0.175, + "step": 4830 + }, + { + "epoch": 0.10624957124236811, + "grad_norm": 1.1640625, + "learning_rate": 7.902500000000001e-06, + "loss": 0.1984, + "step": 4840 + }, + { + "epoch": 0.10646909514989367, + "grad_norm": 1.0, + "learning_rate": 7.8775e-06, + "loss": 0.182, + "step": 4850 + }, + { + "epoch": 0.10668861905741922, + "grad_norm": 0.66796875, + "learning_rate": 7.852500000000001e-06, + "loss": 0.1785, + "step": 4860 + }, + { + "epoch": 0.10690814296494477, + "grad_norm": 0.71484375, + "learning_rate": 7.827500000000001e-06, + "loss": 0.1917, + "step": 4870 + }, + { + "epoch": 0.10712766687247033, + "grad_norm": 0.671875, + "learning_rate": 7.8025e-06, + "loss": 0.1747, + "step": 4880 + }, + { + "epoch": 0.10734719077999588, + "grad_norm": 0.921875, + "learning_rate": 7.777500000000001e-06, + "loss": 0.1806, + "step": 4890 + }, + { + "epoch": 0.10756671468752144, + "grad_norm": 0.73046875, + "learning_rate": 7.7525e-06, + "loss": 0.1672, + "step": 4900 + }, + { + "epoch": 0.10778623859504699, + "grad_norm": 0.78515625, + "learning_rate": 7.727500000000002e-06, + "loss": 0.2223, + "step": 4910 + }, + { + "epoch": 0.10800576250257254, + "grad_norm": 0.671875, + "learning_rate": 7.702500000000001e-06, + "loss": 0.1946, + "step": 4920 + }, + { + "epoch": 0.1082252864100981, + "grad_norm": 0.69921875, + "learning_rate": 7.6775e-06, + "loss": 0.1682, + "step": 4930 + }, + { + "epoch": 0.10844481031762365, + "grad_norm": 0.63671875, + "learning_rate": 7.6525e-06, + "loss": 0.1494, + "step": 4940 + }, + { + "epoch": 0.1086643342251492, + "grad_norm": 0.9296875, + "learning_rate": 7.627500000000001e-06, + "loss": 0.1856, + "step": 4950 + }, + { + "epoch": 0.10888385813267476, + "grad_norm": 0.87890625, + "learning_rate": 7.6025000000000005e-06, + "loss": 0.1922, + "step": 4960 + }, + { + "epoch": 0.10910338204020031, + "grad_norm": 0.7421875, + "learning_rate": 7.577500000000001e-06, + "loss": 0.1919, + "step": 4970 + }, + { + "epoch": 0.10932290594772587, + "grad_norm": 0.76953125, + "learning_rate": 7.5525e-06, + "loss": 0.1758, + "step": 4980 + }, + { + "epoch": 0.10954242985525142, + "grad_norm": 0.73046875, + "learning_rate": 7.527500000000001e-06, + "loss": 0.2053, + "step": 4990 + }, + { + "epoch": 0.10976195376277698, + "grad_norm": 0.81640625, + "learning_rate": 7.502500000000001e-06, + "loss": 0.1742, + "step": 5000 + }, + { + "epoch": 0.10998147767030253, + "grad_norm": 0.9140625, + "learning_rate": 7.477500000000001e-06, + "loss": 0.2089, + "step": 5010 + }, + { + "epoch": 0.11020100157782808, + "grad_norm": 0.84765625, + "learning_rate": 7.4525e-06, + "loss": 0.1788, + "step": 5020 + }, + { + "epoch": 0.11042052548535364, + "grad_norm": 0.9375, + "learning_rate": 7.4275000000000005e-06, + "loss": 0.1981, + "step": 5030 + }, + { + "epoch": 0.11064004939287919, + "grad_norm": 1.0078125, + "learning_rate": 7.4025e-06, + "loss": 0.1708, + "step": 5040 + }, + { + "epoch": 0.11085957330040475, + "grad_norm": 0.7421875, + "learning_rate": 7.377500000000001e-06, + "loss": 0.1621, + "step": 5050 + }, + { + "epoch": 0.1110790972079303, + "grad_norm": 0.80078125, + "learning_rate": 7.3525e-06, + "loss": 0.1805, + "step": 5060 + }, + { + "epoch": 0.11129862111545585, + "grad_norm": 0.8046875, + "learning_rate": 7.3275000000000006e-06, + "loss": 0.201, + "step": 5070 + }, + { + "epoch": 0.11151814502298141, + "grad_norm": 0.72265625, + "learning_rate": 7.3025e-06, + "loss": 0.2034, + "step": 5080 + }, + { + "epoch": 0.11173766893050696, + "grad_norm": 0.8671875, + "learning_rate": 7.277500000000001e-06, + "loss": 0.1858, + "step": 5090 + }, + { + "epoch": 0.11195719283803252, + "grad_norm": 0.75390625, + "learning_rate": 7.2525000000000004e-06, + "loss": 0.2008, + "step": 5100 + }, + { + "epoch": 0.11217671674555807, + "grad_norm": 0.81640625, + "learning_rate": 7.227500000000001e-06, + "loss": 0.1954, + "step": 5110 + }, + { + "epoch": 0.11239624065308362, + "grad_norm": 0.859375, + "learning_rate": 7.2025e-06, + "loss": 0.1638, + "step": 5120 + }, + { + "epoch": 0.11261576456060918, + "grad_norm": 0.80078125, + "learning_rate": 7.1775e-06, + "loss": 0.1845, + "step": 5130 + }, + { + "epoch": 0.11283528846813473, + "grad_norm": 0.80859375, + "learning_rate": 7.152500000000001e-06, + "loss": 0.1953, + "step": 5140 + }, + { + "epoch": 0.11305481237566029, + "grad_norm": 0.640625, + "learning_rate": 7.127500000000001e-06, + "loss": 0.1937, + "step": 5150 + }, + { + "epoch": 0.11327433628318584, + "grad_norm": 0.73828125, + "learning_rate": 7.102500000000001e-06, + "loss": 0.1736, + "step": 5160 + }, + { + "epoch": 0.1134938601907114, + "grad_norm": 0.70703125, + "learning_rate": 7.0775000000000004e-06, + "loss": 0.2043, + "step": 5170 + }, + { + "epoch": 0.11371338409823695, + "grad_norm": 0.8984375, + "learning_rate": 7.052500000000001e-06, + "loss": 0.1758, + "step": 5180 + }, + { + "epoch": 0.1139329080057625, + "grad_norm": 0.93359375, + "learning_rate": 7.0275e-06, + "loss": 0.1895, + "step": 5190 + }, + { + "epoch": 0.11415243191328805, + "grad_norm": 0.66796875, + "learning_rate": 7.002500000000001e-06, + "loss": 0.2119, + "step": 5200 + }, + { + "epoch": 0.11437195582081361, + "grad_norm": 0.7734375, + "learning_rate": 6.9775000000000005e-06, + "loss": 0.1839, + "step": 5210 + }, + { + "epoch": 0.11459147972833916, + "grad_norm": 0.91015625, + "learning_rate": 6.952500000000001e-06, + "loss": 0.186, + "step": 5220 + }, + { + "epoch": 0.11481100363586472, + "grad_norm": 0.78125, + "learning_rate": 6.9275e-06, + "loss": 0.1887, + "step": 5230 + }, + { + "epoch": 0.11503052754339027, + "grad_norm": 0.60546875, + "learning_rate": 6.902500000000001e-06, + "loss": 0.1916, + "step": 5240 + }, + { + "epoch": 0.11525005145091582, + "grad_norm": 0.64453125, + "learning_rate": 6.877500000000001e-06, + "loss": 0.1755, + "step": 5250 + }, + { + "epoch": 0.11546957535844138, + "grad_norm": 0.75390625, + "learning_rate": 6.852500000000001e-06, + "loss": 0.2005, + "step": 5260 + }, + { + "epoch": 0.11568909926596693, + "grad_norm": 1.1796875, + "learning_rate": 6.8275e-06, + "loss": 0.229, + "step": 5270 + }, + { + "epoch": 0.11590862317349249, + "grad_norm": 0.78125, + "learning_rate": 6.8025000000000005e-06, + "loss": 0.1747, + "step": 5280 + }, + { + "epoch": 0.11612814708101804, + "grad_norm": 0.77734375, + "learning_rate": 6.7775e-06, + "loss": 0.1735, + "step": 5290 + }, + { + "epoch": 0.1163476709885436, + "grad_norm": 0.90625, + "learning_rate": 6.752500000000001e-06, + "loss": 0.1971, + "step": 5300 + }, + { + "epoch": 0.11656719489606915, + "grad_norm": 0.89453125, + "learning_rate": 6.7275e-06, + "loss": 0.1885, + "step": 5310 + }, + { + "epoch": 0.1167867188035947, + "grad_norm": 0.73046875, + "learning_rate": 6.702500000000001e-06, + "loss": 0.1801, + "step": 5320 + }, + { + "epoch": 0.11700624271112026, + "grad_norm": 0.7734375, + "learning_rate": 6.6775e-06, + "loss": 0.2179, + "step": 5330 + }, + { + "epoch": 0.11722576661864581, + "grad_norm": 0.578125, + "learning_rate": 6.6525e-06, + "loss": 0.1607, + "step": 5340 + }, + { + "epoch": 0.11744529052617136, + "grad_norm": 0.61328125, + "learning_rate": 6.6275e-06, + "loss": 0.1931, + "step": 5350 + }, + { + "epoch": 0.11766481443369692, + "grad_norm": 0.85546875, + "learning_rate": 6.602500000000001e-06, + "loss": 0.2009, + "step": 5360 + }, + { + "epoch": 0.11788433834122247, + "grad_norm": 0.5390625, + "learning_rate": 6.5775e-06, + "loss": 0.1551, + "step": 5370 + }, + { + "epoch": 0.11810386224874803, + "grad_norm": 0.87109375, + "learning_rate": 6.5525e-06, + "loss": 0.1775, + "step": 5380 + }, + { + "epoch": 0.11832338615627358, + "grad_norm": 0.6171875, + "learning_rate": 6.5275000000000015e-06, + "loss": 0.1673, + "step": 5390 + }, + { + "epoch": 0.11854291006379913, + "grad_norm": 0.8203125, + "learning_rate": 6.502500000000001e-06, + "loss": 0.1785, + "step": 5400 + }, + { + "epoch": 0.11876243397132469, + "grad_norm": 0.8515625, + "learning_rate": 6.477500000000001e-06, + "loss": 0.1902, + "step": 5410 + }, + { + "epoch": 0.11898195787885024, + "grad_norm": 0.640625, + "learning_rate": 6.4525000000000005e-06, + "loss": 0.1794, + "step": 5420 + }, + { + "epoch": 0.1192014817863758, + "grad_norm": 0.7265625, + "learning_rate": 6.427500000000001e-06, + "loss": 0.1954, + "step": 5430 + }, + { + "epoch": 0.11942100569390135, + "grad_norm": 0.87890625, + "learning_rate": 6.4025e-06, + "loss": 0.21, + "step": 5440 + }, + { + "epoch": 0.1196405296014269, + "grad_norm": 0.76171875, + "learning_rate": 6.377500000000001e-06, + "loss": 0.1887, + "step": 5450 + }, + { + "epoch": 0.11986005350895246, + "grad_norm": 0.7109375, + "learning_rate": 6.352500000000001e-06, + "loss": 0.1928, + "step": 5460 + }, + { + "epoch": 0.12007957741647801, + "grad_norm": 0.63671875, + "learning_rate": 6.327500000000001e-06, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.12029910132400357, + "grad_norm": 0.890625, + "learning_rate": 6.3025e-06, + "loss": 0.1855, + "step": 5480 + }, + { + "epoch": 0.12051862523152912, + "grad_norm": 0.92578125, + "learning_rate": 6.2775000000000005e-06, + "loss": 0.1693, + "step": 5490 + }, + { + "epoch": 0.12073814913905467, + "grad_norm": 0.94140625, + "learning_rate": 6.2525e-06, + "loss": 0.1777, + "step": 5500 + }, + { + "epoch": 0.12095767304658023, + "grad_norm": 0.76953125, + "learning_rate": 6.227500000000001e-06, + "loss": 0.221, + "step": 5510 + }, + { + "epoch": 0.12117719695410578, + "grad_norm": 0.6953125, + "learning_rate": 6.2025e-06, + "loss": 0.1692, + "step": 5520 + }, + { + "epoch": 0.12139672086163134, + "grad_norm": 0.77734375, + "learning_rate": 6.1775000000000006e-06, + "loss": 0.192, + "step": 5530 + }, + { + "epoch": 0.12161624476915689, + "grad_norm": 0.6484375, + "learning_rate": 6.1525e-06, + "loss": 0.1882, + "step": 5540 + }, + { + "epoch": 0.12183576867668244, + "grad_norm": 0.98046875, + "learning_rate": 6.127500000000001e-06, + "loss": 0.2051, + "step": 5550 + }, + { + "epoch": 0.122055292584208, + "grad_norm": 0.9609375, + "learning_rate": 6.1025000000000004e-06, + "loss": 0.2132, + "step": 5560 + }, + { + "epoch": 0.12227481649173355, + "grad_norm": 0.76953125, + "learning_rate": 6.077500000000001e-06, + "loss": 0.1776, + "step": 5570 + }, + { + "epoch": 0.1224943403992591, + "grad_norm": 0.83984375, + "learning_rate": 6.0525e-06, + "loss": 0.2029, + "step": 5580 + }, + { + "epoch": 0.12271386430678466, + "grad_norm": 0.796875, + "learning_rate": 6.0275e-06, + "loss": 0.209, + "step": 5590 + }, + { + "epoch": 0.12293338821431021, + "grad_norm": 0.5859375, + "learning_rate": 6.0025e-06, + "loss": 0.1967, + "step": 5600 + }, + { + "epoch": 0.12315291212183577, + "grad_norm": 1.0, + "learning_rate": 5.977500000000001e-06, + "loss": 0.2383, + "step": 5610 + }, + { + "epoch": 0.12337243602936132, + "grad_norm": 0.6875, + "learning_rate": 5.9525e-06, + "loss": 0.163, + "step": 5620 + }, + { + "epoch": 0.12359195993688687, + "grad_norm": 0.578125, + "learning_rate": 5.9275e-06, + "loss": 0.196, + "step": 5630 + }, + { + "epoch": 0.12381148384441243, + "grad_norm": 0.76171875, + "learning_rate": 5.902500000000001e-06, + "loss": 0.2017, + "step": 5640 + }, + { + "epoch": 0.12403100775193798, + "grad_norm": 0.82421875, + "learning_rate": 5.8775e-06, + "loss": 0.1859, + "step": 5650 + }, + { + "epoch": 0.12425053165946354, + "grad_norm": 0.6875, + "learning_rate": 5.852500000000001e-06, + "loss": 0.2002, + "step": 5660 + }, + { + "epoch": 0.12447005556698909, + "grad_norm": 0.71875, + "learning_rate": 5.8275000000000005e-06, + "loss": 0.1784, + "step": 5670 + }, + { + "epoch": 0.12468957947451464, + "grad_norm": 0.765625, + "learning_rate": 5.802500000000001e-06, + "loss": 0.1809, + "step": 5680 + }, + { + "epoch": 0.1249091033820402, + "grad_norm": 0.51953125, + "learning_rate": 5.7775e-06, + "loss": 0.1663, + "step": 5690 + }, + { + "epoch": 0.12512862728956575, + "grad_norm": 0.76953125, + "learning_rate": 5.752500000000001e-06, + "loss": 0.173, + "step": 5700 + }, + { + "epoch": 0.1253481511970913, + "grad_norm": 0.7109375, + "learning_rate": 5.727500000000001e-06, + "loss": 0.1743, + "step": 5710 + }, + { + "epoch": 0.12556767510461686, + "grad_norm": 0.90234375, + "learning_rate": 5.702500000000001e-06, + "loss": 0.2364, + "step": 5720 + }, + { + "epoch": 0.12578719901214241, + "grad_norm": 0.69140625, + "learning_rate": 5.6775e-06, + "loss": 0.212, + "step": 5730 + }, + { + "epoch": 0.12600672291966797, + "grad_norm": 0.76171875, + "learning_rate": 5.6525000000000005e-06, + "loss": 0.1757, + "step": 5740 + }, + { + "epoch": 0.12622624682719352, + "grad_norm": 0.84765625, + "learning_rate": 5.6275e-06, + "loss": 0.1868, + "step": 5750 + }, + { + "epoch": 0.12644577073471908, + "grad_norm": 0.84375, + "learning_rate": 5.602500000000001e-06, + "loss": 0.1901, + "step": 5760 + }, + { + "epoch": 0.12666529464224463, + "grad_norm": 0.66796875, + "learning_rate": 5.5775e-06, + "loss": 0.2302, + "step": 5770 + }, + { + "epoch": 0.12688481854977018, + "grad_norm": 0.61328125, + "learning_rate": 5.552500000000001e-06, + "loss": 0.1639, + "step": 5780 + }, + { + "epoch": 0.12710434245729574, + "grad_norm": 1.4453125, + "learning_rate": 5.5275e-06, + "loss": 0.1778, + "step": 5790 + }, + { + "epoch": 0.1273238663648213, + "grad_norm": 0.85546875, + "learning_rate": 5.5025e-06, + "loss": 0.1727, + "step": 5800 + }, + { + "epoch": 0.12754339027234685, + "grad_norm": 0.5859375, + "learning_rate": 5.4775e-06, + "loss": 0.1731, + "step": 5810 + }, + { + "epoch": 0.1277629141798724, + "grad_norm": 0.6171875, + "learning_rate": 5.452500000000001e-06, + "loss": 0.1762, + "step": 5820 + }, + { + "epoch": 0.12798243808739795, + "grad_norm": 0.98828125, + "learning_rate": 5.4275e-06, + "loss": 0.1874, + "step": 5830 + }, + { + "epoch": 0.1282019619949235, + "grad_norm": 0.6328125, + "learning_rate": 5.4025e-06, + "loss": 0.199, + "step": 5840 + }, + { + "epoch": 0.12842148590244906, + "grad_norm": 0.8984375, + "learning_rate": 5.3775e-06, + "loss": 0.1935, + "step": 5850 + }, + { + "epoch": 0.12864100980997462, + "grad_norm": 0.875, + "learning_rate": 5.352500000000001e-06, + "loss": 0.1862, + "step": 5860 + }, + { + "epoch": 0.12886053371750017, + "grad_norm": 0.73046875, + "learning_rate": 5.3275e-06, + "loss": 0.1693, + "step": 5870 + }, + { + "epoch": 0.12908005762502572, + "grad_norm": 0.87109375, + "learning_rate": 5.3025000000000005e-06, + "loss": 0.1972, + "step": 5880 + }, + { + "epoch": 0.12929958153255128, + "grad_norm": 0.65234375, + "learning_rate": 5.277500000000001e-06, + "loss": 0.1742, + "step": 5890 + }, + { + "epoch": 0.12951910544007683, + "grad_norm": 0.62109375, + "learning_rate": 5.2525e-06, + "loss": 0.1765, + "step": 5900 + }, + { + "epoch": 0.12973862934760239, + "grad_norm": 0.97265625, + "learning_rate": 5.227500000000001e-06, + "loss": 0.1713, + "step": 5910 + }, + { + "epoch": 0.12995815325512794, + "grad_norm": 0.7109375, + "learning_rate": 5.202500000000001e-06, + "loss": 0.1647, + "step": 5920 + }, + { + "epoch": 0.1301776771626535, + "grad_norm": 0.984375, + "learning_rate": 5.177500000000001e-06, + "loss": 0.1795, + "step": 5930 + }, + { + "epoch": 0.13039720107017905, + "grad_norm": 0.71875, + "learning_rate": 5.1525e-06, + "loss": 0.2107, + "step": 5940 + }, + { + "epoch": 0.1306167249777046, + "grad_norm": 0.96875, + "learning_rate": 5.1275000000000005e-06, + "loss": 0.1919, + "step": 5950 + }, + { + "epoch": 0.13083624888523016, + "grad_norm": 0.7109375, + "learning_rate": 5.1025e-06, + "loss": 0.1755, + "step": 5960 + }, + { + "epoch": 0.1310557727927557, + "grad_norm": 0.5625, + "learning_rate": 5.077500000000001e-06, + "loss": 0.1673, + "step": 5970 + }, + { + "epoch": 0.13127529670028126, + "grad_norm": 0.88671875, + "learning_rate": 5.0525e-06, + "loss": 0.2152, + "step": 5980 + }, + { + "epoch": 0.13149482060780682, + "grad_norm": 1.1015625, + "learning_rate": 5.0275000000000006e-06, + "loss": 0.2161, + "step": 5990 + }, + { + "epoch": 0.13171434451533237, + "grad_norm": 0.83984375, + "learning_rate": 5.0025e-06, + "loss": 0.1849, + "step": 6000 + }, + { + "epoch": 0.13193386842285793, + "grad_norm": 0.8125, + "learning_rate": 4.977500000000001e-06, + "loss": 0.1786, + "step": 6010 + }, + { + "epoch": 0.13215339233038348, + "grad_norm": 0.8515625, + "learning_rate": 4.9525000000000004e-06, + "loss": 0.1818, + "step": 6020 + }, + { + "epoch": 0.13237291623790903, + "grad_norm": 0.6796875, + "learning_rate": 4.927500000000001e-06, + "loss": 0.182, + "step": 6030 + }, + { + "epoch": 0.1325924401454346, + "grad_norm": 0.8359375, + "learning_rate": 4.902500000000001e-06, + "loss": 0.1778, + "step": 6040 + }, + { + "epoch": 0.13281196405296014, + "grad_norm": 0.7109375, + "learning_rate": 4.8775e-06, + "loss": 0.165, + "step": 6050 + }, + { + "epoch": 0.1330314879604857, + "grad_norm": 0.875, + "learning_rate": 4.8525000000000006e-06, + "loss": 0.2036, + "step": 6060 + }, + { + "epoch": 0.13325101186801125, + "grad_norm": 0.74609375, + "learning_rate": 4.827500000000001e-06, + "loss": 0.1749, + "step": 6070 + }, + { + "epoch": 0.1334705357755368, + "grad_norm": 0.83984375, + "learning_rate": 4.8025e-06, + "loss": 0.1979, + "step": 6080 + }, + { + "epoch": 0.13369005968306236, + "grad_norm": 0.75390625, + "learning_rate": 4.7775e-06, + "loss": 0.1883, + "step": 6090 + }, + { + "epoch": 0.1339095835905879, + "grad_norm": 0.73828125, + "learning_rate": 4.752500000000001e-06, + "loss": 0.1742, + "step": 6100 + }, + { + "epoch": 0.13412910749811346, + "grad_norm": 0.84765625, + "learning_rate": 4.7275e-06, + "loss": 0.1704, + "step": 6110 + }, + { + "epoch": 0.13434863140563902, + "grad_norm": 0.984375, + "learning_rate": 4.7025e-06, + "loss": 0.1963, + "step": 6120 + }, + { + "epoch": 0.13456815531316457, + "grad_norm": 0.90625, + "learning_rate": 4.6775000000000005e-06, + "loss": 0.1935, + "step": 6130 + }, + { + "epoch": 0.13478767922069013, + "grad_norm": 0.62109375, + "learning_rate": 4.652500000000001e-06, + "loss": 0.1756, + "step": 6140 + }, + { + "epoch": 0.13500720312821568, + "grad_norm": 0.78515625, + "learning_rate": 4.6275e-06, + "loss": 0.1922, + "step": 6150 + }, + { + "epoch": 0.13522672703574123, + "grad_norm": 0.9765625, + "learning_rate": 4.6025e-06, + "loss": 0.2005, + "step": 6160 + }, + { + "epoch": 0.1354462509432668, + "grad_norm": 0.5625, + "learning_rate": 4.577500000000001e-06, + "loss": 0.184, + "step": 6170 + }, + { + "epoch": 0.13566577485079234, + "grad_norm": 0.83984375, + "learning_rate": 4.5525e-06, + "loss": 0.1865, + "step": 6180 + }, + { + "epoch": 0.1358852987583179, + "grad_norm": 0.703125, + "learning_rate": 4.5275e-06, + "loss": 0.201, + "step": 6190 + }, + { + "epoch": 0.13610482266584345, + "grad_norm": 0.57421875, + "learning_rate": 4.5025000000000005e-06, + "loss": 0.2129, + "step": 6200 + }, + { + "epoch": 0.136324346573369, + "grad_norm": 0.81640625, + "learning_rate": 4.4775e-06, + "loss": 0.2103, + "step": 6210 + }, + { + "epoch": 0.13654387048089456, + "grad_norm": 0.8671875, + "learning_rate": 4.4525e-06, + "loss": 0.1933, + "step": 6220 + }, + { + "epoch": 0.1367633943884201, + "grad_norm": 1.0234375, + "learning_rate": 4.4275e-06, + "loss": 0.1837, + "step": 6230 + }, + { + "epoch": 0.13698291829594567, + "grad_norm": 0.72265625, + "learning_rate": 4.4025e-06, + "loss": 0.1696, + "step": 6240 + }, + { + "epoch": 0.13720244220347122, + "grad_norm": 0.9140625, + "learning_rate": 4.3775e-06, + "loss": 0.1885, + "step": 6250 + }, + { + "epoch": 0.13742196611099677, + "grad_norm": 0.7421875, + "learning_rate": 4.3525e-06, + "loss": 0.175, + "step": 6260 + }, + { + "epoch": 0.13764149001852233, + "grad_norm": 0.67578125, + "learning_rate": 4.3275000000000005e-06, + "loss": 0.1905, + "step": 6270 + }, + { + "epoch": 0.13786101392604788, + "grad_norm": 0.7109375, + "learning_rate": 4.302500000000001e-06, + "loss": 0.1854, + "step": 6280 + }, + { + "epoch": 0.13808053783357344, + "grad_norm": 0.7734375, + "learning_rate": 4.2775e-06, + "loss": 0.2116, + "step": 6290 + }, + { + "epoch": 0.138300061741099, + "grad_norm": 0.8671875, + "learning_rate": 4.2525e-06, + "loss": 0.184, + "step": 6300 + }, + { + "epoch": 0.13851958564862454, + "grad_norm": 0.67578125, + "learning_rate": 4.227500000000001e-06, + "loss": 0.1831, + "step": 6310 + }, + { + "epoch": 0.1387391095561501, + "grad_norm": 0.703125, + "learning_rate": 4.202500000000001e-06, + "loss": 0.1765, + "step": 6320 + }, + { + "epoch": 0.13895863346367565, + "grad_norm": 0.67578125, + "learning_rate": 4.1775e-06, + "loss": 0.1742, + "step": 6330 + }, + { + "epoch": 0.1391781573712012, + "grad_norm": 0.87890625, + "learning_rate": 4.1525000000000005e-06, + "loss": 0.2031, + "step": 6340 + }, + { + "epoch": 0.13939768127872676, + "grad_norm": 0.6953125, + "learning_rate": 4.127500000000001e-06, + "loss": 0.1976, + "step": 6350 + }, + { + "epoch": 0.1396172051862523, + "grad_norm": 0.625, + "learning_rate": 4.1025e-06, + "loss": 0.1701, + "step": 6360 + }, + { + "epoch": 0.13983672909377787, + "grad_norm": 0.77734375, + "learning_rate": 4.0775e-06, + "loss": 0.1827, + "step": 6370 + }, + { + "epoch": 0.14005625300130342, + "grad_norm": 0.8828125, + "learning_rate": 4.052500000000001e-06, + "loss": 0.1767, + "step": 6380 + }, + { + "epoch": 0.14027577690882898, + "grad_norm": 0.56640625, + "learning_rate": 4.0275e-06, + "loss": 0.1869, + "step": 6390 + }, + { + "epoch": 0.14049530081635453, + "grad_norm": 0.78125, + "learning_rate": 4.0025e-06, + "loss": 0.1954, + "step": 6400 + }, + { + "epoch": 0.14071482472388008, + "grad_norm": 0.76953125, + "learning_rate": 3.9775000000000005e-06, + "loss": 0.1762, + "step": 6410 + }, + { + "epoch": 0.14093434863140564, + "grad_norm": 0.953125, + "learning_rate": 3.9525e-06, + "loss": 0.1865, + "step": 6420 + }, + { + "epoch": 0.1411538725389312, + "grad_norm": 0.59375, + "learning_rate": 3.9275e-06, + "loss": 0.1816, + "step": 6430 + }, + { + "epoch": 0.14137339644645674, + "grad_norm": 0.765625, + "learning_rate": 3.9025e-06, + "loss": 0.1757, + "step": 6440 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 0.5234375, + "learning_rate": 3.8775000000000006e-06, + "loss": 0.1836, + "step": 6450 + }, + { + "epoch": 0.14181244426150785, + "grad_norm": 0.81640625, + "learning_rate": 3.8525e-06, + "loss": 0.1953, + "step": 6460 + }, + { + "epoch": 0.1420319681690334, + "grad_norm": 0.67578125, + "learning_rate": 3.8275e-06, + "loss": 0.1605, + "step": 6470 + }, + { + "epoch": 0.14225149207655896, + "grad_norm": 0.86328125, + "learning_rate": 3.8025e-06, + "loss": 0.1865, + "step": 6480 + }, + { + "epoch": 0.14247101598408451, + "grad_norm": 0.65234375, + "learning_rate": 3.7775000000000003e-06, + "loss": 0.1746, + "step": 6490 + }, + { + "epoch": 0.14269053989161007, + "grad_norm": 0.59375, + "learning_rate": 3.7525e-06, + "loss": 0.1572, + "step": 6500 + }, + { + "epoch": 0.14291006379913562, + "grad_norm": 0.65234375, + "learning_rate": 3.7275000000000007e-06, + "loss": 0.1942, + "step": 6510 + }, + { + "epoch": 0.14312958770666118, + "grad_norm": 0.8125, + "learning_rate": 3.7025000000000005e-06, + "loss": 0.1841, + "step": 6520 + }, + { + "epoch": 0.14334911161418673, + "grad_norm": 0.64453125, + "learning_rate": 3.6775000000000004e-06, + "loss": 0.1964, + "step": 6530 + }, + { + "epoch": 0.14356863552171228, + "grad_norm": 0.66796875, + "learning_rate": 3.6525000000000006e-06, + "loss": 0.198, + "step": 6540 + }, + { + "epoch": 0.14378815942923784, + "grad_norm": 0.72265625, + "learning_rate": 3.6275000000000004e-06, + "loss": 0.1773, + "step": 6550 + }, + { + "epoch": 0.1440076833367634, + "grad_norm": 0.78125, + "learning_rate": 3.6025000000000002e-06, + "loss": 0.1699, + "step": 6560 + }, + { + "epoch": 0.14422720724428895, + "grad_norm": 0.73046875, + "learning_rate": 3.5775000000000005e-06, + "loss": 0.2117, + "step": 6570 + }, + { + "epoch": 0.1444467311518145, + "grad_norm": 0.859375, + "learning_rate": 3.5525000000000003e-06, + "loss": 0.1783, + "step": 6580 + }, + { + "epoch": 0.14466625505934005, + "grad_norm": 0.546875, + "learning_rate": 3.5275000000000005e-06, + "loss": 0.1608, + "step": 6590 + }, + { + "epoch": 0.1448857789668656, + "grad_norm": 1.109375, + "learning_rate": 3.5025000000000003e-06, + "loss": 0.1933, + "step": 6600 + }, + { + "epoch": 0.14510530287439116, + "grad_norm": 0.69921875, + "learning_rate": 3.4775e-06, + "loss": 0.2031, + "step": 6610 + }, + { + "epoch": 0.14532482678191672, + "grad_norm": 1.09375, + "learning_rate": 3.4525000000000004e-06, + "loss": 0.188, + "step": 6620 + }, + { + "epoch": 0.14554435068944227, + "grad_norm": 0.94921875, + "learning_rate": 3.4275000000000002e-06, + "loss": 0.1767, + "step": 6630 + }, + { + "epoch": 0.14576387459696782, + "grad_norm": 0.59765625, + "learning_rate": 3.4025000000000005e-06, + "loss": 0.1888, + "step": 6640 + }, + { + "epoch": 0.14598339850449338, + "grad_norm": 1.0546875, + "learning_rate": 3.3775000000000003e-06, + "loss": 0.1918, + "step": 6650 + }, + { + "epoch": 0.14620292241201893, + "grad_norm": 0.6328125, + "learning_rate": 3.3525e-06, + "loss": 0.167, + "step": 6660 + }, + { + "epoch": 0.14642244631954449, + "grad_norm": 0.6328125, + "learning_rate": 3.3275000000000003e-06, + "loss": 0.1635, + "step": 6670 + }, + { + "epoch": 0.14664197022707004, + "grad_norm": 0.9140625, + "learning_rate": 3.3025e-06, + "loss": 0.2107, + "step": 6680 + }, + { + "epoch": 0.1468614941345956, + "grad_norm": 0.84375, + "learning_rate": 3.2775e-06, + "loss": 0.1872, + "step": 6690 + }, + { + "epoch": 0.14708101804212115, + "grad_norm": 0.78515625, + "learning_rate": 3.2525e-06, + "loss": 0.1627, + "step": 6700 + }, + { + "epoch": 0.1473005419496467, + "grad_norm": 0.79296875, + "learning_rate": 3.2275e-06, + "loss": 0.1499, + "step": 6710 + }, + { + "epoch": 0.14752006585717226, + "grad_norm": 0.69921875, + "learning_rate": 3.2025000000000003e-06, + "loss": 0.1921, + "step": 6720 + }, + { + "epoch": 0.1477395897646978, + "grad_norm": 0.78125, + "learning_rate": 3.1775e-06, + "loss": 0.1811, + "step": 6730 + }, + { + "epoch": 0.14795911367222336, + "grad_norm": 0.8515625, + "learning_rate": 3.1525e-06, + "loss": 0.176, + "step": 6740 + }, + { + "epoch": 0.14817863757974892, + "grad_norm": 0.8828125, + "learning_rate": 3.1275e-06, + "loss": 0.2066, + "step": 6750 + }, + { + "epoch": 0.14839816148727447, + "grad_norm": 0.5859375, + "learning_rate": 3.1025000000000004e-06, + "loss": 0.1424, + "step": 6760 + }, + { + "epoch": 0.14861768539480003, + "grad_norm": 0.65234375, + "learning_rate": 3.0775000000000006e-06, + "loss": 0.1997, + "step": 6770 + }, + { + "epoch": 0.14883720930232558, + "grad_norm": 0.7890625, + "learning_rate": 3.0525000000000004e-06, + "loss": 0.1976, + "step": 6780 + }, + { + "epoch": 0.14905673320985113, + "grad_norm": 0.640625, + "learning_rate": 3.0275000000000002e-06, + "loss": 0.1596, + "step": 6790 + }, + { + "epoch": 0.1492762571173767, + "grad_norm": 0.63671875, + "learning_rate": 3.0025000000000005e-06, + "loss": 0.1694, + "step": 6800 + }, + { + "epoch": 0.14949578102490224, + "grad_norm": 0.8828125, + "learning_rate": 2.9775000000000003e-06, + "loss": 0.1774, + "step": 6810 + }, + { + "epoch": 0.1497153049324278, + "grad_norm": 0.90234375, + "learning_rate": 2.9525000000000005e-06, + "loss": 0.1849, + "step": 6820 + }, + { + "epoch": 0.14993482883995335, + "grad_norm": 0.69140625, + "learning_rate": 2.9275000000000003e-06, + "loss": 0.2215, + "step": 6830 + }, + { + "epoch": 0.1501543527474789, + "grad_norm": 0.7578125, + "learning_rate": 2.9025e-06, + "loss": 0.1916, + "step": 6840 + }, + { + "epoch": 0.15037387665500446, + "grad_norm": 1.109375, + "learning_rate": 2.8775000000000004e-06, + "loss": 0.185, + "step": 6850 + }, + { + "epoch": 0.15059340056253, + "grad_norm": 0.81640625, + "learning_rate": 2.8525000000000002e-06, + "loss": 0.1826, + "step": 6860 + }, + { + "epoch": 0.15081292447005556, + "grad_norm": 0.78515625, + "learning_rate": 2.8275e-06, + "loss": 0.1935, + "step": 6870 + }, + { + "epoch": 0.15103244837758112, + "grad_norm": 0.765625, + "learning_rate": 2.8025000000000003e-06, + "loss": 0.1683, + "step": 6880 + }, + { + "epoch": 0.15125197228510667, + "grad_norm": 1.078125, + "learning_rate": 2.7775e-06, + "loss": 0.2083, + "step": 6890 + }, + { + "epoch": 0.15147149619263223, + "grad_norm": 0.87109375, + "learning_rate": 2.7525000000000003e-06, + "loss": 0.1656, + "step": 6900 + }, + { + "epoch": 0.15169102010015778, + "grad_norm": 0.62890625, + "learning_rate": 2.7275e-06, + "loss": 0.1748, + "step": 6910 + }, + { + "epoch": 0.15191054400768333, + "grad_norm": 0.7578125, + "learning_rate": 2.7025e-06, + "loss": 0.2087, + "step": 6920 + }, + { + "epoch": 0.1521300679152089, + "grad_norm": 0.7421875, + "learning_rate": 2.6775e-06, + "loss": 0.1721, + "step": 6930 + }, + { + "epoch": 0.15234959182273444, + "grad_norm": 0.86328125, + "learning_rate": 2.6525e-06, + "loss": 0.2098, + "step": 6940 + }, + { + "epoch": 0.15256911573026, + "grad_norm": 0.921875, + "learning_rate": 2.6275000000000003e-06, + "loss": 0.1765, + "step": 6950 + }, + { + "epoch": 0.15278863963778555, + "grad_norm": 0.640625, + "learning_rate": 2.6025e-06, + "loss": 0.1839, + "step": 6960 + }, + { + "epoch": 0.1530081635453111, + "grad_norm": 0.6953125, + "learning_rate": 2.5775e-06, + "loss": 0.1648, + "step": 6970 + }, + { + "epoch": 0.15322768745283666, + "grad_norm": 0.50390625, + "learning_rate": 2.5525e-06, + "loss": 0.1808, + "step": 6980 + }, + { + "epoch": 0.1534472113603622, + "grad_norm": 0.68359375, + "learning_rate": 2.5275e-06, + "loss": 0.1903, + "step": 6990 + }, + { + "epoch": 0.15366673526788777, + "grad_norm": 0.61328125, + "learning_rate": 2.5024999999999998e-06, + "loss": 0.1867, + "step": 7000 + }, + { + "epoch": 0.15388625917541332, + "grad_norm": 0.70703125, + "learning_rate": 2.4775e-06, + "loss": 0.1942, + "step": 7010 + }, + { + "epoch": 0.15410578308293887, + "grad_norm": 0.73828125, + "learning_rate": 2.4525000000000002e-06, + "loss": 0.1753, + "step": 7020 + }, + { + "epoch": 0.15432530699046443, + "grad_norm": 0.6796875, + "learning_rate": 2.4275e-06, + "loss": 0.1916, + "step": 7030 + }, + { + "epoch": 0.15454483089798998, + "grad_norm": 0.765625, + "learning_rate": 2.4025000000000003e-06, + "loss": 0.1735, + "step": 7040 + }, + { + "epoch": 0.15476435480551554, + "grad_norm": 0.78515625, + "learning_rate": 2.3775e-06, + "loss": 0.1675, + "step": 7050 + }, + { + "epoch": 0.1549838787130411, + "grad_norm": 0.7265625, + "learning_rate": 2.3525e-06, + "loss": 0.176, + "step": 7060 + }, + { + "epoch": 0.15520340262056664, + "grad_norm": 0.81640625, + "learning_rate": 2.3275e-06, + "loss": 0.1785, + "step": 7070 + }, + { + "epoch": 0.1554229265280922, + "grad_norm": 0.82421875, + "learning_rate": 2.3025000000000004e-06, + "loss": 0.1981, + "step": 7080 + }, + { + "epoch": 0.15564245043561775, + "grad_norm": 0.7109375, + "learning_rate": 2.2775000000000002e-06, + "loss": 0.2026, + "step": 7090 + }, + { + "epoch": 0.1558619743431433, + "grad_norm": 0.7109375, + "learning_rate": 2.2525e-06, + "loss": 0.1676, + "step": 7100 + }, + { + "epoch": 0.15608149825066886, + "grad_norm": 1.0546875, + "learning_rate": 2.2275000000000003e-06, + "loss": 0.1657, + "step": 7110 + }, + { + "epoch": 0.1563010221581944, + "grad_norm": 0.6953125, + "learning_rate": 2.2025e-06, + "loss": 0.1702, + "step": 7120 + }, + { + "epoch": 0.15652054606571997, + "grad_norm": 0.7578125, + "learning_rate": 2.1775000000000003e-06, + "loss": 0.1788, + "step": 7130 + }, + { + "epoch": 0.15674006997324552, + "grad_norm": 0.671875, + "learning_rate": 2.1525e-06, + "loss": 0.1713, + "step": 7140 + }, + { + "epoch": 0.15695959388077108, + "grad_norm": 0.5078125, + "learning_rate": 2.1275e-06, + "loss": 0.1754, + "step": 7150 + }, + { + "epoch": 0.15717911778829663, + "grad_norm": 0.8046875, + "learning_rate": 2.1025e-06, + "loss": 0.1924, + "step": 7160 + }, + { + "epoch": 0.15739864169582218, + "grad_norm": 0.6328125, + "learning_rate": 2.0775e-06, + "loss": 0.1997, + "step": 7170 + }, + { + "epoch": 0.15761816560334774, + "grad_norm": 0.74609375, + "learning_rate": 2.0525000000000003e-06, + "loss": 0.1917, + "step": 7180 + }, + { + "epoch": 0.1578376895108733, + "grad_norm": 0.85546875, + "learning_rate": 2.0275000000000005e-06, + "loss": 0.2014, + "step": 7190 + }, + { + "epoch": 0.15805721341839885, + "grad_norm": 0.8125, + "learning_rate": 2.0025000000000003e-06, + "loss": 0.1756, + "step": 7200 + }, + { + "epoch": 0.1582767373259244, + "grad_norm": 0.62890625, + "learning_rate": 1.9775e-06, + "loss": 0.1767, + "step": 7210 + }, + { + "epoch": 0.15849626123344995, + "grad_norm": 0.84765625, + "learning_rate": 1.9525000000000004e-06, + "loss": 0.1863, + "step": 7220 + }, + { + "epoch": 0.1587157851409755, + "grad_norm": 1.0234375, + "learning_rate": 1.9275e-06, + "loss": 0.2036, + "step": 7230 + }, + { + "epoch": 0.15893530904850106, + "grad_norm": 0.84765625, + "learning_rate": 1.9025000000000002e-06, + "loss": 0.1922, + "step": 7240 + }, + { + "epoch": 0.15915483295602662, + "grad_norm": 1.1171875, + "learning_rate": 1.8775000000000002e-06, + "loss": 0.1937, + "step": 7250 + }, + { + "epoch": 0.15937435686355217, + "grad_norm": 0.828125, + "learning_rate": 1.8525e-06, + "loss": 0.2084, + "step": 7260 + }, + { + "epoch": 0.15959388077107772, + "grad_norm": 0.87890625, + "learning_rate": 1.8275e-06, + "loss": 0.2239, + "step": 7270 + }, + { + "epoch": 0.15981340467860328, + "grad_norm": 0.6640625, + "learning_rate": 1.8025000000000001e-06, + "loss": 0.1826, + "step": 7280 + }, + { + "epoch": 0.16003292858612883, + "grad_norm": 0.93359375, + "learning_rate": 1.7775000000000001e-06, + "loss": 0.1847, + "step": 7290 + }, + { + "epoch": 0.16025245249365438, + "grad_norm": 0.83203125, + "learning_rate": 1.7525e-06, + "loss": 0.2061, + "step": 7300 + }, + { + "epoch": 0.16047197640117994, + "grad_norm": 0.69921875, + "learning_rate": 1.7275e-06, + "loss": 0.1872, + "step": 7310 + }, + { + "epoch": 0.1606915003087055, + "grad_norm": 0.66015625, + "learning_rate": 1.7025000000000002e-06, + "loss": 0.1826, + "step": 7320 + }, + { + "epoch": 0.16091102421623105, + "grad_norm": 1.109375, + "learning_rate": 1.6775000000000002e-06, + "loss": 0.1821, + "step": 7330 + }, + { + "epoch": 0.1611305481237566, + "grad_norm": 0.7734375, + "learning_rate": 1.6525000000000003e-06, + "loss": 0.1842, + "step": 7340 + }, + { + "epoch": 0.16135007203128215, + "grad_norm": 0.62109375, + "learning_rate": 1.6275e-06, + "loss": 0.1754, + "step": 7350 + }, + { + "epoch": 0.1615695959388077, + "grad_norm": 0.828125, + "learning_rate": 1.6025000000000001e-06, + "loss": 0.1928, + "step": 7360 + }, + { + "epoch": 0.16178911984633326, + "grad_norm": 0.91015625, + "learning_rate": 1.5775000000000001e-06, + "loss": 0.1871, + "step": 7370 + }, + { + "epoch": 0.16200864375385882, + "grad_norm": 0.859375, + "learning_rate": 1.5525000000000002e-06, + "loss": 0.2064, + "step": 7380 + }, + { + "epoch": 0.16222816766138437, + "grad_norm": 1.1484375, + "learning_rate": 1.5275000000000002e-06, + "loss": 0.2008, + "step": 7390 + }, + { + "epoch": 0.16244769156890992, + "grad_norm": 0.6875, + "learning_rate": 1.5025e-06, + "loss": 0.1788, + "step": 7400 + }, + { + "epoch": 0.16266721547643548, + "grad_norm": 0.8828125, + "learning_rate": 1.4775e-06, + "loss": 0.1762, + "step": 7410 + }, + { + "epoch": 0.16288673938396103, + "grad_norm": 0.79296875, + "learning_rate": 1.4525e-06, + "loss": 0.1807, + "step": 7420 + }, + { + "epoch": 0.1631062632914866, + "grad_norm": 0.82421875, + "learning_rate": 1.4275e-06, + "loss": 0.2052, + "step": 7430 + }, + { + "epoch": 0.16332578719901214, + "grad_norm": 0.7578125, + "learning_rate": 1.4025000000000003e-06, + "loss": 0.1669, + "step": 7440 + }, + { + "epoch": 0.1635453111065377, + "grad_norm": 0.7578125, + "learning_rate": 1.3775000000000002e-06, + "loss": 0.1858, + "step": 7450 + }, + { + "epoch": 0.16376483501406325, + "grad_norm": 0.6953125, + "learning_rate": 1.3525000000000002e-06, + "loss": 0.1636, + "step": 7460 + }, + { + "epoch": 0.1639843589215888, + "grad_norm": 0.72265625, + "learning_rate": 1.3275000000000002e-06, + "loss": 0.1912, + "step": 7470 + }, + { + "epoch": 0.16420388282911436, + "grad_norm": 0.83984375, + "learning_rate": 1.3025000000000002e-06, + "loss": 0.2127, + "step": 7480 + }, + { + "epoch": 0.1644234067366399, + "grad_norm": 0.80859375, + "learning_rate": 1.2775e-06, + "loss": 0.1856, + "step": 7490 + }, + { + "epoch": 0.16464293064416546, + "grad_norm": 0.72265625, + "learning_rate": 1.2525e-06, + "loss": 0.1888, + "step": 7500 + } + ], + "logging_steps": 10, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4314080978858947e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}