{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2001067235859125, "eval_steps": 500, "global_step": 2250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008893632159373888, "grad_norm": 68.5, "learning_rate": 2.7e-06, "loss": 8.8776, "step": 10 }, { "epoch": 0.0017787264318747777, "grad_norm": 39.0, "learning_rate": 5.7000000000000005e-06, "loss": 7.711, "step": 20 }, { "epoch": 0.0026680896478121665, "grad_norm": 29.375, "learning_rate": 8.7e-06, "loss": 5.7038, "step": 30 }, { "epoch": 0.0035574528637495554, "grad_norm": 24.5, "learning_rate": 1.1700000000000001e-05, "loss": 4.746, "step": 40 }, { "epoch": 0.004446816079686944, "grad_norm": 22.625, "learning_rate": 1.47e-05, "loss": 4.1094, "step": 50 }, { "epoch": 0.005336179295624333, "grad_norm": 21.625, "learning_rate": 1.77e-05, "loss": 4.0746, "step": 60 }, { "epoch": 0.0062255425115617215, "grad_norm": 25.25, "learning_rate": 2.07e-05, "loss": 3.8396, "step": 70 }, { "epoch": 0.007114905727499111, "grad_norm": 20.875, "learning_rate": 2.37e-05, "loss": 3.8629, "step": 80 }, { "epoch": 0.0080042689434365, "grad_norm": 22.875, "learning_rate": 2.6700000000000002e-05, "loss": 3.9097, "step": 90 }, { "epoch": 0.008893632159373888, "grad_norm": 19.375, "learning_rate": 2.97e-05, "loss": 3.763, "step": 100 }, { "epoch": 0.009782995375311278, "grad_norm": 19.625, "learning_rate": 2.9874418604651165e-05, "loss": 4.1501, "step": 110 }, { "epoch": 0.010672358591248666, "grad_norm": 19.0, "learning_rate": 2.9734883720930235e-05, "loss": 3.9038, "step": 120 }, { "epoch": 0.011561721807186055, "grad_norm": 16.625, "learning_rate": 2.9595348837209305e-05, "loss": 4.0583, "step": 130 }, { "epoch": 0.012451085023123443, "grad_norm": 16.375, "learning_rate": 2.9455813953488376e-05, "loss": 3.5958, "step": 140 }, { "epoch": 0.013340448239060833, "grad_norm": 15.375, "learning_rate": 2.9316279069767443e-05, "loss": 3.9075, "step": 150 }, { "epoch": 0.014229811454998222, "grad_norm": 15.625, "learning_rate": 2.9176744186046513e-05, "loss": 3.9495, "step": 160 }, { "epoch": 0.01511917467093561, "grad_norm": 18.125, "learning_rate": 2.9037209302325583e-05, "loss": 3.3247, "step": 170 }, { "epoch": 0.016008537886873, "grad_norm": 18.75, "learning_rate": 2.889767441860465e-05, "loss": 3.6863, "step": 180 }, { "epoch": 0.01689790110281039, "grad_norm": 15.625, "learning_rate": 2.875813953488372e-05, "loss": 3.3097, "step": 190 }, { "epoch": 0.017787264318747775, "grad_norm": 14.3125, "learning_rate": 2.861860465116279e-05, "loss": 3.48, "step": 200 }, { "epoch": 0.018676627534685165, "grad_norm": 17.875, "learning_rate": 2.847906976744186e-05, "loss": 3.6216, "step": 210 }, { "epoch": 0.019565990750622556, "grad_norm": 15.4375, "learning_rate": 2.833953488372093e-05, "loss": 3.4751, "step": 220 }, { "epoch": 0.020455353966559942, "grad_norm": 14.75, "learning_rate": 2.8199999999999998e-05, "loss": 3.9, "step": 230 }, { "epoch": 0.021344717182497332, "grad_norm": 14.8125, "learning_rate": 2.8060465116279068e-05, "loss": 3.7454, "step": 240 }, { "epoch": 0.02223408039843472, "grad_norm": 15.6875, "learning_rate": 2.7920930232558138e-05, "loss": 3.5053, "step": 250 }, { "epoch": 0.02312344361437211, "grad_norm": 14.5, "learning_rate": 2.778139534883721e-05, "loss": 3.3656, "step": 260 }, { "epoch": 0.0240128068303095, "grad_norm": 13.6875, "learning_rate": 2.764186046511628e-05, "loss": 3.769, "step": 270 }, { "epoch": 0.024902170046246886, "grad_norm": 15.0625, "learning_rate": 2.750232558139535e-05, "loss": 3.5721, "step": 280 }, { "epoch": 0.025791533262184276, "grad_norm": 14.6875, "learning_rate": 2.736279069767442e-05, "loss": 3.6104, "step": 290 }, { "epoch": 0.026680896478121666, "grad_norm": 13.875, "learning_rate": 2.722325581395349e-05, "loss": 3.5904, "step": 300 }, { "epoch": 0.027570259694059053, "grad_norm": 13.8125, "learning_rate": 2.7083720930232556e-05, "loss": 3.5633, "step": 310 }, { "epoch": 0.028459622909996443, "grad_norm": 13.5625, "learning_rate": 2.6944186046511626e-05, "loss": 3.3366, "step": 320 }, { "epoch": 0.02934898612593383, "grad_norm": 14.3125, "learning_rate": 2.6804651162790697e-05, "loss": 3.6519, "step": 330 }, { "epoch": 0.03023834934187122, "grad_norm": 14.4375, "learning_rate": 2.6665116279069767e-05, "loss": 3.4482, "step": 340 }, { "epoch": 0.03112771255780861, "grad_norm": 13.9375, "learning_rate": 2.6525581395348837e-05, "loss": 3.3554, "step": 350 }, { "epoch": 0.032017075773746, "grad_norm": 14.5625, "learning_rate": 2.6386046511627907e-05, "loss": 3.4008, "step": 360 }, { "epoch": 0.03290643898968339, "grad_norm": 14.375, "learning_rate": 2.6246511627906978e-05, "loss": 3.3279, "step": 370 }, { "epoch": 0.03379580220562078, "grad_norm": 13.9375, "learning_rate": 2.6106976744186048e-05, "loss": 3.487, "step": 380 }, { "epoch": 0.03468516542155817, "grad_norm": 14.625, "learning_rate": 2.5967441860465115e-05, "loss": 3.2542, "step": 390 }, { "epoch": 0.03557452863749555, "grad_norm": 13.1875, "learning_rate": 2.5827906976744185e-05, "loss": 3.3328, "step": 400 }, { "epoch": 0.03646389185343294, "grad_norm": 13.25, "learning_rate": 2.5688372093023255e-05, "loss": 3.5205, "step": 410 }, { "epoch": 0.03735325506937033, "grad_norm": 11.9375, "learning_rate": 2.5548837209302325e-05, "loss": 3.4151, "step": 420 }, { "epoch": 0.03824261828530772, "grad_norm": 13.625, "learning_rate": 2.5409302325581396e-05, "loss": 3.1404, "step": 430 }, { "epoch": 0.03913198150124511, "grad_norm": 14.0625, "learning_rate": 2.5269767441860466e-05, "loss": 3.5277, "step": 440 }, { "epoch": 0.040021344717182494, "grad_norm": 13.1875, "learning_rate": 2.5130232558139536e-05, "loss": 3.1584, "step": 450 }, { "epoch": 0.040910707933119884, "grad_norm": 13.4375, "learning_rate": 2.4990697674418606e-05, "loss": 3.4669, "step": 460 }, { "epoch": 0.041800071149057275, "grad_norm": 14.5625, "learning_rate": 2.4851162790697673e-05, "loss": 3.2653, "step": 470 }, { "epoch": 0.042689434364994665, "grad_norm": 13.9375, "learning_rate": 2.4711627906976743e-05, "loss": 3.4219, "step": 480 }, { "epoch": 0.043578797580932055, "grad_norm": 12.875, "learning_rate": 2.4572093023255814e-05, "loss": 3.508, "step": 490 }, { "epoch": 0.04446816079686944, "grad_norm": 14.0625, "learning_rate": 2.4432558139534884e-05, "loss": 3.3343, "step": 500 }, { "epoch": 0.04535752401280683, "grad_norm": 13.75, "learning_rate": 2.4293023255813954e-05, "loss": 3.5054, "step": 510 }, { "epoch": 0.04624688722874422, "grad_norm": 15.25, "learning_rate": 2.4153488372093024e-05, "loss": 3.0298, "step": 520 }, { "epoch": 0.04713625044468161, "grad_norm": 13.75, "learning_rate": 2.4013953488372095e-05, "loss": 3.4306, "step": 530 }, { "epoch": 0.048025613660619, "grad_norm": 14.25, "learning_rate": 2.3874418604651165e-05, "loss": 3.1824, "step": 540 }, { "epoch": 0.04891497687655639, "grad_norm": 12.3125, "learning_rate": 2.373488372093023e-05, "loss": 3.211, "step": 550 }, { "epoch": 0.04980434009249377, "grad_norm": 13.75, "learning_rate": 2.3595348837209302e-05, "loss": 3.091, "step": 560 }, { "epoch": 0.05069370330843116, "grad_norm": 13.75, "learning_rate": 2.3455813953488372e-05, "loss": 3.234, "step": 570 }, { "epoch": 0.05158306652436855, "grad_norm": 13.75, "learning_rate": 2.3316279069767442e-05, "loss": 3.1404, "step": 580 }, { "epoch": 0.05247242974030594, "grad_norm": 13.25, "learning_rate": 2.3176744186046513e-05, "loss": 3.2191, "step": 590 }, { "epoch": 0.05336179295624333, "grad_norm": 12.625, "learning_rate": 2.3037209302325583e-05, "loss": 3.0968, "step": 600 }, { "epoch": 0.054251156172180716, "grad_norm": 13.5625, "learning_rate": 2.2897674418604653e-05, "loss": 3.1108, "step": 610 }, { "epoch": 0.055140519388118106, "grad_norm": 12.75, "learning_rate": 2.2758139534883723e-05, "loss": 3.028, "step": 620 }, { "epoch": 0.056029882604055496, "grad_norm": 12.375, "learning_rate": 2.261860465116279e-05, "loss": 3.1469, "step": 630 }, { "epoch": 0.056919245819992886, "grad_norm": 14.8125, "learning_rate": 2.247906976744186e-05, "loss": 3.1258, "step": 640 }, { "epoch": 0.057808609035930276, "grad_norm": 12.5625, "learning_rate": 2.233953488372093e-05, "loss": 3.1147, "step": 650 }, { "epoch": 0.05869797225186766, "grad_norm": 13.1875, "learning_rate": 2.22e-05, "loss": 3.0447, "step": 660 }, { "epoch": 0.05958733546780505, "grad_norm": 15.3125, "learning_rate": 2.206046511627907e-05, "loss": 3.1177, "step": 670 }, { "epoch": 0.06047669868374244, "grad_norm": 13.6875, "learning_rate": 2.192093023255814e-05, "loss": 3.2542, "step": 680 }, { "epoch": 0.06136606189967983, "grad_norm": 13.1875, "learning_rate": 2.178139534883721e-05, "loss": 3.188, "step": 690 }, { "epoch": 0.06225542511561722, "grad_norm": 13.625, "learning_rate": 2.1641860465116282e-05, "loss": 3.2298, "step": 700 }, { "epoch": 0.06314478833155461, "grad_norm": 14.0625, "learning_rate": 2.150232558139535e-05, "loss": 3.1127, "step": 710 }, { "epoch": 0.064034151547492, "grad_norm": 15.1875, "learning_rate": 2.136279069767442e-05, "loss": 3.1468, "step": 720 }, { "epoch": 0.06492351476342939, "grad_norm": 12.625, "learning_rate": 2.122325581395349e-05, "loss": 3.2763, "step": 730 }, { "epoch": 0.06581287797936677, "grad_norm": 13.875, "learning_rate": 2.108372093023256e-05, "loss": 3.051, "step": 740 }, { "epoch": 0.06670224119530416, "grad_norm": 13.9375, "learning_rate": 2.094418604651163e-05, "loss": 3.1937, "step": 750 }, { "epoch": 0.06759160441124155, "grad_norm": 13.3125, "learning_rate": 2.08046511627907e-05, "loss": 3.0506, "step": 760 }, { "epoch": 0.06848096762717894, "grad_norm": 12.625, "learning_rate": 2.066511627906977e-05, "loss": 2.975, "step": 770 }, { "epoch": 0.06937033084311633, "grad_norm": 12.75, "learning_rate": 2.052558139534884e-05, "loss": 2.9174, "step": 780 }, { "epoch": 0.07025969405905372, "grad_norm": 12.875, "learning_rate": 2.0386046511627907e-05, "loss": 3.0532, "step": 790 }, { "epoch": 0.0711490572749911, "grad_norm": 12.4375, "learning_rate": 2.0246511627906977e-05, "loss": 3.2092, "step": 800 }, { "epoch": 0.0720384204909285, "grad_norm": 12.625, "learning_rate": 2.0106976744186048e-05, "loss": 2.9702, "step": 810 }, { "epoch": 0.07292778370686588, "grad_norm": 14.0625, "learning_rate": 1.9967441860465118e-05, "loss": 3.0595, "step": 820 }, { "epoch": 0.07381714692280328, "grad_norm": 14.5, "learning_rate": 1.9827906976744188e-05, "loss": 3.0102, "step": 830 }, { "epoch": 0.07470651013874066, "grad_norm": 13.25, "learning_rate": 1.968837209302326e-05, "loss": 3.0725, "step": 840 }, { "epoch": 0.07559587335467804, "grad_norm": 13.6875, "learning_rate": 1.954883720930233e-05, "loss": 3.0679, "step": 850 }, { "epoch": 0.07648523657061544, "grad_norm": 13.1875, "learning_rate": 1.94093023255814e-05, "loss": 3.0764, "step": 860 }, { "epoch": 0.07737459978655283, "grad_norm": 14.5, "learning_rate": 1.9269767441860466e-05, "loss": 2.768, "step": 870 }, { "epoch": 0.07826396300249022, "grad_norm": 14.875, "learning_rate": 1.9130232558139536e-05, "loss": 3.2016, "step": 880 }, { "epoch": 0.0791533262184276, "grad_norm": 12.875, "learning_rate": 1.8990697674418606e-05, "loss": 3.0906, "step": 890 }, { "epoch": 0.08004268943436499, "grad_norm": 13.0, "learning_rate": 1.8851162790697673e-05, "loss": 3.1441, "step": 900 }, { "epoch": 0.08093205265030239, "grad_norm": 13.25, "learning_rate": 1.8711627906976743e-05, "loss": 2.9791, "step": 910 }, { "epoch": 0.08182141586623977, "grad_norm": 14.0625, "learning_rate": 1.8572093023255814e-05, "loss": 2.8393, "step": 920 }, { "epoch": 0.08271077908217717, "grad_norm": 13.1875, "learning_rate": 1.8432558139534884e-05, "loss": 2.9332, "step": 930 }, { "epoch": 0.08360014229811455, "grad_norm": 13.0625, "learning_rate": 1.8293023255813954e-05, "loss": 2.9787, "step": 940 }, { "epoch": 0.08448950551405193, "grad_norm": 15.375, "learning_rate": 1.815348837209302e-05, "loss": 2.8229, "step": 950 }, { "epoch": 0.08537886872998933, "grad_norm": 13.5625, "learning_rate": 1.801395348837209e-05, "loss": 2.8112, "step": 960 }, { "epoch": 0.08626823194592671, "grad_norm": 16.75, "learning_rate": 1.787441860465116e-05, "loss": 3.0131, "step": 970 }, { "epoch": 0.08715759516186411, "grad_norm": 13.9375, "learning_rate": 1.773488372093023e-05, "loss": 2.9496, "step": 980 }, { "epoch": 0.08804695837780149, "grad_norm": 15.0625, "learning_rate": 1.7595348837209302e-05, "loss": 3.0641, "step": 990 }, { "epoch": 0.08893632159373888, "grad_norm": 13.0, "learning_rate": 1.7455813953488372e-05, "loss": 2.8714, "step": 1000 }, { "epoch": 0.08982568480967627, "grad_norm": 13.8125, "learning_rate": 1.7316279069767442e-05, "loss": 2.7792, "step": 1010 }, { "epoch": 0.09071504802561366, "grad_norm": 13.875, "learning_rate": 1.7176744186046512e-05, "loss": 2.9472, "step": 1020 }, { "epoch": 0.09160441124155105, "grad_norm": 14.0, "learning_rate": 1.703720930232558e-05, "loss": 2.5662, "step": 1030 }, { "epoch": 0.09249377445748844, "grad_norm": 13.9375, "learning_rate": 1.689767441860465e-05, "loss": 2.82, "step": 1040 }, { "epoch": 0.09338313767342583, "grad_norm": 13.875, "learning_rate": 1.675813953488372e-05, "loss": 2.8245, "step": 1050 }, { "epoch": 0.09427250088936322, "grad_norm": 11.8125, "learning_rate": 1.661860465116279e-05, "loss": 2.8141, "step": 1060 }, { "epoch": 0.0951618641053006, "grad_norm": 13.1875, "learning_rate": 1.647906976744186e-05, "loss": 2.8975, "step": 1070 }, { "epoch": 0.096051227321238, "grad_norm": 14.625, "learning_rate": 1.633953488372093e-05, "loss": 2.7996, "step": 1080 }, { "epoch": 0.09694059053717538, "grad_norm": 12.625, "learning_rate": 1.62e-05, "loss": 2.7679, "step": 1090 }, { "epoch": 0.09782995375311278, "grad_norm": 14.0625, "learning_rate": 1.606046511627907e-05, "loss": 3.0057, "step": 1100 }, { "epoch": 0.09871931696905016, "grad_norm": 13.6875, "learning_rate": 1.5920930232558138e-05, "loss": 2.5742, "step": 1110 }, { "epoch": 0.09960868018498754, "grad_norm": 15.3125, "learning_rate": 1.5781395348837208e-05, "loss": 2.7903, "step": 1120 }, { "epoch": 0.10049804340092494, "grad_norm": 14.8125, "learning_rate": 1.564186046511628e-05, "loss": 2.5664, "step": 1130 }, { "epoch": 0.10138740661686232, "grad_norm": 13.8125, "learning_rate": 1.550232558139535e-05, "loss": 2.9392, "step": 1140 }, { "epoch": 0.10227676983279972, "grad_norm": 13.9375, "learning_rate": 1.536279069767442e-05, "loss": 2.7105, "step": 1150 }, { "epoch": 0.1031661330487371, "grad_norm": 14.5, "learning_rate": 1.5223255813953489e-05, "loss": 2.9472, "step": 1160 }, { "epoch": 0.10405549626467449, "grad_norm": 13.4375, "learning_rate": 1.508372093023256e-05, "loss": 2.7698, "step": 1170 }, { "epoch": 0.10494485948061189, "grad_norm": 13.125, "learning_rate": 1.4944186046511628e-05, "loss": 2.8998, "step": 1180 }, { "epoch": 0.10583422269654927, "grad_norm": 14.125, "learning_rate": 1.4804651162790698e-05, "loss": 2.7952, "step": 1190 }, { "epoch": 0.10672358591248667, "grad_norm": 14.875, "learning_rate": 1.4665116279069768e-05, "loss": 2.7689, "step": 1200 }, { "epoch": 0.10761294912842405, "grad_norm": 13.375, "learning_rate": 1.4525581395348837e-05, "loss": 3.0368, "step": 1210 }, { "epoch": 0.10850231234436143, "grad_norm": 14.5, "learning_rate": 1.4386046511627907e-05, "loss": 2.8113, "step": 1220 }, { "epoch": 0.10939167556029883, "grad_norm": 13.25, "learning_rate": 1.4246511627906977e-05, "loss": 2.5883, "step": 1230 }, { "epoch": 0.11028103877623621, "grad_norm": 14.25, "learning_rate": 1.4106976744186048e-05, "loss": 2.9207, "step": 1240 }, { "epoch": 0.11117040199217361, "grad_norm": 12.9375, "learning_rate": 1.3967441860465116e-05, "loss": 2.8662, "step": 1250 }, { "epoch": 0.11205976520811099, "grad_norm": 14.25, "learning_rate": 1.3827906976744186e-05, "loss": 2.8439, "step": 1260 }, { "epoch": 0.11294912842404838, "grad_norm": 13.1875, "learning_rate": 1.3688372093023257e-05, "loss": 2.8322, "step": 1270 }, { "epoch": 0.11383849163998577, "grad_norm": 13.6875, "learning_rate": 1.3548837209302327e-05, "loss": 2.8627, "step": 1280 }, { "epoch": 0.11472785485592316, "grad_norm": 12.8125, "learning_rate": 1.3409302325581395e-05, "loss": 2.733, "step": 1290 }, { "epoch": 0.11561721807186055, "grad_norm": 12.75, "learning_rate": 1.3269767441860466e-05, "loss": 2.7608, "step": 1300 }, { "epoch": 0.11650658128779794, "grad_norm": 14.4375, "learning_rate": 1.3130232558139536e-05, "loss": 2.7633, "step": 1310 }, { "epoch": 0.11739594450373532, "grad_norm": 13.875, "learning_rate": 1.2990697674418606e-05, "loss": 2.682, "step": 1320 }, { "epoch": 0.11828530771967272, "grad_norm": 14.4375, "learning_rate": 1.2851162790697675e-05, "loss": 2.7205, "step": 1330 }, { "epoch": 0.1191746709356101, "grad_norm": 12.6875, "learning_rate": 1.2711627906976745e-05, "loss": 2.7408, "step": 1340 }, { "epoch": 0.1200640341515475, "grad_norm": 13.5, "learning_rate": 1.2572093023255815e-05, "loss": 3.0955, "step": 1350 }, { "epoch": 0.12095339736748488, "grad_norm": 13.8125, "learning_rate": 1.2432558139534885e-05, "loss": 2.8928, "step": 1360 }, { "epoch": 0.12184276058342226, "grad_norm": 12.5625, "learning_rate": 1.2293023255813954e-05, "loss": 2.7429, "step": 1370 }, { "epoch": 0.12273212379935966, "grad_norm": 14.3125, "learning_rate": 1.2153488372093024e-05, "loss": 2.6795, "step": 1380 }, { "epoch": 0.12362148701529704, "grad_norm": 13.125, "learning_rate": 1.2013953488372094e-05, "loss": 2.6802, "step": 1390 }, { "epoch": 0.12451085023123444, "grad_norm": 13.8125, "learning_rate": 1.1874418604651165e-05, "loss": 2.6818, "step": 1400 }, { "epoch": 0.12540021344717184, "grad_norm": 13.875, "learning_rate": 1.1734883720930233e-05, "loss": 2.5968, "step": 1410 }, { "epoch": 0.12628957666310922, "grad_norm": 13.75, "learning_rate": 1.1595348837209303e-05, "loss": 2.7026, "step": 1420 }, { "epoch": 0.1271789398790466, "grad_norm": 13.5625, "learning_rate": 1.1455813953488372e-05, "loss": 2.8092, "step": 1430 }, { "epoch": 0.128068303094984, "grad_norm": 13.9375, "learning_rate": 1.1316279069767442e-05, "loss": 2.6957, "step": 1440 }, { "epoch": 0.12895766631092137, "grad_norm": 14.0, "learning_rate": 1.117674418604651e-05, "loss": 2.8901, "step": 1450 }, { "epoch": 0.12984702952685878, "grad_norm": 14.3125, "learning_rate": 1.1037209302325581e-05, "loss": 2.6284, "step": 1460 }, { "epoch": 0.13073639274279616, "grad_norm": 13.875, "learning_rate": 1.0897674418604651e-05, "loss": 2.9493, "step": 1470 }, { "epoch": 0.13162575595873355, "grad_norm": 13.5625, "learning_rate": 1.0758139534883721e-05, "loss": 2.6665, "step": 1480 }, { "epoch": 0.13251511917467093, "grad_norm": 14.125, "learning_rate": 1.061860465116279e-05, "loss": 2.8124, "step": 1490 }, { "epoch": 0.13340448239060831, "grad_norm": 13.8125, "learning_rate": 1.047906976744186e-05, "loss": 2.8094, "step": 1500 }, { "epoch": 0.13429384560654573, "grad_norm": 14.3125, "learning_rate": 1.033953488372093e-05, "loss": 2.6246, "step": 1510 }, { "epoch": 0.1351832088224831, "grad_norm": 14.5625, "learning_rate": 1.02e-05, "loss": 2.8486, "step": 1520 }, { "epoch": 0.1360725720384205, "grad_norm": 13.625, "learning_rate": 1.0060465116279069e-05, "loss": 2.7712, "step": 1530 }, { "epoch": 0.13696193525435787, "grad_norm": 14.5625, "learning_rate": 9.92093023255814e-06, "loss": 2.7667, "step": 1540 }, { "epoch": 0.13785129847029526, "grad_norm": 14.375, "learning_rate": 9.78139534883721e-06, "loss": 2.6918, "step": 1550 }, { "epoch": 0.13874066168623267, "grad_norm": 14.1875, "learning_rate": 9.64186046511628e-06, "loss": 2.872, "step": 1560 }, { "epoch": 0.13963002490217005, "grad_norm": 14.375, "learning_rate": 9.502325581395348e-06, "loss": 2.9586, "step": 1570 }, { "epoch": 0.14051938811810744, "grad_norm": 13.8125, "learning_rate": 9.362790697674419e-06, "loss": 2.5803, "step": 1580 }, { "epoch": 0.14140875133404482, "grad_norm": 13.8125, "learning_rate": 9.223255813953489e-06, "loss": 2.72, "step": 1590 }, { "epoch": 0.1422981145499822, "grad_norm": 13.4375, "learning_rate": 9.083720930232559e-06, "loss": 2.7268, "step": 1600 }, { "epoch": 0.1431874777659196, "grad_norm": 14.0625, "learning_rate": 8.944186046511628e-06, "loss": 2.7176, "step": 1610 }, { "epoch": 0.144076840981857, "grad_norm": 15.125, "learning_rate": 8.804651162790698e-06, "loss": 2.6748, "step": 1620 }, { "epoch": 0.14496620419779438, "grad_norm": 15.25, "learning_rate": 8.665116279069768e-06, "loss": 2.7349, "step": 1630 }, { "epoch": 0.14585556741373176, "grad_norm": 15.0625, "learning_rate": 8.525581395348838e-06, "loss": 2.729, "step": 1640 }, { "epoch": 0.14674493062966915, "grad_norm": 14.5625, "learning_rate": 8.386046511627907e-06, "loss": 2.6542, "step": 1650 }, { "epoch": 0.14763429384560656, "grad_norm": 14.625, "learning_rate": 8.246511627906977e-06, "loss": 2.6102, "step": 1660 }, { "epoch": 0.14852365706154394, "grad_norm": 13.8125, "learning_rate": 8.106976744186047e-06, "loss": 2.7355, "step": 1670 }, { "epoch": 0.14941302027748132, "grad_norm": 14.1875, "learning_rate": 7.967441860465118e-06, "loss": 2.7689, "step": 1680 }, { "epoch": 0.1503023834934187, "grad_norm": 14.3125, "learning_rate": 7.827906976744186e-06, "loss": 2.7373, "step": 1690 }, { "epoch": 0.1511917467093561, "grad_norm": 15.25, "learning_rate": 7.688372093023256e-06, "loss": 2.7359, "step": 1700 }, { "epoch": 0.1520811099252935, "grad_norm": 15.0, "learning_rate": 7.548837209302326e-06, "loss": 2.6088, "step": 1710 }, { "epoch": 0.15297047314123088, "grad_norm": 14.4375, "learning_rate": 7.409302325581395e-06, "loss": 2.7727, "step": 1720 }, { "epoch": 0.15385983635716827, "grad_norm": 13.1875, "learning_rate": 7.269767441860465e-06, "loss": 2.8457, "step": 1730 }, { "epoch": 0.15474919957310565, "grad_norm": 14.0625, "learning_rate": 7.130232558139535e-06, "loss": 2.6671, "step": 1740 }, { "epoch": 0.15563856278904303, "grad_norm": 14.0, "learning_rate": 6.990697674418605e-06, "loss": 2.593, "step": 1750 }, { "epoch": 0.15652792600498044, "grad_norm": 13.6875, "learning_rate": 6.851162790697674e-06, "loss": 2.6174, "step": 1760 }, { "epoch": 0.15741728922091783, "grad_norm": 14.375, "learning_rate": 6.711627906976745e-06, "loss": 2.6143, "step": 1770 }, { "epoch": 0.1583066524368552, "grad_norm": 12.875, "learning_rate": 6.572093023255814e-06, "loss": 2.7958, "step": 1780 }, { "epoch": 0.1591960156527926, "grad_norm": 13.875, "learning_rate": 6.432558139534884e-06, "loss": 2.6567, "step": 1790 }, { "epoch": 0.16008537886872998, "grad_norm": 14.625, "learning_rate": 6.293023255813954e-06, "loss": 2.5427, "step": 1800 }, { "epoch": 0.1609747420846674, "grad_norm": 14.1875, "learning_rate": 6.153488372093024e-06, "loss": 2.6013, "step": 1810 }, { "epoch": 0.16186410530060477, "grad_norm": 14.8125, "learning_rate": 6.013953488372093e-06, "loss": 2.6624, "step": 1820 }, { "epoch": 0.16275346851654215, "grad_norm": 14.625, "learning_rate": 5.8744186046511635e-06, "loss": 2.6861, "step": 1830 }, { "epoch": 0.16364283173247954, "grad_norm": 12.875, "learning_rate": 5.734883720930233e-06, "loss": 2.7337, "step": 1840 }, { "epoch": 0.16453219494841692, "grad_norm": 13.25, "learning_rate": 5.595348837209303e-06, "loss": 2.532, "step": 1850 }, { "epoch": 0.16542155816435433, "grad_norm": 14.0625, "learning_rate": 5.4558139534883726e-06, "loss": 2.5045, "step": 1860 }, { "epoch": 0.16631092138029172, "grad_norm": 14.5625, "learning_rate": 5.316279069767443e-06, "loss": 2.8413, "step": 1870 }, { "epoch": 0.1672002845962291, "grad_norm": 14.8125, "learning_rate": 5.176744186046511e-06, "loss": 2.6274, "step": 1880 }, { "epoch": 0.16808964781216648, "grad_norm": 13.5625, "learning_rate": 5.0372093023255816e-06, "loss": 2.5649, "step": 1890 }, { "epoch": 0.16897901102810386, "grad_norm": 15.0625, "learning_rate": 4.897674418604651e-06, "loss": 2.7143, "step": 1900 }, { "epoch": 0.16986837424404128, "grad_norm": 15.125, "learning_rate": 4.758139534883721e-06, "loss": 2.642, "step": 1910 }, { "epoch": 0.17075773745997866, "grad_norm": 12.875, "learning_rate": 4.618604651162791e-06, "loss": 2.8059, "step": 1920 }, { "epoch": 0.17164710067591604, "grad_norm": 13.8125, "learning_rate": 4.479069767441861e-06, "loss": 2.6971, "step": 1930 }, { "epoch": 0.17253646389185343, "grad_norm": 13.6875, "learning_rate": 4.33953488372093e-06, "loss": 2.6398, "step": 1940 }, { "epoch": 0.1734258271077908, "grad_norm": 16.375, "learning_rate": 4.2000000000000004e-06, "loss": 2.9306, "step": 1950 }, { "epoch": 0.17431519032372822, "grad_norm": 13.8125, "learning_rate": 4.06046511627907e-06, "loss": 2.7713, "step": 1960 }, { "epoch": 0.1752045535396656, "grad_norm": 13.125, "learning_rate": 3.92093023255814e-06, "loss": 2.7845, "step": 1970 }, { "epoch": 0.17609391675560299, "grad_norm": 13.5, "learning_rate": 3.7813953488372095e-06, "loss": 2.5285, "step": 1980 }, { "epoch": 0.17698327997154037, "grad_norm": 14.75, "learning_rate": 3.6418604651162793e-06, "loss": 2.7907, "step": 1990 }, { "epoch": 0.17787264318747775, "grad_norm": 14.25, "learning_rate": 3.502325581395349e-06, "loss": 2.6383, "step": 2000 }, { "epoch": 0.17876200640341516, "grad_norm": 13.4375, "learning_rate": 3.3627906976744185e-06, "loss": 2.62, "step": 2010 }, { "epoch": 0.17965136961935255, "grad_norm": 16.0, "learning_rate": 3.2232558139534883e-06, "loss": 2.6536, "step": 2020 }, { "epoch": 0.18054073283528993, "grad_norm": 13.3125, "learning_rate": 3.083720930232558e-06, "loss": 2.6178, "step": 2030 }, { "epoch": 0.1814300960512273, "grad_norm": 15.0, "learning_rate": 2.944186046511628e-06, "loss": 2.7065, "step": 2040 }, { "epoch": 0.1823194592671647, "grad_norm": 12.375, "learning_rate": 2.8046511627906977e-06, "loss": 2.6898, "step": 2050 }, { "epoch": 0.1832088224831021, "grad_norm": 14.0625, "learning_rate": 2.6651162790697675e-06, "loss": 2.5896, "step": 2060 }, { "epoch": 0.1840981856990395, "grad_norm": 15.4375, "learning_rate": 2.5255813953488374e-06, "loss": 2.6833, "step": 2070 }, { "epoch": 0.18498754891497687, "grad_norm": 15.5, "learning_rate": 2.386046511627907e-06, "loss": 2.7408, "step": 2080 }, { "epoch": 0.18587691213091426, "grad_norm": 13.375, "learning_rate": 2.246511627906977e-06, "loss": 2.7091, "step": 2090 }, { "epoch": 0.18676627534685167, "grad_norm": 14.1875, "learning_rate": 2.1069767441860464e-06, "loss": 2.4644, "step": 2100 }, { "epoch": 0.18765563856278905, "grad_norm": 14.625, "learning_rate": 1.967441860465116e-06, "loss": 2.7544, "step": 2110 }, { "epoch": 0.18854500177872643, "grad_norm": 14.625, "learning_rate": 1.8279069767441862e-06, "loss": 2.7613, "step": 2120 }, { "epoch": 0.18943436499466382, "grad_norm": 14.0625, "learning_rate": 1.6883720930232558e-06, "loss": 2.5281, "step": 2130 }, { "epoch": 0.1903237282106012, "grad_norm": 14.1875, "learning_rate": 1.5488372093023256e-06, "loss": 2.56, "step": 2140 }, { "epoch": 0.1912130914265386, "grad_norm": 14.0, "learning_rate": 1.4093023255813954e-06, "loss": 2.6563, "step": 2150 }, { "epoch": 0.192102454642476, "grad_norm": 14.0, "learning_rate": 1.2697674418604653e-06, "loss": 2.6377, "step": 2160 }, { "epoch": 0.19299181785841338, "grad_norm": 14.4375, "learning_rate": 1.1302325581395349e-06, "loss": 2.694, "step": 2170 }, { "epoch": 0.19388118107435076, "grad_norm": 14.5, "learning_rate": 9.906976744186047e-07, "loss": 2.6686, "step": 2180 }, { "epoch": 0.19477054429028814, "grad_norm": 14.0625, "learning_rate": 8.511627906976745e-07, "loss": 2.7232, "step": 2190 }, { "epoch": 0.19565990750622556, "grad_norm": 14.5625, "learning_rate": 7.116279069767442e-07, "loss": 2.7544, "step": 2200 }, { "epoch": 0.19654927072216294, "grad_norm": 13.5, "learning_rate": 5.72093023255814e-07, "loss": 2.6411, "step": 2210 }, { "epoch": 0.19743863393810032, "grad_norm": 13.1875, "learning_rate": 4.325581395348837e-07, "loss": 2.5296, "step": 2220 }, { "epoch": 0.1983279971540377, "grad_norm": 15.25, "learning_rate": 2.9302325581395347e-07, "loss": 2.5208, "step": 2230 }, { "epoch": 0.1992173603699751, "grad_norm": 13.125, "learning_rate": 1.5348837209302325e-07, "loss": 2.6465, "step": 2240 }, { "epoch": 0.2001067235859125, "grad_norm": 14.75, "learning_rate": 1.3953488372093025e-08, "loss": 2.7407, "step": 2250 } ], "logging_steps": 10, "max_steps": 2250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3699129752e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }