|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2001067235859125, |
|
"eval_steps": 500, |
|
"global_step": 2250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008893632159373888, |
|
"grad_norm": 68.5, |
|
"learning_rate": 2.7e-06, |
|
"loss": 8.8776, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0017787264318747777, |
|
"grad_norm": 39.0, |
|
"learning_rate": 5.7000000000000005e-06, |
|
"loss": 7.711, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0026680896478121665, |
|
"grad_norm": 29.375, |
|
"learning_rate": 8.7e-06, |
|
"loss": 5.7038, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0035574528637495554, |
|
"grad_norm": 24.5, |
|
"learning_rate": 1.1700000000000001e-05, |
|
"loss": 4.746, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004446816079686944, |
|
"grad_norm": 22.625, |
|
"learning_rate": 1.47e-05, |
|
"loss": 4.1094, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005336179295624333, |
|
"grad_norm": 21.625, |
|
"learning_rate": 1.77e-05, |
|
"loss": 4.0746, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0062255425115617215, |
|
"grad_norm": 25.25, |
|
"learning_rate": 2.07e-05, |
|
"loss": 3.8396, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.007114905727499111, |
|
"grad_norm": 20.875, |
|
"learning_rate": 2.37e-05, |
|
"loss": 3.8629, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0080042689434365, |
|
"grad_norm": 22.875, |
|
"learning_rate": 2.6700000000000002e-05, |
|
"loss": 3.9097, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.008893632159373888, |
|
"grad_norm": 19.375, |
|
"learning_rate": 2.97e-05, |
|
"loss": 3.763, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.009782995375311278, |
|
"grad_norm": 19.625, |
|
"learning_rate": 2.9874418604651165e-05, |
|
"loss": 4.1501, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.010672358591248666, |
|
"grad_norm": 19.0, |
|
"learning_rate": 2.9734883720930235e-05, |
|
"loss": 3.9038, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.011561721807186055, |
|
"grad_norm": 16.625, |
|
"learning_rate": 2.9595348837209305e-05, |
|
"loss": 4.0583, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.012451085023123443, |
|
"grad_norm": 16.375, |
|
"learning_rate": 2.9455813953488376e-05, |
|
"loss": 3.5958, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.013340448239060833, |
|
"grad_norm": 15.375, |
|
"learning_rate": 2.9316279069767443e-05, |
|
"loss": 3.9075, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.014229811454998222, |
|
"grad_norm": 15.625, |
|
"learning_rate": 2.9176744186046513e-05, |
|
"loss": 3.9495, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01511917467093561, |
|
"grad_norm": 18.125, |
|
"learning_rate": 2.9037209302325583e-05, |
|
"loss": 3.3247, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.016008537886873, |
|
"grad_norm": 18.75, |
|
"learning_rate": 2.889767441860465e-05, |
|
"loss": 3.6863, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.01689790110281039, |
|
"grad_norm": 15.625, |
|
"learning_rate": 2.875813953488372e-05, |
|
"loss": 3.3097, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.017787264318747775, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 2.861860465116279e-05, |
|
"loss": 3.48, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.018676627534685165, |
|
"grad_norm": 17.875, |
|
"learning_rate": 2.847906976744186e-05, |
|
"loss": 3.6216, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.019565990750622556, |
|
"grad_norm": 15.4375, |
|
"learning_rate": 2.833953488372093e-05, |
|
"loss": 3.4751, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.020455353966559942, |
|
"grad_norm": 14.75, |
|
"learning_rate": 2.8199999999999998e-05, |
|
"loss": 3.9, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.021344717182497332, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 2.8060465116279068e-05, |
|
"loss": 3.7454, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02223408039843472, |
|
"grad_norm": 15.6875, |
|
"learning_rate": 2.7920930232558138e-05, |
|
"loss": 3.5053, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.02312344361437211, |
|
"grad_norm": 14.5, |
|
"learning_rate": 2.778139534883721e-05, |
|
"loss": 3.3656, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0240128068303095, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 2.764186046511628e-05, |
|
"loss": 3.769, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.024902170046246886, |
|
"grad_norm": 15.0625, |
|
"learning_rate": 2.750232558139535e-05, |
|
"loss": 3.5721, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.025791533262184276, |
|
"grad_norm": 14.6875, |
|
"learning_rate": 2.736279069767442e-05, |
|
"loss": 3.6104, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.026680896478121666, |
|
"grad_norm": 13.875, |
|
"learning_rate": 2.722325581395349e-05, |
|
"loss": 3.5904, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.027570259694059053, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 2.7083720930232556e-05, |
|
"loss": 3.5633, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.028459622909996443, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 2.6944186046511626e-05, |
|
"loss": 3.3366, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02934898612593383, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 2.6804651162790697e-05, |
|
"loss": 3.6519, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03023834934187122, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 2.6665116279069767e-05, |
|
"loss": 3.4482, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.03112771255780861, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 2.6525581395348837e-05, |
|
"loss": 3.3554, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.032017075773746, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 2.6386046511627907e-05, |
|
"loss": 3.4008, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03290643898968339, |
|
"grad_norm": 14.375, |
|
"learning_rate": 2.6246511627906978e-05, |
|
"loss": 3.3279, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.03379580220562078, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 2.6106976744186048e-05, |
|
"loss": 3.487, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03468516542155817, |
|
"grad_norm": 14.625, |
|
"learning_rate": 2.5967441860465115e-05, |
|
"loss": 3.2542, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03557452863749555, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 2.5827906976744185e-05, |
|
"loss": 3.3328, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03646389185343294, |
|
"grad_norm": 13.25, |
|
"learning_rate": 2.5688372093023255e-05, |
|
"loss": 3.5205, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03735325506937033, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 2.5548837209302325e-05, |
|
"loss": 3.4151, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.03824261828530772, |
|
"grad_norm": 13.625, |
|
"learning_rate": 2.5409302325581396e-05, |
|
"loss": 3.1404, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.03913198150124511, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 2.5269767441860466e-05, |
|
"loss": 3.5277, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.040021344717182494, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 2.5130232558139536e-05, |
|
"loss": 3.1584, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.040910707933119884, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 2.4990697674418606e-05, |
|
"loss": 3.4669, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.041800071149057275, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 2.4851162790697673e-05, |
|
"loss": 3.2653, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.042689434364994665, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 2.4711627906976743e-05, |
|
"loss": 3.4219, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.043578797580932055, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.4572093023255814e-05, |
|
"loss": 3.508, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.04446816079686944, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 2.4432558139534884e-05, |
|
"loss": 3.3343, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04535752401280683, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.4293023255813954e-05, |
|
"loss": 3.5054, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.04624688722874422, |
|
"grad_norm": 15.25, |
|
"learning_rate": 2.4153488372093024e-05, |
|
"loss": 3.0298, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04713625044468161, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.4013953488372095e-05, |
|
"loss": 3.4306, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.048025613660619, |
|
"grad_norm": 14.25, |
|
"learning_rate": 2.3874418604651165e-05, |
|
"loss": 3.1824, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.04891497687655639, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 2.373488372093023e-05, |
|
"loss": 3.211, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.04980434009249377, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.3595348837209302e-05, |
|
"loss": 3.091, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.05069370330843116, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.3455813953488372e-05, |
|
"loss": 3.234, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05158306652436855, |
|
"grad_norm": 13.75, |
|
"learning_rate": 2.3316279069767442e-05, |
|
"loss": 3.1404, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05247242974030594, |
|
"grad_norm": 13.25, |
|
"learning_rate": 2.3176744186046513e-05, |
|
"loss": 3.2191, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.05336179295624333, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.3037209302325583e-05, |
|
"loss": 3.0968, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.054251156172180716, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 2.2897674418604653e-05, |
|
"loss": 3.1108, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.055140519388118106, |
|
"grad_norm": 12.75, |
|
"learning_rate": 2.2758139534883723e-05, |
|
"loss": 3.028, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.056029882604055496, |
|
"grad_norm": 12.375, |
|
"learning_rate": 2.261860465116279e-05, |
|
"loss": 3.1469, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.056919245819992886, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 2.247906976744186e-05, |
|
"loss": 3.1258, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.057808609035930276, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 2.233953488372093e-05, |
|
"loss": 3.1147, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.05869797225186766, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 2.22e-05, |
|
"loss": 3.0447, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.05958733546780505, |
|
"grad_norm": 15.3125, |
|
"learning_rate": 2.206046511627907e-05, |
|
"loss": 3.1177, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.06047669868374244, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 2.192093023255814e-05, |
|
"loss": 3.2542, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06136606189967983, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 2.178139534883721e-05, |
|
"loss": 3.188, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.06225542511561722, |
|
"grad_norm": 13.625, |
|
"learning_rate": 2.1641860465116282e-05, |
|
"loss": 3.2298, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06314478833155461, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 2.150232558139535e-05, |
|
"loss": 3.1127, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.064034151547492, |
|
"grad_norm": 15.1875, |
|
"learning_rate": 2.136279069767442e-05, |
|
"loss": 3.1468, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.06492351476342939, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.122325581395349e-05, |
|
"loss": 3.2763, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.06581287797936677, |
|
"grad_norm": 13.875, |
|
"learning_rate": 2.108372093023256e-05, |
|
"loss": 3.051, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.06670224119530416, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 2.094418604651163e-05, |
|
"loss": 3.1937, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.06759160441124155, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 2.08046511627907e-05, |
|
"loss": 3.0506, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.06848096762717894, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.066511627906977e-05, |
|
"loss": 2.975, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.06937033084311633, |
|
"grad_norm": 12.75, |
|
"learning_rate": 2.052558139534884e-05, |
|
"loss": 2.9174, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.07025969405905372, |
|
"grad_norm": 12.875, |
|
"learning_rate": 2.0386046511627907e-05, |
|
"loss": 3.0532, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.0711490572749911, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 2.0246511627906977e-05, |
|
"loss": 3.2092, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0720384204909285, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.0106976744186048e-05, |
|
"loss": 2.9702, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.07292778370686588, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.9967441860465118e-05, |
|
"loss": 3.0595, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.07381714692280328, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.9827906976744188e-05, |
|
"loss": 3.0102, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.07470651013874066, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.968837209302326e-05, |
|
"loss": 3.0725, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.07559587335467804, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 1.954883720930233e-05, |
|
"loss": 3.0679, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.07648523657061544, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.94093023255814e-05, |
|
"loss": 3.0764, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.07737459978655283, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.9269767441860466e-05, |
|
"loss": 2.768, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.07826396300249022, |
|
"grad_norm": 14.875, |
|
"learning_rate": 1.9130232558139536e-05, |
|
"loss": 3.2016, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.0791533262184276, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.8990697674418606e-05, |
|
"loss": 3.0906, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.08004268943436499, |
|
"grad_norm": 13.0, |
|
"learning_rate": 1.8851162790697673e-05, |
|
"loss": 3.1441, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.08093205265030239, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.8711627906976743e-05, |
|
"loss": 2.9791, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.08182141586623977, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.8572093023255814e-05, |
|
"loss": 2.8393, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.08271077908217717, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.8432558139534884e-05, |
|
"loss": 2.9332, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.08360014229811455, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 1.8293023255813954e-05, |
|
"loss": 2.9787, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.08448950551405193, |
|
"grad_norm": 15.375, |
|
"learning_rate": 1.815348837209302e-05, |
|
"loss": 2.8229, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.08537886872998933, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.801395348837209e-05, |
|
"loss": 2.8112, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.08626823194592671, |
|
"grad_norm": 16.75, |
|
"learning_rate": 1.787441860465116e-05, |
|
"loss": 3.0131, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.08715759516186411, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.773488372093023e-05, |
|
"loss": 2.9496, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.08804695837780149, |
|
"grad_norm": 15.0625, |
|
"learning_rate": 1.7595348837209302e-05, |
|
"loss": 3.0641, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.08893632159373888, |
|
"grad_norm": 13.0, |
|
"learning_rate": 1.7455813953488372e-05, |
|
"loss": 2.8714, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08982568480967627, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.7316279069767442e-05, |
|
"loss": 2.7792, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.09071504802561366, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.7176744186046512e-05, |
|
"loss": 2.9472, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.09160441124155105, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.703720930232558e-05, |
|
"loss": 2.5662, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.09249377445748844, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.689767441860465e-05, |
|
"loss": 2.82, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.09338313767342583, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.675813953488372e-05, |
|
"loss": 2.8245, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.09427250088936322, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 1.661860465116279e-05, |
|
"loss": 2.8141, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.0951618641053006, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.647906976744186e-05, |
|
"loss": 2.8975, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.096051227321238, |
|
"grad_norm": 14.625, |
|
"learning_rate": 1.633953488372093e-05, |
|
"loss": 2.7996, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.09694059053717538, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.62e-05, |
|
"loss": 2.7679, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.09782995375311278, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.606046511627907e-05, |
|
"loss": 3.0057, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09871931696905016, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 1.5920930232558138e-05, |
|
"loss": 2.5742, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.09960868018498754, |
|
"grad_norm": 15.3125, |
|
"learning_rate": 1.5781395348837208e-05, |
|
"loss": 2.7903, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.10049804340092494, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 1.564186046511628e-05, |
|
"loss": 2.5664, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.10138740661686232, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.550232558139535e-05, |
|
"loss": 2.9392, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.10227676983279972, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.536279069767442e-05, |
|
"loss": 2.7105, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.1031661330487371, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.5223255813953489e-05, |
|
"loss": 2.9472, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.10405549626467449, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 1.508372093023256e-05, |
|
"loss": 2.7698, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.10494485948061189, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.4944186046511628e-05, |
|
"loss": 2.8998, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.10583422269654927, |
|
"grad_norm": 14.125, |
|
"learning_rate": 1.4804651162790698e-05, |
|
"loss": 2.7952, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.10672358591248667, |
|
"grad_norm": 14.875, |
|
"learning_rate": 1.4665116279069768e-05, |
|
"loss": 2.7689, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.10761294912842405, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.4525581395348837e-05, |
|
"loss": 3.0368, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.10850231234436143, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.4386046511627907e-05, |
|
"loss": 2.8113, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.10939167556029883, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.4246511627906977e-05, |
|
"loss": 2.5883, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.11028103877623621, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.4106976744186048e-05, |
|
"loss": 2.9207, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.11117040199217361, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.3967441860465116e-05, |
|
"loss": 2.8662, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.11205976520811099, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1.3827906976744186e-05, |
|
"loss": 2.8439, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.11294912842404838, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.3688372093023257e-05, |
|
"loss": 2.8322, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.11383849163998577, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 1.3548837209302327e-05, |
|
"loss": 2.8627, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.11472785485592316, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.3409302325581395e-05, |
|
"loss": 2.733, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.11561721807186055, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.3269767441860466e-05, |
|
"loss": 2.7608, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11650658128779794, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.3130232558139536e-05, |
|
"loss": 2.7633, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.11739594450373532, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.2990697674418606e-05, |
|
"loss": 2.682, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.11828530771967272, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.2851162790697675e-05, |
|
"loss": 2.7205, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.1191746709356101, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 1.2711627906976745e-05, |
|
"loss": 2.7408, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.1200640341515475, |
|
"grad_norm": 13.5, |
|
"learning_rate": 1.2572093023255815e-05, |
|
"loss": 3.0955, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.12095339736748488, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.2432558139534885e-05, |
|
"loss": 2.8928, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.12184276058342226, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.2293023255813954e-05, |
|
"loss": 2.7429, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.12273212379935966, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.2153488372093024e-05, |
|
"loss": 2.6795, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.12362148701529704, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.2013953488372094e-05, |
|
"loss": 2.6802, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.12451085023123444, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.1874418604651165e-05, |
|
"loss": 2.6818, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12540021344717184, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.1734883720930233e-05, |
|
"loss": 2.5968, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.12628957666310922, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.1595348837209303e-05, |
|
"loss": 2.7026, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.1271789398790466, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.1455813953488372e-05, |
|
"loss": 2.8092, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.128068303094984, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.1316279069767442e-05, |
|
"loss": 2.6957, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.12895766631092137, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.117674418604651e-05, |
|
"loss": 2.8901, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.12984702952685878, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.1037209302325581e-05, |
|
"loss": 2.6284, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.13073639274279616, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.0897674418604651e-05, |
|
"loss": 2.9493, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.13162575595873355, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.0758139534883721e-05, |
|
"loss": 2.6665, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.13251511917467093, |
|
"grad_norm": 14.125, |
|
"learning_rate": 1.061860465116279e-05, |
|
"loss": 2.8124, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.13340448239060831, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.047906976744186e-05, |
|
"loss": 2.8094, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13429384560654573, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.033953488372093e-05, |
|
"loss": 2.6246, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.1351832088224831, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 1.02e-05, |
|
"loss": 2.8486, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.1360725720384205, |
|
"grad_norm": 13.625, |
|
"learning_rate": 1.0060465116279069e-05, |
|
"loss": 2.7712, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.13696193525435787, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 9.92093023255814e-06, |
|
"loss": 2.7667, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.13785129847029526, |
|
"grad_norm": 14.375, |
|
"learning_rate": 9.78139534883721e-06, |
|
"loss": 2.6918, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.13874066168623267, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 9.64186046511628e-06, |
|
"loss": 2.872, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.13963002490217005, |
|
"grad_norm": 14.375, |
|
"learning_rate": 9.502325581395348e-06, |
|
"loss": 2.9586, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.14051938811810744, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 9.362790697674419e-06, |
|
"loss": 2.5803, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.14140875133404482, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 9.223255813953489e-06, |
|
"loss": 2.72, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.1422981145499822, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 9.083720930232559e-06, |
|
"loss": 2.7268, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.1431874777659196, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 8.944186046511628e-06, |
|
"loss": 2.7176, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.144076840981857, |
|
"grad_norm": 15.125, |
|
"learning_rate": 8.804651162790698e-06, |
|
"loss": 2.6748, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.14496620419779438, |
|
"grad_norm": 15.25, |
|
"learning_rate": 8.665116279069768e-06, |
|
"loss": 2.7349, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.14585556741373176, |
|
"grad_norm": 15.0625, |
|
"learning_rate": 8.525581395348838e-06, |
|
"loss": 2.729, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.14674493062966915, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 8.386046511627907e-06, |
|
"loss": 2.6542, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.14763429384560656, |
|
"grad_norm": 14.625, |
|
"learning_rate": 8.246511627906977e-06, |
|
"loss": 2.6102, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.14852365706154394, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 8.106976744186047e-06, |
|
"loss": 2.7355, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.14941302027748132, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 7.967441860465118e-06, |
|
"loss": 2.7689, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.1503023834934187, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 7.827906976744186e-06, |
|
"loss": 2.7373, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.1511917467093561, |
|
"grad_norm": 15.25, |
|
"learning_rate": 7.688372093023256e-06, |
|
"loss": 2.7359, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1520811099252935, |
|
"grad_norm": 15.0, |
|
"learning_rate": 7.548837209302326e-06, |
|
"loss": 2.6088, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.15297047314123088, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 7.409302325581395e-06, |
|
"loss": 2.7727, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.15385983635716827, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 7.269767441860465e-06, |
|
"loss": 2.8457, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.15474919957310565, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 7.130232558139535e-06, |
|
"loss": 2.6671, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.15563856278904303, |
|
"grad_norm": 14.0, |
|
"learning_rate": 6.990697674418605e-06, |
|
"loss": 2.593, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.15652792600498044, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 6.851162790697674e-06, |
|
"loss": 2.6174, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.15741728922091783, |
|
"grad_norm": 14.375, |
|
"learning_rate": 6.711627906976745e-06, |
|
"loss": 2.6143, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.1583066524368552, |
|
"grad_norm": 12.875, |
|
"learning_rate": 6.572093023255814e-06, |
|
"loss": 2.7958, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.1591960156527926, |
|
"grad_norm": 13.875, |
|
"learning_rate": 6.432558139534884e-06, |
|
"loss": 2.6567, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.16008537886872998, |
|
"grad_norm": 14.625, |
|
"learning_rate": 6.293023255813954e-06, |
|
"loss": 2.5427, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.1609747420846674, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 6.153488372093024e-06, |
|
"loss": 2.6013, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.16186410530060477, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 6.013953488372093e-06, |
|
"loss": 2.6624, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.16275346851654215, |
|
"grad_norm": 14.625, |
|
"learning_rate": 5.8744186046511635e-06, |
|
"loss": 2.6861, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.16364283173247954, |
|
"grad_norm": 12.875, |
|
"learning_rate": 5.734883720930233e-06, |
|
"loss": 2.7337, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.16453219494841692, |
|
"grad_norm": 13.25, |
|
"learning_rate": 5.595348837209303e-06, |
|
"loss": 2.532, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.16542155816435433, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 5.4558139534883726e-06, |
|
"loss": 2.5045, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.16631092138029172, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 5.316279069767443e-06, |
|
"loss": 2.8413, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.1672002845962291, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 5.176744186046511e-06, |
|
"loss": 2.6274, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.16808964781216648, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 5.0372093023255816e-06, |
|
"loss": 2.5649, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.16897901102810386, |
|
"grad_norm": 15.0625, |
|
"learning_rate": 4.897674418604651e-06, |
|
"loss": 2.7143, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.16986837424404128, |
|
"grad_norm": 15.125, |
|
"learning_rate": 4.758139534883721e-06, |
|
"loss": 2.642, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.17075773745997866, |
|
"grad_norm": 12.875, |
|
"learning_rate": 4.618604651162791e-06, |
|
"loss": 2.8059, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.17164710067591604, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 4.479069767441861e-06, |
|
"loss": 2.6971, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.17253646389185343, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 4.33953488372093e-06, |
|
"loss": 2.6398, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.1734258271077908, |
|
"grad_norm": 16.375, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 2.9306, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.17431519032372822, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 4.06046511627907e-06, |
|
"loss": 2.7713, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.1752045535396656, |
|
"grad_norm": 13.125, |
|
"learning_rate": 3.92093023255814e-06, |
|
"loss": 2.7845, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.17609391675560299, |
|
"grad_norm": 13.5, |
|
"learning_rate": 3.7813953488372095e-06, |
|
"loss": 2.5285, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.17698327997154037, |
|
"grad_norm": 14.75, |
|
"learning_rate": 3.6418604651162793e-06, |
|
"loss": 2.7907, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.17787264318747775, |
|
"grad_norm": 14.25, |
|
"learning_rate": 3.502325581395349e-06, |
|
"loss": 2.6383, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17876200640341516, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 3.3627906976744185e-06, |
|
"loss": 2.62, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.17965136961935255, |
|
"grad_norm": 16.0, |
|
"learning_rate": 3.2232558139534883e-06, |
|
"loss": 2.6536, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.18054073283528993, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 3.083720930232558e-06, |
|
"loss": 2.6178, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.1814300960512273, |
|
"grad_norm": 15.0, |
|
"learning_rate": 2.944186046511628e-06, |
|
"loss": 2.7065, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.1823194592671647, |
|
"grad_norm": 12.375, |
|
"learning_rate": 2.8046511627906977e-06, |
|
"loss": 2.6898, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.1832088224831021, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 2.6651162790697675e-06, |
|
"loss": 2.5896, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.1840981856990395, |
|
"grad_norm": 15.4375, |
|
"learning_rate": 2.5255813953488374e-06, |
|
"loss": 2.6833, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.18498754891497687, |
|
"grad_norm": 15.5, |
|
"learning_rate": 2.386046511627907e-06, |
|
"loss": 2.7408, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.18587691213091426, |
|
"grad_norm": 13.375, |
|
"learning_rate": 2.246511627906977e-06, |
|
"loss": 2.7091, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.18676627534685167, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 2.1069767441860464e-06, |
|
"loss": 2.4644, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18765563856278905, |
|
"grad_norm": 14.625, |
|
"learning_rate": 1.967441860465116e-06, |
|
"loss": 2.7544, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.18854500177872643, |
|
"grad_norm": 14.625, |
|
"learning_rate": 1.8279069767441862e-06, |
|
"loss": 2.7613, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.18943436499466382, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 1.6883720930232558e-06, |
|
"loss": 2.5281, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.1903237282106012, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.5488372093023256e-06, |
|
"loss": 2.56, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.1912130914265386, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.4093023255813954e-06, |
|
"loss": 2.6563, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.192102454642476, |
|
"grad_norm": 14.0, |
|
"learning_rate": 1.2697674418604653e-06, |
|
"loss": 2.6377, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.19299181785841338, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.1302325581395349e-06, |
|
"loss": 2.694, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.19388118107435076, |
|
"grad_norm": 14.5, |
|
"learning_rate": 9.906976744186047e-07, |
|
"loss": 2.6686, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.19477054429028814, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 8.511627906976745e-07, |
|
"loss": 2.7232, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.19565990750622556, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 7.116279069767442e-07, |
|
"loss": 2.7544, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.19654927072216294, |
|
"grad_norm": 13.5, |
|
"learning_rate": 5.72093023255814e-07, |
|
"loss": 2.6411, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.19743863393810032, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 4.325581395348837e-07, |
|
"loss": 2.5296, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.1983279971540377, |
|
"grad_norm": 15.25, |
|
"learning_rate": 2.9302325581395347e-07, |
|
"loss": 2.5208, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.1992173603699751, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.5348837209302325e-07, |
|
"loss": 2.6465, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.2001067235859125, |
|
"grad_norm": 14.75, |
|
"learning_rate": 1.3953488372093025e-08, |
|
"loss": 2.7407, |
|
"step": 2250 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3699129752e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|