{ "best_global_step": 2200, "best_metric": 1.8803235292434692, "best_model_checkpoint": "/content/drive/MyDrive/hyperclova-deobfuscation-lora/checkpoint-2200", "epoch": 2.6666666666666665, "eval_steps": 200, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008888888888888889, "grad_norm": 3.629798412322998, "learning_rate": 1.8e-05, "loss": 4.1483, "mean_token_accuracy": 0.34797456339001653, "num_tokens": 11242.0, "step": 10 }, { "epoch": 0.017777777777777778, "grad_norm": 2.6125221252441406, "learning_rate": 3.8e-05, "loss": 3.7515, "mean_token_accuracy": 0.4058148756623268, "num_tokens": 22106.0, "step": 20 }, { "epoch": 0.02666666666666667, "grad_norm": 2.9313137531280518, "learning_rate": 5.8e-05, "loss": 3.3279, "mean_token_accuracy": 0.4703808955848217, "num_tokens": 33774.0, "step": 30 }, { "epoch": 0.035555555555555556, "grad_norm": 2.0496416091918945, "learning_rate": 7.800000000000001e-05, "loss": 2.9114, "mean_token_accuracy": 0.5239812344312668, "num_tokens": 44943.0, "step": 40 }, { "epoch": 0.044444444444444446, "grad_norm": 2.282668352127075, "learning_rate": 9.8e-05, "loss": 2.8468, "mean_token_accuracy": 0.534189497679472, "num_tokens": 56341.0, "step": 50 }, { "epoch": 0.05333333333333334, "grad_norm": 2.168651819229126, "learning_rate": 0.000118, "loss": 2.7785, "mean_token_accuracy": 0.5407359585165977, "num_tokens": 67397.0, "step": 60 }, { "epoch": 0.06222222222222222, "grad_norm": 2.289881467819214, "learning_rate": 0.000138, "loss": 2.736, "mean_token_accuracy": 0.5326176360249519, "num_tokens": 78482.0, "step": 70 }, { "epoch": 0.07111111111111111, "grad_norm": 2.1038105487823486, "learning_rate": 0.00015800000000000002, "loss": 2.5855, "mean_token_accuracy": 0.5618595249950886, "num_tokens": 89803.0, "step": 80 }, { "epoch": 0.08, "grad_norm": 2.24312686920166, "learning_rate": 0.00017800000000000002, "loss": 2.5365, "mean_token_accuracy": 0.5661972932517528, "num_tokens": 101015.0, "step": 90 }, { "epoch": 0.08888888888888889, "grad_norm": 1.9482938051223755, "learning_rate": 0.00019800000000000002, "loss": 2.5634, "mean_token_accuracy": 0.5538406319916248, "num_tokens": 112364.0, "step": 100 }, { "epoch": 0.09777777777777778, "grad_norm": 1.86210298538208, "learning_rate": 0.00019945038167938932, "loss": 2.4629, "mean_token_accuracy": 0.5780388668179512, "num_tokens": 122882.0, "step": 110 }, { "epoch": 0.10666666666666667, "grad_norm": 1.8806918859481812, "learning_rate": 0.00019883969465648855, "loss": 2.5022, "mean_token_accuracy": 0.563551553338766, "num_tokens": 134028.0, "step": 120 }, { "epoch": 0.11555555555555555, "grad_norm": 2.3264434337615967, "learning_rate": 0.00019829007633587786, "loss": 2.4065, "mean_token_accuracy": 0.5807355619966984, "num_tokens": 145192.0, "step": 130 }, { "epoch": 0.12444444444444444, "grad_norm": 1.8537976741790771, "learning_rate": 0.00019767938931297712, "loss": 2.4838, "mean_token_accuracy": 0.566282794624567, "num_tokens": 156703.0, "step": 140 }, { "epoch": 0.13333333333333333, "grad_norm": 2.0960652828216553, "learning_rate": 0.00019706870229007636, "loss": 2.4119, "mean_token_accuracy": 0.5830203481018543, "num_tokens": 168041.0, "step": 150 }, { "epoch": 0.14222222222222222, "grad_norm": 2.2244813442230225, "learning_rate": 0.00019645801526717557, "loss": 2.3726, "mean_token_accuracy": 0.5844443172216416, "num_tokens": 178986.0, "step": 160 }, { "epoch": 0.1511111111111111, "grad_norm": 1.8238722085952759, "learning_rate": 0.0001958473282442748, "loss": 2.4419, "mean_token_accuracy": 0.5708602093160152, "num_tokens": 190391.0, "step": 170 }, { "epoch": 0.16, "grad_norm": 1.7154136896133423, "learning_rate": 0.00019523664122137407, "loss": 2.4293, "mean_token_accuracy": 0.5748118035495281, "num_tokens": 201989.0, "step": 180 }, { "epoch": 0.1688888888888889, "grad_norm": 1.7582788467407227, "learning_rate": 0.0001946259541984733, "loss": 2.3577, "mean_token_accuracy": 0.5877166777849198, "num_tokens": 212914.0, "step": 190 }, { "epoch": 0.17777777777777778, "grad_norm": 1.8613263368606567, "learning_rate": 0.0001940152671755725, "loss": 2.3486, "mean_token_accuracy": 0.5889834299683571, "num_tokens": 223936.0, "step": 200 }, { "epoch": 0.17777777777777778, "eval_loss": 2.3320820331573486, "eval_mean_token_accuracy": 0.5868698905706405, "eval_num_tokens": 223936.0, "eval_runtime": 49.2429, "eval_samples_per_second": 20.307, "eval_steps_per_second": 10.154, "step": 200 }, { "epoch": 0.18666666666666668, "grad_norm": 1.8486477136611938, "learning_rate": 0.00019340458015267175, "loss": 2.3666, "mean_token_accuracy": 0.5847611322999, "num_tokens": 235036.0, "step": 210 }, { "epoch": 0.19555555555555557, "grad_norm": 2.018049478530884, "learning_rate": 0.000192793893129771, "loss": 2.2689, "mean_token_accuracy": 0.59971177354455, "num_tokens": 246101.0, "step": 220 }, { "epoch": 0.20444444444444446, "grad_norm": 1.7244890928268433, "learning_rate": 0.00019218320610687024, "loss": 2.3262, "mean_token_accuracy": 0.5855986528098583, "num_tokens": 257953.0, "step": 230 }, { "epoch": 0.21333333333333335, "grad_norm": 1.8928934335708618, "learning_rate": 0.00019157251908396948, "loss": 2.3318, "mean_token_accuracy": 0.5885626815259457, "num_tokens": 269187.0, "step": 240 }, { "epoch": 0.2222222222222222, "grad_norm": 1.7358920574188232, "learning_rate": 0.0001909618320610687, "loss": 2.2145, "mean_token_accuracy": 0.6092555984854698, "num_tokens": 279762.0, "step": 250 }, { "epoch": 0.2311111111111111, "grad_norm": 1.6779032945632935, "learning_rate": 0.00019035114503816795, "loss": 2.3152, "mean_token_accuracy": 0.584602715075016, "num_tokens": 291454.0, "step": 260 }, { "epoch": 0.24, "grad_norm": 1.6310207843780518, "learning_rate": 0.0001897404580152672, "loss": 2.2669, "mean_token_accuracy": 0.5965895019471645, "num_tokens": 302969.0, "step": 270 }, { "epoch": 0.24888888888888888, "grad_norm": 1.6765615940093994, "learning_rate": 0.00018912977099236642, "loss": 2.269, "mean_token_accuracy": 0.5934441670775413, "num_tokens": 314204.0, "step": 280 }, { "epoch": 0.2577777777777778, "grad_norm": 1.793959617614746, "learning_rate": 0.00018851908396946566, "loss": 2.2554, "mean_token_accuracy": 0.600947193801403, "num_tokens": 325649.0, "step": 290 }, { "epoch": 0.26666666666666666, "grad_norm": 1.7492129802703857, "learning_rate": 0.0001879083969465649, "loss": 2.2157, "mean_token_accuracy": 0.6022505328059197, "num_tokens": 337167.0, "step": 300 }, { "epoch": 0.27555555555555555, "grad_norm": 1.803576946258545, "learning_rate": 0.00018729770992366413, "loss": 2.2854, "mean_token_accuracy": 0.5923042424023152, "num_tokens": 348621.0, "step": 310 }, { "epoch": 0.28444444444444444, "grad_norm": 1.9662351608276367, "learning_rate": 0.00018668702290076337, "loss": 2.2639, "mean_token_accuracy": 0.588193366676569, "num_tokens": 360272.0, "step": 320 }, { "epoch": 0.29333333333333333, "grad_norm": 1.6725891828536987, "learning_rate": 0.0001860763358778626, "loss": 2.2249, "mean_token_accuracy": 0.6054098337888718, "num_tokens": 371346.0, "step": 330 }, { "epoch": 0.3022222222222222, "grad_norm": 1.68416166305542, "learning_rate": 0.00018546564885496184, "loss": 2.1678, "mean_token_accuracy": 0.6146526508033275, "num_tokens": 382779.0, "step": 340 }, { "epoch": 0.3111111111111111, "grad_norm": 1.7218507528305054, "learning_rate": 0.00018485496183206108, "loss": 2.2011, "mean_token_accuracy": 0.6104303196072578, "num_tokens": 393823.0, "step": 350 }, { "epoch": 0.32, "grad_norm": 1.6817256212234497, "learning_rate": 0.0001842442748091603, "loss": 2.2264, "mean_token_accuracy": 0.5987282857298851, "num_tokens": 405438.0, "step": 360 }, { "epoch": 0.3288888888888889, "grad_norm": 1.7454718351364136, "learning_rate": 0.00018363358778625955, "loss": 2.2712, "mean_token_accuracy": 0.5939777493476868, "num_tokens": 417299.0, "step": 370 }, { "epoch": 0.3377777777777778, "grad_norm": 2.011315107345581, "learning_rate": 0.00018302290076335878, "loss": 2.2247, "mean_token_accuracy": 0.6061037018895149, "num_tokens": 428660.0, "step": 380 }, { "epoch": 0.3466666666666667, "grad_norm": 1.6242053508758545, "learning_rate": 0.00018241221374045802, "loss": 2.232, "mean_token_accuracy": 0.6062197655439376, "num_tokens": 439768.0, "step": 390 }, { "epoch": 0.35555555555555557, "grad_norm": 1.9328559637069702, "learning_rate": 0.00018180152671755725, "loss": 2.1291, "mean_token_accuracy": 0.6168317429721355, "num_tokens": 450808.0, "step": 400 }, { "epoch": 0.35555555555555557, "eval_loss": 2.1662538051605225, "eval_mean_token_accuracy": 0.6099509916305542, "eval_num_tokens": 450808.0, "eval_runtime": 49.4213, "eval_samples_per_second": 20.234, "eval_steps_per_second": 10.117, "step": 400 }, { "epoch": 0.36444444444444446, "grad_norm": 1.8797143697738647, "learning_rate": 0.0001811908396946565, "loss": 2.2086, "mean_token_accuracy": 0.6012695133686066, "num_tokens": 461592.0, "step": 410 }, { "epoch": 0.37333333333333335, "grad_norm": 1.7558225393295288, "learning_rate": 0.00018058015267175575, "loss": 2.1771, "mean_token_accuracy": 0.6060668036341668, "num_tokens": 473434.0, "step": 420 }, { "epoch": 0.38222222222222224, "grad_norm": 1.845051884651184, "learning_rate": 0.00017996946564885496, "loss": 2.2576, "mean_token_accuracy": 0.5929104581475257, "num_tokens": 485130.0, "step": 430 }, { "epoch": 0.39111111111111113, "grad_norm": 1.6992298364639282, "learning_rate": 0.0001793587786259542, "loss": 2.1815, "mean_token_accuracy": 0.6100690707564353, "num_tokens": 496482.0, "step": 440 }, { "epoch": 0.4, "grad_norm": 1.7239253520965576, "learning_rate": 0.00017874809160305343, "loss": 2.2082, "mean_token_accuracy": 0.6001435503363609, "num_tokens": 508218.0, "step": 450 }, { "epoch": 0.4088888888888889, "grad_norm": 1.7856336832046509, "learning_rate": 0.0001781374045801527, "loss": 2.1593, "mean_token_accuracy": 0.6118309393525123, "num_tokens": 519379.0, "step": 460 }, { "epoch": 0.4177777777777778, "grad_norm": 1.611831545829773, "learning_rate": 0.00017752671755725193, "loss": 2.1797, "mean_token_accuracy": 0.6033190444111824, "num_tokens": 530561.0, "step": 470 }, { "epoch": 0.4266666666666667, "grad_norm": 1.7420586347579956, "learning_rate": 0.00017691603053435114, "loss": 2.2027, "mean_token_accuracy": 0.6067790001630783, "num_tokens": 542631.0, "step": 480 }, { "epoch": 0.43555555555555553, "grad_norm": 1.948723316192627, "learning_rate": 0.00017630534351145038, "loss": 2.1753, "mean_token_accuracy": 0.6109650492668152, "num_tokens": 553477.0, "step": 490 }, { "epoch": 0.4444444444444444, "grad_norm": 1.7983819246292114, "learning_rate": 0.00017569465648854964, "loss": 2.158, "mean_token_accuracy": 0.5996212616562844, "num_tokens": 565400.0, "step": 500 }, { "epoch": 0.4533333333333333, "grad_norm": 1.842372179031372, "learning_rate": 0.00017508396946564888, "loss": 2.0825, "mean_token_accuracy": 0.6168116196990013, "num_tokens": 576953.0, "step": 510 }, { "epoch": 0.4622222222222222, "grad_norm": 1.91799795627594, "learning_rate": 0.00017447328244274809, "loss": 2.1022, "mean_token_accuracy": 0.6168905258178711, "num_tokens": 588003.0, "step": 520 }, { "epoch": 0.4711111111111111, "grad_norm": 1.7727124691009521, "learning_rate": 0.00017386259541984732, "loss": 2.1695, "mean_token_accuracy": 0.5997609972953797, "num_tokens": 600043.0, "step": 530 }, { "epoch": 0.48, "grad_norm": 1.8602296113967896, "learning_rate": 0.00017325190839694658, "loss": 2.0849, "mean_token_accuracy": 0.6266478568315506, "num_tokens": 610974.0, "step": 540 }, { "epoch": 0.4888888888888889, "grad_norm": 1.545620083808899, "learning_rate": 0.00017264122137404582, "loss": 2.1824, "mean_token_accuracy": 0.6072694823145867, "num_tokens": 622632.0, "step": 550 }, { "epoch": 0.49777777777777776, "grad_norm": 1.7485988140106201, "learning_rate": 0.00017203053435114506, "loss": 2.1374, "mean_token_accuracy": 0.6164417043328285, "num_tokens": 634093.0, "step": 560 }, { "epoch": 0.5066666666666667, "grad_norm": 1.8591196537017822, "learning_rate": 0.00017141984732824426, "loss": 2.0928, "mean_token_accuracy": 0.6241554819047451, "num_tokens": 645226.0, "step": 570 }, { "epoch": 0.5155555555555555, "grad_norm": 1.8163517713546753, "learning_rate": 0.00017080916030534353, "loss": 2.0476, "mean_token_accuracy": 0.6285594403743744, "num_tokens": 656188.0, "step": 580 }, { "epoch": 0.5244444444444445, "grad_norm": 1.7729696035385132, "learning_rate": 0.00017019847328244276, "loss": 2.1036, "mean_token_accuracy": 0.6208315283060074, "num_tokens": 667642.0, "step": 590 }, { "epoch": 0.5333333333333333, "grad_norm": 1.7804032564163208, "learning_rate": 0.000169587786259542, "loss": 2.1174, "mean_token_accuracy": 0.6148250237107277, "num_tokens": 678769.0, "step": 600 }, { "epoch": 0.5333333333333333, "eval_loss": 2.0850696563720703, "eval_mean_token_accuracy": 0.6197466601729393, "eval_num_tokens": 678769.0, "eval_runtime": 49.7611, "eval_samples_per_second": 20.096, "eval_steps_per_second": 10.048, "step": 600 }, { "epoch": 0.5422222222222223, "grad_norm": 1.8643274307250977, "learning_rate": 0.00016897709923664124, "loss": 2.0485, "mean_token_accuracy": 0.6331146821379662, "num_tokens": 690014.0, "step": 610 }, { "epoch": 0.5511111111111111, "grad_norm": 1.8060939311981201, "learning_rate": 0.00016836641221374047, "loss": 2.1117, "mean_token_accuracy": 0.612041813135147, "num_tokens": 701734.0, "step": 620 }, { "epoch": 0.56, "grad_norm": 1.7059085369110107, "learning_rate": 0.0001677557251908397, "loss": 2.0747, "mean_token_accuracy": 0.6174572542309761, "num_tokens": 713570.0, "step": 630 }, { "epoch": 0.5688888888888889, "grad_norm": 1.6600592136383057, "learning_rate": 0.00016714503816793894, "loss": 2.0685, "mean_token_accuracy": 0.6293445661664009, "num_tokens": 724815.0, "step": 640 }, { "epoch": 0.5777777777777777, "grad_norm": 1.6598913669586182, "learning_rate": 0.00016653435114503818, "loss": 2.0255, "mean_token_accuracy": 0.6309839904308319, "num_tokens": 735777.0, "step": 650 }, { "epoch": 0.5866666666666667, "grad_norm": 1.8306963443756104, "learning_rate": 0.00016592366412213741, "loss": 2.1249, "mean_token_accuracy": 0.6147443532943726, "num_tokens": 746903.0, "step": 660 }, { "epoch": 0.5955555555555555, "grad_norm": 1.626795768737793, "learning_rate": 0.00016531297709923665, "loss": 2.0694, "mean_token_accuracy": 0.6254988595843315, "num_tokens": 757881.0, "step": 670 }, { "epoch": 0.6044444444444445, "grad_norm": 1.710806131362915, "learning_rate": 0.00016470229007633589, "loss": 2.0397, "mean_token_accuracy": 0.6233279958367348, "num_tokens": 768982.0, "step": 680 }, { "epoch": 0.6133333333333333, "grad_norm": 1.7051280736923218, "learning_rate": 0.00016409160305343512, "loss": 2.116, "mean_token_accuracy": 0.6183760315179825, "num_tokens": 780072.0, "step": 690 }, { "epoch": 0.6222222222222222, "grad_norm": 1.607917070388794, "learning_rate": 0.00016348091603053436, "loss": 2.0478, "mean_token_accuracy": 0.6331974640488625, "num_tokens": 791061.0, "step": 700 }, { "epoch": 0.6311111111111111, "grad_norm": 1.7803592681884766, "learning_rate": 0.0001628702290076336, "loss": 2.0595, "mean_token_accuracy": 0.6249041527509689, "num_tokens": 801867.0, "step": 710 }, { "epoch": 0.64, "grad_norm": 1.6132373809814453, "learning_rate": 0.00016225954198473283, "loss": 2.0789, "mean_token_accuracy": 0.6235784366726875, "num_tokens": 813112.0, "step": 720 }, { "epoch": 0.6488888888888888, "grad_norm": 1.790528655052185, "learning_rate": 0.00016164885496183207, "loss": 2.0632, "mean_token_accuracy": 0.6268924325704575, "num_tokens": 824133.0, "step": 730 }, { "epoch": 0.6577777777777778, "grad_norm": 2.0007362365722656, "learning_rate": 0.0001610381679389313, "loss": 2.0701, "mean_token_accuracy": 0.6189413338899612, "num_tokens": 835469.0, "step": 740 }, { "epoch": 0.6666666666666666, "grad_norm": 2.227158546447754, "learning_rate": 0.00016042748091603054, "loss": 2.0339, "mean_token_accuracy": 0.621903920173645, "num_tokens": 846572.0, "step": 750 }, { "epoch": 0.6755555555555556, "grad_norm": 1.80472731590271, "learning_rate": 0.00015981679389312977, "loss": 2.1285, "mean_token_accuracy": 0.604806374013424, "num_tokens": 857795.0, "step": 760 }, { "epoch": 0.6844444444444444, "grad_norm": 1.7893937826156616, "learning_rate": 0.000159206106870229, "loss": 2.0347, "mean_token_accuracy": 0.6292635962367058, "num_tokens": 868429.0, "step": 770 }, { "epoch": 0.6933333333333334, "grad_norm": 1.6761573553085327, "learning_rate": 0.00015859541984732824, "loss": 2.0591, "mean_token_accuracy": 0.6254431992769242, "num_tokens": 879659.0, "step": 780 }, { "epoch": 0.7022222222222222, "grad_norm": 1.803045630455017, "learning_rate": 0.0001579847328244275, "loss": 2.0293, "mean_token_accuracy": 0.6273573949933052, "num_tokens": 890911.0, "step": 790 }, { "epoch": 0.7111111111111111, "grad_norm": 1.7385220527648926, "learning_rate": 0.00015737404580152672, "loss": 2.0197, "mean_token_accuracy": 0.63025072067976, "num_tokens": 902240.0, "step": 800 }, { "epoch": 0.7111111111111111, "eval_loss": 2.0297935009002686, "eval_mean_token_accuracy": 0.628437293112278, "eval_num_tokens": 902240.0, "eval_runtime": 49.3011, "eval_samples_per_second": 20.284, "eval_steps_per_second": 10.142, "step": 800 }, { "epoch": 0.72, "grad_norm": 1.8906656503677368, "learning_rate": 0.00015676335877862595, "loss": 2.0806, "mean_token_accuracy": 0.619849094748497, "num_tokens": 914009.0, "step": 810 }, { "epoch": 0.7288888888888889, "grad_norm": 1.714268684387207, "learning_rate": 0.0001561526717557252, "loss": 2.0343, "mean_token_accuracy": 0.632188580930233, "num_tokens": 925091.0, "step": 820 }, { "epoch": 0.7377777777777778, "grad_norm": 1.833918809890747, "learning_rate": 0.00015554198473282445, "loss": 2.0747, "mean_token_accuracy": 0.6280180156230927, "num_tokens": 936675.0, "step": 830 }, { "epoch": 0.7466666666666667, "grad_norm": 1.9817575216293335, "learning_rate": 0.00015493129770992366, "loss": 2.0859, "mean_token_accuracy": 0.6128378361463547, "num_tokens": 948151.0, "step": 840 }, { "epoch": 0.7555555555555555, "grad_norm": 1.5982656478881836, "learning_rate": 0.0001543206106870229, "loss": 2.0455, "mean_token_accuracy": 0.6276382938027382, "num_tokens": 959266.0, "step": 850 }, { "epoch": 0.7644444444444445, "grad_norm": 1.7298970222473145, "learning_rate": 0.00015370992366412213, "loss": 1.9604, "mean_token_accuracy": 0.6377590849995614, "num_tokens": 970339.0, "step": 860 }, { "epoch": 0.7733333333333333, "grad_norm": 1.8064581155776978, "learning_rate": 0.0001530992366412214, "loss": 2.0698, "mean_token_accuracy": 0.6194617792963981, "num_tokens": 981805.0, "step": 870 }, { "epoch": 0.7822222222222223, "grad_norm": 1.5860410928726196, "learning_rate": 0.00015248854961832063, "loss": 2.0182, "mean_token_accuracy": 0.6292306095361709, "num_tokens": 993552.0, "step": 880 }, { "epoch": 0.7911111111111111, "grad_norm": 1.8761259317398071, "learning_rate": 0.00015187786259541984, "loss": 2.0335, "mean_token_accuracy": 0.6285651385784149, "num_tokens": 1004400.0, "step": 890 }, { "epoch": 0.8, "grad_norm": 1.6973590850830078, "learning_rate": 0.00015126717557251908, "loss": 2.0927, "mean_token_accuracy": 0.6183614790439605, "num_tokens": 1015564.0, "step": 900 }, { "epoch": 0.8088888888888889, "grad_norm": 1.6477675437927246, "learning_rate": 0.00015065648854961834, "loss": 1.9187, "mean_token_accuracy": 0.6427812784910202, "num_tokens": 1026849.0, "step": 910 }, { "epoch": 0.8177777777777778, "grad_norm": 1.6942589282989502, "learning_rate": 0.00015004580152671757, "loss": 2.0139, "mean_token_accuracy": 0.6322552219033242, "num_tokens": 1037721.0, "step": 920 }, { "epoch": 0.8266666666666667, "grad_norm": 1.6394822597503662, "learning_rate": 0.0001494351145038168, "loss": 2.0392, "mean_token_accuracy": 0.6273665294051171, "num_tokens": 1048986.0, "step": 930 }, { "epoch": 0.8355555555555556, "grad_norm": 1.697804570198059, "learning_rate": 0.00014882442748091602, "loss": 2.0412, "mean_token_accuracy": 0.625536386668682, "num_tokens": 1060627.0, "step": 940 }, { "epoch": 0.8444444444444444, "grad_norm": 1.8058092594146729, "learning_rate": 0.00014821374045801528, "loss": 1.9737, "mean_token_accuracy": 0.6332821652293206, "num_tokens": 1071482.0, "step": 950 }, { "epoch": 0.8533333333333334, "grad_norm": 1.773294448852539, "learning_rate": 0.00014760305343511452, "loss": 2.054, "mean_token_accuracy": 0.6256278708577157, "num_tokens": 1082672.0, "step": 960 }, { "epoch": 0.8622222222222222, "grad_norm": 1.6936707496643066, "learning_rate": 0.00014699236641221375, "loss": 1.9957, "mean_token_accuracy": 0.6333451583981514, "num_tokens": 1093493.0, "step": 970 }, { "epoch": 0.8711111111111111, "grad_norm": 1.7029008865356445, "learning_rate": 0.000146381679389313, "loss": 2.0526, "mean_token_accuracy": 0.6244132176041604, "num_tokens": 1104857.0, "step": 980 }, { "epoch": 0.88, "grad_norm": 1.8421082496643066, "learning_rate": 0.00014577099236641223, "loss": 2.0311, "mean_token_accuracy": 0.6236826583743096, "num_tokens": 1116131.0, "step": 990 }, { "epoch": 0.8888888888888888, "grad_norm": 1.646053433418274, "learning_rate": 0.00014516030534351146, "loss": 1.9973, "mean_token_accuracy": 0.6274659112095833, "num_tokens": 1127612.0, "step": 1000 }, { "epoch": 0.8888888888888888, "eval_loss": 1.989682674407959, "eval_mean_token_accuracy": 0.633990108013153, "eval_num_tokens": 1127612.0, "eval_runtime": 49.3043, "eval_samples_per_second": 20.282, "eval_steps_per_second": 10.141, "step": 1000 }, { "epoch": 0.8977777777777778, "grad_norm": 1.5941271781921387, "learning_rate": 0.0001445496183206107, "loss": 2.0579, "mean_token_accuracy": 0.6256210282444954, "num_tokens": 1138866.0, "step": 1010 }, { "epoch": 0.9066666666666666, "grad_norm": 1.7826253175735474, "learning_rate": 0.00014393893129770993, "loss": 1.9866, "mean_token_accuracy": 0.6332772478461266, "num_tokens": 1150411.0, "step": 1020 }, { "epoch": 0.9155555555555556, "grad_norm": 1.8722221851348877, "learning_rate": 0.00014332824427480917, "loss": 2.0398, "mean_token_accuracy": 0.627329595386982, "num_tokens": 1161360.0, "step": 1030 }, { "epoch": 0.9244444444444444, "grad_norm": 1.6533294916152954, "learning_rate": 0.0001427175572519084, "loss": 2.0271, "mean_token_accuracy": 0.6259514302015304, "num_tokens": 1172683.0, "step": 1040 }, { "epoch": 0.9333333333333333, "grad_norm": 1.5746543407440186, "learning_rate": 0.00014210687022900764, "loss": 1.9634, "mean_token_accuracy": 0.6359310179948807, "num_tokens": 1183277.0, "step": 1050 }, { "epoch": 0.9422222222222222, "grad_norm": 1.6094276905059814, "learning_rate": 0.00014149618320610688, "loss": 1.9195, "mean_token_accuracy": 0.649330523610115, "num_tokens": 1194160.0, "step": 1060 }, { "epoch": 0.9511111111111111, "grad_norm": 1.9643882513046265, "learning_rate": 0.0001408854961832061, "loss": 2.0042, "mean_token_accuracy": 0.6356254667043686, "num_tokens": 1205308.0, "step": 1070 }, { "epoch": 0.96, "grad_norm": 1.8238948583602905, "learning_rate": 0.00014027480916030535, "loss": 1.9172, "mean_token_accuracy": 0.6497033536434174, "num_tokens": 1215760.0, "step": 1080 }, { "epoch": 0.9688888888888889, "grad_norm": 1.7422380447387695, "learning_rate": 0.00013966412213740458, "loss": 2.0213, "mean_token_accuracy": 0.6309294819831848, "num_tokens": 1226775.0, "step": 1090 }, { "epoch": 0.9777777777777777, "grad_norm": 1.651795744895935, "learning_rate": 0.00013905343511450382, "loss": 2.033, "mean_token_accuracy": 0.6295390352606773, "num_tokens": 1238191.0, "step": 1100 }, { "epoch": 0.9866666666666667, "grad_norm": 1.673543095588684, "learning_rate": 0.00013844274809160308, "loss": 2.0085, "mean_token_accuracy": 0.6329691678285598, "num_tokens": 1249561.0, "step": 1110 }, { "epoch": 0.9955555555555555, "grad_norm": 1.7423163652420044, "learning_rate": 0.0001378320610687023, "loss": 1.9751, "mean_token_accuracy": 0.6307685926556588, "num_tokens": 1260429.0, "step": 1120 }, { "epoch": 1.0044444444444445, "grad_norm": 1.4878981113433838, "learning_rate": 0.00013722137404580153, "loss": 1.9171, "mean_token_accuracy": 0.644737622141838, "num_tokens": 1271111.0, "step": 1130 }, { "epoch": 1.0133333333333334, "grad_norm": 1.5343797206878662, "learning_rate": 0.00013661068702290076, "loss": 1.8544, "mean_token_accuracy": 0.6503374725580215, "num_tokens": 1282434.0, "step": 1140 }, { "epoch": 1.0222222222222221, "grad_norm": 1.5450340509414673, "learning_rate": 0.00013600000000000003, "loss": 1.828, "mean_token_accuracy": 0.6514182686805725, "num_tokens": 1294382.0, "step": 1150 }, { "epoch": 1.031111111111111, "grad_norm": 1.8313877582550049, "learning_rate": 0.00013538931297709923, "loss": 1.7704, "mean_token_accuracy": 0.6693721905350685, "num_tokens": 1305343.0, "step": 1160 }, { "epoch": 1.04, "grad_norm": 1.8418430089950562, "learning_rate": 0.00013477862595419847, "loss": 1.7591, "mean_token_accuracy": 0.67226582467556, "num_tokens": 1316558.0, "step": 1170 }, { "epoch": 1.048888888888889, "grad_norm": 1.6022825241088867, "learning_rate": 0.0001341679389312977, "loss": 1.8048, "mean_token_accuracy": 0.6629651457071304, "num_tokens": 1327938.0, "step": 1180 }, { "epoch": 1.0577777777777777, "grad_norm": 1.5888707637786865, "learning_rate": 0.00013355725190839697, "loss": 1.773, "mean_token_accuracy": 0.6730352655053139, "num_tokens": 1338732.0, "step": 1190 }, { "epoch": 1.0666666666666667, "grad_norm": 1.833946943283081, "learning_rate": 0.0001329465648854962, "loss": 1.7887, "mean_token_accuracy": 0.6616317644715309, "num_tokens": 1350096.0, "step": 1200 }, { "epoch": 1.0666666666666667, "eval_loss": 1.9697085618972778, "eval_mean_token_accuracy": 0.6378205664157868, "eval_num_tokens": 1350096.0, "eval_runtime": 49.9237, "eval_samples_per_second": 20.031, "eval_steps_per_second": 10.015, "step": 1200 }, { "epoch": 1.0755555555555556, "grad_norm": 1.6338160037994385, "learning_rate": 0.00013233587786259541, "loss": 1.7889, "mean_token_accuracy": 0.6668319672346115, "num_tokens": 1360771.0, "step": 1210 }, { "epoch": 1.0844444444444445, "grad_norm": 1.8737561702728271, "learning_rate": 0.00013172519083969465, "loss": 1.7997, "mean_token_accuracy": 0.6570939287543297, "num_tokens": 1372450.0, "step": 1220 }, { "epoch": 1.0933333333333333, "grad_norm": 1.758074402809143, "learning_rate": 0.0001311145038167939, "loss": 1.8457, "mean_token_accuracy": 0.653074924647808, "num_tokens": 1383711.0, "step": 1230 }, { "epoch": 1.1022222222222222, "grad_norm": 1.839158296585083, "learning_rate": 0.00013050381679389315, "loss": 1.8013, "mean_token_accuracy": 0.6608111187815666, "num_tokens": 1394856.0, "step": 1240 }, { "epoch": 1.1111111111111112, "grad_norm": 1.733567476272583, "learning_rate": 0.00012989312977099238, "loss": 1.7814, "mean_token_accuracy": 0.6655508041381836, "num_tokens": 1406193.0, "step": 1250 }, { "epoch": 1.12, "grad_norm": 1.6274900436401367, "learning_rate": 0.0001292824427480916, "loss": 1.858, "mean_token_accuracy": 0.6488608077168465, "num_tokens": 1417607.0, "step": 1260 }, { "epoch": 1.1288888888888888, "grad_norm": 1.690090537071228, "learning_rate": 0.00012867175572519086, "loss": 1.8256, "mean_token_accuracy": 0.6595686703920365, "num_tokens": 1429073.0, "step": 1270 }, { "epoch": 1.1377777777777778, "grad_norm": 1.6638071537017822, "learning_rate": 0.0001280610687022901, "loss": 1.8334, "mean_token_accuracy": 0.6580470725893974, "num_tokens": 1440194.0, "step": 1280 }, { "epoch": 1.1466666666666667, "grad_norm": 1.8339307308197021, "learning_rate": 0.00012745038167938933, "loss": 1.783, "mean_token_accuracy": 0.6632378786802292, "num_tokens": 1451221.0, "step": 1290 }, { "epoch": 1.1555555555555554, "grad_norm": 1.7621415853500366, "learning_rate": 0.00012683969465648854, "loss": 1.844, "mean_token_accuracy": 0.6506654173135757, "num_tokens": 1462493.0, "step": 1300 }, { "epoch": 1.1644444444444444, "grad_norm": 1.7811567783355713, "learning_rate": 0.00012622900763358777, "loss": 1.8235, "mean_token_accuracy": 0.6505810797214509, "num_tokens": 1473710.0, "step": 1310 }, { "epoch": 1.1733333333333333, "grad_norm": 1.9157836437225342, "learning_rate": 0.00012561832061068704, "loss": 1.8885, "mean_token_accuracy": 0.6459546625614166, "num_tokens": 1485215.0, "step": 1320 }, { "epoch": 1.1822222222222223, "grad_norm": 1.6572569608688354, "learning_rate": 0.00012500763358778627, "loss": 1.813, "mean_token_accuracy": 0.6597578257322312, "num_tokens": 1496371.0, "step": 1330 }, { "epoch": 1.1911111111111112, "grad_norm": 1.8602449893951416, "learning_rate": 0.0001243969465648855, "loss": 1.8179, "mean_token_accuracy": 0.6519266426563263, "num_tokens": 1508348.0, "step": 1340 }, { "epoch": 1.2, "grad_norm": 1.8736369609832764, "learning_rate": 0.00012378625954198472, "loss": 1.8029, "mean_token_accuracy": 0.6621162816882133, "num_tokens": 1519322.0, "step": 1350 }, { "epoch": 1.208888888888889, "grad_norm": 2.026744842529297, "learning_rate": 0.00012317557251908398, "loss": 1.8168, "mean_token_accuracy": 0.6635635286569596, "num_tokens": 1530183.0, "step": 1360 }, { "epoch": 1.2177777777777778, "grad_norm": 1.7360782623291016, "learning_rate": 0.00012256488549618322, "loss": 1.7521, "mean_token_accuracy": 0.6706348299980164, "num_tokens": 1540862.0, "step": 1370 }, { "epoch": 1.2266666666666666, "grad_norm": 1.9620578289031982, "learning_rate": 0.00012195419847328244, "loss": 1.8228, "mean_token_accuracy": 0.6569086670875549, "num_tokens": 1552212.0, "step": 1380 }, { "epoch": 1.2355555555555555, "grad_norm": 1.6294327974319458, "learning_rate": 0.00012134351145038167, "loss": 1.7654, "mean_token_accuracy": 0.6697377026081085, "num_tokens": 1563356.0, "step": 1390 }, { "epoch": 1.2444444444444445, "grad_norm": 1.7311524152755737, "learning_rate": 0.00012073282442748092, "loss": 1.9019, "mean_token_accuracy": 0.6457875579595566, "num_tokens": 1574569.0, "step": 1400 }, { "epoch": 1.2444444444444445, "eval_loss": 1.9411770105361938, "eval_mean_token_accuracy": 0.6407178282737732, "eval_num_tokens": 1574569.0, "eval_runtime": 48.3309, "eval_samples_per_second": 20.691, "eval_steps_per_second": 10.345, "step": 1400 }, { "epoch": 1.2533333333333334, "grad_norm": 1.8629728555679321, "learning_rate": 0.00012012213740458016, "loss": 1.7585, "mean_token_accuracy": 0.671015702188015, "num_tokens": 1585308.0, "step": 1410 }, { "epoch": 1.2622222222222224, "grad_norm": 1.958808183670044, "learning_rate": 0.0001195114503816794, "loss": 1.8479, "mean_token_accuracy": 0.6535898372530937, "num_tokens": 1596886.0, "step": 1420 }, { "epoch": 1.271111111111111, "grad_norm": 1.950421690940857, "learning_rate": 0.00011890076335877862, "loss": 1.8173, "mean_token_accuracy": 0.6655478686094284, "num_tokens": 1607683.0, "step": 1430 }, { "epoch": 1.28, "grad_norm": 1.8152872323989868, "learning_rate": 0.00011829007633587788, "loss": 1.8791, "mean_token_accuracy": 0.6531546950340271, "num_tokens": 1618906.0, "step": 1440 }, { "epoch": 1.2888888888888888, "grad_norm": 1.7857719659805298, "learning_rate": 0.0001176793893129771, "loss": 1.7887, "mean_token_accuracy": 0.6610255971550941, "num_tokens": 1629981.0, "step": 1450 }, { "epoch": 1.2977777777777777, "grad_norm": 1.8434971570968628, "learning_rate": 0.00011706870229007634, "loss": 1.8368, "mean_token_accuracy": 0.653369964659214, "num_tokens": 1641429.0, "step": 1460 }, { "epoch": 1.3066666666666666, "grad_norm": 1.8877320289611816, "learning_rate": 0.00011645801526717557, "loss": 1.7938, "mean_token_accuracy": 0.6639183640480042, "num_tokens": 1652601.0, "step": 1470 }, { "epoch": 1.3155555555555556, "grad_norm": 1.8121625185012817, "learning_rate": 0.00011584732824427482, "loss": 1.7862, "mean_token_accuracy": 0.661414910852909, "num_tokens": 1663837.0, "step": 1480 }, { "epoch": 1.3244444444444445, "grad_norm": 1.7919855117797852, "learning_rate": 0.00011523664122137406, "loss": 1.8148, "mean_token_accuracy": 0.6654411420226097, "num_tokens": 1675018.0, "step": 1490 }, { "epoch": 1.3333333333333333, "grad_norm": 1.828735589981079, "learning_rate": 0.00011462595419847328, "loss": 1.8456, "mean_token_accuracy": 0.6496043875813484, "num_tokens": 1686136.0, "step": 1500 }, { "epoch": 1.3422222222222222, "grad_norm": 1.9462794065475464, "learning_rate": 0.00011401526717557252, "loss": 1.8412, "mean_token_accuracy": 0.6603908941149712, "num_tokens": 1697160.0, "step": 1510 }, { "epoch": 1.3511111111111112, "grad_norm": 1.6794313192367554, "learning_rate": 0.00011340458015267177, "loss": 1.7774, "mean_token_accuracy": 0.6664682924747467, "num_tokens": 1707831.0, "step": 1520 }, { "epoch": 1.3599999999999999, "grad_norm": 1.8189337253570557, "learning_rate": 0.000112793893129771, "loss": 1.8031, "mean_token_accuracy": 0.6627006307244301, "num_tokens": 1719074.0, "step": 1530 }, { "epoch": 1.3688888888888888, "grad_norm": 2.073533296585083, "learning_rate": 0.00011218320610687022, "loss": 1.8657, "mean_token_accuracy": 0.6476830393075943, "num_tokens": 1730388.0, "step": 1540 }, { "epoch": 1.3777777777777778, "grad_norm": 2.1564207077026367, "learning_rate": 0.00011157251908396946, "loss": 1.8261, "mean_token_accuracy": 0.6567840203642845, "num_tokens": 1741806.0, "step": 1550 }, { "epoch": 1.3866666666666667, "grad_norm": 1.6113232374191284, "learning_rate": 0.00011096183206106871, "loss": 1.7753, "mean_token_accuracy": 0.6659888163208961, "num_tokens": 1753313.0, "step": 1560 }, { "epoch": 1.3955555555555557, "grad_norm": 1.8112174272537231, "learning_rate": 0.00011035114503816795, "loss": 1.8046, "mean_token_accuracy": 0.6593015149235726, "num_tokens": 1765144.0, "step": 1570 }, { "epoch": 1.4044444444444444, "grad_norm": 1.8377541303634644, "learning_rate": 0.00010974045801526718, "loss": 1.8848, "mean_token_accuracy": 0.6533517614006996, "num_tokens": 1776783.0, "step": 1580 }, { "epoch": 1.4133333333333333, "grad_norm": 1.8384325504302979, "learning_rate": 0.0001091297709923664, "loss": 1.7669, "mean_token_accuracy": 0.6613995045423507, "num_tokens": 1788274.0, "step": 1590 }, { "epoch": 1.4222222222222223, "grad_norm": 1.8124533891677856, "learning_rate": 0.00010851908396946567, "loss": 1.8164, "mean_token_accuracy": 0.6591159239411354, "num_tokens": 1799707.0, "step": 1600 }, { "epoch": 1.4222222222222223, "eval_loss": 1.9286668300628662, "eval_mean_token_accuracy": 0.6434953879117966, "eval_num_tokens": 1799707.0, "eval_runtime": 48.6198, "eval_samples_per_second": 20.568, "eval_steps_per_second": 10.284, "step": 1600 }, { "epoch": 1.431111111111111, "grad_norm": 1.6931661367416382, "learning_rate": 0.00010790839694656489, "loss": 1.7548, "mean_token_accuracy": 0.664087076485157, "num_tokens": 1810865.0, "step": 1610 }, { "epoch": 1.44, "grad_norm": 1.7501254081726074, "learning_rate": 0.00010729770992366413, "loss": 1.7652, "mean_token_accuracy": 0.6640020117163659, "num_tokens": 1821807.0, "step": 1620 }, { "epoch": 1.448888888888889, "grad_norm": 1.8411732912063599, "learning_rate": 0.00010668702290076336, "loss": 1.831, "mean_token_accuracy": 0.6564242169260979, "num_tokens": 1832886.0, "step": 1630 }, { "epoch": 1.4577777777777778, "grad_norm": 2.003892183303833, "learning_rate": 0.00010607633587786261, "loss": 1.7791, "mean_token_accuracy": 0.6632592365145683, "num_tokens": 1843989.0, "step": 1640 }, { "epoch": 1.4666666666666668, "grad_norm": 1.7987340688705444, "learning_rate": 0.00010546564885496185, "loss": 1.7627, "mean_token_accuracy": 0.6713873609900475, "num_tokens": 1855106.0, "step": 1650 }, { "epoch": 1.4755555555555555, "grad_norm": 1.931877851486206, "learning_rate": 0.00010485496183206107, "loss": 1.7976, "mean_token_accuracy": 0.6631382897496223, "num_tokens": 1866900.0, "step": 1660 }, { "epoch": 1.4844444444444445, "grad_norm": 1.7883687019348145, "learning_rate": 0.0001042442748091603, "loss": 1.7671, "mean_token_accuracy": 0.6675158813595772, "num_tokens": 1877911.0, "step": 1670 }, { "epoch": 1.4933333333333334, "grad_norm": 1.8195563554763794, "learning_rate": 0.00010363358778625955, "loss": 1.8346, "mean_token_accuracy": 0.652577318251133, "num_tokens": 1889580.0, "step": 1680 }, { "epoch": 1.5022222222222221, "grad_norm": 1.7439149618148804, "learning_rate": 0.00010302290076335879, "loss": 1.7476, "mean_token_accuracy": 0.6717594474554062, "num_tokens": 1901133.0, "step": 1690 }, { "epoch": 1.511111111111111, "grad_norm": 1.8155314922332764, "learning_rate": 0.00010241221374045801, "loss": 1.8044, "mean_token_accuracy": 0.6617274522781372, "num_tokens": 1911796.0, "step": 1700 }, { "epoch": 1.52, "grad_norm": 1.7685112953186035, "learning_rate": 0.00010180152671755725, "loss": 1.7727, "mean_token_accuracy": 0.665304908156395, "num_tokens": 1923217.0, "step": 1710 }, { "epoch": 1.528888888888889, "grad_norm": 1.737053632736206, "learning_rate": 0.0001011908396946565, "loss": 1.8345, "mean_token_accuracy": 0.6577870160341263, "num_tokens": 1934355.0, "step": 1720 }, { "epoch": 1.537777777777778, "grad_norm": 1.9686291217803955, "learning_rate": 0.00010058015267175573, "loss": 1.8165, "mean_token_accuracy": 0.6594037398695946, "num_tokens": 1945653.0, "step": 1730 }, { "epoch": 1.5466666666666666, "grad_norm": 1.844651699066162, "learning_rate": 9.996946564885497e-05, "loss": 1.8273, "mean_token_accuracy": 0.6566928923130035, "num_tokens": 1956891.0, "step": 1740 }, { "epoch": 1.5555555555555556, "grad_norm": 1.8607743978500366, "learning_rate": 9.93587786259542e-05, "loss": 1.785, "mean_token_accuracy": 0.6692357853055, "num_tokens": 1967789.0, "step": 1750 }, { "epoch": 1.5644444444444443, "grad_norm": 1.9204373359680176, "learning_rate": 9.874809160305344e-05, "loss": 1.8264, "mean_token_accuracy": 0.6549209818243981, "num_tokens": 1979224.0, "step": 1760 }, { "epoch": 1.5733333333333333, "grad_norm": 1.7754265069961548, "learning_rate": 9.813740458015268e-05, "loss": 1.7467, "mean_token_accuracy": 0.6670090600848197, "num_tokens": 1990255.0, "step": 1770 }, { "epoch": 1.5822222222222222, "grad_norm": 2.069091796875, "learning_rate": 9.752671755725191e-05, "loss": 1.7731, "mean_token_accuracy": 0.6609751120209694, "num_tokens": 2001606.0, "step": 1780 }, { "epoch": 1.5911111111111111, "grad_norm": 2.1375646591186523, "learning_rate": 9.691603053435115e-05, "loss": 1.8009, "mean_token_accuracy": 0.6624869346618653, "num_tokens": 2012912.0, "step": 1790 }, { "epoch": 1.6, "grad_norm": 1.5623434782028198, "learning_rate": 9.630534351145038e-05, "loss": 1.7383, "mean_token_accuracy": 0.6694582119584084, "num_tokens": 2024571.0, "step": 1800 }, { "epoch": 1.6, "eval_loss": 1.90510892868042, "eval_mean_token_accuracy": 0.6464553346633911, "eval_num_tokens": 2024571.0, "eval_runtime": 48.9449, "eval_samples_per_second": 20.431, "eval_steps_per_second": 10.216, "step": 1800 }, { "epoch": 1.608888888888889, "grad_norm": 1.745969295501709, "learning_rate": 9.569465648854963e-05, "loss": 1.7552, "mean_token_accuracy": 0.6786300778388977, "num_tokens": 2035783.0, "step": 1810 }, { "epoch": 1.6177777777777778, "grad_norm": 1.7463303804397583, "learning_rate": 9.508396946564886e-05, "loss": 1.7495, "mean_token_accuracy": 0.6666959136724472, "num_tokens": 2047304.0, "step": 1820 }, { "epoch": 1.6266666666666667, "grad_norm": 1.9058139324188232, "learning_rate": 9.44732824427481e-05, "loss": 1.8365, "mean_token_accuracy": 0.6536470741033554, "num_tokens": 2058792.0, "step": 1830 }, { "epoch": 1.6355555555555554, "grad_norm": 2.065488576889038, "learning_rate": 9.386259541984733e-05, "loss": 1.7939, "mean_token_accuracy": 0.6519258007407188, "num_tokens": 2070175.0, "step": 1840 }, { "epoch": 1.6444444444444444, "grad_norm": 1.778023600578308, "learning_rate": 9.325190839694658e-05, "loss": 1.8155, "mean_token_accuracy": 0.655296416580677, "num_tokens": 2081343.0, "step": 1850 }, { "epoch": 1.6533333333333333, "grad_norm": 1.7437517642974854, "learning_rate": 9.26412213740458e-05, "loss": 1.7996, "mean_token_accuracy": 0.6618543311953544, "num_tokens": 2093074.0, "step": 1860 }, { "epoch": 1.6622222222222223, "grad_norm": 1.7666471004486084, "learning_rate": 9.203053435114505e-05, "loss": 1.7658, "mean_token_accuracy": 0.6631957843899727, "num_tokens": 2104640.0, "step": 1870 }, { "epoch": 1.6711111111111112, "grad_norm": 1.912842869758606, "learning_rate": 9.141984732824428e-05, "loss": 1.7996, "mean_token_accuracy": 0.6606781020760536, "num_tokens": 2115628.0, "step": 1880 }, { "epoch": 1.6800000000000002, "grad_norm": 1.7230331897735596, "learning_rate": 9.080916030534351e-05, "loss": 1.8042, "mean_token_accuracy": 0.6600380197167397, "num_tokens": 2126505.0, "step": 1890 }, { "epoch": 1.6888888888888889, "grad_norm": 1.7043401002883911, "learning_rate": 9.019847328244276e-05, "loss": 1.7993, "mean_token_accuracy": 0.6613149493932724, "num_tokens": 2138364.0, "step": 1900 }, { "epoch": 1.6977777777777778, "grad_norm": 1.9145572185516357, "learning_rate": 8.958778625954198e-05, "loss": 1.8046, "mean_token_accuracy": 0.662477345764637, "num_tokens": 2149425.0, "step": 1910 }, { "epoch": 1.7066666666666666, "grad_norm": 1.7448140382766724, "learning_rate": 8.897709923664123e-05, "loss": 1.8004, "mean_token_accuracy": 0.6539181426167489, "num_tokens": 2160843.0, "step": 1920 }, { "epoch": 1.7155555555555555, "grad_norm": 1.8304840326309204, "learning_rate": 8.836641221374045e-05, "loss": 1.8404, "mean_token_accuracy": 0.6593489304184914, "num_tokens": 2172044.0, "step": 1930 }, { "epoch": 1.7244444444444444, "grad_norm": 1.802331566810608, "learning_rate": 8.77557251908397e-05, "loss": 1.7995, "mean_token_accuracy": 0.6634193584322929, "num_tokens": 2182916.0, "step": 1940 }, { "epoch": 1.7333333333333334, "grad_norm": 1.9834682941436768, "learning_rate": 8.714503816793894e-05, "loss": 1.7525, "mean_token_accuracy": 0.6685526207089424, "num_tokens": 2194913.0, "step": 1950 }, { "epoch": 1.7422222222222223, "grad_norm": 1.8077235221862793, "learning_rate": 8.653435114503817e-05, "loss": 1.7612, "mean_token_accuracy": 0.6704939991235733, "num_tokens": 2205721.0, "step": 1960 }, { "epoch": 1.751111111111111, "grad_norm": 1.957993745803833, "learning_rate": 8.592366412213741e-05, "loss": 1.8059, "mean_token_accuracy": 0.6547697961330414, "num_tokens": 2217489.0, "step": 1970 }, { "epoch": 1.76, "grad_norm": 1.7215981483459473, "learning_rate": 8.531297709923664e-05, "loss": 1.7913, "mean_token_accuracy": 0.657075221836567, "num_tokens": 2228972.0, "step": 1980 }, { "epoch": 1.7688888888888887, "grad_norm": 1.8760231733322144, "learning_rate": 8.470229007633588e-05, "loss": 1.7923, "mean_token_accuracy": 0.6629065066576004, "num_tokens": 2240239.0, "step": 1990 }, { "epoch": 1.7777777777777777, "grad_norm": 2.092407703399658, "learning_rate": 8.409160305343512e-05, "loss": 1.7593, "mean_token_accuracy": 0.6686230883002281, "num_tokens": 2251436.0, "step": 2000 }, { "epoch": 1.7777777777777777, "eval_loss": 1.893255591392517, "eval_mean_token_accuracy": 0.6482590944766998, "eval_num_tokens": 2251436.0, "eval_runtime": 49.0676, "eval_samples_per_second": 20.38, "eval_steps_per_second": 10.19, "step": 2000 }, { "epoch": 1.7866666666666666, "grad_norm": 1.7836107015609741, "learning_rate": 8.348091603053435e-05, "loss": 1.8033, "mean_token_accuracy": 0.6598399996757507, "num_tokens": 2263069.0, "step": 2010 }, { "epoch": 1.7955555555555556, "grad_norm": 1.7955141067504883, "learning_rate": 8.287022900763359e-05, "loss": 1.7922, "mean_token_accuracy": 0.6619856491684913, "num_tokens": 2274050.0, "step": 2020 }, { "epoch": 1.8044444444444445, "grad_norm": 1.7887564897537231, "learning_rate": 8.225954198473282e-05, "loss": 1.8353, "mean_token_accuracy": 0.658150726556778, "num_tokens": 2285060.0, "step": 2030 }, { "epoch": 1.8133333333333335, "grad_norm": 1.8892567157745361, "learning_rate": 8.164885496183207e-05, "loss": 1.7266, "mean_token_accuracy": 0.6728688895702362, "num_tokens": 2296211.0, "step": 2040 }, { "epoch": 1.8222222222222222, "grad_norm": 1.9226106405258179, "learning_rate": 8.10381679389313e-05, "loss": 1.7243, "mean_token_accuracy": 0.6712497785687447, "num_tokens": 2307184.0, "step": 2050 }, { "epoch": 1.8311111111111111, "grad_norm": 1.735863208770752, "learning_rate": 8.042748091603054e-05, "loss": 1.7739, "mean_token_accuracy": 0.6621047109365463, "num_tokens": 2318602.0, "step": 2060 }, { "epoch": 1.8399999999999999, "grad_norm": 1.8361355066299438, "learning_rate": 7.981679389312977e-05, "loss": 1.8223, "mean_token_accuracy": 0.6560095950961113, "num_tokens": 2330193.0, "step": 2070 }, { "epoch": 1.8488888888888888, "grad_norm": 1.8159486055374146, "learning_rate": 7.920610687022902e-05, "loss": 1.7695, "mean_token_accuracy": 0.6657541528344154, "num_tokens": 2341442.0, "step": 2080 }, { "epoch": 1.8577777777777778, "grad_norm": 1.9189419746398926, "learning_rate": 7.859541984732824e-05, "loss": 1.8333, "mean_token_accuracy": 0.6628425523638726, "num_tokens": 2352479.0, "step": 2090 }, { "epoch": 1.8666666666666667, "grad_norm": 1.8809512853622437, "learning_rate": 7.798473282442749e-05, "loss": 1.7371, "mean_token_accuracy": 0.6683435723185539, "num_tokens": 2363642.0, "step": 2100 }, { "epoch": 1.8755555555555556, "grad_norm": 1.845886468887329, "learning_rate": 7.737404580152672e-05, "loss": 1.7774, "mean_token_accuracy": 0.6559944331645966, "num_tokens": 2375376.0, "step": 2110 }, { "epoch": 1.8844444444444446, "grad_norm": 1.7780894041061401, "learning_rate": 7.676335877862596e-05, "loss": 1.7823, "mean_token_accuracy": 0.6601730152964592, "num_tokens": 2386944.0, "step": 2120 }, { "epoch": 1.8933333333333333, "grad_norm": 1.9167022705078125, "learning_rate": 7.61526717557252e-05, "loss": 1.7869, "mean_token_accuracy": 0.6573449537158013, "num_tokens": 2398391.0, "step": 2130 }, { "epoch": 1.9022222222222223, "grad_norm": 2.037911891937256, "learning_rate": 7.554198473282443e-05, "loss": 1.7858, "mean_token_accuracy": 0.6593190267682075, "num_tokens": 2409837.0, "step": 2140 }, { "epoch": 1.911111111111111, "grad_norm": 1.7496647834777832, "learning_rate": 7.493129770992367e-05, "loss": 1.7241, "mean_token_accuracy": 0.6702290028333664, "num_tokens": 2421607.0, "step": 2150 }, { "epoch": 1.92, "grad_norm": 2.0227596759796143, "learning_rate": 7.43206106870229e-05, "loss": 1.7731, "mean_token_accuracy": 0.6679618924856185, "num_tokens": 2432376.0, "step": 2160 }, { "epoch": 1.9288888888888889, "grad_norm": 1.7401562929153442, "learning_rate": 7.370992366412214e-05, "loss": 1.7684, "mean_token_accuracy": 0.6676609605550766, "num_tokens": 2443683.0, "step": 2170 }, { "epoch": 1.9377777777777778, "grad_norm": 2.709106922149658, "learning_rate": 7.309923664122137e-05, "loss": 1.709, "mean_token_accuracy": 0.6738818466663361, "num_tokens": 2454757.0, "step": 2180 }, { "epoch": 1.9466666666666668, "grad_norm": 1.8504191637039185, "learning_rate": 7.248854961832061e-05, "loss": 1.7411, "mean_token_accuracy": 0.6681609645485878, "num_tokens": 2465562.0, "step": 2190 }, { "epoch": 1.9555555555555557, "grad_norm": 1.9488162994384766, "learning_rate": 7.187786259541986e-05, "loss": 1.7927, "mean_token_accuracy": 0.6587553441524505, "num_tokens": 2476869.0, "step": 2200 }, { "epoch": 1.9555555555555557, "eval_loss": 1.8803235292434692, "eval_mean_token_accuracy": 0.6499251070022583, "eval_num_tokens": 2476869.0, "eval_runtime": 47.7648, "eval_samples_per_second": 20.936, "eval_steps_per_second": 10.468, "step": 2200 }, { "epoch": 1.9644444444444444, "grad_norm": 1.9747337102890015, "learning_rate": 7.132824427480917e-05, "loss": 1.7689, "mean_token_accuracy": 0.666295376420021, "num_tokens": 2487704.0, "step": 2210 }, { "epoch": 1.9733333333333334, "grad_norm": 1.8904316425323486, "learning_rate": 7.071755725190839e-05, "loss": 1.7538, "mean_token_accuracy": 0.6645636394619941, "num_tokens": 2498918.0, "step": 2220 }, { "epoch": 1.982222222222222, "grad_norm": 1.8791844844818115, "learning_rate": 7.010687022900764e-05, "loss": 1.7926, "mean_token_accuracy": 0.6631673067808151, "num_tokens": 2509728.0, "step": 2230 }, { "epoch": 1.991111111111111, "grad_norm": 1.9756606817245483, "learning_rate": 6.949618320610687e-05, "loss": 1.7863, "mean_token_accuracy": 0.6628521859645844, "num_tokens": 2521073.0, "step": 2240 }, { "epoch": 2.0, "grad_norm": 1.7894699573516846, "learning_rate": 6.888549618320611e-05, "loss": 1.7539, "mean_token_accuracy": 0.6728802308440208, "num_tokens": 2531820.0, "step": 2250 }, { "epoch": 2.008888888888889, "grad_norm": 1.702850341796875, "learning_rate": 6.827480916030535e-05, "loss": 1.4903, "mean_token_accuracy": 0.7138098135590554, "num_tokens": 2542512.0, "step": 2260 }, { "epoch": 2.017777777777778, "grad_norm": 1.7931528091430664, "learning_rate": 6.766412213740458e-05, "loss": 1.601, "mean_token_accuracy": 0.6894692406058311, "num_tokens": 2553338.0, "step": 2270 }, { "epoch": 2.026666666666667, "grad_norm": 2.228480339050293, "learning_rate": 6.705343511450382e-05, "loss": 1.609, "mean_token_accuracy": 0.6943154886364937, "num_tokens": 2564182.0, "step": 2280 }, { "epoch": 2.0355555555555553, "grad_norm": 1.9658042192459106, "learning_rate": 6.644274809160305e-05, "loss": 1.6545, "mean_token_accuracy": 0.6824306204915047, "num_tokens": 2575789.0, "step": 2290 }, { "epoch": 2.0444444444444443, "grad_norm": 1.7540594339370728, "learning_rate": 6.583206106870229e-05, "loss": 1.6229, "mean_token_accuracy": 0.6881745710968972, "num_tokens": 2587147.0, "step": 2300 }, { "epoch": 2.0533333333333332, "grad_norm": 1.799501895904541, "learning_rate": 6.522137404580153e-05, "loss": 1.6119, "mean_token_accuracy": 0.6896049126982688, "num_tokens": 2598282.0, "step": 2310 }, { "epoch": 2.062222222222222, "grad_norm": 1.7720867395401, "learning_rate": 6.461068702290076e-05, "loss": 1.5519, "mean_token_accuracy": 0.7038252353668213, "num_tokens": 2609125.0, "step": 2320 }, { "epoch": 2.071111111111111, "grad_norm": 1.994992971420288, "learning_rate": 6.400000000000001e-05, "loss": 1.5872, "mean_token_accuracy": 0.690100908279419, "num_tokens": 2620411.0, "step": 2330 }, { "epoch": 2.08, "grad_norm": 1.9283640384674072, "learning_rate": 6.338931297709923e-05, "loss": 1.5867, "mean_token_accuracy": 0.6923216238617897, "num_tokens": 2631795.0, "step": 2340 }, { "epoch": 2.088888888888889, "grad_norm": 1.9957973957061768, "learning_rate": 6.277862595419848e-05, "loss": 1.5996, "mean_token_accuracy": 0.6924369186162949, "num_tokens": 2643179.0, "step": 2350 }, { "epoch": 2.097777777777778, "grad_norm": 2.0207560062408447, "learning_rate": 6.21679389312977e-05, "loss": 1.515, "mean_token_accuracy": 0.7066755428910255, "num_tokens": 2654206.0, "step": 2360 }, { "epoch": 2.1066666666666665, "grad_norm": 1.8871878385543823, "learning_rate": 6.155725190839695e-05, "loss": 1.6139, "mean_token_accuracy": 0.687422800064087, "num_tokens": 2665582.0, "step": 2370 }, { "epoch": 2.1155555555555554, "grad_norm": 1.717610478401184, "learning_rate": 6.094656488549618e-05, "loss": 1.6388, "mean_token_accuracy": 0.6870575189590454, "num_tokens": 2677533.0, "step": 2380 }, { "epoch": 2.1244444444444444, "grad_norm": 1.8574187755584717, "learning_rate": 6.0335877862595426e-05, "loss": 1.557, "mean_token_accuracy": 0.6999430671334267, "num_tokens": 2688755.0, "step": 2390 }, { "epoch": 2.1333333333333333, "grad_norm": 1.9739580154418945, "learning_rate": 5.9725190839694655e-05, "loss": 1.6553, "mean_token_accuracy": 0.6819543272256852, "num_tokens": 2700558.0, "step": 2400 }, { "epoch": 2.1333333333333333, "eval_loss": 1.8970768451690674, "eval_mean_token_accuracy": 0.6490416256189346, "eval_num_tokens": 2700558.0, "eval_runtime": 47.6704, "eval_samples_per_second": 20.977, "eval_steps_per_second": 10.489, "step": 2400 }, { "epoch": 2.1422222222222222, "grad_norm": 1.893918514251709, "learning_rate": 5.91145038167939e-05, "loss": 1.5459, "mean_token_accuracy": 0.6963777393102646, "num_tokens": 2711713.0, "step": 2410 }, { "epoch": 2.151111111111111, "grad_norm": 1.9607445001602173, "learning_rate": 5.850381679389313e-05, "loss": 1.6373, "mean_token_accuracy": 0.6815788432955742, "num_tokens": 2723686.0, "step": 2420 }, { "epoch": 2.16, "grad_norm": 2.091732978820801, "learning_rate": 5.789312977099237e-05, "loss": 1.6422, "mean_token_accuracy": 0.6811213716864586, "num_tokens": 2735300.0, "step": 2430 }, { "epoch": 2.168888888888889, "grad_norm": 2.1138076782226562, "learning_rate": 5.7282442748091605e-05, "loss": 1.5848, "mean_token_accuracy": 0.6962573245167732, "num_tokens": 2746248.0, "step": 2440 }, { "epoch": 2.1777777777777776, "grad_norm": 2.1495392322540283, "learning_rate": 5.667175572519085e-05, "loss": 1.576, "mean_token_accuracy": 0.6990228727459907, "num_tokens": 2757259.0, "step": 2450 }, { "epoch": 2.1866666666666665, "grad_norm": 2.1444251537323, "learning_rate": 5.606106870229008e-05, "loss": 1.5979, "mean_token_accuracy": 0.6916472837328911, "num_tokens": 2768228.0, "step": 2460 }, { "epoch": 2.1955555555555555, "grad_norm": 1.945489525794983, "learning_rate": 5.545038167938932e-05, "loss": 1.5663, "mean_token_accuracy": 0.7005513325333595, "num_tokens": 2779254.0, "step": 2470 }, { "epoch": 2.2044444444444444, "grad_norm": 1.8256646394729614, "learning_rate": 5.483969465648855e-05, "loss": 1.5751, "mean_token_accuracy": 0.6961624413728714, "num_tokens": 2790326.0, "step": 2480 }, { "epoch": 2.2133333333333334, "grad_norm": 1.9541441202163696, "learning_rate": 5.422900763358779e-05, "loss": 1.6268, "mean_token_accuracy": 0.6893054991960526, "num_tokens": 2801625.0, "step": 2490 }, { "epoch": 2.2222222222222223, "grad_norm": 2.0127615928649902, "learning_rate": 5.361832061068702e-05, "loss": 1.6096, "mean_token_accuracy": 0.6923437744379044, "num_tokens": 2813010.0, "step": 2500 }, { "epoch": 2.2311111111111113, "grad_norm": 2.0325839519500732, "learning_rate": 5.300763358778626e-05, "loss": 1.5963, "mean_token_accuracy": 0.6913090571761131, "num_tokens": 2824021.0, "step": 2510 }, { "epoch": 2.24, "grad_norm": 2.1595821380615234, "learning_rate": 5.23969465648855e-05, "loss": 1.5617, "mean_token_accuracy": 0.7037980020046234, "num_tokens": 2835232.0, "step": 2520 }, { "epoch": 2.2488888888888887, "grad_norm": 2.11661958694458, "learning_rate": 5.178625954198474e-05, "loss": 1.6213, "mean_token_accuracy": 0.6836483731865883, "num_tokens": 2846524.0, "step": 2530 }, { "epoch": 2.2577777777777777, "grad_norm": 1.88747239112854, "learning_rate": 5.117557251908397e-05, "loss": 1.6408, "mean_token_accuracy": 0.6860729962587356, "num_tokens": 2857788.0, "step": 2540 }, { "epoch": 2.2666666666666666, "grad_norm": 1.9622093439102173, "learning_rate": 5.056488549618321e-05, "loss": 1.5519, "mean_token_accuracy": 0.7002682030200958, "num_tokens": 2868618.0, "step": 2550 }, { "epoch": 2.2755555555555556, "grad_norm": 1.9343371391296387, "learning_rate": 4.995419847328244e-05, "loss": 1.5795, "mean_token_accuracy": 0.6934511423110962, "num_tokens": 2879999.0, "step": 2560 }, { "epoch": 2.2844444444444445, "grad_norm": 1.9991627931594849, "learning_rate": 4.934351145038168e-05, "loss": 1.6183, "mean_token_accuracy": 0.6901679039001465, "num_tokens": 2891053.0, "step": 2570 }, { "epoch": 2.2933333333333334, "grad_norm": 1.9480003118515015, "learning_rate": 4.8732824427480914e-05, "loss": 1.5826, "mean_token_accuracy": 0.7007558569312096, "num_tokens": 2901905.0, "step": 2580 }, { "epoch": 2.3022222222222224, "grad_norm": 2.021207332611084, "learning_rate": 4.812213740458015e-05, "loss": 1.6348, "mean_token_accuracy": 0.6848765298724174, "num_tokens": 2913571.0, "step": 2590 }, { "epoch": 2.311111111111111, "grad_norm": 1.8385164737701416, "learning_rate": 4.751145038167939e-05, "loss": 1.5763, "mean_token_accuracy": 0.6912240386009216, "num_tokens": 2925533.0, "step": 2600 }, { "epoch": 2.311111111111111, "eval_loss": 1.8940143585205078, "eval_mean_token_accuracy": 0.6499911918640137, "eval_num_tokens": 2925533.0, "eval_runtime": 47.456, "eval_samples_per_second": 21.072, "eval_steps_per_second": 10.536, "step": 2600 }, { "epoch": 2.32, "grad_norm": 1.9455375671386719, "learning_rate": 4.690076335877863e-05, "loss": 1.598, "mean_token_accuracy": 0.6915700435638428, "num_tokens": 2936620.0, "step": 2610 }, { "epoch": 2.328888888888889, "grad_norm": 1.863487720489502, "learning_rate": 4.6290076335877864e-05, "loss": 1.5512, "mean_token_accuracy": 0.7025073647499085, "num_tokens": 2947753.0, "step": 2620 }, { "epoch": 2.3377777777777777, "grad_norm": 1.9756685495376587, "learning_rate": 4.56793893129771e-05, "loss": 1.5973, "mean_token_accuracy": 0.6870647758245468, "num_tokens": 2959635.0, "step": 2630 }, { "epoch": 2.3466666666666667, "grad_norm": 2.190765142440796, "learning_rate": 4.5068702290076336e-05, "loss": 1.5948, "mean_token_accuracy": 0.6888303905725479, "num_tokens": 2971675.0, "step": 2640 }, { "epoch": 2.3555555555555556, "grad_norm": 1.827318787574768, "learning_rate": 4.445801526717557e-05, "loss": 1.5682, "mean_token_accuracy": 0.6952902913093567, "num_tokens": 2982744.0, "step": 2650 }, { "epoch": 2.3644444444444446, "grad_norm": 2.11799693107605, "learning_rate": 4.384732824427481e-05, "loss": 1.6221, "mean_token_accuracy": 0.6794109031558037, "num_tokens": 2994347.0, "step": 2660 }, { "epoch": 2.3733333333333335, "grad_norm": 2.1472220420837402, "learning_rate": 4.3236641221374044e-05, "loss": 1.6353, "mean_token_accuracy": 0.6876759916543961, "num_tokens": 3005174.0, "step": 2670 }, { "epoch": 2.3822222222222225, "grad_norm": 1.9971054792404175, "learning_rate": 4.2625954198473286e-05, "loss": 1.5372, "mean_token_accuracy": 0.7059834420680999, "num_tokens": 3016492.0, "step": 2680 }, { "epoch": 2.391111111111111, "grad_norm": 2.067861318588257, "learning_rate": 4.201526717557252e-05, "loss": 1.572, "mean_token_accuracy": 0.6911077201366425, "num_tokens": 3027826.0, "step": 2690 }, { "epoch": 2.4, "grad_norm": 2.0372536182403564, "learning_rate": 4.140458015267176e-05, "loss": 1.5615, "mean_token_accuracy": 0.6972797185182571, "num_tokens": 3038770.0, "step": 2700 }, { "epoch": 2.408888888888889, "grad_norm": 2.15972638130188, "learning_rate": 4.0793893129770994e-05, "loss": 1.5806, "mean_token_accuracy": 0.6947444006800652, "num_tokens": 3050159.0, "step": 2710 }, { "epoch": 2.417777777777778, "grad_norm": 2.059760808944702, "learning_rate": 4.018320610687023e-05, "loss": 1.6167, "mean_token_accuracy": 0.6882677704095841, "num_tokens": 3061009.0, "step": 2720 }, { "epoch": 2.4266666666666667, "grad_norm": 1.9914629459381104, "learning_rate": 3.9572519083969466e-05, "loss": 1.5508, "mean_token_accuracy": 0.6985371947288513, "num_tokens": 3072232.0, "step": 2730 }, { "epoch": 2.4355555555555557, "grad_norm": 2.0151119232177734, "learning_rate": 3.89618320610687e-05, "loss": 1.663, "mean_token_accuracy": 0.6849021047353745, "num_tokens": 3083939.0, "step": 2740 }, { "epoch": 2.4444444444444446, "grad_norm": 2.02457332611084, "learning_rate": 3.835114503816794e-05, "loss": 1.6043, "mean_token_accuracy": 0.6891427770256996, "num_tokens": 3095354.0, "step": 2750 }, { "epoch": 2.453333333333333, "grad_norm": 1.930341362953186, "learning_rate": 3.774045801526718e-05, "loss": 1.5648, "mean_token_accuracy": 0.6962095096707344, "num_tokens": 3106679.0, "step": 2760 }, { "epoch": 2.462222222222222, "grad_norm": 2.1718850135803223, "learning_rate": 3.7129770992366416e-05, "loss": 1.5514, "mean_token_accuracy": 0.6997211873531342, "num_tokens": 3117440.0, "step": 2770 }, { "epoch": 2.471111111111111, "grad_norm": 1.89506196975708, "learning_rate": 3.651908396946565e-05, "loss": 1.6102, "mean_token_accuracy": 0.6865462198853493, "num_tokens": 3128685.0, "step": 2780 }, { "epoch": 2.48, "grad_norm": 2.1102652549743652, "learning_rate": 3.590839694656489e-05, "loss": 1.6092, "mean_token_accuracy": 0.6845578849315643, "num_tokens": 3140574.0, "step": 2790 }, { "epoch": 2.488888888888889, "grad_norm": 1.9541523456573486, "learning_rate": 3.5297709923664124e-05, "loss": 1.6245, "mean_token_accuracy": 0.6867643877863884, "num_tokens": 3151937.0, "step": 2800 }, { "epoch": 2.488888888888889, "eval_loss": 1.8869248628616333, "eval_mean_token_accuracy": 0.6508636207580566, "eval_num_tokens": 3151937.0, "eval_runtime": 46.9872, "eval_samples_per_second": 21.282, "eval_steps_per_second": 10.641, "step": 2800 }, { "epoch": 2.497777777777778, "grad_norm": 2.006448984146118, "learning_rate": 3.468702290076336e-05, "loss": 1.6458, "mean_token_accuracy": 0.6835160732269288, "num_tokens": 3163343.0, "step": 2810 }, { "epoch": 2.506666666666667, "grad_norm": 2.0644562244415283, "learning_rate": 3.4076335877862595e-05, "loss": 1.5841, "mean_token_accuracy": 0.699130979180336, "num_tokens": 3174278.0, "step": 2820 }, { "epoch": 2.5155555555555553, "grad_norm": 2.5352766513824463, "learning_rate": 3.346564885496183e-05, "loss": 1.6411, "mean_token_accuracy": 0.687686163187027, "num_tokens": 3185529.0, "step": 2830 }, { "epoch": 2.5244444444444447, "grad_norm": 2.2506706714630127, "learning_rate": 3.2854961832061074e-05, "loss": 1.5334, "mean_token_accuracy": 0.7042266175150871, "num_tokens": 3196422.0, "step": 2840 }, { "epoch": 2.533333333333333, "grad_norm": 2.038456439971924, "learning_rate": 3.224427480916031e-05, "loss": 1.5226, "mean_token_accuracy": 0.7002356797456741, "num_tokens": 3207640.0, "step": 2850 }, { "epoch": 2.542222222222222, "grad_norm": 2.0818448066711426, "learning_rate": 3.1633587786259545e-05, "loss": 1.5136, "mean_token_accuracy": 0.7040936380624772, "num_tokens": 3218742.0, "step": 2860 }, { "epoch": 2.551111111111111, "grad_norm": 1.9810820817947388, "learning_rate": 3.102290076335878e-05, "loss": 1.6515, "mean_token_accuracy": 0.6826088905334473, "num_tokens": 3230062.0, "step": 2870 }, { "epoch": 2.56, "grad_norm": 2.1830689907073975, "learning_rate": 3.0412213740458017e-05, "loss": 1.5792, "mean_token_accuracy": 0.699496129155159, "num_tokens": 3240533.0, "step": 2880 }, { "epoch": 2.568888888888889, "grad_norm": 2.101184368133545, "learning_rate": 2.9801526717557253e-05, "loss": 1.6538, "mean_token_accuracy": 0.6724523141980171, "num_tokens": 3252476.0, "step": 2890 }, { "epoch": 2.5777777777777775, "grad_norm": 2.021524429321289, "learning_rate": 2.9190839694656492e-05, "loss": 1.6146, "mean_token_accuracy": 0.6886414483189582, "num_tokens": 3263799.0, "step": 2900 }, { "epoch": 2.586666666666667, "grad_norm": 1.9668735265731812, "learning_rate": 2.8580152671755728e-05, "loss": 1.6477, "mean_token_accuracy": 0.678925508260727, "num_tokens": 3275511.0, "step": 2910 }, { "epoch": 2.5955555555555554, "grad_norm": 2.088491201400757, "learning_rate": 2.7969465648854964e-05, "loss": 1.6265, "mean_token_accuracy": 0.6857595339417457, "num_tokens": 3286752.0, "step": 2920 }, { "epoch": 2.6044444444444443, "grad_norm": 2.0536880493164062, "learning_rate": 2.73587786259542e-05, "loss": 1.66, "mean_token_accuracy": 0.681273227930069, "num_tokens": 3297945.0, "step": 2930 }, { "epoch": 2.6133333333333333, "grad_norm": 2.0063817501068115, "learning_rate": 2.674809160305344e-05, "loss": 1.5102, "mean_token_accuracy": 0.7025244757533073, "num_tokens": 3309112.0, "step": 2940 }, { "epoch": 2.6222222222222222, "grad_norm": 1.9980206489562988, "learning_rate": 2.6137404580152675e-05, "loss": 1.5142, "mean_token_accuracy": 0.7049572348594666, "num_tokens": 3320544.0, "step": 2950 }, { "epoch": 2.631111111111111, "grad_norm": 2.1506435871124268, "learning_rate": 2.552671755725191e-05, "loss": 1.5826, "mean_token_accuracy": 0.694467018544674, "num_tokens": 3331309.0, "step": 2960 }, { "epoch": 2.64, "grad_norm": 1.9890793561935425, "learning_rate": 2.4916030534351147e-05, "loss": 1.5631, "mean_token_accuracy": 0.6945617944002151, "num_tokens": 3343068.0, "step": 2970 }, { "epoch": 2.648888888888889, "grad_norm": 2.1102676391601562, "learning_rate": 2.4305343511450383e-05, "loss": 1.6145, "mean_token_accuracy": 0.6866093754768372, "num_tokens": 3354691.0, "step": 2980 }, { "epoch": 2.6577777777777776, "grad_norm": 2.2881674766540527, "learning_rate": 2.369465648854962e-05, "loss": 1.5796, "mean_token_accuracy": 0.6961612686514854, "num_tokens": 3365512.0, "step": 2990 }, { "epoch": 2.6666666666666665, "grad_norm": 1.973838210105896, "learning_rate": 2.3083969465648854e-05, "loss": 1.5456, "mean_token_accuracy": 0.703473174571991, "num_tokens": 3376406.0, "step": 3000 }, { "epoch": 2.6666666666666665, "eval_loss": 1.881131649017334, "eval_mean_token_accuracy": 0.6518214672803879, "eval_num_tokens": 3376406.0, "eval_runtime": 47.794, "eval_samples_per_second": 20.923, "eval_steps_per_second": 10.462, "step": 3000 } ], "logging_steps": 10, "max_steps": 3375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0707043350011904e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }