{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 5030, "global_step": 150876, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019883878151594685, "grad_norm": 2.3147895336151123, "learning_rate": 0.0001994694115774394, "loss": 0.594, "step": 500 }, { "epoch": 0.03976775630318937, "grad_norm": 2.2017321586608887, "learning_rate": 0.0001988061760492386, "loss": 0.4251, "step": 1000 }, { "epoch": 0.05965163445478406, "grad_norm": 2.0374693870544434, "learning_rate": 0.00019814294052103784, "loss": 0.386, "step": 1500 }, { "epoch": 0.07953551260637874, "grad_norm": 1.3447909355163574, "learning_rate": 0.00019747970499283706, "loss": 0.3585, "step": 2000 }, { "epoch": 0.09941939075797343, "grad_norm": 1.2458261251449585, "learning_rate": 0.00019682044887780548, "loss": 0.3545, "step": 2500 }, { "epoch": 0.11930326890956812, "grad_norm": 1.844347357749939, "learning_rate": 0.00019615721334960472, "loss": 0.3329, "step": 3000 }, { "epoch": 0.1391871470611628, "grad_norm": 1.3815340995788574, "learning_rate": 0.00019549397782140396, "loss": 0.3182, "step": 3500 }, { "epoch": 0.15907102521275748, "grad_norm": 2.2595863342285156, "learning_rate": 0.00019483074229320317, "loss": 0.2985, "step": 4000 }, { "epoch": 0.17895490336435219, "grad_norm": 1.8043708801269531, "learning_rate": 0.00019416750676500238, "loss": 0.3068, "step": 4500 }, { "epoch": 0.19883878151594686, "grad_norm": 1.8568594455718994, "learning_rate": 0.00019350427123680162, "loss": 0.3044, "step": 5000 }, { "epoch": 0.20003181420504254, "eval_loss": 0.29235559701919556, "eval_runtime": 31.6112, "eval_samples_per_second": 15.817, "eval_steps_per_second": 7.909, "step": 5030 }, { "epoch": 0.21872265966754156, "grad_norm": 1.780515193939209, "learning_rate": 0.00019284103570860084, "loss": 0.2884, "step": 5500 }, { "epoch": 0.23860653781913624, "grad_norm": 1.3652188777923584, "learning_rate": 0.00019217780018040008, "loss": 0.2924, "step": 6000 }, { "epoch": 0.2584904159707309, "grad_norm": 1.5280920267105103, "learning_rate": 0.0001915158911232557, "loss": 0.2714, "step": 6500 }, { "epoch": 0.2783742941223256, "grad_norm": 1.8563861846923828, "learning_rate": 0.00019085398206611135, "loss": 0.2729, "step": 7000 }, { "epoch": 0.2982581722739203, "grad_norm": 1.0958434343338013, "learning_rate": 0.00019019074653791056, "loss": 0.2735, "step": 7500 }, { "epoch": 0.31814205042551497, "grad_norm": 1.145859956741333, "learning_rate": 0.00018952751100970977, "loss": 0.2737, "step": 8000 }, { "epoch": 0.33802592857710967, "grad_norm": 1.2784571647644043, "learning_rate": 0.000188864275481509, "loss": 0.2611, "step": 8500 }, { "epoch": 0.35790980672870437, "grad_norm": 1.3518733978271484, "learning_rate": 0.00018820103995330822, "loss": 0.2612, "step": 9000 }, { "epoch": 0.3777936848802991, "grad_norm": 1.4692599773406982, "learning_rate": 0.00018753780442510743, "loss": 0.2609, "step": 9500 }, { "epoch": 0.3976775630318937, "grad_norm": 2.682018518447876, "learning_rate": 0.00018687456889690667, "loss": 0.2643, "step": 10000 }, { "epoch": 0.4000636284100851, "eval_loss": 0.2585141062736511, "eval_runtime": 31.9094, "eval_samples_per_second": 15.669, "eval_steps_per_second": 7.835, "step": 10060 }, { "epoch": 0.4175614411834884, "grad_norm": 1.7332996129989624, "learning_rate": 0.00018621133336870591, "loss": 0.2523, "step": 10500 }, { "epoch": 0.4374453193350831, "grad_norm": 2.1415417194366455, "learning_rate": 0.00018554942431156152, "loss": 0.2585, "step": 11000 }, { "epoch": 0.4573291974866778, "grad_norm": 1.2841447591781616, "learning_rate": 0.00018488618878336073, "loss": 0.2557, "step": 11500 }, { "epoch": 0.4772130756382725, "grad_norm": 3.2847816944122314, "learning_rate": 0.00018422295325516, "loss": 0.2609, "step": 12000 }, { "epoch": 0.4970969537898672, "grad_norm": 0.7331926822662354, "learning_rate": 0.00018355971772695921, "loss": 0.2477, "step": 12500 }, { "epoch": 0.5169808319414618, "grad_norm": 1.4415650367736816, "learning_rate": 0.00018289648219875843, "loss": 0.2482, "step": 13000 }, { "epoch": 0.5368647100930566, "grad_norm": 1.7630778551101685, "learning_rate": 0.00018223457314161406, "loss": 0.2421, "step": 13500 }, { "epoch": 0.5567485882446512, "grad_norm": 2.0509490966796875, "learning_rate": 0.0001815713376134133, "loss": 0.2444, "step": 14000 }, { "epoch": 0.5766324663962459, "grad_norm": 1.959215760231018, "learning_rate": 0.0001809081020852125, "loss": 0.2398, "step": 14500 }, { "epoch": 0.5965163445478406, "grad_norm": 1.6416336297988892, "learning_rate": 0.00018024486655701173, "loss": 0.2396, "step": 15000 }, { "epoch": 0.6000954426151277, "eval_loss": 0.23102714121341705, "eval_runtime": 32.4432, "eval_samples_per_second": 15.412, "eval_steps_per_second": 7.706, "step": 15090 }, { "epoch": 0.6164002226994353, "grad_norm": 0.5961350798606873, "learning_rate": 0.00017958163102881097, "loss": 0.2291, "step": 15500 }, { "epoch": 0.6362841008510299, "grad_norm": 1.3288564682006836, "learning_rate": 0.00017891972197166657, "loss": 0.2368, "step": 16000 }, { "epoch": 0.6561679790026247, "grad_norm": 1.7699204683303833, "learning_rate": 0.0001782564864434658, "loss": 0.2397, "step": 16500 }, { "epoch": 0.6760518571542193, "grad_norm": 1.3349616527557373, "learning_rate": 0.00017759325091526505, "loss": 0.2353, "step": 17000 }, { "epoch": 0.6959357353058141, "grad_norm": 1.6228617429733276, "learning_rate": 0.00017693001538706426, "loss": 0.2314, "step": 17500 }, { "epoch": 0.7158196134574087, "grad_norm": 1.349108338356018, "learning_rate": 0.00017626677985886348, "loss": 0.2339, "step": 18000 }, { "epoch": 0.7357034916090034, "grad_norm": 2.2128469944000244, "learning_rate": 0.00017560354433066272, "loss": 0.2252, "step": 18500 }, { "epoch": 0.7555873697605981, "grad_norm": 2.038177013397217, "learning_rate": 0.00017494163527351835, "loss": 0.2242, "step": 19000 }, { "epoch": 0.7754712479121928, "grad_norm": 1.1377464532852173, "learning_rate": 0.00017427839974531756, "loss": 0.2197, "step": 19500 }, { "epoch": 0.7953551260637874, "grad_norm": 1.2686738967895508, "learning_rate": 0.00017361516421711678, "loss": 0.2277, "step": 20000 }, { "epoch": 0.8001272568201702, "eval_loss": 0.21075737476348877, "eval_runtime": 32.819, "eval_samples_per_second": 15.235, "eval_steps_per_second": 7.618, "step": 20120 }, { "epoch": 0.8152390042153822, "grad_norm": 2.1816446781158447, "learning_rate": 0.00017295192868891602, "loss": 0.2245, "step": 20500 }, { "epoch": 0.8351228823669768, "grad_norm": 2.224292755126953, "learning_rate": 0.00017229001963177165, "loss": 0.2176, "step": 21000 }, { "epoch": 0.8550067605185715, "grad_norm": 0.7720803618431091, "learning_rate": 0.00017162811057462726, "loss": 0.2097, "step": 21500 }, { "epoch": 0.8748906386701663, "grad_norm": 1.716422200202942, "learning_rate": 0.0001709648750464265, "loss": 0.2104, "step": 22000 }, { "epoch": 0.8947745168217609, "grad_norm": 1.5487083196640015, "learning_rate": 0.00017030163951822574, "loss": 0.2179, "step": 22500 }, { "epoch": 0.9146583949733555, "grad_norm": 1.0968928337097168, "learning_rate": 0.00016963840399002495, "loss": 0.2194, "step": 23000 }, { "epoch": 0.9345422731249503, "grad_norm": 0.36388519406318665, "learning_rate": 0.00016897516846182416, "loss": 0.2259, "step": 23500 }, { "epoch": 0.954426151276545, "grad_norm": 1.2857156991958618, "learning_rate": 0.0001683119329336234, "loss": 0.2195, "step": 24000 }, { "epoch": 0.9743100294281397, "grad_norm": 1.7136821746826172, "learning_rate": 0.00016764869740542261, "loss": 0.2036, "step": 24500 }, { "epoch": 0.9941939075797344, "grad_norm": 2.1068880558013916, "learning_rate": 0.00016698546187722183, "loss": 0.2103, "step": 25000 }, { "epoch": 1.0001590710252128, "eval_loss": 0.2087584286928177, "eval_runtime": 34.3797, "eval_samples_per_second": 14.543, "eval_steps_per_second": 7.272, "step": 25150 }, { "epoch": 1.014077785731329, "grad_norm": 0.9411464929580688, "learning_rate": 0.0001663222263490211, "loss": 0.1902, "step": 25500 }, { "epoch": 1.0339616638829237, "grad_norm": 1.8499776124954224, "learning_rate": 0.0001656589908208203, "loss": 0.1961, "step": 26000 }, { "epoch": 1.0538455420345185, "grad_norm": 2.804515838623047, "learning_rate": 0.00016499708176367591, "loss": 0.1975, "step": 26500 }, { "epoch": 1.0737294201861132, "grad_norm": 1.0124012231826782, "learning_rate": 0.00016433384623547515, "loss": 0.1965, "step": 27000 }, { "epoch": 1.0936132983377078, "grad_norm": 2.1755409240722656, "learning_rate": 0.0001636706107072744, "loss": 0.2032, "step": 27500 }, { "epoch": 1.1134971764893025, "grad_norm": 1.9173059463500977, "learning_rate": 0.0001630073751790736, "loss": 0.1998, "step": 28000 }, { "epoch": 1.1333810546408971, "grad_norm": 2.6213159561157227, "learning_rate": 0.0001623454661219292, "loss": 0.1944, "step": 28500 }, { "epoch": 1.1532649327924918, "grad_norm": 1.979064702987671, "learning_rate": 0.00016168223059372845, "loss": 0.1947, "step": 29000 }, { "epoch": 1.1731488109440866, "grad_norm": 3.864307403564453, "learning_rate": 0.00016101899506552767, "loss": 0.1948, "step": 29500 }, { "epoch": 1.1930326890956813, "grad_norm": 1.3037413358688354, "learning_rate": 0.0001603557595373269, "loss": 0.1942, "step": 30000 }, { "epoch": 1.2001908852302554, "eval_loss": 0.20310936868190765, "eval_runtime": 33.1759, "eval_samples_per_second": 15.071, "eval_steps_per_second": 7.536, "step": 30180 }, { "epoch": 1.212916567247276, "grad_norm": 1.4742060899734497, "learning_rate": 0.00015969385048018254, "loss": 0.1873, "step": 30500 }, { "epoch": 1.2328004453988706, "grad_norm": 0.8921416401863098, "learning_rate": 0.00015903061495198175, "loss": 0.1888, "step": 31000 }, { "epoch": 1.2526843235504652, "grad_norm": 2.814047336578369, "learning_rate": 0.00015836737942378096, "loss": 0.1933, "step": 31500 }, { "epoch": 1.27256820170206, "grad_norm": 1.6011271476745605, "learning_rate": 0.0001577041438955802, "loss": 0.1943, "step": 32000 }, { "epoch": 1.2924520798536547, "grad_norm": 2.1085023880004883, "learning_rate": 0.00015704090836737944, "loss": 0.1939, "step": 32500 }, { "epoch": 1.3123359580052494, "grad_norm": 1.4254250526428223, "learning_rate": 0.00015637899931023505, "loss": 0.1898, "step": 33000 }, { "epoch": 1.332219836156844, "grad_norm": 2.6085307598114014, "learning_rate": 0.00015571576378203426, "loss": 0.1869, "step": 33500 }, { "epoch": 1.3521037143084387, "grad_norm": 1.4234007596969604, "learning_rate": 0.00015505252825383353, "loss": 0.1924, "step": 34000 }, { "epoch": 1.3719875924600333, "grad_norm": 1.5566816329956055, "learning_rate": 0.00015439061919668914, "loss": 0.1932, "step": 34500 }, { "epoch": 1.391871470611628, "grad_norm": 8.811753273010254, "learning_rate": 0.00015372738366848835, "loss": 0.1846, "step": 35000 }, { "epoch": 1.400222699435298, "eval_loss": 0.19551397860050201, "eval_runtime": 35.3626, "eval_samples_per_second": 14.139, "eval_steps_per_second": 7.07, "step": 35210 }, { "epoch": 1.4117553487632228, "grad_norm": 1.463742971420288, "learning_rate": 0.0001530641481402876, "loss": 0.1924, "step": 35500 }, { "epoch": 1.4316392269148175, "grad_norm": 2.8610801696777344, "learning_rate": 0.00015240091261208683, "loss": 0.1938, "step": 36000 }, { "epoch": 1.4515231050664121, "grad_norm": 1.2369511127471924, "learning_rate": 0.00015173767708388604, "loss": 0.1817, "step": 36500 }, { "epoch": 1.4714069832180068, "grad_norm": 1.4233882427215576, "learning_rate": 0.00015107444155568526, "loss": 0.1817, "step": 37000 }, { "epoch": 1.4912908613696017, "grad_norm": 1.5385404825210571, "learning_rate": 0.0001504112060274845, "loss": 0.1849, "step": 37500 }, { "epoch": 1.5111747395211963, "grad_norm": 1.2022610902786255, "learning_rate": 0.0001497479704992837, "loss": 0.183, "step": 38000 }, { "epoch": 1.531058617672791, "grad_norm": 1.5988807678222656, "learning_rate": 0.00014908606144213934, "loss": 0.1821, "step": 38500 }, { "epoch": 1.5509424958243856, "grad_norm": 1.480351209640503, "learning_rate": 0.00014842282591393855, "loss": 0.1787, "step": 39000 }, { "epoch": 1.5708263739759802, "grad_norm": 1.9051077365875244, "learning_rate": 0.0001477595903857378, "loss": 0.1826, "step": 39500 }, { "epoch": 1.5907102521275749, "grad_norm": 2.0775561332702637, "learning_rate": 0.000147096354857537, "loss": 0.1814, "step": 40000 }, { "epoch": 1.6002545136403405, "eval_loss": 0.18265177309513092, "eval_runtime": 31.3228, "eval_samples_per_second": 15.963, "eval_steps_per_second": 7.981, "step": 40240 }, { "epoch": 1.6105941302791695, "grad_norm": 2.4270079135894775, "learning_rate": 0.00014643444580039264, "loss": 0.1728, "step": 40500 }, { "epoch": 1.6304780084307642, "grad_norm": 1.4693158864974976, "learning_rate": 0.00014577121027219188, "loss": 0.1835, "step": 41000 }, { "epoch": 1.650361886582359, "grad_norm": 2.014503240585327, "learning_rate": 0.0001451079747439911, "loss": 0.1825, "step": 41500 }, { "epoch": 1.6702457647339537, "grad_norm": 2.4731264114379883, "learning_rate": 0.0001444447392157903, "loss": 0.1803, "step": 42000 }, { "epoch": 1.6901296428855483, "grad_norm": 1.043967366218567, "learning_rate": 0.00014378150368758955, "loss": 0.1831, "step": 42500 }, { "epoch": 1.7100135210371432, "grad_norm": 2.0062990188598633, "learning_rate": 0.00014311826815938879, "loss": 0.1851, "step": 43000 }, { "epoch": 1.7298973991887379, "grad_norm": 2.958763837814331, "learning_rate": 0.000142455032631188, "loss": 0.1825, "step": 43500 }, { "epoch": 1.7497812773403325, "grad_norm": 1.6624678373336792, "learning_rate": 0.0001417917971029872, "loss": 0.1834, "step": 44000 }, { "epoch": 1.7696651554919272, "grad_norm": 1.1731349229812622, "learning_rate": 0.00014112856157478645, "loss": 0.1775, "step": 44500 }, { "epoch": 1.7895490336435218, "grad_norm": 1.2256383895874023, "learning_rate": 0.00014046665251764206, "loss": 0.1688, "step": 45000 }, { "epoch": 1.8002863278453831, "eval_loss": 0.1788606494665146, "eval_runtime": 31.0353, "eval_samples_per_second": 16.111, "eval_steps_per_second": 8.055, "step": 45270 }, { "epoch": 1.8094329117951164, "grad_norm": 5.187135219573975, "learning_rate": 0.0001398034169894413, "loss": 0.1757, "step": 45500 }, { "epoch": 1.829316789946711, "grad_norm": 3.0221340656280518, "learning_rate": 0.00013914018146124054, "loss": 0.169, "step": 46000 }, { "epoch": 1.8492006680983057, "grad_norm": 1.780038833618164, "learning_rate": 0.00013847694593303975, "loss": 0.1815, "step": 46500 }, { "epoch": 1.8690845462499006, "grad_norm": 1.5590816736221313, "learning_rate": 0.00013781503687589536, "loss": 0.1754, "step": 47000 }, { "epoch": 1.8889684244014953, "grad_norm": 2.1302263736724854, "learning_rate": 0.0001371518013476946, "loss": 0.1737, "step": 47500 }, { "epoch": 1.90885230255309, "grad_norm": 2.173957109451294, "learning_rate": 0.00013648856581949384, "loss": 0.1696, "step": 48000 }, { "epoch": 1.9287361807046848, "grad_norm": 1.0864589214324951, "learning_rate": 0.00013582533029129305, "loss": 0.1832, "step": 48500 }, { "epoch": 1.9486200588562794, "grad_norm": 1.9979732036590576, "learning_rate": 0.00013516209476309226, "loss": 0.1758, "step": 49000 }, { "epoch": 1.968503937007874, "grad_norm": 2.2656006813049316, "learning_rate": 0.00013450018570594792, "loss": 0.1728, "step": 49500 }, { "epoch": 1.9883878151594687, "grad_norm": 2.077143669128418, "learning_rate": 0.00013383695017774714, "loss": 0.1772, "step": 50000 }, { "epoch": 2.0003181420504257, "eval_loss": 0.1767009198665619, "eval_runtime": 32.0718, "eval_samples_per_second": 15.59, "eval_steps_per_second": 7.795, "step": 50300 }, { "epoch": 2.0082716933110634, "grad_norm": 1.5150219202041626, "learning_rate": 0.00013317371464954635, "loss": 0.164, "step": 50500 }, { "epoch": 2.028155571462658, "grad_norm": 1.2225052118301392, "learning_rate": 0.0001325104791213456, "loss": 0.1534, "step": 51000 }, { "epoch": 2.0480394496142527, "grad_norm": 1.9311732053756714, "learning_rate": 0.0001318472435931448, "loss": 0.1584, "step": 51500 }, { "epoch": 2.0679233277658473, "grad_norm": 2.399226188659668, "learning_rate": 0.000131184008064944, "loss": 0.153, "step": 52000 }, { "epoch": 2.087807205917442, "grad_norm": 2.9771511554718018, "learning_rate": 0.00013052077253674325, "loss": 0.1528, "step": 52500 }, { "epoch": 2.107691084069037, "grad_norm": 1.5730229616165161, "learning_rate": 0.0001298588634795989, "loss": 0.154, "step": 53000 }, { "epoch": 2.1275749622206317, "grad_norm": 1.2603771686553955, "learning_rate": 0.0001291956279513981, "loss": 0.1575, "step": 53500 }, { "epoch": 2.1474588403722263, "grad_norm": 2.315129280090332, "learning_rate": 0.00012853371889425373, "loss": 0.1603, "step": 54000 }, { "epoch": 2.167342718523821, "grad_norm": 1.5500929355621338, "learning_rate": 0.00012787048336605297, "loss": 0.1544, "step": 54500 }, { "epoch": 2.1872265966754156, "grad_norm": 1.2831624746322632, "learning_rate": 0.0001272072478378522, "loss": 0.143, "step": 55000 }, { "epoch": 2.2003499562554683, "eval_loss": 0.1747935712337494, "eval_runtime": 31.3745, "eval_samples_per_second": 15.936, "eval_steps_per_second": 7.968, "step": 55330 }, { "epoch": 2.2071104748270103, "grad_norm": 1.4339005947113037, "learning_rate": 0.0001265440123096514, "loss": 0.1587, "step": 55500 }, { "epoch": 2.226994352978605, "grad_norm": 2.5719878673553467, "learning_rate": 0.00012588077678145064, "loss": 0.1527, "step": 56000 }, { "epoch": 2.2468782311301996, "grad_norm": 2.1503918170928955, "learning_rate": 0.00012521754125324988, "loss": 0.1585, "step": 56500 }, { "epoch": 2.2667621092817942, "grad_norm": 3.443103551864624, "learning_rate": 0.0001245543057250491, "loss": 0.1593, "step": 57000 }, { "epoch": 2.286645987433389, "grad_norm": 1.2123481035232544, "learning_rate": 0.0001238910701968483, "loss": 0.1624, "step": 57500 }, { "epoch": 2.3065298655849835, "grad_norm": 1.7718604803085327, "learning_rate": 0.00012322783466864754, "loss": 0.1569, "step": 58000 }, { "epoch": 2.3264137437365786, "grad_norm": 4.025502681732178, "learning_rate": 0.00012256592561150315, "loss": 0.156, "step": 58500 }, { "epoch": 2.3462976218881733, "grad_norm": 2.876533269882202, "learning_rate": 0.00012190401655435879, "loss": 0.1616, "step": 59000 }, { "epoch": 2.366181500039768, "grad_norm": 1.680629849433899, "learning_rate": 0.00012124078102615801, "loss": 0.1543, "step": 59500 }, { "epoch": 2.3860653781913626, "grad_norm": 1.0101946592330933, "learning_rate": 0.00012057754549795724, "loss": 0.1465, "step": 60000 }, { "epoch": 2.400381770460511, "eval_loss": 0.16907282173633575, "eval_runtime": 31.0907, "eval_samples_per_second": 16.082, "eval_steps_per_second": 8.041, "step": 60360 }, { "epoch": 2.405949256342957, "grad_norm": 1.9501550197601318, "learning_rate": 0.00011991430996975645, "loss": 0.1535, "step": 60500 }, { "epoch": 2.425833134494552, "grad_norm": 3.678605556488037, "learning_rate": 0.0001192510744415557, "loss": 0.1453, "step": 61000 }, { "epoch": 2.4457170126461465, "grad_norm": 4.075408458709717, "learning_rate": 0.00011858783891335492, "loss": 0.1494, "step": 61500 }, { "epoch": 2.465600890797741, "grad_norm": 1.3122295141220093, "learning_rate": 0.00011792460338515414, "loss": 0.1593, "step": 62000 }, { "epoch": 2.485484768949336, "grad_norm": 2.0857038497924805, "learning_rate": 0.00011726136785695337, "loss": 0.1514, "step": 62500 }, { "epoch": 2.5053686471009304, "grad_norm": 1.6657792329788208, "learning_rate": 0.000116599458799809, "loss": 0.1501, "step": 63000 }, { "epoch": 2.525252525252525, "grad_norm": 1.8100333213806152, "learning_rate": 0.00011593622327160823, "loss": 0.1519, "step": 63500 }, { "epoch": 2.54513640340412, "grad_norm": 2.504148006439209, "learning_rate": 0.00011527298774340744, "loss": 0.1496, "step": 64000 }, { "epoch": 2.5650202815557144, "grad_norm": 1.1805058717727661, "learning_rate": 0.00011461107868626306, "loss": 0.144, "step": 64500 }, { "epoch": 2.5849041597073095, "grad_norm": 3.109494686126709, "learning_rate": 0.0001139478431580623, "loss": 0.1517, "step": 65000 }, { "epoch": 2.600413584665553, "eval_loss": 0.17017363011837006, "eval_runtime": 31.1173, "eval_samples_per_second": 16.068, "eval_steps_per_second": 8.034, "step": 65390 }, { "epoch": 2.604788037858904, "grad_norm": 2.1431307792663574, "learning_rate": 0.00011328460762986153, "loss": 0.1561, "step": 65500 }, { "epoch": 2.6246719160104988, "grad_norm": 1.5804929733276367, "learning_rate": 0.00011262137210166075, "loss": 0.1536, "step": 66000 }, { "epoch": 2.6445557941620934, "grad_norm": 3.0483312606811523, "learning_rate": 0.00011195813657345997, "loss": 0.1497, "step": 66500 }, { "epoch": 2.664439672313688, "grad_norm": 2.7714364528656006, "learning_rate": 0.00011129490104525919, "loss": 0.1493, "step": 67000 }, { "epoch": 2.6843235504652827, "grad_norm": 1.1538747549057007, "learning_rate": 0.00011063299198811483, "loss": 0.1586, "step": 67500 }, { "epoch": 2.7042074286168774, "grad_norm": 1.946218729019165, "learning_rate": 0.00010996975645991405, "loss": 0.1484, "step": 68000 }, { "epoch": 2.724091306768472, "grad_norm": 1.453487753868103, "learning_rate": 0.00010930652093171328, "loss": 0.1545, "step": 68500 }, { "epoch": 2.7439751849200666, "grad_norm": 1.9838638305664062, "learning_rate": 0.00010864328540351249, "loss": 0.1443, "step": 69000 }, { "epoch": 2.7638590630716617, "grad_norm": 0.83545982837677, "learning_rate": 0.00010798004987531172, "loss": 0.148, "step": 69500 }, { "epoch": 2.783742941223256, "grad_norm": 2.1687493324279785, "learning_rate": 0.00010731681434711096, "loss": 0.1533, "step": 70000 }, { "epoch": 2.800445398870596, "eval_loss": 0.16508887708187103, "eval_runtime": 31.1958, "eval_samples_per_second": 16.028, "eval_steps_per_second": 8.014, "step": 70420 }, { "epoch": 2.803626819374851, "grad_norm": 1.3648791313171387, "learning_rate": 0.00010665357881891018, "loss": 0.1419, "step": 70500 }, { "epoch": 2.8235106975264457, "grad_norm": 1.7863802909851074, "learning_rate": 0.00010599034329070941, "loss": 0.1482, "step": 71000 }, { "epoch": 2.8433945756780403, "grad_norm": 3.2346866130828857, "learning_rate": 0.00010532710776250862, "loss": 0.1425, "step": 71500 }, { "epoch": 2.863278453829635, "grad_norm": 0.9211582541465759, "learning_rate": 0.00010466519870536427, "loss": 0.1501, "step": 72000 }, { "epoch": 2.8831623319812296, "grad_norm": 0.1923113614320755, "learning_rate": 0.00010400196317716348, "loss": 0.1469, "step": 72500 }, { "epoch": 2.9030462101328243, "grad_norm": 4.506950855255127, "learning_rate": 0.00010333872764896271, "loss": 0.1465, "step": 73000 }, { "epoch": 2.922930088284419, "grad_norm": 3.244807004928589, "learning_rate": 0.00010267549212076194, "loss": 0.1425, "step": 73500 }, { "epoch": 2.9428139664360136, "grad_norm": 1.8501616716384888, "learning_rate": 0.00010201225659256115, "loss": 0.1446, "step": 74000 }, { "epoch": 2.962697844587608, "grad_norm": 3.9013025760650635, "learning_rate": 0.0001013503475354168, "loss": 0.1508, "step": 74500 }, { "epoch": 2.9825817227392033, "grad_norm": 3.1799283027648926, "learning_rate": 0.00010068711200721601, "loss": 0.1412, "step": 75000 }, { "epoch": 3.000477213075638, "eval_loss": 0.1569422334432602, "eval_runtime": 33.8027, "eval_samples_per_second": 14.792, "eval_steps_per_second": 7.396, "step": 75450 }, { "epoch": 3.002465600890798, "grad_norm": 1.1885572671890259, "learning_rate": 0.00010002520295007163, "loss": 0.1334, "step": 75500 }, { "epoch": 3.0223494790423926, "grad_norm": 0.8424841165542603, "learning_rate": 9.936196742187086e-05, "loss": 0.1261, "step": 76000 }, { "epoch": 3.0422333571939872, "grad_norm": 1.2642099857330322, "learning_rate": 9.869873189367008e-05, "loss": 0.1233, "step": 76500 }, { "epoch": 3.062117235345582, "grad_norm": 3.7216506004333496, "learning_rate": 9.803549636546931e-05, "loss": 0.1257, "step": 77000 }, { "epoch": 3.0820011134971765, "grad_norm": 1.576157808303833, "learning_rate": 9.737358730832493e-05, "loss": 0.1257, "step": 77500 }, { "epoch": 3.101884991648771, "grad_norm": 3.2462401390075684, "learning_rate": 9.671035178012417e-05, "loss": 0.1387, "step": 78000 }, { "epoch": 3.121768869800366, "grad_norm": 0.38061007857322693, "learning_rate": 9.604711625192338e-05, "loss": 0.1288, "step": 78500 }, { "epoch": 3.1416527479519605, "grad_norm": 1.4168367385864258, "learning_rate": 9.538388072372262e-05, "loss": 0.125, "step": 79000 }, { "epoch": 3.161536626103555, "grad_norm": 1.8440510034561157, "learning_rate": 9.472064519552183e-05, "loss": 0.1203, "step": 79500 }, { "epoch": 3.1814205042551498, "grad_norm": 0.012771493755280972, "learning_rate": 9.405873613837747e-05, "loss": 0.1237, "step": 80000 }, { "epoch": 3.2005090272806807, "eval_loss": 0.15806268155574799, "eval_runtime": 31.0713, "eval_samples_per_second": 16.092, "eval_steps_per_second": 8.046, "step": 80480 }, { "epoch": 3.2013043824067444, "grad_norm": 2.36724853515625, "learning_rate": 9.33955006101767e-05, "loss": 0.1253, "step": 80500 }, { "epoch": 3.221188260558339, "grad_norm": 1.2032071352005005, "learning_rate": 9.273226508197591e-05, "loss": 0.1321, "step": 81000 }, { "epoch": 3.241072138709934, "grad_norm": 1.6413310766220093, "learning_rate": 9.206902955377515e-05, "loss": 0.1256, "step": 81500 }, { "epoch": 3.260956016861529, "grad_norm": 1.263128638267517, "learning_rate": 9.140579402557436e-05, "loss": 0.1281, "step": 82000 }, { "epoch": 3.2808398950131235, "grad_norm": 2.12233304977417, "learning_rate": 9.074255849737359e-05, "loss": 0.1299, "step": 82500 }, { "epoch": 3.300723773164718, "grad_norm": 1.265188217163086, "learning_rate": 9.007932296917282e-05, "loss": 0.1226, "step": 83000 }, { "epoch": 3.3206076513163127, "grad_norm": 1.391507625579834, "learning_rate": 8.941608744097204e-05, "loss": 0.1269, "step": 83500 }, { "epoch": 3.3404915294679074, "grad_norm": 1.941706657409668, "learning_rate": 8.875285191277126e-05, "loss": 0.1306, "step": 84000 }, { "epoch": 3.360375407619502, "grad_norm": 1.1882116794586182, "learning_rate": 8.809094285562688e-05, "loss": 0.1287, "step": 84500 }, { "epoch": 3.3802592857710967, "grad_norm": 2.2689340114593506, "learning_rate": 8.742903379848252e-05, "loss": 0.1281, "step": 85000 }, { "epoch": 3.4001431639226913, "grad_norm": 2.9482505321502686, "learning_rate": 8.676579827028175e-05, "loss": 0.131, "step": 85500 }, { "epoch": 3.400540841485723, "eval_loss": 0.15611666440963745, "eval_runtime": 31.2043, "eval_samples_per_second": 16.023, "eval_steps_per_second": 8.012, "step": 85510 }, { "epoch": 3.420027042074286, "grad_norm": 1.8892121315002441, "learning_rate": 8.610256274208097e-05, "loss": 0.1261, "step": 86000 }, { "epoch": 3.4399109202258806, "grad_norm": 2.923980236053467, "learning_rate": 8.54393272138802e-05, "loss": 0.1304, "step": 86500 }, { "epoch": 3.4597947983774757, "grad_norm": 0.46090424060821533, "learning_rate": 8.477609168567942e-05, "loss": 0.1263, "step": 87000 }, { "epoch": 3.4796786765290704, "grad_norm": 1.307573676109314, "learning_rate": 8.411285615747865e-05, "loss": 0.1217, "step": 87500 }, { "epoch": 3.499562554680665, "grad_norm": 1.9948647022247314, "learning_rate": 8.345094710033427e-05, "loss": 0.1298, "step": 88000 }, { "epoch": 3.5194464328322597, "grad_norm": 1.2264705896377563, "learning_rate": 8.27877115721335e-05, "loss": 0.1254, "step": 88500 }, { "epoch": 3.5393303109838543, "grad_norm": 1.249614953994751, "learning_rate": 8.212447604393274e-05, "loss": 0.1217, "step": 89000 }, { "epoch": 3.559214189135449, "grad_norm": 1.5197207927703857, "learning_rate": 8.146124051573195e-05, "loss": 0.1276, "step": 89500 }, { "epoch": 3.5790980672870436, "grad_norm": 0.8200188875198364, "learning_rate": 8.079800498753118e-05, "loss": 0.1283, "step": 90000 }, { "epoch": 3.5989819454386383, "grad_norm": 1.3526813983917236, "learning_rate": 8.01347694593304e-05, "loss": 0.1221, "step": 90500 }, { "epoch": 3.600572655690766, "eval_loss": 0.14859890937805176, "eval_runtime": 32.6446, "eval_samples_per_second": 15.316, "eval_steps_per_second": 7.658, "step": 90540 }, { "epoch": 3.618865823590233, "grad_norm": 1.9000824689865112, "learning_rate": 7.947153393112963e-05, "loss": 0.1215, "step": 91000 }, { "epoch": 3.638749701741828, "grad_norm": 1.2409000396728516, "learning_rate": 7.880829840292885e-05, "loss": 0.1259, "step": 91500 }, { "epoch": 3.658633579893422, "grad_norm": 1.7296925783157349, "learning_rate": 7.814506287472808e-05, "loss": 0.1245, "step": 92000 }, { "epoch": 3.6785174580450173, "grad_norm": 1.3685667514801025, "learning_rate": 7.748315381758371e-05, "loss": 0.127, "step": 92500 }, { "epoch": 3.698401336196612, "grad_norm": 2.338923692703247, "learning_rate": 7.681991828938293e-05, "loss": 0.119, "step": 93000 }, { "epoch": 3.7182852143482066, "grad_norm": 2.3641035556793213, "learning_rate": 7.615668276118215e-05, "loss": 0.1256, "step": 93500 }, { "epoch": 3.7381690924998012, "grad_norm": 1.444541335105896, "learning_rate": 7.549477370403777e-05, "loss": 0.1218, "step": 94000 }, { "epoch": 3.758052970651396, "grad_norm": 2.2175920009613037, "learning_rate": 7.4831538175837e-05, "loss": 0.1181, "step": 94500 }, { "epoch": 3.7779368488029905, "grad_norm": 2.533275604248047, "learning_rate": 7.416830264763624e-05, "loss": 0.1203, "step": 95000 }, { "epoch": 3.797820726954585, "grad_norm": 2.5117480754852295, "learning_rate": 7.350506711943545e-05, "loss": 0.1161, "step": 95500 }, { "epoch": 3.8006044698958084, "eval_loss": 0.14638979732990265, "eval_runtime": 33.193, "eval_samples_per_second": 15.063, "eval_steps_per_second": 7.532, "step": 95570 }, { "epoch": 3.81770460510618, "grad_norm": 0.9872569441795349, "learning_rate": 7.284183159123469e-05, "loss": 0.1257, "step": 96000 }, { "epoch": 3.8375884832577745, "grad_norm": 2.7399699687957764, "learning_rate": 7.21785960630339e-05, "loss": 0.1309, "step": 96500 }, { "epoch": 3.8574723614093696, "grad_norm": 1.9438740015029907, "learning_rate": 7.151536053483313e-05, "loss": 0.1196, "step": 97000 }, { "epoch": 3.8773562395609638, "grad_norm": 3.683006763458252, "learning_rate": 7.085212500663237e-05, "loss": 0.1207, "step": 97500 }, { "epoch": 3.897240117712559, "grad_norm": 2.3406124114990234, "learning_rate": 7.019154242054439e-05, "loss": 0.1202, "step": 98000 }, { "epoch": 3.9171239958641535, "grad_norm": 1.3052023649215698, "learning_rate": 6.952830689234361e-05, "loss": 0.1233, "step": 98500 }, { "epoch": 3.937007874015748, "grad_norm": 1.8666225671768188, "learning_rate": 6.886507136414284e-05, "loss": 0.1217, "step": 99000 }, { "epoch": 3.956891752167343, "grad_norm": 1.7979810237884521, "learning_rate": 6.820183583594206e-05, "loss": 0.1194, "step": 99500 }, { "epoch": 3.9767756303189374, "grad_norm": 0.9724803566932678, "learning_rate": 6.753860030774129e-05, "loss": 0.1276, "step": 100000 }, { "epoch": 3.996659508470532, "grad_norm": 3.251120090484619, "learning_rate": 6.687536477954052e-05, "loss": 0.1165, "step": 100500 }, { "epoch": 4.000636284100851, "eval_loss": 0.1412682831287384, "eval_runtime": 31.0761, "eval_samples_per_second": 16.09, "eval_steps_per_second": 8.045, "step": 100600 }, { "epoch": 4.016543386622127, "grad_norm": 3.6826343536376953, "learning_rate": 6.621212925133974e-05, "loss": 0.1046, "step": 101000 }, { "epoch": 4.036427264773722, "grad_norm": 5.360718250274658, "learning_rate": 6.554889372313896e-05, "loss": 0.1032, "step": 101500 }, { "epoch": 4.056311142925316, "grad_norm": 1.780490756034851, "learning_rate": 6.488698466599459e-05, "loss": 0.1014, "step": 102000 }, { "epoch": 4.076195021076911, "grad_norm": 2.984694480895996, "learning_rate": 6.422374913779382e-05, "loss": 0.1044, "step": 102500 }, { "epoch": 4.096078899228505, "grad_norm": 0.6237806677818298, "learning_rate": 6.356184008064945e-05, "loss": 0.1009, "step": 103000 }, { "epoch": 4.1159627773801, "grad_norm": 3.3006651401519775, "learning_rate": 6.289860455244868e-05, "loss": 0.1017, "step": 103500 }, { "epoch": 4.135846655531695, "grad_norm": 1.9449480772018433, "learning_rate": 6.223536902424789e-05, "loss": 0.1043, "step": 104000 }, { "epoch": 4.15573053368329, "grad_norm": 2.495291233062744, "learning_rate": 6.157213349604713e-05, "loss": 0.1024, "step": 104500 }, { "epoch": 4.175614411834884, "grad_norm": 0.23489686846733093, "learning_rate": 6.090889796784635e-05, "loss": 0.1031, "step": 105000 }, { "epoch": 4.195498289986479, "grad_norm": 3.0352611541748047, "learning_rate": 6.024566243964557e-05, "loss": 0.0994, "step": 105500 }, { "epoch": 4.2006680983058935, "eval_loss": 0.14581650495529175, "eval_runtime": 35.7264, "eval_samples_per_second": 13.995, "eval_steps_per_second": 6.998, "step": 105630 }, { "epoch": 4.215382168138074, "grad_norm": 1.1670618057250977, "learning_rate": 5.95824269114448e-05, "loss": 0.1045, "step": 106000 }, { "epoch": 4.235266046289668, "grad_norm": 2.0033414363861084, "learning_rate": 5.891919138324402e-05, "loss": 0.1016, "step": 106500 }, { "epoch": 4.255149924441263, "grad_norm": 1.4979524612426758, "learning_rate": 5.825728232609965e-05, "loss": 0.1079, "step": 107000 }, { "epoch": 4.275033802592858, "grad_norm": 1.7079240083694458, "learning_rate": 5.7594046797898874e-05, "loss": 0.1016, "step": 107500 }, { "epoch": 4.294917680744453, "grad_norm": 1.1605221033096313, "learning_rate": 5.6930811269698107e-05, "loss": 0.1008, "step": 108000 }, { "epoch": 4.314801558896047, "grad_norm": 1.4162037372589111, "learning_rate": 5.6267575741497326e-05, "loss": 0.1083, "step": 108500 }, { "epoch": 4.334685437047642, "grad_norm": 2.373020887374878, "learning_rate": 5.5604340213296545e-05, "loss": 0.1073, "step": 109000 }, { "epoch": 4.354569315199236, "grad_norm": 2.836911916732788, "learning_rate": 5.494375762720858e-05, "loss": 0.1019, "step": 109500 }, { "epoch": 4.374453193350831, "grad_norm": 3.7747974395751953, "learning_rate": 5.42805220990078e-05, "loss": 0.1038, "step": 110000 }, { "epoch": 4.3943370715024255, "grad_norm": 1.4646281003952026, "learning_rate": 5.361728657080703e-05, "loss": 0.1113, "step": 110500 }, { "epoch": 4.4006999125109365, "eval_loss": 0.14834672212600708, "eval_runtime": 37.228, "eval_samples_per_second": 13.431, "eval_steps_per_second": 6.715, "step": 110660 }, { "epoch": 4.414220949654021, "grad_norm": 2.5483474731445312, "learning_rate": 5.295405104260626e-05, "loss": 0.1006, "step": 111000 }, { "epoch": 4.434104827805616, "grad_norm": 2.5747101306915283, "learning_rate": 5.229081551440548e-05, "loss": 0.1082, "step": 111500 }, { "epoch": 4.45398870595721, "grad_norm": 0.23354358971118927, "learning_rate": 5.16275799862047e-05, "loss": 0.1032, "step": 112000 }, { "epoch": 4.473872584108805, "grad_norm": 2.4420604705810547, "learning_rate": 5.096434445800393e-05, "loss": 0.106, "step": 112500 }, { "epoch": 4.493756462260399, "grad_norm": 1.2000492811203003, "learning_rate": 5.030110892980315e-05, "loss": 0.0961, "step": 113000 }, { "epoch": 4.513640340411994, "grad_norm": 1.8864790201187134, "learning_rate": 4.963919987265878e-05, "loss": 0.0978, "step": 113500 }, { "epoch": 4.5335242185635884, "grad_norm": 0.45711418986320496, "learning_rate": 4.897596434445801e-05, "loss": 0.094, "step": 114000 }, { "epoch": 4.5534080967151835, "grad_norm": 7.053537845611572, "learning_rate": 4.831272881625723e-05, "loss": 0.1, "step": 114500 }, { "epoch": 4.573291974866778, "grad_norm": 3.276334047317505, "learning_rate": 4.765081975911286e-05, "loss": 0.0996, "step": 115000 }, { "epoch": 4.593175853018373, "grad_norm": 1.4245741367340088, "learning_rate": 4.6987584230912085e-05, "loss": 0.0996, "step": 115500 }, { "epoch": 4.600731726715979, "eval_loss": 0.14262481033802032, "eval_runtime": 32.3953, "eval_samples_per_second": 15.434, "eval_steps_per_second": 7.717, "step": 115690 }, { "epoch": 4.613059731169967, "grad_norm": 2.3254101276397705, "learning_rate": 4.6324348702711304e-05, "loss": 0.1017, "step": 116000 }, { "epoch": 4.632943609321562, "grad_norm": 1.167358636856079, "learning_rate": 4.566111317451054e-05, "loss": 0.0981, "step": 116500 }, { "epoch": 4.652827487473157, "grad_norm": 4.5853376388549805, "learning_rate": 4.499787764630976e-05, "loss": 0.1016, "step": 117000 }, { "epoch": 4.672711365624751, "grad_norm": 1.4917422533035278, "learning_rate": 4.433464211810899e-05, "loss": 0.1018, "step": 117500 }, { "epoch": 4.6925952437763465, "grad_norm": 0.012467560358345509, "learning_rate": 4.367273306096461e-05, "loss": 0.0975, "step": 118000 }, { "epoch": 4.712479121927941, "grad_norm": 0.8470689058303833, "learning_rate": 4.3009497532763836e-05, "loss": 0.0965, "step": 118500 }, { "epoch": 4.732363000079536, "grad_norm": 0.43946540355682373, "learning_rate": 4.234626200456306e-05, "loss": 0.1014, "step": 119000 }, { "epoch": 4.75224687823113, "grad_norm": 2.230210304260254, "learning_rate": 4.168302647636229e-05, "loss": 0.0996, "step": 119500 }, { "epoch": 4.772130756382725, "grad_norm": 3.4537596702575684, "learning_rate": 4.1019790948161514e-05, "loss": 0.0966, "step": 120000 }, { "epoch": 4.792014634534319, "grad_norm": 1.7490534782409668, "learning_rate": 4.035655541996074e-05, "loss": 0.1116, "step": 120500 }, { "epoch": 4.800763540921022, "eval_loss": 0.1392485797405243, "eval_runtime": 34.3723, "eval_samples_per_second": 14.547, "eval_steps_per_second": 7.273, "step": 120720 }, { "epoch": 4.811898512685914, "grad_norm": 0.9216163158416748, "learning_rate": 3.969331989175996e-05, "loss": 0.0965, "step": 121000 }, { "epoch": 4.831782390837509, "grad_norm": 4.56651496887207, "learning_rate": 3.9030084363559186e-05, "loss": 0.0965, "step": 121500 }, { "epoch": 4.851666268989104, "grad_norm": 1.2061914205551147, "learning_rate": 3.8368175306414814e-05, "loss": 0.1059, "step": 122000 }, { "epoch": 4.871550147140699, "grad_norm": 1.8895795345306396, "learning_rate": 3.770493977821405e-05, "loss": 0.0988, "step": 122500 }, { "epoch": 4.891434025292293, "grad_norm": 2.730050563812256, "learning_rate": 3.7041704250013266e-05, "loss": 0.1022, "step": 123000 }, { "epoch": 4.911317903443888, "grad_norm": 2.6674962043762207, "learning_rate": 3.637846872181249e-05, "loss": 0.0966, "step": 123500 }, { "epoch": 4.931201781595482, "grad_norm": 5.602296352386475, "learning_rate": 3.571655966466812e-05, "loss": 0.0944, "step": 124000 }, { "epoch": 4.951085659747077, "grad_norm": 2.389179229736328, "learning_rate": 3.505332413646734e-05, "loss": 0.0975, "step": 124500 }, { "epoch": 4.970969537898672, "grad_norm": 3.1410694122314453, "learning_rate": 3.4390088608266565e-05, "loss": 0.0923, "step": 125000 }, { "epoch": 4.990853416050267, "grad_norm": 0.3184365928173065, "learning_rate": 3.37268530800658e-05, "loss": 0.1004, "step": 125500 }, { "epoch": 5.000795355126064, "eval_loss": 0.13804183900356293, "eval_runtime": 34.1527, "eval_samples_per_second": 14.64, "eval_steps_per_second": 7.32, "step": 125750 }, { "epoch": 5.010737294201861, "grad_norm": 1.9156638383865356, "learning_rate": 3.3063617551865024e-05, "loss": 0.0899, "step": 126000 }, { "epoch": 5.030621172353456, "grad_norm": 1.1794829368591309, "learning_rate": 3.2401708494720645e-05, "loss": 0.0762, "step": 126500 }, { "epoch": 5.05050505050505, "grad_norm": 2.2873692512512207, "learning_rate": 3.173847296651987e-05, "loss": 0.0823, "step": 127000 }, { "epoch": 5.070388928656645, "grad_norm": 2.4998435974121094, "learning_rate": 3.10752374383191e-05, "loss": 0.0747, "step": 127500 }, { "epoch": 5.0902728068082395, "grad_norm": 3.9110782146453857, "learning_rate": 3.041200191011832e-05, "loss": 0.0853, "step": 128000 }, { "epoch": 5.1101566849598345, "grad_norm": 3.3292062282562256, "learning_rate": 2.9748766381917546e-05, "loss": 0.0839, "step": 128500 }, { "epoch": 5.13004056311143, "grad_norm": 4.433184623718262, "learning_rate": 2.9085530853716776e-05, "loss": 0.0802, "step": 129000 }, { "epoch": 5.149924441263024, "grad_norm": 3.4676051139831543, "learning_rate": 2.8422295325515995e-05, "loss": 0.0838, "step": 129500 }, { "epoch": 5.169808319414619, "grad_norm": 1.5169734954833984, "learning_rate": 2.7759059797315225e-05, "loss": 0.0783, "step": 130000 }, { "epoch": 5.189692197566213, "grad_norm": 1.637904405593872, "learning_rate": 2.709582426911445e-05, "loss": 0.0814, "step": 130500 }, { "epoch": 5.200827169331106, "eval_loss": 0.141804039478302, "eval_runtime": 32.6973, "eval_samples_per_second": 15.292, "eval_steps_per_second": 7.646, "step": 130780 }, { "epoch": 5.209576075717808, "grad_norm": 2.086423873901367, "learning_rate": 2.643391521197008e-05, "loss": 0.0831, "step": 131000 }, { "epoch": 5.229459953869402, "grad_norm": 0.6672555804252625, "learning_rate": 2.57720061548257e-05, "loss": 0.0862, "step": 131500 }, { "epoch": 5.2493438320209975, "grad_norm": 0.04646410793066025, "learning_rate": 2.510877062662493e-05, "loss": 0.0792, "step": 132000 }, { "epoch": 5.269227710172592, "grad_norm": 4.182920932769775, "learning_rate": 2.4445535098424155e-05, "loss": 0.0802, "step": 132500 }, { "epoch": 5.289111588324187, "grad_norm": 3.0408833026885986, "learning_rate": 2.3782299570223378e-05, "loss": 0.0836, "step": 133000 }, { "epoch": 5.308995466475781, "grad_norm": 1.9850901365280151, "learning_rate": 2.3120390513079006e-05, "loss": 0.0779, "step": 133500 }, { "epoch": 5.328879344627376, "grad_norm": 2.1532881259918213, "learning_rate": 2.2457154984878232e-05, "loss": 0.0813, "step": 134000 }, { "epoch": 5.348763222778971, "grad_norm": 2.2527785301208496, "learning_rate": 2.1793919456677455e-05, "loss": 0.0761, "step": 134500 }, { "epoch": 5.368647100930565, "grad_norm": 1.829942226409912, "learning_rate": 2.1130683928476684e-05, "loss": 0.0813, "step": 135000 }, { "epoch": 5.3885309790821605, "grad_norm": 2.5190794467926025, "learning_rate": 2.046877487133231e-05, "loss": 0.086, "step": 135500 }, { "epoch": 5.400858983536149, "eval_loss": 0.1405518800020218, "eval_runtime": 32.4801, "eval_samples_per_second": 15.394, "eval_steps_per_second": 7.697, "step": 135810 }, { "epoch": 5.408414857233755, "grad_norm": 0.9381058812141418, "learning_rate": 1.9805539343131535e-05, "loss": 0.0769, "step": 136000 }, { "epoch": 5.42829873538535, "grad_norm": 3.986593246459961, "learning_rate": 1.914230381493076e-05, "loss": 0.08, "step": 136500 }, { "epoch": 5.448182613536944, "grad_norm": 1.4495145082473755, "learning_rate": 1.848039475778639e-05, "loss": 0.0787, "step": 137000 }, { "epoch": 5.468066491688539, "grad_norm": 2.496209144592285, "learning_rate": 1.781715922958561e-05, "loss": 0.0808, "step": 137500 }, { "epoch": 5.487950369840133, "grad_norm": 1.4230667352676392, "learning_rate": 1.7153923701384837e-05, "loss": 0.0818, "step": 138000 }, { "epoch": 5.507834247991728, "grad_norm": 0.07668805867433548, "learning_rate": 1.6490688173184063e-05, "loss": 0.0786, "step": 138500 }, { "epoch": 5.5277181261433235, "grad_norm": 2.7259907722473145, "learning_rate": 1.5827452644983286e-05, "loss": 0.0839, "step": 139000 }, { "epoch": 5.547602004294918, "grad_norm": 1.5834600925445557, "learning_rate": 1.5164217116782512e-05, "loss": 0.0807, "step": 139500 }, { "epoch": 5.567485882446512, "grad_norm": 1.825166940689087, "learning_rate": 1.4500981588581738e-05, "loss": 0.0786, "step": 140000 }, { "epoch": 5.587369760598107, "grad_norm": 0.33820098638534546, "learning_rate": 1.3837746060380963e-05, "loss": 0.0731, "step": 140500 }, { "epoch": 5.600890797741192, "eval_loss": 0.1415776163339615, "eval_runtime": 32.1699, "eval_samples_per_second": 15.542, "eval_steps_per_second": 7.771, "step": 140840 }, { "epoch": 5.607253638749702, "grad_norm": 0.2709919214248657, "learning_rate": 1.317583700323659e-05, "loss": 0.0755, "step": 141000 }, { "epoch": 5.627137516901296, "grad_norm": 2.084345817565918, "learning_rate": 1.2512601475035815e-05, "loss": 0.0784, "step": 141500 }, { "epoch": 5.647021395052891, "grad_norm": 1.5787373781204224, "learning_rate": 1.1849365946835041e-05, "loss": 0.0724, "step": 142000 }, { "epoch": 5.666905273204486, "grad_norm": 2.0165510177612305, "learning_rate": 1.1186130418634267e-05, "loss": 0.0814, "step": 142500 }, { "epoch": 5.686789151356081, "grad_norm": 1.0443742275238037, "learning_rate": 1.052289489043349e-05, "loss": 0.0749, "step": 143000 }, { "epoch": 5.706673029507675, "grad_norm": 0.3988170921802521, "learning_rate": 9.859659362232716e-06, "loss": 0.0805, "step": 143500 }, { "epoch": 5.72655690765927, "grad_norm": 1.6842561960220337, "learning_rate": 9.196423834031942e-06, "loss": 0.0859, "step": 144000 }, { "epoch": 5.746440785810865, "grad_norm": 3.893829584121704, "learning_rate": 8.533188305831168e-06, "loss": 0.0834, "step": 144500 }, { "epoch": 5.766324663962459, "grad_norm": 3.017742156982422, "learning_rate": 7.871279248686794e-06, "loss": 0.0767, "step": 145000 }, { "epoch": 5.786208542114053, "grad_norm": 1.0755605697631836, "learning_rate": 7.2080437204860195e-06, "loss": 0.0807, "step": 145500 }, { "epoch": 5.800922611946234, "eval_loss": 0.14066436886787415, "eval_runtime": 37.1151, "eval_samples_per_second": 13.472, "eval_steps_per_second": 6.736, "step": 145870 }, { "epoch": 5.8060924202656485, "grad_norm": 2.315115451812744, "learning_rate": 6.544808192285244e-06, "loss": 0.0737, "step": 146000 }, { "epoch": 5.825976298417244, "grad_norm": 1.7753525972366333, "learning_rate": 5.88157266408447e-06, "loss": 0.0781, "step": 146500 }, { "epoch": 5.845860176568838, "grad_norm": 0.942420244216919, "learning_rate": 5.218337135883695e-06, "loss": 0.0785, "step": 147000 }, { "epoch": 5.865744054720433, "grad_norm": 1.6908499002456665, "learning_rate": 4.5551016076829205e-06, "loss": 0.0773, "step": 147500 }, { "epoch": 5.885627932872027, "grad_norm": 3.541743755340576, "learning_rate": 3.891866079482146e-06, "loss": 0.082, "step": 148000 }, { "epoch": 5.905511811023622, "grad_norm": 1.41182541847229, "learning_rate": 3.228630551281371e-06, "loss": 0.0755, "step": 148500 }, { "epoch": 5.925395689175216, "grad_norm": 1.457543134689331, "learning_rate": 2.5653950230805963e-06, "loss": 0.0721, "step": 149000 }, { "epoch": 5.9452795673268115, "grad_norm": 1.3256011009216309, "learning_rate": 1.9034859659362233e-06, "loss": 0.0756, "step": 149500 }, { "epoch": 5.965163445478407, "grad_norm": 4.112858295440674, "learning_rate": 1.2402504377354487e-06, "loss": 0.0792, "step": 150000 }, { "epoch": 5.985047323630001, "grad_norm": 9.184978485107422, "learning_rate": 5.77014909534674e-07, "loss": 0.0763, "step": 150500 }, { "epoch": 6.0, "step": 150876, "total_flos": 9.584515856152166e+17, "train_loss": 0.1514410440879876, "train_runtime": 79770.8244, "train_samples_per_second": 3.783, "train_steps_per_second": 1.891 } ], "logging_steps": 500, "max_steps": 150876, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.584515856152166e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }