{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987639060568603, "eval_steps": 500, "global_step": 505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009888751545117428, "grad_norm": 25.25, "learning_rate": 0.0002, "loss": 3.4401, "step": 5 }, { "epoch": 0.019777503090234856, "grad_norm": 11.125, "learning_rate": 0.0002, "loss": 2.2116, "step": 10 }, { "epoch": 0.029666254635352288, "grad_norm": 4.65625, "learning_rate": 0.0002, "loss": 1.6815, "step": 15 }, { "epoch": 0.03955500618046971, "grad_norm": 13.0, "learning_rate": 0.0002, "loss": 1.906, "step": 20 }, { "epoch": 0.049443757725587144, "grad_norm": 6.03125, "learning_rate": 0.0002, "loss": 1.8419, "step": 25 }, { "epoch": 0.059332509270704575, "grad_norm": 7.125, "learning_rate": 0.0002, "loss": 1.7867, "step": 30 }, { "epoch": 0.069221260815822, "grad_norm": 4.125, "learning_rate": 0.0002, "loss": 1.7177, "step": 35 }, { "epoch": 0.07911001236093942, "grad_norm": 4.375, "learning_rate": 0.0002, "loss": 1.4636, "step": 40 }, { "epoch": 0.08899876390605686, "grad_norm": 42.5, "learning_rate": 0.0002, "loss": 1.6758, "step": 45 }, { "epoch": 0.09888751545117429, "grad_norm": 3.71875, "learning_rate": 0.0002, "loss": 1.6556, "step": 50 }, { "epoch": 0.10877626699629171, "grad_norm": 3.953125, "learning_rate": 0.0002, "loss": 1.5998, "step": 55 }, { "epoch": 0.11866501854140915, "grad_norm": 3.21875, "learning_rate": 0.0002, "loss": 1.5765, "step": 60 }, { "epoch": 0.12855377008652658, "grad_norm": 3.71875, "learning_rate": 0.0002, "loss": 1.5169, "step": 65 }, { "epoch": 0.138442521631644, "grad_norm": 3.359375, "learning_rate": 0.0002, "loss": 1.582, "step": 70 }, { "epoch": 0.14833127317676142, "grad_norm": 4.96875, "learning_rate": 0.0002, "loss": 1.5751, "step": 75 }, { "epoch": 0.15822002472187885, "grad_norm": 3.03125, "learning_rate": 0.0002, "loss": 1.6266, "step": 80 }, { "epoch": 0.1681087762669963, "grad_norm": 3.640625, "learning_rate": 0.0002, "loss": 1.508, "step": 85 }, { "epoch": 0.17799752781211373, "grad_norm": 2.765625, "learning_rate": 0.0002, "loss": 1.5031, "step": 90 }, { "epoch": 0.18788627935723115, "grad_norm": 2.734375, "learning_rate": 0.0002, "loss": 1.5231, "step": 95 }, { "epoch": 0.19777503090234858, "grad_norm": 8.3125, "learning_rate": 0.0002, "loss": 1.6393, "step": 100 }, { "epoch": 0.207663782447466, "grad_norm": 145.0, "learning_rate": 0.0002, "loss": 1.7596, "step": 105 }, { "epoch": 0.21755253399258342, "grad_norm": 2.578125, "learning_rate": 0.0002, "loss": 1.5453, "step": 110 }, { "epoch": 0.22744128553770088, "grad_norm": 3.375, "learning_rate": 0.0002, "loss": 1.5337, "step": 115 }, { "epoch": 0.2373300370828183, "grad_norm": 15.375, "learning_rate": 0.0002, "loss": 1.6691, "step": 120 }, { "epoch": 0.24721878862793573, "grad_norm": 6.5625, "learning_rate": 0.0002, "loss": 1.5903, "step": 125 }, { "epoch": 0.25710754017305315, "grad_norm": 2.953125, "learning_rate": 0.0002, "loss": 1.4876, "step": 130 }, { "epoch": 0.2669962917181706, "grad_norm": 24.875, "learning_rate": 0.0002, "loss": 1.6292, "step": 135 }, { "epoch": 0.276885043263288, "grad_norm": 3.03125, "learning_rate": 0.0002, "loss": 1.5652, "step": 140 }, { "epoch": 0.2867737948084054, "grad_norm": 3.03125, "learning_rate": 0.0002, "loss": 1.5782, "step": 145 }, { "epoch": 0.29666254635352285, "grad_norm": 3.828125, "learning_rate": 0.0002, "loss": 1.5251, "step": 150 }, { "epoch": 0.3065512978986403, "grad_norm": 4.71875, "learning_rate": 0.0002, "loss": 1.4326, "step": 155 }, { "epoch": 0.3164400494437577, "grad_norm": 2.375, "learning_rate": 0.0002, "loss": 1.5085, "step": 160 }, { "epoch": 0.3263288009888752, "grad_norm": 3.84375, "learning_rate": 0.0002, "loss": 1.482, "step": 165 }, { "epoch": 0.3362175525339926, "grad_norm": 2.328125, "learning_rate": 0.0002, "loss": 1.516, "step": 170 }, { "epoch": 0.34610630407911, "grad_norm": 2.453125, "learning_rate": 0.0002, "loss": 1.5091, "step": 175 }, { "epoch": 0.35599505562422745, "grad_norm": 2.625, "learning_rate": 0.0002, "loss": 1.4517, "step": 180 }, { "epoch": 0.3658838071693449, "grad_norm": 2.84375, "learning_rate": 0.0002, "loss": 1.4647, "step": 185 }, { "epoch": 0.3757725587144623, "grad_norm": 2.734375, "learning_rate": 0.0002, "loss": 1.4764, "step": 190 }, { "epoch": 0.3856613102595797, "grad_norm": 2.15625, "learning_rate": 0.0002, "loss": 1.4254, "step": 195 }, { "epoch": 0.39555006180469715, "grad_norm": 2.34375, "learning_rate": 0.0002, "loss": 1.4868, "step": 200 }, { "epoch": 0.4054388133498146, "grad_norm": 2.609375, "learning_rate": 0.0002, "loss": 1.4534, "step": 205 }, { "epoch": 0.415327564894932, "grad_norm": 2.34375, "learning_rate": 0.0002, "loss": 1.447, "step": 210 }, { "epoch": 0.4252163164400494, "grad_norm": 2.3125, "learning_rate": 0.0002, "loss": 1.4514, "step": 215 }, { "epoch": 0.43510506798516685, "grad_norm": 2.109375, "learning_rate": 0.0002, "loss": 1.4372, "step": 220 }, { "epoch": 0.44499381953028433, "grad_norm": 2.859375, "learning_rate": 0.0002, "loss": 1.3961, "step": 225 }, { "epoch": 0.45488257107540175, "grad_norm": 3.0625, "learning_rate": 0.0002, "loss": 1.4363, "step": 230 }, { "epoch": 0.4647713226205192, "grad_norm": 2.703125, "learning_rate": 0.0002, "loss": 1.4355, "step": 235 }, { "epoch": 0.4746600741656366, "grad_norm": 2.8125, "learning_rate": 0.0002, "loss": 1.525, "step": 240 }, { "epoch": 0.484548825710754, "grad_norm": 2.46875, "learning_rate": 0.0002, "loss": 1.3801, "step": 245 }, { "epoch": 0.49443757725587145, "grad_norm": 2.40625, "learning_rate": 0.0002, "loss": 1.5133, "step": 250 }, { "epoch": 0.5043263288009888, "grad_norm": 2.328125, "learning_rate": 0.0002, "loss": 1.4364, "step": 255 }, { "epoch": 0.5142150803461063, "grad_norm": 2.3125, "learning_rate": 0.0002, "loss": 1.4363, "step": 260 }, { "epoch": 0.5241038318912238, "grad_norm": 2.75, "learning_rate": 0.0002, "loss": 1.521, "step": 265 }, { "epoch": 0.5339925834363412, "grad_norm": 2.4375, "learning_rate": 0.0002, "loss": 1.4559, "step": 270 }, { "epoch": 0.5438813349814586, "grad_norm": 2.5, "learning_rate": 0.0002, "loss": 1.4516, "step": 275 }, { "epoch": 0.553770086526576, "grad_norm": 2.125, "learning_rate": 0.0002, "loss": 1.3968, "step": 280 }, { "epoch": 0.5636588380716935, "grad_norm": 2.328125, "learning_rate": 0.0002, "loss": 1.4183, "step": 285 }, { "epoch": 0.5735475896168108, "grad_norm": 1.8046875, "learning_rate": 0.0002, "loss": 1.3653, "step": 290 }, { "epoch": 0.5834363411619283, "grad_norm": 2.0, "learning_rate": 0.0002, "loss": 1.388, "step": 295 }, { "epoch": 0.5933250927070457, "grad_norm": 1.953125, "learning_rate": 0.0002, "loss": 1.3131, "step": 300 }, { "epoch": 0.6032138442521632, "grad_norm": 2.390625, "learning_rate": 0.0002, "loss": 1.4784, "step": 305 }, { "epoch": 0.6131025957972805, "grad_norm": 2.34375, "learning_rate": 0.0002, "loss": 1.4339, "step": 310 }, { "epoch": 0.622991347342398, "grad_norm": 2.140625, "learning_rate": 0.0002, "loss": 1.4425, "step": 315 }, { "epoch": 0.6328800988875154, "grad_norm": 2.078125, "learning_rate": 0.0002, "loss": 1.3847, "step": 320 }, { "epoch": 0.6427688504326329, "grad_norm": 2.359375, "learning_rate": 0.0002, "loss": 1.4252, "step": 325 }, { "epoch": 0.6526576019777504, "grad_norm": 2.640625, "learning_rate": 0.0002, "loss": 1.4253, "step": 330 }, { "epoch": 0.6625463535228677, "grad_norm": 2.015625, "learning_rate": 0.0002, "loss": 1.3336, "step": 335 }, { "epoch": 0.6724351050679852, "grad_norm": 2.171875, "learning_rate": 0.0002, "loss": 1.4209, "step": 340 }, { "epoch": 0.6823238566131026, "grad_norm": 4.0625, "learning_rate": 0.0002, "loss": 1.4485, "step": 345 }, { "epoch": 0.69221260815822, "grad_norm": 2.1875, "learning_rate": 0.0002, "loss": 1.4261, "step": 350 }, { "epoch": 0.7021013597033374, "grad_norm": 2.515625, "learning_rate": 0.0002, "loss": 1.3869, "step": 355 }, { "epoch": 0.7119901112484549, "grad_norm": 2.1875, "learning_rate": 0.0002, "loss": 1.4681, "step": 360 }, { "epoch": 0.7218788627935723, "grad_norm": 3.609375, "learning_rate": 0.0002, "loss": 1.3513, "step": 365 }, { "epoch": 0.7317676143386898, "grad_norm": 2.296875, "learning_rate": 0.0002, "loss": 1.4029, "step": 370 }, { "epoch": 0.7416563658838071, "grad_norm": 2.15625, "learning_rate": 0.0002, "loss": 1.3716, "step": 375 }, { "epoch": 0.7515451174289246, "grad_norm": 3.75, "learning_rate": 0.0002, "loss": 1.3902, "step": 380 }, { "epoch": 0.761433868974042, "grad_norm": 2.0625, "learning_rate": 0.0002, "loss": 1.3469, "step": 385 }, { "epoch": 0.7713226205191595, "grad_norm": 1.9609375, "learning_rate": 0.0002, "loss": 1.3216, "step": 390 }, { "epoch": 0.7812113720642769, "grad_norm": 1.6796875, "learning_rate": 0.0002, "loss": 1.4774, "step": 395 }, { "epoch": 0.7911001236093943, "grad_norm": 1.7265625, "learning_rate": 0.0002, "loss": 1.372, "step": 400 }, { "epoch": 0.8009888751545118, "grad_norm": 1.921875, "learning_rate": 0.0002, "loss": 1.3574, "step": 405 }, { "epoch": 0.8108776266996292, "grad_norm": 2.1875, "learning_rate": 0.0002, "loss": 1.4444, "step": 410 }, { "epoch": 0.8207663782447466, "grad_norm": 2.296875, "learning_rate": 0.0002, "loss": 1.3743, "step": 415 }, { "epoch": 0.830655129789864, "grad_norm": 2.125, "learning_rate": 0.0002, "loss": 1.3589, "step": 420 }, { "epoch": 0.8405438813349815, "grad_norm": 2.046875, "learning_rate": 0.0002, "loss": 1.3336, "step": 425 }, { "epoch": 0.8504326328800988, "grad_norm": 3.84375, "learning_rate": 0.0002, "loss": 1.3584, "step": 430 }, { "epoch": 0.8603213844252163, "grad_norm": 2.125, "learning_rate": 0.0002, "loss": 1.332, "step": 435 }, { "epoch": 0.8702101359703337, "grad_norm": 1.8828125, "learning_rate": 0.0002, "loss": 1.3678, "step": 440 }, { "epoch": 0.8800988875154512, "grad_norm": 2.0625, "learning_rate": 0.0002, "loss": 1.2967, "step": 445 }, { "epoch": 0.8899876390605687, "grad_norm": 2.3125, "learning_rate": 0.0002, "loss": 1.3276, "step": 450 }, { "epoch": 0.899876390605686, "grad_norm": 1.953125, "learning_rate": 0.0002, "loss": 1.3175, "step": 455 }, { "epoch": 0.9097651421508035, "grad_norm": 1.7734375, "learning_rate": 0.0002, "loss": 1.366, "step": 460 }, { "epoch": 0.9196538936959209, "grad_norm": 2.46875, "learning_rate": 0.0002, "loss": 1.3105, "step": 465 }, { "epoch": 0.9295426452410384, "grad_norm": 1.84375, "learning_rate": 0.0002, "loss": 1.2856, "step": 470 }, { "epoch": 0.9394313967861557, "grad_norm": 1.7890625, "learning_rate": 0.0002, "loss": 1.3662, "step": 475 }, { "epoch": 0.9493201483312732, "grad_norm": 1.921875, "learning_rate": 0.0002, "loss": 1.3289, "step": 480 }, { "epoch": 0.9592088998763906, "grad_norm": 2.078125, "learning_rate": 0.0002, "loss": 1.3812, "step": 485 }, { "epoch": 0.969097651421508, "grad_norm": 2.703125, "learning_rate": 0.0002, "loss": 1.3344, "step": 490 }, { "epoch": 0.9789864029666254, "grad_norm": 1.9609375, "learning_rate": 0.0002, "loss": 1.345, "step": 495 }, { "epoch": 0.9888751545117429, "grad_norm": 2.125, "learning_rate": 0.0002, "loss": 1.3163, "step": 500 }, { "epoch": 0.9987639060568603, "grad_norm": 2.125, "learning_rate": 0.0002, "loss": 1.354, "step": 505 }, { "epoch": 0.9987639060568603, "step": 505, "total_flos": 9435624701952000.0, "train_loss": 1.491983689412032, "train_runtime": 449.9567, "train_samples_per_second": 17.977, "train_steps_per_second": 1.122 } ], "logging_steps": 5, "max_steps": 505, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9435624701952000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }