{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5319148936170213, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010638297872340425, "grad_norm": 0.2887957990169525, "learning_rate": 0.0002, "loss": 0.9327, "mean_token_accuracy": 0.7623532712459564, "num_tokens": 1982.0, "step": 1 }, { "epoch": 0.02127659574468085, "grad_norm": 0.6030351519584656, "learning_rate": 0.000196, "loss": 1.8105, "mean_token_accuracy": 0.6288281381130219, "num_tokens": 2943.0, "step": 2 }, { "epoch": 0.031914893617021274, "grad_norm": 0.3874857723712921, "learning_rate": 0.000192, "loss": 1.0589, "mean_token_accuracy": 0.7225219160318375, "num_tokens": 4484.0, "step": 3 }, { "epoch": 0.0425531914893617, "grad_norm": 0.24475964903831482, "learning_rate": 0.000188, "loss": 1.0091, "mean_token_accuracy": 0.7289294451475143, "num_tokens": 6727.0, "step": 4 }, { "epoch": 0.05319148936170213, "grad_norm": 0.27278101444244385, "learning_rate": 0.00018400000000000003, "loss": 0.7239, "mean_token_accuracy": 0.7673206329345703, "num_tokens": 8935.0, "step": 5 }, { "epoch": 0.06382978723404255, "grad_norm": 0.687332272529602, "learning_rate": 0.00018, "loss": 1.3729, "mean_token_accuracy": 0.6930361837148666, "num_tokens": 9786.0, "step": 6 }, { "epoch": 0.07446808510638298, "grad_norm": 1.9471864700317383, "learning_rate": 0.00017600000000000002, "loss": 1.4784, "mean_token_accuracy": 0.6858641654253006, "num_tokens": 10553.0, "step": 7 }, { "epoch": 0.0851063829787234, "grad_norm": 0.5966729521751404, "learning_rate": 0.000172, "loss": 0.9692, "mean_token_accuracy": 0.7569561004638672, "num_tokens": 11778.0, "step": 8 }, { "epoch": 0.09574468085106383, "grad_norm": 0.8323692679405212, "learning_rate": 0.000168, "loss": 1.1015, "mean_token_accuracy": 0.7499070465564728, "num_tokens": 12686.0, "step": 9 }, { "epoch": 0.10638297872340426, "grad_norm": 1.2347114086151123, "learning_rate": 0.000164, "loss": 1.1746, "mean_token_accuracy": 0.740473136305809, "num_tokens": 13444.0, "step": 10 }, { "epoch": 0.11702127659574468, "grad_norm": 0.528573751449585, "learning_rate": 0.00016, "loss": 0.5423, "mean_token_accuracy": 0.8274708986282349, "num_tokens": 15704.0, "step": 11 }, { "epoch": 0.1276595744680851, "grad_norm": 2.641803026199341, "learning_rate": 0.00015600000000000002, "loss": 0.863, "mean_token_accuracy": 0.7851854711771011, "num_tokens": 16706.0, "step": 12 }, { "epoch": 0.13829787234042554, "grad_norm": 1.9446417093276978, "learning_rate": 0.000152, "loss": 0.949, "mean_token_accuracy": 0.7901278287172318, "num_tokens": 17562.0, "step": 13 }, { "epoch": 0.14893617021276595, "grad_norm": 1.0860435962677002, "learning_rate": 0.000148, "loss": 0.917, "mean_token_accuracy": 0.7842272222042084, "num_tokens": 18558.0, "step": 14 }, { "epoch": 0.1595744680851064, "grad_norm": 1.0175093412399292, "learning_rate": 0.000144, "loss": 0.6745, "mean_token_accuracy": 0.8425705283880234, "num_tokens": 19700.0, "step": 15 }, { "epoch": 0.1702127659574468, "grad_norm": 1.4986056089401245, "learning_rate": 0.00014, "loss": 0.6603, "mean_token_accuracy": 0.8449233621358871, "num_tokens": 20420.0, "step": 16 }, { "epoch": 0.18085106382978725, "grad_norm": 0.8947276473045349, "learning_rate": 0.00013600000000000003, "loss": 0.539, "mean_token_accuracy": 0.8811111599206924, "num_tokens": 21419.0, "step": 17 }, { "epoch": 0.19148936170212766, "grad_norm": 0.42721500992774963, "learning_rate": 0.000132, "loss": 0.4788, "mean_token_accuracy": 0.8939242213964462, "num_tokens": 23846.0, "step": 18 }, { "epoch": 0.20212765957446807, "grad_norm": 0.48865628242492676, "learning_rate": 0.00012800000000000002, "loss": 0.6326, "mean_token_accuracy": 0.8746728301048279, "num_tokens": 25747.0, "step": 19 }, { "epoch": 0.2127659574468085, "grad_norm": 0.8984824419021606, "learning_rate": 0.000124, "loss": 0.4745, "mean_token_accuracy": 0.8928907662630081, "num_tokens": 26627.0, "step": 20 }, { "epoch": 0.22340425531914893, "grad_norm": 0.9669850468635559, "learning_rate": 0.00012, "loss": 0.4962, "mean_token_accuracy": 0.8934834748506546, "num_tokens": 27436.0, "step": 21 }, { "epoch": 0.23404255319148937, "grad_norm": 0.36605075001716614, "learning_rate": 0.000116, "loss": 0.3258, "mean_token_accuracy": 0.9293452799320221, "num_tokens": 29420.0, "step": 22 }, { "epoch": 0.24468085106382978, "grad_norm": 0.894280731678009, "learning_rate": 0.00011200000000000001, "loss": 0.4877, "mean_token_accuracy": 0.8980903029441833, "num_tokens": 30216.0, "step": 23 }, { "epoch": 0.2553191489361702, "grad_norm": 0.7681854963302612, "learning_rate": 0.00010800000000000001, "loss": 0.541, "mean_token_accuracy": 0.8775683045387268, "num_tokens": 31099.0, "step": 24 }, { "epoch": 0.26595744680851063, "grad_norm": 0.7193688750267029, "learning_rate": 0.00010400000000000001, "loss": 0.5943, "mean_token_accuracy": 0.8750757277011871, "num_tokens": 31956.0, "step": 25 }, { "epoch": 0.2765957446808511, "grad_norm": 0.6575931310653687, "learning_rate": 0.0001, "loss": 0.4527, "mean_token_accuracy": 0.8902412056922913, "num_tokens": 32786.0, "step": 26 }, { "epoch": 0.2872340425531915, "grad_norm": 0.6464940309524536, "learning_rate": 9.6e-05, "loss": 0.4924, "mean_token_accuracy": 0.8929924517869949, "num_tokens": 33631.0, "step": 27 }, { "epoch": 0.2978723404255319, "grad_norm": 0.7342627644538879, "learning_rate": 9.200000000000001e-05, "loss": 0.4438, "mean_token_accuracy": 0.8995166420936584, "num_tokens": 34486.0, "step": 28 }, { "epoch": 0.30851063829787234, "grad_norm": 0.41542309522628784, "learning_rate": 8.800000000000001e-05, "loss": 0.3749, "mean_token_accuracy": 0.9000015705823898, "num_tokens": 36295.0, "step": 29 }, { "epoch": 0.3191489361702128, "grad_norm": 0.7227439880371094, "learning_rate": 8.4e-05, "loss": 0.6178, "mean_token_accuracy": 0.8595587909221649, "num_tokens": 37104.0, "step": 30 }, { "epoch": 0.32978723404255317, "grad_norm": 0.5684676766395569, "learning_rate": 8e-05, "loss": 0.4551, "mean_token_accuracy": 0.8971092998981476, "num_tokens": 38549.0, "step": 31 }, { "epoch": 0.3404255319148936, "grad_norm": 0.550665020942688, "learning_rate": 7.6e-05, "loss": 0.3505, "mean_token_accuracy": 0.9074584692716599, "num_tokens": 39872.0, "step": 32 }, { "epoch": 0.35106382978723405, "grad_norm": 0.6100918650627136, "learning_rate": 7.2e-05, "loss": 0.3216, "mean_token_accuracy": 0.9257150143384933, "num_tokens": 40735.0, "step": 33 }, { "epoch": 0.3617021276595745, "grad_norm": 0.4749443233013153, "learning_rate": 6.800000000000001e-05, "loss": 0.5295, "mean_token_accuracy": 0.8798354268074036, "num_tokens": 42345.0, "step": 34 }, { "epoch": 0.3723404255319149, "grad_norm": 0.6380234360694885, "learning_rate": 6.400000000000001e-05, "loss": 0.5781, "mean_token_accuracy": 0.8827451318502426, "num_tokens": 43313.0, "step": 35 }, { "epoch": 0.3829787234042553, "grad_norm": 0.8250994086265564, "learning_rate": 6e-05, "loss": 0.6299, "mean_token_accuracy": 0.8456272482872009, "num_tokens": 44173.0, "step": 36 }, { "epoch": 0.39361702127659576, "grad_norm": 0.5874150395393372, "learning_rate": 5.6000000000000006e-05, "loss": 0.4274, "mean_token_accuracy": 0.8966661095619202, "num_tokens": 45785.0, "step": 37 }, { "epoch": 0.40425531914893614, "grad_norm": 0.42270627617836, "learning_rate": 5.2000000000000004e-05, "loss": 0.3216, "mean_token_accuracy": 0.9102791994810104, "num_tokens": 47201.0, "step": 38 }, { "epoch": 0.4148936170212766, "grad_norm": 0.5454854965209961, "learning_rate": 4.8e-05, "loss": 0.4902, "mean_token_accuracy": 0.9055827260017395, "num_tokens": 48182.0, "step": 39 }, { "epoch": 0.425531914893617, "grad_norm": 0.6554898023605347, "learning_rate": 4.4000000000000006e-05, "loss": 0.3607, "mean_token_accuracy": 0.9176944196224213, "num_tokens": 48959.0, "step": 40 }, { "epoch": 0.43617021276595747, "grad_norm": 0.35927918553352356, "learning_rate": 4e-05, "loss": 0.2911, "mean_token_accuracy": 0.9149797260761261, "num_tokens": 50485.0, "step": 41 }, { "epoch": 0.44680851063829785, "grad_norm": 0.3777298331260681, "learning_rate": 3.6e-05, "loss": 0.4113, "mean_token_accuracy": 0.9079957902431488, "num_tokens": 52011.0, "step": 42 }, { "epoch": 0.4574468085106383, "grad_norm": 0.5052103996276855, "learning_rate": 3.2000000000000005e-05, "loss": 0.4112, "mean_token_accuracy": 0.8977669477462769, "num_tokens": 53670.0, "step": 43 }, { "epoch": 0.46808510638297873, "grad_norm": 0.7540407180786133, "learning_rate": 2.8000000000000003e-05, "loss": 0.6973, "mean_token_accuracy": 0.8849319517612457, "num_tokens": 55106.0, "step": 44 }, { "epoch": 0.4787234042553192, "grad_norm": 0.5548056960105896, "learning_rate": 2.4e-05, "loss": 0.4593, "mean_token_accuracy": 0.8819023072719574, "num_tokens": 56178.0, "step": 45 }, { "epoch": 0.48936170212765956, "grad_norm": 0.8761438131332397, "learning_rate": 2e-05, "loss": 0.3054, "mean_token_accuracy": 0.9220689237117767, "num_tokens": 57070.0, "step": 46 }, { "epoch": 0.5, "grad_norm": 0.917113184928894, "learning_rate": 1.6000000000000003e-05, "loss": 0.619, "mean_token_accuracy": 0.8847331702709198, "num_tokens": 58389.0, "step": 47 }, { "epoch": 0.5106382978723404, "grad_norm": 0.35633689165115356, "learning_rate": 1.2e-05, "loss": 0.2867, "mean_token_accuracy": 0.9246316254138947, "num_tokens": 60319.0, "step": 48 }, { "epoch": 0.5212765957446809, "grad_norm": 0.5241644978523254, "learning_rate": 8.000000000000001e-06, "loss": 0.3784, "mean_token_accuracy": 0.8954263031482697, "num_tokens": 61355.0, "step": 49 }, { "epoch": 0.5319148936170213, "grad_norm": 0.39193060994148254, "learning_rate": 4.000000000000001e-06, "loss": 0.399, "mean_token_accuracy": 0.9024893939495087, "num_tokens": 63093.0, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 485900913143808.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }