neurocoder's picture
Upload folder using huggingface_hub
6648894 verified
raw
history blame
129 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.944,
"eval_steps": 100,
"global_step": 248,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 288.16796875,
"epoch": 0.016,
"grad_norm": 0.9921875,
"kl": 0.0,
"learning_rate": 2.0000000000000002e-07,
"loss": -0.0,
"reward": 2.1448024585843086,
"reward_std": 0.6503619067370892,
"rewards/accuracy_reward": 0.064453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.576262284691135,
"rewards/reasoning_steps_reward": 0.35156250186264515,
"step": 1
},
{
"completion_length": 280.5390625,
"epoch": 0.032,
"grad_norm": 2.453125,
"kl": 0.0,
"learning_rate": 4.0000000000000003e-07,
"loss": -0.0,
"reward": 2.9461557120084763,
"reward_std": 0.7598665952682495,
"rewards/accuracy_reward": 0.017578125,
"rewards/format_reward": 0.001953125,
"rewards/novelty_reward_func_explore_exploit": 0.8809234369546175,
"rewards/reasoning_steps_reward": 0.2838541753590107,
"step": 2
},
{
"completion_length": 282.580078125,
"epoch": 0.048,
"grad_norm": 1.609375,
"kl": 0.0010201742788922274,
"learning_rate": 6.000000000000001e-07,
"loss": 0.0,
"reward": 2.4310644939541817,
"reward_std": 0.7171800062060356,
"rewards/accuracy_reward": 0.099609375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.68535483473291,
"rewards/reasoning_steps_reward": 0.27539063314907253,
"step": 3
},
{
"completion_length": 281.859375,
"epoch": 0.064,
"grad_norm": 1.6640625,
"kl": 0.0006131621394160902,
"learning_rate": 8.000000000000001e-07,
"loss": 0.0,
"reward": 2.392010949552059,
"reward_std": 0.7056797686964273,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6480314154177904,
"rewards/reasoning_steps_reward": 0.3248697896488011,
"step": 4
},
{
"completion_length": 276.5234375,
"epoch": 0.08,
"grad_norm": 1.140625,
"kl": 0.0008916492552089039,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0,
"reward": 2.185966059565544,
"reward_std": 0.7970924656838179,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6036553451170524,
"rewards/reasoning_steps_reward": 0.2578125027939677,
"step": 5
},
{
"completion_length": 290.345703125,
"epoch": 0.096,
"grad_norm": 0.98828125,
"kl": 0.0007805953682691325,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.0,
"reward": 2.586206890642643,
"reward_std": 0.7317942306399345,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7433623660666248,
"rewards/reasoning_steps_reward": 0.29361979849636555,
"step": 6
},
{
"completion_length": 288.357421875,
"epoch": 0.112,
"grad_norm": 3.3125,
"kl": 0.0006862173449917464,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.0,
"reward": 2.9549497589468956,
"reward_std": 0.7832636646926403,
"rewards/accuracy_reward": 0.060546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8552089141060909,
"rewards/reasoning_steps_reward": 0.3287760401144624,
"step": 7
},
{
"completion_length": 291.0859375,
"epoch": 0.128,
"grad_norm": 1.34375,
"kl": 0.0007065349800541298,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.0,
"reward": 2.769632026553154,
"reward_std": 0.6810889039188623,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8127506040036678,
"rewards/reasoning_steps_reward": 0.3040364640764892,
"step": 8
},
{
"completion_length": 285.240234375,
"epoch": 0.144,
"grad_norm": 1.4921875,
"kl": 0.000676694346111617,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.0,
"reward": 2.951853834092617,
"reward_std": 0.833003468811512,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8860780304918686,
"rewards/reasoning_steps_reward": 0.2662760391831398,
"step": 9
},
{
"completion_length": 273.15625,
"epoch": 0.16,
"grad_norm": 2.40625,
"kl": 0.0007681718943786109,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"reward": 2.4880168437957764,
"reward_std": 0.7941582556813955,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6932712296644846,
"rewards/reasoning_steps_reward": 0.3242187509313226,
"step": 10
},
{
"completion_length": 263.72265625,
"epoch": 0.176,
"grad_norm": 1.8125,
"kl": 0.0007444877319358056,
"learning_rate": 2.2e-06,
"loss": 0.0,
"reward": 2.060094438493252,
"reward_std": 0.7453512959182262,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.540864814693729,
"rewards/reasoning_steps_reward": 0.28125000139698386,
"step": 11
},
{
"completion_length": 287.52734375,
"epoch": 0.192,
"grad_norm": 0.84765625,
"kl": 0.0006410041951312451,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0,
"reward": 2.6830679774284363,
"reward_std": 0.7234712429344654,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.775215346676608,
"rewards/reasoning_steps_reward": 0.28320312732830644,
"step": 12
},
{
"completion_length": 271.40625,
"epoch": 0.208,
"grad_norm": 1.6875,
"kl": 0.0006587781517737312,
"learning_rate": 2.6e-06,
"loss": 0.0,
"reward": 2.297848492860794,
"reward_std": 0.7962923254817724,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.654187332217892,
"rewards/reasoning_steps_reward": 0.2610677082557231,
"step": 13
},
{
"completion_length": 282.728515625,
"epoch": 0.224,
"grad_norm": 1.1015625,
"kl": 0.0009098516529775225,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0,
"reward": 2.512993238866329,
"reward_std": 0.7325041498988867,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.698124471741418,
"rewards/reasoning_steps_reward": 0.30533855268731713,
"step": 14
},
{
"completion_length": 286.90234375,
"epoch": 0.24,
"grad_norm": 1.7890625,
"kl": 0.0009447168922633864,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 2.572930172085762,
"reward_std": 0.7189842760562897,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7254819249113401,
"rewards/reasoning_steps_reward": 0.34960938710719347,
"step": 15
},
{
"completion_length": 282.0234375,
"epoch": 0.256,
"grad_norm": 1.9453125,
"kl": 0.0007176450344559271,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0,
"reward": 2.3780763298273087,
"reward_std": 0.7437136992812157,
"rewards/accuracy_reward": 0.044921875,
"rewards/format_reward": 0.001953125,
"rewards/novelty_reward_func_explore_exploit": 0.668343149125576,
"rewards/reasoning_steps_reward": 0.326171881519258,
"step": 16
},
{
"completion_length": 277.181640625,
"epoch": 0.272,
"grad_norm": 1.84375,
"kl": 0.0009444843672099523,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.0,
"reward": 2.6828741505742073,
"reward_std": 0.7672664560377598,
"rewards/accuracy_reward": 0.068359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7499771338577071,
"rewards/reasoning_steps_reward": 0.3645833469927311,
"step": 17
},
{
"completion_length": 286.3671875,
"epoch": 0.288,
"grad_norm": 1.28125,
"kl": 0.0009558251094858861,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0,
"reward": 2.608707718551159,
"reward_std": 0.7508547510951757,
"rewards/accuracy_reward": 0.056640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7523817578330636,
"rewards/reasoning_steps_reward": 0.29492188477888703,
"step": 18
},
{
"completion_length": 288.197265625,
"epoch": 0.304,
"grad_norm": 1.3203125,
"kl": 0.0009933830478985328,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0,
"reward": 3.152822159230709,
"reward_std": 0.7633876148611307,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.9196473040307561,
"rewards/reasoning_steps_reward": 0.3626302182674408,
"step": 19
},
{
"completion_length": 291.763671875,
"epoch": 0.32,
"grad_norm": 0.95703125,
"kl": 0.0010635810940584633,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"reward": 2.500213325023651,
"reward_std": 0.7017618604004383,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6949495617300272,
"rewards/reasoning_steps_reward": 0.32942708022892475,
"step": 20
},
{
"completion_length": 281.04296875,
"epoch": 0.336,
"grad_norm": 3872.0,
"kl": 38.398836399162974,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.536,
"reward": 2.2889985144138336,
"reward_std": 0.7787356674671173,
"rewards/accuracy_reward": 0.068359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6505863160515825,
"rewards/reasoning_steps_reward": 0.2688802117481828,
"step": 21
},
{
"completion_length": 288.47265625,
"epoch": 0.352,
"grad_norm": 1.0859375,
"kl": 0.0012669887473748531,
"learning_rate": 4.4e-06,
"loss": 0.0001,
"reward": 2.6376563012599945,
"reward_std": 0.6690970882773399,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7099479290967187,
"rewards/reasoning_steps_reward": 0.41992187313735485,
"step": 22
},
{
"completion_length": 286.966796875,
"epoch": 0.368,
"grad_norm": 1.2734375,
"kl": 0.0015128458107938059,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0001,
"reward": 2.5921228751540184,
"reward_std": 0.6939626764506102,
"rewards/accuracy_reward": 0.0859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7249350572625796,
"rewards/reasoning_steps_reward": 0.33138021221384406,
"step": 23
},
{
"completion_length": 280.384765625,
"epoch": 0.384,
"grad_norm": 0.9375,
"kl": 0.0015014593445812352,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0001,
"reward": 2.704825095832348,
"reward_std": 0.8105385769158602,
"rewards/accuracy_reward": 0.056640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7590302489697933,
"rewards/reasoning_steps_reward": 0.3710937546566129,
"step": 24
},
{
"completion_length": 290.41796875,
"epoch": 0.4,
"grad_norm": 0.9140625,
"kl": 0.001665601652348414,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 2.357452914118767,
"reward_std": 0.7088302746415138,
"rewards/accuracy_reward": 0.064453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6477967969452342,
"rewards/reasoning_steps_reward": 0.349609381519258,
"step": 25
},
{
"completion_length": 291.509765625,
"epoch": 0.416,
"grad_norm": 0.9609375,
"kl": 0.001865061596618034,
"learning_rate": 4.999751919373782e-06,
"loss": 0.0001,
"reward": 2.281202170997858,
"reward_std": 0.6989514082670212,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.5948191303759813,
"rewards/reasoning_steps_reward": 0.40690105222165585,
"step": 26
},
{
"completion_length": 287.421875,
"epoch": 0.432,
"grad_norm": 0.9140625,
"kl": 0.002278451618622057,
"learning_rate": 4.9990077267303256e-06,
"loss": 0.0001,
"reward": 2.39421396702528,
"reward_std": 0.7001004256308079,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6474636799345413,
"rewards/reasoning_steps_reward": 0.3346354281529784,
"step": 27
},
{
"completion_length": 289.5859375,
"epoch": 0.448,
"grad_norm": 1.0625,
"kl": 0.0022466240770881996,
"learning_rate": 4.997767569765452e-06,
"loss": 0.0001,
"reward": 2.673185594379902,
"reward_std": 0.7204618379473686,
"rewards/accuracy_reward": 0.060546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7612875507523617,
"rewards/reasoning_steps_reward": 0.32877604896202683,
"step": 28
},
{
"completion_length": 290.353515625,
"epoch": 0.464,
"grad_norm": 0.94921875,
"kl": 0.002421206998405978,
"learning_rate": 4.996031694606294e-06,
"loss": 0.0001,
"reward": 2.2081645615398884,
"reward_std": 0.7610204052180052,
"rewards/accuracy_reward": 0.095703125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.5865322849713266,
"rewards/reasoning_steps_reward": 0.3528645895421505,
"step": 29
},
{
"completion_length": 284.9765625,
"epoch": 0.48,
"grad_norm": 0.95703125,
"kl": 0.0032389966800110415,
"learning_rate": 4.993800445762451e-06,
"loss": 0.0001,
"reward": 2.4655835777521133,
"reward_std": 0.8400940801948309,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6395695237442851,
"rewards/reasoning_steps_reward": 0.45898438431322575,
"step": 30
},
{
"completion_length": 288.93359375,
"epoch": 0.496,
"grad_norm": 1.0,
"kl": 0.0028132440565968864,
"learning_rate": 4.991074266057609e-06,
"loss": 0.0001,
"reward": 2.666738063097,
"reward_std": 0.6789926886558533,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7322286684066057,
"rewards/reasoning_steps_reward": 0.3821614580228925,
"step": 31
},
{
"completion_length": 292.3671875,
"epoch": 0.512,
"grad_norm": 1.0078125,
"kl": 0.004060989667777903,
"learning_rate": 4.987853696541664e-06,
"loss": 0.0002,
"reward": 2.5818087458610535,
"reward_std": 0.6875880807638168,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7134674986203512,
"rewards/reasoning_steps_reward": 0.3789062611758709,
"step": 32
},
{
"completion_length": 286.158203125,
"epoch": 0.528,
"grad_norm": 1.140625,
"kl": 0.005552116854232736,
"learning_rate": 4.984139376383337e-06,
"loss": 0.0002,
"reward": 2.8399546705186367,
"reward_std": 0.750790286809206,
"rewards/accuracy_reward": 0.138671875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7791168199231228,
"rewards/reasoning_steps_reward": 0.3639322938397527,
"step": 33
},
{
"completion_length": 287.48828125,
"epoch": 0.544,
"grad_norm": 3.171875,
"kl": 0.00440279851318337,
"learning_rate": 4.979932042743324e-06,
"loss": 0.0002,
"reward": 3.1019199565052986,
"reward_std": 0.8068479858338833,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.877940321341157,
"rewards/reasoning_steps_reward": 0.4016927145421505,
"step": 34
},
{
"completion_length": 291.712890625,
"epoch": 0.56,
"grad_norm": 0.83984375,
"kl": 0.003549927467247471,
"learning_rate": 4.975232530627998e-06,
"loss": 0.0001,
"reward": 2.758346803486347,
"reward_std": 0.73613665625453,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7701433822512627,
"rewards/reasoning_steps_reward": 0.3971354244276881,
"step": 35
},
{
"completion_length": 280.013671875,
"epoch": 0.576,
"grad_norm": 0.94921875,
"kl": 0.004770460931467824,
"learning_rate": 4.970041772723685e-06,
"loss": 0.0002,
"reward": 2.5518586486577988,
"reward_std": 0.752994803711772,
"rewards/accuracy_reward": 0.185546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6477115458498398,
"rewards/reasoning_steps_reward": 0.4231770895421505,
"step": 36
},
{
"completion_length": 294.7265625,
"epoch": 0.592,
"grad_norm": 0.88671875,
"kl": 0.004318368082749657,
"learning_rate": 4.964360799211563e-06,
"loss": 0.0002,
"reward": 2.9847040474414825,
"reward_std": 0.7252895850688219,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8412555102258921,
"rewards/reasoning_steps_reward": 0.42578124813735485,
"step": 37
},
{
"completion_length": 287.59765625,
"epoch": 0.608,
"grad_norm": 0.92578125,
"kl": 0.005480331514263526,
"learning_rate": 4.958190737563203e-06,
"loss": 0.0002,
"reward": 2.4749373346567154,
"reward_std": 0.7473156917840242,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6489808422823747,
"rewards/reasoning_steps_reward": 0.41471355129033327,
"step": 38
},
{
"completion_length": 295.189453125,
"epoch": 0.624,
"grad_norm": 0.8828125,
"kl": 0.005519463520613499,
"learning_rate": 4.951532812316814e-06,
"loss": 0.0002,
"reward": 2.7017148807644844,
"reward_std": 0.713581632822752,
"rewards/accuracy_reward": 0.03515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7169778756797314,
"rewards/reasoning_steps_reward": 0.5156250111758709,
"step": 39
},
{
"completion_length": 289.95703125,
"epoch": 0.64,
"grad_norm": 0.8828125,
"kl": 0.005352065360057168,
"learning_rate": 4.944388344834205e-06,
"loss": 0.0002,
"reward": 2.7851984202861786,
"reward_std": 0.658753015100956,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7454567709937692,
"rewards/reasoning_steps_reward": 0.43945313338190317,
"step": 40
},
{
"completion_length": 290.80078125,
"epoch": 0.656,
"grad_norm": 0.8515625,
"kl": 0.00584478146629408,
"learning_rate": 4.936758753038551e-06,
"loss": 0.0002,
"reward": 2.83456464856863,
"reward_std": 0.6670792158693075,
"rewards/accuracy_reward": 0.056640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7892559381822745,
"rewards/reasoning_steps_reward": 0.41015625838190317,
"step": 41
},
{
"completion_length": 286.533203125,
"epoch": 0.672,
"grad_norm": 1.09375,
"kl": 0.009831015078816563,
"learning_rate": 4.92864555113298e-06,
"loss": 0.0004,
"reward": 3.0447439029812813,
"reward_std": 0.6739194095134735,
"rewards/accuracy_reward": 0.16015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8074493408203125,
"rewards/reasoning_steps_reward": 0.4622395820915699,
"step": 42
},
{
"completion_length": 295.37109375,
"epoch": 0.688,
"grad_norm": 0.95703125,
"kl": 0.0045513896038755774,
"learning_rate": 4.92005034930006e-06,
"loss": 0.0002,
"reward": 2.367835894227028,
"reward_std": 0.6928801033645868,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6254331463327011,
"rewards/reasoning_steps_reward": 0.4329427080228925,
"step": 43
},
{
"completion_length": 291.12890625,
"epoch": 0.704,
"grad_norm": 0.89453125,
"kl": 0.007478385392460041,
"learning_rate": 4.9109748533822315e-06,
"loss": 0.0003,
"reward": 3.1017851755023003,
"reward_std": 0.7546116765588522,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8752912487834692,
"rewards/reasoning_steps_reward": 0.42513021221384406,
"step": 44
},
{
"completion_length": 286.34375,
"epoch": 0.72,
"grad_norm": 0.90234375,
"kl": 0.007521548090153374,
"learning_rate": 4.901420864543265e-06,
"loss": 0.0003,
"reward": 2.608507961034775,
"reward_std": 0.6855722554028034,
"rewards/accuracy_reward": 0.119140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.674624165520072,
"rewards/reasoning_steps_reward": 0.4654947901144624,
"step": 45
},
{
"completion_length": 287.42578125,
"epoch": 0.736,
"grad_norm": 0.984375,
"kl": 0.006817970628617331,
"learning_rate": 4.891390278910788e-06,
"loss": 0.0003,
"reward": 2.673181392252445,
"reward_std": 0.7831938974559307,
"rewards/accuracy_reward": 0.095703125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7087687657525142,
"rewards/reasoning_steps_reward": 0.4511718712747097,
"step": 46
},
{
"completion_length": 285.908203125,
"epoch": 0.752,
"grad_norm": 1.2890625,
"kl": 0.007845322310458869,
"learning_rate": 4.880885087199972e-06,
"loss": 0.0003,
"reward": 2.7633985728025436,
"reward_std": 0.7148055490106344,
"rewards/accuracy_reward": 0.099609375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.750776955857873,
"rewards/reasoning_steps_reward": 0.41145834047347307,
"step": 47
},
{
"completion_length": 289.779296875,
"epoch": 0.768,
"grad_norm": 1.15625,
"kl": 0.008501806572894566,
"learning_rate": 4.869907374318446e-06,
"loss": 0.0003,
"reward": 3.029990702867508,
"reward_std": 0.7890328913927078,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8099100968490044,
"rewards/reasoning_steps_reward": 0.5182291734963655,
"step": 48
},
{
"completion_length": 288.1328125,
"epoch": 0.784,
"grad_norm": 1.5390625,
"kl": 0.008691710012499243,
"learning_rate": 4.858459318952521e-06,
"loss": 0.0003,
"reward": 2.929666645824909,
"reward_std": 0.7696562893688679,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7836301922798157,
"rewards/reasoning_steps_reward": 0.5065104309469461,
"step": 49
},
{
"completion_length": 290.06640625,
"epoch": 0.8,
"grad_norm": 0.95703125,
"kl": 0.007455944927642122,
"learning_rate": 4.8465431931347904e-06,
"loss": 0.0003,
"reward": 2.573406994342804,
"reward_std": 0.7570146657526493,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6466478169895709,
"rewards/reasoning_steps_reward": 0.529947929084301,
"step": 50
},
{
"completion_length": 291.49609375,
"epoch": 0.816,
"grad_norm": 0.98046875,
"kl": 0.01057859291904606,
"learning_rate": 4.83416136179322e-06,
"loss": 0.0004,
"reward": 2.6446976363658905,
"reward_std": 0.6847481243312359,
"rewards/accuracy_reward": 0.025390625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7094738796974221,
"rewards/reasoning_steps_reward": 0.49088541977107525,
"step": 51
},
{
"completion_length": 282.392578125,
"epoch": 0.832,
"grad_norm": 2.03125,
"kl": 0.01016361394431442,
"learning_rate": 4.821316282281788e-06,
"loss": 0.0004,
"reward": 2.766519770026207,
"reward_std": 0.7637902311980724,
"rewards/accuracy_reward": 0.1328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6958277653902769,
"rewards/reasoning_steps_reward": 0.5462239757180214,
"step": 52
},
{
"completion_length": 287.380859375,
"epoch": 0.848,
"grad_norm": 1.5,
"kl": 0.00917052014847286,
"learning_rate": 4.808010503892788e-06,
"loss": 0.0004,
"reward": 2.570107080042362,
"reward_std": 0.7531391996890306,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6537943718334039,
"rewards/reasoning_steps_reward": 0.5032552108168602,
"step": 53
},
{
"completion_length": 291.640625,
"epoch": 0.864,
"grad_norm": 0.89453125,
"kl": 0.009139836591202766,
"learning_rate": 4.794246667350889e-06,
"loss": 0.0004,
"reward": 2.8398406505584717,
"reward_std": 0.7224904727190733,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.770832309499383,
"rewards/reasoning_steps_reward": 0.484375006519258,
"step": 54
},
{
"completion_length": 273.5,
"epoch": 0.88,
"grad_norm": 1.2734375,
"kl": 0.009875323972664773,
"learning_rate": 4.780027504289043e-06,
"loss": 0.0004,
"reward": 2.9461885392665863,
"reward_std": 0.7379185315221548,
"rewards/accuracy_reward": 0.193359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7422624975442886,
"rewards/reasoning_steps_reward": 0.5260416679084301,
"step": 55
},
{
"completion_length": 291.078125,
"epoch": 0.896,
"grad_norm": 0.8828125,
"kl": 0.00826937542296946,
"learning_rate": 4.765355836706349e-06,
"loss": 0.0003,
"reward": 2.880779907107353,
"reward_std": 0.711861016228795,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7781853148092827,
"rewards/reasoning_steps_reward": 0.5149739719927311,
"step": 56
},
{
"completion_length": 288.72265625,
"epoch": 0.912,
"grad_norm": 0.9140625,
"kl": 0.008809896156890318,
"learning_rate": 4.750234576407994e-06,
"loss": 0.0004,
"reward": 2.6955473721027374,
"reward_std": 0.8277835454791784,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6945227358179787,
"rewards/reasoning_steps_reward": 0.5240885429084301,
"step": 57
},
{
"completion_length": 285.9921875,
"epoch": 0.928,
"grad_norm": 0.90234375,
"kl": 0.008899325417587534,
"learning_rate": 4.734666724427357e-06,
"loss": 0.0004,
"reward": 3.041392058134079,
"reward_std": 0.6156999934464693,
"rewards/accuracy_reward": 0.115234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8032938965285817,
"rewards/reasoning_steps_reward": 0.516276054084301,
"step": 58
},
{
"completion_length": 290.314453125,
"epoch": 0.944,
"grad_norm": 4.21875,
"kl": 0.011989369959337637,
"learning_rate": 4.718655370430411e-06,
"loss": 0.0005,
"reward": 2.9865424036979675,
"reward_std": 0.8030446134507656,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7860957235097885,
"rewards/reasoning_steps_reward": 0.5501302164047956,
"step": 59
},
{
"completion_length": 282.091796875,
"epoch": 0.96,
"grad_norm": 1.8125,
"kl": 0.012043666996760294,
"learning_rate": 4.702203692102539e-06,
"loss": 0.0005,
"reward": 3.1328602582216263,
"reward_std": 0.6528369020670652,
"rewards/accuracy_reward": 0.111328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8283579163253307,
"rewards/reasoning_steps_reward": 0.5364583358168602,
"step": 60
},
{
"completion_length": 288.666015625,
"epoch": 0.976,
"grad_norm": 0.76953125,
"kl": 0.009388500155182555,
"learning_rate": 4.68531495451787e-06,
"loss": 0.0004,
"reward": 2.58310616761446,
"reward_std": 0.6356705613434315,
"rewards/accuracy_reward": 0.126953125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6383791274080673,
"rewards/reasoning_steps_reward": 0.5410156436264515,
"step": 61
},
{
"completion_length": 288.513671875,
"epoch": 0.992,
"grad_norm": 1.203125,
"kl": 0.010823950171470642,
"learning_rate": 4.66799250949128e-06,
"loss": 0.0004,
"reward": 3.1646435484290123,
"reward_std": 0.7192362230271101,
"rewards/accuracy_reward": 0.095703125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8294037394225597,
"rewards/reasoning_steps_reward": 0.5807291716337204,
"step": 62
},
{
"completion_length": 283.671875,
"epoch": 1.0,
"grad_norm": 0.65234375,
"kl": 0.011424218711908907,
"learning_rate": 4.650239794913177e-06,
"loss": 0.0002,
"reward": 2.6004482805728912,
"reward_std": 0.775815561413765,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6493681768576304,
"rewards/reasoning_steps_reward": 0.5039062574505806,
"step": 63
},
{
"completion_length": 293.845703125,
"epoch": 1.016,
"grad_norm": 1.921875,
"kl": 0.011366115068085492,
"learning_rate": 4.632060334067202e-06,
"loss": 0.0005,
"reward": 2.7260814532637596,
"reward_std": 0.6874045897275209,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6918969408919414,
"rewards/reasoning_steps_reward": 0.5722656305879354,
"step": 64
},
{
"completion_length": 294.06640625,
"epoch": 1.032,
"grad_norm": 2.171875,
"kl": 0.012063174799550325,
"learning_rate": 4.613457734930978e-06,
"loss": 0.0005,
"reward": 2.6708649322390556,
"reward_std": 0.6978613398969173,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6702362283443412,
"rewards/reasoning_steps_reward": 0.5507812555879354,
"step": 65
},
{
"completion_length": 293.265625,
"epoch": 1.048,
"grad_norm": 0.91015625,
"kl": 0.010817280679475516,
"learning_rate": 4.5944356894600615e-06,
"loss": 0.0004,
"reward": 2.96081106364727,
"reward_std": 0.7362911906093359,
"rewards/accuracy_reward": 0.052734375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7738293862591187,
"rewards/reasoning_steps_reward": 0.5865885466337204,
"step": 66
},
{
"completion_length": 278.333984375,
"epoch": 1.064,
"grad_norm": 0.82421875,
"kl": 0.010780130076454952,
"learning_rate": 4.574997972855212e-06,
"loss": 0.0004,
"reward": 2.909902695566416,
"reward_std": 0.6607580352574587,
"rewards/accuracy_reward": 0.228515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.708031815631936,
"rewards/reasoning_steps_reward": 0.5572916734963655,
"step": 67
},
{
"completion_length": 283.216796875,
"epoch": 1.08,
"grad_norm": 0.890625,
"kl": 0.012393000011797994,
"learning_rate": 4.5551484428131575e-06,
"loss": 0.0005,
"reward": 2.827034629881382,
"reward_std": 0.6700945645570755,
"rewards/accuracy_reward": 0.130859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7125271611536542,
"rewards/reasoning_steps_reward": 0.5585937593132257,
"step": 68
},
{
"completion_length": 288.322265625,
"epoch": 1.096,
"grad_norm": 1.21875,
"kl": 0.013504860282409936,
"learning_rate": 4.534891038760971e-06,
"loss": 0.0005,
"reward": 3.1474373564124107,
"reward_std": 0.7250996101647615,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.818677028020223,
"rewards/reasoning_steps_reward": 0.6171875037252903,
"step": 69
},
{
"completion_length": 282.470703125,
"epoch": 1.112,
"grad_norm": 1.7421875,
"kl": 0.010700971761252731,
"learning_rate": 4.514229781074239e-06,
"loss": 0.0004,
"reward": 2.8449594378471375,
"reward_std": 0.7744644097983837,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.70309411889563,
"rewards/reasoning_steps_reward": 0.5872395783662796,
"step": 70
},
{
"completion_length": 290.74609375,
"epoch": 1.1280000000000001,
"grad_norm": 0.98828125,
"kl": 0.012390443938784301,
"learning_rate": 4.49316877027916e-06,
"loss": 0.0005,
"reward": 2.777492232620716,
"reward_std": 0.6991744674742222,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6892856055249771,
"rewards/reasoning_steps_reward": 0.6002604309469461,
"step": 71
},
{
"completion_length": 286.109375,
"epoch": 1.144,
"grad_norm": 0.921875,
"kl": 0.012579885253217071,
"learning_rate": 4.471712186238728e-06,
"loss": 0.0005,
"reward": 2.548068232834339,
"reward_std": 0.6026105545461178,
"rewards/accuracy_reward": 0.15234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6145470403134823,
"rewards/reasoning_steps_reward": 0.5520833395421505,
"step": 72
},
{
"completion_length": 283.822265625,
"epoch": 1.16,
"grad_norm": 0.80859375,
"kl": 0.01136038324330002,
"learning_rate": 4.449864287323188e-06,
"loss": 0.0005,
"reward": 2.7529877200722694,
"reward_std": 0.575440164655447,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6726538874208927,
"rewards/reasoning_steps_reward": 0.633463554084301,
"step": 73
},
{
"completion_length": 287.66015625,
"epoch": 1.176,
"grad_norm": 0.99609375,
"kl": 0.013483586255460978,
"learning_rate": 4.427629409564898e-06,
"loss": 0.0005,
"reward": 2.6529831513762474,
"reward_std": 0.7726290188729763,
"rewards/accuracy_reward": 0.025390625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6770794546852509,
"rewards/reasoning_steps_reward": 0.5963541734963655,
"step": 74
},
{
"completion_length": 287.505859375,
"epoch": 1.192,
"grad_norm": 0.8828125,
"kl": 0.010077012644615024,
"learning_rate": 4.405011965797775e-06,
"loss": 0.0004,
"reward": 2.944363258779049,
"reward_std": 0.6908796802163124,
"rewards/accuracy_reward": 0.111328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7551089283078909,
"rewards/reasoning_steps_reward": 0.567708333954215,
"step": 75
},
{
"completion_length": 286.248046875,
"epoch": 1.208,
"grad_norm": 1.109375,
"kl": 0.014365001203259453,
"learning_rate": 4.382016444781509e-06,
"loss": 0.0006,
"reward": 2.8981464356184006,
"reward_std": 0.7666896525770426,
"rewards/accuracy_reward": 0.09765625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7290696154038111,
"rewards/reasoning_steps_reward": 0.6132812537252903,
"step": 76
},
{
"completion_length": 296.5078125,
"epoch": 1.224,
"grad_norm": 0.91796875,
"kl": 0.011508767551276833,
"learning_rate": 4.3586474103107034e-06,
"loss": 0.0005,
"reward": 3.2085797861218452,
"reward_std": 0.7307887077331543,
"rewards/accuracy_reward": 0.0078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8637974336743355,
"rewards/reasoning_steps_reward": 0.6093750055879354,
"step": 77
},
{
"completion_length": 288.017578125,
"epoch": 1.24,
"grad_norm": 1.7890625,
"kl": 0.01725275401258841,
"learning_rate": 4.334909500309124e-06,
"loss": 0.0007,
"reward": 2.819778010249138,
"reward_std": 0.680737467482686,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7289884922405084,
"rewards/reasoning_steps_reward": 0.5449218675494194,
"step": 78
},
{
"completion_length": 289.611328125,
"epoch": 1.256,
"grad_norm": 0.91015625,
"kl": 0.012111473915865645,
"learning_rate": 4.310807425909231e-06,
"loss": 0.0005,
"reward": 2.8959785476326942,
"reward_std": 0.7515880167484283,
"rewards/accuracy_reward": 0.091796875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7370275671904286,
"rewards/reasoning_steps_reward": 0.5930989608168602,
"step": 79
},
{
"completion_length": 289.447265625,
"epoch": 1.272,
"grad_norm": 1.0625,
"kl": 0.01488638247246854,
"learning_rate": 4.286345970517195e-06,
"loss": 0.0006,
"reward": 3.0342861488461494,
"reward_std": 0.7542771827429533,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7896405051772794,
"rewards/reasoning_steps_reward": 0.5833333432674408,
"step": 80
},
{
"completion_length": 291.05078125,
"epoch": 1.288,
"grad_norm": 2.203125,
"kl": 0.018765830318443477,
"learning_rate": 4.261529988863552e-06,
"loss": 0.0008,
"reward": 2.6918394044041634,
"reward_std": 0.5996266044676304,
"rewards/accuracy_reward": 0.0546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6713683251291513,
"rewards/reasoning_steps_reward": 0.6230468954890966,
"step": 81
},
{
"completion_length": 286.84765625,
"epoch": 1.304,
"grad_norm": 1.1796875,
"kl": 0.014768981840461493,
"learning_rate": 4.236364406039718e-06,
"loss": 0.0006,
"reward": 2.7222700491547585,
"reward_std": 0.7165388073772192,
"rewards/accuracy_reward": 0.166015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6546021662652493,
"rewards/reasoning_steps_reward": 0.5924479365348816,
"step": 82
},
{
"completion_length": 286.40625,
"epoch": 1.32,
"grad_norm": 1.1875,
"kl": 0.013985031400807202,
"learning_rate": 4.210854216520529e-06,
"loss": 0.0006,
"reward": 2.992369443178177,
"reward_std": 0.704998791217804,
"rewards/accuracy_reward": 0.140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7387759207437435,
"rewards/reasoning_steps_reward": 0.6354166734963655,
"step": 83
},
{
"completion_length": 285.0546875,
"epoch": 1.336,
"grad_norm": 0.98828125,
"kl": 0.015817424457054585,
"learning_rate": 4.185004483173018e-06,
"loss": 0.0006,
"reward": 2.6470197066664696,
"reward_std": 0.6006427239626646,
"rewards/accuracy_reward": 0.091796875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6638069117131332,
"rewards/reasoning_steps_reward": 0.5638020895421505,
"step": 84
},
{
"completion_length": 289.240234375,
"epoch": 1.3519999999999999,
"grad_norm": 0.7734375,
"kl": 0.012478121934691444,
"learning_rate": 4.158820336251615e-06,
"loss": 0.0005,
"reward": 2.86134272813797,
"reward_std": 0.6924843583256006,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7050829759488503,
"rewards/reasoning_steps_reward": 0.6425781324505806,
"step": 85
},
{
"completion_length": 290.703125,
"epoch": 1.3679999999999999,
"grad_norm": 1.0,
"kl": 0.01463651837548241,
"learning_rate": 4.132306972379971e-06,
"loss": 0.0006,
"reward": 2.752312555909157,
"reward_std": 0.6686646416783333,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6626632142191132,
"rewards/reasoning_steps_reward": 0.6705729197710752,
"step": 86
},
{
"completion_length": 295.359375,
"epoch": 1.384,
"grad_norm": 6.46875,
"kl": 0.051578508340753615,
"learning_rate": 4.105469653519617e-06,
"loss": 0.0021,
"reward": 2.62810418009758,
"reward_std": 0.7081009931862354,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.630809023976326,
"rewards/reasoning_steps_reward": 0.6536458320915699,
"step": 87
},
{
"completion_length": 274.9765625,
"epoch": 1.4,
"grad_norm": 0.9921875,
"kl": 0.01592816604534164,
"learning_rate": 4.078313705925647e-06,
"loss": 0.0006,
"reward": 2.9463500678539276,
"reward_std": 0.6255538109689951,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.735371884269019,
"rewards/reasoning_steps_reward": 0.5683593694120646,
"step": 88
},
{
"completion_length": 285.123046875,
"epoch": 1.416,
"grad_norm": 0.9375,
"kl": 0.016973954916466027,
"learning_rate": 4.0508445190896505e-06,
"loss": 0.0007,
"reward": 2.821994110941887,
"reward_std": 0.6989093981683254,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7006473361204067,
"rewards/reasoning_steps_reward": 0.5970052275806665,
"step": 89
},
{
"completion_length": 289.72265625,
"epoch": 1.432,
"grad_norm": 1.03125,
"kl": 0.013915765506681055,
"learning_rate": 4.023067544670082e-06,
"loss": 0.0006,
"reward": 2.775428354740143,
"reward_std": 0.6686036083847284,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7163754136612018,
"rewards/reasoning_steps_reward": 0.5755208488553762,
"step": 90
},
{
"completion_length": 285.888671875,
"epoch": 1.448,
"grad_norm": 0.84375,
"kl": 0.014326595468446612,
"learning_rate": 3.9949882954103115e-06,
"loss": 0.0006,
"reward": 2.778537377715111,
"reward_std": 0.6794710606336594,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6774811816091338,
"rewards/reasoning_steps_reward": 0.6406249962747097,
"step": 91
},
{
"completion_length": 288.5703125,
"epoch": 1.464,
"grad_norm": 1.0078125,
"kl": 0.0172699682880193,
"learning_rate": 3.9666123440445295e-06,
"loss": 0.0007,
"reward": 3.1450441628694534,
"reward_std": 0.6363171022385359,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8013862585648894,
"rewards/reasoning_steps_reward": 0.6588541604578495,
"step": 92
},
{
"completion_length": 290.484375,
"epoch": 1.48,
"grad_norm": 1.0703125,
"kl": 0.01583321939688176,
"learning_rate": 3.937945322191763e-06,
"loss": 0.0006,
"reward": 2.80034501850605,
"reward_std": 0.6433209720999002,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6977712381631136,
"rewards/reasoning_steps_reward": 0.6601562574505806,
"step": 93
},
{
"completion_length": 288.1953125,
"epoch": 1.496,
"grad_norm": 0.8515625,
"kl": 0.014628544799052179,
"learning_rate": 3.9089929192382e-06,
"loss": 0.0006,
"reward": 2.8053995221853256,
"reward_std": 0.6814130581915379,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6920776072268685,
"rewards/reasoning_steps_reward": 0.627604166045785,
"step": 94
},
{
"completion_length": 281.654296875,
"epoch": 1.512,
"grad_norm": 1.7578125,
"kl": 0.018830388551577926,
"learning_rate": 3.879760881208043e-06,
"loss": 0.0008,
"reward": 3.1405431628227234,
"reward_std": 0.6778986994177103,
"rewards/accuracy_reward": 0.126953125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7955456127723058,
"rewards/reasoning_steps_reward": 0.6269531361758709,
"step": 95
},
{
"completion_length": 288.43359375,
"epoch": 1.528,
"grad_norm": 0.97265625,
"kl": 0.015546579379588366,
"learning_rate": 3.8502550096231325e-06,
"loss": 0.0006,
"reward": 2.9025785624980927,
"reward_std": 0.6252446379512548,
"rewards/accuracy_reward": 0.14453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6997310233612856,
"rewards/reasoning_steps_reward": 0.6588541865348816,
"step": 96
},
{
"completion_length": 287.794921875,
"epoch": 1.544,
"grad_norm": 1.90625,
"kl": 0.01677697291597724,
"learning_rate": 3.82048116035155e-06,
"loss": 0.0007,
"reward": 2.9905193150043488,
"reward_std": 0.6695100143551826,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7668050316472849,
"rewards/reasoning_steps_reward": 0.5807291697710752,
"step": 97
},
{
"completion_length": 290.875,
"epoch": 1.56,
"grad_norm": 0.93359375,
"kl": 0.017896617820952088,
"learning_rate": 3.790445242445432e-06,
"loss": 0.0007,
"reward": 3.0564729273319244,
"reward_std": 0.7583746667951345,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.79595104791224,
"rewards/reasoning_steps_reward": 0.6061197966337204,
"step": 98
},
{
"completion_length": 290.935546875,
"epoch": 1.576,
"grad_norm": 0.87890625,
"kl": 0.01600857445737347,
"learning_rate": 3.7601532169682363e-06,
"loss": 0.0006,
"reward": 3.207048572599888,
"reward_std": 0.7255587056279182,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8183651498208443,
"rewards/reasoning_steps_reward": 0.6464843899011612,
"step": 99
},
{
"completion_length": 293.923828125,
"epoch": 1.592,
"grad_norm": 0.94921875,
"kl": 0.016008648555725813,
"learning_rate": 3.7296110958116845e-06,
"loss": 0.0006,
"reward": 3.213783323764801,
"reward_std": 0.7248476631939411,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8229972099264463,
"rewards/reasoning_steps_reward": 0.6861979197710752,
"step": 100
},
{
"completion_length": 293.623046875,
"epoch": 1.608,
"grad_norm": 1.890625,
"kl": 0.018201105995103717,
"learning_rate": 3.69882494050261e-06,
"loss": 0.0007,
"reward": 3.1282228976488113,
"reward_std": 0.7127013597637415,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.780154167364041,
"rewards/reasoning_steps_reward": 0.7154948115348816,
"step": 101
},
{
"completion_length": 282.21875,
"epoch": 1.624,
"grad_norm": 1.0078125,
"kl": 0.01832750393077731,
"learning_rate": 3.6678008609999618e-06,
"loss": 0.0007,
"reward": 2.694710373878479,
"reward_std": 0.6631567031145096,
"rewards/accuracy_reward": 0.142578125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.630658664740622,
"rewards/reasoning_steps_reward": 0.6601562537252903,
"step": 102
},
{
"completion_length": 286.57421875,
"epoch": 1.6400000000000001,
"grad_norm": 0.8359375,
"kl": 0.01992178033106029,
"learning_rate": 3.636545014482198e-06,
"loss": 0.0008,
"reward": 2.5292934477329254,
"reward_std": 0.6474510300904512,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.5965700279921293,
"rewards/reasoning_steps_reward": 0.6165364626795053,
"step": 103
},
{
"completion_length": 292.59375,
"epoch": 1.6560000000000001,
"grad_norm": 0.9140625,
"kl": 0.01710453676059842,
"learning_rate": 3.6050636041252996e-06,
"loss": 0.0007,
"reward": 2.915451444685459,
"reward_std": 0.7090357206761837,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7202980586638054,
"rewards/reasoning_steps_reward": 0.6822916753590107,
"step": 104
},
{
"completion_length": 289.3359375,
"epoch": 1.6720000000000002,
"grad_norm": 1.0859375,
"kl": 0.017357071512378752,
"learning_rate": 3.5733628778716645e-06,
"loss": 0.0007,
"reward": 3.073413372039795,
"reward_std": 0.6876837071031332,
"rewards/accuracy_reward": 0.119140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7621013515939316,
"rewards/reasoning_steps_reward": 0.6679687574505806,
"step": 105
},
{
"completion_length": 291.736328125,
"epoch": 1.688,
"grad_norm": 0.96875,
"kl": 0.02157578180776909,
"learning_rate": 3.5414491271901073e-06,
"loss": 0.0009,
"reward": 2.819728344678879,
"reward_std": 0.5820730160921812,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6790587411572536,
"rewards/reasoning_steps_reward": 0.6595052182674408,
"step": 106
},
{
"completion_length": 289.84375,
"epoch": 1.704,
"grad_norm": 0.90234375,
"kl": 0.0179019469069317,
"learning_rate": 3.5093286858272325e-06,
"loss": 0.0007,
"reward": 3.1114601120352745,
"reward_std": 0.6653738301247358,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7914936700835824,
"rewards/reasoning_steps_reward": 0.6549479365348816,
"step": 107
},
{
"completion_length": 288.5234375,
"epoch": 1.72,
"grad_norm": 0.99609375,
"kl": 0.01925749407382682,
"learning_rate": 3.4770079285504053e-06,
"loss": 0.0008,
"reward": 2.79416061937809,
"reward_std": 0.7290520258247852,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6850760908176502,
"rewards/reasoning_steps_reward": 0.6608072835952044,
"step": 108
},
{
"completion_length": 290.9921875,
"epoch": 1.736,
"grad_norm": 0.98046875,
"kl": 0.017092529160436243,
"learning_rate": 3.4444932698825904e-06,
"loss": 0.0007,
"reward": 3.1415600925683975,
"reward_std": 0.7225816715508699,
"rewards/accuracy_reward": 0.064453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7919783468047777,
"rewards/reasoning_steps_reward": 0.7011718712747097,
"step": 109
},
{
"completion_length": 293.560546875,
"epoch": 1.752,
"grad_norm": 0.88671875,
"kl": 0.016746411798521876,
"learning_rate": 3.4117911628292944e-06,
"loss": 0.0007,
"reward": 2.7672165408730507,
"reward_std": 0.6844876762479544,
"rewards/accuracy_reward": 0.041015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.680652025466164,
"rewards/reasoning_steps_reward": 0.6842447929084301,
"step": 110
},
{
"completion_length": 285.9921875,
"epoch": 1.768,
"grad_norm": 0.98828125,
"kl": 0.018680680135730654,
"learning_rate": 3.378908097597875e-06,
"loss": 0.0007,
"reward": 2.875435918569565,
"reward_std": 0.6584971006959677,
"rewards/accuracy_reward": 0.119140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.688730369011561,
"rewards/reasoning_steps_reward": 0.6901041753590107,
"step": 111
},
{
"completion_length": 284.619140625,
"epoch": 1.784,
"grad_norm": 1.1796875,
"kl": 0.01885543600656092,
"learning_rate": 3.3458506003094626e-06,
"loss": 0.0008,
"reward": 3.2833499684929848,
"reward_std": 0.630975978448987,
"rewards/accuracy_reward": 0.15234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8016982388993105,
"rewards/reasoning_steps_reward": 0.7259114682674408,
"step": 112
},
{
"completion_length": 294.26953125,
"epoch": 1.8,
"grad_norm": 0.859375,
"kl": 0.01702951017068699,
"learning_rate": 3.3126252317037616e-06,
"loss": 0.0007,
"reward": 3.0866554528474808,
"reward_std": 0.7168434467166662,
"rewards/accuracy_reward": 0.046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7908209301531315,
"rewards/reasoning_steps_reward": 0.6673177108168602,
"step": 113
},
{
"completion_length": 288.5625,
"epoch": 1.8159999999999998,
"grad_norm": 31.875,
"kl": 0.15217732661403716,
"learning_rate": 3.2792385858369706e-06,
"loss": 0.0061,
"reward": 2.8756242617964745,
"reward_std": 0.6984493192285299,
"rewards/accuracy_reward": 0.056640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7315449056526026,
"rewards/reasoning_steps_reward": 0.6243489552289248,
"step": 114
},
{
"completion_length": 283.037109375,
"epoch": 1.8319999999999999,
"grad_norm": 0.7578125,
"kl": 0.016328598430845886,
"learning_rate": 3.245697288773102e-06,
"loss": 0.0007,
"reward": 2.902892917394638,
"reward_std": 0.6528493817895651,
"rewards/accuracy_reward": 0.158203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6931084062283238,
"rewards/reasoning_steps_reward": 0.6653645895421505,
"step": 115
},
{
"completion_length": 292.095703125,
"epoch": 1.8479999999999999,
"grad_norm": 0.9140625,
"kl": 0.018676706589758396,
"learning_rate": 3.2120079972689385e-06,
"loss": 0.0007,
"reward": 2.9004068598151207,
"reward_std": 0.7504412587732077,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7139811124652624,
"rewards/reasoning_steps_reward": 0.6705729253590107,
"step": 116
},
{
"completion_length": 288.615234375,
"epoch": 1.8639999999999999,
"grad_norm": 1.1171875,
"kl": 0.020321853808127344,
"learning_rate": 3.1781773974529072e-06,
"loss": 0.0008,
"reward": 2.7037860229611397,
"reward_std": 0.6163357421755791,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6365050650201738,
"rewards/reasoning_steps_reward": 0.7005208358168602,
"step": 117
},
{
"completion_length": 290.62109375,
"epoch": 1.88,
"grad_norm": 1.09375,
"kl": 0.019285056594526395,
"learning_rate": 3.1442122034981187e-06,
"loss": 0.0008,
"reward": 2.6533412411808968,
"reward_std": 0.6223033964633942,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6294557445993026,
"rewards/reasoning_steps_reward": 0.6595052145421505,
"step": 118
},
{
"completion_length": 290.82421875,
"epoch": 1.896,
"grad_norm": 1.1953125,
"kl": 0.017033788317348808,
"learning_rate": 3.110119156289841e-06,
"loss": 0.0007,
"reward": 3.352183550596237,
"reward_std": 0.6941560469567776,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8489483365168174,
"rewards/reasoning_steps_reward": 0.7213541753590107,
"step": 119
},
{
"completion_length": 283.248046875,
"epoch": 1.912,
"grad_norm": 2.015625,
"kl": 0.024006142339203507,
"learning_rate": 3.075905022087675e-06,
"loss": 0.001,
"reward": 2.9336234778165817,
"reward_std": 0.649795226752758,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6996626891195774,
"rewards/reasoning_steps_reward": 0.6783854253590107,
"step": 120
},
{
"completion_length": 282.849609375,
"epoch": 1.928,
"grad_norm": 1.109375,
"kl": 0.02002483472460881,
"learning_rate": 3.0415765911826916e-06,
"loss": 0.0008,
"reward": 2.675464451313019,
"reward_std": 0.6967358216643333,
"rewards/accuracy_reward": 0.119140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6331409340103468,
"rewards/reasoning_steps_reward": 0.6569010391831398,
"step": 121
},
{
"completion_length": 290.783203125,
"epoch": 1.944,
"grad_norm": 1.1171875,
"kl": 0.019037541293073446,
"learning_rate": 3.0071406765498003e-06,
"loss": 0.0008,
"reward": 3.0036216378211975,
"reward_std": 0.6973935160785913,
"rewards/accuracy_reward": 0.080078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7529433167849978,
"rewards/reasoning_steps_reward": 0.6647135354578495,
"step": 122
},
{
"completion_length": 283.216796875,
"epoch": 1.96,
"grad_norm": 0.859375,
"kl": 0.017990577791351825,
"learning_rate": 2.9726041124956128e-06,
"loss": 0.0007,
"reward": 2.773143321275711,
"reward_std": 0.714199235662818,
"rewards/accuracy_reward": 0.111328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6735130585730076,
"rewards/reasoning_steps_reward": 0.6412760522216558,
"step": 123
},
{
"completion_length": 287.9765625,
"epoch": 1.976,
"grad_norm": 0.859375,
"kl": 0.017044205858837813,
"learning_rate": 2.9379737533020812e-06,
"loss": 0.0007,
"reward": 3.244216948747635,
"reward_std": 0.6938743200153112,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8157806489616632,
"rewards/reasoning_steps_reward": 0.7070312425494194,
"step": 124
},
{
"completion_length": 290.36328125,
"epoch": 1.992,
"grad_norm": 0.9765625,
"kl": 0.017912040289957076,
"learning_rate": 2.9032564718661606e-06,
"loss": 0.0007,
"reward": 2.990898907184601,
"reward_std": 0.6811724901199341,
"rewards/accuracy_reward": 0.05859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7619402616595229,
"rewards/reasoning_steps_reward": 0.6464843824505806,
"step": 125
},
{
"completion_length": 292.08203125,
"epoch": 2.0,
"grad_norm": 0.609375,
"kl": 0.017485147807747126,
"learning_rate": 2.8684591583357863e-06,
"loss": 0.0003,
"reward": 3.365106463432312,
"reward_std": 0.690997276455164,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8508687826494375,
"rewards/reasoning_steps_reward": 0.7304687425494194,
"step": 126
},
{
"completion_length": 274.076171875,
"epoch": 2.016,
"grad_norm": 0.9453125,
"kl": 0.020696480583865196,
"learning_rate": 2.8335887187424225e-06,
"loss": 0.0008,
"reward": 3.0040955543518066,
"reward_std": 0.6572606600821018,
"rewards/accuracy_reward": 0.248046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7068773318702976,
"rewards/reasoning_steps_reward": 0.6354166716337204,
"step": 127
},
{
"completion_length": 291.470703125,
"epoch": 2.032,
"grad_norm": 0.89453125,
"kl": 0.018271160661242902,
"learning_rate": 2.7986520736304632e-06,
"loss": 0.0007,
"reward": 2.8309315219521523,
"reward_std": 0.6785434670746326,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.688218497360746,
"rewards/reasoning_steps_reward": 0.694010429084301,
"step": 128
},
{
"completion_length": 295.984375,
"epoch": 2.048,
"grad_norm": 1.03125,
"kl": 0.018810921494150534,
"learning_rate": 2.7636561566837463e-06,
"loss": 0.0008,
"reward": 3.07464836537838,
"reward_std": 0.7456005457788706,
"rewards/accuracy_reward": 0.037109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7963671404868364,
"rewards/reasoning_steps_reward": 0.6484375149011612,
"step": 129
},
{
"completion_length": 281.150390625,
"epoch": 2.064,
"grad_norm": 0.98046875,
"kl": 0.020406899857334793,
"learning_rate": 2.728607913349464e-06,
"loss": 0.0008,
"reward": 2.931031860411167,
"reward_std": 0.6928635407239199,
"rewards/accuracy_reward": 0.15234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6862120016788443,
"rewards/reasoning_steps_reward": 0.7200520895421505,
"step": 130
},
{
"completion_length": 290.275390625,
"epoch": 2.08,
"grad_norm": 0.8203125,
"kl": 0.018310176266822964,
"learning_rate": 2.6935142994597407e-06,
"loss": 0.0007,
"reward": 3.099424757063389,
"reward_std": 0.6812999919056892,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7933412299801906,
"rewards/reasoning_steps_reward": 0.6412760503590107,
"step": 131
},
{
"completion_length": 291.0234375,
"epoch": 2.096,
"grad_norm": 0.96484375,
"kl": 0.01775828906102106,
"learning_rate": 2.6583822798511428e-06,
"loss": 0.0007,
"reward": 3.313634306192398,
"reward_std": 0.6808726880699396,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8779822603488961,
"rewards/reasoning_steps_reward": 0.5957031287252903,
"step": 132
},
{
"completion_length": 285.263671875,
"epoch": 2.112,
"grad_norm": 0.84375,
"kl": 0.018688918324187398,
"learning_rate": 2.623218826982411e-06,
"loss": 0.0007,
"reward": 2.7654543220996857,
"reward_std": 0.6947140172123909,
"rewards/accuracy_reward": 0.15234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.640568091844519,
"rewards/reasoning_steps_reward": 0.69140625,
"step": 133
},
{
"completion_length": 282.830078125,
"epoch": 2.128,
"grad_norm": 0.94140625,
"kl": 0.021032241464126855,
"learning_rate": 2.5880309195506714e-06,
"loss": 0.0008,
"reward": 2.8315402641892433,
"reward_std": 0.6945422478020191,
"rewards/accuracy_reward": 0.158203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.641546401505669,
"rewards/reasoning_steps_reward": 0.748697929084301,
"step": 134
},
{
"completion_length": 292.361328125,
"epoch": 2.144,
"grad_norm": 0.890625,
"kl": 0.018107893760316074,
"learning_rate": 2.552825541106414e-06,
"loss": 0.0007,
"reward": 3.0376425981521606,
"reward_std": 0.7193902563303709,
"rewards/accuracy_reward": 0.029296875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7757853666941324,
"rewards/reasoning_steps_reward": 0.6809895876795053,
"step": 135
},
{
"completion_length": 287.232421875,
"epoch": 2.16,
"grad_norm": 0.85546875,
"kl": 0.018850211054086685,
"learning_rate": 2.517609678667501e-06,
"loss": 0.0008,
"reward": 2.687412917613983,
"reward_std": 0.6682394985109568,
"rewards/accuracy_reward": 0.08984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6455872931207219,
"rewards/reasoning_steps_reward": 0.6608072966337204,
"step": 136
},
{
"completion_length": 290.71484375,
"epoch": 2.176,
"grad_norm": 0.859375,
"kl": 0.017231477366294712,
"learning_rate": 2.4823903213324995e-06,
"loss": 0.0007,
"reward": 3.0338680148124695,
"reward_std": 0.6302597746253014,
"rewards/accuracy_reward": 0.09765625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7382858718434969,
"rewards/reasoning_steps_reward": 0.7213541753590107,
"step": 137
},
{
"completion_length": 289.5703125,
"epoch": 2.192,
"grad_norm": 0.796875,
"kl": 0.01626200118334964,
"learning_rate": 2.447174458893587e-06,
"loss": 0.0007,
"reward": 2.984310381114483,
"reward_std": 0.6622797809541225,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7369576270381609,
"rewards/reasoning_steps_reward": 0.6718750167638063,
"step": 138
},
{
"completion_length": 287.76953125,
"epoch": 2.208,
"grad_norm": 0.75390625,
"kl": 0.01649257366079837,
"learning_rate": 2.4119690804493285e-06,
"loss": 0.0007,
"reward": 3.0554041862487793,
"reward_std": 0.7084929272532463,
"rewards/accuracy_reward": 0.107421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7320097194363674,
"rewards/reasoning_steps_reward": 0.7519531361758709,
"step": 139
},
{
"completion_length": 294.828125,
"epoch": 2.224,
"grad_norm": 0.984375,
"kl": 0.018743149645160884,
"learning_rate": 2.376781173017589e-06,
"loss": 0.0007,
"reward": 2.9738914221525192,
"reward_std": 0.6525749433785677,
"rewards/accuracy_reward": 0.041015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7449863621344169,
"rewards/reasoning_steps_reward": 0.6979166753590107,
"step": 140
},
{
"completion_length": 289.109375,
"epoch": 2.24,
"grad_norm": 0.98046875,
"kl": 0.022565504419617355,
"learning_rate": 2.3416177201488585e-06,
"loss": 0.0009,
"reward": 3.2985419929027557,
"reward_std": 0.6833065822720528,
"rewards/accuracy_reward": 0.099609375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8430035648246607,
"rewards/reasoning_steps_reward": 0.6699218768626451,
"step": 141
},
{
"completion_length": 288.521484375,
"epoch": 2.2560000000000002,
"grad_norm": 1.015625,
"kl": 0.020633232838008553,
"learning_rate": 2.3064857005402606e-06,
"loss": 0.0008,
"reward": 3.1613398045301437,
"reward_std": 0.7222296446561813,
"rewards/accuracy_reward": 0.095703125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7796913882096609,
"rewards/reasoning_steps_reward": 0.7265624962747097,
"step": 142
},
{
"completion_length": 279.908203125,
"epoch": 2.2720000000000002,
"grad_norm": 1.078125,
"kl": 0.02265268244082108,
"learning_rate": 2.2713920866505364e-06,
"loss": 0.0009,
"reward": 2.9546066522598267,
"reward_std": 0.681933119893074,
"rewards/accuracy_reward": 0.193359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7051379711677631,
"rewards/reasoning_steps_reward": 0.6458333414047956,
"step": 143
},
{
"completion_length": 288.0,
"epoch": 2.288,
"grad_norm": 0.875,
"kl": 0.01793542131781578,
"learning_rate": 2.236343843316254e-06,
"loss": 0.0007,
"reward": 2.790590211749077,
"reward_std": 0.651448430493474,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6706481222063303,
"rewards/reasoning_steps_reward": 0.7278645765036345,
"step": 144
},
{
"completion_length": 285.646484375,
"epoch": 2.304,
"grad_norm": 0.9609375,
"kl": 0.018404830596409738,
"learning_rate": 2.201347926369537e-06,
"loss": 0.0007,
"reward": 2.710278756916523,
"reward_std": 0.6365776527673006,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6241293720280131,
"rewards/reasoning_steps_reward": 0.7128906175494194,
"step": 145
},
{
"completion_length": 295.73046875,
"epoch": 2.32,
"grad_norm": 0.9921875,
"kl": 0.021149699110537767,
"learning_rate": 2.166411281257578e-06,
"loss": 0.0008,
"reward": 3.2047041803598404,
"reward_std": 0.7344950754195452,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8334256897990903,
"rewards/reasoning_steps_reward": 0.6731770820915699,
"step": 146
},
{
"completion_length": 288.49609375,
"epoch": 2.336,
"grad_norm": 1.921875,
"kl": 0.019101842306554317,
"learning_rate": 2.1315408416642145e-06,
"loss": 0.0008,
"reward": 2.9557630866765976,
"reward_std": 0.6881984118372202,
"rewards/accuracy_reward": 0.111328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7087786557773749,
"rewards/reasoning_steps_reward": 0.7180989496409893,
"step": 147
},
{
"completion_length": 283.1796875,
"epoch": 2.352,
"grad_norm": 0.82421875,
"kl": 0.01961760746780783,
"learning_rate": 2.09674352813384e-06,
"loss": 0.0008,
"reward": 3.1119301542639732,
"reward_std": 0.5922442562878132,
"rewards/accuracy_reward": 0.150390625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7777614261334141,
"rewards/reasoning_steps_reward": 0.6282552145421505,
"step": 148
},
{
"completion_length": 284.3828125,
"epoch": 2.368,
"grad_norm": 0.8671875,
"kl": 0.022024919569958,
"learning_rate": 2.062026246697919e-06,
"loss": 0.0009,
"reward": 3.0898532271385193,
"reward_std": 0.6860612127929926,
"rewards/accuracy_reward": 0.146484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7363313144693772,
"rewards/reasoning_steps_reward": 0.7343750149011612,
"step": 149
},
{
"completion_length": 286.181640625,
"epoch": 2.384,
"grad_norm": 1.1484375,
"kl": 0.01775654760422185,
"learning_rate": 2.0273958875043877e-06,
"loss": 0.0007,
"reward": 2.974420055747032,
"reward_std": 0.6679348535835743,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7212910537297527,
"rewards/reasoning_steps_reward": 0.6875000037252903,
"step": 150
},
{
"completion_length": 276.9296875,
"epoch": 2.4,
"grad_norm": 1.03125,
"kl": 0.02118692739168182,
"learning_rate": 1.992859323450201e-06,
"loss": 0.0008,
"reward": 2.724317155778408,
"reward_std": 0.6507551912218332,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6227324477707347,
"rewards/reasoning_steps_reward": 0.6686198022216558,
"step": 151
},
{
"completion_length": 285.744140625,
"epoch": 2.416,
"grad_norm": 4.90625,
"kl": 0.02042768005048856,
"learning_rate": 1.958423408817309e-06,
"loss": 0.0008,
"reward": 3.1025044322013855,
"reward_std": 0.6402757167816162,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7509650103747845,
"rewards/reasoning_steps_reward": 0.7011718768626451,
"step": 152
},
{
"completion_length": 286.40234375,
"epoch": 2.432,
"grad_norm": 1.046875,
"kl": 0.022553854738362134,
"learning_rate": 1.924094977912326e-06,
"loss": 0.0009,
"reward": 2.981735587120056,
"reward_std": 0.7370939962565899,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7332781835769614,
"rewards/reasoning_steps_reward": 0.6686198078095913,
"step": 153
},
{
"completion_length": 288.896484375,
"epoch": 2.448,
"grad_norm": 0.83203125,
"kl": 0.019592860713601112,
"learning_rate": 1.8898808437101598e-06,
"loss": 0.0008,
"reward": 2.95571531355381,
"reward_std": 0.7355391271412373,
"rewards/accuracy_reward": 0.068359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7332853010545174,
"rewards/reasoning_steps_reward": 0.6875000037252903,
"step": 154
},
{
"completion_length": 294.271484375,
"epoch": 2.464,
"grad_norm": 0.94140625,
"kl": 0.019800655485596508,
"learning_rate": 1.8557877965018817e-06,
"loss": 0.0008,
"reward": 3.0556194335222244,
"reward_std": 0.7033564373850822,
"rewards/accuracy_reward": 0.044921875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7715779819215337,
"rewards/reasoning_steps_reward": 0.6959635429084301,
"step": 155
},
{
"completion_length": 294.31640625,
"epoch": 2.48,
"grad_norm": 0.859375,
"kl": 0.018254225375130773,
"learning_rate": 1.8218226025470934e-06,
"loss": 0.0007,
"reward": 3.604881629347801,
"reward_std": 0.715133111923933,
"rewards/accuracy_reward": 0.052734375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.9479379473874966,
"rewards/reasoning_steps_reward": 0.7083333265036345,
"step": 156
},
{
"completion_length": 289.25390625,
"epoch": 2.496,
"grad_norm": 0.83984375,
"kl": 0.017004019115120173,
"learning_rate": 1.7879920027310621e-06,
"loss": 0.0007,
"reward": 3.051852695643902,
"reward_std": 0.7096979664638638,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7540463662395874,
"rewards/reasoning_steps_reward": 0.7154948078095913,
"step": 157
},
{
"completion_length": 290.005859375,
"epoch": 2.512,
"grad_norm": 0.9765625,
"kl": 0.019147633225657046,
"learning_rate": 1.7543027112268994e-06,
"loss": 0.0008,
"reward": 2.991758108139038,
"reward_std": 0.684099368751049,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7318446912492315,
"rewards/reasoning_steps_reward": 0.6927083320915699,
"step": 158
},
{
"completion_length": 280.697265625,
"epoch": 2.528,
"grad_norm": 1.1171875,
"kl": 0.020928668964188546,
"learning_rate": 1.7207614141630304e-06,
"loss": 0.0008,
"reward": 2.596983939409256,
"reward_std": 0.6806027349084616,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6024234692255656,
"rewards/reasoning_steps_reward": 0.6608072984963655,
"step": 159
},
{
"completion_length": 285.546875,
"epoch": 2.544,
"grad_norm": 1.09375,
"kl": 0.022152581717818975,
"learning_rate": 1.6873747682962393e-06,
"loss": 0.0009,
"reward": 2.8694690242409706,
"reward_std": 0.6588537991046906,
"rewards/accuracy_reward": 0.126953125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6882605010954043,
"rewards/reasoning_steps_reward": 0.6777343805879354,
"step": 160
},
{
"completion_length": 283.427734375,
"epoch": 2.56,
"grad_norm": 0.94140625,
"kl": 0.020902132673654705,
"learning_rate": 1.6541493996905378e-06,
"loss": 0.0008,
"reward": 3.1272382587194443,
"reward_std": 0.6674788426607847,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7581245402495066,
"rewards/reasoning_steps_reward": 0.7239583395421505,
"step": 161
},
{
"completion_length": 286.408203125,
"epoch": 2.576,
"grad_norm": 0.953125,
"kl": 0.022079574409872293,
"learning_rate": 1.6210919024021258e-06,
"loss": 0.0009,
"reward": 2.9151505902409554,
"reward_std": 0.7153513710945845,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7115172014261285,
"rewards/reasoning_steps_reward": 0.6673177219927311,
"step": 162
},
{
"completion_length": 291.72265625,
"epoch": 2.592,
"grad_norm": 0.875,
"kl": 0.017877445730846375,
"learning_rate": 1.588208837170706e-06,
"loss": 0.0007,
"reward": 2.937485493719578,
"reward_std": 0.7016174159944057,
"rewards/accuracy_reward": 0.056640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.734153147165974,
"rewards/reasoning_steps_reward": 0.6783854216337204,
"step": 163
},
{
"completion_length": 289.837890625,
"epoch": 2.608,
"grad_norm": 1.015625,
"kl": 0.023135888564866036,
"learning_rate": 1.55550673011741e-06,
"loss": 0.0009,
"reward": 3.3134661614894867,
"reward_std": 0.6674238592386246,
"rewards/accuracy_reward": 0.09765625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.824106777086854,
"rewards/reasoning_steps_reward": 0.7434895969927311,
"step": 164
},
{
"completion_length": 285.73828125,
"epoch": 2.624,
"grad_norm": 0.8828125,
"kl": 0.017427237355150282,
"learning_rate": 1.522992071449595e-06,
"loss": 0.0007,
"reward": 2.761025607585907,
"reward_std": 0.5951798930764198,
"rewards/accuracy_reward": 0.115234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6484234320620695,
"rewards/reasoning_steps_reward": 0.7005208227783442,
"step": 165
},
{
"completion_length": 292.478515625,
"epoch": 2.64,
"grad_norm": 0.90625,
"kl": 0.02133324311580509,
"learning_rate": 1.4906713141727677e-06,
"loss": 0.0009,
"reward": 2.930042363703251,
"reward_std": 0.6626697592437267,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7227745447307825,
"rewards/reasoning_steps_reward": 0.6992187462747097,
"step": 166
},
{
"completion_length": 279.865234375,
"epoch": 2.656,
"grad_norm": 1.09375,
"kl": 0.024935539229772985,
"learning_rate": 1.4585508728098935e-06,
"loss": 0.001,
"reward": 2.825145460665226,
"reward_std": 0.7417711336165667,
"rewards/accuracy_reward": 0.181640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6630693133920431,
"rewards/reasoning_steps_reward": 0.6542968759313226,
"step": 167
},
{
"completion_length": 293.625,
"epoch": 2.672,
"grad_norm": 1.0234375,
"kl": 0.018147769616916776,
"learning_rate": 1.4266371221283367e-06,
"loss": 0.0007,
"reward": 2.7056074738502502,
"reward_std": 0.6061353217810392,
"rewards/accuracy_reward": 0.02734375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6455757624159256,
"rewards/reasoning_steps_reward": 0.7415364719927311,
"step": 168
},
{
"completion_length": 293.119140625,
"epoch": 2.6879999999999997,
"grad_norm": 1.046875,
"kl": 0.019442370510660112,
"learning_rate": 1.3949363958747004e-06,
"loss": 0.0008,
"reward": 3.226225107908249,
"reward_std": 0.7255453541874886,
"rewards/accuracy_reward": 0.01953125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8371271109208465,
"rewards/reasoning_steps_reward": 0.6953125037252903,
"step": 169
},
{
"completion_length": 291.013671875,
"epoch": 2.7039999999999997,
"grad_norm": 0.8359375,
"kl": 0.01949766167672351,
"learning_rate": 1.363454985517803e-06,
"loss": 0.0008,
"reward": 2.700456887483597,
"reward_std": 0.7572273463010788,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6375654794586202,
"rewards/reasoning_steps_reward": 0.6744791809469461,
"step": 170
},
{
"completion_length": 292.544921875,
"epoch": 2.7199999999999998,
"grad_norm": 0.9453125,
"kl": 0.020666938507929444,
"learning_rate": 1.3321991390000382e-06,
"loss": 0.0008,
"reward": 2.9996937662363052,
"reward_std": 0.653770299628377,
"rewards/accuracy_reward": 0.044921875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7529360945336521,
"rewards/reasoning_steps_reward": 0.6959635615348816,
"step": 171
},
{
"completion_length": 289.091796875,
"epoch": 2.7359999999999998,
"grad_norm": 0.98828125,
"kl": 0.021086076740175486,
"learning_rate": 1.301175059497391e-06,
"loss": 0.0008,
"reward": 2.95357333868742,
"reward_std": 0.6372328288853168,
"rewards/accuracy_reward": 0.13671875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.701104310962061,
"rewards/reasoning_steps_reward": 0.7135416828095913,
"step": 172
},
{
"completion_length": 281.466796875,
"epoch": 2.752,
"grad_norm": 0.94921875,
"kl": 0.02050035016145557,
"learning_rate": 1.270388904188316e-06,
"loss": 0.0008,
"reward": 2.7741658687591553,
"reward_std": 0.7323318216949701,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6441229923317829,
"rewards/reasoning_steps_reward": 0.6855468563735485,
"step": 173
},
{
"completion_length": 288.654296875,
"epoch": 2.768,
"grad_norm": 0.953125,
"kl": 0.018003857927396894,
"learning_rate": 1.2398467830317635e-06,
"loss": 0.0007,
"reward": 2.823809191584587,
"reward_std": 0.6888855472207069,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6687002858767906,
"rewards/reasoning_steps_reward": 0.7083333376795053,
"step": 174
},
{
"completion_length": 295.314453125,
"epoch": 2.784,
"grad_norm": 0.86328125,
"kl": 0.018295871559530497,
"learning_rate": 1.2095547575545685e-06,
"loss": 0.0007,
"reward": 3.150137387216091,
"reward_std": 0.6382329538464546,
"rewards/accuracy_reward": 0.041015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7963565507282814,
"rewards/reasoning_steps_reward": 0.7200520895421505,
"step": 175
},
{
"completion_length": 288.701171875,
"epoch": 2.8,
"grad_norm": 1.0234375,
"kl": 0.020645066746510565,
"learning_rate": 1.1795188396484505e-06,
"loss": 0.0008,
"reward": 2.7497966438531876,
"reward_std": 0.696668054908514,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6516249105334282,
"rewards/reasoning_steps_reward": 0.6914062425494194,
"step": 176
},
{
"completion_length": 285.642578125,
"epoch": 2.816,
"grad_norm": 1.1484375,
"kl": 0.01807958845165558,
"learning_rate": 1.149744990376868e-06,
"loss": 0.0007,
"reward": 2.925790064036846,
"reward_std": 0.6442860681563616,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6968345294396082,
"rewards/reasoning_steps_reward": 0.7102864719927311,
"step": 177
},
{
"completion_length": 294.732421875,
"epoch": 2.832,
"grad_norm": 2.765625,
"kl": 0.020244878192897886,
"learning_rate": 1.1202391187919575e-06,
"loss": 0.0008,
"reward": 3.2739059031009674,
"reward_std": 0.6519978456199169,
"rewards/accuracy_reward": 0.0546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8315363439420859,
"rewards/reasoning_steps_reward": 0.7246093861758709,
"step": 178
},
{
"completion_length": 287.73828125,
"epoch": 2.848,
"grad_norm": 1.09375,
"kl": 0.021688284177798778,
"learning_rate": 1.0910070807618012e-06,
"loss": 0.0009,
"reward": 2.786106266081333,
"reward_std": 0.676231924444437,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6424607516576847,
"rewards/reasoning_steps_reward": 0.7552083432674408,
"step": 179
},
{
"completion_length": 286.52734375,
"epoch": 2.864,
"grad_norm": 1.0546875,
"kl": 0.02280406339559704,
"learning_rate": 1.062054677808238e-06,
"loss": 0.0009,
"reward": 3.1157227605581284,
"reward_std": 0.6361609604209661,
"rewards/accuracy_reward": 0.099609375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.764702707529068,
"rewards/reasoning_steps_reward": 0.7220052257180214,
"step": 180
},
{
"completion_length": 290.2109375,
"epoch": 2.88,
"grad_norm": 1.1328125,
"kl": 0.028993367042858154,
"learning_rate": 1.033387655955471e-06,
"loss": 0.0012,
"reward": 3.221103757619858,
"reward_std": 0.5982426293194294,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8156717581053575,
"rewards/reasoning_steps_reward": 0.6998698078095913,
"step": 181
},
{
"completion_length": 289.28125,
"epoch": 2.896,
"grad_norm": 1.0390625,
"kl": 0.02006814256310463,
"learning_rate": 1.0050117045896889e-06,
"loss": 0.0008,
"reward": 2.751469612121582,
"reward_std": 0.7198650874197483,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6480593029409647,
"rewards/reasoning_steps_reward": 0.6940104179084301,
"step": 182
},
{
"completion_length": 292.52734375,
"epoch": 2.912,
"grad_norm": 1.4140625,
"kl": 0.022621202806476504,
"learning_rate": 9.769324553299174e-07,
"loss": 0.0009,
"reward": 3.1886699497699738,
"reward_std": 0.7633016854524612,
"rewards/accuracy_reward": 0.12890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7927076746709645,
"rewards/reasoning_steps_reward": 0.6816406436264515,
"step": 183
},
{
"completion_length": 288.79296875,
"epoch": 2.928,
"grad_norm": 0.98046875,
"kl": 0.021405818057246506,
"learning_rate": 9.491554809103509e-07,
"loss": 0.0009,
"reward": 2.6857599690556526,
"reward_std": 0.6840799152851105,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6376578211784363,
"rewards/reasoning_steps_reward": 0.6888020988553762,
"step": 184
},
{
"completion_length": 290.58203125,
"epoch": 2.944,
"grad_norm": 0.99609375,
"kl": 0.020275956427212805,
"learning_rate": 9.216862940743529e-07,
"loss": 0.0008,
"reward": 2.757513716816902,
"reward_std": 0.602615574374795,
"rewards/accuracy_reward": 0.115234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.636836152523756,
"rewards/reasoning_steps_reward": 0.7317708358168602,
"step": 185
},
{
"completion_length": 286.619140625,
"epoch": 2.96,
"grad_norm": 0.984375,
"kl": 0.019758898008149117,
"learning_rate": 8.945303464803833e-07,
"loss": 0.0008,
"reward": 3.0790238082408905,
"reward_std": 0.5770421475172043,
"rewards/accuracy_reward": 0.12109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7574610461791357,
"rewards/reasoning_steps_reward": 0.685546888038516,
"step": 186
},
{
"completion_length": 286.671875,
"epoch": 2.976,
"grad_norm": 0.9765625,
"kl": 0.02084403787739575,
"learning_rate": 8.676930276200294e-07,
"loss": 0.0008,
"reward": 3.0736390501260757,
"reward_std": 0.6433412320911884,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7658657878637314,
"rewards/reasoning_steps_reward": 0.7037760429084301,
"step": 187
},
{
"completion_length": 284.01171875,
"epoch": 2.992,
"grad_norm": 1.0,
"kl": 0.019843781657982618,
"learning_rate": 8.411796637483852e-07,
"loss": 0.0008,
"reward": 2.9655564725399017,
"reward_std": 0.6882054135203362,
"rewards/accuracy_reward": 0.107421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7163833907494942,
"rewards/reasoning_steps_reward": 0.7089843675494194,
"step": 188
},
{
"completion_length": 290.765625,
"epoch": 3.0,
"grad_norm": 0.69921875,
"kl": 0.017977926647290587,
"learning_rate": 8.149955168269822e-07,
"loss": 0.0004,
"reward": 2.5494449138641357,
"reward_std": 0.6191319935023785,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.5538079980760813,
"rewards/reasoning_steps_reward": 0.7864583507180214,
"step": 189
},
{
"completion_length": 289.603515625,
"epoch": 3.016,
"grad_norm": 0.87890625,
"kl": 0.019213943742215633,
"learning_rate": 7.891457834794711e-07,
"loss": 0.0008,
"reward": 3.084651954472065,
"reward_std": 0.6362812034785748,
"rewards/accuracy_reward": 0.09375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7643284083654484,
"rewards/reasoning_steps_reward": 0.6979166697710752,
"step": 190
},
{
"completion_length": 285.248046875,
"epoch": 3.032,
"grad_norm": 1.03125,
"kl": 0.020919292815960944,
"learning_rate": 7.636355939602824e-07,
"loss": 0.0008,
"reward": 2.85429210960865,
"reward_std": 0.6567655950784683,
"rewards/accuracy_reward": 0.078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6931841528664032,
"rewards/reasoning_steps_reward": 0.6966145969927311,
"step": 191
},
{
"completion_length": 290.787109375,
"epoch": 3.048,
"grad_norm": 0.87890625,
"kl": 0.016602561168838292,
"learning_rate": 7.384700111364487e-07,
"loss": 0.0007,
"reward": 2.8143509328365326,
"reward_std": 0.6266643963754177,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6690197528029481,
"rewards/reasoning_steps_reward": 0.6940104104578495,
"step": 192
},
{
"completion_length": 282.99609375,
"epoch": 3.064,
"grad_norm": 0.96875,
"kl": 0.02081725694006309,
"learning_rate": 7.136540294828062e-07,
"loss": 0.0008,
"reward": 2.8774597868323326,
"reward_std": 0.7187161836773157,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6952643600913385,
"rewards/reasoning_steps_reward": 0.7076822966337204,
"step": 193
},
{
"completion_length": 294.306640625,
"epoch": 3.08,
"grad_norm": 0.9296875,
"kl": 0.02071163459913805,
"learning_rate": 6.891925740907701e-07,
"loss": 0.0008,
"reward": 2.8051391541957855,
"reward_std": 0.6224446576088667,
"rewards/accuracy_reward": 0.021484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.695680051886787,
"rewards/reasoning_steps_reward": 0.6966145820915699,
"step": 194
},
{
"completion_length": 286.19140625,
"epoch": 3.096,
"grad_norm": 0.84765625,
"kl": 0.018767547328025103,
"learning_rate": 6.650904996908772e-07,
"loss": 0.0008,
"reward": 3.3200203105807304,
"reward_std": 0.7382683884352446,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8095814054831862,
"rewards/reasoning_steps_reward": 0.7428385503590107,
"step": 195
},
{
"completion_length": 285.822265625,
"epoch": 3.112,
"grad_norm": 1.078125,
"kl": 0.02104048355249688,
"learning_rate": 6.413525896892972e-07,
"loss": 0.0008,
"reward": 2.955541580915451,
"reward_std": 0.6638543289154768,
"rewards/accuracy_reward": 0.103515625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7093558870255947,
"rewards/reasoning_steps_reward": 0.7239583395421505,
"step": 196
},
{
"completion_length": 288.87890625,
"epoch": 3.128,
"grad_norm": 0.92578125,
"kl": 0.02037365094292909,
"learning_rate": 6.179835552184924e-07,
"loss": 0.0008,
"reward": 2.7349835634231567,
"reward_std": 0.6583398748189211,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.644300079283615,
"rewards/reasoning_steps_reward": 0.7278645858168602,
"step": 197
},
{
"completion_length": 288.52734375,
"epoch": 3.144,
"grad_norm": 0.9453125,
"kl": 0.02122843312099576,
"learning_rate": 5.949880342022258e-07,
"loss": 0.0008,
"reward": 3.1269255951046944,
"reward_std": 0.7235856931656599,
"rewards/accuracy_reward": 0.068359375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7810237923016151,
"rewards/reasoning_steps_reward": 0.7154947929084301,
"step": 198
},
{
"completion_length": 285.734375,
"epoch": 3.16,
"grad_norm": 0.9765625,
"kl": 0.02136942616198212,
"learning_rate": 5.723705904351027e-07,
"loss": 0.0009,
"reward": 2.681896522641182,
"reward_std": 0.6634827610105276,
"rewards/accuracy_reward": 0.109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6170557765290141,
"rewards/reasoning_steps_reward": 0.7213541716337204,
"step": 199
},
{
"completion_length": 286.953125,
"epoch": 3.176,
"grad_norm": 0.91796875,
"kl": 0.019215874548535794,
"learning_rate": 5.501357126768117e-07,
"loss": 0.0008,
"reward": 2.6373501121997833,
"reward_std": 0.7020009346306324,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.5913562929878632,
"rewards/reasoning_steps_reward": 0.7617187462747097,
"step": 200
},
{
"completion_length": 286.212890625,
"epoch": 3.192,
"grad_norm": 0.875,
"kl": 0.020688754506409168,
"learning_rate": 5.282878137612738e-07,
"loss": 0.0008,
"reward": 3.007347419857979,
"reward_std": 0.6104327123612165,
"rewards/accuracy_reward": 0.1328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7257564406221112,
"rewards/reasoning_steps_reward": 0.6972656305879354,
"step": 201
},
{
"completion_length": 286.62109375,
"epoch": 3.208,
"grad_norm": 0.8828125,
"kl": 0.02169125445652753,
"learning_rate": 5.068312297208414e-07,
"loss": 0.0009,
"reward": 3.0148477032780647,
"reward_std": 0.679189708083868,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7471367201457421,
"rewards/reasoning_steps_reward": 0.69140625,
"step": 202
},
{
"completion_length": 293.78125,
"epoch": 3.224,
"grad_norm": 0.85546875,
"kl": 0.02024375193286687,
"learning_rate": 4.857702189257613e-07,
"loss": 0.0008,
"reward": 3.007346175611019,
"reward_std": 0.6605745330452919,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7448532494405905,
"rewards/reasoning_steps_reward": 0.7063802145421505,
"step": 203
},
{
"completion_length": 296.005859375,
"epoch": 3.24,
"grad_norm": 0.875,
"kl": 0.020721249806229025,
"learning_rate": 4.6510896123903027e-07,
"loss": 0.0008,
"reward": 3.162186399102211,
"reward_std": 0.668110404163599,
"rewards/accuracy_reward": 0.03125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8146958220750093,
"rewards/reasoning_steps_reward": 0.6868489757180214,
"step": 204
},
{
"completion_length": 284.65234375,
"epoch": 3.2560000000000002,
"grad_norm": 1.109375,
"kl": 0.023086362169124186,
"learning_rate": 4.4485155718684334e-07,
"loss": 0.0009,
"reward": 2.8323604688048363,
"reward_std": 0.7187584564089775,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6745888954028487,
"rewards/reasoning_steps_reward": 0.6914062649011612,
"step": 205
},
{
"completion_length": 295.3203125,
"epoch": 3.2720000000000002,
"grad_norm": 0.890625,
"kl": 0.019205813470762223,
"learning_rate": 4.2500202714478853e-07,
"loss": 0.0008,
"reward": 3.3045015186071396,
"reward_std": 0.7383539900183678,
"rewards/accuracy_reward": 0.048828125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8480282643189033,
"rewards/reasoning_steps_reward": 0.7115885354578495,
"step": 206
},
{
"completion_length": 276.3671875,
"epoch": 3.288,
"grad_norm": 0.9296875,
"kl": 0.019852709374390543,
"learning_rate": 4.05564310539939e-07,
"loss": 0.0008,
"reward": 3.327822983264923,
"reward_std": 0.7267354801297188,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8076250155766805,
"rewards/reasoning_steps_reward": 0.686197929084301,
"step": 207
},
{
"completion_length": 295.84375,
"epoch": 3.304,
"grad_norm": 0.79296875,
"kl": 0.017830375931225717,
"learning_rate": 3.8654226506902204e-07,
"loss": 0.0007,
"reward": 2.7935037687420845,
"reward_std": 0.7168517392128706,
"rewards/accuracy_reward": 0.05078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6909335435678562,
"rewards/reasoning_steps_reward": 0.6699218731373549,
"step": 208
},
{
"completion_length": 289.603515625,
"epoch": 3.32,
"grad_norm": 1.1875,
"kl": 0.019766899524256587,
"learning_rate": 3.679396659327986e-07,
"loss": 0.0008,
"reward": 3.100301645696163,
"reward_std": 0.7392721492797136,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.789510258163015,
"rewards/reasoning_steps_reward": 0.6263020895421505,
"step": 209
},
{
"completion_length": 278.326171875,
"epoch": 3.336,
"grad_norm": 0.921875,
"kl": 0.020691857673227787,
"learning_rate": 3.4976020508682345e-07,
"loss": 0.0008,
"reward": 3.0393467769026756,
"reward_std": 0.6087249293923378,
"rewards/accuracy_reward": 0.16796875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7225339828679959,
"rewards/reasoning_steps_reward": 0.703776054084301,
"step": 210
},
{
"completion_length": 292.322265625,
"epoch": 3.352,
"grad_norm": 1.15625,
"kl": 0.020608096150681376,
"learning_rate": 3.320074905087212e-07,
"loss": 0.0008,
"reward": 2.8478069826960564,
"reward_std": 0.6319366451352835,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6738783651962876,
"rewards/reasoning_steps_reward": 0.7382812425494194,
"step": 211
},
{
"completion_length": 283.3984375,
"epoch": 3.368,
"grad_norm": 0.8671875,
"kl": 0.023123053135350347,
"learning_rate": 3.14685045482131e-07,
"loss": 0.0009,
"reward": 2.7474499940872192,
"reward_std": 0.6886056587100029,
"rewards/accuracy_reward": 0.095703125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6692888826752702,
"rewards/reasoning_steps_reward": 0.6438802052289248,
"step": 212
},
{
"completion_length": 295.224609375,
"epoch": 3.384,
"grad_norm": 0.9140625,
"kl": 0.023218440066557378,
"learning_rate": 2.977963078974616e-07,
"loss": 0.0009,
"reward": 2.9267039820551872,
"reward_std": 0.6514626033604145,
"rewards/accuracy_reward": 0.060546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7105940394103527,
"rewards/reasoning_steps_reward": 0.7343750055879354,
"step": 213
},
{
"completion_length": 287.263671875,
"epoch": 3.4,
"grad_norm": 0.98046875,
"kl": 0.021008892101235688,
"learning_rate": 2.813446295695893e-07,
"loss": 0.0008,
"reward": 3.1436211466789246,
"reward_std": 0.6997925061732531,
"rewards/accuracy_reward": 0.076171875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7876740458110968,
"rewards/reasoning_steps_reward": 0.7044270932674408,
"step": 214
},
{
"completion_length": 285.443359375,
"epoch": 3.416,
"grad_norm": 1.0625,
"kl": 0.02351184340659529,
"learning_rate": 2.65333275572644e-07,
"loss": 0.0009,
"reward": 2.9087352752685547,
"reward_std": 0.6324813142418861,
"rewards/accuracy_reward": 0.11328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.707642661097149,
"rewards/reasoning_steps_reward": 0.6725260391831398,
"step": 215
},
{
"completion_length": 292.380859375,
"epoch": 3.432,
"grad_norm": 0.88671875,
"kl": 0.023277590342331678,
"learning_rate": 2.4976542359200664e-07,
"loss": 0.0009,
"reward": 2.6246762797236443,
"reward_std": 0.691521966829896,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.610135139276584,
"rewards/reasoning_steps_reward": 0.7220052182674408,
"step": 216
},
{
"completion_length": 280.158203125,
"epoch": 3.448,
"grad_norm": 1.1484375,
"kl": 0.02378622384276241,
"learning_rate": 2.3464416329365137e-07,
"loss": 0.001,
"reward": 2.8623234406113625,
"reward_std": 0.6014144476503134,
"rewards/accuracy_reward": 0.13671875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6630921829491854,
"rewards/reasoning_steps_reward": 0.7363281361758709,
"step": 217
},
{
"completion_length": 295.375,
"epoch": 3.464,
"grad_norm": 0.8203125,
"kl": 0.016009816259611398,
"learning_rate": 2.1997249571095835e-07,
"loss": 0.0006,
"reward": 3.290237843990326,
"reward_std": 0.6886514872312546,
"rewards/accuracy_reward": 0.04296875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8556435058514277,
"rewards/reasoning_steps_reward": 0.6803385578095913,
"step": 218
},
{
"completion_length": 289.02734375,
"epoch": 3.48,
"grad_norm": 0.859375,
"kl": 0.02076311851851642,
"learning_rate": 2.0575333264911125e-07,
"loss": 0.0008,
"reward": 2.800406724214554,
"reward_std": 0.6951953694224358,
"rewards/accuracy_reward": 0.10546875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6608994637305537,
"rewards/reasoning_steps_reward": 0.7122395895421505,
"step": 219
},
{
"completion_length": 279.2421875,
"epoch": 3.496,
"grad_norm": 0.9921875,
"kl": 0.01861161779379472,
"learning_rate": 1.9198949610721273e-07,
"loss": 0.0007,
"reward": 2.7829076945781708,
"reward_std": 0.5952301491051912,
"rewards/accuracy_reward": 0.19140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6188251003623009,
"rewards/reasoning_steps_reward": 0.735026054084301,
"step": 220
},
{
"completion_length": 281.380859375,
"epoch": 3.512,
"grad_norm": 0.95703125,
"kl": 0.018613723281305283,
"learning_rate": 1.786837177182127e-07,
"loss": 0.0007,
"reward": 2.8807911574840546,
"reward_std": 0.6898195426911116,
"rewards/accuracy_reward": 0.15234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6807498056441545,
"rewards/reasoning_steps_reward": 0.686197929084301,
"step": 221
},
{
"completion_length": 287.46484375,
"epoch": 3.528,
"grad_norm": 0.859375,
"kl": 0.01997726986883208,
"learning_rate": 1.6583863820678032e-07,
"loss": 0.0008,
"reward": 2.8661443442106247,
"reward_std": 0.6607285998761654,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6637147692963481,
"rewards/reasoning_steps_reward": 0.7500000074505806,
"step": 222
},
{
"completion_length": 287.591796875,
"epoch": 3.544,
"grad_norm": 0.91015625,
"kl": 0.020406617608387023,
"learning_rate": 1.534568068652101e-07,
"loss": 0.0008,
"reward": 2.8175922632217407,
"reward_std": 0.772568928077817,
"rewards/accuracy_reward": 0.080078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6876783277839422,
"rewards/reasoning_steps_reward": 0.6744791697710752,
"step": 223
},
{
"completion_length": 281.27734375,
"epoch": 3.56,
"grad_norm": 0.921875,
"kl": 0.021257835964206606,
"learning_rate": 1.4154068104747981e-07,
"loss": 0.0009,
"reward": 3.101296618580818,
"reward_std": 0.7108908668160439,
"rewards/accuracy_reward": 0.1640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7581579011554519,
"rewards/reasoning_steps_reward": 0.662760416045785,
"step": 224
},
{
"completion_length": 279.06640625,
"epoch": 3.576,
"grad_norm": 1.375,
"kl": 0.022064094548113644,
"learning_rate": 1.3009262568155462e-07,
"loss": 0.0009,
"reward": 2.9315654188394547,
"reward_std": 0.705444760620594,
"rewards/accuracy_reward": 0.130859375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6976745830227932,
"rewards/reasoning_steps_reward": 0.707682304084301,
"step": 225
},
{
"completion_length": 291.966796875,
"epoch": 3.592,
"grad_norm": 2.09375,
"kl": 0.0230710570467636,
"learning_rate": 1.1911491280002907e-07,
"loss": 0.0009,
"reward": 3.4133089035749435,
"reward_std": 0.7498599980026484,
"rewards/accuracy_reward": 0.0625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8753998465836048,
"rewards/reasoning_steps_reward": 0.7246093787252903,
"step": 226
},
{
"completion_length": 277.298828125,
"epoch": 3.608,
"grad_norm": 0.83984375,
"kl": 0.019825019757263362,
"learning_rate": 1.0860972108921258e-07,
"loss": 0.0008,
"reward": 2.766988158226013,
"reward_std": 0.6906307358294725,
"rewards/accuracy_reward": 0.138671875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6410793742785851,
"rewards/reasoning_steps_reward": 0.7050781287252903,
"step": 227
},
{
"completion_length": 291.142578125,
"epoch": 3.624,
"grad_norm": 0.91015625,
"kl": 0.019932835886720568,
"learning_rate": 9.857913545673503e-08,
"loss": 0.0008,
"reward": 3.3143957555294037,
"reward_std": 0.6608162298798561,
"rewards/accuracy_reward": 0.080078125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.836786450818181,
"rewards/reasoning_steps_reward": 0.7239583283662796,
"step": 228
},
{
"completion_length": 288.453125,
"epoch": 3.64,
"grad_norm": 0.8984375,
"kl": 0.018874026485718787,
"learning_rate": 8.902514661776885e-08,
"loss": 0.0008,
"reward": 3.2070699259638786,
"reward_std": 0.7432738393545151,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8040493360410134,
"rewards/reasoning_steps_reward": 0.6699218824505806,
"step": 229
},
{
"completion_length": 293.146484375,
"epoch": 3.656,
"grad_norm": 0.80078125,
"kl": 0.016978327243123204,
"learning_rate": 7.994965069994143e-08,
"loss": 0.0007,
"reward": 3.143362358212471,
"reward_std": 0.6415095869451761,
"rewards/accuracy_reward": 0.072265625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7838985491544008,
"rewards/reasoning_steps_reward": 0.7194010280072689,
"step": 230
},
{
"completion_length": 288.62890625,
"epoch": 3.672,
"grad_norm": 1.1171875,
"kl": 0.021244205767288804,
"learning_rate": 7.135444886702064e-08,
"loss": 0.0008,
"reward": 2.9098562449216843,
"reward_std": 0.7155030779540539,
"rewards/accuracy_reward": 0.119140625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7021569274365902,
"rewards/reasoning_steps_reward": 0.684244804084301,
"step": 231
},
{
"completion_length": 285.921875,
"epoch": 3.6879999999999997,
"grad_norm": 0.81640625,
"kl": 0.01845627831062302,
"learning_rate": 6.324124696144962e-08,
"loss": 0.0007,
"reward": 2.8958379551768303,
"reward_std": 0.6293431017547846,
"rewards/accuracy_reward": 0.111328125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7039945904786388,
"rewards/reasoning_steps_reward": 0.6725260466337204,
"step": 232
},
{
"completion_length": 290.05859375,
"epoch": 3.7039999999999997,
"grad_norm": 0.91015625,
"kl": 0.017910517868585885,
"learning_rate": 5.5611655165795365e-08,
"loss": 0.0007,
"reward": 2.9697776436805725,
"reward_std": 0.6638195030391216,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7292921990156174,
"rewards/reasoning_steps_reward": 0.699869804084301,
"step": 233
},
{
"completion_length": 287.302734375,
"epoch": 3.7199999999999998,
"grad_norm": 0.859375,
"kl": 0.01873377658193931,
"learning_rate": 4.846718768318659e-08,
"loss": 0.0007,
"reward": 3.1371295899152756,
"reward_std": 0.6027075219899416,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.762940771256884,
"rewards/reasoning_steps_reward": 0.7233072891831398,
"step": 234
},
{
"completion_length": 293.28125,
"epoch": 3.7359999999999998,
"grad_norm": 0.90625,
"kl": 0.019052452000323683,
"learning_rate": 4.1809262436796896e-08,
"loss": 0.0008,
"reward": 3.043783374130726,
"reward_std": 0.6604214962571859,
"rewards/accuracy_reward": 0.06640625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7457142351195216,
"rewards/reasoning_steps_reward": 0.7402343787252903,
"step": 235
},
{
"completion_length": 292.73828125,
"epoch": 3.752,
"grad_norm": 0.828125,
"kl": 0.019031181174796075,
"learning_rate": 3.563920078843791e-08,
"loss": 0.0008,
"reward": 3.077702447772026,
"reward_std": 0.6683868058025837,
"rewards/accuracy_reward": 0.107421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7663521977762381,
"rewards/reasoning_steps_reward": 0.6712239496409893,
"step": 236
},
{
"completion_length": 281.478515625,
"epoch": 3.768,
"grad_norm": 1.21875,
"kl": 0.022773202043026686,
"learning_rate": 2.99582272763152e-08,
"loss": 0.0009,
"reward": 2.9361980706453323,
"reward_std": 0.667768020182848,
"rewards/accuracy_reward": 0.17578125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6935764315227667,
"rewards/reasoning_steps_reward": 0.6796875037252903,
"step": 237
},
{
"completion_length": 286.15234375,
"epoch": 3.784,
"grad_norm": 0.953125,
"kl": 0.020423304580617696,
"learning_rate": 2.4767469372002362e-08,
"loss": 0.0008,
"reward": 2.6705066189169884,
"reward_std": 0.6245338693261147,
"rewards/accuracy_reward": 0.162109375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.606531698256731,
"rewards/reasoning_steps_reward": 0.6888020746409893,
"step": 238
},
{
"completion_length": 291.84765625,
"epoch": 3.8,
"grad_norm": 0.84765625,
"kl": 0.017052936542313546,
"learning_rate": 2.0067957256676428e-08,
"loss": 0.0007,
"reward": 3.033589616417885,
"reward_std": 0.6652188412845135,
"rewards/accuracy_reward": 0.08203125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7442694374670585,
"rewards/reasoning_steps_reward": 0.7187499944120646,
"step": 239
},
{
"completion_length": 290.75,
"epoch": 3.816,
"grad_norm": 0.796875,
"kl": 0.021085154090542346,
"learning_rate": 1.5860623616664183e-08,
"loss": 0.0008,
"reward": 2.713896244764328,
"reward_std": 0.6678700372576714,
"rewards/accuracy_reward": 0.083984375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6320626304174463,
"rewards/reasoning_steps_reward": 0.733723958954215,
"step": 240
},
{
"completion_length": 286.052734375,
"epoch": 3.832,
"grad_norm": 0.93359375,
"kl": 0.02104910637717694,
"learning_rate": 1.2146303458337172e-08,
"loss": 0.0008,
"reward": 3.3307963609695435,
"reward_std": 0.6925474908202887,
"rewards/accuracy_reward": 0.123046875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8309685722924769,
"rewards/reasoning_steps_reward": 0.7148437593132257,
"step": 241
},
{
"completion_length": 286.875,
"epoch": 3.848,
"grad_norm": 2.375,
"kl": 0.023578285879921168,
"learning_rate": 8.92573394239149e-09,
"loss": 0.0009,
"reward": 2.9508322179317474,
"reward_std": 0.6057112123817205,
"rewards/accuracy_reward": 0.107421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7184197666744391,
"rewards/reasoning_steps_reward": 0.6881510633975267,
"step": 242
},
{
"completion_length": 296.923828125,
"epoch": 3.864,
"grad_norm": 1.0390625,
"kl": 0.019981018383987248,
"learning_rate": 6.1995542375495325e-09,
"loss": 0.0008,
"reward": 3.1651005297899246,
"reward_std": 0.6937647629529238,
"rewards/accuracy_reward": 0.064453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.8048164764574418,
"rewards/reasoning_steps_reward": 0.6861979216337204,
"step": 243
},
{
"completion_length": 289.396484375,
"epoch": 3.88,
"grad_norm": 0.82421875,
"kl": 0.01859537634300068,
"learning_rate": 3.96830539370563e-09,
"loss": 0.0007,
"reward": 3.588533952832222,
"reward_std": 0.7532828189432621,
"rewards/accuracy_reward": 0.087890625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.9253446195895473,
"rewards/reasoning_steps_reward": 0.724609375,
"step": 244
},
{
"completion_length": 294.28515625,
"epoch": 3.896,
"grad_norm": 0.94140625,
"kl": 0.01655962661607191,
"learning_rate": 2.2324302345483327e-09,
"loss": 0.0007,
"reward": 3.025103345513344,
"reward_std": 0.6890733204782009,
"rewards/accuracy_reward": 0.064453125,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7340622267996272,
"rewards/reasoning_steps_reward": 0.7584635354578495,
"step": 245
},
{
"completion_length": 288.9453125,
"epoch": 3.912,
"grad_norm": 0.97265625,
"kl": 0.020295250928029418,
"learning_rate": 9.922732696748816e-10,
"loss": 0.0008,
"reward": 2.733549617230892,
"reward_std": 0.6962179783731699,
"rewards/accuracy_reward": 0.07421875,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6568429181352258,
"rewards/reasoning_steps_reward": 0.6888020858168602,
"step": 246
},
{
"completion_length": 285.66796875,
"epoch": 3.928,
"grad_norm": 1.0546875,
"kl": 0.017885809938888997,
"learning_rate": 2.480806262181168e-10,
"loss": 0.0007,
"reward": 2.9583439081907272,
"reward_std": 0.6105441423133016,
"rewards/accuracy_reward": 0.115234375,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.7220087309057514,
"rewards/reasoning_steps_reward": 0.6770833320915699,
"step": 247
},
{
"completion_length": 288.5234375,
"epoch": 3.944,
"grad_norm": 26.625,
"kl": 0.03823809011373669,
"learning_rate": 0.0,
"loss": 0.0015,
"reward": 2.7989018857479095,
"reward_std": 0.6642574854195118,
"rewards/accuracy_reward": 0.1015625,
"rewards/format_reward": 0.0,
"rewards/novelty_reward_func_explore_exploit": 0.6736357094099125,
"rewards/reasoning_steps_reward": 0.676432304084301,
"step": 248
},
{
"epoch": 3.944,
"step": 248,
"total_flos": 0.0,
"train_loss": 0.006824205948613517,
"train_runtime": 18399.2985,
"train_samples_per_second": 0.435,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 248,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}