Qwen2.5-0.5B-Instruct-MemoryR
/
Qwen2.5-0.5B-Open-R1-GRPOmemory_combine-20-2000-0
/trainer_state.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 3.944, | |
"eval_steps": 100, | |
"global_step": 248, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"completion_length": 288.16796875, | |
"epoch": 0.016, | |
"grad_norm": 0.9921875, | |
"kl": 0.0, | |
"learning_rate": 2.0000000000000002e-07, | |
"loss": -0.0, | |
"reward": 2.1448024585843086, | |
"reward_std": 0.6503619067370892, | |
"rewards/accuracy_reward": 0.064453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.576262284691135, | |
"rewards/reasoning_steps_reward": 0.35156250186264515, | |
"step": 1 | |
}, | |
{ | |
"completion_length": 280.5390625, | |
"epoch": 0.032, | |
"grad_norm": 2.453125, | |
"kl": 0.0, | |
"learning_rate": 4.0000000000000003e-07, | |
"loss": -0.0, | |
"reward": 2.9461557120084763, | |
"reward_std": 0.7598665952682495, | |
"rewards/accuracy_reward": 0.017578125, | |
"rewards/format_reward": 0.001953125, | |
"rewards/novelty_reward_func_explore_exploit": 0.8809234369546175, | |
"rewards/reasoning_steps_reward": 0.2838541753590107, | |
"step": 2 | |
}, | |
{ | |
"completion_length": 282.580078125, | |
"epoch": 0.048, | |
"grad_norm": 1.609375, | |
"kl": 0.0010201742788922274, | |
"learning_rate": 6.000000000000001e-07, | |
"loss": 0.0, | |
"reward": 2.4310644939541817, | |
"reward_std": 0.7171800062060356, | |
"rewards/accuracy_reward": 0.099609375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.68535483473291, | |
"rewards/reasoning_steps_reward": 0.27539063314907253, | |
"step": 3 | |
}, | |
{ | |
"completion_length": 281.859375, | |
"epoch": 0.064, | |
"grad_norm": 1.6640625, | |
"kl": 0.0006131621394160902, | |
"learning_rate": 8.000000000000001e-07, | |
"loss": 0.0, | |
"reward": 2.392010949552059, | |
"reward_std": 0.7056797686964273, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6480314154177904, | |
"rewards/reasoning_steps_reward": 0.3248697896488011, | |
"step": 4 | |
}, | |
{ | |
"completion_length": 276.5234375, | |
"epoch": 0.08, | |
"grad_norm": 1.140625, | |
"kl": 0.0008916492552089039, | |
"learning_rate": 1.0000000000000002e-06, | |
"loss": 0.0, | |
"reward": 2.185966059565544, | |
"reward_std": 0.7970924656838179, | |
"rewards/accuracy_reward": 0.1171875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6036553451170524, | |
"rewards/reasoning_steps_reward": 0.2578125027939677, | |
"step": 5 | |
}, | |
{ | |
"completion_length": 290.345703125, | |
"epoch": 0.096, | |
"grad_norm": 0.98828125, | |
"kl": 0.0007805953682691325, | |
"learning_rate": 1.2000000000000002e-06, | |
"loss": 0.0, | |
"reward": 2.586206890642643, | |
"reward_std": 0.7317942306399345, | |
"rewards/accuracy_reward": 0.0625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7433623660666248, | |
"rewards/reasoning_steps_reward": 0.29361979849636555, | |
"step": 6 | |
}, | |
{ | |
"completion_length": 288.357421875, | |
"epoch": 0.112, | |
"grad_norm": 3.3125, | |
"kl": 0.0006862173449917464, | |
"learning_rate": 1.4000000000000001e-06, | |
"loss": 0.0, | |
"reward": 2.9549497589468956, | |
"reward_std": 0.7832636646926403, | |
"rewards/accuracy_reward": 0.060546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8552089141060909, | |
"rewards/reasoning_steps_reward": 0.3287760401144624, | |
"step": 7 | |
}, | |
{ | |
"completion_length": 291.0859375, | |
"epoch": 0.128, | |
"grad_norm": 1.34375, | |
"kl": 0.0007065349800541298, | |
"learning_rate": 1.6000000000000001e-06, | |
"loss": 0.0, | |
"reward": 2.769632026553154, | |
"reward_std": 0.6810889039188623, | |
"rewards/accuracy_reward": 0.02734375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8127506040036678, | |
"rewards/reasoning_steps_reward": 0.3040364640764892, | |
"step": 8 | |
}, | |
{ | |
"completion_length": 285.240234375, | |
"epoch": 0.144, | |
"grad_norm": 1.4921875, | |
"kl": 0.000676694346111617, | |
"learning_rate": 1.8000000000000001e-06, | |
"loss": 0.0, | |
"reward": 2.951853834092617, | |
"reward_std": 0.833003468811512, | |
"rewards/accuracy_reward": 0.02734375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8860780304918686, | |
"rewards/reasoning_steps_reward": 0.2662760391831398, | |
"step": 9 | |
}, | |
{ | |
"completion_length": 273.15625, | |
"epoch": 0.16, | |
"grad_norm": 2.40625, | |
"kl": 0.0007681718943786109, | |
"learning_rate": 2.0000000000000003e-06, | |
"loss": 0.0, | |
"reward": 2.4880168437957764, | |
"reward_std": 0.7941582556813955, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6932712296644846, | |
"rewards/reasoning_steps_reward": 0.3242187509313226, | |
"step": 10 | |
}, | |
{ | |
"completion_length": 263.72265625, | |
"epoch": 0.176, | |
"grad_norm": 1.8125, | |
"kl": 0.0007444877319358056, | |
"learning_rate": 2.2e-06, | |
"loss": 0.0, | |
"reward": 2.060094438493252, | |
"reward_std": 0.7453512959182262, | |
"rewards/accuracy_reward": 0.15625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.540864814693729, | |
"rewards/reasoning_steps_reward": 0.28125000139698386, | |
"step": 11 | |
}, | |
{ | |
"completion_length": 287.52734375, | |
"epoch": 0.192, | |
"grad_norm": 0.84765625, | |
"kl": 0.0006410041951312451, | |
"learning_rate": 2.4000000000000003e-06, | |
"loss": 0.0, | |
"reward": 2.6830679774284363, | |
"reward_std": 0.7234712429344654, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.775215346676608, | |
"rewards/reasoning_steps_reward": 0.28320312732830644, | |
"step": 12 | |
}, | |
{ | |
"completion_length": 271.40625, | |
"epoch": 0.208, | |
"grad_norm": 1.6875, | |
"kl": 0.0006587781517737312, | |
"learning_rate": 2.6e-06, | |
"loss": 0.0, | |
"reward": 2.297848492860794, | |
"reward_std": 0.7962923254817724, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.654187332217892, | |
"rewards/reasoning_steps_reward": 0.2610677082557231, | |
"step": 13 | |
}, | |
{ | |
"completion_length": 282.728515625, | |
"epoch": 0.224, | |
"grad_norm": 1.1015625, | |
"kl": 0.0009098516529775225, | |
"learning_rate": 2.8000000000000003e-06, | |
"loss": 0.0, | |
"reward": 2.512993238866329, | |
"reward_std": 0.7325041498988867, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.698124471741418, | |
"rewards/reasoning_steps_reward": 0.30533855268731713, | |
"step": 14 | |
}, | |
{ | |
"completion_length": 286.90234375, | |
"epoch": 0.24, | |
"grad_norm": 1.7890625, | |
"kl": 0.0009447168922633864, | |
"learning_rate": 3e-06, | |
"loss": 0.0, | |
"reward": 2.572930172085762, | |
"reward_std": 0.7189842760562897, | |
"rewards/accuracy_reward": 0.046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7254819249113401, | |
"rewards/reasoning_steps_reward": 0.34960938710719347, | |
"step": 15 | |
}, | |
{ | |
"completion_length": 282.0234375, | |
"epoch": 0.256, | |
"grad_norm": 1.9453125, | |
"kl": 0.0007176450344559271, | |
"learning_rate": 3.2000000000000003e-06, | |
"loss": 0.0, | |
"reward": 2.3780763298273087, | |
"reward_std": 0.7437136992812157, | |
"rewards/accuracy_reward": 0.044921875, | |
"rewards/format_reward": 0.001953125, | |
"rewards/novelty_reward_func_explore_exploit": 0.668343149125576, | |
"rewards/reasoning_steps_reward": 0.326171881519258, | |
"step": 16 | |
}, | |
{ | |
"completion_length": 277.181640625, | |
"epoch": 0.272, | |
"grad_norm": 1.84375, | |
"kl": 0.0009444843672099523, | |
"learning_rate": 3.4000000000000005e-06, | |
"loss": 0.0, | |
"reward": 2.6828741505742073, | |
"reward_std": 0.7672664560377598, | |
"rewards/accuracy_reward": 0.068359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7499771338577071, | |
"rewards/reasoning_steps_reward": 0.3645833469927311, | |
"step": 17 | |
}, | |
{ | |
"completion_length": 286.3671875, | |
"epoch": 0.288, | |
"grad_norm": 1.28125, | |
"kl": 0.0009558251094858861, | |
"learning_rate": 3.6000000000000003e-06, | |
"loss": 0.0, | |
"reward": 2.608707718551159, | |
"reward_std": 0.7508547510951757, | |
"rewards/accuracy_reward": 0.056640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7523817578330636, | |
"rewards/reasoning_steps_reward": 0.29492188477888703, | |
"step": 18 | |
}, | |
{ | |
"completion_length": 288.197265625, | |
"epoch": 0.304, | |
"grad_norm": 1.3203125, | |
"kl": 0.0009933830478985328, | |
"learning_rate": 3.8000000000000005e-06, | |
"loss": 0.0, | |
"reward": 3.152822159230709, | |
"reward_std": 0.7633876148611307, | |
"rewards/accuracy_reward": 0.03125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.9196473040307561, | |
"rewards/reasoning_steps_reward": 0.3626302182674408, | |
"step": 19 | |
}, | |
{ | |
"completion_length": 291.763671875, | |
"epoch": 0.32, | |
"grad_norm": 0.95703125, | |
"kl": 0.0010635810940584633, | |
"learning_rate": 4.000000000000001e-06, | |
"loss": 0.0, | |
"reward": 2.500213325023651, | |
"reward_std": 0.7017618604004383, | |
"rewards/accuracy_reward": 0.0859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6949495617300272, | |
"rewards/reasoning_steps_reward": 0.32942708022892475, | |
"step": 20 | |
}, | |
{ | |
"completion_length": 281.04296875, | |
"epoch": 0.336, | |
"grad_norm": 3872.0, | |
"kl": 38.398836399162974, | |
"learning_rate": 4.2000000000000004e-06, | |
"loss": 1.536, | |
"reward": 2.2889985144138336, | |
"reward_std": 0.7787356674671173, | |
"rewards/accuracy_reward": 0.068359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6505863160515825, | |
"rewards/reasoning_steps_reward": 0.2688802117481828, | |
"step": 21 | |
}, | |
{ | |
"completion_length": 288.47265625, | |
"epoch": 0.352, | |
"grad_norm": 1.0859375, | |
"kl": 0.0012669887473748531, | |
"learning_rate": 4.4e-06, | |
"loss": 0.0001, | |
"reward": 2.6376563012599945, | |
"reward_std": 0.6690970882773399, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7099479290967187, | |
"rewards/reasoning_steps_reward": 0.41992187313735485, | |
"step": 22 | |
}, | |
{ | |
"completion_length": 286.966796875, | |
"epoch": 0.368, | |
"grad_norm": 1.2734375, | |
"kl": 0.0015128458107938059, | |
"learning_rate": 4.600000000000001e-06, | |
"loss": 0.0001, | |
"reward": 2.5921228751540184, | |
"reward_std": 0.6939626764506102, | |
"rewards/accuracy_reward": 0.0859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7249350572625796, | |
"rewards/reasoning_steps_reward": 0.33138021221384406, | |
"step": 23 | |
}, | |
{ | |
"completion_length": 280.384765625, | |
"epoch": 0.384, | |
"grad_norm": 0.9375, | |
"kl": 0.0015014593445812352, | |
"learning_rate": 4.800000000000001e-06, | |
"loss": 0.0001, | |
"reward": 2.704825095832348, | |
"reward_std": 0.8105385769158602, | |
"rewards/accuracy_reward": 0.056640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7590302489697933, | |
"rewards/reasoning_steps_reward": 0.3710937546566129, | |
"step": 24 | |
}, | |
{ | |
"completion_length": 290.41796875, | |
"epoch": 0.4, | |
"grad_norm": 0.9140625, | |
"kl": 0.001665601652348414, | |
"learning_rate": 5e-06, | |
"loss": 0.0001, | |
"reward": 2.357452914118767, | |
"reward_std": 0.7088302746415138, | |
"rewards/accuracy_reward": 0.064453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6477967969452342, | |
"rewards/reasoning_steps_reward": 0.349609381519258, | |
"step": 25 | |
}, | |
{ | |
"completion_length": 291.509765625, | |
"epoch": 0.416, | |
"grad_norm": 0.9609375, | |
"kl": 0.001865061596618034, | |
"learning_rate": 4.999751919373782e-06, | |
"loss": 0.0001, | |
"reward": 2.281202170997858, | |
"reward_std": 0.6989514082670212, | |
"rewards/accuracy_reward": 0.08984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.5948191303759813, | |
"rewards/reasoning_steps_reward": 0.40690105222165585, | |
"step": 26 | |
}, | |
{ | |
"completion_length": 287.421875, | |
"epoch": 0.432, | |
"grad_norm": 0.9140625, | |
"kl": 0.002278451618622057, | |
"learning_rate": 4.9990077267303256e-06, | |
"loss": 0.0001, | |
"reward": 2.39421396702528, | |
"reward_std": 0.7001004256308079, | |
"rewards/accuracy_reward": 0.1171875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6474636799345413, | |
"rewards/reasoning_steps_reward": 0.3346354281529784, | |
"step": 27 | |
}, | |
{ | |
"completion_length": 289.5859375, | |
"epoch": 0.448, | |
"grad_norm": 1.0625, | |
"kl": 0.0022466240770881996, | |
"learning_rate": 4.997767569765452e-06, | |
"loss": 0.0001, | |
"reward": 2.673185594379902, | |
"reward_std": 0.7204618379473686, | |
"rewards/accuracy_reward": 0.060546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7612875507523617, | |
"rewards/reasoning_steps_reward": 0.32877604896202683, | |
"step": 28 | |
}, | |
{ | |
"completion_length": 290.353515625, | |
"epoch": 0.464, | |
"grad_norm": 0.94921875, | |
"kl": 0.002421206998405978, | |
"learning_rate": 4.996031694606294e-06, | |
"loss": 0.0001, | |
"reward": 2.2081645615398884, | |
"reward_std": 0.7610204052180052, | |
"rewards/accuracy_reward": 0.095703125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.5865322849713266, | |
"rewards/reasoning_steps_reward": 0.3528645895421505, | |
"step": 29 | |
}, | |
{ | |
"completion_length": 284.9765625, | |
"epoch": 0.48, | |
"grad_norm": 0.95703125, | |
"kl": 0.0032389966800110415, | |
"learning_rate": 4.993800445762451e-06, | |
"loss": 0.0001, | |
"reward": 2.4655835777521133, | |
"reward_std": 0.8400940801948309, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6395695237442851, | |
"rewards/reasoning_steps_reward": 0.45898438431322575, | |
"step": 30 | |
}, | |
{ | |
"completion_length": 288.93359375, | |
"epoch": 0.496, | |
"grad_norm": 1.0, | |
"kl": 0.0028132440565968864, | |
"learning_rate": 4.991074266057609e-06, | |
"loss": 0.0001, | |
"reward": 2.666738063097, | |
"reward_std": 0.6789926886558533, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7322286684066057, | |
"rewards/reasoning_steps_reward": 0.3821614580228925, | |
"step": 31 | |
}, | |
{ | |
"completion_length": 292.3671875, | |
"epoch": 0.512, | |
"grad_norm": 1.0078125, | |
"kl": 0.004060989667777903, | |
"learning_rate": 4.987853696541664e-06, | |
"loss": 0.0002, | |
"reward": 2.5818087458610535, | |
"reward_std": 0.6875880807638168, | |
"rewards/accuracy_reward": 0.0625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7134674986203512, | |
"rewards/reasoning_steps_reward": 0.3789062611758709, | |
"step": 32 | |
}, | |
{ | |
"completion_length": 286.158203125, | |
"epoch": 0.528, | |
"grad_norm": 1.140625, | |
"kl": 0.005552116854232736, | |
"learning_rate": 4.984139376383337e-06, | |
"loss": 0.0002, | |
"reward": 2.8399546705186367, | |
"reward_std": 0.750790286809206, | |
"rewards/accuracy_reward": 0.138671875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7791168199231228, | |
"rewards/reasoning_steps_reward": 0.3639322938397527, | |
"step": 33 | |
}, | |
{ | |
"completion_length": 287.48828125, | |
"epoch": 0.544, | |
"grad_norm": 3.171875, | |
"kl": 0.00440279851318337, | |
"learning_rate": 4.979932042743324e-06, | |
"loss": 0.0002, | |
"reward": 3.1019199565052986, | |
"reward_std": 0.8068479858338833, | |
"rewards/accuracy_reward": 0.06640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.877940321341157, | |
"rewards/reasoning_steps_reward": 0.4016927145421505, | |
"step": 34 | |
}, | |
{ | |
"completion_length": 291.712890625, | |
"epoch": 0.56, | |
"grad_norm": 0.83984375, | |
"kl": 0.003549927467247471, | |
"learning_rate": 4.975232530627998e-06, | |
"loss": 0.0001, | |
"reward": 2.758346803486347, | |
"reward_std": 0.73613665625453, | |
"rewards/accuracy_reward": 0.05078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7701433822512627, | |
"rewards/reasoning_steps_reward": 0.3971354244276881, | |
"step": 35 | |
}, | |
{ | |
"completion_length": 280.013671875, | |
"epoch": 0.576, | |
"grad_norm": 0.94921875, | |
"kl": 0.004770460931467824, | |
"learning_rate": 4.970041772723685e-06, | |
"loss": 0.0002, | |
"reward": 2.5518586486577988, | |
"reward_std": 0.752994803711772, | |
"rewards/accuracy_reward": 0.185546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6477115458498398, | |
"rewards/reasoning_steps_reward": 0.4231770895421505, | |
"step": 36 | |
}, | |
{ | |
"completion_length": 294.7265625, | |
"epoch": 0.592, | |
"grad_norm": 0.88671875, | |
"kl": 0.004318368082749657, | |
"learning_rate": 4.964360799211563e-06, | |
"loss": 0.0002, | |
"reward": 2.9847040474414825, | |
"reward_std": 0.7252895850688219, | |
"rewards/accuracy_reward": 0.03515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8412555102258921, | |
"rewards/reasoning_steps_reward": 0.42578124813735485, | |
"step": 37 | |
}, | |
{ | |
"completion_length": 287.59765625, | |
"epoch": 0.608, | |
"grad_norm": 0.92578125, | |
"kl": 0.005480331514263526, | |
"learning_rate": 4.958190737563203e-06, | |
"loss": 0.0002, | |
"reward": 2.4749373346567154, | |
"reward_std": 0.7473156917840242, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6489808422823747, | |
"rewards/reasoning_steps_reward": 0.41471355129033327, | |
"step": 38 | |
}, | |
{ | |
"completion_length": 295.189453125, | |
"epoch": 0.624, | |
"grad_norm": 0.8828125, | |
"kl": 0.005519463520613499, | |
"learning_rate": 4.951532812316814e-06, | |
"loss": 0.0002, | |
"reward": 2.7017148807644844, | |
"reward_std": 0.713581632822752, | |
"rewards/accuracy_reward": 0.03515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7169778756797314, | |
"rewards/reasoning_steps_reward": 0.5156250111758709, | |
"step": 39 | |
}, | |
{ | |
"completion_length": 289.95703125, | |
"epoch": 0.64, | |
"grad_norm": 0.8828125, | |
"kl": 0.005352065360057168, | |
"learning_rate": 4.944388344834205e-06, | |
"loss": 0.0002, | |
"reward": 2.7851984202861786, | |
"reward_std": 0.658753015100956, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7454567709937692, | |
"rewards/reasoning_steps_reward": 0.43945313338190317, | |
"step": 40 | |
}, | |
{ | |
"completion_length": 290.80078125, | |
"epoch": 0.656, | |
"grad_norm": 0.8515625, | |
"kl": 0.00584478146629408, | |
"learning_rate": 4.936758753038551e-06, | |
"loss": 0.0002, | |
"reward": 2.83456464856863, | |
"reward_std": 0.6670792158693075, | |
"rewards/accuracy_reward": 0.056640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7892559381822745, | |
"rewards/reasoning_steps_reward": 0.41015625838190317, | |
"step": 41 | |
}, | |
{ | |
"completion_length": 286.533203125, | |
"epoch": 0.672, | |
"grad_norm": 1.09375, | |
"kl": 0.009831015078816563, | |
"learning_rate": 4.92864555113298e-06, | |
"loss": 0.0004, | |
"reward": 3.0447439029812813, | |
"reward_std": 0.6739194095134735, | |
"rewards/accuracy_reward": 0.16015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8074493408203125, | |
"rewards/reasoning_steps_reward": 0.4622395820915699, | |
"step": 42 | |
}, | |
{ | |
"completion_length": 295.37109375, | |
"epoch": 0.688, | |
"grad_norm": 0.95703125, | |
"kl": 0.0045513896038755774, | |
"learning_rate": 4.92005034930006e-06, | |
"loss": 0.0002, | |
"reward": 2.367835894227028, | |
"reward_std": 0.6928801033645868, | |
"rewards/accuracy_reward": 0.05859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6254331463327011, | |
"rewards/reasoning_steps_reward": 0.4329427080228925, | |
"step": 43 | |
}, | |
{ | |
"completion_length": 291.12890625, | |
"epoch": 0.704, | |
"grad_norm": 0.89453125, | |
"kl": 0.007478385392460041, | |
"learning_rate": 4.9109748533822315e-06, | |
"loss": 0.0003, | |
"reward": 3.1017851755023003, | |
"reward_std": 0.7546116765588522, | |
"rewards/accuracy_reward": 0.05078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8752912487834692, | |
"rewards/reasoning_steps_reward": 0.42513021221384406, | |
"step": 44 | |
}, | |
{ | |
"completion_length": 286.34375, | |
"epoch": 0.72, | |
"grad_norm": 0.90234375, | |
"kl": 0.007521548090153374, | |
"learning_rate": 4.901420864543265e-06, | |
"loss": 0.0003, | |
"reward": 2.608507961034775, | |
"reward_std": 0.6855722554028034, | |
"rewards/accuracy_reward": 0.119140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.674624165520072, | |
"rewards/reasoning_steps_reward": 0.4654947901144624, | |
"step": 45 | |
}, | |
{ | |
"completion_length": 287.42578125, | |
"epoch": 0.736, | |
"grad_norm": 0.984375, | |
"kl": 0.006817970628617331, | |
"learning_rate": 4.891390278910788e-06, | |
"loss": 0.0003, | |
"reward": 2.673181392252445, | |
"reward_std": 0.7831938974559307, | |
"rewards/accuracy_reward": 0.095703125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7087687657525142, | |
"rewards/reasoning_steps_reward": 0.4511718712747097, | |
"step": 46 | |
}, | |
{ | |
"completion_length": 285.908203125, | |
"epoch": 0.752, | |
"grad_norm": 1.2890625, | |
"kl": 0.007845322310458869, | |
"learning_rate": 4.880885087199972e-06, | |
"loss": 0.0003, | |
"reward": 2.7633985728025436, | |
"reward_std": 0.7148055490106344, | |
"rewards/accuracy_reward": 0.099609375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.750776955857873, | |
"rewards/reasoning_steps_reward": 0.41145834047347307, | |
"step": 47 | |
}, | |
{ | |
"completion_length": 289.779296875, | |
"epoch": 0.768, | |
"grad_norm": 1.15625, | |
"kl": 0.008501806572894566, | |
"learning_rate": 4.869907374318446e-06, | |
"loss": 0.0003, | |
"reward": 3.029990702867508, | |
"reward_std": 0.7890328913927078, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8099100968490044, | |
"rewards/reasoning_steps_reward": 0.5182291734963655, | |
"step": 48 | |
}, | |
{ | |
"completion_length": 288.1328125, | |
"epoch": 0.784, | |
"grad_norm": 1.5390625, | |
"kl": 0.008691710012499243, | |
"learning_rate": 4.858459318952521e-06, | |
"loss": 0.0003, | |
"reward": 2.929666645824909, | |
"reward_std": 0.7696562893688679, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7836301922798157, | |
"rewards/reasoning_steps_reward": 0.5065104309469461, | |
"step": 49 | |
}, | |
{ | |
"completion_length": 290.06640625, | |
"epoch": 0.8, | |
"grad_norm": 0.95703125, | |
"kl": 0.007455944927642122, | |
"learning_rate": 4.8465431931347904e-06, | |
"loss": 0.0003, | |
"reward": 2.573406994342804, | |
"reward_std": 0.7570146657526493, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6466478169895709, | |
"rewards/reasoning_steps_reward": 0.529947929084301, | |
"step": 50 | |
}, | |
{ | |
"completion_length": 291.49609375, | |
"epoch": 0.816, | |
"grad_norm": 0.98046875, | |
"kl": 0.01057859291904606, | |
"learning_rate": 4.83416136179322e-06, | |
"loss": 0.0004, | |
"reward": 2.6446976363658905, | |
"reward_std": 0.6847481243312359, | |
"rewards/accuracy_reward": 0.025390625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7094738796974221, | |
"rewards/reasoning_steps_reward": 0.49088541977107525, | |
"step": 51 | |
}, | |
{ | |
"completion_length": 282.392578125, | |
"epoch": 0.832, | |
"grad_norm": 2.03125, | |
"kl": 0.01016361394431442, | |
"learning_rate": 4.821316282281788e-06, | |
"loss": 0.0004, | |
"reward": 2.766519770026207, | |
"reward_std": 0.7637902311980724, | |
"rewards/accuracy_reward": 0.1328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6958277653902769, | |
"rewards/reasoning_steps_reward": 0.5462239757180214, | |
"step": 52 | |
}, | |
{ | |
"completion_length": 287.380859375, | |
"epoch": 0.848, | |
"grad_norm": 1.5, | |
"kl": 0.00917052014847286, | |
"learning_rate": 4.808010503892788e-06, | |
"loss": 0.0004, | |
"reward": 2.570107080042362, | |
"reward_std": 0.7531391996890306, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6537943718334039, | |
"rewards/reasoning_steps_reward": 0.5032552108168602, | |
"step": 53 | |
}, | |
{ | |
"completion_length": 291.640625, | |
"epoch": 0.864, | |
"grad_norm": 0.89453125, | |
"kl": 0.009139836591202766, | |
"learning_rate": 4.794246667350889e-06, | |
"loss": 0.0004, | |
"reward": 2.8398406505584717, | |
"reward_std": 0.7224904727190733, | |
"rewards/accuracy_reward": 0.04296875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.770832309499383, | |
"rewards/reasoning_steps_reward": 0.484375006519258, | |
"step": 54 | |
}, | |
{ | |
"completion_length": 273.5, | |
"epoch": 0.88, | |
"grad_norm": 1.2734375, | |
"kl": 0.009875323972664773, | |
"learning_rate": 4.780027504289043e-06, | |
"loss": 0.0004, | |
"reward": 2.9461885392665863, | |
"reward_std": 0.7379185315221548, | |
"rewards/accuracy_reward": 0.193359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7422624975442886, | |
"rewards/reasoning_steps_reward": 0.5260416679084301, | |
"step": 55 | |
}, | |
{ | |
"completion_length": 291.078125, | |
"epoch": 0.896, | |
"grad_norm": 0.8828125, | |
"kl": 0.00826937542296946, | |
"learning_rate": 4.765355836706349e-06, | |
"loss": 0.0003, | |
"reward": 2.880779907107353, | |
"reward_std": 0.711861016228795, | |
"rewards/accuracy_reward": 0.03125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7781853148092827, | |
"rewards/reasoning_steps_reward": 0.5149739719927311, | |
"step": 56 | |
}, | |
{ | |
"completion_length": 288.72265625, | |
"epoch": 0.912, | |
"grad_norm": 0.9140625, | |
"kl": 0.008809896156890318, | |
"learning_rate": 4.750234576407994e-06, | |
"loss": 0.0004, | |
"reward": 2.6955473721027374, | |
"reward_std": 0.8277835454791784, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6945227358179787, | |
"rewards/reasoning_steps_reward": 0.5240885429084301, | |
"step": 57 | |
}, | |
{ | |
"completion_length": 285.9921875, | |
"epoch": 0.928, | |
"grad_norm": 0.90234375, | |
"kl": 0.008899325417587534, | |
"learning_rate": 4.734666724427357e-06, | |
"loss": 0.0004, | |
"reward": 3.041392058134079, | |
"reward_std": 0.6156999934464693, | |
"rewards/accuracy_reward": 0.115234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8032938965285817, | |
"rewards/reasoning_steps_reward": 0.516276054084301, | |
"step": 58 | |
}, | |
{ | |
"completion_length": 290.314453125, | |
"epoch": 0.944, | |
"grad_norm": 4.21875, | |
"kl": 0.011989369959337637, | |
"learning_rate": 4.718655370430411e-06, | |
"loss": 0.0005, | |
"reward": 2.9865424036979675, | |
"reward_std": 0.8030446134507656, | |
"rewards/accuracy_reward": 0.078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7860957235097885, | |
"rewards/reasoning_steps_reward": 0.5501302164047956, | |
"step": 59 | |
}, | |
{ | |
"completion_length": 282.091796875, | |
"epoch": 0.96, | |
"grad_norm": 1.8125, | |
"kl": 0.012043666996760294, | |
"learning_rate": 4.702203692102539e-06, | |
"loss": 0.0005, | |
"reward": 3.1328602582216263, | |
"reward_std": 0.6528369020670652, | |
"rewards/accuracy_reward": 0.111328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8283579163253307, | |
"rewards/reasoning_steps_reward": 0.5364583358168602, | |
"step": 60 | |
}, | |
{ | |
"completion_length": 288.666015625, | |
"epoch": 0.976, | |
"grad_norm": 0.76953125, | |
"kl": 0.009388500155182555, | |
"learning_rate": 4.68531495451787e-06, | |
"loss": 0.0004, | |
"reward": 2.58310616761446, | |
"reward_std": 0.6356705613434315, | |
"rewards/accuracy_reward": 0.126953125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6383791274080673, | |
"rewards/reasoning_steps_reward": 0.5410156436264515, | |
"step": 61 | |
}, | |
{ | |
"completion_length": 288.513671875, | |
"epoch": 0.992, | |
"grad_norm": 1.203125, | |
"kl": 0.010823950171470642, | |
"learning_rate": 4.66799250949128e-06, | |
"loss": 0.0004, | |
"reward": 3.1646435484290123, | |
"reward_std": 0.7192362230271101, | |
"rewards/accuracy_reward": 0.095703125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8294037394225597, | |
"rewards/reasoning_steps_reward": 0.5807291716337204, | |
"step": 62 | |
}, | |
{ | |
"completion_length": 283.671875, | |
"epoch": 1.0, | |
"grad_norm": 0.65234375, | |
"kl": 0.011424218711908907, | |
"learning_rate": 4.650239794913177e-06, | |
"loss": 0.0002, | |
"reward": 2.6004482805728912, | |
"reward_std": 0.775815561413765, | |
"rewards/accuracy_reward": 0.1484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6493681768576304, | |
"rewards/reasoning_steps_reward": 0.5039062574505806, | |
"step": 63 | |
}, | |
{ | |
"completion_length": 293.845703125, | |
"epoch": 1.016, | |
"grad_norm": 1.921875, | |
"kl": 0.011366115068085492, | |
"learning_rate": 4.632060334067202e-06, | |
"loss": 0.0005, | |
"reward": 2.7260814532637596, | |
"reward_std": 0.6874045897275209, | |
"rewards/accuracy_reward": 0.078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6918969408919414, | |
"rewards/reasoning_steps_reward": 0.5722656305879354, | |
"step": 64 | |
}, | |
{ | |
"completion_length": 294.06640625, | |
"epoch": 1.032, | |
"grad_norm": 2.171875, | |
"kl": 0.012063174799550325, | |
"learning_rate": 4.613457734930978e-06, | |
"loss": 0.0005, | |
"reward": 2.6708649322390556, | |
"reward_std": 0.6978613398969173, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6702362283443412, | |
"rewards/reasoning_steps_reward": 0.5507812555879354, | |
"step": 65 | |
}, | |
{ | |
"completion_length": 293.265625, | |
"epoch": 1.048, | |
"grad_norm": 0.91015625, | |
"kl": 0.010817280679475516, | |
"learning_rate": 4.5944356894600615e-06, | |
"loss": 0.0004, | |
"reward": 2.96081106364727, | |
"reward_std": 0.7362911906093359, | |
"rewards/accuracy_reward": 0.052734375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7738293862591187, | |
"rewards/reasoning_steps_reward": 0.5865885466337204, | |
"step": 66 | |
}, | |
{ | |
"completion_length": 278.333984375, | |
"epoch": 1.064, | |
"grad_norm": 0.82421875, | |
"kl": 0.010780130076454952, | |
"learning_rate": 4.574997972855212e-06, | |
"loss": 0.0004, | |
"reward": 2.909902695566416, | |
"reward_std": 0.6607580352574587, | |
"rewards/accuracy_reward": 0.228515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.708031815631936, | |
"rewards/reasoning_steps_reward": 0.5572916734963655, | |
"step": 67 | |
}, | |
{ | |
"completion_length": 283.216796875, | |
"epoch": 1.08, | |
"grad_norm": 0.890625, | |
"kl": 0.012393000011797994, | |
"learning_rate": 4.5551484428131575e-06, | |
"loss": 0.0005, | |
"reward": 2.827034629881382, | |
"reward_std": 0.6700945645570755, | |
"rewards/accuracy_reward": 0.130859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7125271611536542, | |
"rewards/reasoning_steps_reward": 0.5585937593132257, | |
"step": 68 | |
}, | |
{ | |
"completion_length": 288.322265625, | |
"epoch": 1.096, | |
"grad_norm": 1.21875, | |
"kl": 0.013504860282409936, | |
"learning_rate": 4.534891038760971e-06, | |
"loss": 0.0005, | |
"reward": 3.1474373564124107, | |
"reward_std": 0.7250996101647615, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.818677028020223, | |
"rewards/reasoning_steps_reward": 0.6171875037252903, | |
"step": 69 | |
}, | |
{ | |
"completion_length": 282.470703125, | |
"epoch": 1.112, | |
"grad_norm": 1.7421875, | |
"kl": 0.010700971761252731, | |
"learning_rate": 4.514229781074239e-06, | |
"loss": 0.0004, | |
"reward": 2.8449594378471375, | |
"reward_std": 0.7744644097983837, | |
"rewards/accuracy_reward": 0.1484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.70309411889563, | |
"rewards/reasoning_steps_reward": 0.5872395783662796, | |
"step": 70 | |
}, | |
{ | |
"completion_length": 290.74609375, | |
"epoch": 1.1280000000000001, | |
"grad_norm": 0.98828125, | |
"kl": 0.012390443938784301, | |
"learning_rate": 4.49316877027916e-06, | |
"loss": 0.0005, | |
"reward": 2.777492232620716, | |
"reward_std": 0.6991744674742222, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6892856055249771, | |
"rewards/reasoning_steps_reward": 0.6002604309469461, | |
"step": 71 | |
}, | |
{ | |
"completion_length": 286.109375, | |
"epoch": 1.144, | |
"grad_norm": 0.921875, | |
"kl": 0.012579885253217071, | |
"learning_rate": 4.471712186238728e-06, | |
"loss": 0.0005, | |
"reward": 2.548068232834339, | |
"reward_std": 0.6026105545461178, | |
"rewards/accuracy_reward": 0.15234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6145470403134823, | |
"rewards/reasoning_steps_reward": 0.5520833395421505, | |
"step": 72 | |
}, | |
{ | |
"completion_length": 283.822265625, | |
"epoch": 1.16, | |
"grad_norm": 0.80859375, | |
"kl": 0.01136038324330002, | |
"learning_rate": 4.449864287323188e-06, | |
"loss": 0.0005, | |
"reward": 2.7529877200722694, | |
"reward_std": 0.575440164655447, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6726538874208927, | |
"rewards/reasoning_steps_reward": 0.633463554084301, | |
"step": 73 | |
}, | |
{ | |
"completion_length": 287.66015625, | |
"epoch": 1.176, | |
"grad_norm": 0.99609375, | |
"kl": 0.013483586255460978, | |
"learning_rate": 4.427629409564898e-06, | |
"loss": 0.0005, | |
"reward": 2.6529831513762474, | |
"reward_std": 0.7726290188729763, | |
"rewards/accuracy_reward": 0.025390625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6770794546852509, | |
"rewards/reasoning_steps_reward": 0.5963541734963655, | |
"step": 74 | |
}, | |
{ | |
"completion_length": 287.505859375, | |
"epoch": 1.192, | |
"grad_norm": 0.8828125, | |
"kl": 0.010077012644615024, | |
"learning_rate": 4.405011965797775e-06, | |
"loss": 0.0004, | |
"reward": 2.944363258779049, | |
"reward_std": 0.6908796802163124, | |
"rewards/accuracy_reward": 0.111328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7551089283078909, | |
"rewards/reasoning_steps_reward": 0.567708333954215, | |
"step": 75 | |
}, | |
{ | |
"completion_length": 286.248046875, | |
"epoch": 1.208, | |
"grad_norm": 1.109375, | |
"kl": 0.014365001203259453, | |
"learning_rate": 4.382016444781509e-06, | |
"loss": 0.0006, | |
"reward": 2.8981464356184006, | |
"reward_std": 0.7666896525770426, | |
"rewards/accuracy_reward": 0.09765625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7290696154038111, | |
"rewards/reasoning_steps_reward": 0.6132812537252903, | |
"step": 76 | |
}, | |
{ | |
"completion_length": 296.5078125, | |
"epoch": 1.224, | |
"grad_norm": 0.91796875, | |
"kl": 0.011508767551276833, | |
"learning_rate": 4.3586474103107034e-06, | |
"loss": 0.0005, | |
"reward": 3.2085797861218452, | |
"reward_std": 0.7307887077331543, | |
"rewards/accuracy_reward": 0.0078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8637974336743355, | |
"rewards/reasoning_steps_reward": 0.6093750055879354, | |
"step": 77 | |
}, | |
{ | |
"completion_length": 288.017578125, | |
"epoch": 1.24, | |
"grad_norm": 1.7890625, | |
"kl": 0.01725275401258841, | |
"learning_rate": 4.334909500309124e-06, | |
"loss": 0.0007, | |
"reward": 2.819778010249138, | |
"reward_std": 0.680737467482686, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7289884922405084, | |
"rewards/reasoning_steps_reward": 0.5449218675494194, | |
"step": 78 | |
}, | |
{ | |
"completion_length": 289.611328125, | |
"epoch": 1.256, | |
"grad_norm": 0.91015625, | |
"kl": 0.012111473915865645, | |
"learning_rate": 4.310807425909231e-06, | |
"loss": 0.0005, | |
"reward": 2.8959785476326942, | |
"reward_std": 0.7515880167484283, | |
"rewards/accuracy_reward": 0.091796875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7370275671904286, | |
"rewards/reasoning_steps_reward": 0.5930989608168602, | |
"step": 79 | |
}, | |
{ | |
"completion_length": 289.447265625, | |
"epoch": 1.272, | |
"grad_norm": 1.0625, | |
"kl": 0.01488638247246854, | |
"learning_rate": 4.286345970517195e-06, | |
"loss": 0.0006, | |
"reward": 3.0342861488461494, | |
"reward_std": 0.7542771827429533, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7896405051772794, | |
"rewards/reasoning_steps_reward": 0.5833333432674408, | |
"step": 80 | |
}, | |
{ | |
"completion_length": 291.05078125, | |
"epoch": 1.288, | |
"grad_norm": 2.203125, | |
"kl": 0.018765830318443477, | |
"learning_rate": 4.261529988863552e-06, | |
"loss": 0.0008, | |
"reward": 2.6918394044041634, | |
"reward_std": 0.5996266044676304, | |
"rewards/accuracy_reward": 0.0546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6713683251291513, | |
"rewards/reasoning_steps_reward": 0.6230468954890966, | |
"step": 81 | |
}, | |
{ | |
"completion_length": 286.84765625, | |
"epoch": 1.304, | |
"grad_norm": 1.1796875, | |
"kl": 0.014768981840461493, | |
"learning_rate": 4.236364406039718e-06, | |
"loss": 0.0006, | |
"reward": 2.7222700491547585, | |
"reward_std": 0.7165388073772192, | |
"rewards/accuracy_reward": 0.166015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6546021662652493, | |
"rewards/reasoning_steps_reward": 0.5924479365348816, | |
"step": 82 | |
}, | |
{ | |
"completion_length": 286.40625, | |
"epoch": 1.32, | |
"grad_norm": 1.1875, | |
"kl": 0.013985031400807202, | |
"learning_rate": 4.210854216520529e-06, | |
"loss": 0.0006, | |
"reward": 2.992369443178177, | |
"reward_std": 0.704998791217804, | |
"rewards/accuracy_reward": 0.140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7387759207437435, | |
"rewards/reasoning_steps_reward": 0.6354166734963655, | |
"step": 83 | |
}, | |
{ | |
"completion_length": 285.0546875, | |
"epoch": 1.336, | |
"grad_norm": 0.98828125, | |
"kl": 0.015817424457054585, | |
"learning_rate": 4.185004483173018e-06, | |
"loss": 0.0006, | |
"reward": 2.6470197066664696, | |
"reward_std": 0.6006427239626646, | |
"rewards/accuracy_reward": 0.091796875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6638069117131332, | |
"rewards/reasoning_steps_reward": 0.5638020895421505, | |
"step": 84 | |
}, | |
{ | |
"completion_length": 289.240234375, | |
"epoch": 1.3519999999999999, | |
"grad_norm": 0.7734375, | |
"kl": 0.012478121934691444, | |
"learning_rate": 4.158820336251615e-06, | |
"loss": 0.0005, | |
"reward": 2.86134272813797, | |
"reward_std": 0.6924843583256006, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7050829759488503, | |
"rewards/reasoning_steps_reward": 0.6425781324505806, | |
"step": 85 | |
}, | |
{ | |
"completion_length": 290.703125, | |
"epoch": 1.3679999999999999, | |
"grad_norm": 1.0, | |
"kl": 0.01463651837548241, | |
"learning_rate": 4.132306972379971e-06, | |
"loss": 0.0006, | |
"reward": 2.752312555909157, | |
"reward_std": 0.6686646416783333, | |
"rewards/accuracy_reward": 0.09375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6626632142191132, | |
"rewards/reasoning_steps_reward": 0.6705729197710752, | |
"step": 86 | |
}, | |
{ | |
"completion_length": 295.359375, | |
"epoch": 1.384, | |
"grad_norm": 6.46875, | |
"kl": 0.051578508340753615, | |
"learning_rate": 4.105469653519617e-06, | |
"loss": 0.0021, | |
"reward": 2.62810418009758, | |
"reward_std": 0.7081009931862354, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.630809023976326, | |
"rewards/reasoning_steps_reward": 0.6536458320915699, | |
"step": 87 | |
}, | |
{ | |
"completion_length": 274.9765625, | |
"epoch": 1.4, | |
"grad_norm": 0.9921875, | |
"kl": 0.01592816604534164, | |
"learning_rate": 4.078313705925647e-06, | |
"loss": 0.0006, | |
"reward": 2.9463500678539276, | |
"reward_std": 0.6255538109689951, | |
"rewards/accuracy_reward": 0.171875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.735371884269019, | |
"rewards/reasoning_steps_reward": 0.5683593694120646, | |
"step": 88 | |
}, | |
{ | |
"completion_length": 285.123046875, | |
"epoch": 1.416, | |
"grad_norm": 0.9375, | |
"kl": 0.016973954916466027, | |
"learning_rate": 4.0508445190896505e-06, | |
"loss": 0.0007, | |
"reward": 2.821994110941887, | |
"reward_std": 0.6989093981683254, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7006473361204067, | |
"rewards/reasoning_steps_reward": 0.5970052275806665, | |
"step": 89 | |
}, | |
{ | |
"completion_length": 289.72265625, | |
"epoch": 1.432, | |
"grad_norm": 1.03125, | |
"kl": 0.013915765506681055, | |
"learning_rate": 4.023067544670082e-06, | |
"loss": 0.0006, | |
"reward": 2.775428354740143, | |
"reward_std": 0.6686036083847284, | |
"rewards/accuracy_reward": 0.05078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7163754136612018, | |
"rewards/reasoning_steps_reward": 0.5755208488553762, | |
"step": 90 | |
}, | |
{ | |
"completion_length": 285.888671875, | |
"epoch": 1.448, | |
"grad_norm": 0.84375, | |
"kl": 0.014326595468446612, | |
"learning_rate": 3.9949882954103115e-06, | |
"loss": 0.0006, | |
"reward": 2.778537377715111, | |
"reward_std": 0.6794710606336594, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6774811816091338, | |
"rewards/reasoning_steps_reward": 0.6406249962747097, | |
"step": 91 | |
}, | |
{ | |
"completion_length": 288.5703125, | |
"epoch": 1.464, | |
"grad_norm": 1.0078125, | |
"kl": 0.0172699682880193, | |
"learning_rate": 3.9666123440445295e-06, | |
"loss": 0.0007, | |
"reward": 3.1450441628694534, | |
"reward_std": 0.6363171022385359, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8013862585648894, | |
"rewards/reasoning_steps_reward": 0.6588541604578495, | |
"step": 92 | |
}, | |
{ | |
"completion_length": 290.484375, | |
"epoch": 1.48, | |
"grad_norm": 1.0703125, | |
"kl": 0.01583321939688176, | |
"learning_rate": 3.937945322191763e-06, | |
"loss": 0.0006, | |
"reward": 2.80034501850605, | |
"reward_std": 0.6433209720999002, | |
"rewards/accuracy_reward": 0.046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6977712381631136, | |
"rewards/reasoning_steps_reward": 0.6601562574505806, | |
"step": 93 | |
}, | |
{ | |
"completion_length": 288.1953125, | |
"epoch": 1.496, | |
"grad_norm": 0.8515625, | |
"kl": 0.014628544799052179, | |
"learning_rate": 3.9089929192382e-06, | |
"loss": 0.0006, | |
"reward": 2.8053995221853256, | |
"reward_std": 0.6814130581915379, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6920776072268685, | |
"rewards/reasoning_steps_reward": 0.627604166045785, | |
"step": 94 | |
}, | |
{ | |
"completion_length": 281.654296875, | |
"epoch": 1.512, | |
"grad_norm": 1.7578125, | |
"kl": 0.018830388551577926, | |
"learning_rate": 3.879760881208043e-06, | |
"loss": 0.0008, | |
"reward": 3.1405431628227234, | |
"reward_std": 0.6778986994177103, | |
"rewards/accuracy_reward": 0.126953125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7955456127723058, | |
"rewards/reasoning_steps_reward": 0.6269531361758709, | |
"step": 95 | |
}, | |
{ | |
"completion_length": 288.43359375, | |
"epoch": 1.528, | |
"grad_norm": 0.97265625, | |
"kl": 0.015546579379588366, | |
"learning_rate": 3.8502550096231325e-06, | |
"loss": 0.0006, | |
"reward": 2.9025785624980927, | |
"reward_std": 0.6252446379512548, | |
"rewards/accuracy_reward": 0.14453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6997310233612856, | |
"rewards/reasoning_steps_reward": 0.6588541865348816, | |
"step": 96 | |
}, | |
{ | |
"completion_length": 287.794921875, | |
"epoch": 1.544, | |
"grad_norm": 1.90625, | |
"kl": 0.01677697291597724, | |
"learning_rate": 3.82048116035155e-06, | |
"loss": 0.0007, | |
"reward": 2.9905193150043488, | |
"reward_std": 0.6695100143551826, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7668050316472849, | |
"rewards/reasoning_steps_reward": 0.5807291697710752, | |
"step": 97 | |
}, | |
{ | |
"completion_length": 290.875, | |
"epoch": 1.56, | |
"grad_norm": 0.93359375, | |
"kl": 0.017896617820952088, | |
"learning_rate": 3.790445242445432e-06, | |
"loss": 0.0007, | |
"reward": 3.0564729273319244, | |
"reward_std": 0.7583746667951345, | |
"rewards/accuracy_reward": 0.0625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.79595104791224, | |
"rewards/reasoning_steps_reward": 0.6061197966337204, | |
"step": 98 | |
}, | |
{ | |
"completion_length": 290.935546875, | |
"epoch": 1.576, | |
"grad_norm": 0.87890625, | |
"kl": 0.01600857445737347, | |
"learning_rate": 3.7601532169682363e-06, | |
"loss": 0.0006, | |
"reward": 3.207048572599888, | |
"reward_std": 0.7255587056279182, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8183651498208443, | |
"rewards/reasoning_steps_reward": 0.6464843899011612, | |
"step": 99 | |
}, | |
{ | |
"completion_length": 293.923828125, | |
"epoch": 1.592, | |
"grad_norm": 0.94921875, | |
"kl": 0.016008648555725813, | |
"learning_rate": 3.7296110958116845e-06, | |
"loss": 0.0006, | |
"reward": 3.213783323764801, | |
"reward_std": 0.7248476631939411, | |
"rewards/accuracy_reward": 0.05859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8229972099264463, | |
"rewards/reasoning_steps_reward": 0.6861979197710752, | |
"step": 100 | |
}, | |
{ | |
"completion_length": 293.623046875, | |
"epoch": 1.608, | |
"grad_norm": 1.890625, | |
"kl": 0.018201105995103717, | |
"learning_rate": 3.69882494050261e-06, | |
"loss": 0.0007, | |
"reward": 3.1282228976488113, | |
"reward_std": 0.7127013597637415, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.780154167364041, | |
"rewards/reasoning_steps_reward": 0.7154948115348816, | |
"step": 101 | |
}, | |
{ | |
"completion_length": 282.21875, | |
"epoch": 1.624, | |
"grad_norm": 1.0078125, | |
"kl": 0.01832750393077731, | |
"learning_rate": 3.6678008609999618e-06, | |
"loss": 0.0007, | |
"reward": 2.694710373878479, | |
"reward_std": 0.6631567031145096, | |
"rewards/accuracy_reward": 0.142578125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.630658664740622, | |
"rewards/reasoning_steps_reward": 0.6601562537252903, | |
"step": 102 | |
}, | |
{ | |
"completion_length": 286.57421875, | |
"epoch": 1.6400000000000001, | |
"grad_norm": 0.8359375, | |
"kl": 0.01992178033106029, | |
"learning_rate": 3.636545014482198e-06, | |
"loss": 0.0008, | |
"reward": 2.5292934477329254, | |
"reward_std": 0.6474510300904512, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.5965700279921293, | |
"rewards/reasoning_steps_reward": 0.6165364626795053, | |
"step": 103 | |
}, | |
{ | |
"completion_length": 292.59375, | |
"epoch": 1.6560000000000001, | |
"grad_norm": 0.9140625, | |
"kl": 0.01710453676059842, | |
"learning_rate": 3.6050636041252996e-06, | |
"loss": 0.0007, | |
"reward": 2.915451444685459, | |
"reward_std": 0.7090357206761837, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7202980586638054, | |
"rewards/reasoning_steps_reward": 0.6822916753590107, | |
"step": 104 | |
}, | |
{ | |
"completion_length": 289.3359375, | |
"epoch": 1.6720000000000002, | |
"grad_norm": 1.0859375, | |
"kl": 0.017357071512378752, | |
"learning_rate": 3.5733628778716645e-06, | |
"loss": 0.0007, | |
"reward": 3.073413372039795, | |
"reward_std": 0.6876837071031332, | |
"rewards/accuracy_reward": 0.119140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7621013515939316, | |
"rewards/reasoning_steps_reward": 0.6679687574505806, | |
"step": 105 | |
}, | |
{ | |
"completion_length": 291.736328125, | |
"epoch": 1.688, | |
"grad_norm": 0.96875, | |
"kl": 0.02157578180776909, | |
"learning_rate": 3.5414491271901073e-06, | |
"loss": 0.0009, | |
"reward": 2.819728344678879, | |
"reward_std": 0.5820730160921812, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6790587411572536, | |
"rewards/reasoning_steps_reward": 0.6595052182674408, | |
"step": 106 | |
}, | |
{ | |
"completion_length": 289.84375, | |
"epoch": 1.704, | |
"grad_norm": 0.90234375, | |
"kl": 0.0179019469069317, | |
"learning_rate": 3.5093286858272325e-06, | |
"loss": 0.0007, | |
"reward": 3.1114601120352745, | |
"reward_std": 0.6653738301247358, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7914936700835824, | |
"rewards/reasoning_steps_reward": 0.6549479365348816, | |
"step": 107 | |
}, | |
{ | |
"completion_length": 288.5234375, | |
"epoch": 1.72, | |
"grad_norm": 0.99609375, | |
"kl": 0.01925749407382682, | |
"learning_rate": 3.4770079285504053e-06, | |
"loss": 0.0008, | |
"reward": 2.79416061937809, | |
"reward_std": 0.7290520258247852, | |
"rewards/accuracy_reward": 0.078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6850760908176502, | |
"rewards/reasoning_steps_reward": 0.6608072835952044, | |
"step": 108 | |
}, | |
{ | |
"completion_length": 290.9921875, | |
"epoch": 1.736, | |
"grad_norm": 0.98046875, | |
"kl": 0.017092529160436243, | |
"learning_rate": 3.4444932698825904e-06, | |
"loss": 0.0007, | |
"reward": 3.1415600925683975, | |
"reward_std": 0.7225816715508699, | |
"rewards/accuracy_reward": 0.064453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7919783468047777, | |
"rewards/reasoning_steps_reward": 0.7011718712747097, | |
"step": 109 | |
}, | |
{ | |
"completion_length": 293.560546875, | |
"epoch": 1.752, | |
"grad_norm": 0.88671875, | |
"kl": 0.016746411798521876, | |
"learning_rate": 3.4117911628292944e-06, | |
"loss": 0.0007, | |
"reward": 2.7672165408730507, | |
"reward_std": 0.6844876762479544, | |
"rewards/accuracy_reward": 0.041015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.680652025466164, | |
"rewards/reasoning_steps_reward": 0.6842447929084301, | |
"step": 110 | |
}, | |
{ | |
"completion_length": 285.9921875, | |
"epoch": 1.768, | |
"grad_norm": 0.98828125, | |
"kl": 0.018680680135730654, | |
"learning_rate": 3.378908097597875e-06, | |
"loss": 0.0007, | |
"reward": 2.875435918569565, | |
"reward_std": 0.6584971006959677, | |
"rewards/accuracy_reward": 0.119140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.688730369011561, | |
"rewards/reasoning_steps_reward": 0.6901041753590107, | |
"step": 111 | |
}, | |
{ | |
"completion_length": 284.619140625, | |
"epoch": 1.784, | |
"grad_norm": 1.1796875, | |
"kl": 0.01885543600656092, | |
"learning_rate": 3.3458506003094626e-06, | |
"loss": 0.0008, | |
"reward": 3.2833499684929848, | |
"reward_std": 0.630975978448987, | |
"rewards/accuracy_reward": 0.15234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8016982388993105, | |
"rewards/reasoning_steps_reward": 0.7259114682674408, | |
"step": 112 | |
}, | |
{ | |
"completion_length": 294.26953125, | |
"epoch": 1.8, | |
"grad_norm": 0.859375, | |
"kl": 0.01702951017068699, | |
"learning_rate": 3.3126252317037616e-06, | |
"loss": 0.0007, | |
"reward": 3.0866554528474808, | |
"reward_std": 0.7168434467166662, | |
"rewards/accuracy_reward": 0.046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7908209301531315, | |
"rewards/reasoning_steps_reward": 0.6673177108168602, | |
"step": 113 | |
}, | |
{ | |
"completion_length": 288.5625, | |
"epoch": 1.8159999999999998, | |
"grad_norm": 31.875, | |
"kl": 0.15217732661403716, | |
"learning_rate": 3.2792385858369706e-06, | |
"loss": 0.0061, | |
"reward": 2.8756242617964745, | |
"reward_std": 0.6984493192285299, | |
"rewards/accuracy_reward": 0.056640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7315449056526026, | |
"rewards/reasoning_steps_reward": 0.6243489552289248, | |
"step": 114 | |
}, | |
{ | |
"completion_length": 283.037109375, | |
"epoch": 1.8319999999999999, | |
"grad_norm": 0.7578125, | |
"kl": 0.016328598430845886, | |
"learning_rate": 3.245697288773102e-06, | |
"loss": 0.0007, | |
"reward": 2.902892917394638, | |
"reward_std": 0.6528493817895651, | |
"rewards/accuracy_reward": 0.158203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6931084062283238, | |
"rewards/reasoning_steps_reward": 0.6653645895421505, | |
"step": 115 | |
}, | |
{ | |
"completion_length": 292.095703125, | |
"epoch": 1.8479999999999999, | |
"grad_norm": 0.9140625, | |
"kl": 0.018676706589758396, | |
"learning_rate": 3.2120079972689385e-06, | |
"loss": 0.0007, | |
"reward": 2.9004068598151207, | |
"reward_std": 0.7504412587732077, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7139811124652624, | |
"rewards/reasoning_steps_reward": 0.6705729253590107, | |
"step": 116 | |
}, | |
{ | |
"completion_length": 288.615234375, | |
"epoch": 1.8639999999999999, | |
"grad_norm": 1.1171875, | |
"kl": 0.020321853808127344, | |
"learning_rate": 3.1781773974529072e-06, | |
"loss": 0.0008, | |
"reward": 2.7037860229611397, | |
"reward_std": 0.6163357421755791, | |
"rewards/accuracy_reward": 0.09375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6365050650201738, | |
"rewards/reasoning_steps_reward": 0.7005208358168602, | |
"step": 117 | |
}, | |
{ | |
"completion_length": 290.62109375, | |
"epoch": 1.88, | |
"grad_norm": 1.09375, | |
"kl": 0.019285056594526395, | |
"learning_rate": 3.1442122034981187e-06, | |
"loss": 0.0008, | |
"reward": 2.6533412411808968, | |
"reward_std": 0.6223033964633942, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6294557445993026, | |
"rewards/reasoning_steps_reward": 0.6595052145421505, | |
"step": 118 | |
}, | |
{ | |
"completion_length": 290.82421875, | |
"epoch": 1.896, | |
"grad_norm": 1.1953125, | |
"kl": 0.017033788317348808, | |
"learning_rate": 3.110119156289841e-06, | |
"loss": 0.0007, | |
"reward": 3.352183550596237, | |
"reward_std": 0.6941560469567776, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8489483365168174, | |
"rewards/reasoning_steps_reward": 0.7213541753590107, | |
"step": 119 | |
}, | |
{ | |
"completion_length": 283.248046875, | |
"epoch": 1.912, | |
"grad_norm": 2.015625, | |
"kl": 0.024006142339203507, | |
"learning_rate": 3.075905022087675e-06, | |
"loss": 0.001, | |
"reward": 2.9336234778165817, | |
"reward_std": 0.649795226752758, | |
"rewards/accuracy_reward": 0.15625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6996626891195774, | |
"rewards/reasoning_steps_reward": 0.6783854253590107, | |
"step": 120 | |
}, | |
{ | |
"completion_length": 282.849609375, | |
"epoch": 1.928, | |
"grad_norm": 1.109375, | |
"kl": 0.02002483472460881, | |
"learning_rate": 3.0415765911826916e-06, | |
"loss": 0.0008, | |
"reward": 2.675464451313019, | |
"reward_std": 0.6967358216643333, | |
"rewards/accuracy_reward": 0.119140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6331409340103468, | |
"rewards/reasoning_steps_reward": 0.6569010391831398, | |
"step": 121 | |
}, | |
{ | |
"completion_length": 290.783203125, | |
"epoch": 1.944, | |
"grad_norm": 1.1171875, | |
"kl": 0.019037541293073446, | |
"learning_rate": 3.0071406765498003e-06, | |
"loss": 0.0008, | |
"reward": 3.0036216378211975, | |
"reward_std": 0.6973935160785913, | |
"rewards/accuracy_reward": 0.080078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7529433167849978, | |
"rewards/reasoning_steps_reward": 0.6647135354578495, | |
"step": 122 | |
}, | |
{ | |
"completion_length": 283.216796875, | |
"epoch": 1.96, | |
"grad_norm": 0.859375, | |
"kl": 0.017990577791351825, | |
"learning_rate": 2.9726041124956128e-06, | |
"loss": 0.0007, | |
"reward": 2.773143321275711, | |
"reward_std": 0.714199235662818, | |
"rewards/accuracy_reward": 0.111328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6735130585730076, | |
"rewards/reasoning_steps_reward": 0.6412760522216558, | |
"step": 123 | |
}, | |
{ | |
"completion_length": 287.9765625, | |
"epoch": 1.976, | |
"grad_norm": 0.859375, | |
"kl": 0.017044205858837813, | |
"learning_rate": 2.9379737533020812e-06, | |
"loss": 0.0007, | |
"reward": 3.244216948747635, | |
"reward_std": 0.6938743200153112, | |
"rewards/accuracy_reward": 0.08984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8157806489616632, | |
"rewards/reasoning_steps_reward": 0.7070312425494194, | |
"step": 124 | |
}, | |
{ | |
"completion_length": 290.36328125, | |
"epoch": 1.992, | |
"grad_norm": 0.9765625, | |
"kl": 0.017912040289957076, | |
"learning_rate": 2.9032564718661606e-06, | |
"loss": 0.0007, | |
"reward": 2.990898907184601, | |
"reward_std": 0.6811724901199341, | |
"rewards/accuracy_reward": 0.05859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7619402616595229, | |
"rewards/reasoning_steps_reward": 0.6464843824505806, | |
"step": 125 | |
}, | |
{ | |
"completion_length": 292.08203125, | |
"epoch": 2.0, | |
"grad_norm": 0.609375, | |
"kl": 0.017485147807747126, | |
"learning_rate": 2.8684591583357863e-06, | |
"loss": 0.0003, | |
"reward": 3.365106463432312, | |
"reward_std": 0.690997276455164, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8508687826494375, | |
"rewards/reasoning_steps_reward": 0.7304687425494194, | |
"step": 126 | |
}, | |
{ | |
"completion_length": 274.076171875, | |
"epoch": 2.016, | |
"grad_norm": 0.9453125, | |
"kl": 0.020696480583865196, | |
"learning_rate": 2.8335887187424225e-06, | |
"loss": 0.0008, | |
"reward": 3.0040955543518066, | |
"reward_std": 0.6572606600821018, | |
"rewards/accuracy_reward": 0.248046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7068773318702976, | |
"rewards/reasoning_steps_reward": 0.6354166716337204, | |
"step": 127 | |
}, | |
{ | |
"completion_length": 291.470703125, | |
"epoch": 2.032, | |
"grad_norm": 0.89453125, | |
"kl": 0.018271160661242902, | |
"learning_rate": 2.7986520736304632e-06, | |
"loss": 0.0007, | |
"reward": 2.8309315219521523, | |
"reward_std": 0.6785434670746326, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.688218497360746, | |
"rewards/reasoning_steps_reward": 0.694010429084301, | |
"step": 128 | |
}, | |
{ | |
"completion_length": 295.984375, | |
"epoch": 2.048, | |
"grad_norm": 1.03125, | |
"kl": 0.018810921494150534, | |
"learning_rate": 2.7636561566837463e-06, | |
"loss": 0.0008, | |
"reward": 3.07464836537838, | |
"reward_std": 0.7456005457788706, | |
"rewards/accuracy_reward": 0.037109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7963671404868364, | |
"rewards/reasoning_steps_reward": 0.6484375149011612, | |
"step": 129 | |
}, | |
{ | |
"completion_length": 281.150390625, | |
"epoch": 2.064, | |
"grad_norm": 0.98046875, | |
"kl": 0.020406899857334793, | |
"learning_rate": 2.728607913349464e-06, | |
"loss": 0.0008, | |
"reward": 2.931031860411167, | |
"reward_std": 0.6928635407239199, | |
"rewards/accuracy_reward": 0.15234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6862120016788443, | |
"rewards/reasoning_steps_reward": 0.7200520895421505, | |
"step": 130 | |
}, | |
{ | |
"completion_length": 290.275390625, | |
"epoch": 2.08, | |
"grad_norm": 0.8203125, | |
"kl": 0.018310176266822964, | |
"learning_rate": 2.6935142994597407e-06, | |
"loss": 0.0007, | |
"reward": 3.099424757063389, | |
"reward_std": 0.6812999919056892, | |
"rewards/accuracy_reward": 0.078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7933412299801906, | |
"rewards/reasoning_steps_reward": 0.6412760503590107, | |
"step": 131 | |
}, | |
{ | |
"completion_length": 291.0234375, | |
"epoch": 2.096, | |
"grad_norm": 0.96484375, | |
"kl": 0.01775828906102106, | |
"learning_rate": 2.6583822798511428e-06, | |
"loss": 0.0007, | |
"reward": 3.313634306192398, | |
"reward_std": 0.6808726880699396, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8779822603488961, | |
"rewards/reasoning_steps_reward": 0.5957031287252903, | |
"step": 132 | |
}, | |
{ | |
"completion_length": 285.263671875, | |
"epoch": 2.112, | |
"grad_norm": 0.84375, | |
"kl": 0.018688918324187398, | |
"learning_rate": 2.623218826982411e-06, | |
"loss": 0.0007, | |
"reward": 2.7654543220996857, | |
"reward_std": 0.6947140172123909, | |
"rewards/accuracy_reward": 0.15234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.640568091844519, | |
"rewards/reasoning_steps_reward": 0.69140625, | |
"step": 133 | |
}, | |
{ | |
"completion_length": 282.830078125, | |
"epoch": 2.128, | |
"grad_norm": 0.94140625, | |
"kl": 0.021032241464126855, | |
"learning_rate": 2.5880309195506714e-06, | |
"loss": 0.0008, | |
"reward": 2.8315402641892433, | |
"reward_std": 0.6945422478020191, | |
"rewards/accuracy_reward": 0.158203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.641546401505669, | |
"rewards/reasoning_steps_reward": 0.748697929084301, | |
"step": 134 | |
}, | |
{ | |
"completion_length": 292.361328125, | |
"epoch": 2.144, | |
"grad_norm": 0.890625, | |
"kl": 0.018107893760316074, | |
"learning_rate": 2.552825541106414e-06, | |
"loss": 0.0007, | |
"reward": 3.0376425981521606, | |
"reward_std": 0.7193902563303709, | |
"rewards/accuracy_reward": 0.029296875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7757853666941324, | |
"rewards/reasoning_steps_reward": 0.6809895876795053, | |
"step": 135 | |
}, | |
{ | |
"completion_length": 287.232421875, | |
"epoch": 2.16, | |
"grad_norm": 0.85546875, | |
"kl": 0.018850211054086685, | |
"learning_rate": 2.517609678667501e-06, | |
"loss": 0.0008, | |
"reward": 2.687412917613983, | |
"reward_std": 0.6682394985109568, | |
"rewards/accuracy_reward": 0.08984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6455872931207219, | |
"rewards/reasoning_steps_reward": 0.6608072966337204, | |
"step": 136 | |
}, | |
{ | |
"completion_length": 290.71484375, | |
"epoch": 2.176, | |
"grad_norm": 0.859375, | |
"kl": 0.017231477366294712, | |
"learning_rate": 2.4823903213324995e-06, | |
"loss": 0.0007, | |
"reward": 3.0338680148124695, | |
"reward_std": 0.6302597746253014, | |
"rewards/accuracy_reward": 0.09765625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7382858718434969, | |
"rewards/reasoning_steps_reward": 0.7213541753590107, | |
"step": 137 | |
}, | |
{ | |
"completion_length": 289.5703125, | |
"epoch": 2.192, | |
"grad_norm": 0.796875, | |
"kl": 0.01626200118334964, | |
"learning_rate": 2.447174458893587e-06, | |
"loss": 0.0007, | |
"reward": 2.984310381114483, | |
"reward_std": 0.6622797809541225, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7369576270381609, | |
"rewards/reasoning_steps_reward": 0.6718750167638063, | |
"step": 138 | |
}, | |
{ | |
"completion_length": 287.76953125, | |
"epoch": 2.208, | |
"grad_norm": 0.75390625, | |
"kl": 0.01649257366079837, | |
"learning_rate": 2.4119690804493285e-06, | |
"loss": 0.0007, | |
"reward": 3.0554041862487793, | |
"reward_std": 0.7084929272532463, | |
"rewards/accuracy_reward": 0.107421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7320097194363674, | |
"rewards/reasoning_steps_reward": 0.7519531361758709, | |
"step": 139 | |
}, | |
{ | |
"completion_length": 294.828125, | |
"epoch": 2.224, | |
"grad_norm": 0.984375, | |
"kl": 0.018743149645160884, | |
"learning_rate": 2.376781173017589e-06, | |
"loss": 0.0007, | |
"reward": 2.9738914221525192, | |
"reward_std": 0.6525749433785677, | |
"rewards/accuracy_reward": 0.041015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7449863621344169, | |
"rewards/reasoning_steps_reward": 0.6979166753590107, | |
"step": 140 | |
}, | |
{ | |
"completion_length": 289.109375, | |
"epoch": 2.24, | |
"grad_norm": 0.98046875, | |
"kl": 0.022565504419617355, | |
"learning_rate": 2.3416177201488585e-06, | |
"loss": 0.0009, | |
"reward": 3.2985419929027557, | |
"reward_std": 0.6833065822720528, | |
"rewards/accuracy_reward": 0.099609375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8430035648246607, | |
"rewards/reasoning_steps_reward": 0.6699218768626451, | |
"step": 141 | |
}, | |
{ | |
"completion_length": 288.521484375, | |
"epoch": 2.2560000000000002, | |
"grad_norm": 1.015625, | |
"kl": 0.020633232838008553, | |
"learning_rate": 2.3064857005402606e-06, | |
"loss": 0.0008, | |
"reward": 3.1613398045301437, | |
"reward_std": 0.7222296446561813, | |
"rewards/accuracy_reward": 0.095703125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7796913882096609, | |
"rewards/reasoning_steps_reward": 0.7265624962747097, | |
"step": 142 | |
}, | |
{ | |
"completion_length": 279.908203125, | |
"epoch": 2.2720000000000002, | |
"grad_norm": 1.078125, | |
"kl": 0.02265268244082108, | |
"learning_rate": 2.2713920866505364e-06, | |
"loss": 0.0009, | |
"reward": 2.9546066522598267, | |
"reward_std": 0.681933119893074, | |
"rewards/accuracy_reward": 0.193359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7051379711677631, | |
"rewards/reasoning_steps_reward": 0.6458333414047956, | |
"step": 143 | |
}, | |
{ | |
"completion_length": 288.0, | |
"epoch": 2.288, | |
"grad_norm": 0.875, | |
"kl": 0.01793542131781578, | |
"learning_rate": 2.236343843316254e-06, | |
"loss": 0.0007, | |
"reward": 2.790590211749077, | |
"reward_std": 0.651448430493474, | |
"rewards/accuracy_reward": 0.05078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6706481222063303, | |
"rewards/reasoning_steps_reward": 0.7278645765036345, | |
"step": 144 | |
}, | |
{ | |
"completion_length": 285.646484375, | |
"epoch": 2.304, | |
"grad_norm": 0.9609375, | |
"kl": 0.018404830596409738, | |
"learning_rate": 2.201347926369537e-06, | |
"loss": 0.0007, | |
"reward": 2.710278756916523, | |
"reward_std": 0.6365776527673006, | |
"rewards/accuracy_reward": 0.125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6241293720280131, | |
"rewards/reasoning_steps_reward": 0.7128906175494194, | |
"step": 145 | |
}, | |
{ | |
"completion_length": 295.73046875, | |
"epoch": 2.32, | |
"grad_norm": 0.9921875, | |
"kl": 0.021149699110537767, | |
"learning_rate": 2.166411281257578e-06, | |
"loss": 0.0008, | |
"reward": 3.2047041803598404, | |
"reward_std": 0.7344950754195452, | |
"rewards/accuracy_reward": 0.03125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8334256897990903, | |
"rewards/reasoning_steps_reward": 0.6731770820915699, | |
"step": 146 | |
}, | |
{ | |
"completion_length": 288.49609375, | |
"epoch": 2.336, | |
"grad_norm": 1.921875, | |
"kl": 0.019101842306554317, | |
"learning_rate": 2.1315408416642145e-06, | |
"loss": 0.0008, | |
"reward": 2.9557630866765976, | |
"reward_std": 0.6881984118372202, | |
"rewards/accuracy_reward": 0.111328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7087786557773749, | |
"rewards/reasoning_steps_reward": 0.7180989496409893, | |
"step": 147 | |
}, | |
{ | |
"completion_length": 283.1796875, | |
"epoch": 2.352, | |
"grad_norm": 0.82421875, | |
"kl": 0.01961760746780783, | |
"learning_rate": 2.09674352813384e-06, | |
"loss": 0.0008, | |
"reward": 3.1119301542639732, | |
"reward_std": 0.5922442562878132, | |
"rewards/accuracy_reward": 0.150390625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7777614261334141, | |
"rewards/reasoning_steps_reward": 0.6282552145421505, | |
"step": 148 | |
}, | |
{ | |
"completion_length": 284.3828125, | |
"epoch": 2.368, | |
"grad_norm": 0.8671875, | |
"kl": 0.022024919569958, | |
"learning_rate": 2.062026246697919e-06, | |
"loss": 0.0009, | |
"reward": 3.0898532271385193, | |
"reward_std": 0.6860612127929926, | |
"rewards/accuracy_reward": 0.146484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7363313144693772, | |
"rewards/reasoning_steps_reward": 0.7343750149011612, | |
"step": 149 | |
}, | |
{ | |
"completion_length": 286.181640625, | |
"epoch": 2.384, | |
"grad_norm": 1.1484375, | |
"kl": 0.01775654760422185, | |
"learning_rate": 2.0273958875043877e-06, | |
"loss": 0.0007, | |
"reward": 2.974420055747032, | |
"reward_std": 0.6679348535835743, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7212910537297527, | |
"rewards/reasoning_steps_reward": 0.6875000037252903, | |
"step": 150 | |
}, | |
{ | |
"completion_length": 276.9296875, | |
"epoch": 2.4, | |
"grad_norm": 1.03125, | |
"kl": 0.02118692739168182, | |
"learning_rate": 1.992859323450201e-06, | |
"loss": 0.0008, | |
"reward": 2.724317155778408, | |
"reward_std": 0.6507551912218332, | |
"rewards/accuracy_reward": 0.1875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6227324477707347, | |
"rewards/reasoning_steps_reward": 0.6686198022216558, | |
"step": 151 | |
}, | |
{ | |
"completion_length": 285.744140625, | |
"epoch": 2.416, | |
"grad_norm": 4.90625, | |
"kl": 0.02042768005048856, | |
"learning_rate": 1.958423408817309e-06, | |
"loss": 0.0008, | |
"reward": 3.1025044322013855, | |
"reward_std": 0.6402757167816162, | |
"rewards/accuracy_reward": 0.1484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7509650103747845, | |
"rewards/reasoning_steps_reward": 0.7011718768626451, | |
"step": 152 | |
}, | |
{ | |
"completion_length": 286.40234375, | |
"epoch": 2.432, | |
"grad_norm": 1.046875, | |
"kl": 0.022553854738362134, | |
"learning_rate": 1.924094977912326e-06, | |
"loss": 0.0009, | |
"reward": 2.981735587120056, | |
"reward_std": 0.7370939962565899, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7332781835769614, | |
"rewards/reasoning_steps_reward": 0.6686198078095913, | |
"step": 153 | |
}, | |
{ | |
"completion_length": 288.896484375, | |
"epoch": 2.448, | |
"grad_norm": 0.83203125, | |
"kl": 0.019592860713601112, | |
"learning_rate": 1.8898808437101598e-06, | |
"loss": 0.0008, | |
"reward": 2.95571531355381, | |
"reward_std": 0.7355391271412373, | |
"rewards/accuracy_reward": 0.068359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7332853010545174, | |
"rewards/reasoning_steps_reward": 0.6875000037252903, | |
"step": 154 | |
}, | |
{ | |
"completion_length": 294.271484375, | |
"epoch": 2.464, | |
"grad_norm": 0.94140625, | |
"kl": 0.019800655485596508, | |
"learning_rate": 1.8557877965018817e-06, | |
"loss": 0.0008, | |
"reward": 3.0556194335222244, | |
"reward_std": 0.7033564373850822, | |
"rewards/accuracy_reward": 0.044921875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7715779819215337, | |
"rewards/reasoning_steps_reward": 0.6959635429084301, | |
"step": 155 | |
}, | |
{ | |
"completion_length": 294.31640625, | |
"epoch": 2.48, | |
"grad_norm": 0.859375, | |
"kl": 0.018254225375130773, | |
"learning_rate": 1.8218226025470934e-06, | |
"loss": 0.0007, | |
"reward": 3.604881629347801, | |
"reward_std": 0.715133111923933, | |
"rewards/accuracy_reward": 0.052734375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.9479379473874966, | |
"rewards/reasoning_steps_reward": 0.7083333265036345, | |
"step": 156 | |
}, | |
{ | |
"completion_length": 289.25390625, | |
"epoch": 2.496, | |
"grad_norm": 0.83984375, | |
"kl": 0.017004019115120173, | |
"learning_rate": 1.7879920027310621e-06, | |
"loss": 0.0007, | |
"reward": 3.051852695643902, | |
"reward_std": 0.7096979664638638, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7540463662395874, | |
"rewards/reasoning_steps_reward": 0.7154948078095913, | |
"step": 157 | |
}, | |
{ | |
"completion_length": 290.005859375, | |
"epoch": 2.512, | |
"grad_norm": 0.9765625, | |
"kl": 0.019147633225657046, | |
"learning_rate": 1.7543027112268994e-06, | |
"loss": 0.0008, | |
"reward": 2.991758108139038, | |
"reward_std": 0.684099368751049, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7318446912492315, | |
"rewards/reasoning_steps_reward": 0.6927083320915699, | |
"step": 158 | |
}, | |
{ | |
"completion_length": 280.697265625, | |
"epoch": 2.528, | |
"grad_norm": 1.1171875, | |
"kl": 0.020928668964188546, | |
"learning_rate": 1.7207614141630304e-06, | |
"loss": 0.0008, | |
"reward": 2.596983939409256, | |
"reward_std": 0.6806027349084616, | |
"rewards/accuracy_reward": 0.12890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6024234692255656, | |
"rewards/reasoning_steps_reward": 0.6608072984963655, | |
"step": 159 | |
}, | |
{ | |
"completion_length": 285.546875, | |
"epoch": 2.544, | |
"grad_norm": 1.09375, | |
"kl": 0.022152581717818975, | |
"learning_rate": 1.6873747682962393e-06, | |
"loss": 0.0009, | |
"reward": 2.8694690242409706, | |
"reward_std": 0.6588537991046906, | |
"rewards/accuracy_reward": 0.126953125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6882605010954043, | |
"rewards/reasoning_steps_reward": 0.6777343805879354, | |
"step": 160 | |
}, | |
{ | |
"completion_length": 283.427734375, | |
"epoch": 2.56, | |
"grad_norm": 0.94140625, | |
"kl": 0.020902132673654705, | |
"learning_rate": 1.6541493996905378e-06, | |
"loss": 0.0008, | |
"reward": 3.1272382587194443, | |
"reward_std": 0.6674788426607847, | |
"rewards/accuracy_reward": 0.12890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7581245402495066, | |
"rewards/reasoning_steps_reward": 0.7239583395421505, | |
"step": 161 | |
}, | |
{ | |
"completion_length": 286.408203125, | |
"epoch": 2.576, | |
"grad_norm": 0.953125, | |
"kl": 0.022079574409872293, | |
"learning_rate": 1.6210919024021258e-06, | |
"loss": 0.0009, | |
"reward": 2.9151505902409554, | |
"reward_std": 0.7153513710945845, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7115172014261285, | |
"rewards/reasoning_steps_reward": 0.6673177219927311, | |
"step": 162 | |
}, | |
{ | |
"completion_length": 291.72265625, | |
"epoch": 2.592, | |
"grad_norm": 0.875, | |
"kl": 0.017877445730846375, | |
"learning_rate": 1.588208837170706e-06, | |
"loss": 0.0007, | |
"reward": 2.937485493719578, | |
"reward_std": 0.7016174159944057, | |
"rewards/accuracy_reward": 0.056640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.734153147165974, | |
"rewards/reasoning_steps_reward": 0.6783854216337204, | |
"step": 163 | |
}, | |
{ | |
"completion_length": 289.837890625, | |
"epoch": 2.608, | |
"grad_norm": 1.015625, | |
"kl": 0.023135888564866036, | |
"learning_rate": 1.55550673011741e-06, | |
"loss": 0.0009, | |
"reward": 3.3134661614894867, | |
"reward_std": 0.6674238592386246, | |
"rewards/accuracy_reward": 0.09765625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.824106777086854, | |
"rewards/reasoning_steps_reward": 0.7434895969927311, | |
"step": 164 | |
}, | |
{ | |
"completion_length": 285.73828125, | |
"epoch": 2.624, | |
"grad_norm": 0.8828125, | |
"kl": 0.017427237355150282, | |
"learning_rate": 1.522992071449595e-06, | |
"loss": 0.0007, | |
"reward": 2.761025607585907, | |
"reward_std": 0.5951798930764198, | |
"rewards/accuracy_reward": 0.115234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6484234320620695, | |
"rewards/reasoning_steps_reward": 0.7005208227783442, | |
"step": 165 | |
}, | |
{ | |
"completion_length": 292.478515625, | |
"epoch": 2.64, | |
"grad_norm": 0.90625, | |
"kl": 0.02133324311580509, | |
"learning_rate": 1.4906713141727677e-06, | |
"loss": 0.0009, | |
"reward": 2.930042363703251, | |
"reward_std": 0.6626697592437267, | |
"rewards/accuracy_reward": 0.0625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7227745447307825, | |
"rewards/reasoning_steps_reward": 0.6992187462747097, | |
"step": 166 | |
}, | |
{ | |
"completion_length": 279.865234375, | |
"epoch": 2.656, | |
"grad_norm": 1.09375, | |
"kl": 0.024935539229772985, | |
"learning_rate": 1.4585508728098935e-06, | |
"loss": 0.001, | |
"reward": 2.825145460665226, | |
"reward_std": 0.7417711336165667, | |
"rewards/accuracy_reward": 0.181640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6630693133920431, | |
"rewards/reasoning_steps_reward": 0.6542968759313226, | |
"step": 167 | |
}, | |
{ | |
"completion_length": 293.625, | |
"epoch": 2.672, | |
"grad_norm": 1.0234375, | |
"kl": 0.018147769616916776, | |
"learning_rate": 1.4266371221283367e-06, | |
"loss": 0.0007, | |
"reward": 2.7056074738502502, | |
"reward_std": 0.6061353217810392, | |
"rewards/accuracy_reward": 0.02734375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6455757624159256, | |
"rewards/reasoning_steps_reward": 0.7415364719927311, | |
"step": 168 | |
}, | |
{ | |
"completion_length": 293.119140625, | |
"epoch": 2.6879999999999997, | |
"grad_norm": 1.046875, | |
"kl": 0.019442370510660112, | |
"learning_rate": 1.3949363958747004e-06, | |
"loss": 0.0008, | |
"reward": 3.226225107908249, | |
"reward_std": 0.7255453541874886, | |
"rewards/accuracy_reward": 0.01953125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8371271109208465, | |
"rewards/reasoning_steps_reward": 0.6953125037252903, | |
"step": 169 | |
}, | |
{ | |
"completion_length": 291.013671875, | |
"epoch": 2.7039999999999997, | |
"grad_norm": 0.8359375, | |
"kl": 0.01949766167672351, | |
"learning_rate": 1.363454985517803e-06, | |
"loss": 0.0008, | |
"reward": 2.700456887483597, | |
"reward_std": 0.7572273463010788, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6375654794586202, | |
"rewards/reasoning_steps_reward": 0.6744791809469461, | |
"step": 170 | |
}, | |
{ | |
"completion_length": 292.544921875, | |
"epoch": 2.7199999999999998, | |
"grad_norm": 0.9453125, | |
"kl": 0.020666938507929444, | |
"learning_rate": 1.3321991390000382e-06, | |
"loss": 0.0008, | |
"reward": 2.9996937662363052, | |
"reward_std": 0.653770299628377, | |
"rewards/accuracy_reward": 0.044921875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7529360945336521, | |
"rewards/reasoning_steps_reward": 0.6959635615348816, | |
"step": 171 | |
}, | |
{ | |
"completion_length": 289.091796875, | |
"epoch": 2.7359999999999998, | |
"grad_norm": 0.98828125, | |
"kl": 0.021086076740175486, | |
"learning_rate": 1.301175059497391e-06, | |
"loss": 0.0008, | |
"reward": 2.95357333868742, | |
"reward_std": 0.6372328288853168, | |
"rewards/accuracy_reward": 0.13671875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.701104310962061, | |
"rewards/reasoning_steps_reward": 0.7135416828095913, | |
"step": 172 | |
}, | |
{ | |
"completion_length": 281.466796875, | |
"epoch": 2.752, | |
"grad_norm": 0.94921875, | |
"kl": 0.02050035016145557, | |
"learning_rate": 1.270388904188316e-06, | |
"loss": 0.0008, | |
"reward": 2.7741658687591553, | |
"reward_std": 0.7323318216949701, | |
"rewards/accuracy_reward": 0.15625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6441229923317829, | |
"rewards/reasoning_steps_reward": 0.6855468563735485, | |
"step": 173 | |
}, | |
{ | |
"completion_length": 288.654296875, | |
"epoch": 2.768, | |
"grad_norm": 0.953125, | |
"kl": 0.018003857927396894, | |
"learning_rate": 1.2398467830317635e-06, | |
"loss": 0.0007, | |
"reward": 2.823809191584587, | |
"reward_std": 0.6888855472207069, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6687002858767906, | |
"rewards/reasoning_steps_reward": 0.7083333376795053, | |
"step": 174 | |
}, | |
{ | |
"completion_length": 295.314453125, | |
"epoch": 2.784, | |
"grad_norm": 0.86328125, | |
"kl": 0.018295871559530497, | |
"learning_rate": 1.2095547575545685e-06, | |
"loss": 0.0007, | |
"reward": 3.150137387216091, | |
"reward_std": 0.6382329538464546, | |
"rewards/accuracy_reward": 0.041015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7963565507282814, | |
"rewards/reasoning_steps_reward": 0.7200520895421505, | |
"step": 175 | |
}, | |
{ | |
"completion_length": 288.701171875, | |
"epoch": 2.8, | |
"grad_norm": 1.0234375, | |
"kl": 0.020645066746510565, | |
"learning_rate": 1.1795188396484505e-06, | |
"loss": 0.0008, | |
"reward": 2.7497966438531876, | |
"reward_std": 0.696668054908514, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6516249105334282, | |
"rewards/reasoning_steps_reward": 0.6914062425494194, | |
"step": 176 | |
}, | |
{ | |
"completion_length": 285.642578125, | |
"epoch": 2.816, | |
"grad_norm": 1.1484375, | |
"kl": 0.01807958845165558, | |
"learning_rate": 1.149744990376868e-06, | |
"loss": 0.0007, | |
"reward": 2.925790064036846, | |
"reward_std": 0.6442860681563616, | |
"rewards/accuracy_reward": 0.125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6968345294396082, | |
"rewards/reasoning_steps_reward": 0.7102864719927311, | |
"step": 177 | |
}, | |
{ | |
"completion_length": 294.732421875, | |
"epoch": 2.832, | |
"grad_norm": 2.765625, | |
"kl": 0.020244878192897886, | |
"learning_rate": 1.1202391187919575e-06, | |
"loss": 0.0008, | |
"reward": 3.2739059031009674, | |
"reward_std": 0.6519978456199169, | |
"rewards/accuracy_reward": 0.0546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8315363439420859, | |
"rewards/reasoning_steps_reward": 0.7246093861758709, | |
"step": 178 | |
}, | |
{ | |
"completion_length": 287.73828125, | |
"epoch": 2.848, | |
"grad_norm": 1.09375, | |
"kl": 0.021688284177798778, | |
"learning_rate": 1.0910070807618012e-06, | |
"loss": 0.0009, | |
"reward": 2.786106266081333, | |
"reward_std": 0.676231924444437, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6424607516576847, | |
"rewards/reasoning_steps_reward": 0.7552083432674408, | |
"step": 179 | |
}, | |
{ | |
"completion_length": 286.52734375, | |
"epoch": 2.864, | |
"grad_norm": 1.0546875, | |
"kl": 0.02280406339559704, | |
"learning_rate": 1.062054677808238e-06, | |
"loss": 0.0009, | |
"reward": 3.1157227605581284, | |
"reward_std": 0.6361609604209661, | |
"rewards/accuracy_reward": 0.099609375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.764702707529068, | |
"rewards/reasoning_steps_reward": 0.7220052257180214, | |
"step": 180 | |
}, | |
{ | |
"completion_length": 290.2109375, | |
"epoch": 2.88, | |
"grad_norm": 1.1328125, | |
"kl": 0.028993367042858154, | |
"learning_rate": 1.033387655955471e-06, | |
"loss": 0.0012, | |
"reward": 3.221103757619858, | |
"reward_std": 0.5982426293194294, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8156717581053575, | |
"rewards/reasoning_steps_reward": 0.6998698078095913, | |
"step": 181 | |
}, | |
{ | |
"completion_length": 289.28125, | |
"epoch": 2.896, | |
"grad_norm": 1.0390625, | |
"kl": 0.02006814256310463, | |
"learning_rate": 1.0050117045896889e-06, | |
"loss": 0.0008, | |
"reward": 2.751469612121582, | |
"reward_std": 0.7198650874197483, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6480593029409647, | |
"rewards/reasoning_steps_reward": 0.6940104179084301, | |
"step": 182 | |
}, | |
{ | |
"completion_length": 292.52734375, | |
"epoch": 2.912, | |
"grad_norm": 1.4140625, | |
"kl": 0.022621202806476504, | |
"learning_rate": 9.769324553299174e-07, | |
"loss": 0.0009, | |
"reward": 3.1886699497699738, | |
"reward_std": 0.7633016854524612, | |
"rewards/accuracy_reward": 0.12890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7927076746709645, | |
"rewards/reasoning_steps_reward": 0.6816406436264515, | |
"step": 183 | |
}, | |
{ | |
"completion_length": 288.79296875, | |
"epoch": 2.928, | |
"grad_norm": 0.98046875, | |
"kl": 0.021405818057246506, | |
"learning_rate": 9.491554809103509e-07, | |
"loss": 0.0009, | |
"reward": 2.6857599690556526, | |
"reward_std": 0.6840799152851105, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6376578211784363, | |
"rewards/reasoning_steps_reward": 0.6888020988553762, | |
"step": 184 | |
}, | |
{ | |
"completion_length": 290.58203125, | |
"epoch": 2.944, | |
"grad_norm": 0.99609375, | |
"kl": 0.020275956427212805, | |
"learning_rate": 9.216862940743529e-07, | |
"loss": 0.0008, | |
"reward": 2.757513716816902, | |
"reward_std": 0.602615574374795, | |
"rewards/accuracy_reward": 0.115234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.636836152523756, | |
"rewards/reasoning_steps_reward": 0.7317708358168602, | |
"step": 185 | |
}, | |
{ | |
"completion_length": 286.619140625, | |
"epoch": 2.96, | |
"grad_norm": 0.984375, | |
"kl": 0.019758898008149117, | |
"learning_rate": 8.945303464803833e-07, | |
"loss": 0.0008, | |
"reward": 3.0790238082408905, | |
"reward_std": 0.5770421475172043, | |
"rewards/accuracy_reward": 0.12109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7574610461791357, | |
"rewards/reasoning_steps_reward": 0.685546888038516, | |
"step": 186 | |
}, | |
{ | |
"completion_length": 286.671875, | |
"epoch": 2.976, | |
"grad_norm": 0.9765625, | |
"kl": 0.02084403787739575, | |
"learning_rate": 8.676930276200294e-07, | |
"loss": 0.0008, | |
"reward": 3.0736390501260757, | |
"reward_std": 0.6433412320911884, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7658657878637314, | |
"rewards/reasoning_steps_reward": 0.7037760429084301, | |
"step": 187 | |
}, | |
{ | |
"completion_length": 284.01171875, | |
"epoch": 2.992, | |
"grad_norm": 1.0, | |
"kl": 0.019843781657982618, | |
"learning_rate": 8.411796637483852e-07, | |
"loss": 0.0008, | |
"reward": 2.9655564725399017, | |
"reward_std": 0.6882054135203362, | |
"rewards/accuracy_reward": 0.107421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7163833907494942, | |
"rewards/reasoning_steps_reward": 0.7089843675494194, | |
"step": 188 | |
}, | |
{ | |
"completion_length": 290.765625, | |
"epoch": 3.0, | |
"grad_norm": 0.69921875, | |
"kl": 0.017977926647290587, | |
"learning_rate": 8.149955168269822e-07, | |
"loss": 0.0004, | |
"reward": 2.5494449138641357, | |
"reward_std": 0.6191319935023785, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.5538079980760813, | |
"rewards/reasoning_steps_reward": 0.7864583507180214, | |
"step": 189 | |
}, | |
{ | |
"completion_length": 289.603515625, | |
"epoch": 3.016, | |
"grad_norm": 0.87890625, | |
"kl": 0.019213943742215633, | |
"learning_rate": 7.891457834794711e-07, | |
"loss": 0.0008, | |
"reward": 3.084651954472065, | |
"reward_std": 0.6362812034785748, | |
"rewards/accuracy_reward": 0.09375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7643284083654484, | |
"rewards/reasoning_steps_reward": 0.6979166697710752, | |
"step": 190 | |
}, | |
{ | |
"completion_length": 285.248046875, | |
"epoch": 3.032, | |
"grad_norm": 1.03125, | |
"kl": 0.020919292815960944, | |
"learning_rate": 7.636355939602824e-07, | |
"loss": 0.0008, | |
"reward": 2.85429210960865, | |
"reward_std": 0.6567655950784683, | |
"rewards/accuracy_reward": 0.078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6931841528664032, | |
"rewards/reasoning_steps_reward": 0.6966145969927311, | |
"step": 191 | |
}, | |
{ | |
"completion_length": 290.787109375, | |
"epoch": 3.048, | |
"grad_norm": 0.87890625, | |
"kl": 0.016602561168838292, | |
"learning_rate": 7.384700111364487e-07, | |
"loss": 0.0007, | |
"reward": 2.8143509328365326, | |
"reward_std": 0.6266643963754177, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6690197528029481, | |
"rewards/reasoning_steps_reward": 0.6940104104578495, | |
"step": 192 | |
}, | |
{ | |
"completion_length": 282.99609375, | |
"epoch": 3.064, | |
"grad_norm": 0.96875, | |
"kl": 0.02081725694006309, | |
"learning_rate": 7.136540294828062e-07, | |
"loss": 0.0008, | |
"reward": 2.8774597868323326, | |
"reward_std": 0.7187161836773157, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6952643600913385, | |
"rewards/reasoning_steps_reward": 0.7076822966337204, | |
"step": 193 | |
}, | |
{ | |
"completion_length": 294.306640625, | |
"epoch": 3.08, | |
"grad_norm": 0.9296875, | |
"kl": 0.02071163459913805, | |
"learning_rate": 6.891925740907701e-07, | |
"loss": 0.0008, | |
"reward": 2.8051391541957855, | |
"reward_std": 0.6224446576088667, | |
"rewards/accuracy_reward": 0.021484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.695680051886787, | |
"rewards/reasoning_steps_reward": 0.6966145820915699, | |
"step": 194 | |
}, | |
{ | |
"completion_length": 286.19140625, | |
"epoch": 3.096, | |
"grad_norm": 0.84765625, | |
"kl": 0.018767547328025103, | |
"learning_rate": 6.650904996908772e-07, | |
"loss": 0.0008, | |
"reward": 3.3200203105807304, | |
"reward_std": 0.7382683884352446, | |
"rewards/accuracy_reward": 0.1484375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8095814054831862, | |
"rewards/reasoning_steps_reward": 0.7428385503590107, | |
"step": 195 | |
}, | |
{ | |
"completion_length": 285.822265625, | |
"epoch": 3.112, | |
"grad_norm": 1.078125, | |
"kl": 0.02104048355249688, | |
"learning_rate": 6.413525896892972e-07, | |
"loss": 0.0008, | |
"reward": 2.955541580915451, | |
"reward_std": 0.6638543289154768, | |
"rewards/accuracy_reward": 0.103515625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7093558870255947, | |
"rewards/reasoning_steps_reward": 0.7239583395421505, | |
"step": 196 | |
}, | |
{ | |
"completion_length": 288.87890625, | |
"epoch": 3.128, | |
"grad_norm": 0.92578125, | |
"kl": 0.02037365094292909, | |
"learning_rate": 6.179835552184924e-07, | |
"loss": 0.0008, | |
"reward": 2.7349835634231567, | |
"reward_std": 0.6583398748189211, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.644300079283615, | |
"rewards/reasoning_steps_reward": 0.7278645858168602, | |
"step": 197 | |
}, | |
{ | |
"completion_length": 288.52734375, | |
"epoch": 3.144, | |
"grad_norm": 0.9453125, | |
"kl": 0.02122843312099576, | |
"learning_rate": 5.949880342022258e-07, | |
"loss": 0.0008, | |
"reward": 3.1269255951046944, | |
"reward_std": 0.7235856931656599, | |
"rewards/accuracy_reward": 0.068359375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7810237923016151, | |
"rewards/reasoning_steps_reward": 0.7154947929084301, | |
"step": 198 | |
}, | |
{ | |
"completion_length": 285.734375, | |
"epoch": 3.16, | |
"grad_norm": 0.9765625, | |
"kl": 0.02136942616198212, | |
"learning_rate": 5.723705904351027e-07, | |
"loss": 0.0009, | |
"reward": 2.681896522641182, | |
"reward_std": 0.6634827610105276, | |
"rewards/accuracy_reward": 0.109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6170557765290141, | |
"rewards/reasoning_steps_reward": 0.7213541716337204, | |
"step": 199 | |
}, | |
{ | |
"completion_length": 286.953125, | |
"epoch": 3.176, | |
"grad_norm": 0.91796875, | |
"kl": 0.019215874548535794, | |
"learning_rate": 5.501357126768117e-07, | |
"loss": 0.0008, | |
"reward": 2.6373501121997833, | |
"reward_std": 0.7020009346306324, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.5913562929878632, | |
"rewards/reasoning_steps_reward": 0.7617187462747097, | |
"step": 200 | |
}, | |
{ | |
"completion_length": 286.212890625, | |
"epoch": 3.192, | |
"grad_norm": 0.875, | |
"kl": 0.020688754506409168, | |
"learning_rate": 5.282878137612738e-07, | |
"loss": 0.0008, | |
"reward": 3.007347419857979, | |
"reward_std": 0.6104327123612165, | |
"rewards/accuracy_reward": 0.1328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7257564406221112, | |
"rewards/reasoning_steps_reward": 0.6972656305879354, | |
"step": 201 | |
}, | |
{ | |
"completion_length": 286.62109375, | |
"epoch": 3.208, | |
"grad_norm": 0.8828125, | |
"kl": 0.02169125445652753, | |
"learning_rate": 5.068312297208414e-07, | |
"loss": 0.0009, | |
"reward": 3.0148477032780647, | |
"reward_std": 0.679189708083868, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7471367201457421, | |
"rewards/reasoning_steps_reward": 0.69140625, | |
"step": 202 | |
}, | |
{ | |
"completion_length": 293.78125, | |
"epoch": 3.224, | |
"grad_norm": 0.85546875, | |
"kl": 0.02024375193286687, | |
"learning_rate": 4.857702189257613e-07, | |
"loss": 0.0008, | |
"reward": 3.007346175611019, | |
"reward_std": 0.6605745330452919, | |
"rewards/accuracy_reward": 0.06640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7448532494405905, | |
"rewards/reasoning_steps_reward": 0.7063802145421505, | |
"step": 203 | |
}, | |
{ | |
"completion_length": 296.005859375, | |
"epoch": 3.24, | |
"grad_norm": 0.875, | |
"kl": 0.020721249806229025, | |
"learning_rate": 4.6510896123903027e-07, | |
"loss": 0.0008, | |
"reward": 3.162186399102211, | |
"reward_std": 0.668110404163599, | |
"rewards/accuracy_reward": 0.03125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8146958220750093, | |
"rewards/reasoning_steps_reward": 0.6868489757180214, | |
"step": 204 | |
}, | |
{ | |
"completion_length": 284.65234375, | |
"epoch": 3.2560000000000002, | |
"grad_norm": 1.109375, | |
"kl": 0.023086362169124186, | |
"learning_rate": 4.4485155718684334e-07, | |
"loss": 0.0009, | |
"reward": 2.8323604688048363, | |
"reward_std": 0.7187584564089775, | |
"rewards/accuracy_reward": 0.1171875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6745888954028487, | |
"rewards/reasoning_steps_reward": 0.6914062649011612, | |
"step": 205 | |
}, | |
{ | |
"completion_length": 295.3203125, | |
"epoch": 3.2720000000000002, | |
"grad_norm": 0.890625, | |
"kl": 0.019205813470762223, | |
"learning_rate": 4.2500202714478853e-07, | |
"loss": 0.0008, | |
"reward": 3.3045015186071396, | |
"reward_std": 0.7383539900183678, | |
"rewards/accuracy_reward": 0.048828125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8480282643189033, | |
"rewards/reasoning_steps_reward": 0.7115885354578495, | |
"step": 206 | |
}, | |
{ | |
"completion_length": 276.3671875, | |
"epoch": 3.288, | |
"grad_norm": 0.9296875, | |
"kl": 0.019852709374390543, | |
"learning_rate": 4.05564310539939e-07, | |
"loss": 0.0008, | |
"reward": 3.327822983264923, | |
"reward_std": 0.7267354801297188, | |
"rewards/accuracy_reward": 0.21875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8076250155766805, | |
"rewards/reasoning_steps_reward": 0.686197929084301, | |
"step": 207 | |
}, | |
{ | |
"completion_length": 295.84375, | |
"epoch": 3.304, | |
"grad_norm": 0.79296875, | |
"kl": 0.017830375931225717, | |
"learning_rate": 3.8654226506902204e-07, | |
"loss": 0.0007, | |
"reward": 2.7935037687420845, | |
"reward_std": 0.7168517392128706, | |
"rewards/accuracy_reward": 0.05078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6909335435678562, | |
"rewards/reasoning_steps_reward": 0.6699218731373549, | |
"step": 208 | |
}, | |
{ | |
"completion_length": 289.603515625, | |
"epoch": 3.32, | |
"grad_norm": 1.1875, | |
"kl": 0.019766899524256587, | |
"learning_rate": 3.679396659327986e-07, | |
"loss": 0.0008, | |
"reward": 3.100301645696163, | |
"reward_std": 0.7392721492797136, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.789510258163015, | |
"rewards/reasoning_steps_reward": 0.6263020895421505, | |
"step": 209 | |
}, | |
{ | |
"completion_length": 278.326171875, | |
"epoch": 3.336, | |
"grad_norm": 0.921875, | |
"kl": 0.020691857673227787, | |
"learning_rate": 3.4976020508682345e-07, | |
"loss": 0.0008, | |
"reward": 3.0393467769026756, | |
"reward_std": 0.6087249293923378, | |
"rewards/accuracy_reward": 0.16796875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7225339828679959, | |
"rewards/reasoning_steps_reward": 0.703776054084301, | |
"step": 210 | |
}, | |
{ | |
"completion_length": 292.322265625, | |
"epoch": 3.352, | |
"grad_norm": 1.15625, | |
"kl": 0.020608096150681376, | |
"learning_rate": 3.320074905087212e-07, | |
"loss": 0.0008, | |
"reward": 2.8478069826960564, | |
"reward_std": 0.6319366451352835, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6738783651962876, | |
"rewards/reasoning_steps_reward": 0.7382812425494194, | |
"step": 211 | |
}, | |
{ | |
"completion_length": 283.3984375, | |
"epoch": 3.368, | |
"grad_norm": 0.8671875, | |
"kl": 0.023123053135350347, | |
"learning_rate": 3.14685045482131e-07, | |
"loss": 0.0009, | |
"reward": 2.7474499940872192, | |
"reward_std": 0.6886056587100029, | |
"rewards/accuracy_reward": 0.095703125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6692888826752702, | |
"rewards/reasoning_steps_reward": 0.6438802052289248, | |
"step": 212 | |
}, | |
{ | |
"completion_length": 295.224609375, | |
"epoch": 3.384, | |
"grad_norm": 0.9140625, | |
"kl": 0.023218440066557378, | |
"learning_rate": 2.977963078974616e-07, | |
"loss": 0.0009, | |
"reward": 2.9267039820551872, | |
"reward_std": 0.6514626033604145, | |
"rewards/accuracy_reward": 0.060546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7105940394103527, | |
"rewards/reasoning_steps_reward": 0.7343750055879354, | |
"step": 213 | |
}, | |
{ | |
"completion_length": 287.263671875, | |
"epoch": 3.4, | |
"grad_norm": 0.98046875, | |
"kl": 0.021008892101235688, | |
"learning_rate": 2.813446295695893e-07, | |
"loss": 0.0008, | |
"reward": 3.1436211466789246, | |
"reward_std": 0.6997925061732531, | |
"rewards/accuracy_reward": 0.076171875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7876740458110968, | |
"rewards/reasoning_steps_reward": 0.7044270932674408, | |
"step": 214 | |
}, | |
{ | |
"completion_length": 285.443359375, | |
"epoch": 3.416, | |
"grad_norm": 1.0625, | |
"kl": 0.02351184340659529, | |
"learning_rate": 2.65333275572644e-07, | |
"loss": 0.0009, | |
"reward": 2.9087352752685547, | |
"reward_std": 0.6324813142418861, | |
"rewards/accuracy_reward": 0.11328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.707642661097149, | |
"rewards/reasoning_steps_reward": 0.6725260391831398, | |
"step": 215 | |
}, | |
{ | |
"completion_length": 292.380859375, | |
"epoch": 3.432, | |
"grad_norm": 0.88671875, | |
"kl": 0.023277590342331678, | |
"learning_rate": 2.4976542359200664e-07, | |
"loss": 0.0009, | |
"reward": 2.6246762797236443, | |
"reward_std": 0.691521966829896, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.610135139276584, | |
"rewards/reasoning_steps_reward": 0.7220052182674408, | |
"step": 216 | |
}, | |
{ | |
"completion_length": 280.158203125, | |
"epoch": 3.448, | |
"grad_norm": 1.1484375, | |
"kl": 0.02378622384276241, | |
"learning_rate": 2.3464416329365137e-07, | |
"loss": 0.001, | |
"reward": 2.8623234406113625, | |
"reward_std": 0.6014144476503134, | |
"rewards/accuracy_reward": 0.13671875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6630921829491854, | |
"rewards/reasoning_steps_reward": 0.7363281361758709, | |
"step": 217 | |
}, | |
{ | |
"completion_length": 295.375, | |
"epoch": 3.464, | |
"grad_norm": 0.8203125, | |
"kl": 0.016009816259611398, | |
"learning_rate": 2.1997249571095835e-07, | |
"loss": 0.0006, | |
"reward": 3.290237843990326, | |
"reward_std": 0.6886514872312546, | |
"rewards/accuracy_reward": 0.04296875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8556435058514277, | |
"rewards/reasoning_steps_reward": 0.6803385578095913, | |
"step": 218 | |
}, | |
{ | |
"completion_length": 289.02734375, | |
"epoch": 3.48, | |
"grad_norm": 0.859375, | |
"kl": 0.02076311851851642, | |
"learning_rate": 2.0575333264911125e-07, | |
"loss": 0.0008, | |
"reward": 2.800406724214554, | |
"reward_std": 0.6951953694224358, | |
"rewards/accuracy_reward": 0.10546875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6608994637305537, | |
"rewards/reasoning_steps_reward": 0.7122395895421505, | |
"step": 219 | |
}, | |
{ | |
"completion_length": 279.2421875, | |
"epoch": 3.496, | |
"grad_norm": 0.9921875, | |
"kl": 0.01861161779379472, | |
"learning_rate": 1.9198949610721273e-07, | |
"loss": 0.0007, | |
"reward": 2.7829076945781708, | |
"reward_std": 0.5952301491051912, | |
"rewards/accuracy_reward": 0.19140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6188251003623009, | |
"rewards/reasoning_steps_reward": 0.735026054084301, | |
"step": 220 | |
}, | |
{ | |
"completion_length": 281.380859375, | |
"epoch": 3.512, | |
"grad_norm": 0.95703125, | |
"kl": 0.018613723281305283, | |
"learning_rate": 1.786837177182127e-07, | |
"loss": 0.0007, | |
"reward": 2.8807911574840546, | |
"reward_std": 0.6898195426911116, | |
"rewards/accuracy_reward": 0.15234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6807498056441545, | |
"rewards/reasoning_steps_reward": 0.686197929084301, | |
"step": 221 | |
}, | |
{ | |
"completion_length": 287.46484375, | |
"epoch": 3.528, | |
"grad_norm": 0.859375, | |
"kl": 0.01997726986883208, | |
"learning_rate": 1.6583863820678032e-07, | |
"loss": 0.0008, | |
"reward": 2.8661443442106247, | |
"reward_std": 0.6607285998761654, | |
"rewards/accuracy_reward": 0.125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6637147692963481, | |
"rewards/reasoning_steps_reward": 0.7500000074505806, | |
"step": 222 | |
}, | |
{ | |
"completion_length": 287.591796875, | |
"epoch": 3.544, | |
"grad_norm": 0.91015625, | |
"kl": 0.020406617608387023, | |
"learning_rate": 1.534568068652101e-07, | |
"loss": 0.0008, | |
"reward": 2.8175922632217407, | |
"reward_std": 0.772568928077817, | |
"rewards/accuracy_reward": 0.080078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6876783277839422, | |
"rewards/reasoning_steps_reward": 0.6744791697710752, | |
"step": 223 | |
}, | |
{ | |
"completion_length": 281.27734375, | |
"epoch": 3.56, | |
"grad_norm": 0.921875, | |
"kl": 0.021257835964206606, | |
"learning_rate": 1.4154068104747981e-07, | |
"loss": 0.0009, | |
"reward": 3.101296618580818, | |
"reward_std": 0.7108908668160439, | |
"rewards/accuracy_reward": 0.1640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7581579011554519, | |
"rewards/reasoning_steps_reward": 0.662760416045785, | |
"step": 224 | |
}, | |
{ | |
"completion_length": 279.06640625, | |
"epoch": 3.576, | |
"grad_norm": 1.375, | |
"kl": 0.022064094548113644, | |
"learning_rate": 1.3009262568155462e-07, | |
"loss": 0.0009, | |
"reward": 2.9315654188394547, | |
"reward_std": 0.705444760620594, | |
"rewards/accuracy_reward": 0.130859375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6976745830227932, | |
"rewards/reasoning_steps_reward": 0.707682304084301, | |
"step": 225 | |
}, | |
{ | |
"completion_length": 291.966796875, | |
"epoch": 3.592, | |
"grad_norm": 2.09375, | |
"kl": 0.0230710570467636, | |
"learning_rate": 1.1911491280002907e-07, | |
"loss": 0.0009, | |
"reward": 3.4133089035749435, | |
"reward_std": 0.7498599980026484, | |
"rewards/accuracy_reward": 0.0625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8753998465836048, | |
"rewards/reasoning_steps_reward": 0.7246093787252903, | |
"step": 226 | |
}, | |
{ | |
"completion_length": 277.298828125, | |
"epoch": 3.608, | |
"grad_norm": 0.83984375, | |
"kl": 0.019825019757263362, | |
"learning_rate": 1.0860972108921258e-07, | |
"loss": 0.0008, | |
"reward": 2.766988158226013, | |
"reward_std": 0.6906307358294725, | |
"rewards/accuracy_reward": 0.138671875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6410793742785851, | |
"rewards/reasoning_steps_reward": 0.7050781287252903, | |
"step": 227 | |
}, | |
{ | |
"completion_length": 291.142578125, | |
"epoch": 3.624, | |
"grad_norm": 0.91015625, | |
"kl": 0.019932835886720568, | |
"learning_rate": 9.857913545673503e-08, | |
"loss": 0.0008, | |
"reward": 3.3143957555294037, | |
"reward_std": 0.6608162298798561, | |
"rewards/accuracy_reward": 0.080078125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.836786450818181, | |
"rewards/reasoning_steps_reward": 0.7239583283662796, | |
"step": 228 | |
}, | |
{ | |
"completion_length": 288.453125, | |
"epoch": 3.64, | |
"grad_norm": 0.8984375, | |
"kl": 0.018874026485718787, | |
"learning_rate": 8.902514661776885e-08, | |
"loss": 0.0008, | |
"reward": 3.2070699259638786, | |
"reward_std": 0.7432738393545151, | |
"rewards/accuracy_reward": 0.125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8040493360410134, | |
"rewards/reasoning_steps_reward": 0.6699218824505806, | |
"step": 229 | |
}, | |
{ | |
"completion_length": 293.146484375, | |
"epoch": 3.656, | |
"grad_norm": 0.80078125, | |
"kl": 0.016978327243123204, | |
"learning_rate": 7.994965069994143e-08, | |
"loss": 0.0007, | |
"reward": 3.143362358212471, | |
"reward_std": 0.6415095869451761, | |
"rewards/accuracy_reward": 0.072265625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7838985491544008, | |
"rewards/reasoning_steps_reward": 0.7194010280072689, | |
"step": 230 | |
}, | |
{ | |
"completion_length": 288.62890625, | |
"epoch": 3.672, | |
"grad_norm": 1.1171875, | |
"kl": 0.021244205767288804, | |
"learning_rate": 7.135444886702064e-08, | |
"loss": 0.0008, | |
"reward": 2.9098562449216843, | |
"reward_std": 0.7155030779540539, | |
"rewards/accuracy_reward": 0.119140625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7021569274365902, | |
"rewards/reasoning_steps_reward": 0.684244804084301, | |
"step": 231 | |
}, | |
{ | |
"completion_length": 285.921875, | |
"epoch": 3.6879999999999997, | |
"grad_norm": 0.81640625, | |
"kl": 0.01845627831062302, | |
"learning_rate": 6.324124696144962e-08, | |
"loss": 0.0007, | |
"reward": 2.8958379551768303, | |
"reward_std": 0.6293431017547846, | |
"rewards/accuracy_reward": 0.111328125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7039945904786388, | |
"rewards/reasoning_steps_reward": 0.6725260466337204, | |
"step": 232 | |
}, | |
{ | |
"completion_length": 290.05859375, | |
"epoch": 3.7039999999999997, | |
"grad_norm": 0.91015625, | |
"kl": 0.017910517868585885, | |
"learning_rate": 5.5611655165795365e-08, | |
"loss": 0.0007, | |
"reward": 2.9697776436805725, | |
"reward_std": 0.6638195030391216, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7292921990156174, | |
"rewards/reasoning_steps_reward": 0.699869804084301, | |
"step": 233 | |
}, | |
{ | |
"completion_length": 287.302734375, | |
"epoch": 3.7199999999999998, | |
"grad_norm": 0.859375, | |
"kl": 0.01873377658193931, | |
"learning_rate": 4.846718768318659e-08, | |
"loss": 0.0007, | |
"reward": 3.1371295899152756, | |
"reward_std": 0.6027075219899416, | |
"rewards/accuracy_reward": 0.125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.762940771256884, | |
"rewards/reasoning_steps_reward": 0.7233072891831398, | |
"step": 234 | |
}, | |
{ | |
"completion_length": 293.28125, | |
"epoch": 3.7359999999999998, | |
"grad_norm": 0.90625, | |
"kl": 0.019052452000323683, | |
"learning_rate": 4.1809262436796896e-08, | |
"loss": 0.0008, | |
"reward": 3.043783374130726, | |
"reward_std": 0.6604214962571859, | |
"rewards/accuracy_reward": 0.06640625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7457142351195216, | |
"rewards/reasoning_steps_reward": 0.7402343787252903, | |
"step": 235 | |
}, | |
{ | |
"completion_length": 292.73828125, | |
"epoch": 3.752, | |
"grad_norm": 0.828125, | |
"kl": 0.019031181174796075, | |
"learning_rate": 3.563920078843791e-08, | |
"loss": 0.0008, | |
"reward": 3.077702447772026, | |
"reward_std": 0.6683868058025837, | |
"rewards/accuracy_reward": 0.107421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7663521977762381, | |
"rewards/reasoning_steps_reward": 0.6712239496409893, | |
"step": 236 | |
}, | |
{ | |
"completion_length": 281.478515625, | |
"epoch": 3.768, | |
"grad_norm": 1.21875, | |
"kl": 0.022773202043026686, | |
"learning_rate": 2.99582272763152e-08, | |
"loss": 0.0009, | |
"reward": 2.9361980706453323, | |
"reward_std": 0.667768020182848, | |
"rewards/accuracy_reward": 0.17578125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6935764315227667, | |
"rewards/reasoning_steps_reward": 0.6796875037252903, | |
"step": 237 | |
}, | |
{ | |
"completion_length": 286.15234375, | |
"epoch": 3.784, | |
"grad_norm": 0.953125, | |
"kl": 0.020423304580617696, | |
"learning_rate": 2.4767469372002362e-08, | |
"loss": 0.0008, | |
"reward": 2.6705066189169884, | |
"reward_std": 0.6245338693261147, | |
"rewards/accuracy_reward": 0.162109375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.606531698256731, | |
"rewards/reasoning_steps_reward": 0.6888020746409893, | |
"step": 238 | |
}, | |
{ | |
"completion_length": 291.84765625, | |
"epoch": 3.8, | |
"grad_norm": 0.84765625, | |
"kl": 0.017052936542313546, | |
"learning_rate": 2.0067957256676428e-08, | |
"loss": 0.0007, | |
"reward": 3.033589616417885, | |
"reward_std": 0.6652188412845135, | |
"rewards/accuracy_reward": 0.08203125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7442694374670585, | |
"rewards/reasoning_steps_reward": 0.7187499944120646, | |
"step": 239 | |
}, | |
{ | |
"completion_length": 290.75, | |
"epoch": 3.816, | |
"grad_norm": 0.796875, | |
"kl": 0.021085154090542346, | |
"learning_rate": 1.5860623616664183e-08, | |
"loss": 0.0008, | |
"reward": 2.713896244764328, | |
"reward_std": 0.6678700372576714, | |
"rewards/accuracy_reward": 0.083984375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6320626304174463, | |
"rewards/reasoning_steps_reward": 0.733723958954215, | |
"step": 240 | |
}, | |
{ | |
"completion_length": 286.052734375, | |
"epoch": 3.832, | |
"grad_norm": 0.93359375, | |
"kl": 0.02104910637717694, | |
"learning_rate": 1.2146303458337172e-08, | |
"loss": 0.0008, | |
"reward": 3.3307963609695435, | |
"reward_std": 0.6925474908202887, | |
"rewards/accuracy_reward": 0.123046875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8309685722924769, | |
"rewards/reasoning_steps_reward": 0.7148437593132257, | |
"step": 241 | |
}, | |
{ | |
"completion_length": 286.875, | |
"epoch": 3.848, | |
"grad_norm": 2.375, | |
"kl": 0.023578285879921168, | |
"learning_rate": 8.92573394239149e-09, | |
"loss": 0.0009, | |
"reward": 2.9508322179317474, | |
"reward_std": 0.6057112123817205, | |
"rewards/accuracy_reward": 0.107421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7184197666744391, | |
"rewards/reasoning_steps_reward": 0.6881510633975267, | |
"step": 242 | |
}, | |
{ | |
"completion_length": 296.923828125, | |
"epoch": 3.864, | |
"grad_norm": 1.0390625, | |
"kl": 0.019981018383987248, | |
"learning_rate": 6.1995542375495325e-09, | |
"loss": 0.0008, | |
"reward": 3.1651005297899246, | |
"reward_std": 0.6937647629529238, | |
"rewards/accuracy_reward": 0.064453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.8048164764574418, | |
"rewards/reasoning_steps_reward": 0.6861979216337204, | |
"step": 243 | |
}, | |
{ | |
"completion_length": 289.396484375, | |
"epoch": 3.88, | |
"grad_norm": 0.82421875, | |
"kl": 0.01859537634300068, | |
"learning_rate": 3.96830539370563e-09, | |
"loss": 0.0007, | |
"reward": 3.588533952832222, | |
"reward_std": 0.7532828189432621, | |
"rewards/accuracy_reward": 0.087890625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.9253446195895473, | |
"rewards/reasoning_steps_reward": 0.724609375, | |
"step": 244 | |
}, | |
{ | |
"completion_length": 294.28515625, | |
"epoch": 3.896, | |
"grad_norm": 0.94140625, | |
"kl": 0.01655962661607191, | |
"learning_rate": 2.2324302345483327e-09, | |
"loss": 0.0007, | |
"reward": 3.025103345513344, | |
"reward_std": 0.6890733204782009, | |
"rewards/accuracy_reward": 0.064453125, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7340622267996272, | |
"rewards/reasoning_steps_reward": 0.7584635354578495, | |
"step": 245 | |
}, | |
{ | |
"completion_length": 288.9453125, | |
"epoch": 3.912, | |
"grad_norm": 0.97265625, | |
"kl": 0.020295250928029418, | |
"learning_rate": 9.922732696748816e-10, | |
"loss": 0.0008, | |
"reward": 2.733549617230892, | |
"reward_std": 0.6962179783731699, | |
"rewards/accuracy_reward": 0.07421875, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6568429181352258, | |
"rewards/reasoning_steps_reward": 0.6888020858168602, | |
"step": 246 | |
}, | |
{ | |
"completion_length": 285.66796875, | |
"epoch": 3.928, | |
"grad_norm": 1.0546875, | |
"kl": 0.017885809938888997, | |
"learning_rate": 2.480806262181168e-10, | |
"loss": 0.0007, | |
"reward": 2.9583439081907272, | |
"reward_std": 0.6105441423133016, | |
"rewards/accuracy_reward": 0.115234375, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.7220087309057514, | |
"rewards/reasoning_steps_reward": 0.6770833320915699, | |
"step": 247 | |
}, | |
{ | |
"completion_length": 288.5234375, | |
"epoch": 3.944, | |
"grad_norm": 26.625, | |
"kl": 0.03823809011373669, | |
"learning_rate": 0.0, | |
"loss": 0.0015, | |
"reward": 2.7989018857479095, | |
"reward_std": 0.6642574854195118, | |
"rewards/accuracy_reward": 0.1015625, | |
"rewards/format_reward": 0.0, | |
"rewards/novelty_reward_func_explore_exploit": 0.6736357094099125, | |
"rewards/reasoning_steps_reward": 0.676432304084301, | |
"step": 248 | |
}, | |
{ | |
"epoch": 3.944, | |
"step": 248, | |
"total_flos": 0.0, | |
"train_loss": 0.006824205948613517, | |
"train_runtime": 18399.2985, | |
"train_samples_per_second": 0.435, | |
"train_steps_per_second": 0.013 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 248, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 4, | |
"save_steps": 50, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": true | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 0.0, | |
"train_batch_size": 2, | |
"trial_name": null, | |
"trial_params": null | |
} | |