diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14697 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.996363636363636, + "eval_steps": 500, + "global_step": 1236, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.6635121252609261, + "importance_ratio": 1.0, + "learning_rate": 0.0, + "loss": 0.0211, + "ppo_loss": 1.0, + "sft_loss": 0.06775619834661484, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.705460294248276, + "importance_ratio": 1.0, + "kl_div": 0.0002822639944497496, + "kl_div_neg": 0.0002822639944497496, + "learning_rate": 6.020599913279622e-07, + "loss": 0.0824, + "ppo_loss": 1.0002822875976562, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 0.7112764661319769, + "kl_div": 0.0009204513626173139, + "kl_div_sft": 0.0009204513626173139, + "learning_rate": 9.542425094393247e-07, + "loss": -0.1509, + "sft_loss": 0.04132953658699989, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.7529159524854754, + "kl_div": -0.00046564757940359414, + "kl_div_sft": -0.00046564757940359414, + "learning_rate": 1.2041199826559244e-06, + "loss": -0.0736, + "sft_loss": 0.05602573603391647, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.7418723792629985, + "importance_ratio": 1.0, + "kl_div": -0.0001351796672679484, + "kl_div_neg": 0.0006867063930258155, + "kl_div_sft": -0.0009570657275617123, + "learning_rate": 1.3979400086720373e-06, + "loss": 0.0196, + "ppo_loss": 1.0006868839263916, + "sft_loss": 0.04666399583220482, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.669249194100899, + "importance_ratio": 1.0, + "kl_div": -0.00030084114405326545, + "kl_div_pos": -0.00030084114405326545, + "learning_rate": 1.556302500767287e-06, + "loss": 0.1155, + "ppo_loss": -0.9996995329856873, + "step": 6 + }, + { + "epoch": 0.02, + "grad_norm": 0.7068742609399631, + "importance_ratio": 1.0, + "kl_div": -0.0005346988327801228, + "kl_div_neg": -0.0011013677576556802, + "kl_div_sft": 3.19700593536254e-05, + "learning_rate": 1.6901960800285134e-06, + "loss": 0.0055, + "ppo_loss": 0.9988992810249329, + "sft_loss": 0.1200389564037323, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 0.6876259168276546, + "importance_ratio": 1.0, + "kl_div": 8.872683974914253e-05, + "kl_div_neg": -0.0002661606704350561, + "kl_div_sft": 0.00044361434993334115, + "learning_rate": 1.8061799739838866e-06, + "loss": 0.1156, + "ppo_loss": 0.9997339248657227, + "sft_loss": 0.08696060627698898, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 0.7939305603233358, + "importance_ratio": 1.0, + "kl_div": 0.00036535965045914054, + "kl_div_pos": -0.0009317900985479355, + "kl_div_sft": 0.0016625093994662166, + "learning_rate": 1.9084850188786494e-06, + "loss": -0.1666, + "ppo_loss": -0.9990686774253845, + "sft_loss": 0.06919308006763458, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.623453396281409, + "importance_ratio": 1.0, + "kl_div": 0.0022256490774452686, + "kl_div_neg": 0.0023987418971955776, + "kl_div_pos": 0.0020525562576949596, + "learning_rate": 1.9999999999999995e-06, + "loss": -0.0104, + "ppo_loss": 0.0001735091209411621, + "step": 10 + }, + { + "epoch": 0.03, + "grad_norm": 0.6390810247968097, + "importance_ratio": 1.0, + "kl_div": 0.0018754223128780723, + "kl_div_neg": 0.0018754223128780723, + "learning_rate": 2e-06, + "loss": 0.0211, + "ppo_loss": 1.0018773078918457, + "step": 11 + }, + { + "epoch": 0.03, + "grad_norm": 0.7721549294865053, + "importance_ratio": 1.0, + "kl_div": -0.0011717455927282572, + "kl_div_neg": -0.002727470127865672, + "kl_div_pos": 0.00038397900061681867, + "learning_rate": 1.99836867862969e-06, + "loss": -0.027, + "ppo_loss": -0.0015539228916168213, + "step": 12 + }, + { + "epoch": 0.03, + "grad_norm": 0.7730312243665663, + "importance_ratio": 1.0, + "kl_div": 0.0003774297656491399, + "kl_div_neg": -0.002771079307422042, + "kl_div_pos": 0.0035259388387203217, + "learning_rate": 1.99673735725938e-06, + "loss": 0.0825, + "ppo_loss": -0.003149688243865967, + "step": 13 + }, + { + "epoch": 0.03, + "grad_norm": 0.8498004917610201, + "kl_div": 0.0019221242982894182, + "kl_div_sft": 0.0019221242982894182, + "learning_rate": 1.99510603588907e-06, + "loss": 0.2234, + "sft_loss": 0.05455465987324715, + "step": 14 + }, + { + "epoch": 0.04, + "grad_norm": 0.7375702032523053, + "importance_ratio": 1.0, + "kl_div": 0.0008038842352107167, + "kl_div_neg": 0.001211852766573429, + "kl_div_pos": 0.00039591570384800434, + "learning_rate": 1.99347471451876e-06, + "loss": -0.0567, + "ppo_loss": 0.0004082918167114258, + "step": 15 + }, + { + "epoch": 0.04, + "grad_norm": 0.7252607682305425, + "importance_ratio": 1.0, + "kl_div": 0.0012020182330161333, + "kl_div_neg": 0.0013668726896867156, + "kl_div_sft": 0.0010371638927608728, + "learning_rate": 1.99184339314845e-06, + "loss": 0.0046, + "ppo_loss": 1.0013678073883057, + "sft_loss": 0.12026583403348923, + "step": 16 + }, + { + "epoch": 0.04, + "grad_norm": 0.7092052777631312, + "importance_ratio": 0.99609375, + "kl_div": 0.00025449926033616066, + "kl_div_pos": -0.002506878226995468, + "kl_div_sft": 0.0030158767476677895, + "learning_rate": 1.9902120717781402e-06, + "loss": 0.0364, + "ppo_loss": -0.9974962472915649, + "sft_loss": 0.05157789587974548, + "step": 17 + }, + { + "epoch": 0.04, + "grad_norm": 0.668932570732893, + "kl_div": 0.0012163774808868766, + "kl_div_sft": 0.0012163774808868766, + "learning_rate": 1.9885807504078304e-06, + "loss": 0.0053, + "sft_loss": 0.08282782137393951, + "step": 18 + }, + { + "epoch": 0.05, + "grad_norm": 0.8751659917059365, + "importance_ratio": 0.98828125, + "kl_div": -0.004169079475104809, + "kl_div_pos": -0.010013559833168983, + "kl_div_sft": 0.0016754004172980785, + "learning_rate": 1.98694942903752e-06, + "loss": -0.1204, + "ppo_loss": -0.9900364279747009, + "sft_loss": 0.0878886952996254, + "step": 19 + }, + { + "epoch": 0.05, + "grad_norm": 0.7782780385712201, + "kl_div": 0.002132371999323368, + "kl_div_sft": 0.002132371999323368, + "learning_rate": 1.9853181076672104e-06, + "loss": -0.0268, + "sft_loss": 0.10623150318861008, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 0.9007209618700114, + "importance_ratio": 0.984375, + "kl_div": -0.007818278856575489, + "kl_div_pos": -0.015718363225460052, + "kl_div_sft": 8.180640725186095e-05, + "learning_rate": 1.9836867862969006e-06, + "loss": -0.1371, + "ppo_loss": -0.9844045042991638, + "sft_loss": 0.06074066832661629, + "step": 21 + }, + { + "epoch": 0.05, + "grad_norm": 0.8245334837888496, + "importance_ratio": 0.98046875, + "kl_div": -0.011028180830180645, + "kl_div_pos": -0.01786625385284424, + "kl_div_sft": -0.004190108273178339, + "learning_rate": 1.9820554649265904e-06, + "loss": -0.1048, + "ppo_loss": -0.9822924137115479, + "sft_loss": 0.05008915439248085, + "step": 22 + }, + { + "epoch": 0.06, + "grad_norm": 0.8880969524854001, + "importance_ratio": 1.0078125, + "kl_div": -0.0005527837201952934, + "kl_div_neg": 0.006105577107518911, + "kl_div_sft": -0.007211144547909498, + "learning_rate": 1.9804241435562806e-06, + "loss": 0.1279, + "ppo_loss": 1.0061242580413818, + "sft_loss": 0.043001554906368256, + "step": 23 + }, + { + "epoch": 0.06, + "grad_norm": 0.8045722360370814, + "importance_ratio": 1.0, + "kl_div": 0.0022286863531917334, + "kl_div_pos": 0.0017837247578427196, + "kl_div_sft": 0.0026736478321254253, + "learning_rate": 1.9787928221859708e-06, + "loss": -0.0913, + "ppo_loss": -1.0017852783203125, + "sft_loss": 0.034286826848983765, + "step": 24 + }, + { + "epoch": 0.06, + "grad_norm": 0.7689353843398787, + "importance_ratio": 0.99609375, + "kl_div": -0.001848046900704503, + "kl_div_neg": -0.0025919671170413494, + "kl_div_sft": -0.0011041266843676567, + "learning_rate": 1.9771615008156605e-06, + "loss": 0.0209, + "ppo_loss": 0.9974113702774048, + "sft_loss": 0.05627015605568886, + "step": 25 + }, + { + "epoch": 0.06, + "grad_norm": 0.9286371596882123, + "importance_ratio": 0.9375, + "kl_div": -0.030886108055710793, + "kl_div_neg": -0.06649031490087509, + "kl_div_sft": 0.004718099255114794, + "learning_rate": 1.9755301794453507e-06, + "loss": -0.0009, + "ppo_loss": 0.9356719255447388, + "sft_loss": 0.030333051458001137, + "step": 26 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046988699396788, + "importance_ratio": 0.9296875, + "kl_div": -0.03120781108736992, + "kl_div_neg": -0.07134287804365158, + "kl_div_sft": 0.008927257731556892, + "learning_rate": 1.9738988580750405e-06, + "loss": -0.1101, + "ppo_loss": 0.9311425685882568, + "sft_loss": 0.0429232157766819, + "step": 27 + }, + { + "epoch": 0.07, + "grad_norm": 0.8819900327365835, + "kl_div": 0.011365109123289585, + "kl_div_sft": 0.011365109123289585, + "learning_rate": 1.9722675367047307e-06, + "loss": 0.1108, + "sft_loss": 0.03654914349317551, + "step": 28 + }, + { + "epoch": 0.07, + "grad_norm": 0.9544759150664117, + "importance_ratio": 1.0, + "kl_div": 0.005260104313492775, + "kl_div_pos": 0.003754216246306896, + "kl_div_sft": 0.006765992846339941, + "learning_rate": 1.970636215334421e-06, + "loss": 0.0282, + "ppo_loss": -1.0037612915039062, + "sft_loss": 0.041900016367435455, + "step": 29 + }, + { + "epoch": 0.07, + "grad_norm": 1.0208257298608072, + "importance_ratio": 0.9921875, + "kl_div": -0.003449210897088051, + "kl_div_neg": -0.006457801442593336, + "kl_div_sft": -0.0004406205553095788, + "learning_rate": 1.9690048939641107e-06, + "loss": -0.0117, + "ppo_loss": 0.9935629963874817, + "sft_loss": 0.10450047254562378, + "step": 30 + }, + { + "epoch": 0.08, + "grad_norm": 0.9936690195544808, + "importance_ratio": 1.0078125, + "kl_div": 0.0018349961610510945, + "kl_div_pos": 0.005279692821204662, + "kl_div_sft": -0.0016097004991024733, + "learning_rate": 1.967373572593801e-06, + "loss": 0.1026, + "ppo_loss": -1.005293607711792, + "sft_loss": 0.07605157792568207, + "step": 31 + }, + { + "epoch": 0.08, + "grad_norm": 0.7723476939377518, + "kl_div": -0.005411309655755758, + "kl_div_sft": -0.005411309655755758, + "learning_rate": 1.965742251223491e-06, + "loss": -0.0887, + "sft_loss": 0.118146151304245, + "step": 32 + }, + { + "epoch": 0.08, + "grad_norm": 1.2419891199013167, + "importance_ratio": 0.953125, + "kl_div": -0.02059026248753071, + "kl_div_neg": -0.04986540973186493, + "kl_div_sft": 0.008684884756803513, + "learning_rate": 1.9641109298531813e-06, + "loss": -0.1473, + "ppo_loss": 0.9513574242591858, + "sft_loss": 0.026145216077566147, + "step": 33 + }, + { + "epoch": 0.08, + "grad_norm": 1.1872208919784464, + "importance_ratio": 0.984375, + "kl_div": -0.014594344422221184, + "kl_div_neg": -0.011061317287385464, + "kl_div_pos": -0.01812737062573433, + "learning_rate": 1.962479608482871e-06, + "loss": 0.0378, + "ppo_loss": 0.003481835126876831, + "step": 34 + }, + { + "epoch": 0.08, + "grad_norm": 1.1060544676311477, + "importance_ratio": 0.99609375, + "kl_div": -0.007654288783669472, + "kl_div_pos": -0.0056701661087572575, + "kl_div_sft": -0.009638411924242973, + "learning_rate": 1.9608482871125612e-06, + "loss": -0.1361, + "ppo_loss": -0.9943458437919617, + "sft_loss": 0.08755666762590408, + "step": 35 + }, + { + "epoch": 0.09, + "grad_norm": 1.2161479462544658, + "kl_div": -0.014355514198541641, + "kl_div_sft": -0.014355514198541641, + "learning_rate": 1.959216965742251e-06, + "loss": 0.0437, + "sft_loss": 0.08722537755966187, + "step": 36 + }, + { + "epoch": 0.09, + "grad_norm": 1.1803055716733668, + "kl_div": 0.002251553349196911, + "kl_div_sft": 0.002251553349196911, + "learning_rate": 1.957585644371941e-06, + "loss": 0.0125, + "sft_loss": 0.09785296022891998, + "step": 37 + }, + { + "epoch": 0.09, + "grad_norm": 1.189250208391352, + "importance_ratio": 0.984375, + "kl_div": -0.012131206691265106, + "kl_div_neg": -0.012131206691265106, + "learning_rate": 1.9559543230016314e-06, + "loss": 0.0831, + "ppo_loss": 0.9880673885345459, + "step": 38 + }, + { + "epoch": 0.09, + "grad_norm": 1.158483873277291, + "importance_ratio": 0.84375, + "kl_div": -0.10153215378522873, + "kl_div_pos": -0.16852955520153046, + "kl_div_sft": -0.0345347560942173, + "learning_rate": 1.954323001631321e-06, + "loss": 0.1808, + "ppo_loss": -0.8449063301086426, + "sft_loss": 0.23241208493709564, + "step": 39 + }, + { + "epoch": 0.1, + "grad_norm": 1.4674984083540699, + "importance_ratio": 1.0, + "kl_div": -0.016898073256015778, + "kl_div_pos": 0.002774033695459366, + "kl_div_sft": -0.03657018020749092, + "learning_rate": 1.9526916802610114e-06, + "loss": 0.0697, + "ppo_loss": -1.0027778148651123, + "sft_loss": 0.11947453022003174, + "step": 40 + }, + { + "epoch": 0.1, + "grad_norm": 1.4236500318629106, + "importance_ratio": 0.984375, + "kl_div": -0.01073143258690834, + "kl_div_neg": -0.015295865945518017, + "kl_div_sft": -0.006167000159621239, + "learning_rate": 1.951060358890701e-06, + "loss": 0.1136, + "ppo_loss": 0.9848204851150513, + "sft_loss": 0.07722340524196625, + "step": 41 + }, + { + "epoch": 0.1, + "grad_norm": 1.0650865262660743, + "importance_ratio": 0.9140625, + "kl_div": -0.08983860164880753, + "kl_div_neg": -0.13955451548099518, + "kl_div_pos": -0.040122684091329575, + "learning_rate": 1.9494290375203913e-06, + "loss": -0.0056, + "ppo_loss": -0.045462965965270996, + "step": 42 + }, + { + "epoch": 0.1, + "grad_norm": 1.490474016712344, + "importance_ratio": 0.98046875, + "kl_div": -0.009037816897034645, + "kl_div_pos": -0.019423315301537514, + "kl_div_sft": 0.001347680576145649, + "learning_rate": 1.9477977161500815e-06, + "loss": -0.028, + "ppo_loss": -0.9807640910148621, + "sft_loss": 0.0452117882668972, + "step": 43 + }, + { + "epoch": 0.11, + "grad_norm": 1.0889998692984455, + "importance_ratio": 0.88671875, + "kl_div": -0.125685915350914, + "kl_div_neg": -0.125685915350914, + "learning_rate": 1.9461663947797717e-06, + "loss": 0.0699, + "ppo_loss": 0.8866076469421387, + "step": 44 + }, + { + "epoch": 0.11, + "grad_norm": 1.3315241231448502, + "importance_ratio": 0.984375, + "kl_div": -0.0671415627002716, + "kl_div_pos": -0.013838584534823895, + "kl_div_sft": -0.12044453620910645, + "learning_rate": 1.9445350734094615e-06, + "loss": 0.0841, + "ppo_loss": -0.9862567186355591, + "sft_loss": 0.2546923756599426, + "step": 45 + }, + { + "epoch": 0.11, + "grad_norm": 1.2332457193408217, + "importance_ratio": 0.9375, + "kl_div": -0.06261230260133743, + "kl_div_neg": -0.06261230260133743, + "learning_rate": 1.9429037520391517e-06, + "loss": -0.1038, + "ppo_loss": 0.9403876066207886, + "step": 46 + }, + { + "epoch": 0.11, + "grad_norm": 1.3700016378128397, + "kl_div": -0.005976326763629913, + "kl_div_sft": -0.005976326763629913, + "learning_rate": 1.941272430668842e-06, + "loss": -0.0048, + "sft_loss": 0.05619629845023155, + "step": 47 + }, + { + "epoch": 0.12, + "grad_norm": 1.1374039829517788, + "importance_ratio": 0.91796875, + "kl_div": -0.03426515311002731, + "kl_div_neg": -0.08605366200208664, + "kl_div_sft": 0.01752335950732231, + "learning_rate": 1.9396411092985316e-06, + "loss": 0.0432, + "ppo_loss": 0.9175450205802917, + "sft_loss": 0.044655341655015945, + "step": 48 + }, + { + "epoch": 0.12, + "grad_norm": 1.1122507780573518, + "kl_div": 0.007389682345092297, + "kl_div_sft": 0.007389682345092297, + "learning_rate": 1.938009787928222e-06, + "loss": 0.01, + "sft_loss": 0.03936357796192169, + "step": 49 + }, + { + "epoch": 0.12, + "grad_norm": 1.7067137538603148, + "importance_ratio": 0.9453125, + "kl_div": -0.0594576857984066, + "kl_div_pos": -0.0594576857984066, + "learning_rate": 1.936378466557912e-06, + "loss": -0.103, + "ppo_loss": -0.9448889493942261, + "step": 50 + }, + { + "epoch": 0.12, + "grad_norm": 1.1785624615216657, + "importance_ratio": 0.90234375, + "kl_div": -0.10802098363637924, + "kl_div_neg": -0.1956670731306076, + "kl_div_pos": -0.02037489227950573, + "learning_rate": 1.934747145187602e-06, + "loss": -0.0801, + "ppo_loss": -0.0787726640701294, + "step": 51 + }, + { + "epoch": 0.13, + "grad_norm": 1.1189453593763627, + "importance_ratio": 0.9375, + "kl_div": -0.06822185963392258, + "kl_div_neg": -0.02376522310078144, + "kl_div_pos": -0.11267849802970886, + "learning_rate": 1.933115823817292e-06, + "loss": 0.0258, + "ppo_loss": 0.041538506746292114, + "step": 52 + }, + { + "epoch": 0.13, + "grad_norm": 1.2671464323075408, + "importance_ratio": 0.921875, + "kl_div": -0.0395657904446125, + "kl_div_pos": -0.08192670345306396, + "kl_div_sft": 0.0027951220981776714, + "learning_rate": 1.9314845024469818e-06, + "loss": -0.0388, + "ppo_loss": -0.9213394522666931, + "sft_loss": 0.06512558460235596, + "step": 53 + }, + { + "epoch": 0.13, + "grad_norm": 1.3067314447107663, + "importance_ratio": 0.9453125, + "kl_div": -0.05756595730781555, + "kl_div_neg": -0.05756595730781555, + "learning_rate": 1.929853181076672e-06, + "loss": 0.0434, + "ppo_loss": 0.9440711736679077, + "step": 54 + }, + { + "epoch": 0.13, + "grad_norm": 1.4856568272976414, + "importance_ratio": 0.8046875, + "kl_div": -0.1118774339556694, + "kl_div_neg": -0.21787752211093903, + "kl_div_sft": -0.005877349525690079, + "learning_rate": 1.928221859706362e-06, + "loss": 0.1075, + "ppo_loss": 0.8042239546775818, + "sft_loss": 0.07803851366043091, + "step": 55 + }, + { + "epoch": 0.14, + "grad_norm": 1.2536513405163698, + "importance_ratio": 0.94140625, + "kl_div": -0.060897182673215866, + "kl_div_neg": -0.060897182673215866, + "learning_rate": 1.926590538336052e-06, + "loss": 0.2201, + "ppo_loss": 0.9409350156784058, + "step": 56 + }, + { + "epoch": 0.14, + "grad_norm": 1.8340930520582641, + "kl_div": 0.006499993149191141, + "kl_div_sft": 0.006499993149191141, + "learning_rate": 1.924959216965742e-06, + "loss": -0.0637, + "sft_loss": 0.08112891763448715, + "step": 57 + }, + { + "epoch": 0.14, + "grad_norm": 2.7852510112462383, + "importance_ratio": 0.9609375, + "kl_div": -0.034732721745967865, + "kl_div_pos": -0.0408448651432991, + "kl_div_sft": -0.028620580211281776, + "learning_rate": 1.9233278955954323e-06, + "loss": 0.1799, + "ppo_loss": -0.9599780440330505, + "sft_loss": 0.10139136761426926, + "step": 58 + }, + { + "epoch": 0.14, + "grad_norm": 1.448854302904934, + "importance_ratio": 0.9609375, + "kl_div": -0.021149268373847008, + "kl_div_neg": -0.04163122922182083, + "kl_div_sft": -0.0006673082825727761, + "learning_rate": 1.9216965742251225e-06, + "loss": -0.0926, + "ppo_loss": 0.9592234492301941, + "sft_loss": 0.03335588797926903, + "step": 59 + }, + { + "epoch": 0.15, + "grad_norm": 1.4108705692743857, + "importance_ratio": 0.8984375, + "kl_div": -0.11035354435443878, + "kl_div_neg": -0.16702036559581757, + "kl_div_pos": -0.0536867156624794, + "learning_rate": 1.9200652528548123e-06, + "loss": 0.0644, + "ppo_loss": -0.0507732629776001, + "step": 60 + }, + { + "epoch": 0.15, + "grad_norm": 1.3722931053131302, + "kl_div": -0.012220574542880058, + "kl_div_sft": -0.012220574542880058, + "learning_rate": 1.9184339314845025e-06, + "loss": 0.0142, + "sft_loss": 0.12431100010871887, + "step": 61 + }, + { + "epoch": 0.15, + "grad_norm": 1.401147358629085, + "importance_ratio": 0.921875, + "kl_div": -0.07922801375389099, + "kl_div_neg": -0.055370330810546875, + "kl_div_pos": -0.10308569669723511, + "learning_rate": 1.9168026101141923e-06, + "loss": -0.1723, + "ppo_loss": 0.022042512893676758, + "step": 62 + }, + { + "epoch": 0.15, + "grad_norm": 2.6114266376273556, + "importance_ratio": 0.94140625, + "kl_div": -0.02723909728229046, + "kl_div_pos": -0.06206502392888069, + "kl_div_sft": 0.007586829364299774, + "learning_rate": 1.9151712887438825e-06, + "loss": -0.0016, + "ppo_loss": -0.9398217797279358, + "sft_loss": 0.08087282627820969, + "step": 63 + }, + { + "epoch": 0.16, + "grad_norm": 1.5447770351026526, + "kl_div": -0.002401808276772499, + "kl_div_sft": -0.002401808276772499, + "learning_rate": 1.9135399673735727e-06, + "loss": -0.0636, + "sft_loss": 0.08615696430206299, + "step": 64 + }, + { + "epoch": 0.16, + "grad_norm": 1.4003828785361496, + "kl_div": 0.0021121015306562185, + "kl_div_sft": 0.0021121015306562185, + "learning_rate": 1.9119086460032624e-06, + "loss": 0.0677, + "sft_loss": 0.06753754615783691, + "step": 65 + }, + { + "epoch": 0.16, + "grad_norm": 1.3365684045710704, + "importance_ratio": 0.984375, + "kl_div": -0.017900364473462105, + "kl_div_pos": -0.017900364473462105, + "learning_rate": 1.9102773246329526e-06, + "loss": -0.0189, + "ppo_loss": -0.9823896884918213, + "step": 66 + }, + { + "epoch": 0.16, + "grad_norm": 1.4343510843913625, + "importance_ratio": 0.79296875, + "kl_div": -0.1421518474817276, + "kl_div_neg": -0.2302280217409134, + "kl_div_sft": -0.054075662046670914, + "learning_rate": 1.9086460032626424e-06, + "loss": -0.0862, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.1580251157283783, + "step": 67 + }, + { + "epoch": 0.16, + "grad_norm": 1.8832227331223539, + "importance_ratio": 0.8125, + "kl_div": -0.20741838216781616, + "kl_div_neg": -0.2551022469997406, + "kl_div_pos": -0.15973453223705292, + "learning_rate": 1.9070146818923328e-06, + "loss": 0.0184, + "ppo_loss": -0.02618500590324402, + "step": 68 + }, + { + "epoch": 0.17, + "grad_norm": 1.6859843193673505, + "importance_ratio": 0.96875, + "kl_div": -0.00505702942609787, + "kl_div_pos": -0.029736729338765144, + "kl_div_sft": 0.019622670486569405, + "learning_rate": 1.9053833605220226e-06, + "loss": 0.002, + "ppo_loss": -0.9707010388374329, + "sft_loss": 0.1003195270895958, + "step": 69 + }, + { + "epoch": 0.17, + "grad_norm": 1.2920918328920994, + "importance_ratio": 0.94140625, + "kl_div": -0.026884760707616806, + "kl_div_pos": -0.06142554432153702, + "kl_div_sft": 0.007656024768948555, + "learning_rate": 1.9037520391517128e-06, + "loss": -0.0516, + "ppo_loss": -0.9404229521751404, + "sft_loss": 0.05460807681083679, + "step": 70 + }, + { + "epoch": 0.17, + "grad_norm": 1.6652357952873371, + "kl_div": -0.01144311111420393, + "kl_div_sft": -0.01144311111420393, + "learning_rate": 1.902120717781403e-06, + "loss": 0.1146, + "sft_loss": 0.05612169951200485, + "step": 71 + }, + { + "epoch": 0.17, + "grad_norm": 1.4722564721272102, + "importance_ratio": 0.93359375, + "kl_div": -0.03432208672165871, + "kl_div_pos": -0.06942012161016464, + "kl_div_sft": 0.0007759497966617346, + "learning_rate": 1.9004893964110927e-06, + "loss": 0.0532, + "ppo_loss": -0.9329346418380737, + "sft_loss": 0.04070064797997475, + "step": 72 + }, + { + "epoch": 0.18, + "grad_norm": 1.096017503431569, + "importance_ratio": 0.98046875, + "kl_div": -0.0019319439306855202, + "kl_div_pos": -0.019157661125063896, + "kl_div_sft": 0.015293773263692856, + "learning_rate": 1.898858075040783e-06, + "loss": -0.0507, + "ppo_loss": -0.9810246825218201, + "sft_loss": 0.07127617299556732, + "step": 73 + }, + { + "epoch": 0.18, + "grad_norm": 2.8056491064037496, + "importance_ratio": 0.96484375, + "kl_div": -0.03702807426452637, + "kl_div_pos": -0.03702807426452637, + "learning_rate": 1.897226753670473e-06, + "loss": -0.0432, + "ppo_loss": -0.9645748138427734, + "step": 74 + }, + { + "epoch": 0.18, + "grad_norm": 1.6776358205783002, + "kl_div": 0.01326986588537693, + "kl_div_sft": 0.01326986588537693, + "learning_rate": 1.8955954323001631e-06, + "loss": -0.0721, + "sft_loss": 0.03585825115442276, + "step": 75 + }, + { + "epoch": 0.18, + "grad_norm": 1.3208584531366925, + "importance_ratio": 0.890625, + "kl_div": -0.10156667232513428, + "kl_div_neg": -0.11789321154356003, + "kl_div_sft": -0.08524013310670853, + "learning_rate": 1.8939641109298531e-06, + "loss": 0.0193, + "ppo_loss": 0.8887909650802612, + "sft_loss": 0.1250036656856537, + "step": 76 + }, + { + "epoch": 0.19, + "grad_norm": 1.234519805140591, + "importance_ratio": 0.9765625, + "kl_div": -0.03858345001935959, + "kl_div_neg": -0.02181975729763508, + "kl_div_sft": -0.05534714460372925, + "learning_rate": 1.892332789559543e-06, + "loss": 0.0532, + "ppo_loss": 0.9784166216850281, + "sft_loss": 0.10421111434698105, + "step": 77 + }, + { + "epoch": 0.19, + "grad_norm": 1.4403118331023692, + "importance_ratio": 0.95703125, + "kl_div": -0.01925833150744438, + "kl_div_pos": -0.04326550289988518, + "kl_div_sft": 0.0047488403506577015, + "learning_rate": 1.8907014681892333e-06, + "loss": -0.0328, + "ppo_loss": -0.9576570987701416, + "sft_loss": 0.025720076635479927, + "step": 78 + }, + { + "epoch": 0.19, + "grad_norm": 3.560108787717491, + "importance_ratio": 0.98046875, + "kl_div": -0.019890496507287025, + "kl_div_neg": -0.036975789815187454, + "kl_div_pos": -0.0028052027337253094, + "learning_rate": 1.8890701468189233e-06, + "loss": 0.0458, + "ppo_loss": -0.016749650239944458, + "step": 79 + }, + { + "epoch": 0.19, + "grad_norm": 1.9435713609254912, + "kl_div": -0.005440596025437117, + "kl_div_sft": -0.005440596025437117, + "learning_rate": 1.8874388254486133e-06, + "loss": -0.1137, + "sft_loss": 0.07730162888765335, + "step": 80 + }, + { + "epoch": 0.2, + "grad_norm": 4.148207835936149, + "kl_div": -0.011881545186042786, + "kl_div_sft": -0.011881545186042786, + "learning_rate": 1.8858075040783032e-06, + "loss": 0.0925, + "sft_loss": 0.06615802645683289, + "step": 81 + }, + { + "epoch": 0.2, + "grad_norm": 16.686075328347325, + "importance_ratio": 0.8828125, + "kl_div": -0.12083092331886292, + "kl_div_neg": -0.10571560263633728, + "kl_div_pos": -0.13594624400138855, + "learning_rate": 1.8841761827079934e-06, + "loss": -0.0205, + "ppo_loss": 0.013395458459854126, + "step": 82 + }, + { + "epoch": 0.2, + "grad_norm": 1.3354719365783643, + "kl_div": -0.0251028873026371, + "kl_div_sft": -0.0251028873026371, + "learning_rate": 1.8825448613376836e-06, + "loss": 0.009, + "sft_loss": 0.09680324792861938, + "step": 83 + }, + { + "epoch": 0.2, + "grad_norm": 3.219100044253, + "importance_ratio": 0.953125, + "kl_div": -0.02121526561677456, + "kl_div_pos": -0.04778638482093811, + "kl_div_sft": 0.00535585219040513, + "learning_rate": 1.8809135399673734e-06, + "loss": 0.0199, + "ppo_loss": -0.9533373713493347, + "sft_loss": 0.05247313529253006, + "step": 84 + }, + { + "epoch": 0.21, + "grad_norm": 9.536574780544736, + "importance_ratio": 0.94921875, + "kl_div": -0.05309019237756729, + "kl_div_pos": -0.05309019237756729, + "learning_rate": 1.8792822185970636e-06, + "loss": -0.1631, + "ppo_loss": -0.9483247399330139, + "step": 85 + }, + { + "epoch": 0.21, + "grad_norm": 1.109040115745377, + "importance_ratio": 0.875, + "kl_div": -0.14318667352199554, + "kl_div_pos": -0.14318667352199554, + "learning_rate": 1.8776508972267536e-06, + "loss": -0.0027, + "ppo_loss": -0.8744655251502991, + "step": 86 + }, + { + "epoch": 0.21, + "grad_norm": 3.7104905672476503, + "importance_ratio": 0.8515625, + "kl_div": -0.17779727280139923, + "kl_div_neg": -0.352699339389801, + "kl_div_pos": -0.002895209938287735, + "learning_rate": 1.8760195758564436e-06, + "loss": 0.0602, + "ppo_loss": -0.09855446219444275, + "step": 87 + }, + { + "epoch": 0.21, + "grad_norm": 14.546277092304194, + "kl_div": 0.004458991345018148, + "kl_div_sft": 0.004458991345018148, + "learning_rate": 1.8743882544861336e-06, + "loss": 0.0783, + "sft_loss": 0.034892488270998, + "step": 88 + }, + { + "epoch": 0.22, + "grad_norm": 10.241466784925795, + "importance_ratio": 0.94140625, + "kl_div": -0.059860531240701675, + "kl_div_pos": -0.059860531240701675, + "learning_rate": 1.8727569331158237e-06, + "loss": 0.0203, + "ppo_loss": -0.9426825046539307, + "step": 89 + }, + { + "epoch": 0.22, + "grad_norm": 1.1149219937257968, + "importance_ratio": 0.96875, + "kl_div": -0.032899901270866394, + "kl_div_neg": -0.01882491260766983, + "kl_div_pos": -0.04697488993406296, + "learning_rate": 1.871125611745514e-06, + "loss": 0.0003, + "ppo_loss": 0.013619929552078247, + "step": 90 + }, + { + "epoch": 0.22, + "grad_norm": 2.920325536479396, + "importance_ratio": 0.9765625, + "kl_div": -0.016603615134954453, + "kl_div_pos": -0.022303353995084763, + "kl_div_sft": -0.010903875343501568, + "learning_rate": 1.8694942903752037e-06, + "loss": -0.0031, + "ppo_loss": -0.9779435396194458, + "sft_loss": 0.06654062867164612, + "step": 91 + }, + { + "epoch": 0.22, + "grad_norm": 1.1270961307121106, + "importance_ratio": 0.84375, + "kl_div": -0.08473112434148788, + "kl_div_neg": -0.1700011044740677, + "kl_div_sft": 0.0005388528225012124, + "learning_rate": 1.867862969004894e-06, + "loss": -0.0543, + "ppo_loss": 0.8436638712882996, + "sft_loss": 0.05119462311267853, + "step": 92 + }, + { + "epoch": 0.23, + "grad_norm": 5.17656096721227, + "importance_ratio": 0.8203125, + "kl_div": -0.1019870936870575, + "kl_div_neg": -0.19734208285808563, + "kl_div_sft": -0.006632108241319656, + "learning_rate": 1.866231647634584e-06, + "loss": -0.086, + "ppo_loss": 0.8209097981452942, + "sft_loss": 0.06681513786315918, + "step": 93 + }, + { + "epoch": 0.23, + "grad_norm": 4.32968981717673, + "importance_ratio": 0.953125, + "kl_div": -0.05050528049468994, + "kl_div_neg": -0.04812584072351456, + "kl_div_pos": -0.052884723991155624, + "learning_rate": 1.864600326264274e-06, + "loss": 0.0194, + "ppo_loss": 0.0022622644901275635, + "step": 94 + }, + { + "epoch": 0.23, + "grad_norm": 1.1813542799359134, + "kl_div": 0.00850912369787693, + "kl_div_sft": 0.00850912369787693, + "learning_rate": 1.862969004893964e-06, + "loss": -0.1158, + "sft_loss": 0.05716530978679657, + "step": 95 + }, + { + "epoch": 0.23, + "grad_norm": 1.0067802880019683, + "kl_div": -0.0733400210738182, + "kl_div_sft": -0.0733400210738182, + "learning_rate": 1.861337683523654e-06, + "loss": -0.1477, + "sft_loss": 0.12137281149625778, + "step": 96 + }, + { + "epoch": 0.24, + "grad_norm": 2.1790317656541816, + "importance_ratio": 0.8125, + "kl_div": -0.20977582037448883, + "kl_div_neg": -0.20977582037448883, + "learning_rate": 1.8597063621533443e-06, + "loss": 0.0704, + "ppo_loss": 0.8167802095413208, + "step": 97 + }, + { + "epoch": 0.24, + "grad_norm": 1.5017800894920748, + "kl_div": 0.0028979559428989887, + "kl_div_sft": 0.0028979559428989887, + "learning_rate": 1.858075040783034e-06, + "loss": 0.0027, + "sft_loss": 0.04274223372340202, + "step": 98 + }, + { + "epoch": 0.24, + "grad_norm": 3.253236186447958, + "kl_div": -0.0038328543305397034, + "kl_div_sft": -0.0038328543305397034, + "learning_rate": 1.8564437194127242e-06, + "loss": -0.0125, + "sft_loss": 0.06729351729154587, + "step": 99 + }, + { + "epoch": 0.24, + "grad_norm": 1.1924399472356528, + "importance_ratio": 0.9921875, + "kl_div": -0.017330992966890335, + "kl_div_pos": -0.006222095340490341, + "kl_div_sft": -0.028439892455935478, + "learning_rate": 1.8548123980424142e-06, + "loss": -0.111, + "ppo_loss": -0.993797242641449, + "sft_loss": 0.14494113624095917, + "step": 100 + }, + { + "epoch": 0.24, + "grad_norm": 4.0589767436892945, + "importance_ratio": 1.0, + "kl_div": 0.003675918560475111, + "kl_div_pos": 0.002127651358023286, + "kl_div_sft": 0.0052241855300962925, + "learning_rate": 1.8531810766721044e-06, + "loss": -0.0991, + "ppo_loss": -1.0021299123764038, + "sft_loss": 0.04997418820858002, + "step": 101 + }, + { + "epoch": 0.25, + "grad_norm": 3.0079827594871733, + "importance_ratio": 0.8515625, + "kl_div": -0.15740655362606049, + "kl_div_neg": -0.1272001564502716, + "kl_div_pos": -0.18761295080184937, + "learning_rate": 1.8515497553017944e-06, + "loss": -0.106, + "ppo_loss": 0.025810927152633667, + "step": 102 + }, + { + "epoch": 0.25, + "grad_norm": 6.991458586308189, + "kl_div": -0.018888656049966812, + "kl_div_sft": -0.018888656049966812, + "learning_rate": 1.8499184339314844e-06, + "loss": -0.0726, + "sft_loss": 0.12873780727386475, + "step": 103 + }, + { + "epoch": 0.25, + "grad_norm": 2.0316232191681327, + "importance_ratio": 0.984375, + "kl_div": -0.018464138731360435, + "kl_div_neg": -0.03176502138376236, + "kl_div_pos": -0.005163257010281086, + "learning_rate": 1.8482871125611746e-06, + "loss": 0.0248, + "ppo_loss": -0.013057917356491089, + "step": 104 + }, + { + "epoch": 0.25, + "grad_norm": 19.860864758982913, + "importance_ratio": 1.0, + "kl_div": -0.0046781618148088455, + "kl_div_neg": 0.002820936730131507, + "kl_div_sft": -0.012177260592579842, + "learning_rate": 1.8466557911908646e-06, + "loss": -0.0997, + "ppo_loss": 1.0028249025344849, + "sft_loss": 0.1462474763393402, + "step": 105 + }, + { + "epoch": 0.26, + "grad_norm": 1.94309540302439, + "importance_ratio": 0.984375, + "kl_div": -0.02095239982008934, + "kl_div_pos": -0.016821056604385376, + "kl_div_sft": -0.025083741173148155, + "learning_rate": 1.8450244698205545e-06, + "loss": 0.0472, + "ppo_loss": -0.9833196401596069, + "sft_loss": 0.09222033619880676, + "step": 106 + }, + { + "epoch": 0.26, + "grad_norm": 8.829977681703108, + "kl_div": 0.006691737566143274, + "kl_div_sft": 0.006691737566143274, + "learning_rate": 1.8433931484502445e-06, + "loss": -0.0236, + "sft_loss": 0.03618942201137543, + "step": 107 + }, + { + "epoch": 0.26, + "grad_norm": 7.877428921518379, + "importance_ratio": 0.859375, + "kl_div": -0.1552104353904724, + "kl_div_neg": -0.21654579043388367, + "kl_div_pos": -0.09387508779764175, + "learning_rate": 1.8417618270799347e-06, + "loss": 0.0563, + "ppo_loss": -0.05255037546157837, + "step": 108 + }, + { + "epoch": 0.26, + "grad_norm": 1.4240924186987443, + "importance_ratio": 0.984375, + "kl_div": -0.012207446619868279, + "kl_div_pos": -0.01420664507895708, + "kl_div_sft": -0.01020824909210205, + "learning_rate": 1.8401305057096247e-06, + "loss": 0.0522, + "ppo_loss": -0.9858937859535217, + "sft_loss": 0.0927015170454979, + "step": 109 + }, + { + "epoch": 0.27, + "grad_norm": 1.156184478140483, + "importance_ratio": 0.8984375, + "kl_div": -0.11182701587677002, + "kl_div_neg": -0.11182701587677002, + "learning_rate": 1.8384991843393147e-06, + "loss": -0.1422, + "ppo_loss": 0.8957895636558533, + "step": 110 + }, + { + "epoch": 0.27, + "grad_norm": 6.277832012261074, + "importance_ratio": 0.9140625, + "kl_div": -0.09246876835823059, + "kl_div_neg": -0.16997133195400238, + "kl_div_pos": -0.014966201968491077, + "learning_rate": 1.8368678629690049e-06, + "loss": 0.0615, + "ppo_loss": -0.07072815299034119, + "step": 111 + }, + { + "epoch": 0.27, + "grad_norm": 5.03266081369713, + "importance_ratio": 0.984375, + "kl_div": -0.015262942761182785, + "kl_div_pos": -0.015262942761182785, + "learning_rate": 1.8352365415986949e-06, + "loss": -0.0423, + "ppo_loss": -0.9848529696464539, + "step": 112 + }, + { + "epoch": 0.27, + "grad_norm": 1.023171193659411, + "importance_ratio": 0.78515625, + "kl_div": -0.12196735292673111, + "kl_div_pos": -0.2421884834766388, + "kl_div_sft": -0.0017462180694565177, + "learning_rate": 1.8336052202283848e-06, + "loss": -0.0038, + "ppo_loss": -0.7849081754684448, + "sft_loss": 0.05288619548082352, + "step": 113 + }, + { + "epoch": 0.28, + "grad_norm": 2.7586984913230066, + "importance_ratio": 0.98828125, + "kl_div": 0.003044874407351017, + "kl_div_pos": -0.013100683689117432, + "kl_div_sft": 0.019190432503819466, + "learning_rate": 1.8319738988580748e-06, + "loss": 0.0134, + "ppo_loss": -0.9869847893714905, + "sft_loss": 0.05788184702396393, + "step": 114 + }, + { + "epoch": 0.28, + "grad_norm": 1.3708875886950078, + "importance_ratio": 0.98828125, + "kl_div": -0.012799538671970367, + "kl_div_neg": -0.012748450972139835, + "kl_div_sft": -0.012850625440478325, + "learning_rate": 1.830342577487765e-06, + "loss": 0.0322, + "ppo_loss": 0.9873324632644653, + "sft_loss": 0.08000502735376358, + "step": 115 + }, + { + "epoch": 0.28, + "grad_norm": 1.1851864915885835, + "importance_ratio": 1.0, + "kl_div": -0.0011242360342293978, + "kl_div_neg": -0.00048666924703866243, + "kl_div_pos": -0.0017618027050048113, + "learning_rate": 1.8287112561174552e-06, + "loss": -0.0125, + "ppo_loss": 0.0006368458271026611, + "step": 116 + }, + { + "epoch": 0.28, + "grad_norm": 4.152021049240353, + "importance_ratio": 0.984375, + "kl_div": -0.009348939172923565, + "kl_div_pos": -0.015161776915192604, + "kl_div_sft": -0.003536101896315813, + "learning_rate": 1.827079934747145e-06, + "loss": 0.1734, + "ppo_loss": -0.9849525690078735, + "sft_loss": 0.11104224622249603, + "step": 117 + }, + { + "epoch": 0.29, + "grad_norm": 17.122627038204538, + "kl_div": -0.0030991281382739544, + "kl_div_sft": -0.0030991281382739544, + "learning_rate": 1.8254486133768352e-06, + "loss": 0.0684, + "sft_loss": 0.10727023333311081, + "step": 118 + }, + { + "epoch": 0.29, + "grad_norm": 4.130081745659531, + "importance_ratio": 0.75390625, + "kl_div": -0.14108410477638245, + "kl_div_neg": -0.28084099292755127, + "kl_div_sft": -0.0013272189535200596, + "learning_rate": 1.8238172920065252e-06, + "loss": -0.0169, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.059665389358997345, + "step": 119 + }, + { + "epoch": 0.29, + "grad_norm": 2.738881784016763, + "importance_ratio": 0.890625, + "kl_div": -0.12150909006595612, + "kl_div_neg": -0.06636663526296616, + "kl_div_pos": -0.17665155231952667, + "learning_rate": 1.8221859706362152e-06, + "loss": -0.0808, + "ppo_loss": 0.04885795712471008, + "step": 120 + }, + { + "epoch": 0.29, + "grad_norm": 1.8803545471430183, + "importance_ratio": 0.77734375, + "kl_div": -0.14616063237190247, + "kl_div_pos": -0.2531307339668274, + "kl_div_sft": -0.039190515875816345, + "learning_rate": 1.8205546492659054e-06, + "loss": 0.024, + "ppo_loss": -0.7763664722442627, + "sft_loss": 0.10987972468137741, + "step": 121 + }, + { + "epoch": 0.3, + "grad_norm": 1.1585058424583765, + "importance_ratio": 0.98046875, + "kl_div": -0.01550198346376419, + "kl_div_pos": -0.0198336411267519, + "kl_div_sft": -0.011170326732099056, + "learning_rate": 1.8189233278955953e-06, + "loss": 0.0286, + "ppo_loss": -0.9803617596626282, + "sft_loss": 0.06033976003527641, + "step": 122 + }, + { + "epoch": 0.3, + "grad_norm": 16.79258302723445, + "importance_ratio": 0.8828125, + "kl_div": -0.12821130454540253, + "kl_div_neg": -0.12821130454540253, + "learning_rate": 1.8172920065252855e-06, + "loss": -0.0829, + "ppo_loss": 0.8835910558700562, + "step": 123 + }, + { + "epoch": 0.3, + "grad_norm": 2.7903950390360417, + "importance_ratio": 0.70703125, + "kl_div": -0.17138634622097015, + "kl_div_neg": -0.3454788625240326, + "kl_div_sft": 0.002706160070374608, + "learning_rate": 1.8156606851549753e-06, + "loss": 0.0432, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029093803837895393, + "step": 124 + }, + { + "epoch": 0.3, + "grad_norm": 11.587931280741508, + "importance_ratio": 1.0078125, + "kl_div": 0.007452365942299366, + "kl_div_pos": 0.005459305830299854, + "kl_div_sft": 0.009445426054298878, + "learning_rate": 1.8140293637846655e-06, + "loss": -0.0357, + "ppo_loss": -1.0054742097854614, + "sft_loss": 0.08399739861488342, + "step": 125 + }, + { + "epoch": 0.31, + "grad_norm": 4.793488043002528, + "importance_ratio": 0.92578125, + "kl_div": -0.07985176146030426, + "kl_div_neg": -0.07985176146030426, + "learning_rate": 1.8123980424143555e-06, + "loss": 0.112, + "ppo_loss": 0.9262062907218933, + "step": 126 + }, + { + "epoch": 0.31, + "grad_norm": 4.294557036360072, + "kl_div": 0.005065533332526684, + "kl_div_sft": 0.005065533332526684, + "learning_rate": 1.8107667210440457e-06, + "loss": 0.1447, + "sft_loss": 0.04570557177066803, + "step": 127 + }, + { + "epoch": 0.31, + "grad_norm": 7.758244221056378, + "kl_div": -0.09475219994783401, + "kl_div_sft": -0.09475219994783401, + "learning_rate": 1.8091353996737357e-06, + "loss": -0.1566, + "sft_loss": 0.14630523324012756, + "step": 128 + }, + { + "epoch": 0.31, + "grad_norm": 1.2952268767730881, + "kl_div": -0.024958381429314613, + "kl_div_sft": -0.024958381429314613, + "learning_rate": 1.8075040783034257e-06, + "loss": 0.2292, + "sft_loss": 0.13158275187015533, + "step": 129 + }, + { + "epoch": 0.32, + "grad_norm": 2.1450926110253348, + "importance_ratio": 1.0, + "kl_div": 0.0020835120230913162, + "kl_div_pos": 0.0011660216841846704, + "kl_div_sft": 0.0030010021291673183, + "learning_rate": 1.8058727569331158e-06, + "loss": -0.0424, + "ppo_loss": -1.0011667013168335, + "sft_loss": 0.06473907083272934, + "step": 130 + }, + { + "epoch": 0.32, + "grad_norm": 10.508363208733325, + "importance_ratio": 0.890625, + "kl_div": -0.12170784175395966, + "kl_div_pos": -0.12170784175395966, + "learning_rate": 1.8042414355628056e-06, + "loss": -0.0534, + "ppo_loss": -0.8911383152008057, + "step": 131 + }, + { + "epoch": 0.32, + "grad_norm": 1.205441115749259, + "importance_ratio": 0.98046875, + "kl_div": -0.008304815739393234, + "kl_div_pos": -0.018010282889008522, + "kl_div_sft": 0.0014006514102220535, + "learning_rate": 1.8026101141924958e-06, + "loss": -0.1574, + "ppo_loss": -0.9821509718894958, + "sft_loss": 0.036297757178545, + "step": 132 + }, + { + "epoch": 0.32, + "grad_norm": 1.275700716932992, + "importance_ratio": 0.86328125, + "kl_div": -0.1585293859243393, + "kl_div_neg": -0.3000960350036621, + "kl_div_pos": -0.016962744295597076, + "learning_rate": 1.8009787928221858e-06, + "loss": -0.1364, + "ppo_loss": -0.09159013628959656, + "step": 133 + }, + { + "epoch": 0.32, + "grad_norm": 0.9975014706172953, + "importance_ratio": 0.89453125, + "kl_div": -0.11716045439243317, + "kl_div_neg": -0.2166241854429245, + "kl_div_pos": -0.01769673079252243, + "learning_rate": 1.799347471451876e-06, + "loss": 0.0028, + "ppo_loss": -0.08861318230628967, + "step": 134 + }, + { + "epoch": 0.33, + "grad_norm": 1.1355681478727442, + "importance_ratio": 0.72265625, + "kl_div": -0.15676546096801758, + "kl_div_pos": -0.32453951239585876, + "kl_div_sft": 0.011008601635694504, + "learning_rate": 1.797716150081566e-06, + "loss": -0.1041, + "ppo_loss": -0.7228600978851318, + "sft_loss": 0.046805497258901596, + "step": 135 + }, + { + "epoch": 0.33, + "grad_norm": 3.1212162762440574, + "importance_ratio": 0.9375, + "kl_div": -0.06198743358254433, + "kl_div_pos": -0.06198743358254433, + "learning_rate": 1.796084828711256e-06, + "loss": 0.0549, + "ppo_loss": -0.9419088959693909, + "step": 136 + }, + { + "epoch": 0.33, + "grad_norm": 1.5661286752630128, + "importance_ratio": 0.84375, + "kl_div": -0.17229585349559784, + "kl_div_neg": -0.17083480954170227, + "kl_div_pos": -0.1737568974494934, + "learning_rate": 1.7944535073409462e-06, + "loss": -0.0202, + "ppo_loss": 0.0012297630310058594, + "step": 137 + }, + { + "epoch": 0.33, + "grad_norm": 1.9203922165169252, + "importance_ratio": 0.80078125, + "kl_div": -0.19221439957618713, + "kl_div_neg": -0.2237863838672638, + "kl_div_sft": -0.16064240038394928, + "learning_rate": 1.7928221859706361e-06, + "loss": -0.0806, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.19409435987472534, + "step": 138 + }, + { + "epoch": 0.34, + "grad_norm": 1.159094739081435, + "kl_div": -0.06490548700094223, + "kl_div_sft": -0.06490548700094223, + "learning_rate": 1.7911908646003261e-06, + "loss": 0.0725, + "sft_loss": 0.1461164951324463, + "step": 139 + }, + { + "epoch": 0.34, + "grad_norm": 0.89539811555495, + "importance_ratio": 0.6953125, + "kl_div": -0.1836492419242859, + "kl_div_neg": -0.3647206723690033, + "kl_div_sft": -0.002577810548245907, + "learning_rate": 1.7895595432300161e-06, + "loss": -0.005, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040640685707330704, + "step": 140 + }, + { + "epoch": 0.34, + "grad_norm": 0.9498610658443112, + "importance_ratio": 1.015625, + "kl_div": 0.008194430731236935, + "kl_div_pos": 0.01923866756260395, + "kl_div_sft": -0.0028498058672994375, + "learning_rate": 1.7879282218597063e-06, + "loss": -0.0633, + "ppo_loss": -1.0194249153137207, + "sft_loss": 0.14637592434883118, + "step": 141 + }, + { + "epoch": 0.34, + "grad_norm": 0.9800355721849093, + "kl_div": -0.014961409382522106, + "kl_div_sft": -0.014961409382522106, + "learning_rate": 1.7862969004893965e-06, + "loss": -0.0662, + "sft_loss": 0.08139221370220184, + "step": 142 + }, + { + "epoch": 0.35, + "grad_norm": 2.483579496262967, + "importance_ratio": 0.87109375, + "kl_div": -0.0703229084610939, + "kl_div_neg": -0.14011670649051666, + "kl_div_sft": -0.0005291154375299811, + "learning_rate": 1.7846655791190863e-06, + "loss": -0.1198, + "ppo_loss": 0.8692567944526672, + "sft_loss": 0.04155682399868965, + "step": 143 + }, + { + "epoch": 0.35, + "grad_norm": 1.1562214409677711, + "kl_div": 0.002795197768136859, + "kl_div_sft": 0.002795197768136859, + "learning_rate": 1.7830342577487765e-06, + "loss": 0.1252, + "sft_loss": 0.0712403953075409, + "step": 144 + }, + { + "epoch": 0.35, + "grad_norm": 5.743535263812712, + "importance_ratio": 0.84765625, + "kl_div": -0.1651565581560135, + "kl_div_neg": -0.1651565581560135, + "learning_rate": 1.7814029363784665e-06, + "loss": 0.0082, + "ppo_loss": 0.8477612733840942, + "step": 145 + }, + { + "epoch": 0.35, + "grad_norm": 3.1622814298948647, + "kl_div": 0.0015609723050147295, + "kl_div_sft": 0.0015609723050147295, + "learning_rate": 1.7797716150081564e-06, + "loss": 0.0026, + "sft_loss": 0.046822816133499146, + "step": 146 + }, + { + "epoch": 0.36, + "grad_norm": 1.2081844906829902, + "kl_div": -0.019221793860197067, + "kl_div_sft": -0.019221793860197067, + "learning_rate": 1.7781402936378466e-06, + "loss": -0.0855, + "sft_loss": 0.09771668165922165, + "step": 147 + }, + { + "epoch": 0.36, + "grad_norm": 1.20771538071317, + "importance_ratio": 0.890625, + "kl_div": -0.061765387654304504, + "kl_div_neg": -0.11600951105356216, + "kl_div_sft": -0.007521265652030706, + "learning_rate": 1.7765089722675366e-06, + "loss": 0.0313, + "ppo_loss": 0.8904667496681213, + "sft_loss": 0.05933324247598648, + "step": 148 + }, + { + "epoch": 0.36, + "grad_norm": 1.6411811249365569, + "kl_div": 0.013755328953266144, + "kl_div_sft": 0.013755328953266144, + "learning_rate": 1.7748776508972268e-06, + "loss": -0.1119, + "sft_loss": 0.07278871536254883, + "step": 149 + }, + { + "epoch": 0.36, + "grad_norm": 3.13531211771659, + "importance_ratio": 0.93359375, + "kl_div": -0.03333902359008789, + "kl_div_neg": -0.0697193294763565, + "kl_div_sft": 0.003041281597688794, + "learning_rate": 1.7732463295269166e-06, + "loss": 0.1347, + "ppo_loss": 0.9326555132865906, + "sft_loss": 0.05647405609488487, + "step": 150 + }, + { + "epoch": 0.37, + "grad_norm": 0.9084137703047128, + "kl_div": 0.006213179789483547, + "kl_div_sft": 0.006213179789483547, + "learning_rate": 1.7716150081566068e-06, + "loss": 0.0022, + "sft_loss": 0.073430135846138, + "step": 151 + }, + { + "epoch": 0.37, + "grad_norm": 29.33578873963614, + "importance_ratio": 0.8671875, + "kl_div": -0.06711047887802124, + "kl_div_pos": -0.1426703929901123, + "kl_div_sft": 0.008449428714811802, + "learning_rate": 1.7699836867862968e-06, + "loss": 0.0391, + "ppo_loss": -0.8670398592948914, + "sft_loss": 0.025338491424918175, + "step": 152 + }, + { + "epoch": 0.37, + "grad_norm": 3.212689369646534, + "importance_ratio": 0.84375, + "kl_div": -0.17463237047195435, + "kl_div_neg": -0.29152414202690125, + "kl_div_pos": -0.05774059146642685, + "learning_rate": 1.768352365415987e-06, + "loss": -0.0389, + "ppo_loss": -0.0719473659992218, + "step": 153 + }, + { + "epoch": 0.37, + "grad_norm": 1.5889537672803584, + "importance_ratio": 0.8125, + "kl_div": -0.10110513865947723, + "kl_div_neg": -0.2069135308265686, + "kl_div_sft": 0.0047032469883561134, + "learning_rate": 1.766721044045677e-06, + "loss": 0.0353, + "ppo_loss": 0.8130899667739868, + "sft_loss": 0.08691839873790741, + "step": 154 + }, + { + "epoch": 0.38, + "grad_norm": 1.4739593487883216, + "importance_ratio": 0.96875, + "kl_div": -0.03229506313800812, + "kl_div_neg": -0.01941351592540741, + "kl_div_pos": -0.04517660662531853, + "learning_rate": 1.765089722675367e-06, + "loss": -0.0083, + "ppo_loss": 0.012472540140151978, + "step": 155 + }, + { + "epoch": 0.38, + "grad_norm": 1.7783165585490306, + "importance_ratio": 0.859375, + "kl_div": -0.0714777484536171, + "kl_div_neg": -0.15110665559768677, + "kl_div_sft": 0.008151160553097725, + "learning_rate": 1.7634584013050571e-06, + "loss": 0.0427, + "ppo_loss": 0.8597559928894043, + "sft_loss": 0.048410769551992416, + "step": 156 + }, + { + "epoch": 0.38, + "grad_norm": 1.2124782973244714, + "importance_ratio": 0.9375, + "kl_div": -0.07026248425245285, + "kl_div_neg": -0.07026248425245285, + "learning_rate": 1.761827079934747e-06, + "loss": 0.1084, + "ppo_loss": 0.9341276288032532, + "step": 157 + }, + { + "epoch": 0.38, + "grad_norm": 3.2926519022016683, + "importance_ratio": 0.90234375, + "kl_div": -0.10565493255853653, + "kl_div_neg": -0.18504543602466583, + "kl_div_pos": -0.026264430955052376, + "learning_rate": 1.760195758564437e-06, + "loss": -0.0399, + "ppo_loss": -0.07150548696517944, + "step": 158 + }, + { + "epoch": 0.39, + "grad_norm": 1.6225790182659727, + "importance_ratio": 0.9609375, + "kl_div": -0.04186892881989479, + "kl_div_neg": -0.04186892881989479, + "learning_rate": 1.758564437194127e-06, + "loss": -0.2452, + "ppo_loss": 0.959962010383606, + "step": 159 + }, + { + "epoch": 0.39, + "grad_norm": 0.9470105643846278, + "kl_div": -0.11340246349573135, + "kl_div_sft": -0.11340246349573135, + "learning_rate": 1.7569331158238173e-06, + "loss": -0.063, + "sft_loss": 0.21769794821739197, + "step": 160 + }, + { + "epoch": 0.39, + "grad_norm": 3.348421060625913, + "importance_ratio": 0.76171875, + "kl_div": -0.1341472864151001, + "kl_div_neg": -0.27457693219184875, + "kl_div_sft": 0.006282369140535593, + "learning_rate": 1.7553017944535073e-06, + "loss": -0.0256, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.033535126596689224, + "step": 161 + }, + { + "epoch": 0.39, + "grad_norm": 7.443811639615435, + "importance_ratio": 0.9453125, + "kl_div": -0.06028685346245766, + "kl_div_neg": -0.06028685346245766, + "learning_rate": 1.7536704730831972e-06, + "loss": 0.1934, + "ppo_loss": 0.9425788521766663, + "step": 162 + }, + { + "epoch": 0.4, + "grad_norm": 3.5188537690765913, + "importance_ratio": 0.90234375, + "kl_div": -0.04902525618672371, + "kl_div_pos": -0.10362156480550766, + "kl_div_sft": 0.005571051966398954, + "learning_rate": 1.7520391517128874e-06, + "loss": -0.0139, + "ppo_loss": -0.9015664458274841, + "sft_loss": 0.07571996748447418, + "step": 163 + }, + { + "epoch": 0.4, + "grad_norm": 3.0153035205000087, + "kl_div": 0.014165805652737617, + "kl_div_sft": 0.014165805652737617, + "learning_rate": 1.7504078303425774e-06, + "loss": -0.104, + "sft_loss": 0.040061578154563904, + "step": 164 + }, + { + "epoch": 0.4, + "grad_norm": 10.233874406292543, + "importance_ratio": 0.90625, + "kl_div": -0.10461732745170593, + "kl_div_neg": -0.2085922807455063, + "kl_div_pos": -0.0006423748563975096, + "learning_rate": 1.7487765089722674e-06, + "loss": 0.0406, + "ppo_loss": -0.0938158929347992, + "step": 165 + }, + { + "epoch": 0.4, + "grad_norm": 4.848785736038836, + "importance_ratio": 0.9609375, + "kl_div": -0.01604663021862507, + "kl_div_neg": -0.03924839571118355, + "kl_div_sft": 0.007155134342610836, + "learning_rate": 1.7471451876019576e-06, + "loss": 0.0303, + "ppo_loss": 0.9615119099617004, + "sft_loss": 0.02133319526910782, + "step": 166 + }, + { + "epoch": 0.4, + "grad_norm": 2.66964325082396, + "importance_ratio": 0.85546875, + "kl_div": -0.07945609837770462, + "kl_div_pos": -0.1554151475429535, + "kl_div_sft": -0.003497056197375059, + "learning_rate": 1.7455138662316476e-06, + "loss": -0.0582, + "ppo_loss": -0.856059730052948, + "sft_loss": 0.14876271784305573, + "step": 167 + }, + { + "epoch": 0.41, + "grad_norm": 8.793669799352205, + "importance_ratio": 0.8203125, + "kl_div": -0.21741493046283722, + "kl_div_pos": -0.21741493046283722, + "learning_rate": 1.7438825448613378e-06, + "loss": 0.1567, + "ppo_loss": -0.820621907711029, + "step": 168 + }, + { + "epoch": 0.41, + "grad_norm": 1.0459297239608907, + "importance_ratio": 0.74609375, + "kl_div": -0.14068058133125305, + "kl_div_neg": -0.2933712899684906, + "kl_div_sft": 0.012010131031274796, + "learning_rate": 1.7422512234910276e-06, + "loss": 0.073, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04012210667133331, + "step": 169 + }, + { + "epoch": 0.41, + "grad_norm": 29.005732528782133, + "kl_div": -0.2703365087509155, + "kl_div_sft": -0.2703365087509155, + "learning_rate": 1.7406199021207178e-06, + "loss": -0.1284, + "sft_loss": 0.33477458357810974, + "step": 170 + }, + { + "epoch": 0.41, + "grad_norm": 1.2141234786382378, + "importance_ratio": 0.8046875, + "kl_div": -0.2189944088459015, + "kl_div_neg": -0.20113199949264526, + "kl_div_pos": -0.23685681819915771, + "learning_rate": 1.7389885807504077e-06, + "loss": 0.0753, + "ppo_loss": 0.014350086450576782, + "step": 171 + }, + { + "epoch": 0.42, + "grad_norm": 5.335803095854867, + "importance_ratio": 0.875, + "kl_div": -0.1390659213066101, + "kl_div_neg": -0.2621629536151886, + "kl_div_pos": -0.015968896448612213, + "learning_rate": 1.7373572593800977e-06, + "loss": -0.1297, + "ppo_loss": -0.09207895398139954, + "step": 172 + }, + { + "epoch": 0.42, + "grad_norm": 4.069993370856808, + "importance_ratio": 0.84765625, + "kl_div": -0.08496677875518799, + "kl_div_pos": -0.16756807267665863, + "kl_div_sft": -0.002365480177104473, + "learning_rate": 1.735725938009788e-06, + "loss": 0.0029, + "ppo_loss": -0.845719039440155, + "sft_loss": 0.07888627797365189, + "step": 173 + }, + { + "epoch": 0.42, + "grad_norm": 2.3106871019829285, + "importance_ratio": 0.87890625, + "kl_div": -0.12879464030265808, + "kl_div_pos": -0.12879464030265808, + "learning_rate": 1.734094616639478e-06, + "loss": -0.1655, + "ppo_loss": -0.8791550993919373, + "step": 174 + }, + { + "epoch": 0.42, + "grad_norm": 1.373166075045685, + "importance_ratio": 0.78515625, + "kl_div": -0.11861623823642731, + "kl_div_neg": -0.243907630443573, + "kl_div_sft": 0.006675161421298981, + "learning_rate": 1.732463295269168e-06, + "loss": 0.0095, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0354793556034565, + "step": 175 + }, + { + "epoch": 0.43, + "grad_norm": 1.2951290833639222, + "importance_ratio": 0.9921875, + "kl_div": -0.008589332923293114, + "kl_div_pos": -0.008589332923293114, + "learning_rate": 1.7308319738988579e-06, + "loss": -0.0586, + "ppo_loss": -0.9914485216140747, + "step": 176 + }, + { + "epoch": 0.43, + "grad_norm": 1.0655123698654576, + "importance_ratio": 0.70703125, + "kl_div": -0.17774005234241486, + "kl_div_neg": -0.3439978063106537, + "kl_div_sft": -0.011482291854918003, + "learning_rate": 1.729200652528548e-06, + "loss": -0.0304, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0535435788333416, + "step": 177 + }, + { + "epoch": 0.43, + "grad_norm": 4.031029983549054, + "importance_ratio": 0.859375, + "kl_div": -0.15722407400608063, + "kl_div_neg": -0.03494102135300636, + "kl_div_pos": -0.2795071303844452, + "learning_rate": 1.727569331158238e-06, + "loss": 0.025, + "ppo_loss": 0.1047530472278595, + "step": 178 + }, + { + "epoch": 0.43, + "grad_norm": 7.814317659645009, + "importance_ratio": 0.890625, + "kl_div": -0.07359719276428223, + "kl_div_neg": -0.11783038079738617, + "kl_div_sft": -0.029364006593823433, + "learning_rate": 1.7259380097879282e-06, + "loss": -0.0495, + "ppo_loss": 0.8888468742370605, + "sft_loss": 0.14391446113586426, + "step": 179 + }, + { + "epoch": 0.44, + "grad_norm": 7.462387375484931, + "importance_ratio": 0.9453125, + "kl_div": -0.06070178002119064, + "kl_div_neg": -0.06070178002119064, + "learning_rate": 1.7243066884176182e-06, + "loss": -0.0258, + "ppo_loss": 0.9421994686126709, + "step": 180 + }, + { + "epoch": 0.44, + "grad_norm": 2.2803669095537016, + "importance_ratio": 0.8046875, + "kl_div": -0.17116063833236694, + "kl_div_neg": -0.21698014438152313, + "kl_div_sft": -0.12534114718437195, + "learning_rate": 1.7226753670473082e-06, + "loss": -0.006, + "ppo_loss": 0.8049459457397461, + "sft_loss": 0.2503657639026642, + "step": 181 + }, + { + "epoch": 0.44, + "grad_norm": 0.8677791261282602, + "kl_div": -0.007281461730599403, + "kl_div_sft": -0.007281461730599403, + "learning_rate": 1.7210440456769984e-06, + "loss": -0.111, + "sft_loss": 0.06887984275817871, + "step": 182 + }, + { + "epoch": 0.44, + "grad_norm": 3.1503894141349194, + "importance_ratio": 1.0234375, + "kl_div": 0.006265717558562756, + "kl_div_neg": 0.0197683684527874, + "kl_div_sft": -0.007236933335661888, + "learning_rate": 1.7194127243066882e-06, + "loss": 0.0205, + "ppo_loss": 1.0199650526046753, + "sft_loss": 0.061343319714069366, + "step": 183 + }, + { + "epoch": 0.45, + "grad_norm": 6.572701218384443, + "importance_ratio": 0.984375, + "kl_div": -0.019500788301229477, + "kl_div_pos": -0.019500788301229477, + "learning_rate": 1.7177814029363784e-06, + "loss": -0.1272, + "ppo_loss": -0.9808427095413208, + "step": 184 + }, + { + "epoch": 0.45, + "grad_norm": 2.2136950869779586, + "importance_ratio": 0.83203125, + "kl_div": -0.19927285611629486, + "kl_div_neg": -0.19927285611629486, + "learning_rate": 1.7161500815660684e-06, + "loss": 0.1022, + "ppo_loss": 0.8878340721130371, + "step": 185 + }, + { + "epoch": 0.45, + "grad_norm": 1.0749154345939593, + "importance_ratio": 0.9765625, + "kl_div": -0.007401918526738882, + "kl_div_pos": -0.023486772552132607, + "kl_div_sft": 0.008682935498654842, + "learning_rate": 1.7145187601957586e-06, + "loss": -0.0568, + "ppo_loss": -0.9767869114875793, + "sft_loss": 0.038777194917201996, + "step": 186 + }, + { + "epoch": 0.45, + "grad_norm": 3.193871506621026, + "importance_ratio": 0.8203125, + "kl_div": -0.2192690223455429, + "kl_div_neg": -0.44032028317451477, + "kl_div_pos": 0.0017822531517595053, + "learning_rate": 1.7128874388254485e-06, + "loss": 0.0145, + "ppo_loss": -0.10089191794395447, + "step": 187 + }, + { + "epoch": 0.46, + "grad_norm": 9.937608250442247, + "importance_ratio": 0.984375, + "kl_div": -0.01901225373148918, + "kl_div_neg": -0.009077527560293674, + "kl_div_pos": -0.028946978971362114, + "learning_rate": 1.7112561174551385e-06, + "loss": 0.0223, + "ppo_loss": 0.009747833013534546, + "step": 188 + }, + { + "epoch": 0.46, + "grad_norm": 2.626869262718091, + "importance_ratio": 0.98046875, + "kl_div": -0.015202803537249565, + "kl_div_pos": -0.02008308283984661, + "kl_div_sft": -0.01032252423465252, + "learning_rate": 1.7096247960848287e-06, + "loss": 0.0366, + "ppo_loss": -0.9801172018051147, + "sft_loss": 0.08730190247297287, + "step": 189 + }, + { + "epoch": 0.46, + "grad_norm": 1.7644146171659656, + "importance_ratio": 0.95703125, + "kl_div": -0.01875142753124237, + "kl_div_pos": -0.045739587396383286, + "kl_div_sft": 0.008236734196543694, + "learning_rate": 1.7079934747145187e-06, + "loss": 0.0074, + "ppo_loss": -0.9552907347679138, + "sft_loss": 0.03919677063822746, + "step": 190 + }, + { + "epoch": 0.46, + "grad_norm": 1.3433554757725241, + "importance_ratio": 0.80078125, + "kl_div": -0.24683430790901184, + "kl_div_neg": -0.47321048378944397, + "kl_div_pos": -0.020458126440644264, + "learning_rate": 1.7063621533442087e-06, + "loss": 0.0831, + "ppo_loss": -0.08987483382225037, + "step": 191 + }, + { + "epoch": 0.47, + "grad_norm": 12.54506768194302, + "kl_div": -0.018011918291449547, + "kl_div_sft": -0.018011918291449547, + "learning_rate": 1.7047308319738989e-06, + "loss": 0.1565, + "sft_loss": 0.16000191867351532, + "step": 192 + }, + { + "epoch": 0.47, + "grad_norm": 1.832165288190318, + "importance_ratio": 0.73828125, + "kl_div": -0.1693369597196579, + "kl_div_pos": -0.3051265478134155, + "kl_div_sft": -0.033547379076480865, + "learning_rate": 1.7030995106035889e-06, + "loss": -0.0637, + "ppo_loss": -0.7370301485061646, + "sft_loss": 0.10590871423482895, + "step": 193 + }, + { + "epoch": 0.47, + "grad_norm": 1.2331892184952706, + "importance_ratio": 0.99609375, + "kl_div": 0.0009276217315346003, + "kl_div_neg": -0.003930038772523403, + "kl_div_sft": 0.005785282235592604, + "learning_rate": 1.7014681892332789e-06, + "loss": -0.0689, + "ppo_loss": 0.9960777163505554, + "sft_loss": 0.038523055613040924, + "step": 194 + }, + { + "epoch": 0.47, + "grad_norm": 10.673199639834243, + "kl_div": 0.007861132733523846, + "kl_div_sft": 0.007861132733523846, + "learning_rate": 1.6998368678629688e-06, + "loss": 0.1179, + "sft_loss": 0.0692097395658493, + "step": 195 + }, + { + "epoch": 0.48, + "grad_norm": 1.1172814496424563, + "importance_ratio": 0.96484375, + "kl_div": -0.022414401173591614, + "kl_div_pos": -0.034341100603342056, + "kl_div_sft": -0.010487699881196022, + "learning_rate": 1.698205546492659e-06, + "loss": -0.1272, + "ppo_loss": -0.9662418961524963, + "sft_loss": 0.0791834369301796, + "step": 196 + }, + { + "epoch": 0.48, + "grad_norm": 1.7431361604909785, + "importance_ratio": 0.9140625, + "kl_div": -0.09347224235534668, + "kl_div_neg": -0.026140188798308372, + "kl_div_pos": -0.16080430150032043, + "learning_rate": 1.696574225122349e-06, + "loss": -0.0508, + "ppo_loss": 0.06136992573738098, + "step": 197 + }, + { + "epoch": 0.48, + "grad_norm": 5.576029921025871, + "importance_ratio": 0.953125, + "kl_div": -0.023265577852725983, + "kl_div_pos": -0.04682733118534088, + "kl_div_sft": 0.00029617652762681246, + "learning_rate": 1.694942903752039e-06, + "loss": 0.0361, + "ppo_loss": -0.9542521834373474, + "sft_loss": 0.09867709875106812, + "step": 198 + }, + { + "epoch": 0.48, + "grad_norm": 0.9557656152134608, + "kl_div": -0.019151723012328148, + "kl_div_sft": -0.019151723012328148, + "learning_rate": 1.6933115823817292e-06, + "loss": 0.0791, + "sft_loss": 0.0674813762307167, + "step": 199 + }, + { + "epoch": 0.48, + "grad_norm": 4.801397620814088, + "importance_ratio": 1.0, + "kl_div": -0.057284507900476456, + "kl_div_neg": 0.0016344115138053894, + "kl_div_sft": -0.1162034273147583, + "learning_rate": 1.6916802610114192e-06, + "loss": 0.1107, + "ppo_loss": 1.0016357898712158, + "sft_loss": 0.16367655992507935, + "step": 200 + }, + { + "epoch": 0.49, + "grad_norm": 5.25441102506617, + "importance_ratio": 0.9921875, + "kl_div": -0.006039372645318508, + "kl_div_pos": -0.009510613977909088, + "kl_div_sft": -0.002568131545558572, + "learning_rate": 1.6900489396411094e-06, + "loss": 0.0823, + "ppo_loss": -0.9905344247817993, + "sft_loss": 0.0743524581193924, + "step": 201 + }, + { + "epoch": 0.49, + "grad_norm": 2.6389299177583028, + "importance_ratio": 0.98046875, + "kl_div": -0.009905396960675716, + "kl_div_pos": -0.019572056829929352, + "kl_div_sft": -0.00023873659665696323, + "learning_rate": 1.6884176182707991e-06, + "loss": -0.0591, + "ppo_loss": -0.9806182384490967, + "sft_loss": 0.09485552459955215, + "step": 202 + }, + { + "epoch": 0.49, + "grad_norm": 3.628003816116833, + "importance_ratio": 0.98046875, + "kl_div": -0.01536363735795021, + "kl_div_neg": -0.02144819125533104, + "kl_div_sft": -0.009279083460569382, + "learning_rate": 1.6867862969004893e-06, + "loss": -0.0365, + "ppo_loss": 0.9787802696228027, + "sft_loss": 0.06808764487504959, + "step": 203 + }, + { + "epoch": 0.49, + "grad_norm": 10.045426758225995, + "importance_ratio": 0.90625, + "kl_div": -0.09950728714466095, + "kl_div_neg": -0.128764808177948, + "kl_div_pos": -0.0702497735619545, + "learning_rate": 1.6851549755301793e-06, + "loss": -0.0343, + "ppo_loss": -0.026490122079849243, + "step": 204 + }, + { + "epoch": 0.5, + "grad_norm": 1.1874430793371316, + "importance_ratio": 0.99609375, + "kl_div": -0.0002015829086303711, + "kl_div_pos": -0.00569371972233057, + "kl_div_sft": 0.005290553905069828, + "learning_rate": 1.6835236541598693e-06, + "loss": -0.0687, + "ppo_loss": -0.9943224191665649, + "sft_loss": 0.11273713409900665, + "step": 205 + }, + { + "epoch": 0.5, + "grad_norm": 1.8097189251913681, + "kl_div": -0.0048973290249705315, + "kl_div_sft": -0.0048973290249705315, + "learning_rate": 1.6818923327895595e-06, + "loss": 0.0452, + "sft_loss": 0.09427434206008911, + "step": 206 + }, + { + "epoch": 0.5, + "grad_norm": 1.4893219280512007, + "importance_ratio": 0.859375, + "kl_div": -0.08186168968677521, + "kl_div_pos": -0.15327724814414978, + "kl_div_sft": -0.010446123778820038, + "learning_rate": 1.6802610114192495e-06, + "loss": 0.0751, + "ppo_loss": -0.8578917980194092, + "sft_loss": 0.1798616498708725, + "step": 207 + }, + { + "epoch": 0.5, + "grad_norm": 4.306058634144844, + "importance_ratio": 0.94140625, + "kl_div": -0.06163910776376724, + "kl_div_pos": -0.06163910776376724, + "learning_rate": 1.6786296900489397e-06, + "loss": -0.1196, + "ppo_loss": -0.9414124488830566, + "step": 208 + }, + { + "epoch": 0.51, + "grad_norm": 0.9217491710587724, + "importance_ratio": 0.94921875, + "kl_div": -0.026861974969506264, + "kl_div_pos": -0.05260562524199486, + "kl_div_sft": -0.001118326443247497, + "learning_rate": 1.6769983686786295e-06, + "loss": 0.0148, + "ppo_loss": -0.9487541317939758, + "sft_loss": 0.05310088023543358, + "step": 209 + }, + { + "epoch": 0.51, + "grad_norm": 1.013299539205397, + "importance_ratio": 0.9765625, + "kl_div": -0.01243782788515091, + "kl_div_pos": -0.023766396567225456, + "kl_div_sft": -0.001109258970245719, + "learning_rate": 1.6753670473083197e-06, + "loss": -0.0859, + "ppo_loss": -0.9765138030052185, + "sft_loss": 0.0629742220044136, + "step": 210 + }, + { + "epoch": 0.51, + "grad_norm": 1.3299034777013468, + "importance_ratio": 0.9921875, + "kl_div": -0.004810581915080547, + "kl_div_neg": -0.0062462324276566505, + "kl_div_sft": -0.003374930936843157, + "learning_rate": 1.6737357259380096e-06, + "loss": 0.0774, + "ppo_loss": 0.9937732815742493, + "sft_loss": 0.1101992055773735, + "step": 211 + }, + { + "epoch": 0.51, + "grad_norm": 3.5440785143057942, + "importance_ratio": 0.79296875, + "kl_div": -0.11117896437644958, + "kl_div_pos": -0.2340003401041031, + "kl_div_sft": 0.011642408557236195, + "learning_rate": 1.6721044045676998e-06, + "loss": -0.0015, + "ppo_loss": -0.7913615703582764, + "sft_loss": 0.030798746272921562, + "step": 212 + }, + { + "epoch": 0.52, + "grad_norm": 1.2810043006336405, + "importance_ratio": 0.97265625, + "kl_div": -0.018537085503339767, + "kl_div_neg": -0.02736750990152359, + "kl_div_sft": -0.009706659242510796, + "learning_rate": 1.6704730831973898e-06, + "loss": -0.075, + "ppo_loss": 0.973003625869751, + "sft_loss": 0.10553260892629623, + "step": 213 + }, + { + "epoch": 0.52, + "grad_norm": 1.7161154235399265, + "importance_ratio": 0.98828125, + "kl_div": 0.005109624471515417, + "kl_div_pos": -0.012915403582155704, + "kl_div_sft": 0.02313465252518654, + "learning_rate": 1.6688417618270798e-06, + "loss": -0.0869, + "ppo_loss": -0.9871676564216614, + "sft_loss": 0.08311924338340759, + "step": 214 + }, + { + "epoch": 0.52, + "grad_norm": 0.9455897657608686, + "importance_ratio": 0.73828125, + "kl_div": -0.15399585664272308, + "kl_div_neg": -0.30571627616882324, + "kl_div_sft": -0.002275428967550397, + "learning_rate": 1.66721044045677e-06, + "loss": 0.0262, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06958441436290741, + "step": 215 + }, + { + "epoch": 0.52, + "grad_norm": 0.8223109289451344, + "importance_ratio": 0.6875, + "kl_div": -0.3769010901451111, + "kl_div_neg": -0.3769010901451111, + "learning_rate": 1.6655791190864598e-06, + "loss": -0.0299, + "ppo_loss": 0.800000011920929, + "step": 216 + }, + { + "epoch": 0.53, + "grad_norm": 4.10171339938189, + "importance_ratio": 0.9765625, + "kl_div": -0.025581523776054382, + "kl_div_neg": -0.045762915164232254, + "kl_div_pos": -0.005400133319199085, + "learning_rate": 1.66394779771615e-06, + "loss": 0.1311, + "ppo_loss": -0.01967298984527588, + "step": 217 + }, + { + "epoch": 0.53, + "grad_norm": 1.3111783139513318, + "importance_ratio": 0.859375, + "kl_div": -0.15934057533740997, + "kl_div_neg": -0.2929997742176056, + "kl_div_pos": -0.025681370869278908, + "learning_rate": 1.6623164763458402e-06, + "loss": -0.1031, + "ppo_loss": -0.08732280135154724, + "step": 218 + }, + { + "epoch": 0.53, + "grad_norm": 2.429477891129353, + "importance_ratio": 0.8046875, + "kl_div": -0.10484195500612259, + "kl_div_neg": -0.21933768689632416, + "kl_div_sft": 0.009653773158788681, + "learning_rate": 1.6606851549755301e-06, + "loss": -0.0789, + "ppo_loss": 0.8030505180358887, + "sft_loss": 0.024198254570364952, + "step": 219 + }, + { + "epoch": 0.53, + "grad_norm": 15.890667250435474, + "importance_ratio": 0.98828125, + "kl_div": -0.0036477274261415005, + "kl_div_pos": -0.012399217113852501, + "kl_div_sft": 0.0051037622615695, + "learning_rate": 1.6590538336052201e-06, + "loss": -0.0181, + "ppo_loss": -0.9876773357391357, + "sft_loss": 0.0732378140091896, + "step": 220 + }, + { + "epoch": 0.54, + "grad_norm": 1.4782304173757412, + "importance_ratio": 0.87109375, + "kl_div": -0.07580201327800751, + "kl_div_neg": -0.1377091109752655, + "kl_div_sft": -0.013894918374717236, + "learning_rate": 1.6574225122349101e-06, + "loss": 0.0889, + "ppo_loss": 0.8713521361351013, + "sft_loss": 0.09440121054649353, + "step": 221 + }, + { + "epoch": 0.54, + "grad_norm": 1.1372404179600206, + "kl_div": -0.0017525558359920979, + "kl_div_sft": -0.0017525558359920979, + "learning_rate": 1.6557911908646003e-06, + "loss": -0.0755, + "sft_loss": 0.04836359620094299, + "step": 222 + }, + { + "epoch": 0.54, + "grad_norm": 1.1127947898940231, + "importance_ratio": 0.890625, + "kl_div": -0.11645431816577911, + "kl_div_neg": -0.11645431816577911, + "learning_rate": 1.6541598694942903e-06, + "loss": -0.0162, + "ppo_loss": 0.8909889459609985, + "step": 223 + }, + { + "epoch": 0.54, + "grad_norm": 1.2737183524440732, + "kl_div": -0.024832893162965775, + "kl_div_sft": -0.024832893162965775, + "learning_rate": 1.6525285481239803e-06, + "loss": 0.0049, + "sft_loss": 0.09953819215297699, + "step": 224 + }, + { + "epoch": 0.55, + "grad_norm": 1.3164385783133985, + "kl_div": 0.0015126760117709637, + "kl_div_sft": 0.0015126760117709637, + "learning_rate": 1.6508972267536705e-06, + "loss": 0.123, + "sft_loss": 0.06987418234348297, + "step": 225 + }, + { + "epoch": 0.55, + "grad_norm": 3.5919973328884223, + "importance_ratio": 0.734375, + "kl_div": -0.14923806488513947, + "kl_div_neg": -0.3067997097969055, + "kl_div_sft": 0.00832358468323946, + "learning_rate": 1.6492659053833605e-06, + "loss": 0.0632, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04831942170858383, + "step": 226 + }, + { + "epoch": 0.55, + "grad_norm": 1.0691139583028615, + "kl_div": -0.00811840407550335, + "kl_div_sft": -0.00811840407550335, + "learning_rate": 1.6476345840130507e-06, + "loss": -0.151, + "sft_loss": 0.06757162511348724, + "step": 227 + }, + { + "epoch": 0.55, + "grad_norm": 3.1649456534156646, + "importance_ratio": 0.9921875, + "kl_div": -0.009984659031033516, + "kl_div_neg": -0.007218531798571348, + "kl_div_sft": -0.012750785797834396, + "learning_rate": 1.6460032626427404e-06, + "loss": -0.0265, + "ppo_loss": 0.9928075075149536, + "sft_loss": 0.10508273541927338, + "step": 228 + }, + { + "epoch": 0.56, + "grad_norm": 2.9554347133869787, + "importance_ratio": 0.80078125, + "kl_div": -0.23050877451896667, + "kl_div_neg": -0.34867945313453674, + "kl_div_pos": -0.1123380959033966, + "learning_rate": 1.6443719412724306e-06, + "loss": -0.0503, + "ppo_loss": -0.04687100648880005, + "step": 229 + }, + { + "epoch": 0.56, + "grad_norm": 1.91213082134934, + "kl_div": -0.00412414874881506, + "kl_div_sft": -0.00412414874881506, + "learning_rate": 1.6427406199021206e-06, + "loss": -0.0354, + "sft_loss": 0.0650189146399498, + "step": 230 + }, + { + "epoch": 0.56, + "grad_norm": 1.5237829354711199, + "importance_ratio": 0.7578125, + "kl_div": -0.27641308307647705, + "kl_div_neg": -0.27641308307647705, + "learning_rate": 1.6411092985318106e-06, + "loss": 0.1837, + "ppo_loss": 0.800000011920929, + "step": 231 + }, + { + "epoch": 0.56, + "grad_norm": 1.1111890977405692, + "importance_ratio": 0.91015625, + "kl_div": -0.0970386490225792, + "kl_div_neg": -0.1912834793329239, + "kl_div_pos": -0.0027938156854361296, + "learning_rate": 1.6394779771615008e-06, + "loss": -0.1116, + "ppo_loss": -0.0856558084487915, + "step": 232 + }, + { + "epoch": 0.56, + "grad_norm": 1.1335844369475514, + "kl_div": -0.020297205075621605, + "kl_div_sft": -0.020297205075621605, + "learning_rate": 1.6378466557911908e-06, + "loss": 0.038, + "sft_loss": 0.12747663259506226, + "step": 233 + }, + { + "epoch": 0.57, + "grad_norm": 4.644416672919157, + "kl_div": -0.0027559231966733932, + "kl_div_sft": -0.0027559231966733932, + "learning_rate": 1.636215334420881e-06, + "loss": 0.031, + "sft_loss": 0.12215368449687958, + "step": 234 + }, + { + "epoch": 0.57, + "grad_norm": 1.0598840486338408, + "kl_div": -0.002642288338392973, + "kl_div_sft": -0.002642288338392973, + "learning_rate": 1.6345840130505707e-06, + "loss": 0.0129, + "sft_loss": 0.0570257268846035, + "step": 235 + }, + { + "epoch": 0.57, + "grad_norm": 1.0173545782221605, + "importance_ratio": 0.99609375, + "kl_div": -0.013152987696230412, + "kl_div_pos": -0.0037110077682882547, + "kl_div_sft": -0.022594967857003212, + "learning_rate": 1.632952691680261e-06, + "loss": -0.0591, + "ppo_loss": -0.9962958693504333, + "sft_loss": 0.0943819060921669, + "step": 236 + }, + { + "epoch": 0.57, + "grad_norm": 1.0482286839391024, + "importance_ratio": 0.921875, + "kl_div": -0.08936250954866409, + "kl_div_neg": -0.08936250954866409, + "learning_rate": 1.6313213703099511e-06, + "loss": 0.0602, + "ppo_loss": 0.9185183644294739, + "step": 237 + }, + { + "epoch": 0.58, + "grad_norm": 12.773073150384684, + "kl_div": 0.004495954606682062, + "kl_div_sft": 0.004495954606682062, + "learning_rate": 1.6296900489396411e-06, + "loss": -0.0781, + "sft_loss": 0.060407549142837524, + "step": 238 + }, + { + "epoch": 0.58, + "grad_norm": 1.1713831059923707, + "importance_ratio": 1.0, + "kl_div": -0.004435502924025059, + "kl_div_pos": -7.828343950677663e-05, + "kl_div_sft": -0.008792722597718239, + "learning_rate": 1.628058727569331e-06, + "loss": 0.0049, + "ppo_loss": -0.9999216794967651, + "sft_loss": 0.10335825383663177, + "step": 239 + }, + { + "epoch": 0.58, + "grad_norm": 3.5700552616063033, + "importance_ratio": 0.71875, + "kl_div": -0.3352140784263611, + "kl_div_neg": -0.25275135040283203, + "kl_div_pos": -0.41767677664756775, + "learning_rate": 1.626427406199021e-06, + "loss": -0.0569, + "ppo_loss": 0.07071244716644287, + "step": 240 + }, + { + "epoch": 0.58, + "grad_norm": 0.9741228070160373, + "kl_div": 0.005845887586474419, + "kl_div_sft": 0.005845887586474419, + "learning_rate": 1.6247960848287113e-06, + "loss": -0.0022, + "sft_loss": 0.07129596918821335, + "step": 241 + }, + { + "epoch": 0.59, + "grad_norm": 2.8899753149753704, + "kl_div": -0.012755146250128746, + "kl_div_sft": -0.012755146250128746, + "learning_rate": 1.623164763458401e-06, + "loss": 0.0202, + "sft_loss": 0.12924881279468536, + "step": 242 + }, + { + "epoch": 0.59, + "grad_norm": 1.1576247419532693, + "importance_ratio": 0.9453125, + "kl_div": -0.01969856396317482, + "kl_div_pos": -0.054807450622320175, + "kl_div_sft": 0.015410324558615685, + "learning_rate": 1.6215334420880912e-06, + "loss": 0.0741, + "ppo_loss": -0.9466674327850342, + "sft_loss": 0.12107433378696442, + "step": 243 + }, + { + "epoch": 0.59, + "grad_norm": 1.158357915457366, + "kl_div": -0.023490093648433685, + "kl_div_sft": -0.023490093648433685, + "learning_rate": 1.6199021207177814e-06, + "loss": -0.0097, + "sft_loss": 0.09046634286642075, + "step": 244 + }, + { + "epoch": 0.59, + "grad_norm": 1.1750598608149345, + "importance_ratio": 0.9765625, + "kl_div": -0.021390119567513466, + "kl_div_neg": -0.00024339275842066854, + "kl_div_pos": -0.04253684729337692, + "learning_rate": 1.6182707993474714e-06, + "loss": -0.0526, + "ppo_loss": 0.02070075273513794, + "step": 245 + }, + { + "epoch": 0.6, + "grad_norm": 6.0861800687779395, + "importance_ratio": 0.75390625, + "kl_div": -0.1369992047548294, + "kl_div_neg": -0.2839204967021942, + "kl_div_sft": 0.009922093711793423, + "learning_rate": 1.6166394779771614e-06, + "loss": -0.1549, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.041812244802713394, + "step": 246 + }, + { + "epoch": 0.6, + "grad_norm": 4.37485394915259, + "importance_ratio": 0.88671875, + "kl_div": -0.12540145218372345, + "kl_div_pos": -0.12540145218372345, + "learning_rate": 1.6150081566068514e-06, + "loss": -0.1772, + "ppo_loss": -0.8872189521789551, + "step": 247 + }, + { + "epoch": 0.6, + "grad_norm": 2.8037534555727666, + "kl_div": 0.006541554816067219, + "kl_div_sft": 0.006541554816067219, + "learning_rate": 1.6133768352365416e-06, + "loss": 0.0927, + "sft_loss": 0.06823631376028061, + "step": 248 + }, + { + "epoch": 0.6, + "grad_norm": 1.2486541655522805, + "kl_div": -0.010986318811774254, + "kl_div_sft": -0.010986318811774254, + "learning_rate": 1.6117455138662316e-06, + "loss": 0.0299, + "sft_loss": 0.10466299951076508, + "step": 249 + }, + { + "epoch": 0.61, + "grad_norm": 2.5603761943435734, + "importance_ratio": 1.0, + "kl_div": -0.004413792863488197, + "kl_div_pos": 0.0007278404664248228, + "kl_div_sft": -0.009555426426231861, + "learning_rate": 1.6101141924959216e-06, + "loss": 0.0637, + "ppo_loss": -1.0007281303405762, + "sft_loss": 0.07314762473106384, + "step": 250 + }, + { + "epoch": 0.61, + "grad_norm": 5.240721723634095, + "importance_ratio": 0.984375, + "kl_div": -0.016420193016529083, + "kl_div_neg": -0.030372580513358116, + "kl_div_pos": -0.002467805054038763, + "learning_rate": 1.6084828711256118e-06, + "loss": 0.0825, + "ppo_loss": -0.013725578784942627, + "step": 251 + }, + { + "epoch": 0.61, + "grad_norm": 1.057452047176148, + "importance_ratio": 0.7734375, + "kl_div": -0.13504436612129211, + "kl_div_pos": -0.25911781191825867, + "kl_div_sft": -0.01097092591226101, + "learning_rate": 1.6068515497553017e-06, + "loss": 0.0304, + "ppo_loss": -0.7717321515083313, + "sft_loss": 0.08440785109996796, + "step": 252 + }, + { + "epoch": 0.61, + "grad_norm": 0.9980315862331917, + "importance_ratio": 1.015625, + "kl_div": 0.006387812085449696, + "kl_div_pos": 0.013380873948335648, + "kl_div_sft": -0.00060524936998263, + "learning_rate": 1.6052202283849917e-06, + "loss": 0.0603, + "ppo_loss": -1.0134707689285278, + "sft_loss": 0.07214315980672836, + "step": 253 + }, + { + "epoch": 0.62, + "grad_norm": 1.3425978223228665, + "kl_div": 0.004152823239564896, + "kl_div_sft": 0.004152823239564896, + "learning_rate": 1.6035889070146817e-06, + "loss": 0.0237, + "sft_loss": 0.04243698716163635, + "step": 254 + }, + { + "epoch": 0.62, + "grad_norm": 1.0893081388019052, + "importance_ratio": 0.98828125, + "kl_div": -0.012175730429589748, + "kl_div_pos": -0.011445739306509495, + "kl_div_sft": -0.012905721552670002, + "learning_rate": 1.601957585644372e-06, + "loss": 0.1381, + "ppo_loss": -0.9886195063591003, + "sft_loss": 0.05715664103627205, + "step": 255 + }, + { + "epoch": 0.62, + "grad_norm": 1.2376164237078648, + "importance_ratio": 0.82421875, + "kl_div": -0.20184333622455597, + "kl_div_neg": -0.3390652537345886, + "kl_div_pos": -0.06462141871452332, + "learning_rate": 1.6003262642740619e-06, + "loss": -0.1227, + "ppo_loss": -0.06871113181114197, + "step": 256 + }, + { + "epoch": 0.62, + "grad_norm": 1.0333806770512999, + "importance_ratio": 0.98828125, + "kl_div": -0.08744671195745468, + "kl_div_pos": -0.012920528650283813, + "kl_div_sft": -0.16197289526462555, + "learning_rate": 1.5986949429037519e-06, + "loss": -0.0855, + "ppo_loss": -0.9871625900268555, + "sft_loss": 0.21870480477809906, + "step": 257 + }, + { + "epoch": 0.63, + "grad_norm": 2.033737068533743, + "importance_ratio": 0.99609375, + "kl_div": -0.009044161066412926, + "kl_div_pos": -0.004775775596499443, + "kl_div_sft": -0.013312545605003834, + "learning_rate": 1.597063621533442e-06, + "loss": 0.0067, + "ppo_loss": -0.9952355623245239, + "sft_loss": 0.08667251467704773, + "step": 258 + }, + { + "epoch": 0.63, + "grad_norm": 2.17227103375518, + "kl_div": -0.02424740232527256, + "kl_div_sft": -0.02424740232527256, + "learning_rate": 1.595432300163132e-06, + "loss": 0.0051, + "sft_loss": 0.1608126163482666, + "step": 259 + }, + { + "epoch": 0.63, + "grad_norm": 3.214808448792363, + "kl_div": -0.010200893506407738, + "kl_div_sft": -0.010200893506407738, + "learning_rate": 1.5938009787928222e-06, + "loss": -0.0841, + "sft_loss": 0.059631407260894775, + "step": 260 + }, + { + "epoch": 0.63, + "grad_norm": 3.708067995215448, + "kl_div": -0.012492822483181953, + "kl_div_sft": -0.012492822483181953, + "learning_rate": 1.592169657422512e-06, + "loss": -0.1127, + "sft_loss": 0.05107155442237854, + "step": 261 + }, + { + "epoch": 0.64, + "grad_norm": 3.4311475839252066, + "importance_ratio": 0.7734375, + "kl_div": -0.2557826042175293, + "kl_div_neg": -0.28364330530166626, + "kl_div_pos": -0.22792188823223114, + "learning_rate": 1.5905383360522022e-06, + "loss": 0.0694, + "ppo_loss": 0.0019067823886871338, + "step": 262 + }, + { + "epoch": 0.64, + "grad_norm": 0.8276769037534126, + "importance_ratio": 0.7890625, + "kl_div": -0.11510230600833893, + "kl_div_neg": -0.23486950993537903, + "kl_div_sft": 0.0046649049036204815, + "learning_rate": 1.5889070146818924e-06, + "loss": 0.0593, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03703388571739197, + "step": 263 + }, + { + "epoch": 0.64, + "grad_norm": 1.0369255060975295, + "kl_div": -0.0018341443501412868, + "kl_div_sft": -0.0018341443501412868, + "learning_rate": 1.5872756933115822e-06, + "loss": 0.0103, + "sft_loss": 0.07939282059669495, + "step": 264 + }, + { + "epoch": 0.64, + "grad_norm": 0.8583886121029157, + "importance_ratio": 0.9296875, + "kl_div": -0.040981777012348175, + "kl_div_pos": -0.07225144654512405, + "kl_div_sft": -0.00971211027354002, + "learning_rate": 1.5856443719412724e-06, + "loss": -0.0687, + "ppo_loss": -0.9302969574928284, + "sft_loss": 0.11106070131063461, + "step": 265 + }, + { + "epoch": 0.64, + "grad_norm": 9.116483711857038, + "importance_ratio": 0.828125, + "kl_div": -0.0933116152882576, + "kl_div_neg": -0.19036899507045746, + "kl_div_sft": 0.003745768219232559, + "learning_rate": 1.5840130505709624e-06, + "loss": -0.0504, + "ppo_loss": 0.8266540765762329, + "sft_loss": 0.09498132020235062, + "step": 266 + }, + { + "epoch": 0.65, + "grad_norm": 1.8274568820359849, + "importance_ratio": 1.0078125, + "kl_div": -0.008119492791593075, + "kl_div_pos": 0.004711403977125883, + "kl_div_sft": -0.02095039002597332, + "learning_rate": 1.5823817292006526e-06, + "loss": -0.0648, + "ppo_loss": -1.0047225952148438, + "sft_loss": 0.08279009163379669, + "step": 267 + }, + { + "epoch": 0.65, + "grad_norm": 9.420829550466632, + "importance_ratio": 0.6953125, + "kl_div": -0.17865419387817383, + "kl_div_neg": -0.36194297671318054, + "kl_div_sft": 0.004634591285139322, + "learning_rate": 1.5807504078303423e-06, + "loss": 0.1689, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05000005289912224, + "step": 268 + }, + { + "epoch": 0.65, + "grad_norm": 0.9602763685138183, + "importance_ratio": 0.81640625, + "kl_div": -0.10194914042949677, + "kl_div_neg": -0.20226718485355377, + "kl_div_sft": -0.0016310999635607004, + "learning_rate": 1.5791190864600325e-06, + "loss": 0.026, + "ppo_loss": 0.8168765902519226, + "sft_loss": 0.03999049589037895, + "step": 269 + }, + { + "epoch": 0.65, + "grad_norm": 0.9590098266953325, + "kl_div": -0.12230391055345535, + "kl_div_sft": -0.12230391055345535, + "learning_rate": 1.5774877650897227e-06, + "loss": -0.1017, + "sft_loss": 0.16727250814437866, + "step": 270 + }, + { + "epoch": 0.66, + "grad_norm": 0.8379303182523353, + "importance_ratio": 0.9921875, + "kl_div": -0.00032729655504226685, + "kl_div_neg": -0.007788390852510929, + "kl_div_sft": 0.007133797742426395, + "learning_rate": 1.5758564437194127e-06, + "loss": 0.0727, + "ppo_loss": 0.9922418594360352, + "sft_loss": 0.05602758377790451, + "step": 271 + }, + { + "epoch": 0.66, + "grad_norm": 9.583772422572244, + "importance_ratio": 0.8359375, + "kl_div": -0.19486042857170105, + "kl_div_neg": -0.3538120687007904, + "kl_div_pos": -0.0359087735414505, + "learning_rate": 1.5742251223491027e-06, + "loss": -0.0354, + "ppo_loss": -0.08236417174339294, + "step": 272 + }, + { + "epoch": 0.66, + "grad_norm": 2.3751083148550722, + "importance_ratio": 0.9921875, + "kl_div": -0.0045613134279847145, + "kl_div_pos": -0.008577365428209305, + "kl_div_sft": -0.0005452617770060897, + "learning_rate": 1.5725938009787927e-06, + "loss": -0.1139, + "ppo_loss": -0.991459310054779, + "sft_loss": 0.038445647805929184, + "step": 273 + }, + { + "epoch": 0.66, + "grad_norm": 2.4028228530134856, + "importance_ratio": 0.98046875, + "kl_div": -0.009789850562810898, + "kl_div_pos": -0.020217105746269226, + "kl_div_sft": 0.0006374044460244477, + "learning_rate": 1.5709624796084829e-06, + "loss": 0.0563, + "ppo_loss": -0.9799858927726746, + "sft_loss": 0.0919925719499588, + "step": 274 + }, + { + "epoch": 0.67, + "grad_norm": 1.0608938643593155, + "kl_div": -0.013894759118556976, + "kl_div_sft": -0.013894759118556976, + "learning_rate": 1.5693311582381726e-06, + "loss": -0.1247, + "sft_loss": 0.09050247073173523, + "step": 275 + }, + { + "epoch": 0.67, + "grad_norm": 3.589018651140568, + "importance_ratio": 0.82421875, + "kl_div": -0.09773320704698563, + "kl_div_neg": -0.19317105412483215, + "kl_div_sft": -0.0022953650914132595, + "learning_rate": 1.5676998368678628e-06, + "loss": -0.0331, + "ppo_loss": 0.8243409991264343, + "sft_loss": 0.07633786648511887, + "step": 276 + }, + { + "epoch": 0.67, + "grad_norm": 8.14206347505569, + "kl_div": 1.1440544767538086e-05, + "kl_div_sft": 1.1440544767538086e-05, + "learning_rate": 1.566068515497553e-06, + "loss": 0.0461, + "sft_loss": 0.10166864097118378, + "step": 277 + }, + { + "epoch": 0.67, + "grad_norm": 4.270036700624787, + "importance_ratio": 0.9140625, + "kl_div": -0.09008216857910156, + "kl_div_neg": -0.09008216857910156, + "learning_rate": 1.564437194127243e-06, + "loss": 0.0274, + "ppo_loss": 0.9167443513870239, + "step": 278 + }, + { + "epoch": 0.68, + "grad_norm": 5.246673620107099, + "kl_div": -0.060400474816560745, + "kl_div_sft": -0.060400474816560745, + "learning_rate": 1.562805872756933e-06, + "loss": -0.0454, + "sft_loss": 0.12175711244344711, + "step": 279 + }, + { + "epoch": 0.68, + "grad_norm": 3.953524482656069, + "importance_ratio": 0.859375, + "kl_div": -0.1578972488641739, + "kl_div_pos": -0.1578972488641739, + "learning_rate": 1.561174551386623e-06, + "loss": -0.1069, + "ppo_loss": -0.8590763807296753, + "step": 280 + }, + { + "epoch": 0.68, + "grad_norm": 3.9506407157731407, + "importance_ratio": 0.9765625, + "kl_div": -0.0060625518672168255, + "kl_div_pos": -0.02237924560904503, + "kl_div_sft": 0.010254141874611378, + "learning_rate": 1.5595432300163132e-06, + "loss": -0.0587, + "ppo_loss": -0.9778693318367004, + "sft_loss": 0.05958448350429535, + "step": 281 + }, + { + "epoch": 0.68, + "grad_norm": 3.7460476710376396, + "importance_ratio": 0.8359375, + "kl_div": -0.11085577309131622, + "kl_div_neg": -0.17810475826263428, + "kl_div_sft": -0.04360678046941757, + "learning_rate": 1.5579119086460032e-06, + "loss": -0.0386, + "ppo_loss": 0.8368546962738037, + "sft_loss": 0.12636341154575348, + "step": 282 + }, + { + "epoch": 0.69, + "grad_norm": 4.745895620221447, + "importance_ratio": 0.96484375, + "kl_div": -0.029229873791337013, + "kl_div_pos": -0.0365082323551178, + "kl_div_sft": -0.02195151522755623, + "learning_rate": 1.5562805872756932e-06, + "loss": -0.0572, + "ppo_loss": -0.9641501903533936, + "sft_loss": 0.05703162029385567, + "step": 283 + }, + { + "epoch": 0.69, + "grad_norm": 5.993764498218747, + "kl_div": -0.007489209994673729, + "kl_div_sft": -0.007489209994673729, + "learning_rate": 1.5546492659053833e-06, + "loss": -0.0634, + "sft_loss": 0.07352593541145325, + "step": 284 + }, + { + "epoch": 0.69, + "grad_norm": 0.9322771820704878, + "kl_div": -0.03013739548623562, + "kl_div_sft": -0.03013739548623562, + "learning_rate": 1.5530179445350733e-06, + "loss": 0.056, + "sft_loss": 0.08940079063177109, + "step": 285 + }, + { + "epoch": 0.69, + "grad_norm": 0.992517671692001, + "kl_div": 0.0077654337510466576, + "kl_div_sft": 0.0077654337510466576, + "learning_rate": 1.5513866231647635e-06, + "loss": -0.0487, + "sft_loss": 0.030625823885202408, + "step": 286 + }, + { + "epoch": 0.7, + "grad_norm": 3.708157881683071, + "importance_ratio": 0.9921875, + "kl_div": -0.11875781416893005, + "kl_div_neg": -0.007041546981781721, + "kl_div_sft": -0.2304740846157074, + "learning_rate": 1.5497553017944533e-06, + "loss": 0.1021, + "ppo_loss": 0.9929831027984619, + "sft_loss": 0.27379873394966125, + "step": 287 + }, + { + "epoch": 0.7, + "grad_norm": 0.9189327616143602, + "kl_div": -0.010802363976836205, + "kl_div_sft": -0.010802363976836205, + "learning_rate": 1.5481239804241435e-06, + "loss": -0.1124, + "sft_loss": 0.052867598831653595, + "step": 288 + }, + { + "epoch": 0.7, + "grad_norm": 3.388275598227071, + "kl_div": -0.012758041732013226, + "kl_div_sft": -0.012758041732013226, + "learning_rate": 1.5464926590538337e-06, + "loss": -0.2157, + "sft_loss": 0.07837103307247162, + "step": 289 + }, + { + "epoch": 0.7, + "grad_norm": 3.671059765509262, + "kl_div": -0.009600481018424034, + "kl_div_sft": -0.009600481018424034, + "learning_rate": 1.5448613376835235e-06, + "loss": -0.0148, + "sft_loss": 0.08664091676473618, + "step": 290 + }, + { + "epoch": 0.71, + "grad_norm": 1.0037582822769144, + "kl_div": -0.021426748484373093, + "kl_div_sft": -0.021426748484373093, + "learning_rate": 1.5432300163132137e-06, + "loss": 0.1021, + "sft_loss": 0.10049542784690857, + "step": 291 + }, + { + "epoch": 0.71, + "grad_norm": 3.9444987751497913, + "importance_ratio": 0.90625, + "kl_div": -0.1015133410692215, + "kl_div_neg": -0.15428906679153442, + "kl_div_pos": -0.04873760789632797, + "learning_rate": 1.5415986949429036e-06, + "loss": 0.0774, + "ppo_loss": -0.04770335555076599, + "step": 292 + }, + { + "epoch": 0.71, + "grad_norm": 1.0185512934012841, + "importance_ratio": 0.90625, + "kl_div": -0.050773173570632935, + "kl_div_pos": -0.09809917956590652, + "kl_div_sft": -0.003447168506681919, + "learning_rate": 1.5399673735725938e-06, + "loss": 0.0851, + "ppo_loss": -0.9065589904785156, + "sft_loss": 0.07342778891324997, + "step": 293 + }, + { + "epoch": 0.71, + "grad_norm": 1.24960759679412, + "importance_ratio": 0.98046875, + "kl_div": -0.01254759170114994, + "kl_div_pos": -0.018518824130296707, + "kl_div_sft": -0.0065763588063418865, + "learning_rate": 1.5383360522022836e-06, + "loss": 0.0704, + "ppo_loss": -0.9816515445709229, + "sft_loss": 0.09209147095680237, + "step": 294 + }, + { + "epoch": 0.72, + "grad_norm": 3.208030257008623, + "kl_div": 0.003485637716948986, + "kl_div_sft": 0.003485637716948986, + "learning_rate": 1.5367047308319738e-06, + "loss": 0.028, + "sft_loss": 0.06576119363307953, + "step": 295 + }, + { + "epoch": 0.72, + "grad_norm": 0.7614542575280052, + "kl_div": 0.007014347240328789, + "kl_div_sft": 0.007014347240328789, + "learning_rate": 1.535073409461664e-06, + "loss": 0.1344, + "sft_loss": 0.06254887580871582, + "step": 296 + }, + { + "epoch": 0.72, + "grad_norm": 9.809169325505872, + "kl_div": -0.0015400054398924112, + "kl_div_sft": -0.0015400054398924112, + "learning_rate": 1.533442088091354e-06, + "loss": -0.0076, + "sft_loss": 0.08064014464616776, + "step": 297 + }, + { + "epoch": 0.72, + "grad_norm": 3.459325824170929, + "kl_div": -0.007169000804424286, + "kl_div_sft": -0.007169000804424286, + "learning_rate": 1.531810766721044e-06, + "loss": -0.0143, + "sft_loss": 0.10261943191289902, + "step": 298 + }, + { + "epoch": 0.72, + "grad_norm": 0.7764140564788423, + "kl_div": 0.0003441378939896822, + "kl_div_sft": 0.0003441378939896822, + "learning_rate": 1.530179445350734e-06, + "loss": -0.0727, + "sft_loss": 0.08510269224643707, + "step": 299 + }, + { + "epoch": 0.73, + "grad_norm": 3.553790773740822, + "importance_ratio": 0.70703125, + "kl_div": -0.17280974984169006, + "kl_div_neg": -0.349417120218277, + "kl_div_sft": 0.003797624260187149, + "learning_rate": 1.5285481239804242e-06, + "loss": 0.0813, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07960819453001022, + "step": 300 + }, + { + "epoch": 0.73, + "grad_norm": 0.9986867924766581, + "kl_div": -0.00027870782651007175, + "kl_div_sft": -0.00027870782651007175, + "learning_rate": 1.526916802610114e-06, + "loss": -0.0956, + "sft_loss": 0.07223731279373169, + "step": 301 + }, + { + "epoch": 0.73, + "grad_norm": 1.182324928566422, + "importance_ratio": 0.83203125, + "kl_div": -0.19484283030033112, + "kl_div_neg": -0.34265056252479553, + "kl_div_pos": -0.047035101801157, + "learning_rate": 1.5252854812398041e-06, + "loss": 0.0757, + "ppo_loss": -0.07702693343162537, + "step": 302 + }, + { + "epoch": 0.73, + "grad_norm": 11.752123052463089, + "importance_ratio": 0.6796875, + "kl_div": -0.18974079191684723, + "kl_div_neg": -0.3876355290412903, + "kl_div_sft": 0.008153931237757206, + "learning_rate": 1.5236541598694943e-06, + "loss": -0.0932, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07056049257516861, + "step": 303 + }, + { + "epoch": 0.74, + "grad_norm": 4.471725433642123, + "importance_ratio": 0.98828125, + "kl_div": -0.004336100071668625, + "kl_div_neg": -0.013022080063819885, + "kl_div_sft": 0.004349880386143923, + "learning_rate": 1.5220228384991843e-06, + "loss": -0.0391, + "ppo_loss": 0.9870623350143433, + "sft_loss": 0.09904748946428299, + "step": 304 + }, + { + "epoch": 0.74, + "grad_norm": 11.537474236919367, + "importance_ratio": 0.99609375, + "kl_div": -0.005024380516260862, + "kl_div_pos": -0.0030147444922477007, + "kl_div_sft": -0.007034016773104668, + "learning_rate": 1.5203915171288743e-06, + "loss": -0.1319, + "ppo_loss": -0.9969898462295532, + "sft_loss": 0.04072725400328636, + "step": 305 + }, + { + "epoch": 0.74, + "grad_norm": 8.069143943713918, + "importance_ratio": 0.9765625, + "kl_div": -0.011580190621316433, + "kl_div_pos": -0.024436360225081444, + "kl_div_sft": 0.001275978283956647, + "learning_rate": 1.5187601957585643e-06, + "loss": -0.1803, + "ppo_loss": -0.9758597612380981, + "sft_loss": 0.06408393383026123, + "step": 306 + }, + { + "epoch": 0.74, + "grad_norm": 1.0572363113920726, + "importance_ratio": 0.96484375, + "kl_div": -0.015708668157458305, + "kl_div_pos": -0.035194650292396545, + "kl_div_sft": 0.00377731304615736, + "learning_rate": 1.5171288743882545e-06, + "loss": -0.0675, + "ppo_loss": -0.9654175043106079, + "sft_loss": 0.06628946959972382, + "step": 307 + }, + { + "epoch": 0.75, + "grad_norm": 13.488066874020417, + "importance_ratio": 0.96484375, + "kl_div": -0.032234422862529755, + "kl_div_pos": -0.03625834360718727, + "kl_div_sft": -0.028210503980517387, + "learning_rate": 1.5154975530179447e-06, + "loss": 0.077, + "ppo_loss": -0.9643911719322205, + "sft_loss": 0.06895402818918228, + "step": 308 + }, + { + "epoch": 0.75, + "grad_norm": 1.5976955716766723, + "importance_ratio": 1.0078125, + "kl_div": 0.014641315676271915, + "kl_div_pos": 0.005524564068764448, + "kl_div_sft": 0.023758066818118095, + "learning_rate": 1.5138662316476344e-06, + "loss": -0.0026, + "ppo_loss": -1.005539894104004, + "sft_loss": 0.07187476009130478, + "step": 309 + }, + { + "epoch": 0.75, + "grad_norm": 1.0015244784703676, + "kl_div": 0.00711329560726881, + "kl_div_sft": 0.00711329560726881, + "learning_rate": 1.5122349102773246e-06, + "loss": -0.1007, + "sft_loss": 0.0625566840171814, + "step": 310 + }, + { + "epoch": 0.75, + "grad_norm": 3.2210326757754393, + "importance_ratio": 0.921875, + "kl_div": -0.04271788150072098, + "kl_div_pos": -0.0804443210363388, + "kl_div_sft": -0.004991442896425724, + "learning_rate": 1.5106035889070146e-06, + "loss": -0.1387, + "ppo_loss": -0.9227062463760376, + "sft_loss": 0.03155931085348129, + "step": 311 + }, + { + "epoch": 0.76, + "grad_norm": 5.240578690234974, + "importance_ratio": 0.984375, + "kl_div": 0.00046649202704429626, + "kl_div_pos": -0.016343850642442703, + "kl_div_sft": 0.017276834696531296, + "learning_rate": 1.5089722675367046e-06, + "loss": -0.0052, + "ppo_loss": -0.9837889671325684, + "sft_loss": 0.043509677052497864, + "step": 312 + }, + { + "epoch": 0.76, + "grad_norm": 0.8025776879565545, + "importance_ratio": 0.9921875, + "kl_div": -0.002305206609889865, + "kl_div_pos": -0.007442329078912735, + "kl_div_sft": 0.002831915859133005, + "learning_rate": 1.5073409461663946e-06, + "loss": 0.1136, + "ppo_loss": -0.992585301399231, + "sft_loss": 0.06029798090457916, + "step": 313 + }, + { + "epoch": 0.76, + "grad_norm": 9.853050851588272, + "importance_ratio": 0.60546875, + "kl_div": -0.24869604408740997, + "kl_div_pos": -0.5010824799537659, + "kl_div_sft": 0.0036904041189700365, + "learning_rate": 1.5057096247960848e-06, + "loss": 0.0182, + "ppo_loss": -0.6058744788169861, + "sft_loss": 0.06293935328722, + "step": 314 + }, + { + "epoch": 0.76, + "grad_norm": 1.02099609375, + "importance_ratio": 0.98828125, + "kl_div": -0.012375455349683762, + "kl_div_pos": -0.012375455349683762, + "learning_rate": 1.504078303425775e-06, + "loss": -0.0341, + "ppo_loss": -0.9878675937652588, + "step": 315 + }, + { + "epoch": 0.77, + "grad_norm": 1.6083232813440314, + "kl_div": -0.019710950553417206, + "kl_div_sft": -0.019710950553417206, + "learning_rate": 1.5024469820554647e-06, + "loss": -0.1431, + "sft_loss": 0.0729701817035675, + "step": 316 + }, + { + "epoch": 0.77, + "grad_norm": 1.2108278286400305, + "importance_ratio": 0.96875, + "kl_div": -0.03259526938199997, + "kl_div_pos": -0.03259526938199997, + "learning_rate": 1.500815660685155e-06, + "loss": -0.0835, + "ppo_loss": -0.967930793762207, + "step": 317 + }, + { + "epoch": 0.77, + "grad_norm": 1.3044009322260497, + "importance_ratio": 0.96484375, + "kl_div": -0.016362616792321205, + "kl_div_pos": -0.03751760348677635, + "kl_div_sft": 0.004792371299117804, + "learning_rate": 1.499184339314845e-06, + "loss": -0.0453, + "ppo_loss": -0.963177502155304, + "sft_loss": 0.032457318156957626, + "step": 318 + }, + { + "epoch": 0.77, + "grad_norm": 4.55266629580963, + "importance_ratio": 0.7265625, + "kl_div": -0.3182659149169922, + "kl_div_neg": -0.3182659149169922, + "learning_rate": 1.4975530179445351e-06, + "loss": 0.0939, + "ppo_loss": 0.800000011920929, + "step": 319 + }, + { + "epoch": 0.78, + "grad_norm": 1.2009390256397048, + "importance_ratio": 0.97265625, + "kl_div": -0.012672297656536102, + "kl_div_neg": -0.027501724660396576, + "kl_div_sft": 0.002157128881663084, + "learning_rate": 1.4959216965742249e-06, + "loss": 0.233, + "ppo_loss": 0.9728729724884033, + "sft_loss": 0.050059814006090164, + "step": 320 + }, + { + "epoch": 0.78, + "grad_norm": 1.0228750998655343, + "importance_ratio": 0.9453125, + "kl_div": -0.022379038855433464, + "kl_div_neg": -0.054841797798871994, + "kl_div_sft": 0.010083720088005066, + "learning_rate": 1.494290375203915e-06, + "loss": -0.0796, + "ppo_loss": 0.9466348886489868, + "sft_loss": 0.018363839015364647, + "step": 321 + }, + { + "epoch": 0.78, + "grad_norm": 1.4901011796174906, + "importance_ratio": 0.890625, + "kl_div": -0.12282276898622513, + "kl_div_neg": -4.3708219891414046e-05, + "kl_div_pos": -0.2456018328666687, + "learning_rate": 1.4926590538336053e-06, + "loss": 0.0191, + "ppo_loss": 0.10886135697364807, + "step": 322 + }, + { + "epoch": 0.78, + "grad_norm": 5.4440468147725865, + "importance_ratio": 0.77734375, + "kl_div": -0.13044001162052155, + "kl_div_neg": -0.2507326006889343, + "kl_div_sft": -0.010147427208721638, + "learning_rate": 1.4910277324632953e-06, + "loss": 0.0005, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.10479097068309784, + "step": 323 + }, + { + "epoch": 0.79, + "grad_norm": 1.1518768916577657, + "importance_ratio": 0.90234375, + "kl_div": -0.05070945993065834, + "kl_div_pos": -0.1028522476553917, + "kl_div_sft": 0.0014333281433209777, + "learning_rate": 1.4893964110929853e-06, + "loss": 0.0619, + "ppo_loss": -0.9022603034973145, + "sft_loss": 0.0915645956993103, + "step": 324 + }, + { + "epoch": 0.79, + "grad_norm": 1.8697849227514465, + "kl_div": 0.005619871895760298, + "kl_div_sft": 0.005619871895760298, + "learning_rate": 1.4877650897226752e-06, + "loss": -0.026, + "sft_loss": 0.05023251101374626, + "step": 325 + }, + { + "epoch": 0.79, + "grad_norm": 2.931214771181061, + "importance_ratio": 0.92578125, + "kl_div": -0.03833425045013428, + "kl_div_neg": -0.07714598625898361, + "kl_div_sft": 0.0004774852131959051, + "learning_rate": 1.4861337683523654e-06, + "loss": 0.1182, + "ppo_loss": 0.9257546663284302, + "sft_loss": 0.046782199293375015, + "step": 326 + }, + { + "epoch": 0.79, + "grad_norm": 4.7159573204691245, + "kl_div": -0.0009512719698250294, + "kl_div_sft": -0.0009512719698250294, + "learning_rate": 1.4845024469820552e-06, + "loss": -0.0133, + "sft_loss": 0.0551016591489315, + "step": 327 + }, + { + "epoch": 0.8, + "grad_norm": 4.158387287123689, + "importance_ratio": 0.8203125, + "kl_div": -0.10169047117233276, + "kl_div_neg": -0.19945180416107178, + "kl_div_sft": -0.003929144237190485, + "learning_rate": 1.4828711256117454e-06, + "loss": 0.0485, + "ppo_loss": 0.8191797137260437, + "sft_loss": 0.08998027443885803, + "step": 328 + }, + { + "epoch": 0.8, + "grad_norm": 2.8205447959865824, + "kl_div": 0.008606771007180214, + "kl_div_sft": 0.008606771007180214, + "learning_rate": 1.4812398042414356e-06, + "loss": -0.0295, + "sft_loss": 0.046088241040706635, + "step": 329 + }, + { + "epoch": 0.8, + "grad_norm": 1.2197406361778134, + "importance_ratio": 0.9296875, + "kl_div": -0.033290740102529526, + "kl_div_pos": -0.07345406711101532, + "kl_div_sft": 0.006872584577649832, + "learning_rate": 1.4796084828711256e-06, + "loss": -0.0098, + "ppo_loss": -0.9291788339614868, + "sft_loss": 0.03272410109639168, + "step": 330 + }, + { + "epoch": 0.8, + "grad_norm": 2.049644867444505, + "importance_ratio": 0.96875, + "kl_div": -0.03301801532506943, + "kl_div_pos": -0.03301801532506943, + "learning_rate": 1.4779771615008156e-06, + "loss": -0.0545, + "ppo_loss": -0.9675254225730896, + "step": 331 + }, + { + "epoch": 0.8, + "grad_norm": 1.7787836634867733, + "importance_ratio": 0.9296875, + "kl_div": -0.036390505731105804, + "kl_div_neg": -0.07250487059354782, + "kl_div_sft": -0.0002761399664450437, + "learning_rate": 1.4763458401305055e-06, + "loss": -0.0752, + "ppo_loss": 0.9300612211227417, + "sft_loss": 0.05477307736873627, + "step": 332 + }, + { + "epoch": 0.81, + "grad_norm": 1.7759055554391399, + "importance_ratio": 0.94140625, + "kl_div": -0.06303277611732483, + "kl_div_neg": -0.11272674053907394, + "kl_div_pos": -0.013338807038962841, + "learning_rate": 1.4747145187601957e-06, + "loss": -0.1584, + "ppo_loss": -0.04667750000953674, + "step": 333 + }, + { + "epoch": 0.81, + "grad_norm": 2.3862452060657304, + "importance_ratio": 0.9921875, + "kl_div": -0.006657070480287075, + "kl_div_pos": -0.006657070480287075, + "learning_rate": 1.473083197389886e-06, + "loss": -0.2039, + "ppo_loss": -0.993366003036499, + "step": 334 + }, + { + "epoch": 0.81, + "grad_norm": 4.780790419088667, + "importance_ratio": 0.953125, + "kl_div": -0.0286087803542614, + "kl_div_pos": -0.04897100850939751, + "kl_div_sft": -0.00824655406177044, + "learning_rate": 1.4714518760195757e-06, + "loss": -0.0568, + "ppo_loss": -0.9522087574005127, + "sft_loss": 0.097450390458107, + "step": 335 + }, + { + "epoch": 0.81, + "grad_norm": 2.2356525316854676, + "importance_ratio": 0.9765625, + "kl_div": -0.011972763575613499, + "kl_div_neg": -0.02487398311495781, + "kl_div_sft": 0.000928456720430404, + "learning_rate": 1.469820554649266e-06, + "loss": 0.0395, + "ppo_loss": 0.975432813167572, + "sft_loss": 0.07037409394979477, + "step": 336 + }, + { + "epoch": 0.82, + "grad_norm": 3.4639200486698103, + "importance_ratio": 0.9921875, + "kl_div": -0.00725951325148344, + "kl_div_neg": -0.008314115926623344, + "kl_div_sft": -0.006204910576343536, + "learning_rate": 1.4681892332789559e-06, + "loss": 0.0526, + "ppo_loss": 0.9917203783988953, + "sft_loss": 0.06223254278302193, + "step": 337 + }, + { + "epoch": 0.82, + "grad_norm": 2.7775114843095055, + "importance_ratio": 1.0, + "kl_div": -0.0025152729358524084, + "kl_div_pos": 0.003554165828973055, + "kl_div_sft": -0.008584711700677872, + "learning_rate": 1.4665579119086459e-06, + "loss": 0.0289, + "ppo_loss": -1.0035605430603027, + "sft_loss": 0.0701230838894844, + "step": 338 + }, + { + "epoch": 0.82, + "grad_norm": 1.4601875869221272, + "kl_div": 0.003021697048097849, + "kl_div_sft": 0.003021697048097849, + "learning_rate": 1.4649265905383359e-06, + "loss": -0.172, + "sft_loss": 0.07517191767692566, + "step": 339 + }, + { + "epoch": 0.82, + "grad_norm": 1.8553365881180026, + "importance_ratio": 0.79296875, + "kl_div": -0.23275145888328552, + "kl_div_neg": -0.26393523812294006, + "kl_div_pos": -0.20156769454479218, + "learning_rate": 1.463295269168026e-06, + "loss": -0.1994, + "ppo_loss": -0.008724093437194824, + "step": 340 + }, + { + "epoch": 0.83, + "grad_norm": 1.9243318314649143, + "importance_ratio": 0.9296875, + "kl_div": -0.07310383766889572, + "kl_div_neg": -0.07310383766889572, + "learning_rate": 1.4616639477977163e-06, + "loss": -0.0659, + "ppo_loss": 0.9309701323509216, + "step": 341 + }, + { + "epoch": 0.83, + "grad_norm": 1.956693517358818, + "importance_ratio": 0.93359375, + "kl_div": -0.036307819187641144, + "kl_div_pos": -0.06998840719461441, + "kl_div_sft": -0.002627227921038866, + "learning_rate": 1.460032626427406e-06, + "loss": -0.0614, + "ppo_loss": -0.932404637336731, + "sft_loss": 0.07751280814409256, + "step": 342 + }, + { + "epoch": 0.83, + "grad_norm": 3.135826582531772, + "importance_ratio": 0.96875, + "kl_div": -0.03073815256357193, + "kl_div_pos": -0.03073815256357193, + "learning_rate": 1.4584013050570962e-06, + "loss": -0.0142, + "ppo_loss": -0.969732403755188, + "step": 343 + }, + { + "epoch": 0.83, + "grad_norm": 5.727697824552616, + "importance_ratio": 0.83203125, + "kl_div": -0.08306290209293365, + "kl_div_neg": -0.18256793916225433, + "kl_div_sft": 0.016442136839032173, + "learning_rate": 1.4567699836867862e-06, + "loss": 0.108, + "ppo_loss": 0.8331279754638672, + "sft_loss": 0.02431655116379261, + "step": 344 + }, + { + "epoch": 0.84, + "grad_norm": 1.160169402282351, + "importance_ratio": 0.8359375, + "kl_div": -0.08823750913143158, + "kl_div_neg": -0.1810176968574524, + "kl_div_sft": 0.004542672540992498, + "learning_rate": 1.4551386623164764e-06, + "loss": 0.1544, + "ppo_loss": 0.8344205617904663, + "sft_loss": 0.043373603373765945, + "step": 345 + }, + { + "epoch": 0.84, + "grad_norm": 1.4447440189278034, + "importance_ratio": 0.96484375, + "kl_div": -0.015825355425477028, + "kl_div_neg": -0.03718387335538864, + "kl_div_sft": 0.005533162504434586, + "learning_rate": 1.4535073409461662e-06, + "loss": -0.0611, + "ppo_loss": 0.9634990096092224, + "sft_loss": 0.0831359475851059, + "step": 346 + }, + { + "epoch": 0.84, + "grad_norm": 3.0823128188476296, + "importance_ratio": 0.98046875, + "kl_div": -0.014321206137537956, + "kl_div_neg": -0.02046828903257847, + "kl_div_sft": -0.008174123242497444, + "learning_rate": 1.4518760195758564e-06, + "loss": 0.0207, + "ppo_loss": 0.9797397255897522, + "sft_loss": 0.12043432146310806, + "step": 347 + }, + { + "epoch": 0.84, + "grad_norm": 6.68119215052569, + "importance_ratio": 0.8671875, + "kl_div": -0.15497435629367828, + "kl_div_neg": -0.30931636691093445, + "kl_div_pos": -0.0006323597626760602, + "learning_rate": 1.4502446982055466e-06, + "loss": -0.0633, + "ppo_loss": -0.09968394041061401, + "step": 348 + }, + { + "epoch": 0.85, + "grad_norm": 1.6288848802367726, + "importance_ratio": 0.9609375, + "kl_div": -0.019876418635249138, + "kl_div_pos": -0.039128370583057404, + "kl_div_sft": -0.0006244677351787686, + "learning_rate": 1.4486133768352363e-06, + "loss": -0.11, + "ppo_loss": -0.9616272449493408, + "sft_loss": 0.03771361708641052, + "step": 349 + }, + { + "epoch": 0.85, + "grad_norm": 1.7350734644570496, + "importance_ratio": 1.0078125, + "kl_div": 0.006748631596565247, + "kl_div_pos": 0.009035097435116768, + "kl_div_sft": 0.004462166223675013, + "learning_rate": 1.4469820554649265e-06, + "loss": -0.1506, + "ppo_loss": -1.0090761184692383, + "sft_loss": 0.041371047496795654, + "step": 350 + }, + { + "epoch": 0.85, + "grad_norm": 1.6053564027705443, + "kl_div": -0.07098940759897232, + "kl_div_sft": -0.07098940759897232, + "learning_rate": 1.4453507340946165e-06, + "loss": -0.1262, + "sft_loss": 0.13393214344978333, + "step": 351 + }, + { + "epoch": 0.85, + "grad_norm": 0.9634725116278806, + "kl_div": 0.0022934931330382824, + "kl_div_sft": 0.0022934931330382824, + "learning_rate": 1.4437194127243067e-06, + "loss": 0.0054, + "sft_loss": 0.0617811381816864, + "step": 352 + }, + { + "epoch": 0.86, + "grad_norm": 0.9781766445188108, + "importance_ratio": 0.97265625, + "kl_div": -0.05350446328520775, + "kl_div_pos": -0.026010213419795036, + "kl_div_sft": -0.08099871128797531, + "learning_rate": 1.4420880913539967e-06, + "loss": 0.0701, + "ppo_loss": -0.9743251204490662, + "sft_loss": 0.16814443469047546, + "step": 353 + }, + { + "epoch": 0.86, + "grad_norm": 0.9834217724119791, + "importance_ratio": 0.97265625, + "kl_div": -0.01595260016620159, + "kl_div_pos": -0.02615179680287838, + "kl_div_sft": -0.005753403063863516, + "learning_rate": 1.4404567699836867e-06, + "loss": 0.0254, + "ppo_loss": -0.9741871356964111, + "sft_loss": 0.06098558008670807, + "step": 354 + }, + { + "epoch": 0.86, + "grad_norm": 2.190498367981171, + "importance_ratio": 0.8359375, + "kl_div": -0.089511938393116, + "kl_div_neg": -0.176958367228508, + "kl_div_sft": -0.0020655107218772173, + "learning_rate": 1.4388254486133769e-06, + "loss": -0.0161, + "ppo_loss": 0.8378146290779114, + "sft_loss": 0.12363357841968536, + "step": 355 + }, + { + "epoch": 0.86, + "grad_norm": 0.853924616191224, + "kl_div": 0.008525339886546135, + "kl_div_sft": 0.008525339886546135, + "learning_rate": 1.4371941272430669e-06, + "loss": 0.0255, + "sft_loss": 0.08300769329071045, + "step": 356 + }, + { + "epoch": 0.87, + "grad_norm": 1.416110415738903, + "importance_ratio": 0.984375, + "kl_div": -0.008234689943492413, + "kl_div_neg": -0.01620335318148136, + "kl_div_sft": -0.0002660272584762424, + "learning_rate": 1.4355628058727568e-06, + "loss": 0.0244, + "ppo_loss": 0.9839272499084473, + "sft_loss": 0.05065668746829033, + "step": 357 + }, + { + "epoch": 0.87, + "grad_norm": 1.1595674298523522, + "importance_ratio": 0.7734375, + "kl_div": -0.13369768857955933, + "kl_div_neg": -0.25916311144828796, + "kl_div_sft": -0.00823226198554039, + "learning_rate": 1.4339314845024468e-06, + "loss": -0.0392, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08650946617126465, + "step": 358 + }, + { + "epoch": 0.87, + "grad_norm": 1.7837342456317047, + "kl_div": 0.008224120363593102, + "kl_div_sft": 0.008224120363593102, + "learning_rate": 1.432300163132137e-06, + "loss": 0.0372, + "sft_loss": 0.09266399592161179, + "step": 359 + }, + { + "epoch": 0.87, + "grad_norm": 0.949493870679729, + "importance_ratio": 1.0078125, + "kl_div": -0.0019446432124823332, + "kl_div_pos": 0.006772224325686693, + "kl_div_sft": -0.01066151075065136, + "learning_rate": 1.4306688417618272e-06, + "loss": 0.0486, + "ppo_loss": -1.0067951679229736, + "sft_loss": 0.10154891014099121, + "step": 360 + }, + { + "epoch": 0.88, + "grad_norm": 1.4625758811632306, + "importance_ratio": 0.9921875, + "kl_div": -0.010378789156675339, + "kl_div_pos": -0.008084448054432869, + "kl_div_sft": -0.012673130258917809, + "learning_rate": 1.429037520391517e-06, + "loss": -0.0158, + "ppo_loss": -0.991948127746582, + "sft_loss": 0.08620632439851761, + "step": 361 + }, + { + "epoch": 0.88, + "grad_norm": 1.2196710969455766, + "importance_ratio": 0.796875, + "kl_div": -0.22868932783603668, + "kl_div_neg": -0.22868932783603668, + "learning_rate": 1.4274061990212072e-06, + "loss": -0.0005, + "ppo_loss": 0.8147095441818237, + "step": 362 + }, + { + "epoch": 0.88, + "grad_norm": 4.837035810885735, + "importance_ratio": 0.96875, + "kl_div": -0.005435936152935028, + "kl_div_pos": -0.03251469507813454, + "kl_div_sft": 0.02164282277226448, + "learning_rate": 1.4257748776508972e-06, + "loss": -0.2686, + "ppo_loss": -0.9680081605911255, + "sft_loss": 0.076421357691288, + "step": 363 + }, + { + "epoch": 0.88, + "grad_norm": 1.3897824252978406, + "importance_ratio": 0.74609375, + "kl_div": -0.29730668663978577, + "kl_div_neg": -0.29730668663978577, + "learning_rate": 1.4241435562805872e-06, + "loss": 0.011, + "ppo_loss": 0.800000011920929, + "step": 364 + }, + { + "epoch": 0.88, + "grad_norm": 1.8646535877937016, + "importance_ratio": 0.984375, + "kl_div": -0.015370756387710571, + "kl_div_pos": -0.015370756387710571, + "learning_rate": 1.4225122349102771e-06, + "loss": 0.0536, + "ppo_loss": -0.9847500324249268, + "step": 365 + }, + { + "epoch": 0.89, + "grad_norm": 2.2570258753505583, + "importance_ratio": 0.80078125, + "kl_div": -0.23754560947418213, + "kl_div_pos": -0.22059480845928192, + "kl_div_sft": -0.25449639558792114, + "learning_rate": 1.4208809135399673e-06, + "loss": 0.2512, + "ppo_loss": -0.8020416498184204, + "sft_loss": 0.29686230421066284, + "step": 366 + }, + { + "epoch": 0.89, + "grad_norm": 3.6046846638615846, + "importance_ratio": 0.94921875, + "kl_div": -0.01577794924378395, + "kl_div_pos": -0.05074576660990715, + "kl_div_sft": 0.019189869984984398, + "learning_rate": 1.4192495921696575e-06, + "loss": -0.1423, + "ppo_loss": -0.9505202770233154, + "sft_loss": 0.0688878744840622, + "step": 367 + }, + { + "epoch": 0.89, + "grad_norm": 3.187210369041152, + "importance_ratio": 0.9921875, + "kl_div": -0.008837157860398293, + "kl_div_pos": -0.006236175075173378, + "kl_div_sft": -0.011438139714300632, + "learning_rate": 1.4176182707993473e-06, + "loss": -0.1118, + "ppo_loss": -0.993783175945282, + "sft_loss": 0.06663285195827484, + "step": 368 + }, + { + "epoch": 0.89, + "grad_norm": 2.231233001558751, + "importance_ratio": 0.89453125, + "kl_div": -0.0557381734251976, + "kl_div_neg": -0.10951899737119675, + "kl_div_sft": -0.0019573522731661797, + "learning_rate": 1.4159869494290375e-06, + "loss": -0.1313, + "ppo_loss": 0.8962652087211609, + "sft_loss": 0.07832780480384827, + "step": 369 + }, + { + "epoch": 0.9, + "grad_norm": 1.2428612471342046, + "importance_ratio": 0.953125, + "kl_div": -0.019023537635803223, + "kl_div_pos": -0.04916612431406975, + "kl_div_sft": 0.011119049973785877, + "learning_rate": 1.4143556280587275e-06, + "loss": 0.0106, + "ppo_loss": -0.952022910118103, + "sft_loss": 0.05877070873975754, + "step": 370 + }, + { + "epoch": 0.9, + "grad_norm": 12.252440540330564, + "kl_div": 0.0018844157457351685, + "kl_div_sft": 0.0018844157457351685, + "learning_rate": 1.4127243066884177e-06, + "loss": 0.0404, + "sft_loss": 0.03680435195565224, + "step": 371 + }, + { + "epoch": 0.9, + "grad_norm": 1.581418896125047, + "importance_ratio": 0.9921875, + "kl_div": -0.0039640204049646854, + "kl_div_pos": -0.008107632398605347, + "kl_div_sft": 0.0001795917487470433, + "learning_rate": 1.4110929853181075e-06, + "loss": -0.0892, + "ppo_loss": -0.9919251799583435, + "sft_loss": 0.045454829931259155, + "step": 372 + }, + { + "epoch": 0.9, + "grad_norm": 1.1601368809377202, + "importance_ratio": 0.77734375, + "kl_div": -0.12817691266536713, + "kl_div_neg": -0.25409457087516785, + "kl_div_sft": -0.002259261906147003, + "learning_rate": 1.4094616639477976e-06, + "loss": 0.0758, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09486612677574158, + "step": 373 + }, + { + "epoch": 0.91, + "grad_norm": 1.3684898930581635, + "importance_ratio": 0.81640625, + "kl_div": -0.20381605625152588, + "kl_div_neg": -0.2413814812898636, + "kl_div_pos": -0.16625064611434937, + "learning_rate": 1.4078303425774878e-06, + "loss": 0.0803, + "ppo_loss": -0.023416996002197266, + "step": 374 + }, + { + "epoch": 0.91, + "grad_norm": 1.258158807883002, + "importance_ratio": 0.9921875, + "kl_div": -0.010114632546901703, + "kl_div_pos": -0.010114632546901703, + "learning_rate": 1.4061990212071776e-06, + "loss": -0.1264, + "ppo_loss": -0.9899537563323975, + "step": 375 + }, + { + "epoch": 0.91, + "grad_norm": 0.9657481661272326, + "kl_div": 0.006769304163753986, + "kl_div_sft": 0.006769304163753986, + "learning_rate": 1.4045676998368678e-06, + "loss": -0.1026, + "sft_loss": 0.03854276239871979, + "step": 376 + }, + { + "epoch": 0.91, + "grad_norm": 2.465672566737089, + "importance_ratio": 1.0, + "kl_div": 0.005747776944190264, + "kl_div_pos": -0.0016325340839102864, + "kl_div_sft": 0.013128087855875492, + "learning_rate": 1.4029363784665578e-06, + "loss": -0.0012, + "ppo_loss": -0.9983687996864319, + "sft_loss": 0.05518368259072304, + "step": 377 + }, + { + "epoch": 0.92, + "grad_norm": 1.1353959189569596, + "importance_ratio": 0.8046875, + "kl_div": -0.2200791984796524, + "kl_div_neg": -0.2200791984796524, + "learning_rate": 1.401305057096248e-06, + "loss": -0.0909, + "ppo_loss": 0.8086894750595093, + "step": 378 + }, + { + "epoch": 0.92, + "grad_norm": 9.269111067567923, + "importance_ratio": 0.828125, + "kl_div": -0.20169486105442047, + "kl_div_neg": -0.3609882593154907, + "kl_div_pos": -0.04240145534276962, + "learning_rate": 1.399673735725938e-06, + "loss": 0.0275, + "ppo_loss": -0.07924246788024902, + "step": 379 + }, + { + "epoch": 0.92, + "grad_norm": 1.125246074043346, + "kl_div": -0.07262912392616272, + "kl_div_sft": -0.07262912392616272, + "learning_rate": 1.398042414355628e-06, + "loss": -0.0746, + "sft_loss": 0.11740092933177948, + "step": 380 + }, + { + "epoch": 0.92, + "grad_norm": 1.1147266307707406, + "kl_div": -0.002457402413710952, + "kl_div_sft": -0.002457402413710952, + "learning_rate": 1.3964110929853182e-06, + "loss": -0.0434, + "sft_loss": 0.04604826122522354, + "step": 381 + }, + { + "epoch": 0.93, + "grad_norm": 3.2083150359453554, + "importance_ratio": 0.9609375, + "kl_div": -0.015367057174444199, + "kl_div_pos": -0.04033532366156578, + "kl_div_sft": 0.009601208381354809, + "learning_rate": 1.3947797716150081e-06, + "loss": -0.001, + "ppo_loss": -0.9604672789573669, + "sft_loss": 0.020802391692996025, + "step": 382 + }, + { + "epoch": 0.93, + "grad_norm": 1.150521806837677, + "importance_ratio": 0.75390625, + "kl_div": -0.28163760900497437, + "kl_div_neg": -0.28163760900497437, + "learning_rate": 1.3931484502446981e-06, + "loss": 0.0511, + "ppo_loss": 0.800000011920929, + "step": 383 + }, + { + "epoch": 0.93, + "grad_norm": 0.8816599791789368, + "kl_div": -4.7477660700678825e-06, + "kl_div_sft": -4.7477660700678825e-06, + "learning_rate": 1.3915171288743881e-06, + "loss": 0.0797, + "sft_loss": 0.07828231900930405, + "step": 384 + }, + { + "epoch": 0.93, + "grad_norm": 1.8813316251466288, + "kl_div": -0.03393517807126045, + "kl_div_sft": -0.03393517807126045, + "learning_rate": 1.3898858075040783e-06, + "loss": -0.0095, + "sft_loss": 0.12192416191101074, + "step": 385 + }, + { + "epoch": 0.94, + "grad_norm": 1.2005491549887957, + "kl_div": -0.0066083818674087524, + "kl_div_sft": -0.0066083818674087524, + "learning_rate": 1.3882544861337683e-06, + "loss": -0.0687, + "sft_loss": 0.050567544996738434, + "step": 386 + }, + { + "epoch": 0.94, + "grad_norm": 5.512368772852106, + "importance_ratio": 0.9453125, + "kl_div": -0.03097151219844818, + "kl_div_neg": -0.05789247900247574, + "kl_div_sft": -0.004050543997436762, + "learning_rate": 1.3866231647634583e-06, + "loss": -0.1663, + "ppo_loss": 0.9437513947486877, + "sft_loss": 0.08977137506008148, + "step": 387 + }, + { + "epoch": 0.94, + "grad_norm": 4.165481525992378, + "importance_ratio": 0.98828125, + "kl_div": -0.009134024381637573, + "kl_div_neg": -0.013471991755068302, + "kl_div_sft": -0.004796057473868132, + "learning_rate": 1.3849918433931485e-06, + "loss": 0.0507, + "ppo_loss": 0.9866183400154114, + "sft_loss": 0.047184206545352936, + "step": 388 + }, + { + "epoch": 0.94, + "grad_norm": 2.342263220643977, + "kl_div": -0.0007208693423308432, + "kl_div_sft": -0.0007208693423308432, + "learning_rate": 1.3833605220228385e-06, + "loss": -0.0463, + "sft_loss": 0.02857479453086853, + "step": 389 + }, + { + "epoch": 0.95, + "grad_norm": 1.3386212435370513, + "kl_div": 0.00022949720732867718, + "kl_div_sft": 0.00022949720732867718, + "learning_rate": 1.3817292006525284e-06, + "loss": -0.1283, + "sft_loss": 0.06676965206861496, + "step": 390 + }, + { + "epoch": 0.95, + "grad_norm": 1.335906803603844, + "kl_div": 0.006292227655649185, + "kl_div_sft": 0.006292227655649185, + "learning_rate": 1.3800978792822184e-06, + "loss": -0.0428, + "sft_loss": 0.07561105489730835, + "step": 391 + }, + { + "epoch": 0.95, + "grad_norm": 1.0108704416477208, + "importance_ratio": 0.7890625, + "kl_div": -0.23826153576374054, + "kl_div_neg": -0.23826153576374054, + "learning_rate": 1.3784665579119086e-06, + "loss": -0.0507, + "ppo_loss": 0.811415433883667, + "step": 392 + }, + { + "epoch": 0.95, + "grad_norm": 2.175136614488159, + "importance_ratio": 0.98828125, + "kl_div": -0.001840903889387846, + "kl_div_pos": -0.01280925888568163, + "kl_div_sft": 0.009127451106905937, + "learning_rate": 1.3768352365415988e-06, + "loss": -0.0873, + "ppo_loss": -0.9872723817825317, + "sft_loss": 0.13365277647972107, + "step": 393 + }, + { + "epoch": 0.96, + "grad_norm": 2.628608312629397, + "importance_ratio": 0.859375, + "kl_div": -0.0726788267493248, + "kl_div_neg": -0.15091568231582642, + "kl_div_sft": 0.005558023229241371, + "learning_rate": 1.3752039151712886e-06, + "loss": -0.004, + "ppo_loss": 0.8599202632904053, + "sft_loss": 0.078884057700634, + "step": 394 + }, + { + "epoch": 0.96, + "grad_norm": 0.8525592325278135, + "importance_ratio": 0.8671875, + "kl_div": -0.152493417263031, + "kl_div_neg": -0.152493417263031, + "learning_rate": 1.3735725938009788e-06, + "loss": 0.0276, + "ppo_loss": 0.8968149423599243, + "step": 395 + }, + { + "epoch": 0.96, + "grad_norm": 1.293563659811949, + "importance_ratio": 0.875, + "kl_div": -0.14297887682914734, + "kl_div_neg": -0.27100446820259094, + "kl_div_pos": -0.014953281730413437, + "learning_rate": 1.3719412724306688e-06, + "loss": -0.0321, + "ppo_loss": -0.09257897734642029, + "step": 396 + }, + { + "epoch": 0.96, + "grad_norm": 3.674110154354462, + "importance_ratio": 1.0, + "kl_div": -0.004077698569744825, + "kl_div_pos": 0.0026960691902786493, + "kl_div_sft": -0.010851466096937656, + "learning_rate": 1.3703099510603587e-06, + "loss": 0.0206, + "ppo_loss": -1.0026997327804565, + "sft_loss": 0.07455823570489883, + "step": 397 + }, + { + "epoch": 0.96, + "grad_norm": 2.217661630960872, + "importance_ratio": 0.73828125, + "kl_div": -0.14972834289073944, + "kl_div_neg": -0.3059646189212799, + "kl_div_sft": 0.006507941521704197, + "learning_rate": 1.3686786296900487e-06, + "loss": 0.0537, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.017921945080161095, + "step": 398 + }, + { + "epoch": 0.97, + "grad_norm": 4.9095395474020656, + "importance_ratio": 1.0078125, + "kl_div": -0.04424430802464485, + "kl_div_pos": 0.007046323735266924, + "kl_div_sft": -0.09553494304418564, + "learning_rate": 1.367047308319739e-06, + "loss": -0.1444, + "ppo_loss": -1.0070712566375732, + "sft_loss": 0.14671209454536438, + "step": 399 + }, + { + "epoch": 0.97, + "grad_norm": 0.9939955809780883, + "importance_ratio": 0.83984375, + "kl_div": -0.18479210138320923, + "kl_div_neg": -0.18479210138320923, + "learning_rate": 1.3654159869494291e-06, + "loss": -0.0179, + "ppo_loss": 0.8847052454948425, + "step": 400 + }, + { + "epoch": 0.97, + "grad_norm": 3.032307391233481, + "kl_div": -0.008666617795825005, + "kl_div_sft": -0.008666617795825005, + "learning_rate": 1.363784665579119e-06, + "loss": 0.1489, + "sft_loss": 0.06816105544567108, + "step": 401 + }, + { + "epoch": 0.97, + "grad_norm": 0.9823549221271792, + "importance_ratio": 0.96484375, + "kl_div": -0.02337481454014778, + "kl_div_pos": -0.03435087203979492, + "kl_div_sft": -0.012398757971823215, + "learning_rate": 1.362153344208809e-06, + "loss": -0.0555, + "ppo_loss": -0.966232419013977, + "sft_loss": 0.07946991920471191, + "step": 402 + }, + { + "epoch": 0.98, + "grad_norm": 1.0897954020833909, + "importance_ratio": 0.96875, + "kl_div": -0.011266342364251614, + "kl_div_pos": -0.03210162743926048, + "kl_div_sft": 0.009568942710757256, + "learning_rate": 1.360522022838499e-06, + "loss": 0.0457, + "ppo_loss": -0.9684081673622131, + "sft_loss": 0.04152145981788635, + "step": 403 + }, + { + "epoch": 0.98, + "grad_norm": 0.8808380936380005, + "importance_ratio": 0.8203125, + "kl_div": -0.10881634056568146, + "kl_div_neg": -0.1977686733007431, + "kl_div_sft": -0.019864002242684364, + "learning_rate": 1.3588907014681893e-06, + "loss": -0.1877, + "ppo_loss": 0.8205596804618835, + "sft_loss": 0.08723907917737961, + "step": 404 + }, + { + "epoch": 0.98, + "grad_norm": 1.8881065051054673, + "importance_ratio": 0.9453125, + "kl_div": -0.0581141859292984, + "kl_div_pos": -0.0581141859292984, + "learning_rate": 1.3572593800978793e-06, + "loss": 0.0755, + "ppo_loss": -0.9441255927085876, + "step": 405 + }, + { + "epoch": 0.98, + "grad_norm": 1.1999381327575513, + "importance_ratio": 0.69140625, + "kl_div": -0.1756732314825058, + "kl_div_neg": -0.36646994948387146, + "kl_div_sft": 0.015123496763408184, + "learning_rate": 1.3556280587275692e-06, + "loss": -0.0247, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07821010053157806, + "step": 406 + }, + { + "epoch": 0.99, + "grad_norm": 3.3795522436129866, + "importance_ratio": 0.9609375, + "kl_div": -0.020731983706355095, + "kl_div_pos": -0.03792019933462143, + "kl_div_sft": -0.003543769009411335, + "learning_rate": 1.3539967373572594e-06, + "loss": -0.084, + "ppo_loss": -0.96278977394104, + "sft_loss": 0.045178286731243134, + "step": 407 + }, + { + "epoch": 0.99, + "grad_norm": 1.1472833244353395, + "kl_div": 0.004671051632612944, + "kl_div_sft": 0.004671051632612944, + "learning_rate": 1.3523654159869492e-06, + "loss": -0.1033, + "sft_loss": 0.044013626873493195, + "step": 408 + }, + { + "epoch": 0.99, + "grad_norm": 0.998406093622832, + "kl_div": 0.000133444438688457, + "kl_div_sft": 0.000133444438688457, + "learning_rate": 1.3507340946166394e-06, + "loss": 0.0161, + "sft_loss": 0.04665825143456459, + "step": 409 + }, + { + "epoch": 0.99, + "grad_norm": 0.8757673713339846, + "importance_ratio": 0.98046875, + "kl_div": -0.010525347664952278, + "kl_div_pos": -0.02144579030573368, + "kl_div_sft": 0.000395095266867429, + "learning_rate": 1.3491027732463294e-06, + "loss": -0.1966, + "ppo_loss": -0.978782594203949, + "sft_loss": 0.04445347562432289, + "step": 410 + }, + { + "epoch": 1.0, + "grad_norm": 0.8917975487123769, + "importance_ratio": 0.79296875, + "kl_div": -0.10984192788600922, + "kl_div_neg": -0.23029771447181702, + "kl_div_sft": 0.010613863356411457, + "learning_rate": 1.3474714518760196e-06, + "loss": 0.0826, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03369507938623428, + "step": 411 + }, + { + "epoch": 1.0, + "grad_norm": 0.985083975625179, + "importance_ratio": 1.015625, + "kl_div": 0.0019731326028704643, + "kl_div_pos": 0.015019123442471027, + "kl_div_sft": -0.011072858236730099, + "learning_rate": 1.3458401305057096e-06, + "loss": 0.0667, + "ppo_loss": -1.0151324272155762, + "sft_loss": 0.11207890510559082, + "step": 412 + }, + { + "epoch": 1.0, + "grad_norm": 2.4819049191887, + "importance_ratio": 0.80859375, + "kl_div": -0.12128035724163055, + "kl_div_neg": -0.21045958995819092, + "kl_div_sft": -0.03210112079977989, + "learning_rate": 1.3442088091353996e-06, + "loss": -0.0648, + "ppo_loss": 0.8102118372917175, + "sft_loss": 0.06167708337306976, + "step": 413 + }, + { + "epoch": 1.0, + "grad_norm": 2.7244534355576664, + "importance_ratio": 0.7578125, + "kl_div": -0.28326472640037537, + "kl_div_neg": -0.4056023061275482, + "kl_div_pos": -0.1609271615743637, + "learning_rate": 1.3425774877650897e-06, + "loss": -0.0573, + "ppo_loss": -0.02567705512046814, + "step": 414 + }, + { + "epoch": 1.01, + "grad_norm": 1.059758914036212, + "importance_ratio": 0.8125, + "kl_div": -0.12452328205108643, + "kl_div_neg": -0.2088662087917328, + "kl_div_sft": -0.04018034785985947, + "learning_rate": 1.3409461663947797e-06, + "loss": -0.1405, + "ppo_loss": 0.8115037679672241, + "sft_loss": 0.11398042738437653, + "step": 415 + }, + { + "epoch": 1.01, + "grad_norm": 1.0173488365940238, + "kl_div": 0.010434374213218689, + "kl_div_sft": 0.010434374213218689, + "learning_rate": 1.3393148450244697e-06, + "loss": 0.0197, + "sft_loss": 0.02324233204126358, + "step": 416 + }, + { + "epoch": 1.01, + "grad_norm": 1.5338649609050454, + "importance_ratio": 1.0625, + "kl_div": 0.03886265680193901, + "kl_div_pos": 0.0616186261177063, + "kl_div_sft": 0.016106685623526573, + "learning_rate": 1.3376835236541597e-06, + "loss": -0.0194, + "ppo_loss": -1.0635566711425781, + "sft_loss": 0.0324275866150856, + "step": 417 + }, + { + "epoch": 1.01, + "grad_norm": 0.8521199632775311, + "importance_ratio": 0.90625, + "kl_div": -0.044288281351327896, + "kl_div_neg": -0.10058998316526413, + "kl_div_sft": 0.012013423256576061, + "learning_rate": 1.33605220228385e-06, + "loss": 0.047, + "ppo_loss": 0.9043037295341492, + "sft_loss": 0.10559868067502975, + "step": 418 + }, + { + "epoch": 1.02, + "grad_norm": 6.040839127614859, + "importance_ratio": 0.90625, + "kl_div": -0.10411019623279572, + "kl_div_neg": -0.22670073807239532, + "kl_div_pos": 0.018480347469449043, + "learning_rate": 1.33442088091354e-06, + "loss": 0.0881, + "ppo_loss": -0.10932603478431702, + "step": 419 + }, + { + "epoch": 1.02, + "grad_norm": 0.941359079532538, + "importance_ratio": 0.86328125, + "kl_div": -0.06987014412879944, + "kl_div_neg": -0.14838054776191711, + "kl_div_sft": 0.00864026416093111, + "learning_rate": 1.3327895595432299e-06, + "loss": 0.0411, + "ppo_loss": 0.8621029853820801, + "sft_loss": 0.03195761516690254, + "step": 420 + }, + { + "epoch": 1.02, + "grad_norm": 1.4519205331415717, + "importance_ratio": 0.90625, + "kl_div": -0.1107395812869072, + "kl_div_neg": -0.2647498846054077, + "kl_div_pos": 0.04327072575688362, + "learning_rate": 1.33115823817292e-06, + "loss": -0.0077, + "ppo_loss": -0.1221102774143219, + "step": 421 + }, + { + "epoch": 1.02, + "grad_norm": 0.6827252784033078, + "importance_ratio": 0.76171875, + "kl_div": -0.27574872970581055, + "kl_div_neg": -0.35648322105407715, + "kl_div_pos": -0.19501426815986633, + "learning_rate": 1.32952691680261e-06, + "loss": -0.1515, + "ppo_loss": -0.011411458253860474, + "step": 422 + }, + { + "epoch": 1.03, + "grad_norm": 3.1655908731502045, + "kl_div": -0.0026373867876827717, + "kl_div_sft": -0.0026373867876827717, + "learning_rate": 1.3278955954323e-06, + "loss": -0.0047, + "sft_loss": 0.05154386907815933, + "step": 423 + }, + { + "epoch": 1.03, + "grad_norm": 1.1272914120902908, + "kl_div": 0.006313965655863285, + "kl_div_sft": 0.006313965655863285, + "learning_rate": 1.3262642740619902e-06, + "loss": -0.0953, + "sft_loss": 0.06390878558158875, + "step": 424 + }, + { + "epoch": 1.03, + "grad_norm": 0.9623445273828378, + "importance_ratio": 0.99609375, + "kl_div": 0.0017089867033064365, + "kl_div_pos": -0.0037163347005844116, + "kl_div_sft": 0.007134308107197285, + "learning_rate": 1.3246329526916802e-06, + "loss": -0.1132, + "ppo_loss": -0.9962905645370483, + "sft_loss": 0.05456602945923805, + "step": 425 + }, + { + "epoch": 1.03, + "grad_norm": 0.8058896020720482, + "importance_ratio": 0.75, + "kl_div": -0.2867456078529358, + "kl_div_neg": -0.2867456078529358, + "learning_rate": 1.3230016313213704e-06, + "loss": -0.0598, + "ppo_loss": 0.800000011920929, + "step": 426 + }, + { + "epoch": 1.04, + "grad_norm": 1.1770250705489431, + "importance_ratio": 0.89453125, + "kl_div": -0.1184467077255249, + "kl_div_neg": -0.2329927533864975, + "kl_div_pos": -0.0039006578736007214, + "learning_rate": 1.3213703099510602e-06, + "loss": -0.0331, + "ppo_loss": -0.0980534553527832, + "step": 427 + }, + { + "epoch": 1.04, + "grad_norm": 1.0871435918710617, + "importance_ratio": 0.578125, + "kl_div": -0.27000996470451355, + "kl_div_neg": -0.5461243987083435, + "kl_div_sft": 0.006104460451751947, + "learning_rate": 1.3197389885807504e-06, + "loss": 0.0703, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0648282840847969, + "step": 428 + }, + { + "epoch": 1.04, + "grad_norm": 1.8659609672618034, + "importance_ratio": 1.015625, + "kl_div": 0.01871345564723015, + "kl_div_pos": 0.012792662717401981, + "kl_div_sft": 0.02463424950838089, + "learning_rate": 1.3181076672104404e-06, + "loss": 0.0333, + "ppo_loss": -1.0128748416900635, + "sft_loss": 0.08509034663438797, + "step": 429 + }, + { + "epoch": 1.04, + "grad_norm": 1.0429954953997032, + "importance_ratio": 1.0078125, + "kl_div": -0.006376433186233044, + "kl_div_pos": 0.006001722067594528, + "kl_div_sft": -0.018754588440060616, + "learning_rate": 1.3164763458401306e-06, + "loss": -0.0295, + "ppo_loss": -1.0060198307037354, + "sft_loss": 0.10955341160297394, + "step": 430 + }, + { + "epoch": 1.04, + "grad_norm": 3.2222155830344135, + "importance_ratio": 0.97265625, + "kl_div": -0.01591493934392929, + "kl_div_pos": -0.029178854078054428, + "kl_div_sft": -0.002651023678481579, + "learning_rate": 1.3148450244698205e-06, + "loss": -0.1148, + "ppo_loss": -0.9712427258491516, + "sft_loss": 0.047940175980329514, + "step": 431 + }, + { + "epoch": 1.05, + "grad_norm": 1.3930729953785197, + "importance_ratio": 0.984375, + "kl_div": -0.09542819857597351, + "kl_div_pos": -0.014116327278316021, + "kl_div_sft": -0.17674006521701813, + "learning_rate": 1.3132137030995105e-06, + "loss": -0.103, + "ppo_loss": -0.9859828352928162, + "sft_loss": 0.2210320681333542, + "step": 432 + }, + { + "epoch": 1.05, + "grad_norm": 1.1302485422227517, + "importance_ratio": 0.73046875, + "kl_div": -0.15205544233322144, + "kl_div_neg": -0.31576773524284363, + "kl_div_sft": 0.011656835675239563, + "learning_rate": 1.3115823817292007e-06, + "loss": -0.0245, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07695038616657257, + "step": 433 + }, + { + "epoch": 1.05, + "grad_norm": 1.670193762702793, + "importance_ratio": 0.69140625, + "kl_div": -0.17609615623950958, + "kl_div_neg": -0.36890721321105957, + "kl_div_sft": 0.01671488955616951, + "learning_rate": 1.3099510603588905e-06, + "loss": -0.1094, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05042430758476257, + "step": 434 + }, + { + "epoch": 1.05, + "grad_norm": 1.2478484233268006, + "importance_ratio": 0.67578125, + "kl_div": -0.18994270265102386, + "kl_div_neg": -0.3907208740711212, + "kl_div_sft": 0.010835465043783188, + "learning_rate": 1.3083197389885807e-06, + "loss": 0.0549, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.021010208874940872, + "step": 435 + }, + { + "epoch": 1.06, + "grad_norm": 1.1062430645569032, + "importance_ratio": 1.0, + "kl_div": -0.0013482579961419106, + "kl_div_neg": -0.014547939412295818, + "kl_div_pos": 0.011851423420011997, + "learning_rate": 1.3066884176182707e-06, + "loss": -0.0807, + "ppo_loss": -0.013182252645492554, + "step": 436 + }, + { + "epoch": 1.06, + "grad_norm": 2.2080934382239104, + "importance_ratio": 1.03125, + "kl_div": 0.017685813829302788, + "kl_div_pos": 0.030767135322093964, + "kl_div_sft": 0.004604491405189037, + "learning_rate": 1.3050570962479609e-06, + "loss": -0.0194, + "ppo_loss": -1.0312453508377075, + "sft_loss": 0.17565016448497772, + "step": 437 + }, + { + "epoch": 1.06, + "grad_norm": 1.1770000540831367, + "importance_ratio": 1.0546875, + "kl_div": 0.01865413226187229, + "kl_div_pos": 0.05109777674078941, + "kl_div_sft": -0.01378951221704483, + "learning_rate": 1.3034257748776508e-06, + "loss": -0.0517, + "ppo_loss": -1.052425742149353, + "sft_loss": 0.08110073208808899, + "step": 438 + }, + { + "epoch": 1.06, + "grad_norm": 1.2024259146314618, + "importance_ratio": 1.03125, + "kl_div": 0.03114350140094757, + "kl_div_pos": 0.03114350140094757, + "learning_rate": 1.3017944535073408e-06, + "loss": 0.0292, + "ppo_loss": -1.0317292213439941, + "step": 439 + }, + { + "epoch": 1.07, + "grad_norm": 1.219094056691804, + "kl_div": -0.028548669070005417, + "kl_div_sft": -0.028548669070005417, + "learning_rate": 1.300163132137031e-06, + "loss": 0.0065, + "sft_loss": 0.11218331754207611, + "step": 440 + }, + { + "epoch": 1.07, + "grad_norm": 3.7683389467998696, + "importance_ratio": 0.8828125, + "kl_div": -0.143281489610672, + "kl_div_pos": -0.143281489610672, + "learning_rate": 1.298531810766721e-06, + "loss": 0.0254, + "ppo_loss": -0.8847619295120239, + "step": 441 + }, + { + "epoch": 1.07, + "grad_norm": 1.0317494310100623, + "importance_ratio": 0.7421875, + "kl_div": -0.15573081374168396, + "kl_div_neg": -0.2998103201389313, + "kl_div_sft": -0.011651305481791496, + "learning_rate": 1.296900489396411e-06, + "loss": -0.0031, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09555532038211823, + "step": 442 + }, + { + "epoch": 1.07, + "grad_norm": 1.1342805492078307, + "importance_ratio": 1.03125, + "kl_div": -0.00379384309053421, + "kl_div_pos": 0.03288000449538231, + "kl_div_sft": -0.04046769067645073, + "learning_rate": 1.295269168026101e-06, + "loss": -0.0779, + "ppo_loss": -1.0334265232086182, + "sft_loss": 0.14797383546829224, + "step": 443 + }, + { + "epoch": 1.08, + "grad_norm": 1.3489916391078236, + "importance_ratio": 1.0, + "kl_div": 0.004089490510523319, + "kl_div_pos": 0.004089490510523319, + "learning_rate": 1.2936378466557912e-06, + "loss": -0.1421, + "ppo_loss": -1.0044682025909424, + "step": 444 + }, + { + "epoch": 1.08, + "grad_norm": 1.7208118902291312, + "kl_div": 0.01862707920372486, + "kl_div_sft": 0.01862707920372486, + "learning_rate": 1.2920065252854812e-06, + "loss": -0.0981, + "sft_loss": 0.06423333287239075, + "step": 445 + }, + { + "epoch": 1.08, + "grad_norm": 2.2753196868971592, + "importance_ratio": 0.9765625, + "kl_div": -0.02230643853545189, + "kl_div_neg": -0.06035029515624046, + "kl_div_pos": 0.015737419947981834, + "learning_rate": 1.2903752039151711e-06, + "loss": -0.0008, + "ppo_loss": -0.03721359372138977, + "step": 446 + }, + { + "epoch": 1.08, + "grad_norm": 3.5354757765214044, + "importance_ratio": 0.9765625, + "kl_div": -0.010805973783135414, + "kl_div_pos": -0.023163456469774246, + "kl_div_sft": 0.001551508903503418, + "learning_rate": 1.2887438825448613e-06, + "loss": -0.0635, + "ppo_loss": -0.9771027565002441, + "sft_loss": 0.05742768198251724, + "step": 447 + }, + { + "epoch": 1.09, + "grad_norm": 1.1104740555028498, + "importance_ratio": 1.015625, + "kl_div": 0.010768914595246315, + "kl_div_pos": 0.015338733792304993, + "kl_div_sft": 0.006199096329510212, + "learning_rate": 1.2871125611745513e-06, + "loss": 0.0476, + "ppo_loss": -1.0154569149017334, + "sft_loss": 0.034216687083244324, + "step": 448 + }, + { + "epoch": 1.09, + "grad_norm": 0.9657073075359999, + "importance_ratio": 1.03125, + "kl_div": 0.019894614815711975, + "kl_div_pos": 0.03344016894698143, + "kl_div_sft": 0.00634906068444252, + "learning_rate": 1.2854812398042413e-06, + "loss": 0.0014, + "ppo_loss": -1.0340055227279663, + "sft_loss": 0.10556098818778992, + "step": 449 + }, + { + "epoch": 1.09, + "grad_norm": 2.6250586730212717, + "importance_ratio": 1.015625, + "kl_div": 0.000533820129930973, + "kl_div_pos": 0.0163713488727808, + "kl_div_sft": -0.015303708612918854, + "learning_rate": 1.2838499184339315e-06, + "loss": 0.0689, + "ppo_loss": -1.0165060758590698, + "sft_loss": 0.08886807411909103, + "step": 450 + }, + { + "epoch": 1.09, + "grad_norm": 5.760296765735746, + "importance_ratio": 0.63671875, + "kl_div": -0.251537561416626, + "kl_div_neg": -0.45127934217453003, + "kl_div_sft": -0.05179579555988312, + "learning_rate": 1.2822185970636215e-06, + "loss": 0.0572, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13668057322502136, + "step": 451 + }, + { + "epoch": 1.1, + "grad_norm": 3.3639225896576432, + "importance_ratio": 0.9375, + "kl_div": -0.03847883641719818, + "kl_div_neg": -0.065961554646492, + "kl_div_sft": -0.010996121913194656, + "learning_rate": 1.2805872756933117e-06, + "loss": 0.0486, + "ppo_loss": 0.9361668825149536, + "sft_loss": 0.0495573990046978, + "step": 452 + }, + { + "epoch": 1.1, + "grad_norm": 4.073796924321981, + "kl_div": -0.020316394045948982, + "kl_div_sft": -0.020316394045948982, + "learning_rate": 1.2789559543230015e-06, + "loss": -0.0297, + "sft_loss": 0.1286567747592926, + "step": 453 + }, + { + "epoch": 1.1, + "grad_norm": 1.0464566092173484, + "importance_ratio": 0.75390625, + "kl_div": -0.2827034592628479, + "kl_div_neg": -0.2827034592628479, + "learning_rate": 1.2773246329526917e-06, + "loss": -0.0794, + "ppo_loss": 0.800000011920929, + "step": 454 + }, + { + "epoch": 1.1, + "grad_norm": 1.2423138824085187, + "importance_ratio": 0.73046875, + "kl_div": -0.15301235020160675, + "kl_div_neg": -0.31398218870162964, + "kl_div_sft": 0.00795749295502901, + "learning_rate": 1.2756933115823816e-06, + "loss": 0.0354, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04708397015929222, + "step": 455 + }, + { + "epoch": 1.11, + "grad_norm": 3.044844200019531, + "importance_ratio": 0.703125, + "kl_div": -0.356436163187027, + "kl_div_neg": -0.356436163187027, + "learning_rate": 1.2740619902120716e-06, + "loss": 0.122, + "ppo_loss": 0.800000011920929, + "step": 456 + }, + { + "epoch": 1.11, + "grad_norm": 2.7220759601182714, + "importance_ratio": 0.5078125, + "kl_div": -0.3348856568336487, + "kl_div_neg": -0.6812491416931152, + "kl_div_sft": 0.01147780753672123, + "learning_rate": 1.2724306688417618e-06, + "loss": 0.0421, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03663746267557144, + "step": 457 + }, + { + "epoch": 1.11, + "grad_norm": 2.3232017536020098, + "kl_div": 0.007984797470271587, + "kl_div_sft": 0.007984797470271587, + "learning_rate": 1.2707993474714518e-06, + "loss": -0.1334, + "sft_loss": 0.034479185938835144, + "step": 458 + }, + { + "epoch": 1.11, + "grad_norm": 2.1881886079592605, + "importance_ratio": 1.0078125, + "kl_div": 0.0066819763742387295, + "kl_div_pos": 0.005560300312936306, + "kl_div_sft": 0.007803652435541153, + "learning_rate": 1.269168026101142e-06, + "loss": 0.0103, + "ppo_loss": -1.0055757761001587, + "sft_loss": 0.10367721319198608, + "step": 459 + }, + { + "epoch": 1.12, + "grad_norm": 1.2141547012175877, + "importance_ratio": 0.59765625, + "kl_div": -0.5515552163124084, + "kl_div_neg": -0.5515552163124084, + "learning_rate": 1.2675367047308318e-06, + "loss": 0.017, + "ppo_loss": 0.800000011920929, + "step": 460 + }, + { + "epoch": 1.12, + "grad_norm": 1.2829905526935033, + "importance_ratio": 0.859375, + "kl_div": -0.1534956693649292, + "kl_div_neg": -0.1534956693649292, + "learning_rate": 1.265905383360522e-06, + "loss": -0.0386, + "ppo_loss": 0.8584249019622803, + "step": 461 + }, + { + "epoch": 1.12, + "grad_norm": 2.182536351970647, + "importance_ratio": 0.85546875, + "kl_div": -0.06841041147708893, + "kl_div_neg": -0.1546468734741211, + "kl_div_sft": 0.017826057970523834, + "learning_rate": 1.264274061990212e-06, + "loss": 0.0789, + "ppo_loss": 0.8567176461219788, + "sft_loss": 0.012218399904668331, + "step": 462 + }, + { + "epoch": 1.12, + "grad_norm": 2.370907117860891, + "importance_ratio": 0.890625, + "kl_div": -0.05210436135530472, + "kl_div_neg": -0.11743360757827759, + "kl_div_sft": 0.013224886730313301, + "learning_rate": 1.2626427406199021e-06, + "loss": -0.1983, + "ppo_loss": 0.8891995549201965, + "sft_loss": 0.030821723863482475, + "step": 463 + }, + { + "epoch": 1.12, + "grad_norm": 3.6608074508667863, + "importance_ratio": 0.74609375, + "kl_div": -0.15941019356250763, + "kl_div_neg": -0.2910042107105255, + "kl_div_sft": -0.0278161708265543, + "learning_rate": 1.2610114192495921e-06, + "loss": -0.0598, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09093907475471497, + "step": 464 + }, + { + "epoch": 1.13, + "grad_norm": 3.3330544514023743, + "importance_ratio": 0.90625, + "kl_div": -0.10372880846261978, + "kl_div_neg": -0.23328396677970886, + "kl_div_pos": 0.02582634426653385, + "learning_rate": 1.2593800978792821e-06, + "loss": -0.0951, + "ppo_loss": -0.11308136582374573, + "step": 465 + }, + { + "epoch": 1.13, + "grad_norm": 0.925412551278747, + "importance_ratio": 0.99609375, + "kl_div": -0.00442282110452652, + "kl_div_pos": -0.00442282110452652, + "learning_rate": 1.2577487765089723e-06, + "loss": -0.1149, + "ppo_loss": -0.9955874681472778, + "step": 466 + }, + { + "epoch": 1.13, + "grad_norm": 4.15379203451391, + "importance_ratio": 0.671875, + "kl_div": -0.19799412786960602, + "kl_div_neg": -0.3990720212459564, + "kl_div_sft": 0.003083764109760523, + "learning_rate": 1.256117455138662e-06, + "loss": -0.1798, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.049974698573350906, + "step": 467 + }, + { + "epoch": 1.13, + "grad_norm": 2.139147617605486, + "importance_ratio": 0.65234375, + "kl_div": -0.206068754196167, + "kl_div_neg": -0.42438942193984985, + "kl_div_sft": 0.012251907959580421, + "learning_rate": 1.2544861337683523e-06, + "loss": -0.0662, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09358756244182587, + "step": 468 + }, + { + "epoch": 1.14, + "grad_norm": 6.902050277610847, + "kl_div": -0.0074700559489429, + "kl_div_sft": -0.0074700559489429, + "learning_rate": 1.2528548123980423e-06, + "loss": -0.1717, + "sft_loss": 0.08451381325721741, + "step": 469 + }, + { + "epoch": 1.14, + "grad_norm": 1.4255865930905123, + "importance_ratio": 0.56640625, + "kl_div": -0.2817477285861969, + "kl_div_neg": -0.5675917863845825, + "kl_div_sft": 0.0040963380597531796, + "learning_rate": 1.2512234910277325e-06, + "loss": 0.0424, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04384386166930199, + "step": 470 + }, + { + "epoch": 1.14, + "grad_norm": 0.9755837991288189, + "importance_ratio": 0.83203125, + "kl_div": -0.21140116453170776, + "kl_div_neg": -0.4443986415863037, + "kl_div_pos": 0.021596306934952736, + "learning_rate": 1.2495921696574224e-06, + "loss": -0.0578, + "ppo_loss": -0.11091557145118713, + "step": 471 + }, + { + "epoch": 1.14, + "grad_norm": 6.980723952211021, + "importance_ratio": 1.03125, + "kl_div": 0.0269265566021204, + "kl_div_pos": 0.028264760971069336, + "kl_div_sft": 0.025588352233171463, + "learning_rate": 1.2479608482871124e-06, + "loss": -0.084, + "ppo_loss": -1.02866792678833, + "sft_loss": 0.09589555114507675, + "step": 472 + }, + { + "epoch": 1.15, + "grad_norm": 3.092916665405893, + "importance_ratio": 0.8515625, + "kl_div": -0.09231029450893402, + "kl_div_neg": -0.15904003381729126, + "kl_div_sft": -0.025580555200576782, + "learning_rate": 1.2463295269168026e-06, + "loss": -0.0882, + "ppo_loss": 0.8529621958732605, + "sft_loss": 0.0967254638671875, + "step": 473 + }, + { + "epoch": 1.15, + "grad_norm": 1.3537538437159136, + "kl_div": 0.011375309899449348, + "kl_div_sft": 0.011375309899449348, + "learning_rate": 1.2446982055464926e-06, + "loss": 0.1465, + "sft_loss": 0.028936251997947693, + "step": 474 + }, + { + "epoch": 1.15, + "grad_norm": 4.675210329924026, + "importance_ratio": 0.7578125, + "kl_div": -0.1408734768629074, + "kl_div_neg": -0.2784445285797119, + "kl_div_sft": -0.0033024323638528585, + "learning_rate": 1.2430668841761826e-06, + "loss": -0.1304, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09153730422258377, + "step": 475 + }, + { + "epoch": 1.15, + "grad_norm": 1.3261338960268474, + "importance_ratio": 1.03125, + "kl_div": 0.015856783837080002, + "kl_div_pos": 0.029548203572630882, + "kl_div_sft": 0.0021653659641742706, + "learning_rate": 1.2414355628058728e-06, + "loss": -0.12, + "ppo_loss": -1.0299891233444214, + "sft_loss": 0.04419136047363281, + "step": 476 + }, + { + "epoch": 1.16, + "grad_norm": 1.8804943962551242, + "importance_ratio": 0.9140625, + "kl_div": -0.04008149728178978, + "kl_div_neg": -0.09007521718740463, + "kl_div_sft": 0.00991221982985735, + "learning_rate": 1.2398042414355628e-06, + "loss": 0.0098, + "ppo_loss": 0.9138624668121338, + "sft_loss": 0.07724327594041824, + "step": 477 + }, + { + "epoch": 1.16, + "grad_norm": 0.8030858813306728, + "importance_ratio": 0.90625, + "kl_div": -0.11344952136278152, + "kl_div_neg": -0.272797554731369, + "kl_div_pos": 0.04589850828051567, + "learning_rate": 1.238172920065253e-06, + "loss": -0.0467, + "ppo_loss": -0.12348410487174988, + "step": 478 + }, + { + "epoch": 1.16, + "grad_norm": 1.2816110427916472, + "importance_ratio": 1.1015625, + "kl_div": 0.0360209122300148, + "kl_div_pos": 0.09649316221475601, + "kl_div_sft": -0.02445133589208126, + "learning_rate": 1.2365415986949427e-06, + "loss": -0.0362, + "ppo_loss": -1.1013020277023315, + "sft_loss": 0.06209849193692207, + "step": 479 + }, + { + "epoch": 1.16, + "grad_norm": 1.037616727765905, + "importance_ratio": 1.0703125, + "kl_div": 0.02870725654065609, + "kl_div_pos": 0.06513284146785736, + "kl_div_sft": -0.007718327920883894, + "learning_rate": 1.234910277324633e-06, + "loss": 0.0174, + "ppo_loss": -1.067300796508789, + "sft_loss": 0.07386572659015656, + "step": 480 + }, + { + "epoch": 1.17, + "grad_norm": 4.106961894343179, + "importance_ratio": 0.984375, + "kl_div": -0.05532138794660568, + "kl_div_pos": -0.017548711970448494, + "kl_div_sft": -0.09309406578540802, + "learning_rate": 1.233278955954323e-06, + "loss": -0.0662, + "ppo_loss": -0.9826043844223022, + "sft_loss": 0.14551551640033722, + "step": 481 + }, + { + "epoch": 1.17, + "grad_norm": 1.3741362199404208, + "importance_ratio": 0.7734375, + "kl_div": -0.1376347541809082, + "kl_div_neg": -0.2552362382411957, + "kl_div_sft": -0.020033257082104683, + "learning_rate": 1.231647634584013e-06, + "loss": -0.0029, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13010145723819733, + "step": 482 + }, + { + "epoch": 1.17, + "grad_norm": 2.532009246363337, + "importance_ratio": 0.75390625, + "kl_div": -0.28162503242492676, + "kl_div_neg": -0.28162503242492676, + "learning_rate": 1.230016313213703e-06, + "loss": 0.0856, + "ppo_loss": 0.800000011920929, + "step": 483 + }, + { + "epoch": 1.17, + "grad_norm": 1.1639983396321676, + "importance_ratio": 1.046875, + "kl_div": 0.02006608620285988, + "kl_div_pos": 0.04490042105317116, + "kl_div_sft": -0.004768249578773975, + "learning_rate": 1.228384991843393e-06, + "loss": -0.0734, + "ppo_loss": -1.0459237098693848, + "sft_loss": 0.043344151228666306, + "step": 484 + }, + { + "epoch": 1.18, + "grad_norm": 1.3736239396796828, + "kl_div": -0.0036171970423310995, + "kl_div_sft": -0.0036171970423310995, + "learning_rate": 1.2267536704730833e-06, + "loss": -0.0378, + "sft_loss": 0.0710780918598175, + "step": 485 + }, + { + "epoch": 1.18, + "grad_norm": 2.104948924480214, + "importance_ratio": 1.015625, + "kl_div": 0.012552674859762192, + "kl_div_pos": 0.012552674859762192, + "learning_rate": 1.225122349102773e-06, + "loss": -0.0419, + "ppo_loss": -1.0132044553756714, + "step": 486 + }, + { + "epoch": 1.18, + "grad_norm": 3.6187121348137707, + "importance_ratio": 0.8984375, + "kl_div": -0.12204791605472565, + "kl_div_neg": -0.2728288471698761, + "kl_div_pos": 0.028733013197779655, + "learning_rate": 1.2234910277324632e-06, + "loss": 0.0313, + "ppo_loss": -0.11457487940788269, + "step": 487 + }, + { + "epoch": 1.18, + "grad_norm": 1.8025541039758028, + "importance_ratio": 0.73828125, + "kl_div": -0.14328588545322418, + "kl_div_neg": -0.30288568139076233, + "kl_div_sft": 0.016313914209604263, + "learning_rate": 1.2218597063621532e-06, + "loss": 0.1435, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040791235864162445, + "step": 488 + }, + { + "epoch": 1.19, + "grad_norm": 1.0924486320168931, + "importance_ratio": 0.9375, + "kl_div": -0.026130639016628265, + "kl_div_neg": -0.06307917833328247, + "kl_div_sft": 0.01081790216267109, + "learning_rate": 1.2202283849918434e-06, + "loss": 0.024, + "ppo_loss": 0.9388691782951355, + "sft_loss": 0.02481883391737938, + "step": 489 + }, + { + "epoch": 1.19, + "grad_norm": 0.8550332062390997, + "kl_div": 0.006395288743078709, + "kl_div_sft": 0.006395288743078709, + "learning_rate": 1.2185970636215334e-06, + "loss": -0.031, + "sft_loss": 0.0363774374127388, + "step": 490 + }, + { + "epoch": 1.19, + "grad_norm": 2.9976819937770967, + "kl_div": -0.024867737665772438, + "kl_div_sft": -0.024867737665772438, + "learning_rate": 1.2169657422512234e-06, + "loss": 0.0257, + "sft_loss": 0.11894671618938446, + "step": 491 + }, + { + "epoch": 1.19, + "grad_norm": 1.127945434988827, + "kl_div": 0.004560403060168028, + "kl_div_sft": 0.004560403060168028, + "learning_rate": 1.2153344208809136e-06, + "loss": 0.1788, + "sft_loss": 0.06735538691282272, + "step": 492 + }, + { + "epoch": 1.2, + "grad_norm": 1.2091607133143483, + "importance_ratio": 0.72265625, + "kl_div": -0.1636078655719757, + "kl_div_neg": -0.32448697090148926, + "kl_div_sft": -0.0027287707198411226, + "learning_rate": 1.2137030995106034e-06, + "loss": -0.2138, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09669335186481476, + "step": 493 + }, + { + "epoch": 1.2, + "grad_norm": 2.0201115318705276, + "importance_ratio": 0.99609375, + "kl_div": -0.008533771149814129, + "kl_div_pos": -0.0022959003690630198, + "kl_div_sft": -0.014771642163395882, + "learning_rate": 1.2120717781402936e-06, + "loss": -0.1766, + "ppo_loss": -0.9977067112922668, + "sft_loss": 0.13752681016921997, + "step": 494 + }, + { + "epoch": 1.2, + "grad_norm": 3.9285253546849286, + "importance_ratio": 1.0, + "kl_div": -3.4654553019208834e-05, + "kl_div_pos": -2.737253271334339e-05, + "kl_div_sft": -4.1936571506084874e-05, + "learning_rate": 1.2104404567699838e-06, + "loss": -0.1369, + "ppo_loss": -0.9999725818634033, + "sft_loss": 0.08990359306335449, + "step": 495 + }, + { + "epoch": 1.2, + "grad_norm": 2.960428883694901, + "importance_ratio": 0.51171875, + "kl_div": -0.324387788772583, + "kl_div_neg": -0.6732898354530334, + "kl_div_sft": 0.024514272809028625, + "learning_rate": 1.2088091353996737e-06, + "loss": 0.0115, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.026017149910330772, + "step": 496 + }, + { + "epoch": 1.2, + "grad_norm": 1.1399086048064144, + "importance_ratio": 0.76953125, + "kl_div": -0.1295836865901947, + "kl_div_pos": -0.2599563002586365, + "kl_div_sft": 0.0007889288244768977, + "learning_rate": 1.2071778140293637e-06, + "loss": -0.0312, + "ppo_loss": -0.7710853219032288, + "sft_loss": 0.12191534042358398, + "step": 497 + }, + { + "epoch": 1.21, + "grad_norm": 1.391111567380043, + "kl_div": 0.016579890623688698, + "kl_div_sft": 0.016579890623688698, + "learning_rate": 1.2055464926590537e-06, + "loss": -0.1784, + "sft_loss": 0.06572773307561874, + "step": 498 + }, + { + "epoch": 1.21, + "grad_norm": 2.207309725586876, + "importance_ratio": 0.96484375, + "kl_div": -0.035914648324251175, + "kl_div_neg": -0.0898895412683487, + "kl_div_pos": 0.018060242757201195, + "learning_rate": 1.203915171288744e-06, + "loss": 0.0222, + "ppo_loss": -0.052096039056777954, + "step": 499 + }, + { + "epoch": 1.21, + "grad_norm": 0.9474518583040042, + "importance_ratio": 0.9609375, + "kl_div": -0.040214769542217255, + "kl_div_neg": -0.10180936008691788, + "kl_div_pos": 0.021379824727773666, + "learning_rate": 1.2022838499184339e-06, + "loss": 0.0347, + "ppo_loss": -0.059204161167144775, + "step": 500 + }, + { + "epoch": 1.21, + "grad_norm": 1.169849731055024, + "importance_ratio": 0.97265625, + "kl_div": -0.003386242315173149, + "kl_div_pos": -0.025821957737207413, + "kl_div_sft": 0.019049473106861115, + "learning_rate": 1.2006525285481239e-06, + "loss": -0.0243, + "ppo_loss": -0.9745085835456848, + "sft_loss": 0.07731864601373672, + "step": 501 + }, + { + "epoch": 1.22, + "grad_norm": 1.0761134413901443, + "kl_div": 0.007640148513019085, + "kl_div_sft": 0.007640148513019085, + "learning_rate": 1.199021207177814e-06, + "loss": -0.113, + "sft_loss": 0.04595714062452316, + "step": 502 + }, + { + "epoch": 1.22, + "grad_norm": 5.176839330233246, + "kl_div": 0.005883101373910904, + "kl_div_sft": 0.005883101373910904, + "learning_rate": 1.197389885807504e-06, + "loss": 0.0495, + "sft_loss": 0.1581135392189026, + "step": 503 + }, + { + "epoch": 1.22, + "grad_norm": 2.3081021116202844, + "importance_ratio": 1.0234375, + "kl_div": 0.018472742289304733, + "kl_div_pos": 0.02372855134308338, + "kl_div_sft": 0.013216935098171234, + "learning_rate": 1.1957585644371942e-06, + "loss": -0.0408, + "ppo_loss": -1.0240123271942139, + "sft_loss": 0.04031985253095627, + "step": 504 + }, + { + "epoch": 1.22, + "grad_norm": 1.3674385930139834, + "importance_ratio": 0.84765625, + "kl_div": -0.08100101351737976, + "kl_div_neg": -0.16434870660305023, + "kl_div_sft": 0.002346683293581009, + "learning_rate": 1.194127243066884e-06, + "loss": -0.1544, + "ppo_loss": 0.8484461903572083, + "sft_loss": 0.06900664418935776, + "step": 505 + }, + { + "epoch": 1.23, + "grad_norm": 1.0221263844603252, + "kl_div": 0.0132527407258749, + "kl_div_sft": 0.0132527407258749, + "learning_rate": 1.1924959216965742e-06, + "loss": 0.0204, + "sft_loss": 0.07407552003860474, + "step": 506 + }, + { + "epoch": 1.23, + "grad_norm": 1.1289567771084783, + "kl_div": 0.00977835152298212, + "kl_div_sft": 0.00977835152298212, + "learning_rate": 1.1908646003262642e-06, + "loss": 0.0007, + "sft_loss": 0.07620455324649811, + "step": 507 + }, + { + "epoch": 1.23, + "grad_norm": 3.1804808580565855, + "kl_div": 0.0007696398533880711, + "kl_div_sft": 0.0007696398533880711, + "learning_rate": 1.1892332789559542e-06, + "loss": -0.1673, + "sft_loss": 0.07429905235767365, + "step": 508 + }, + { + "epoch": 1.23, + "grad_norm": 2.4337740076700127, + "importance_ratio": 0.37109375, + "kl_div": -0.4959445297718048, + "kl_div_neg": -0.993360161781311, + "kl_div_sft": 0.0014711018884554505, + "learning_rate": 1.1876019575856444e-06, + "loss": -0.0313, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.10419715940952301, + "step": 509 + }, + { + "epoch": 1.24, + "grad_norm": 3.1832248216545675, + "importance_ratio": 1.0625, + "kl_div": 0.0376177541911602, + "kl_div_pos": 0.057756274938583374, + "kl_div_sft": 0.01747923344373703, + "learning_rate": 1.1859706362153344e-06, + "loss": -0.1664, + "ppo_loss": -1.0594568252563477, + "sft_loss": 0.027088165283203125, + "step": 510 + }, + { + "epoch": 1.24, + "grad_norm": 1.554492727376147, + "kl_div": 0.005406046286225319, + "kl_div_sft": 0.005406046286225319, + "learning_rate": 1.1843393148450246e-06, + "loss": 0.0855, + "sft_loss": 0.07011683285236359, + "step": 511 + }, + { + "epoch": 1.24, + "grad_norm": 1.903888177296763, + "importance_ratio": 0.6875, + "kl_div": -0.20197127759456635, + "kl_div_neg": -0.37399518489837646, + "kl_div_sft": -0.02994735911488533, + "learning_rate": 1.1827079934747143e-06, + "loss": 0.0269, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.11903167515993118, + "step": 512 + }, + { + "epoch": 1.24, + "grad_norm": 0.8941020352296798, + "importance_ratio": 0.7734375, + "kl_div": -0.2627606689929962, + "kl_div_neg": -0.3461109399795532, + "kl_div_pos": -0.17941038310527802, + "learning_rate": 1.1810766721044045e-06, + "loss": -0.0986, + "ppo_loss": -0.017881423234939575, + "step": 513 + }, + { + "epoch": 1.25, + "grad_norm": 1.725176362683201, + "kl_div": -0.05197884142398834, + "kl_div_sft": -0.05197884142398834, + "learning_rate": 1.1794453507340945e-06, + "loss": -0.0622, + "sft_loss": 0.10858382284641266, + "step": 514 + }, + { + "epoch": 1.25, + "grad_norm": 1.5883963179667402, + "importance_ratio": 1.015625, + "kl_div": 0.01706155203282833, + "kl_div_pos": 0.01706155203282833, + "learning_rate": 1.1778140293637847e-06, + "loss": -0.0126, + "ppo_loss": -1.017217755317688, + "step": 515 + }, + { + "epoch": 1.25, + "grad_norm": 1.0429578916291624, + "kl_div": 0.004653987940400839, + "kl_div_sft": 0.004653987940400839, + "learning_rate": 1.1761827079934747e-06, + "loss": -0.1474, + "sft_loss": 0.04697205498814583, + "step": 516 + }, + { + "epoch": 1.25, + "grad_norm": 1.8473786030530364, + "importance_ratio": 1.0390625, + "kl_div": 0.026094667613506317, + "kl_div_pos": 0.04203187674283981, + "kl_div_sft": 0.010157458484172821, + "learning_rate": 1.1745513866231647e-06, + "loss": -0.1446, + "ppo_loss": -1.0429277420043945, + "sft_loss": 0.10328664630651474, + "step": 517 + }, + { + "epoch": 1.26, + "grad_norm": 2.3976654897780887, + "kl_div": 0.020840991288423538, + "kl_div_sft": 0.020840991288423538, + "learning_rate": 1.1729200652528549e-06, + "loss": -0.0236, + "sft_loss": 0.04611089080572128, + "step": 518 + }, + { + "epoch": 1.26, + "grad_norm": 1.6587128950854497, + "importance_ratio": 0.8984375, + "kl_div": -0.11170358210802078, + "kl_div_neg": -0.21417810022830963, + "kl_div_pos": -0.009229060262441635, + "learning_rate": 1.1712887438825446e-06, + "loss": -0.0484, + "ppo_loss": -0.09180441498756409, + "step": 519 + }, + { + "epoch": 1.26, + "grad_norm": 1.3053593533221195, + "kl_div": 0.011289243586361408, + "kl_div_sft": 0.011289243586361408, + "learning_rate": 1.1696574225122348e-06, + "loss": -0.0328, + "sft_loss": 0.031042158603668213, + "step": 520 + }, + { + "epoch": 1.26, + "grad_norm": 0.942260860794481, + "importance_ratio": 0.99609375, + "kl_div": -0.002506896387785673, + "kl_div_pos": -0.004093681927770376, + "kl_div_sft": -0.0009201108478009701, + "learning_rate": 1.168026101141925e-06, + "loss": -0.1133, + "ppo_loss": -0.9959146976470947, + "sft_loss": 0.0964369997382164, + "step": 521 + }, + { + "epoch": 1.27, + "grad_norm": 2.0362589186443225, + "kl_div": -0.06798586249351501, + "kl_div_sft": -0.06798586249351501, + "learning_rate": 1.166394779771615e-06, + "loss": -0.0122, + "sft_loss": 0.13926270604133606, + "step": 522 + }, + { + "epoch": 1.27, + "grad_norm": 0.9343024720076063, + "importance_ratio": 0.7578125, + "kl_div": -0.2980799973011017, + "kl_div_neg": -0.2980799973011017, + "learning_rate": 1.164763458401305e-06, + "loss": 0.0436, + "ppo_loss": 0.8530257940292358, + "step": 523 + }, + { + "epoch": 1.27, + "grad_norm": 1.3128374210962497, + "importance_ratio": 0.99609375, + "kl_div": -0.013152940198779106, + "kl_div_pos": -0.0031146006658673286, + "kl_div_sft": -0.02319127880036831, + "learning_rate": 1.163132137030995e-06, + "loss": -0.0097, + "ppo_loss": -0.9968902468681335, + "sft_loss": 0.13142527639865875, + "step": 524 + }, + { + "epoch": 1.27, + "grad_norm": 2.119210378316291, + "importance_ratio": 0.796875, + "kl_div": -0.1095552071928978, + "kl_div_neg": -0.2284746617078781, + "kl_div_sft": 0.009364242665469646, + "learning_rate": 1.1615008156606852e-06, + "loss": -0.0474, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05101296305656433, + "step": 525 + }, + { + "epoch": 1.28, + "grad_norm": 3.4480022561935515, + "importance_ratio": 0.890625, + "kl_div": -0.050689272582530975, + "kl_div_neg": -0.11669891327619553, + "kl_div_sft": 0.015320368111133575, + "learning_rate": 1.1598694942903752e-06, + "loss": -0.1617, + "ppo_loss": 0.8898531198501587, + "sft_loss": 0.033386651426553726, + "step": 526 + }, + { + "epoch": 1.28, + "grad_norm": 1.7723430331609662, + "importance_ratio": 0.734375, + "kl_div": -0.1602250188589096, + "kl_div_neg": -0.3096018433570862, + "kl_div_sft": -0.010848197154700756, + "learning_rate": 1.1582381729200651e-06, + "loss": -0.1267, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0797138437628746, + "step": 527 + }, + { + "epoch": 1.28, + "grad_norm": 2.107571657530528, + "importance_ratio": 0.90234375, + "kl_div": -0.11691511422395706, + "kl_div_neg": -0.28963759541511536, + "kl_div_pos": 0.05580736696720123, + "learning_rate": 1.1566068515497553e-06, + "loss": -0.0529, + "ppo_loss": -0.128697007894516, + "step": 528 + }, + { + "epoch": 1.28, + "grad_norm": 2.385418576790879, + "kl_div": 0.013789523392915726, + "kl_div_sft": 0.013789523392915726, + "learning_rate": 1.1549755301794453e-06, + "loss": -0.0812, + "sft_loss": 0.05344651639461517, + "step": 529 + }, + { + "epoch": 1.28, + "grad_norm": 1.4828436732658241, + "importance_ratio": 0.85546875, + "kl_div": -0.16320474445819855, + "kl_div_neg": -0.16320474445819855, + "learning_rate": 1.1533442088091353e-06, + "loss": -0.1487, + "ppo_loss": 0.8766193389892578, + "step": 530 + }, + { + "epoch": 1.29, + "grad_norm": 1.5986096957800984, + "importance_ratio": 0.95703125, + "kl_div": -0.015350108034908772, + "kl_div_neg": -0.044570233672857285, + "kl_div_sft": 0.013870017603039742, + "learning_rate": 1.1517128874388253e-06, + "loss": -0.0947, + "ppo_loss": 0.9564084410667419, + "sft_loss": 0.0904233381152153, + "step": 531 + }, + { + "epoch": 1.29, + "grad_norm": 3.1736066629465336, + "importance_ratio": 0.8125, + "kl_div": -0.21371789276599884, + "kl_div_neg": -0.21371789276599884, + "learning_rate": 1.1500815660685155e-06, + "loss": -0.0531, + "ppo_loss": 0.848617672920227, + "step": 532 + }, + { + "epoch": 1.29, + "grad_norm": 1.4188297904679963, + "kl_div": 0.009632952511310577, + "kl_div_sft": 0.009632952511310577, + "learning_rate": 1.1484502446982055e-06, + "loss": 0.0202, + "sft_loss": 0.04948854818940163, + "step": 533 + }, + { + "epoch": 1.29, + "grad_norm": 1.2580826273802443, + "kl_div": -0.010189337655901909, + "kl_div_sft": -0.010189337655901909, + "learning_rate": 1.1468189233278955e-06, + "loss": -0.1035, + "sft_loss": 0.08644433319568634, + "step": 534 + }, + { + "epoch": 1.3, + "grad_norm": 1.0285720734367165, + "kl_div": 0.012180252932012081, + "kl_div_sft": 0.012180252932012081, + "learning_rate": 1.1451876019575857e-06, + "loss": -0.1224, + "sft_loss": 0.055709097534418106, + "step": 535 + }, + { + "epoch": 1.3, + "grad_norm": 2.3399933272046534, + "kl_div": 0.006703513208776712, + "kl_div_sft": 0.006703513208776712, + "learning_rate": 1.1435562805872756e-06, + "loss": 0.1456, + "sft_loss": 0.05426019802689552, + "step": 536 + }, + { + "epoch": 1.3, + "grad_norm": 1.0555467849222064, + "kl_div": -0.0029783041682094336, + "kl_div_sft": -0.0029783041682094336, + "learning_rate": 1.1419249592169658e-06, + "loss": -0.1068, + "sft_loss": 0.043515875935554504, + "step": 537 + }, + { + "epoch": 1.3, + "grad_norm": 2.871971027483153, + "importance_ratio": 0.828125, + "kl_div": -0.19169503450393677, + "kl_div_neg": -0.19169503450393677, + "learning_rate": 1.1402936378466556e-06, + "loss": 0.139, + "ppo_loss": 0.8521348237991333, + "step": 538 + }, + { + "epoch": 1.31, + "grad_norm": 1.4585484845534662, + "importance_ratio": 0.703125, + "kl_div": -0.1812596321105957, + "kl_div_neg": -0.35394641757011414, + "kl_div_sft": -0.00857284665107727, + "learning_rate": 1.1386623164763458e-06, + "loss": 0.0067, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06204671785235405, + "step": 539 + }, + { + "epoch": 1.31, + "grad_norm": 1.584166190628764, + "importance_ratio": 0.76953125, + "kl_div": -0.13018177449703217, + "kl_div_neg": -0.2637389004230499, + "kl_div_sft": 0.003375363303348422, + "learning_rate": 1.1370309951060358e-06, + "loss": -0.0823, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.034781016409397125, + "step": 540 + }, + { + "epoch": 1.31, + "grad_norm": 1.62456103044522, + "importance_ratio": 1.0078125, + "kl_div": 0.0058018844574689865, + "kl_div_pos": 0.0058018844574689865, + "learning_rate": 1.1353996737357258e-06, + "loss": -0.0601, + "ppo_loss": -1.0061110258102417, + "step": 541 + }, + { + "epoch": 1.31, + "grad_norm": 1.0309449524969996, + "kl_div": 0.016848277300596237, + "kl_div_sft": 0.016848277300596237, + "learning_rate": 1.133768352365416e-06, + "loss": 0.0897, + "sft_loss": 0.033322080969810486, + "step": 542 + }, + { + "epoch": 1.32, + "grad_norm": 1.6610577458145217, + "importance_ratio": 0.90234375, + "kl_div": -0.07893979549407959, + "kl_div_neg": -0.1038811206817627, + "kl_div_sft": -0.053998466581106186, + "learning_rate": 1.132137030995106e-06, + "loss": 0.0988, + "ppo_loss": 0.9013324975967407, + "sft_loss": 0.10321355611085892, + "step": 543 + }, + { + "epoch": 1.32, + "grad_norm": 1.3239489255141208, + "importance_ratio": 0.6328125, + "kl_div": -0.47106295824050903, + "kl_div_neg": -0.47106295824050903, + "learning_rate": 1.1305057096247961e-06, + "loss": -0.1342, + "ppo_loss": 0.800000011920929, + "step": 544 + }, + { + "epoch": 1.32, + "grad_norm": 6.1853282469454225, + "kl_div": 0.0043830047361552715, + "kl_div_sft": 0.0043830047361552715, + "learning_rate": 1.128874388254486e-06, + "loss": -0.1029, + "sft_loss": 0.04348590970039368, + "step": 545 + }, + { + "epoch": 1.32, + "grad_norm": 6.376584024540215, + "importance_ratio": 0.8984375, + "kl_div": -0.10580405592918396, + "kl_div_pos": -0.10580405592918396, + "learning_rate": 1.1272430668841761e-06, + "loss": -0.0738, + "ppo_loss": -0.8998837471008301, + "step": 546 + }, + { + "epoch": 1.33, + "grad_norm": 1.1272377963645992, + "importance_ratio": 0.7890625, + "kl_div": -0.12314502149820328, + "kl_div_neg": -0.23518849909305573, + "kl_div_sft": -0.011101537384092808, + "learning_rate": 1.1256117455138663e-06, + "loss": -0.0169, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.10723263770341873, + "step": 547 + }, + { + "epoch": 1.33, + "grad_norm": 2.1873014087449754, + "kl_div": -0.00913550890982151, + "kl_div_sft": -0.00913550890982151, + "learning_rate": 1.1239804241435563e-06, + "loss": 0.0708, + "sft_loss": 0.07666724920272827, + "step": 548 + }, + { + "epoch": 1.33, + "grad_norm": 7.292791428782978, + "importance_ratio": 1.015625, + "kl_div": 0.013908376917243004, + "kl_div_pos": 0.018195772543549538, + "kl_div_sft": 0.009620980359613895, + "learning_rate": 1.1223491027732463e-06, + "loss": -0.0659, + "ppo_loss": -1.018362283706665, + "sft_loss": 0.1339206099510193, + "step": 549 + }, + { + "epoch": 1.33, + "grad_norm": 1.4911592146041783, + "importance_ratio": 0.8984375, + "kl_div": -0.11247290670871735, + "kl_div_neg": -0.22601757943630219, + "kl_div_pos": 0.0010717726545408368, + "learning_rate": 1.1207177814029363e-06, + "loss": -0.0758, + "ppo_loss": -0.10053613781929016, + "step": 550 + }, + { + "epoch": 1.34, + "grad_norm": 1.1687821837701196, + "importance_ratio": 0.78515625, + "kl_div": -0.11754447966814041, + "kl_div_neg": -0.2395263910293579, + "kl_div_sft": 0.004437429364770651, + "learning_rate": 1.1190864600326265e-06, + "loss": -0.1035, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08405223488807678, + "step": 551 + }, + { + "epoch": 1.34, + "grad_norm": 1.7625336041865531, + "importance_ratio": 0.83984375, + "kl_div": -0.19569040834903717, + "kl_div_neg": -0.3977075517177582, + "kl_div_pos": 0.006326722446829081, + "learning_rate": 1.1174551386623162e-06, + "loss": 0.0337, + "ppo_loss": -0.1031734049320221, + "step": 552 + }, + { + "epoch": 1.34, + "grad_norm": 1.5047727788445464, + "importance_ratio": 0.8203125, + "kl_div": -0.20205961167812347, + "kl_div_neg": -0.20205961167812347, + "learning_rate": 1.1158238172920064e-06, + "loss": -0.0112, + "ppo_loss": 0.8295047283172607, + "step": 553 + }, + { + "epoch": 1.34, + "grad_norm": 1.394685880919484, + "importance_ratio": 0.78125, + "kl_div": -0.2900198996067047, + "kl_div_neg": -0.5972557663917542, + "kl_div_pos": 0.01721596159040928, + "learning_rate": 1.1141924959216966e-06, + "loss": 0.051, + "ppo_loss": -0.10868248343467712, + "step": 554 + }, + { + "epoch": 1.35, + "grad_norm": 1.3764932933301735, + "kl_div": 0.014602404087781906, + "kl_div_sft": 0.014602404087781906, + "learning_rate": 1.1125611745513866e-06, + "loss": -0.1426, + "sft_loss": 0.038136836141347885, + "step": 555 + }, + { + "epoch": 1.35, + "grad_norm": 3.7956450554179524, + "importance_ratio": 0.9453125, + "kl_div": -0.026797577738761902, + "kl_div_neg": -0.05635818466544151, + "kl_div_sft": 0.002763028722256422, + "learning_rate": 1.1109298531810766e-06, + "loss": -0.1243, + "ppo_loss": 0.9452005624771118, + "sft_loss": 0.06745104491710663, + "step": 556 + }, + { + "epoch": 1.35, + "grad_norm": 3.8833894051581703, + "importance_ratio": 0.890625, + "kl_div": -0.13215163350105286, + "kl_div_neg": -0.3047674000263214, + "kl_div_pos": 0.0404641255736351, + "learning_rate": 1.1092985318107666e-06, + "loss": -0.1974, + "ppo_loss": -0.12064698338508606, + "step": 557 + }, + { + "epoch": 1.35, + "grad_norm": 1.1478217933251562, + "importance_ratio": 0.9609375, + "kl_div": -0.06483356654644012, + "kl_div_pos": -0.04164363443851471, + "kl_div_sft": -0.08802350610494614, + "learning_rate": 1.1076672104404568e-06, + "loss": -0.1258, + "ppo_loss": -0.959211528301239, + "sft_loss": 0.1283067911863327, + "step": 558 + }, + { + "epoch": 1.36, + "grad_norm": 1.27512121933286, + "importance_ratio": 0.91796875, + "kl_div": -0.08908012509346008, + "kl_div_neg": -0.16968628764152527, + "kl_div_pos": -0.008473969995975494, + "learning_rate": 1.1060358890701468e-06, + "loss": -0.1369, + "ppo_loss": -0.07381615042686462, + "step": 559 + }, + { + "epoch": 1.36, + "grad_norm": 1.2365470318794451, + "kl_div": 0.008347313851118088, + "kl_div_sft": 0.008347313851118088, + "learning_rate": 1.1044045676998367e-06, + "loss": -0.0697, + "sft_loss": 0.047842901200056076, + "step": 560 + }, + { + "epoch": 1.36, + "grad_norm": 2.7825982758337306, + "kl_div": 0.018152426928281784, + "kl_div_sft": 0.018152426928281784, + "learning_rate": 1.102773246329527e-06, + "loss": -0.0772, + "sft_loss": 0.06452981382608414, + "step": 561 + }, + { + "epoch": 1.36, + "grad_norm": 1.2984059960024126, + "importance_ratio": 0.91015625, + "kl_div": -0.035715144127607346, + "kl_div_neg": -0.09511800855398178, + "kl_div_sft": 0.02368772216141224, + "learning_rate": 1.101141924959217e-06, + "loss": -0.1642, + "ppo_loss": 0.9092656970024109, + "sft_loss": 0.038571763783693314, + "step": 562 + }, + { + "epoch": 1.36, + "grad_norm": 1.388739604344503, + "importance_ratio": 0.796875, + "kl_div": -0.11054076999425888, + "kl_div_neg": -0.2250179499387741, + "kl_div_sft": 0.00393641646951437, + "learning_rate": 1.0995106035889071e-06, + "loss": -0.0283, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.051326606422662735, + "step": 563 + }, + { + "epoch": 1.37, + "grad_norm": 0.9820198465078197, + "importance_ratio": 0.4453125, + "kl_div": -0.4062121510505676, + "kl_div_neg": -0.8068867325782776, + "kl_div_sft": -0.005537545774132013, + "learning_rate": 1.0978792822185969e-06, + "loss": 0.0962, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040258102118968964, + "step": 564 + }, + { + "epoch": 1.37, + "grad_norm": 1.6842497147374014, + "importance_ratio": 0.99609375, + "kl_div": 0.003686623414978385, + "kl_div_pos": -0.00484278192743659, + "kl_div_sft": 0.01221602875739336, + "learning_rate": 1.096247960848287e-06, + "loss": -0.0823, + "ppo_loss": -0.9951689839363098, + "sft_loss": 0.09219343215227127, + "step": 565 + }, + { + "epoch": 1.37, + "grad_norm": 1.1904134647556424, + "importance_ratio": 0.65625, + "kl_div": -0.2421429604291916, + "kl_div_neg": -0.4229358732700348, + "kl_div_sft": -0.061350058764219284, + "learning_rate": 1.0946166394779773e-06, + "loss": -0.0565, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.12906409800052643, + "step": 566 + }, + { + "epoch": 1.37, + "grad_norm": 1.46299052693135, + "kl_div": 0.009440741501748562, + "kl_div_sft": 0.009440741501748562, + "learning_rate": 1.092985318107667e-06, + "loss": -0.1105, + "sft_loss": 0.0516950823366642, + "step": 567 + }, + { + "epoch": 1.38, + "grad_norm": 1.7776258307663522, + "kl_div": 0.009271626360714436, + "kl_div_sft": 0.009271626360714436, + "learning_rate": 1.0913539967373572e-06, + "loss": -0.0054, + "sft_loss": 0.12384568154811859, + "step": 568 + }, + { + "epoch": 1.38, + "grad_norm": 1.449626279221997, + "importance_ratio": 0.7109375, + "kl_div": -0.34696972370147705, + "kl_div_neg": -0.34696972370147705, + "learning_rate": 1.0897226753670472e-06, + "loss": -0.1492, + "ppo_loss": 0.800000011920929, + "step": 569 + }, + { + "epoch": 1.38, + "grad_norm": 1.699506783463177, + "kl_div": -0.0007659494876861572, + "kl_div_sft": -0.0007659494876861572, + "learning_rate": 1.0880913539967374e-06, + "loss": -0.0088, + "sft_loss": 0.060911569744348526, + "step": 570 + }, + { + "epoch": 1.38, + "grad_norm": 1.6960867946206275, + "importance_ratio": 0.87109375, + "kl_div": -0.08905941992998123, + "kl_div_neg": -0.13928547501564026, + "kl_div_sft": -0.0388333685696125, + "learning_rate": 1.0864600326264272e-06, + "loss": -0.0991, + "ppo_loss": 0.8699796795845032, + "sft_loss": 0.15874932706356049, + "step": 571 + }, + { + "epoch": 1.39, + "grad_norm": 1.1456683762616864, + "kl_div": 0.004077468998730183, + "kl_div_sft": 0.004077468998730183, + "learning_rate": 1.0848287112561174e-06, + "loss": -0.0106, + "sft_loss": 0.07098381221294403, + "step": 572 + }, + { + "epoch": 1.39, + "grad_norm": 1.715795300756801, + "importance_ratio": 0.99609375, + "kl_div": 0.0015950084198266268, + "kl_div_pos": -0.004990215878933668, + "kl_div_sft": 0.008180232718586922, + "learning_rate": 1.0831973898858076e-06, + "loss": -0.0068, + "ppo_loss": -0.995022177696228, + "sft_loss": 0.09894995391368866, + "step": 573 + }, + { + "epoch": 1.39, + "grad_norm": 2.7129768374361563, + "importance_ratio": 0.9375, + "kl_div": -0.0715622529387474, + "kl_div_neg": -0.18958507478237152, + "kl_div_pos": 0.04646056890487671, + "learning_rate": 1.0815660685154976e-06, + "loss": -0.1, + "ppo_loss": -0.11012718081474304, + "step": 574 + }, + { + "epoch": 1.39, + "grad_norm": 1.9102451171173565, + "importance_ratio": 0.7578125, + "kl_div": -0.2801617681980133, + "kl_div_neg": -0.2801617681980133, + "learning_rate": 1.0799347471451876e-06, + "loss": 0.1725, + "ppo_loss": 0.8094266057014465, + "step": 575 + }, + { + "epoch": 1.4, + "grad_norm": 1.21976238164542, + "importance_ratio": 0.70703125, + "kl_div": -0.16747649013996124, + "kl_div_neg": -0.3467353284358978, + "kl_div_sft": 0.011782352812588215, + "learning_rate": 1.0783034257748775e-06, + "loss": 0.0558, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0150612723082304, + "step": 576 + }, + { + "epoch": 1.4, + "grad_norm": 1.8178579128626697, + "importance_ratio": 0.78515625, + "kl_div": -0.12120449542999268, + "kl_div_neg": -0.24348323047161102, + "kl_div_sft": 0.001074238563887775, + "learning_rate": 1.0766721044045677e-06, + "loss": -0.0141, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03843202069401741, + "step": 577 + }, + { + "epoch": 1.4, + "grad_norm": 1.441867000059928, + "importance_ratio": 0.96484375, + "kl_div": -0.02505611814558506, + "kl_div_neg": -0.03736988827586174, + "kl_div_sft": -0.012742347083985806, + "learning_rate": 1.0750407830342575e-06, + "loss": 0.0226, + "ppo_loss": 0.9633197784423828, + "sft_loss": 0.06988085806369781, + "step": 578 + }, + { + "epoch": 1.4, + "grad_norm": 2.2161701946660313, + "importance_ratio": 1.0625, + "kl_div": 0.04236027970910072, + "kl_div_pos": 0.061362944543361664, + "kl_div_sft": 0.023357614874839783, + "learning_rate": 1.0734094616639477e-06, + "loss": 0.0401, + "ppo_loss": -1.0632847547531128, + "sft_loss": 0.022046925500035286, + "step": 579 + }, + { + "epoch": 1.41, + "grad_norm": 3.3108519286996834, + "importance_ratio": 0.54296875, + "kl_div": -0.6279046535491943, + "kl_div_neg": -0.6279046535491943, + "learning_rate": 1.071778140293638e-06, + "loss": -0.015, + "ppo_loss": 0.800000011920929, + "step": 580 + }, + { + "epoch": 1.41, + "grad_norm": 1.4772493894107401, + "importance_ratio": 0.90234375, + "kl_div": -0.054423633962869644, + "kl_div_neg": -0.10371052473783493, + "kl_div_sft": -0.0051367441192269325, + "learning_rate": 1.0701468189233279e-06, + "loss": -0.0479, + "ppo_loss": 0.9014862179756165, + "sft_loss": 0.06811974197626114, + "step": 581 + }, + { + "epoch": 1.41, + "grad_norm": 1.4673635553878324, + "kl_div": -0.022171318531036377, + "kl_div_sft": -0.022171318531036377, + "learning_rate": 1.0685154975530179e-06, + "loss": -0.0467, + "sft_loss": 0.18078254163265228, + "step": 582 + }, + { + "epoch": 1.41, + "grad_norm": 1.4955656947402662, + "importance_ratio": 0.9453125, + "kl_div": -0.013638187199831009, + "kl_div_pos": -0.0558946430683136, + "kl_div_sft": 0.02861826866865158, + "learning_rate": 1.0668841761827079e-06, + "loss": -0.134, + "ppo_loss": -0.9456387162208557, + "sft_loss": 0.013508289121091366, + "step": 583 + }, + { + "epoch": 1.42, + "grad_norm": 1.9301770782273342, + "importance_ratio": 0.9453125, + "kl_div": -0.04264499992132187, + "kl_div_pos": -0.056435685604810715, + "kl_div_sft": -0.028854310512542725, + "learning_rate": 1.065252854812398e-06, + "loss": -0.1468, + "ppo_loss": -0.9451273083686829, + "sft_loss": 0.1179473027586937, + "step": 584 + }, + { + "epoch": 1.42, + "grad_norm": 1.2775761104239642, + "kl_div": 0.006246216129511595, + "kl_div_sft": 0.006246216129511595, + "learning_rate": 1.063621533442088e-06, + "loss": -0.0571, + "sft_loss": 0.05256784334778786, + "step": 585 + }, + { + "epoch": 1.42, + "grad_norm": 2.3724816673543283, + "importance_ratio": 0.99609375, + "kl_div": 0.002640511840581894, + "kl_div_pos": -0.004412751644849777, + "kl_div_sft": 0.009693775326013565, + "learning_rate": 1.061990212071778e-06, + "loss": -0.0384, + "ppo_loss": -0.9955970048904419, + "sft_loss": 0.051869332790374756, + "step": 586 + }, + { + "epoch": 1.42, + "grad_norm": 1.7260462476738236, + "importance_ratio": 1.03125, + "kl_div": 0.0022712545469403267, + "kl_div_pos": 0.028654221445322037, + "kl_div_sft": -0.024111712351441383, + "learning_rate": 1.0603588907014682e-06, + "loss": -0.1109, + "ppo_loss": -1.0290687084197998, + "sft_loss": 0.08297090977430344, + "step": 587 + }, + { + "epoch": 1.43, + "grad_norm": 1.1074164388748575, + "importance_ratio": 0.9453125, + "kl_div": -0.015047087334096432, + "kl_div_neg": -0.055990349501371384, + "kl_div_sft": 0.02589617483317852, + "learning_rate": 1.0587275693311582e-06, + "loss": -0.0687, + "ppo_loss": 0.9455482363700867, + "sft_loss": 0.11951703578233719, + "step": 588 + }, + { + "epoch": 1.43, + "grad_norm": 1.088531141215031, + "importance_ratio": 1.0234375, + "kl_div": 0.026111368089914322, + "kl_div_pos": 0.026542862877249718, + "kl_div_sft": 0.025679873302578926, + "learning_rate": 1.0570962479608482e-06, + "loss": -0.0353, + "ppo_loss": -1.0268982648849487, + "sft_loss": 0.09185680747032166, + "step": 589 + }, + { + "epoch": 1.43, + "grad_norm": 2.5670330611783356, + "importance_ratio": 1.0546875, + "kl_div": 0.03792417794466019, + "kl_div_pos": 0.05555087327957153, + "kl_div_sft": 0.02029748447239399, + "learning_rate": 1.0554649265905382e-06, + "loss": -0.0404, + "ppo_loss": -1.0571227073669434, + "sft_loss": 0.07268325984477997, + "step": 590 + }, + { + "epoch": 1.43, + "grad_norm": 1.467813396100296, + "kl_div": 0.001873633824288845, + "kl_div_sft": 0.001873633824288845, + "learning_rate": 1.0538336052202284e-06, + "loss": -0.103, + "sft_loss": 0.08411164581775665, + "step": 591 + }, + { + "epoch": 1.44, + "grad_norm": 2.3531129104688713, + "kl_div": -0.0008771107532083988, + "kl_div_sft": -0.0008771107532083988, + "learning_rate": 1.0522022838499186e-06, + "loss": -0.0855, + "sft_loss": 0.053271256387233734, + "step": 592 + }, + { + "epoch": 1.44, + "grad_norm": 1.677308566999723, + "importance_ratio": 0.82421875, + "kl_div": -0.10028732568025589, + "kl_div_pos": -0.19314102828502655, + "kl_div_sft": -0.007433618418872356, + "learning_rate": 1.0505709624796083e-06, + "loss": -0.071, + "ppo_loss": -0.8243657350540161, + "sft_loss": 0.08927972614765167, + "step": 593 + }, + { + "epoch": 1.44, + "grad_norm": 1.5657319784258674, + "kl_div": 0.009805003181099892, + "kl_div_sft": 0.009805003181099892, + "learning_rate": 1.0489396411092985e-06, + "loss": -0.0231, + "sft_loss": 0.052252329885959625, + "step": 594 + }, + { + "epoch": 1.44, + "grad_norm": 1.1571238540589344, + "importance_ratio": 1.03125, + "kl_div": 0.02416916750371456, + "kl_div_pos": 0.03090663067996502, + "kl_div_sft": 0.017431704327464104, + "learning_rate": 1.0473083197389885e-06, + "loss": -0.1397, + "ppo_loss": -1.0313892364501953, + "sft_loss": 0.0322992280125618, + "step": 595 + }, + { + "epoch": 1.44, + "grad_norm": 1.193779953985234, + "kl_div": 0.010021679103374481, + "kl_div_sft": 0.010021679103374481, + "learning_rate": 1.0456769983686787e-06, + "loss": -0.0705, + "sft_loss": 0.05322499945759773, + "step": 596 + }, + { + "epoch": 1.45, + "grad_norm": 1.3361645778833022, + "kl_div": -0.0021144638303667307, + "kl_div_sft": -0.0021144638303667307, + "learning_rate": 1.0440456769983685e-06, + "loss": -0.1023, + "sft_loss": 0.13762226700782776, + "step": 597 + }, + { + "epoch": 1.45, + "grad_norm": 1.4972359303775002, + "kl_div": 0.005427949130535126, + "kl_div_sft": 0.005427949130535126, + "learning_rate": 1.0424143556280587e-06, + "loss": -0.2345, + "sft_loss": 0.05453133210539818, + "step": 598 + }, + { + "epoch": 1.45, + "grad_norm": 1.742612231018511, + "kl_div": -0.0006676320917904377, + "kl_div_sft": -0.0006676320917904377, + "learning_rate": 1.0407830342577489e-06, + "loss": -0.1976, + "sft_loss": 0.06037126109004021, + "step": 599 + }, + { + "epoch": 1.45, + "grad_norm": 1.1586556025204708, + "importance_ratio": 0.73828125, + "kl_div": -0.14715160429477692, + "kl_div_neg": -0.30401289463043213, + "kl_div_sft": 0.009709681384265423, + "learning_rate": 1.0391517128874386e-06, + "loss": -0.0554, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029035640880465508, + "step": 600 + }, + { + "epoch": 1.46, + "grad_norm": 1.6506692684987094, + "importance_ratio": 1.0078125, + "kl_div": 0.010380488820374012, + "kl_div_pos": 0.006752700544893742, + "kl_div_sft": 0.014008277095854282, + "learning_rate": 1.0375203915171288e-06, + "loss": -0.0073, + "ppo_loss": -1.0067756175994873, + "sft_loss": 0.05170813202857971, + "step": 601 + }, + { + "epoch": 1.46, + "grad_norm": 1.1282360370971418, + "kl_div": 0.01832808554172516, + "kl_div_sft": 0.01832808554172516, + "learning_rate": 1.0358890701468188e-06, + "loss": -0.0271, + "sft_loss": 0.03242221102118492, + "step": 602 + }, + { + "epoch": 1.46, + "grad_norm": 1.5571367150556785, + "kl_div": -0.0014522508718073368, + "kl_div_sft": -0.0014522508718073368, + "learning_rate": 1.034257748776509e-06, + "loss": -0.0319, + "sft_loss": 0.09149647504091263, + "step": 603 + }, + { + "epoch": 1.46, + "grad_norm": 1.5800280286017057, + "importance_ratio": 0.875, + "kl_div": -0.15403921902179718, + "kl_div_neg": -0.3375498056411743, + "kl_div_pos": 0.029471376910805702, + "learning_rate": 1.0326264274061988e-06, + "loss": 0.039, + "ppo_loss": -0.11495497822761536, + "step": 604 + }, + { + "epoch": 1.47, + "grad_norm": 1.7090297148003473, + "importance_ratio": 0.90625, + "kl_div": -0.038361359387636185, + "kl_div_neg": -0.0970546156167984, + "kl_div_sft": 0.020331894978880882, + "learning_rate": 1.030995106035889e-06, + "loss": -0.01, + "ppo_loss": 0.9075064659118652, + "sft_loss": 0.0135499881580472, + "step": 605 + }, + { + "epoch": 1.47, + "grad_norm": 1.528653181102798, + "importance_ratio": 0.6171875, + "kl_div": -0.2556186616420746, + "kl_div_neg": -0.482948362827301, + "kl_div_sft": -0.028288988396525383, + "learning_rate": 1.0293637846655792e-06, + "loss": -0.0538, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08954707533121109, + "step": 606 + }, + { + "epoch": 1.47, + "grad_norm": 0.9950230846890066, + "kl_div": 0.007246728520840406, + "kl_div_sft": 0.007246728520840406, + "learning_rate": 1.0277324632952692e-06, + "loss": -0.0167, + "sft_loss": 0.06424114853143692, + "step": 607 + }, + { + "epoch": 1.47, + "grad_norm": 1.3094895670075744, + "importance_ratio": 0.75, + "kl_div": -0.13014312088489532, + "kl_div_neg": -0.2881861627101898, + "kl_div_sft": 0.027899926528334618, + "learning_rate": 1.0261011419249592e-06, + "loss": 0.1371, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03597124293446541, + "step": 608 + }, + { + "epoch": 1.48, + "grad_norm": 1.157121072459821, + "importance_ratio": 0.8203125, + "kl_div": -0.10809502750635147, + "kl_div_pos": -0.19788144528865814, + "kl_div_sft": -0.018308604136109352, + "learning_rate": 1.0244698205546491e-06, + "loss": -0.0255, + "ppo_loss": -0.8204671144485474, + "sft_loss": 0.09640325605869293, + "step": 609 + }, + { + "epoch": 1.48, + "grad_norm": 1.634697587979911, + "importance_ratio": 0.78125, + "kl_div": -0.11589974164962769, + "kl_div_neg": -0.24510979652404785, + "kl_div_sft": 0.013310307636857033, + "learning_rate": 1.0228384991843393e-06, + "loss": -0.1429, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.02353717014193535, + "step": 610 + }, + { + "epoch": 1.48, + "grad_norm": 1.1102423702287916, + "importance_ratio": 1.0390625, + "kl_div": 0.017195984721183777, + "kl_div_pos": 0.0396147146821022, + "kl_div_sft": -0.005222745705395937, + "learning_rate": 1.021207177814029e-06, + "loss": -0.0158, + "ppo_loss": -1.0404099225997925, + "sft_loss": 0.09494847804307938, + "step": 611 + }, + { + "epoch": 1.48, + "grad_norm": 0.9871361774508806, + "importance_ratio": 0.9609375, + "kl_div": -0.042730703949928284, + "kl_div_neg": -0.11213622242212296, + "kl_div_pos": 0.026674814522266388, + "learning_rate": 1.0195758564437193e-06, + "loss": 0.0579, + "ppo_loss": -0.06655561923980713, + "step": 612 + }, + { + "epoch": 1.49, + "grad_norm": 1.6805677369712249, + "kl_div": 0.012642841786146164, + "kl_div_sft": 0.012642841786146164, + "learning_rate": 1.0179445350734095e-06, + "loss": 0.0846, + "sft_loss": 0.025455590337514877, + "step": 613 + }, + { + "epoch": 1.49, + "grad_norm": 1.4515161940104593, + "importance_ratio": 0.65625, + "kl_div": -0.4327019453048706, + "kl_div_neg": -0.4327019453048706, + "learning_rate": 1.0163132137030995e-06, + "loss": -0.0264, + "ppo_loss": 0.800000011920929, + "step": 614 + }, + { + "epoch": 1.49, + "grad_norm": 1.2200070769464322, + "importance_ratio": 0.76953125, + "kl_div": -0.13624180853366852, + "kl_div_neg": -0.2606610357761383, + "kl_div_sft": -0.011822582222521305, + "learning_rate": 1.0146818923327895e-06, + "loss": -0.0443, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08891305327415466, + "step": 615 + }, + { + "epoch": 1.49, + "grad_norm": 1.3707699298206422, + "importance_ratio": 1.015625, + "kl_div": -0.013705750927329063, + "kl_div_pos": 0.013345234096050262, + "kl_div_sft": -0.04075673595070839, + "learning_rate": 1.0130505709624794e-06, + "loss": -0.0962, + "ppo_loss": -1.013434648513794, + "sft_loss": 0.1080576702952385, + "step": 616 + }, + { + "epoch": 1.5, + "grad_norm": 2.161681921245051, + "kl_div": 0.007364877033978701, + "kl_div_sft": 0.007364877033978701, + "learning_rate": 1.0114192495921696e-06, + "loss": -0.236, + "sft_loss": 0.04366494342684746, + "step": 617 + }, + { + "epoch": 1.5, + "grad_norm": 1.6888717444843546, + "importance_ratio": 0.63671875, + "kl_div": -0.23458042740821838, + "kl_div_neg": -0.4501579999923706, + "kl_div_sft": -0.019002839922904968, + "learning_rate": 1.0097879282218598e-06, + "loss": -0.0109, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.11033762991428375, + "step": 618 + }, + { + "epoch": 1.5, + "grad_norm": 1.202515782274504, + "importance_ratio": 1.0078125, + "kl_div": 0.015978161245584488, + "kl_div_pos": 0.010856712237000465, + "kl_div_sft": 0.02109961025416851, + "learning_rate": 1.0081566068515496e-06, + "loss": -0.0209, + "ppo_loss": -1.0109158754348755, + "sft_loss": 0.051598433405160904, + "step": 619 + }, + { + "epoch": 1.5, + "grad_norm": 2.29203733278664, + "importance_ratio": 0.81640625, + "kl_div": -0.14957204461097717, + "kl_div_neg": -0.20417694747447968, + "kl_div_sft": -0.09496712684631348, + "learning_rate": 1.0065252854812398e-06, + "loss": 0.0295, + "ppo_loss": 0.8153181076049805, + "sft_loss": 0.14580783247947693, + "step": 620 + }, + { + "epoch": 1.51, + "grad_norm": 1.6904680500051987, + "kl_div": -0.009111708030104637, + "kl_div_sft": -0.009111708030104637, + "learning_rate": 1.0048939641109298e-06, + "loss": -0.0565, + "sft_loss": 0.07915902137756348, + "step": 621 + }, + { + "epoch": 1.51, + "grad_norm": 3.122635824457841, + "importance_ratio": 0.8515625, + "kl_div": -0.1739480197429657, + "kl_div_neg": -0.35731154680252075, + "kl_div_pos": 0.00941550638526678, + "learning_rate": 1.00326264274062e-06, + "loss": -0.0549, + "ppo_loss": -0.10472998023033142, + "step": 622 + }, + { + "epoch": 1.51, + "grad_norm": 1.436956717617787, + "kl_div": 0.005096603184938431, + "kl_div_sft": 0.005096603184938431, + "learning_rate": 1.0016313213703098e-06, + "loss": -0.0869, + "sft_loss": 0.09290395677089691, + "step": 623 + }, + { + "epoch": 1.51, + "grad_norm": 0.8033987669657294, + "importance_ratio": 1.046875, + "kl_div": 0.024920709431171417, + "kl_div_pos": 0.04217958450317383, + "kl_div_sft": 0.007661834824830294, + "learning_rate": 1e-06, + "loss": 0.0153, + "ppo_loss": -1.0430817604064941, + "sft_loss": 0.040020450949668884, + "step": 624 + }, + { + "epoch": 1.52, + "grad_norm": 1.0643403999766239, + "kl_div": 0.00011259526945650578, + "kl_div_sft": 0.00011259526945650578, + "learning_rate": 9.9836867862969e-07, + "loss": 0.0686, + "sft_loss": 0.0579143762588501, + "step": 625 + }, + { + "epoch": 1.52, + "grad_norm": 3.7411073149069636, + "importance_ratio": 0.96484375, + "kl_div": -0.013933196663856506, + "kl_div_pos": -0.03640758991241455, + "kl_div_sft": 0.008541197516024113, + "learning_rate": 9.9673735725938e-07, + "loss": -0.0891, + "ppo_loss": -0.9642472267150879, + "sft_loss": 0.02761666662991047, + "step": 626 + }, + { + "epoch": 1.52, + "grad_norm": 6.6967108211071205, + "importance_ratio": 0.9921875, + "kl_div": -0.006364609580487013, + "kl_div_pos": -0.006364609580487013, + "learning_rate": 9.951060358890701e-07, + "loss": 0.0462, + "ppo_loss": -0.9936584234237671, + "step": 627 + }, + { + "epoch": 1.52, + "grad_norm": 1.2668617234289494, + "importance_ratio": 0.8828125, + "kl_div": -0.1370735913515091, + "kl_div_neg": -0.2952595353126526, + "kl_div_pos": 0.021112343296408653, + "learning_rate": 9.9347471451876e-07, + "loss": 0.0278, + "ppo_loss": -0.11066839098930359, + "step": 628 + }, + { + "epoch": 1.52, + "grad_norm": 1.7302028890941692, + "importance_ratio": 1.0, + "kl_div": -0.012814854271709919, + "kl_div_pos": 0.00020324558136053383, + "kl_div_sft": -0.025832954794168472, + "learning_rate": 9.918433931484503e-07, + "loss": -0.0514, + "ppo_loss": -1.000203251838684, + "sft_loss": 0.10578396916389465, + "step": 629 + }, + { + "epoch": 1.53, + "grad_norm": 0.9105489972194977, + "importance_ratio": 0.71484375, + "kl_div": -0.16649454832077026, + "kl_div_neg": -0.3332861065864563, + "kl_div_sft": 0.00029701562016271055, + "learning_rate": 9.902120717781403e-07, + "loss": -0.1376, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08367674052715302, + "step": 630 + }, + { + "epoch": 1.53, + "grad_norm": 1.3256530196175094, + "importance_ratio": 1.03125, + "kl_div": 0.027810772880911827, + "kl_div_pos": 0.027810772880911827, + "learning_rate": 9.885807504078303e-07, + "loss": -0.0227, + "ppo_loss": -1.0282015800476074, + "step": 631 + }, + { + "epoch": 1.53, + "grad_norm": 2.5581641382522244, + "importance_ratio": 0.8125, + "kl_div": -0.2111785113811493, + "kl_div_neg": -0.2111785113811493, + "learning_rate": 9.869494290375203e-07, + "loss": 0.1847, + "ppo_loss": 0.8429823517799377, + "step": 632 + }, + { + "epoch": 1.53, + "grad_norm": 1.4855555608085709, + "importance_ratio": 1.015625, + "kl_div": 0.014578155241906643, + "kl_div_pos": 0.012926433235406876, + "kl_div_sft": 0.01622987724840641, + "learning_rate": 9.853181076672104e-07, + "loss": -0.1038, + "ppo_loss": -1.0130102634429932, + "sft_loss": 0.03075617365539074, + "step": 633 + }, + { + "epoch": 1.54, + "grad_norm": 0.9026682216648737, + "importance_ratio": 0.76953125, + "kl_div": -0.1260625720024109, + "kl_div_neg": -0.26258131861686707, + "kl_div_sft": 0.010456175543367863, + "learning_rate": 9.836867862969004e-07, + "loss": 0.0844, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04138074815273285, + "step": 634 + }, + { + "epoch": 1.54, + "grad_norm": 2.996379575064102, + "importance_ratio": 1.0546875, + "kl_div": 0.025566959753632545, + "kl_div_pos": 0.056610144674777985, + "kl_div_sft": -0.005476226564496756, + "learning_rate": 9.820554649265906e-07, + "loss": 0.1201, + "ppo_loss": -1.0582431554794312, + "sft_loss": 0.09242013841867447, + "step": 635 + }, + { + "epoch": 1.54, + "grad_norm": 1.4259640929154063, + "importance_ratio": 0.80859375, + "kl_div": -0.09104778617620468, + "kl_div_neg": -0.21008449792861938, + "kl_div_sft": 0.027988923713564873, + "learning_rate": 9.804241435562806e-07, + "loss": -0.0378, + "ppo_loss": 0.810515820980072, + "sft_loss": 0.023999352008104324, + "step": 636 + }, + { + "epoch": 1.54, + "grad_norm": 1.497605717641765, + "importance_ratio": 0.80859375, + "kl_div": -0.11556511372327805, + "kl_div_neg": -0.21007847785949707, + "kl_div_sft": -0.021051747724413872, + "learning_rate": 9.787928221859706e-07, + "loss": -0.1009, + "ppo_loss": 0.8105207085609436, + "sft_loss": 0.0773676410317421, + "step": 637 + }, + { + "epoch": 1.55, + "grad_norm": 1.6909906524273515, + "importance_ratio": 1.0, + "kl_div": 0.0005287877283990383, + "kl_div_neg": 0.0076847923919558525, + "kl_div_pos": -0.006627216935157776, + "learning_rate": 9.771615008156606e-07, + "loss": -0.1343, + "ppo_loss": 0.00715985894203186, + "step": 638 + }, + { + "epoch": 1.55, + "grad_norm": 1.33005769016516, + "importance_ratio": 1.015625, + "kl_div": 0.019131600856781006, + "kl_div_pos": 0.019131600856781006, + "learning_rate": 9.755301794453506e-07, + "loss": -0.0472, + "ppo_loss": -1.0193370580673218, + "step": 639 + }, + { + "epoch": 1.55, + "grad_norm": 2.4557192745933176, + "importance_ratio": 0.71484375, + "kl_div": -0.16165408492088318, + "kl_div_neg": -0.3333815932273865, + "kl_div_sft": 0.010073418729007244, + "learning_rate": 9.738988580750408e-07, + "loss": -0.0718, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04032623767852783, + "step": 640 + }, + { + "epoch": 1.55, + "grad_norm": 5.33474776267878, + "importance_ratio": 0.7421875, + "kl_div": -0.1424480825662613, + "kl_div_neg": -0.300345242023468, + "kl_div_sft": 0.015449062921106815, + "learning_rate": 9.722675367047307e-07, + "loss": 0.0681, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.053471680730581284, + "step": 641 + }, + { + "epoch": 1.56, + "grad_norm": 5.099826488161199, + "importance_ratio": 0.81640625, + "kl_div": -0.22648264467716217, + "kl_div_neg": -0.45588648319244385, + "kl_div_pos": 0.0029212054796516895, + "learning_rate": 9.70636215334421e-07, + "loss": -0.0518, + "ppo_loss": -0.10146275162696838, + "step": 642 + }, + { + "epoch": 1.56, + "grad_norm": 3.907806330587893, + "importance_ratio": 0.625, + "kl_div": -0.2288985550403595, + "kl_div_neg": -0.46892765164375305, + "kl_div_sft": 0.01113053783774376, + "learning_rate": 9.69004893964111e-07, + "loss": 0.0213, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.023388583213090897, + "step": 643 + }, + { + "epoch": 1.56, + "grad_norm": 1.3800678493571776, + "kl_div": 0.013283468782901764, + "kl_div_sft": 0.013283468782901764, + "learning_rate": 9.67373572593801e-07, + "loss": -0.1028, + "sft_loss": 0.058054424822330475, + "step": 644 + }, + { + "epoch": 1.56, + "grad_norm": 1.2509233878847967, + "importance_ratio": 0.59765625, + "kl_div": -0.24729475378990173, + "kl_div_neg": -0.5174387693405151, + "kl_div_sft": 0.02284926176071167, + "learning_rate": 9.657422512234909e-07, + "loss": -0.0466, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.026714660227298737, + "step": 645 + }, + { + "epoch": 1.57, + "grad_norm": 5.377815928380768, + "kl_div": -0.08838597685098648, + "kl_div_sft": -0.08838597685098648, + "learning_rate": 9.64110929853181e-07, + "loss": -0.0711, + "sft_loss": 0.14923830330371857, + "step": 646 + }, + { + "epoch": 1.57, + "grad_norm": 1.5657150760405996, + "importance_ratio": 0.87890625, + "kl_div": -0.05644303932785988, + "kl_div_neg": -0.12692682445049286, + "kl_div_sft": 0.014040743932127953, + "learning_rate": 9.62479608482871e-07, + "loss": -0.1483, + "ppo_loss": 0.8807981014251709, + "sft_loss": 0.020476870238780975, + "step": 647 + }, + { + "epoch": 1.57, + "grad_norm": 1.4235593440043515, + "importance_ratio": 0.53515625, + "kl_div": -0.3175111711025238, + "kl_div_neg": -0.6287727952003479, + "kl_div_sft": -0.006249555852264166, + "learning_rate": 9.608482871125613e-07, + "loss": -0.1587, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.054232582449913025, + "step": 648 + }, + { + "epoch": 1.57, + "grad_norm": 1.3064368848946577, + "kl_div": -0.030326243489980698, + "kl_div_sft": -0.030326243489980698, + "learning_rate": 9.592169657422513e-07, + "loss": -0.0821, + "sft_loss": 0.07004794478416443, + "step": 649 + }, + { + "epoch": 1.58, + "grad_norm": 5.415139711784409, + "importance_ratio": 1.0078125, + "kl_div": 0.01519959606230259, + "kl_div_pos": 0.006254679523408413, + "kl_div_sft": 0.02414451353251934, + "learning_rate": 9.575856443719412e-07, + "loss": 0.0018, + "ppo_loss": -1.0062743425369263, + "sft_loss": 0.031017892062664032, + "step": 650 + }, + { + "epoch": 1.58, + "grad_norm": 1.4936301562384438, + "importance_ratio": 0.83203125, + "kl_div": -0.08170606940984726, + "kl_div_neg": -0.18414528667926788, + "kl_div_sft": 0.020733144134283066, + "learning_rate": 9.559543230016312e-07, + "loss": -0.1336, + "ppo_loss": 0.8318149447441101, + "sft_loss": 0.20881670713424683, + "step": 651 + }, + { + "epoch": 1.58, + "grad_norm": 1.340476239894152, + "importance_ratio": 1.0078125, + "kl_div": 0.007561908569186926, + "kl_div_neg": 0.001748603186570108, + "kl_div_pos": 0.01337521430104971, + "learning_rate": 9.543230016313212e-07, + "loss": 0.0364, + "ppo_loss": -0.0058574676513671875, + "step": 652 + }, + { + "epoch": 1.58, + "grad_norm": 7.975741800948087, + "importance_ratio": 0.4140625, + "kl_div": -0.44976139068603516, + "kl_div_neg": -0.8810076117515564, + "kl_div_sft": -0.018515175208449364, + "learning_rate": 9.526916802610113e-07, + "loss": -0.0943, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08849439769983292, + "step": 653 + }, + { + "epoch": 1.59, + "grad_norm": 2.3740506533495003, + "kl_div": 0.011756137013435364, + "kl_div_sft": 0.011756137013435364, + "learning_rate": 9.510603588907015e-07, + "loss": -0.0646, + "sft_loss": 0.04911510646343231, + "step": 654 + }, + { + "epoch": 1.59, + "grad_norm": 2.0039720431526855, + "kl_div": 0.009900109842419624, + "kl_div_sft": 0.009900109842419624, + "learning_rate": 9.494290375203915e-07, + "loss": 0.0081, + "sft_loss": 0.07109043002128601, + "step": 655 + }, + { + "epoch": 1.59, + "grad_norm": 1.9808406443040367, + "kl_div": -0.0021666795946657658, + "kl_div_sft": -0.0021666795946657658, + "learning_rate": 9.477977161500816e-07, + "loss": -0.0193, + "sft_loss": 0.052859771996736526, + "step": 656 + }, + { + "epoch": 1.59, + "grad_norm": 3.5823483259167084, + "importance_ratio": 0.8359375, + "kl_div": -0.18824321031570435, + "kl_div_neg": -0.31431934237480164, + "kl_div_pos": -0.062167081981897354, + "learning_rate": 9.461663947797715e-07, + "loss": 0.0357, + "ppo_loss": -0.06986293196678162, + "step": 657 + }, + { + "epoch": 1.6, + "grad_norm": 2.1106985955119675, + "importance_ratio": 0.98828125, + "kl_div": -0.02020912803709507, + "kl_div_pos": -0.012419541366398335, + "kl_div_sft": -0.02799871563911438, + "learning_rate": 9.445350734094616e-07, + "loss": -0.0225, + "ppo_loss": -0.9876572489738464, + "sft_loss": 0.17996813356876373, + "step": 658 + }, + { + "epoch": 1.6, + "grad_norm": 1.2809024432434561, + "importance_ratio": 1.015625, + "kl_div": 0.017215736210346222, + "kl_div_pos": 0.017215736210346222, + "learning_rate": 9.429037520391516e-07, + "loss": -0.0194, + "ppo_loss": -1.0175464153289795, + "step": 659 + }, + { + "epoch": 1.6, + "grad_norm": 2.0085101270090355, + "kl_div": 0.0152102280408144, + "kl_div_sft": 0.0152102280408144, + "learning_rate": 9.412724306688418e-07, + "loss": 0.0802, + "sft_loss": 0.07296903431415558, + "step": 660 + }, + { + "epoch": 1.6, + "grad_norm": 1.3454928960594914, + "importance_ratio": 1.0390625, + "kl_div": 0.030548464506864548, + "kl_div_pos": 0.040144648402929306, + "kl_div_sft": 0.02095227874815464, + "learning_rate": 9.396411092985318e-07, + "loss": -0.1293, + "ppo_loss": -1.0409613847732544, + "sft_loss": 0.0516112744808197, + "step": 661 + }, + { + "epoch": 1.6, + "grad_norm": 1.159335992778984, + "importance_ratio": 1.03125, + "kl_div": 0.008757232688367367, + "kl_div_pos": 0.03196336701512337, + "kl_div_sft": -0.014448901638388634, + "learning_rate": 9.380097879282218e-07, + "loss": -0.0229, + "ppo_loss": -1.0324797630310059, + "sft_loss": 0.06966232508420944, + "step": 662 + }, + { + "epoch": 1.61, + "grad_norm": 1.018373848212684, + "importance_ratio": 1.0078125, + "kl_div": 0.014435206539928913, + "kl_div_pos": 0.010820697993040085, + "kl_div_sft": 0.01804971508681774, + "learning_rate": 9.363784665579119e-07, + "loss": -0.0568, + "ppo_loss": -1.0108795166015625, + "sft_loss": 0.035217855125665665, + "step": 663 + }, + { + "epoch": 1.61, + "grad_norm": 1.312975661052913, + "importance_ratio": 0.9453125, + "kl_div": -0.06321582943201065, + "kl_div_neg": -0.17330032587051392, + "kl_div_pos": 0.046868663281202316, + "learning_rate": 9.347471451876019e-07, + "loss": -0.0804, + "ppo_loss": -0.10354965925216675, + "step": 664 + }, + { + "epoch": 1.61, + "grad_norm": 1.2861361492695895, + "importance_ratio": 0.9609375, + "kl_div": -0.014686057344079018, + "kl_div_neg": -0.04050711914896965, + "kl_div_sft": 0.011135004460811615, + "learning_rate": 9.33115823817292e-07, + "loss": -0.058, + "ppo_loss": 0.9603022933006287, + "sft_loss": 0.04753648489713669, + "step": 665 + }, + { + "epoch": 1.61, + "grad_norm": 1.4443818266247967, + "importance_ratio": 0.859375, + "kl_div": -0.07007308304309845, + "kl_div_pos": -0.15025857090950012, + "kl_div_sft": 0.01011241041123867, + "learning_rate": 9.31484502446982e-07, + "loss": -0.1818, + "ppo_loss": -0.8604854345321655, + "sft_loss": 0.07846605777740479, + "step": 666 + }, + { + "epoch": 1.62, + "grad_norm": 1.0413113115080765, + "kl_div": 0.013836363330483437, + "kl_div_sft": 0.013836363330483437, + "learning_rate": 9.298531810766721e-07, + "loss": -0.0525, + "sft_loss": 0.05567178130149841, + "step": 667 + }, + { + "epoch": 1.62, + "grad_norm": 1.3742326415868675, + "importance_ratio": 0.9453125, + "kl_div": -0.022275906056165695, + "kl_div_pos": -0.05634089559316635, + "kl_div_sft": 0.011789082549512386, + "learning_rate": 9.282218597063621e-07, + "loss": -0.0949, + "ppo_loss": -0.9452168941497803, + "sft_loss": 0.11352340877056122, + "step": 668 + }, + { + "epoch": 1.62, + "grad_norm": 1.3628873501949048, + "importance_ratio": 0.90625, + "kl_div": -0.10156269371509552, + "kl_div_neg": -0.2054397463798523, + "kl_div_pos": 0.0023143519647419453, + "learning_rate": 9.265905383360522e-07, + "loss": 0.0061, + "ppo_loss": -0.09401395916938782, + "step": 669 + }, + { + "epoch": 1.62, + "grad_norm": 3.272854190831507, + "importance_ratio": 1.0078125, + "kl_div": 0.003968629986047745, + "kl_div_neg": -0.021340366452932358, + "kl_div_pos": 0.029277626425027847, + "learning_rate": 9.249592169657422e-07, + "loss": -0.1134, + "ppo_loss": -0.02541235089302063, + "step": 670 + }, + { + "epoch": 1.63, + "grad_norm": 3.120346571918228, + "kl_div": 0.009733067825436592, + "kl_div_sft": 0.009733067825436592, + "learning_rate": 9.233278955954323e-07, + "loss": -0.0521, + "sft_loss": 0.04392252117395401, + "step": 671 + }, + { + "epoch": 1.63, + "grad_norm": 1.1634629516932495, + "kl_div": 0.007590790279209614, + "kl_div_sft": 0.007590790279209614, + "learning_rate": 9.216965742251223e-07, + "loss": -0.027, + "sft_loss": 0.05298725515604019, + "step": 672 + }, + { + "epoch": 1.63, + "grad_norm": 1.0945023265341634, + "importance_ratio": 0.76171875, + "kl_div": -0.136201411485672, + "kl_div_neg": -0.2729892134666443, + "kl_div_sft": 0.0005863768747076392, + "learning_rate": 9.200652528548124e-07, + "loss": -0.0103, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.11422469466924667, + "step": 673 + }, + { + "epoch": 1.63, + "grad_norm": 1.3032575526162251, + "kl_div": 0.021982582286000252, + "kl_div_sft": 0.021982582286000252, + "learning_rate": 9.184339314845024e-07, + "loss": 0.0219, + "sft_loss": 0.04211244732141495, + "step": 674 + }, + { + "epoch": 1.64, + "grad_norm": 2.6326199696040273, + "importance_ratio": 1.03125, + "kl_div": 0.01889719069004059, + "kl_div_pos": 0.027834882959723473, + "kl_div_sft": 0.00995949748903513, + "learning_rate": 9.168026101141924e-07, + "loss": 0.0839, + "ppo_loss": -1.0282258987426758, + "sft_loss": 0.026849152520298958, + "step": 675 + }, + { + "epoch": 1.64, + "grad_norm": 1.2263257715455744, + "importance_ratio": 0.8671875, + "kl_div": -0.156332328915596, + "kl_div_neg": -0.31329450011253357, + "kl_div_pos": 0.0006298309890553355, + "learning_rate": 9.151712887438825e-07, + "loss": -0.1005, + "ppo_loss": -0.10031500458717346, + "step": 676 + }, + { + "epoch": 1.64, + "grad_norm": 1.1389040571200293, + "importance_ratio": 0.96875, + "kl_div": -0.030221477150917053, + "kl_div_pos": -0.030221477150917053, + "learning_rate": 9.135399673735725e-07, + "loss": -0.1023, + "ppo_loss": -0.9703148603439331, + "step": 677 + }, + { + "epoch": 1.64, + "grad_norm": 2.1273286065926076, + "kl_div": 0.0077500175684690475, + "kl_div_sft": 0.0077500175684690475, + "learning_rate": 9.119086460032626e-07, + "loss": 0.0415, + "sft_loss": 0.05046243965625763, + "step": 678 + }, + { + "epoch": 1.65, + "grad_norm": 1.3444536717591244, + "kl_div": 0.0037313615903258324, + "kl_div_sft": 0.0037313615903258324, + "learning_rate": 9.102773246329527e-07, + "loss": -0.0019, + "sft_loss": 0.03707285225391388, + "step": 679 + }, + { + "epoch": 1.65, + "grad_norm": 2.2594354729585935, + "importance_ratio": 1.015625, + "kl_div": 0.007647077552974224, + "kl_div_pos": 0.016760453581809998, + "kl_div_sft": -0.0014662984758615494, + "learning_rate": 9.086460032626428e-07, + "loss": 0.0399, + "ppo_loss": -1.0169017314910889, + "sft_loss": 0.07068877667188644, + "step": 680 + }, + { + "epoch": 1.65, + "grad_norm": 2.7407975111008636, + "importance_ratio": 0.91796875, + "kl_div": -0.051950324326753616, + "kl_div_neg": -0.0857582613825798, + "kl_div_sft": -0.01814238540828228, + "learning_rate": 9.070146818923328e-07, + "loss": 0.0222, + "ppo_loss": 0.9178161025047302, + "sft_loss": 0.05306997522711754, + "step": 681 + }, + { + "epoch": 1.65, + "grad_norm": 1.200354172177631, + "importance_ratio": 0.69921875, + "kl_div": -0.17709381878376007, + "kl_div_neg": -0.3601168096065521, + "kl_div_sft": 0.005929179489612579, + "learning_rate": 9.053833605220228e-07, + "loss": 0.0515, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.034444693475961685, + "step": 682 + }, + { + "epoch": 1.66, + "grad_norm": 2.1378046097572883, + "importance_ratio": 0.765625, + "kl_div": -0.1365443766117096, + "kl_div_pos": -0.26465514302253723, + "kl_div_sft": -0.00843361672013998, + "learning_rate": 9.037520391517128e-07, + "loss": 0.0251, + "ppo_loss": -0.7674705386161804, + "sft_loss": 0.13605627417564392, + "step": 683 + }, + { + "epoch": 1.66, + "grad_norm": 2.50043807959821, + "importance_ratio": 1.0, + "kl_div": 0.009185480885207653, + "kl_div_pos": 0.0008363473461940885, + "kl_div_sft": 0.017534613609313965, + "learning_rate": 9.021207177814028e-07, + "loss": -0.2217, + "ppo_loss": -1.000836730003357, + "sft_loss": 0.031047571450471878, + "step": 684 + }, + { + "epoch": 1.66, + "grad_norm": 1.0978447694539437, + "importance_ratio": 0.72265625, + "kl_div": -0.15691974759101868, + "kl_div_neg": -0.325582891702652, + "kl_div_sft": 0.011743384413421154, + "learning_rate": 9.004893964110929e-07, + "loss": 0.139, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05710052326321602, + "step": 685 + }, + { + "epoch": 1.66, + "grad_norm": 1.5917475967736583, + "importance_ratio": 0.9921875, + "kl_div": -0.007733147591352463, + "kl_div_pos": -0.008901979774236679, + "kl_div_sft": -0.006564315874129534, + "learning_rate": 8.98858075040783e-07, + "loss": -0.0465, + "ppo_loss": -0.9911375045776367, + "sft_loss": 0.058706916868686676, + "step": 686 + }, + { + "epoch": 1.67, + "grad_norm": 2.629120000393848, + "importance_ratio": 0.96484375, + "kl_div": -0.03718624636530876, + "kl_div_neg": -0.09633693844079971, + "kl_div_pos": 0.02196444384753704, + "learning_rate": 8.972267536704731e-07, + "loss": -0.1139, + "ppo_loss": -0.057024747133255005, + "step": 687 + }, + { + "epoch": 1.67, + "grad_norm": 1.1114538352944412, + "importance_ratio": 0.7265625, + "kl_div": -0.1579190343618393, + "kl_div_neg": -0.31698939204216003, + "kl_div_sft": 0.0011513223871588707, + "learning_rate": 8.955954323001631e-07, + "loss": 0.011, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040994782000780106, + "step": 688 + }, + { + "epoch": 1.67, + "grad_norm": 0.9097634642638004, + "importance_ratio": 1.03125, + "kl_div": 0.013047357089817524, + "kl_div_pos": 0.02883242629468441, + "kl_div_sft": -0.0027377125807106495, + "learning_rate": 8.939641109298532e-07, + "loss": 0.1438, + "ppo_loss": -1.029252052307129, + "sft_loss": 0.09796261787414551, + "step": 689 + }, + { + "epoch": 1.67, + "grad_norm": 1.6195585404050543, + "importance_ratio": 0.96484375, + "kl_div": -0.0072251176461577415, + "kl_div_neg": -0.037326835095882416, + "kl_div_sft": 0.022876599803566933, + "learning_rate": 8.923327895595431e-07, + "loss": -0.1642, + "ppo_loss": 0.9633612632751465, + "sft_loss": 0.061763886362314224, + "step": 690 + }, + { + "epoch": 1.68, + "grad_norm": 1.4068703025085305, + "importance_ratio": 1.0390625, + "kl_div": 0.020885484293103218, + "kl_div_pos": 0.0366450771689415, + "kl_div_sft": 0.005125890951603651, + "learning_rate": 8.907014681892332e-07, + "loss": -0.1869, + "ppo_loss": -1.0373247861862183, + "sft_loss": 0.029581492766737938, + "step": 691 + }, + { + "epoch": 1.68, + "grad_norm": 1.3389147326418513, + "kl_div": 0.006500979885458946, + "kl_div_sft": 0.006500979885458946, + "learning_rate": 8.890701468189233e-07, + "loss": 0.0491, + "sft_loss": 0.04796748608350754, + "step": 692 + }, + { + "epoch": 1.68, + "grad_norm": 1.8068521571686462, + "kl_div": -0.008079552091658115, + "kl_div_sft": -0.008079552091658115, + "learning_rate": 8.874388254486134e-07, + "loss": 0.0581, + "sft_loss": 0.05922364443540573, + "step": 693 + }, + { + "epoch": 1.68, + "grad_norm": 2.0603291039395932, + "importance_ratio": 0.890625, + "kl_div": -0.1223197802901268, + "kl_div_neg": -0.24422934651374817, + "kl_div_pos": -0.0004102161037735641, + "learning_rate": 8.858075040783034e-07, + "loss": -0.0071, + "ppo_loss": -0.09979492425918579, + "step": 694 + }, + { + "epoch": 1.68, + "grad_norm": 2.567755171920564, + "importance_ratio": 0.96875, + "kl_div": -0.03187120705842972, + "kl_div_pos": -0.030253032222390175, + "kl_div_sft": -0.03348938003182411, + "learning_rate": 8.841761827079935e-07, + "loss": 0.0119, + "ppo_loss": -0.9701999425888062, + "sft_loss": 0.1174488514661789, + "step": 695 + }, + { + "epoch": 1.69, + "grad_norm": 1.9102247729034183, + "importance_ratio": 0.7890625, + "kl_div": -0.10715979337692261, + "kl_div_neg": -0.2363295704126358, + "kl_div_sft": 0.022009991109371185, + "learning_rate": 8.825448613376835e-07, + "loss": 0.1096, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029377589002251625, + "step": 696 + }, + { + "epoch": 1.69, + "grad_norm": 1.4168552198404951, + "kl_div": 0.016493186354637146, + "kl_div_sft": 0.016493186354637146, + "learning_rate": 8.809135399673735e-07, + "loss": -0.0076, + "sft_loss": 0.038798198103904724, + "step": 697 + }, + { + "epoch": 1.69, + "grad_norm": 1.2051259035997948, + "importance_ratio": 0.99609375, + "kl_div": 0.00634672399610281, + "kl_div_pos": -0.005619870498776436, + "kl_div_sft": 0.018313318490982056, + "learning_rate": 8.792822185970635e-07, + "loss": 0.0683, + "ppo_loss": -0.994395911693573, + "sft_loss": 0.021961260586977005, + "step": 698 + }, + { + "epoch": 1.69, + "grad_norm": 1.9214684281188614, + "importance_ratio": 0.80078125, + "kl_div": -0.1102023497223854, + "kl_div_neg": -0.2217598259449005, + "kl_div_sft": 0.0013551327865570784, + "learning_rate": 8.776508972267536e-07, + "loss": -0.0939, + "ppo_loss": 0.8011077642440796, + "sft_loss": 0.0796297937631607, + "step": 699 + }, + { + "epoch": 1.7, + "grad_norm": 1.5797531491607557, + "importance_ratio": 0.84375, + "kl_div": -0.16946262121200562, + "kl_div_neg": -0.16946262121200562, + "learning_rate": 8.760195758564437e-07, + "loss": -0.2183, + "ppo_loss": 0.8442213535308838, + "step": 700 + }, + { + "epoch": 1.7, + "grad_norm": 0.9745875721176449, + "importance_ratio": 1.015625, + "kl_div": 0.01686589978635311, + "kl_div_pos": 0.01686589978635311, + "learning_rate": 8.743882544861337e-07, + "loss": -0.1424, + "ppo_loss": -1.0171172618865967, + "step": 701 + }, + { + "epoch": 1.7, + "grad_norm": 1.0281778872722713, + "importance_ratio": 0.734375, + "kl_div": -0.1530963033437729, + "kl_div_neg": -0.31053248047828674, + "kl_div_sft": 0.004339868202805519, + "learning_rate": 8.727569331158238e-07, + "loss": 0.0715, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13882605731487274, + "step": 702 + }, + { + "epoch": 1.7, + "grad_norm": 1.2211496741826067, + "importance_ratio": 0.8046875, + "kl_div": -0.24385526776313782, + "kl_div_neg": -0.4737330973148346, + "kl_div_pos": -0.013977449387311935, + "learning_rate": 8.711256117455138e-07, + "loss": -0.0602, + "ppo_loss": -0.09305989742279053, + "step": 703 + }, + { + "epoch": 1.71, + "grad_norm": 1.7288146369197956, + "importance_ratio": 0.859375, + "kl_div": -0.1643335372209549, + "kl_div_neg": -0.31171128153800964, + "kl_div_pos": -0.016955794766545296, + "learning_rate": 8.694942903752039e-07, + "loss": 0.0194, + "ppo_loss": -0.09159353375434875, + "step": 704 + }, + { + "epoch": 1.71, + "grad_norm": 1.391555217841228, + "importance_ratio": 0.859375, + "kl_div": -0.1633143424987793, + "kl_div_neg": -0.2973756492137909, + "kl_div_pos": -0.029253020882606506, + "learning_rate": 8.67862969004894e-07, + "loss": -0.0563, + "ppo_loss": -0.08558535575866699, + "step": 705 + }, + { + "epoch": 1.71, + "grad_norm": 0.925779576038505, + "importance_ratio": 0.8984375, + "kl_div": -0.12373246997594833, + "kl_div_neg": -0.29810261726379395, + "kl_div_pos": 0.050637681037187576, + "learning_rate": 8.66231647634584e-07, + "loss": -0.0361, + "ppo_loss": -0.12597081065177917, + "step": 706 + }, + { + "epoch": 1.71, + "grad_norm": 1.6802199828218412, + "importance_ratio": 0.9921875, + "kl_div": -0.006216323934495449, + "kl_div_pos": -0.007722379639744759, + "kl_div_sft": -0.0047102682292461395, + "learning_rate": 8.64600326264274e-07, + "loss": -0.0714, + "ppo_loss": -0.9923073649406433, + "sft_loss": 0.14663028717041016, + "step": 707 + }, + { + "epoch": 1.72, + "grad_norm": 4.493721290464836, + "kl_div": 0.01159009337425232, + "kl_div_sft": 0.01159009337425232, + "learning_rate": 8.629690048939641e-07, + "loss": 0.0042, + "sft_loss": 0.040383607149124146, + "step": 708 + }, + { + "epoch": 1.72, + "grad_norm": 1.1044778325423805, + "importance_ratio": 0.76953125, + "kl_div": -0.12625326216220856, + "kl_div_neg": -0.26072871685028076, + "kl_div_sft": 0.008222181349992752, + "learning_rate": 8.613376835236541e-07, + "loss": -0.1057, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.049650732427835464, + "step": 709 + }, + { + "epoch": 1.72, + "grad_norm": 1.8488862216122248, + "kl_div": 0.011091427877545357, + "kl_div_sft": 0.011091427877545357, + "learning_rate": 8.597063621533441e-07, + "loss": -0.023, + "sft_loss": 0.06612934917211533, + "step": 710 + }, + { + "epoch": 1.72, + "grad_norm": 1.3987795448762412, + "importance_ratio": 0.98046875, + "kl_div": -0.004307438153773546, + "kl_div_pos": -0.021483710035681725, + "kl_div_sft": 0.012868833728134632, + "learning_rate": 8.580750407830342e-07, + "loss": -0.0557, + "ppo_loss": -0.9787454605102539, + "sft_loss": 0.007901977747678757, + "step": 711 + }, + { + "epoch": 1.73, + "grad_norm": 1.7901759784653886, + "kl_div": -0.007937498390674591, + "kl_div_sft": -0.007937498390674591, + "learning_rate": 8.564437194127243e-07, + "loss": -0.0439, + "sft_loss": 0.1107691079378128, + "step": 712 + }, + { + "epoch": 1.73, + "grad_norm": 3.1402156430200727, + "kl_div": 0.008624709211289883, + "kl_div_sft": 0.008624709211289883, + "learning_rate": 8.548123980424144e-07, + "loss": 0.0598, + "sft_loss": 0.05024949088692665, + "step": 713 + }, + { + "epoch": 1.73, + "grad_norm": 1.5056068218522578, + "kl_div": -0.0017463699914515018, + "kl_div_sft": -0.0017463699914515018, + "learning_rate": 8.531810766721043e-07, + "loss": -0.1127, + "sft_loss": 0.0635661706328392, + "step": 714 + }, + { + "epoch": 1.73, + "grad_norm": 1.7111928392202491, + "importance_ratio": 0.953125, + "kl_div": -0.051379989832639694, + "kl_div_pos": -0.051379989832639694, + "learning_rate": 8.515497553017944e-07, + "loss": -0.0515, + "ppo_loss": -0.953635573387146, + "step": 715 + }, + { + "epoch": 1.74, + "grad_norm": 2.907991277332683, + "kl_div": -0.0014508292078971863, + "kl_div_sft": -0.0014508292078971863, + "learning_rate": 8.499184339314844e-07, + "loss": -0.0116, + "sft_loss": 0.08951911330223083, + "step": 716 + }, + { + "epoch": 1.74, + "grad_norm": 1.0871951278765424, + "kl_div": 0.011151503771543503, + "kl_div_sft": 0.011151503771543503, + "learning_rate": 8.482871125611745e-07, + "loss": -0.1067, + "sft_loss": 0.035712748765945435, + "step": 717 + }, + { + "epoch": 1.74, + "grad_norm": 2.998019200299793, + "importance_ratio": 1.046875, + "kl_div": 0.042760252952575684, + "kl_div_pos": 0.042760252952575684, + "learning_rate": 8.466557911908646e-07, + "loss": -0.0357, + "ppo_loss": -1.0436962842941284, + "step": 718 + }, + { + "epoch": 1.74, + "grad_norm": 1.0650319616688604, + "importance_ratio": 0.97265625, + "kl_div": -0.029345382004976273, + "kl_div_pos": -0.028837639838457108, + "kl_div_sft": -0.029853124171495438, + "learning_rate": 8.450244698205547e-07, + "loss": -0.1348, + "ppo_loss": -0.9715741872787476, + "sft_loss": 0.09074969589710236, + "step": 719 + }, + { + "epoch": 1.75, + "grad_norm": 2.3564983287653996, + "kl_div": 0.011116044595837593, + "kl_div_sft": 0.011116044595837593, + "learning_rate": 8.433931484502447e-07, + "loss": -0.1878, + "sft_loss": 0.038140468299388885, + "step": 720 + }, + { + "epoch": 1.75, + "grad_norm": 1.3253419776665938, + "importance_ratio": 0.953125, + "kl_div": -0.047147031873464584, + "kl_div_neg": -0.09410122781991959, + "kl_div_pos": -0.0001928337151184678, + "learning_rate": 8.417618270799347e-07, + "loss": -0.0887, + "ppo_loss": -0.044808268547058105, + "step": 721 + }, + { + "epoch": 1.75, + "grad_norm": 1.0211129032781585, + "kl_div": 0.012244272977113724, + "kl_div_sft": 0.012244272977113724, + "learning_rate": 8.401305057096247e-07, + "loss": -0.0017, + "sft_loss": 0.09197733551263809, + "step": 722 + }, + { + "epoch": 1.75, + "grad_norm": 0.9798829657874045, + "kl_div": -0.0023552451748400927, + "kl_div_sft": -0.0023552451748400927, + "learning_rate": 8.384991843393147e-07, + "loss": -0.0282, + "sft_loss": 0.06323568522930145, + "step": 723 + }, + { + "epoch": 1.76, + "grad_norm": 1.1345348024910273, + "importance_ratio": 0.6796875, + "kl_div": -0.39538222551345825, + "kl_div_neg": -0.550060510635376, + "kl_div_pos": -0.24070391058921814, + "learning_rate": 8.368678629690048e-07, + "loss": -0.0131, + "ppo_loss": 0.006962805986404419, + "step": 724 + }, + { + "epoch": 1.76, + "grad_norm": 4.913288291990402, + "importance_ratio": 0.75, + "kl_div": -0.14327985048294067, + "kl_div_neg": -0.28823381662368774, + "kl_div_sft": 0.0016741086728870869, + "learning_rate": 8.352365415986949e-07, + "loss": 0.0042, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0790887251496315, + "step": 725 + }, + { + "epoch": 1.76, + "grad_norm": 1.1523942322099083, + "importance_ratio": 1.015625, + "kl_div": 0.0058078221045434475, + "kl_div_pos": 0.012630930170416832, + "kl_div_sft": -0.0010152860777452588, + "learning_rate": 8.33605220228385e-07, + "loss": -0.1444, + "ppo_loss": -1.0127110481262207, + "sft_loss": 0.059882428497076035, + "step": 726 + }, + { + "epoch": 1.76, + "grad_norm": 1.1735775596041245, + "kl_div": -0.030080385506153107, + "kl_div_sft": -0.030080385506153107, + "learning_rate": 8.31973898858075e-07, + "loss": -0.0156, + "sft_loss": 0.09316494315862656, + "step": 727 + }, + { + "epoch": 1.76, + "grad_norm": 1.015282441102564, + "importance_ratio": 0.8515625, + "kl_div": -0.17228472232818604, + "kl_div_neg": -0.3388327360153198, + "kl_div_pos": -0.005736696999520063, + "learning_rate": 8.303425774877651e-07, + "loss": 0.0329, + "ppo_loss": -0.0971398651599884, + "step": 728 + }, + { + "epoch": 1.77, + "grad_norm": 1.0764164862974561, + "importance_ratio": 1.0234375, + "kl_div": 0.020020831376314163, + "kl_div_pos": 0.023958861827850342, + "kl_div_sft": 0.016082800924777985, + "learning_rate": 8.287112561174551e-07, + "loss": -0.1059, + "ppo_loss": -1.0242482423782349, + "sft_loss": 0.02947784587740898, + "step": 729 + }, + { + "epoch": 1.77, + "grad_norm": 1.3000123096763578, + "importance_ratio": 0.828125, + "kl_div": -0.09342669695615768, + "kl_div_neg": -0.18810221552848816, + "kl_div_sft": 0.0012488183565437794, + "learning_rate": 8.270799347471451e-07, + "loss": -0.0962, + "ppo_loss": 0.8285300731658936, + "sft_loss": 0.041453879326581955, + "step": 730 + }, + { + "epoch": 1.77, + "grad_norm": 1.1521890827413293, + "kl_div": 0.01180996373295784, + "kl_div_sft": 0.01180996373295784, + "learning_rate": 8.254486133768352e-07, + "loss": -0.0657, + "sft_loss": 0.05986681208014488, + "step": 731 + }, + { + "epoch": 1.77, + "grad_norm": 1.7080742554257822, + "importance_ratio": 0.7421875, + "kl_div": -0.30273953080177307, + "kl_div_neg": -0.30273953080177307, + "learning_rate": 8.238172920065253e-07, + "loss": 0.1383, + "ppo_loss": 0.806269645690918, + "step": 732 + }, + { + "epoch": 1.78, + "grad_norm": 1.0960210926991685, + "kl_div": 0.010733826085925102, + "kl_div_sft": 0.010733826085925102, + "learning_rate": 8.221859706362153e-07, + "loss": 0.0392, + "sft_loss": 0.052053019404411316, + "step": 733 + }, + { + "epoch": 1.78, + "grad_norm": 1.0775148074732415, + "importance_ratio": 0.9609375, + "kl_div": -0.03888477012515068, + "kl_div_neg": -0.08518827706575394, + "kl_div_pos": 0.007418735418468714, + "learning_rate": 8.205546492659053e-07, + "loss": 0.0344, + "ppo_loss": -0.044553518295288086, + "step": 734 + }, + { + "epoch": 1.78, + "grad_norm": 1.0134316455579038, + "importance_ratio": 1.0234375, + "kl_div": 0.01093167345970869, + "kl_div_pos": 0.020649367943406105, + "kl_div_sft": 0.0012139781611040235, + "learning_rate": 8.189233278955954e-07, + "loss": 0.0421, + "ppo_loss": -1.0208640098571777, + "sft_loss": 0.07622949033975601, + "step": 735 + }, + { + "epoch": 1.78, + "grad_norm": 1.2524015721461401, + "importance_ratio": 1.015625, + "kl_div": 0.0013436046428978443, + "kl_div_pos": 0.013851309195160866, + "kl_div_sft": -0.011164099909365177, + "learning_rate": 8.172920065252854e-07, + "loss": -0.0786, + "ppo_loss": -1.013947606086731, + "sft_loss": 0.050719670951366425, + "step": 736 + }, + { + "epoch": 1.79, + "grad_norm": 1.1603974884658244, + "importance_ratio": 0.84765625, + "kl_div": -0.17790842056274414, + "kl_div_neg": -0.3409663736820221, + "kl_div_pos": -0.014850452542304993, + "learning_rate": 8.156606851549756e-07, + "loss": 0.0241, + "ppo_loss": -0.09262964129447937, + "step": 737 + }, + { + "epoch": 1.79, + "grad_norm": 5.249287148172106, + "importance_ratio": 1.0, + "kl_div": 0.006068367511034012, + "kl_div_pos": -5.9042658904218115e-06, + "kl_div_sft": 0.012142639607191086, + "learning_rate": 8.140293637846656e-07, + "loss": -0.094, + "ppo_loss": -0.999994158744812, + "sft_loss": 0.024276066571474075, + "step": 738 + }, + { + "epoch": 1.79, + "grad_norm": 0.9898027611776252, + "kl_div": 0.0026179328560829163, + "kl_div_sft": 0.0026179328560829163, + "learning_rate": 8.123980424143556e-07, + "loss": 0.0992, + "sft_loss": 0.060176316648721695, + "step": 739 + }, + { + "epoch": 1.79, + "grad_norm": 1.0231015185327998, + "importance_ratio": 0.59375, + "kl_div": -0.26476937532424927, + "kl_div_neg": -0.5224133729934692, + "kl_div_sft": -0.0071253604255616665, + "learning_rate": 8.107667210440456e-07, + "loss": -0.1075, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.20790791511535645, + "step": 740 + }, + { + "epoch": 1.8, + "grad_norm": 1.1156809325315264, + "importance_ratio": 0.92578125, + "kl_div": -0.023399610072374344, + "kl_div_pos": -0.0772751122713089, + "kl_div_sft": 0.03047589212656021, + "learning_rate": 8.091353996737357e-07, + "loss": -0.0045, + "ppo_loss": -0.9256352186203003, + "sft_loss": 0.03140160068869591, + "step": 741 + }, + { + "epoch": 1.8, + "grad_norm": 0.9824747181325911, + "kl_div": 0.010281100869178772, + "kl_div_sft": 0.010281100869178772, + "learning_rate": 8.075040783034257e-07, + "loss": -0.0006, + "sft_loss": 0.036707181483507156, + "step": 742 + }, + { + "epoch": 1.8, + "grad_norm": 1.057966320850275, + "kl_div": 0.02083955705165863, + "kl_div_sft": 0.02083955705165863, + "learning_rate": 8.058727569331158e-07, + "loss": 0.0689, + "sft_loss": 0.07779664546251297, + "step": 743 + }, + { + "epoch": 1.8, + "grad_norm": 2.9374123418187548, + "importance_ratio": 1.015625, + "kl_div": 0.0048860725946724415, + "kl_div_pos": 0.012217046692967415, + "kl_div_sft": -0.002444901503622532, + "learning_rate": 8.042414355628059e-07, + "loss": -0.0725, + "ppo_loss": -1.0122920274734497, + "sft_loss": 0.06515536457300186, + "step": 744 + }, + { + "epoch": 1.81, + "grad_norm": 1.1769438410562778, + "importance_ratio": 0.7734375, + "kl_div": -0.13132451474666595, + "kl_div_neg": -0.25582724809646606, + "kl_div_sft": -0.006821789778769016, + "learning_rate": 8.026101141924959e-07, + "loss": -0.0015, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09594070911407471, + "step": 745 + }, + { + "epoch": 1.81, + "grad_norm": 1.1242838805566813, + "importance_ratio": 0.91796875, + "kl_div": -0.09715739637613297, + "kl_div_neg": -0.23688404262065887, + "kl_div_pos": 0.042569246143102646, + "learning_rate": 8.00978792822186e-07, + "loss": -0.0239, + "ppo_loss": -0.12174412608146667, + "step": 746 + }, + { + "epoch": 1.81, + "grad_norm": 0.7507215843348847, + "kl_div": 0.01962902955710888, + "kl_div_sft": 0.01962902955710888, + "learning_rate": 7.993474714518759e-07, + "loss": 0.017, + "sft_loss": 0.013374298810958862, + "step": 747 + }, + { + "epoch": 1.81, + "grad_norm": 2.0844715696552276, + "kl_div": 0.016665317118167877, + "kl_div_sft": 0.016665317118167877, + "learning_rate": 7.97716150081566e-07, + "loss": -0.0687, + "sft_loss": 0.04137241095304489, + "step": 748 + }, + { + "epoch": 1.82, + "grad_norm": 1.7014040870593201, + "kl_div": 0.022186122834682465, + "kl_div_sft": 0.022186122834682465, + "learning_rate": 7.96084828711256e-07, + "loss": 0.0706, + "sft_loss": 0.042967766523361206, + "step": 749 + }, + { + "epoch": 1.82, + "grad_norm": 0.851161547419168, + "importance_ratio": 0.6484375, + "kl_div": -0.4384397864341736, + "kl_div_neg": -0.4384397864341736, + "learning_rate": 7.944535073409462e-07, + "loss": -0.0331, + "ppo_loss": 0.800000011920929, + "step": 750 + }, + { + "epoch": 1.82, + "grad_norm": 1.1825319065684825, + "importance_ratio": 1.0390625, + "kl_div": 0.02095329761505127, + "kl_div_pos": 0.03723027557134628, + "kl_div_sft": 0.004676317796111107, + "learning_rate": 7.928221859706362e-07, + "loss": 0.1072, + "ppo_loss": -1.03793203830719, + "sft_loss": 0.04656980186700821, + "step": 751 + }, + { + "epoch": 1.82, + "grad_norm": 1.930849906686716, + "importance_ratio": 0.8125, + "kl_div": -0.217138409614563, + "kl_div_neg": -0.217138409614563, + "learning_rate": 7.911908646003263e-07, + "loss": -0.0073, + "ppo_loss": 0.8517619371414185, + "step": 752 + }, + { + "epoch": 1.83, + "grad_norm": 2.1123037845293955, + "importance_ratio": 0.765625, + "kl_div": -0.2653629183769226, + "kl_div_neg": -0.31069087982177734, + "kl_div_pos": -0.22003495693206787, + "learning_rate": 7.895595432300163e-07, + "loss": -0.1358, + "ppo_loss": -0.0012453794479370117, + "step": 753 + }, + { + "epoch": 1.83, + "grad_norm": 0.9361953876798991, + "importance_ratio": 0.859375, + "kl_div": -0.06817443668842316, + "kl_div_neg": -0.15062358975410461, + "kl_div_sft": 0.014274713583290577, + "learning_rate": 7.879282218597064e-07, + "loss": 0.0415, + "ppo_loss": 0.8601714372634888, + "sft_loss": 0.015392109751701355, + "step": 754 + }, + { + "epoch": 1.83, + "grad_norm": 1.2295459496194712, + "importance_ratio": 0.78125, + "kl_div": -0.24351197481155396, + "kl_div_neg": -0.24351197481155396, + "learning_rate": 7.862969004893963e-07, + "loss": 0.1713, + "ppo_loss": 0.800000011920929, + "step": 755 + }, + { + "epoch": 1.83, + "grad_norm": 1.9602276858604508, + "importance_ratio": 0.77734375, + "kl_div": -0.12892000377178192, + "kl_div_neg": -0.2504558265209198, + "kl_div_sft": -0.007384192198514938, + "learning_rate": 7.846655791190863e-07, + "loss": -0.115, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09705698490142822, + "step": 756 + }, + { + "epoch": 1.84, + "grad_norm": 1.3345687725516597, + "kl_div": 0.003265599487349391, + "kl_div_sft": 0.003265599487349391, + "learning_rate": 7.830342577487765e-07, + "loss": -0.082, + "sft_loss": 0.08901108801364899, + "step": 757 + }, + { + "epoch": 1.84, + "grad_norm": 1.2540828782457056, + "importance_ratio": 0.984375, + "kl_div": -0.012916380539536476, + "kl_div_neg": -0.05277905613183975, + "kl_div_pos": 0.0269462950527668, + "learning_rate": 7.814029363784665e-07, + "loss": 0.0208, + "ppo_loss": -0.039361536502838135, + "step": 758 + }, + { + "epoch": 1.84, + "grad_norm": 1.1282596518342276, + "importance_ratio": 0.703125, + "kl_div": -0.1804586499929428, + "kl_div_neg": -0.3504132628440857, + "kl_div_sft": -0.010504036210477352, + "learning_rate": 7.797716150081566e-07, + "loss": -0.0069, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.1207689493894577, + "step": 759 + }, + { + "epoch": 1.84, + "grad_norm": 1.3429166294389452, + "importance_ratio": 0.84375, + "kl_div": -0.1911240667104721, + "kl_div_neg": -0.40841782093048096, + "kl_div_pos": 0.026169700548052788, + "learning_rate": 7.781402936378466e-07, + "loss": -0.0265, + "ppo_loss": -0.11325755715370178, + "step": 760 + }, + { + "epoch": 1.84, + "grad_norm": 0.7505495124612518, + "importance_ratio": 0.875, + "kl_div": -0.13588908314704895, + "kl_div_pos": -0.13588908314704895, + "learning_rate": 7.765089722675367e-07, + "loss": -0.0604, + "ppo_loss": -0.8754533529281616, + "step": 761 + }, + { + "epoch": 1.85, + "grad_norm": 1.8840970924126217, + "kl_div": 0.0035053137689828873, + "kl_div_sft": 0.0035053137689828873, + "learning_rate": 7.748776508972267e-07, + "loss": -0.0674, + "sft_loss": 0.07195758819580078, + "step": 762 + }, + { + "epoch": 1.85, + "grad_norm": 1.4304368186899392, + "importance_ratio": 1.015625, + "kl_div": 0.0013200687244534492, + "kl_div_pos": 0.012445573695003986, + "kl_div_sft": -0.009805436246097088, + "learning_rate": 7.732463295269168e-07, + "loss": -0.0422, + "ppo_loss": -1.0125232934951782, + "sft_loss": 0.09700474143028259, + "step": 763 + }, + { + "epoch": 1.85, + "grad_norm": 1.1241353208801281, + "importance_ratio": 0.6640625, + "kl_div": -0.19818703830242157, + "kl_div_neg": -0.4117022752761841, + "kl_div_sft": 0.015328200533986092, + "learning_rate": 7.716150081566068e-07, + "loss": -0.0166, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0617440789937973, + "step": 764 + }, + { + "epoch": 1.85, + "grad_norm": 1.331659617558531, + "importance_ratio": 0.7421875, + "kl_div": -0.14339332282543182, + "kl_div_neg": -0.29687318205833435, + "kl_div_sft": 0.010086532682180405, + "learning_rate": 7.699836867862969e-07, + "loss": 0.032, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05432585999369621, + "step": 765 + }, + { + "epoch": 1.86, + "grad_norm": 0.9274721116245701, + "importance_ratio": 1.0546875, + "kl_div": 0.0332191027700901, + "kl_div_pos": 0.05039292946457863, + "kl_div_sft": 0.016045276075601578, + "learning_rate": 7.683523654159869e-07, + "loss": -0.0024, + "ppo_loss": -1.0516842603683472, + "sft_loss": 0.03535175696015358, + "step": 766 + }, + { + "epoch": 1.86, + "grad_norm": 2.7703251958342054, + "importance_ratio": 0.96875, + "kl_div": -0.030892496928572655, + "kl_div_neg": -0.10311172157526016, + "kl_div_pos": 0.04132672771811485, + "learning_rate": 7.66721044045677e-07, + "loss": -0.0743, + "ppo_loss": -0.07008317112922668, + "step": 767 + }, + { + "epoch": 1.86, + "grad_norm": 3.411228007144001, + "importance_ratio": 0.7265625, + "kl_div": -0.15510594844818115, + "kl_div_neg": -0.32163888216018677, + "kl_div_sft": 0.011426973156630993, + "learning_rate": 7.65089722675367e-07, + "loss": -0.1231, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.042363401502370834, + "step": 768 + }, + { + "epoch": 1.86, + "grad_norm": 1.0573455659003186, + "importance_ratio": 1.0546875, + "kl_div": 0.02536001428961754, + "kl_div_pos": 0.052674584090709686, + "kl_div_sft": -0.0019545569084584713, + "learning_rate": 7.63458401305057e-07, + "loss": -0.0199, + "ppo_loss": -1.0540865659713745, + "sft_loss": 0.06650757044553757, + "step": 769 + }, + { + "epoch": 1.87, + "grad_norm": 1.2626625522927828, + "importance_ratio": 1.0390625, + "kl_div": 0.03157272934913635, + "kl_div_pos": 0.03687372803688049, + "kl_div_sft": 0.026271730661392212, + "learning_rate": 7.618270799347472e-07, + "loss": 0.0742, + "ppo_loss": -1.0375620126724243, + "sft_loss": 0.0405106358230114, + "step": 770 + }, + { + "epoch": 1.87, + "grad_norm": 1.1863100715597479, + "importance_ratio": 1.03125, + "kl_div": 0.021939104422926903, + "kl_div_pos": 0.032872870564460754, + "kl_div_sft": 0.011005338281393051, + "learning_rate": 7.601957585644371e-07, + "loss": 0.0095, + "ppo_loss": -1.033419132232666, + "sft_loss": 0.019082684069871902, + "step": 771 + }, + { + "epoch": 1.87, + "grad_norm": 1.3042253600899398, + "kl_div": -0.00686268787831068, + "kl_div_sft": -0.00686268787831068, + "learning_rate": 7.585644371941272e-07, + "loss": -0.0553, + "sft_loss": 0.0654834434390068, + "step": 772 + }, + { + "epoch": 1.87, + "grad_norm": 1.2348815264999986, + "kl_div": 0.004872541408985853, + "kl_div_sft": 0.004872541408985853, + "learning_rate": 7.569331158238172e-07, + "loss": 0.0428, + "sft_loss": 0.05468171834945679, + "step": 773 + }, + { + "epoch": 1.88, + "grad_norm": 3.4244027055488826, + "importance_ratio": 1.0390625, + "kl_div": 0.01898658275604248, + "kl_div_pos": 0.03458033502101898, + "kl_div_sft": 0.00339283118955791, + "learning_rate": 7.553017944535073e-07, + "loss": -0.041, + "ppo_loss": -1.0351852178573608, + "sft_loss": 0.050571754574775696, + "step": 774 + }, + { + "epoch": 1.88, + "grad_norm": 1.7280742534339981, + "importance_ratio": 1.03125, + "kl_div": 0.027021419256925583, + "kl_div_pos": 0.027021419256925583, + "learning_rate": 7.536704730831973e-07, + "loss": -0.1909, + "ppo_loss": -1.0274732112884521, + "step": 775 + }, + { + "epoch": 1.88, + "grad_norm": 2.228668394878667, + "importance_ratio": 0.8203125, + "kl_div": -0.09695656597614288, + "kl_div_pos": -0.1973302811384201, + "kl_div_sft": 0.0034171519801020622, + "learning_rate": 7.520391517128875e-07, + "loss": -0.0268, + "ppo_loss": -0.8209194540977478, + "sft_loss": 0.10032328963279724, + "step": 776 + }, + { + "epoch": 1.88, + "grad_norm": 1.9671372362522144, + "importance_ratio": 0.90625, + "kl_div": -0.03965136408805847, + "kl_div_neg": -0.09838567674160004, + "kl_div_sft": 0.01908295229077339, + "learning_rate": 7.504078303425775e-07, + "loss": -0.0087, + "ppo_loss": 0.9062992930412292, + "sft_loss": 0.06949137151241302, + "step": 777 + }, + { + "epoch": 1.89, + "grad_norm": 1.5321125792878874, + "importance_ratio": 1.046875, + "kl_div": 0.024745360016822815, + "kl_div_pos": 0.042934924364089966, + "kl_div_sft": 0.006555794272571802, + "learning_rate": 7.487765089722676e-07, + "loss": -0.1395, + "ppo_loss": -1.043869972229004, + "sft_loss": 0.085297130048275, + "step": 778 + }, + { + "epoch": 1.89, + "grad_norm": 1.0138319179490123, + "importance_ratio": 1.0390625, + "kl_div": 0.024765880778431892, + "kl_div_pos": 0.03781313821673393, + "kl_div_sft": 0.011718622408807278, + "learning_rate": 7.471451876019575e-07, + "loss": 0.0432, + "ppo_loss": -1.0385371446609497, + "sft_loss": 0.03418990597128868, + "step": 779 + }, + { + "epoch": 1.89, + "grad_norm": 1.8636929678937624, + "importance_ratio": 0.8671875, + "kl_div": -0.062706358730793, + "kl_div_neg": -0.1413581222295761, + "kl_div_sft": 0.01594540849328041, + "learning_rate": 7.455138662316476e-07, + "loss": 0.0171, + "ppo_loss": 0.8681783676147461, + "sft_loss": 0.06166466325521469, + "step": 780 + }, + { + "epoch": 1.89, + "grad_norm": 1.9722408046055935, + "importance_ratio": 0.7734375, + "kl_div": -0.12510272860527039, + "kl_div_neg": -0.2547655999660492, + "kl_div_sft": 0.0045601557940244675, + "learning_rate": 7.438825448613376e-07, + "loss": -0.0757, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09346868097782135, + "step": 781 + }, + { + "epoch": 1.9, + "grad_norm": 2.440355828714277, + "importance_ratio": 1.0546875, + "kl_div": 0.019144684076309204, + "kl_div_pos": 0.05361033231019974, + "kl_div_sft": -0.015320963226258755, + "learning_rate": 7.422512234910276e-07, + "loss": -0.0695, + "ppo_loss": -1.0550734996795654, + "sft_loss": 0.08448739349842072, + "step": 782 + }, + { + "epoch": 1.9, + "grad_norm": 1.55208335040133, + "importance_ratio": 1.03125, + "kl_div": 0.012653040699660778, + "kl_div_pos": 0.027025144547224045, + "kl_div_sft": -0.0017190633807331324, + "learning_rate": 7.406199021207178e-07, + "loss": -0.0467, + "ppo_loss": -1.0273935794830322, + "sft_loss": 0.06036945432424545, + "step": 783 + }, + { + "epoch": 1.9, + "grad_norm": 1.109984968960003, + "importance_ratio": 0.890625, + "kl_div": -0.061940498650074005, + "kl_div_neg": -0.11718282103538513, + "kl_div_sft": -0.00669817766174674, + "learning_rate": 7.389885807504078e-07, + "loss": -0.0052, + "ppo_loss": 0.8894225358963013, + "sft_loss": 0.13916446268558502, + "step": 784 + }, + { + "epoch": 1.9, + "grad_norm": 1.6532969509756765, + "importance_ratio": 0.75, + "kl_div": -0.14871735870838165, + "kl_div_neg": -0.28607773780822754, + "kl_div_sft": -0.011356977745890617, + "learning_rate": 7.373572593800979e-07, + "loss": -0.0826, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07842813432216644, + "step": 785 + }, + { + "epoch": 1.91, + "grad_norm": 1.9565854964026683, + "importance_ratio": 1.0234375, + "kl_div": 0.010597184300422668, + "kl_div_pos": 0.02512553334236145, + "kl_div_sft": -0.003931163810193539, + "learning_rate": 7.357259380097879e-07, + "loss": -0.0408, + "ppo_loss": -1.0254437923431396, + "sft_loss": 0.05996675416827202, + "step": 786 + }, + { + "epoch": 1.91, + "grad_norm": 3.3010744946477453, + "importance_ratio": 0.99609375, + "kl_div": 0.0037558861076831818, + "kl_div_pos": -0.0037879080045968294, + "kl_div_sft": 0.01129967998713255, + "learning_rate": 7.340946166394779e-07, + "loss": -0.023, + "ppo_loss": -0.996219277381897, + "sft_loss": 0.0625493973493576, + "step": 787 + }, + { + "epoch": 1.91, + "grad_norm": 3.899235890189737, + "kl_div": 0.01014263927936554, + "kl_div_sft": 0.01014263927936554, + "learning_rate": 7.324632952691679e-07, + "loss": -0.0615, + "sft_loss": 0.0858975425362587, + "step": 788 + }, + { + "epoch": 1.91, + "grad_norm": 1.0165068685822132, + "importance_ratio": 0.93359375, + "kl_div": -0.01569426991045475, + "kl_div_neg": -0.06786607205867767, + "kl_div_sft": 0.03647753223776817, + "learning_rate": 7.308319738988581e-07, + "loss": 0.0064, + "ppo_loss": 0.9343855977058411, + "sft_loss": 0.06460925936698914, + "step": 789 + }, + { + "epoch": 1.92, + "grad_norm": 4.0812106681202, + "importance_ratio": 1.046875, + "kl_div": 0.021152887493371964, + "kl_div_pos": 0.047909412533044815, + "kl_div_sft": -0.005603638477623463, + "learning_rate": 7.292006525285481e-07, + "loss": -0.0341, + "ppo_loss": -1.0490756034851074, + "sft_loss": 0.04746154323220253, + "step": 790 + }, + { + "epoch": 1.92, + "grad_norm": 1.6711705229869975, + "importance_ratio": 0.9296875, + "kl_div": -0.07854534685611725, + "kl_div_neg": -0.17257368564605713, + "kl_div_pos": 0.015482988208532333, + "learning_rate": 7.275693311582382e-07, + "loss": -0.2077, + "ppo_loss": -0.0870535671710968, + "step": 791 + }, + { + "epoch": 1.92, + "grad_norm": 1.9255106583040016, + "importance_ratio": 1.0390625, + "kl_div": 0.02168772555887699, + "kl_div_pos": 0.040172941982746124, + "kl_div_sft": 0.003202509367838502, + "learning_rate": 7.259380097879282e-07, + "loss": -0.2886, + "ppo_loss": -1.0409908294677734, + "sft_loss": 0.055382560938596725, + "step": 792 + }, + { + "epoch": 1.92, + "grad_norm": 3.2895379889376657, + "importance_ratio": 0.609375, + "kl_div": -0.24140967428684235, + "kl_div_neg": -0.49316883087158203, + "kl_div_sft": 0.010349491611123085, + "learning_rate": 7.243066884176182e-07, + "loss": 0.0002, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.058419495820999146, + "step": 793 + }, + { + "epoch": 1.92, + "grad_norm": 1.3688321493919273, + "importance_ratio": 0.6953125, + "kl_div": -0.17243990302085876, + "kl_div_neg": -0.3653371036052704, + "kl_div_sft": 0.020457301288843155, + "learning_rate": 7.226753670473083e-07, + "loss": -0.0483, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07426360249519348, + "step": 794 + }, + { + "epoch": 1.93, + "grad_norm": 1.878562784822585, + "importance_ratio": 1.0390625, + "kl_div": 0.020934144034981728, + "kl_div_pos": 0.037678271532058716, + "kl_div_sft": 0.004190015606582165, + "learning_rate": 7.210440456769983e-07, + "loss": 0.0899, + "ppo_loss": -1.0383970737457275, + "sft_loss": 0.06750870496034622, + "step": 795 + }, + { + "epoch": 1.93, + "grad_norm": 1.4040822593305553, + "importance_ratio": 0.8046875, + "kl_div": -0.21546536684036255, + "kl_div_neg": -0.21546536684036255, + "learning_rate": 7.194127243066884e-07, + "loss": 0.0605, + "ppo_loss": 0.832324206829071, + "step": 796 + }, + { + "epoch": 1.93, + "grad_norm": 1.374617610125144, + "importance_ratio": 1.03125, + "kl_div": 0.01844792440533638, + "kl_div_pos": 0.03335430100560188, + "kl_div_sft": 0.003541549202054739, + "learning_rate": 7.177814029363784e-07, + "loss": -0.118, + "ppo_loss": -1.033916711807251, + "sft_loss": 0.046227868646383286, + "step": 797 + }, + { + "epoch": 1.93, + "grad_norm": 1.3155559204522764, + "kl_div": 0.003813364077359438, + "kl_div_sft": 0.003813364077359438, + "learning_rate": 7.161500815660685e-07, + "loss": -0.1129, + "sft_loss": 0.09659446775913239, + "step": 798 + }, + { + "epoch": 1.94, + "grad_norm": 1.0441580859778266, + "kl_div": -0.0024976865388453007, + "kl_div_sft": -0.0024976865388453007, + "learning_rate": 7.145187601957585e-07, + "loss": -0.0499, + "sft_loss": 0.06166272237896919, + "step": 799 + }, + { + "epoch": 1.94, + "grad_norm": 1.4281665654049773, + "kl_div": 0.012549598701298237, + "kl_div_sft": 0.012549598701298237, + "learning_rate": 7.128874388254486e-07, + "loss": 0.1386, + "sft_loss": 0.07515973597764969, + "step": 800 + }, + { + "epoch": 1.94, + "grad_norm": 3.9301131455924017, + "kl_div": 0.012901881709694862, + "kl_div_sft": 0.012901881709694862, + "learning_rate": 7.112561174551386e-07, + "loss": -0.1478, + "sft_loss": 0.04293525964021683, + "step": 801 + }, + { + "epoch": 1.94, + "grad_norm": 1.1873570406340965, + "importance_ratio": 0.79296875, + "kl_div": -0.24081431329250336, + "kl_div_neg": -0.24081431329250336, + "learning_rate": 7.096247960848288e-07, + "loss": -0.069, + "ppo_loss": 0.8468914031982422, + "step": 802 + }, + { + "epoch": 1.95, + "grad_norm": 1.5981748900311181, + "kl_div": 0.013554854318499565, + "kl_div_sft": 0.013554854318499565, + "learning_rate": 7.079934747145187e-07, + "loss": -0.0581, + "sft_loss": 0.08860177546739578, + "step": 803 + }, + { + "epoch": 1.95, + "grad_norm": 1.1497190422507106, + "importance_ratio": 1.0078125, + "kl_div": -0.0008298805914819241, + "kl_div_pos": 0.009881866164505482, + "kl_div_sft": -0.01154162734746933, + "learning_rate": 7.063621533442088e-07, + "loss": 0.099, + "ppo_loss": -1.0099308490753174, + "sft_loss": 0.12553054094314575, + "step": 804 + }, + { + "epoch": 1.95, + "grad_norm": 2.7418610902932983, + "importance_ratio": 0.7734375, + "kl_div": -0.1349291205406189, + "kl_div_neg": -0.2556372880935669, + "kl_div_sft": -0.014220948331058025, + "learning_rate": 7.047308319738988e-07, + "loss": 0.1, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08331334590911865, + "step": 805 + }, + { + "epoch": 1.95, + "grad_norm": 1.2000722426285284, + "kl_div": 0.011318661272525787, + "kl_div_sft": 0.011318661272525787, + "learning_rate": 7.030995106035888e-07, + "loss": 0.1115, + "sft_loss": 0.08323749154806137, + "step": 806 + }, + { + "epoch": 1.96, + "grad_norm": 1.2451685995819552, + "kl_div": -0.004958365112543106, + "kl_div_sft": -0.004958365112543106, + "learning_rate": 7.014681892332789e-07, + "loss": 0.1006, + "sft_loss": 0.12090334296226501, + "step": 807 + }, + { + "epoch": 1.96, + "grad_norm": 3.2934260113091613, + "importance_ratio": 1.03125, + "kl_div": 0.024032320827245712, + "kl_div_pos": 0.03315795585513115, + "kl_div_sft": 0.014906683936715126, + "learning_rate": 6.99836867862969e-07, + "loss": -0.0971, + "ppo_loss": -1.0337138175964355, + "sft_loss": 0.01600171998143196, + "step": 808 + }, + { + "epoch": 1.96, + "grad_norm": 1.2325395385868936, + "kl_div": 0.013921285048127174, + "kl_div_sft": 0.013921285048127174, + "learning_rate": 6.982055464926591e-07, + "loss": 0.0345, + "sft_loss": 0.09161141514778137, + "step": 809 + }, + { + "epoch": 1.96, + "grad_norm": 2.126775224460074, + "kl_div": 0.0037957075983285904, + "kl_div_sft": 0.0037957075983285904, + "learning_rate": 6.965742251223491e-07, + "loss": 0.083, + "sft_loss": 0.030382830649614334, + "step": 810 + }, + { + "epoch": 1.97, + "grad_norm": 1.2775065933624745, + "importance_ratio": 0.68359375, + "kl_div": -0.1811116635799408, + "kl_div_neg": -0.37784725427627563, + "kl_div_sft": 0.015623941086232662, + "learning_rate": 6.949429037520392e-07, + "loss": 0.0044, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.032511260360479355, + "step": 811 + }, + { + "epoch": 1.97, + "grad_norm": 1.1472604129972903, + "importance_ratio": 0.9140625, + "kl_div": -0.09489559382200241, + "kl_div_pos": -0.09489559382200241, + "learning_rate": 6.933115823817291e-07, + "loss": -0.0312, + "ppo_loss": -0.914150595664978, + "step": 812 + }, + { + "epoch": 1.97, + "grad_norm": 1.2691938215937906, + "importance_ratio": 0.59375, + "kl_div": -0.2605888545513153, + "kl_div_neg": -0.5227549076080322, + "kl_div_sft": 0.0015771690523251891, + "learning_rate": 6.916802610114192e-07, + "loss": -0.075, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04791930317878723, + "step": 813 + }, + { + "epoch": 1.97, + "grad_norm": 1.3854534125175522, + "kl_div": 0.010630996897816658, + "kl_div_sft": 0.010630996897816658, + "learning_rate": 6.900489396411092e-07, + "loss": -0.0741, + "sft_loss": 0.044867224991321564, + "step": 814 + }, + { + "epoch": 1.98, + "grad_norm": 1.0842829112765733, + "importance_ratio": 0.953125, + "kl_div": -0.055763162672519684, + "kl_div_neg": -0.18750636279582977, + "kl_div_pos": 0.0759800374507904, + "learning_rate": 6.884176182707994e-07, + "loss": -0.1645, + "ppo_loss": -0.12495854496955872, + "step": 815 + }, + { + "epoch": 1.98, + "grad_norm": 1.3370248039529775, + "importance_ratio": 0.88671875, + "kl_div": -0.05268274247646332, + "kl_div_neg": -0.1205388605594635, + "kl_div_sft": 0.015173375606536865, + "learning_rate": 6.867862969004894e-07, + "loss": 0.0088, + "ppo_loss": 0.8864426612854004, + "sft_loss": 0.10430143773555756, + "step": 816 + }, + { + "epoch": 1.98, + "grad_norm": 2.836920785813892, + "importance_ratio": 0.70703125, + "kl_div": -0.16925114393234253, + "kl_div_pos": -0.34709298610687256, + "kl_div_sft": 0.008590701036155224, + "learning_rate": 6.851549755301794e-07, + "loss": -0.0516, + "ppo_loss": -0.7067395448684692, + "sft_loss": 0.0393403097987175, + "step": 817 + }, + { + "epoch": 1.98, + "grad_norm": 1.054002044599747, + "importance_ratio": 1.0546875, + "kl_div": 0.03092723712325096, + "kl_div_pos": 0.05130693316459656, + "kl_div_sft": 0.010547542944550514, + "learning_rate": 6.835236541598695e-07, + "loss": -0.1042, + "ppo_loss": -1.0526460409164429, + "sft_loss": 0.05216517299413681, + "step": 818 + }, + { + "epoch": 1.99, + "grad_norm": 1.1683901533729915, + "importance_ratio": 1.0546875, + "kl_div": 0.023352043703198433, + "kl_div_pos": 0.05334731191396713, + "kl_div_sft": -0.006643224041908979, + "learning_rate": 6.818923327895594e-07, + "loss": -0.036, + "ppo_loss": -1.0547959804534912, + "sft_loss": 0.057002827525138855, + "step": 819 + }, + { + "epoch": 1.99, + "grad_norm": 1.551231491876032, + "importance_ratio": 1.015625, + "kl_div": 0.014218551106750965, + "kl_div_pos": 0.01491499226540327, + "kl_div_sft": 0.01352210994809866, + "learning_rate": 6.802610114192495e-07, + "loss": -0.0743, + "ppo_loss": -1.0150268077850342, + "sft_loss": 0.027273595333099365, + "step": 820 + }, + { + "epoch": 1.99, + "grad_norm": 1.755738115207708, + "importance_ratio": 1.0234375, + "kl_div": 0.020510466769337654, + "kl_div_pos": 0.020663322880864143, + "kl_div_sft": 0.020357610657811165, + "learning_rate": 6.786296900489396e-07, + "loss": -0.0518, + "ppo_loss": -1.0208783149719238, + "sft_loss": 0.09010978788137436, + "step": 821 + }, + { + "epoch": 1.99, + "grad_norm": 2.787474963156988, + "importance_ratio": 0.70703125, + "kl_div": -0.16490840911865234, + "kl_div_neg": -0.3457406163215637, + "kl_div_sft": 0.015923811122775078, + "learning_rate": 6.769983686786297e-07, + "loss": 0.0045, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040341440588235855, + "step": 822 + }, + { + "epoch": 2.0, + "grad_norm": 2.7430841876331677, + "importance_ratio": 0.80078125, + "kl_div": -0.10109415650367737, + "kl_div_neg": -0.22379928827285767, + "kl_div_sft": 0.021610967814922333, + "learning_rate": 6.753670473083197e-07, + "loss": -0.0856, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.10019398480653763, + "step": 823 + }, + { + "epoch": 2.0, + "grad_norm": 1.2497369012514388, + "importance_ratio": 0.9921875, + "kl_div": -0.00417862506583333, + "kl_div_pos": -0.007548276800662279, + "kl_div_sft": -0.0008089732145890594, + "learning_rate": 6.737357259380098e-07, + "loss": 0.047, + "ppo_loss": -0.9924801588058472, + "sft_loss": 0.08204149454832077, + "step": 824 + }, + { + "epoch": 2.0, + "grad_norm": 3.340571381127005, + "importance_ratio": 1.0390625, + "kl_div": 0.02399800717830658, + "kl_div_pos": 0.037386514246463776, + "kl_div_sft": 0.010609501041471958, + "learning_rate": 6.721044045676998e-07, + "loss": -0.1545, + "ppo_loss": -1.038094162940979, + "sft_loss": 0.014390765689313412, + "step": 825 + }, + { + "epoch": 2.0, + "grad_norm": 0.7295409649700262, + "kl_div": 0.012158479541540146, + "kl_div_sft": 0.012158479541540146, + "learning_rate": 6.704730831973899e-07, + "loss": -0.0283, + "sft_loss": 0.0906473770737648, + "step": 826 + }, + { + "epoch": 2.0, + "grad_norm": 1.7190898212535601, + "importance_ratio": 0.796875, + "kl_div": -0.1135992556810379, + "kl_div_neg": -0.22905249893665314, + "kl_div_sft": 0.001853986643254757, + "learning_rate": 6.688417618270798e-07, + "loss": -0.0515, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.050841983407735825, + "step": 827 + }, + { + "epoch": 2.01, + "grad_norm": 0.6960819573051047, + "importance_ratio": 0.859375, + "kl_div": -0.1718362420797348, + "kl_div_neg": -0.3597458004951477, + "kl_div_pos": 0.01607331819832325, + "learning_rate": 6.6721044045677e-07, + "loss": -0.1438, + "ppo_loss": -0.10810157656669617, + "step": 828 + }, + { + "epoch": 2.01, + "grad_norm": 0.4588395559847598, + "importance_ratio": 0.91796875, + "kl_div": -0.0542854405939579, + "kl_div_pos": -0.08652933686971664, + "kl_div_sft": -0.02204154245555401, + "learning_rate": 6.6557911908646e-07, + "loss": -0.0033, + "ppo_loss": -0.9171086549758911, + "sft_loss": 0.13468027114868164, + "step": 829 + }, + { + "epoch": 2.01, + "grad_norm": 0.8429747304624693, + "kl_div": 0.04222976416349411, + "kl_div_sft": 0.04222976416349411, + "learning_rate": 6.6394779771615e-07, + "loss": -0.132, + "sft_loss": 0.08493160456418991, + "step": 830 + }, + { + "epoch": 2.01, + "grad_norm": 0.7845448253120335, + "importance_ratio": 0.7109375, + "kl_div": -0.3473590612411499, + "kl_div_neg": -0.3473590612411499, + "learning_rate": 6.623164763458401e-07, + "loss": 0.0421, + "ppo_loss": 0.800000011920929, + "step": 831 + }, + { + "epoch": 2.02, + "grad_norm": 0.6056690500023906, + "importance_ratio": 1.0859375, + "kl_div": 0.05032026022672653, + "kl_div_pos": 0.08291984349489212, + "kl_div_sft": 0.017720678821206093, + "learning_rate": 6.606851549755301e-07, + "loss": -0.258, + "ppo_loss": -1.0864547491073608, + "sft_loss": 0.01477021537721157, + "step": 832 + }, + { + "epoch": 2.02, + "grad_norm": 0.9148086045939298, + "importance_ratio": 0.671875, + "kl_div": -0.19181066751480103, + "kl_div_neg": -0.3996953070163727, + "kl_div_sft": 0.016073964536190033, + "learning_rate": 6.590538336052202e-07, + "loss": 0.0231, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0960092768073082, + "step": 833 + }, + { + "epoch": 2.02, + "grad_norm": 0.6604697481919004, + "importance_ratio": 1.046875, + "kl_div": 0.017061958089470863, + "kl_div_pos": 0.04899755120277405, + "kl_div_sft": -0.014873635023832321, + "learning_rate": 6.574225122349103e-07, + "loss": -0.1018, + "ppo_loss": -1.0502177476882935, + "sft_loss": 0.09160658717155457, + "step": 834 + }, + { + "epoch": 2.02, + "grad_norm": 0.6492688009384401, + "importance_ratio": 0.80078125, + "kl_div": -0.09790018200874329, + "kl_div_neg": -0.22408372163772583, + "kl_div_sft": 0.02828335016965866, + "learning_rate": 6.557911908646004e-07, + "loss": 0.0167, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07054585963487625, + "step": 835 + }, + { + "epoch": 2.03, + "grad_norm": 0.5942027223177359, + "importance_ratio": 1.046875, + "kl_div": 0.029894528910517693, + "kl_div_pos": 0.04286080598831177, + "kl_div_sft": 0.016928251832723618, + "learning_rate": 6.541598694942903e-07, + "loss": -0.0582, + "ppo_loss": -1.0437926054000854, + "sft_loss": 0.03479551151394844, + "step": 836 + }, + { + "epoch": 2.03, + "grad_norm": 0.9763045313572033, + "importance_ratio": 0.7578125, + "kl_div": -0.27878081798553467, + "kl_div_neg": -0.27878081798553467, + "learning_rate": 6.525285481239804e-07, + "loss": -0.0547, + "ppo_loss": 0.800000011920929, + "step": 837 + }, + { + "epoch": 2.03, + "grad_norm": 1.6743166198463337, + "importance_ratio": 0.6640625, + "kl_div": -0.19439943134784698, + "kl_div_pos": -0.41146811842918396, + "kl_div_sft": 0.022669266909360886, + "learning_rate": 6.508972267536704e-07, + "loss": -0.0292, + "ppo_loss": -0.6626766324043274, + "sft_loss": 0.07602635025978088, + "step": 838 + }, + { + "epoch": 2.03, + "grad_norm": 0.658178945940582, + "importance_ratio": 0.93359375, + "kl_div": -0.07545746117830276, + "kl_div_neg": -0.1793633997440338, + "kl_div_pos": 0.028448481112718582, + "learning_rate": 6.492659053833605e-07, + "loss": -0.0082, + "ppo_loss": -0.09652745723724365, + "step": 839 + }, + { + "epoch": 2.04, + "grad_norm": 0.7360289595804811, + "importance_ratio": 0.6953125, + "kl_div": -0.36604252457618713, + "kl_div_neg": -0.36604252457618713, + "learning_rate": 6.476345840130505e-07, + "loss": -0.0225, + "ppo_loss": 0.800000011920929, + "step": 840 + }, + { + "epoch": 2.04, + "grad_norm": 0.8158893317648063, + "kl_div": 0.019132403656840324, + "kl_div_sft": 0.019132403656840324, + "learning_rate": 6.460032626427406e-07, + "loss": -0.1055, + "sft_loss": 0.06531329452991486, + "step": 841 + }, + { + "epoch": 2.04, + "grad_norm": 0.5638392717343974, + "importance_ratio": 1.0390625, + "kl_div": -0.001689983531832695, + "kl_div_pos": 0.03776420280337334, + "kl_div_sft": -0.04114416986703873, + "learning_rate": 6.443719412724307e-07, + "loss": -0.0369, + "ppo_loss": -1.038486361503601, + "sft_loss": 0.09056337922811508, + "step": 842 + }, + { + "epoch": 2.04, + "grad_norm": 0.8054468220372359, + "importance_ratio": 0.98828125, + "kl_div": -0.001824448350816965, + "kl_div_pos": -0.012244836427271366, + "kl_div_sft": 0.008595939725637436, + "learning_rate": 6.427406199021207e-07, + "loss": -0.0229, + "ppo_loss": -0.9878298044204712, + "sft_loss": 0.0815005898475647, + "step": 843 + }, + { + "epoch": 2.05, + "grad_norm": 2.3269584856776846, + "importance_ratio": 0.875, + "kl_div": -0.1502387821674347, + "kl_div_neg": -0.3389727473258972, + "kl_div_pos": 0.03849519044160843, + "learning_rate": 6.411092985318107e-07, + "loss": -0.1731, + "ppo_loss": -0.1196228563785553, + "step": 844 + }, + { + "epoch": 2.05, + "grad_norm": 0.8850740817147107, + "kl_div": -0.005501694977283478, + "kl_div_sft": -0.005501694977283478, + "learning_rate": 6.394779771615007e-07, + "loss": -0.1446, + "sft_loss": 0.06264421343803406, + "step": 845 + }, + { + "epoch": 2.05, + "grad_norm": 1.9384037494196507, + "importance_ratio": 0.76171875, + "kl_div": -0.12875968217849731, + "kl_div_neg": -0.2738000452518463, + "kl_div_sft": 0.016280686482787132, + "learning_rate": 6.378466557911908e-07, + "loss": 0.0103, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.12097492069005966, + "step": 846 + }, + { + "epoch": 2.05, + "grad_norm": 0.83877406641491, + "importance_ratio": 1.0078125, + "kl_div": 0.003864092053845525, + "kl_div_pos": 0.009729886427521706, + "kl_div_sft": -0.002001702319830656, + "learning_rate": 6.362153344208809e-07, + "loss": 0.0018, + "ppo_loss": -1.0097774267196655, + "sft_loss": 0.09012917429208755, + "step": 847 + }, + { + "epoch": 2.06, + "grad_norm": 0.6029278754233961, + "importance_ratio": 0.87890625, + "kl_div": -0.1441148966550827, + "kl_div_neg": -0.30787554383277893, + "kl_div_pos": 0.01964573748409748, + "learning_rate": 6.34584013050571e-07, + "loss": 0.0073, + "ppo_loss": -0.10991999506950378, + "step": 848 + }, + { + "epoch": 2.06, + "grad_norm": 1.97101033352137, + "kl_div": 0.005903508514165878, + "kl_div_sft": 0.005903508514165878, + "learning_rate": 6.32952691680261e-07, + "loss": -0.0246, + "sft_loss": 0.029841091483831406, + "step": 849 + }, + { + "epoch": 2.06, + "grad_norm": 2.246006176112066, + "kl_div": 0.017790446057915688, + "kl_div_sft": 0.017790446057915688, + "learning_rate": 6.313213703099511e-07, + "loss": -0.1384, + "sft_loss": 0.023750465363264084, + "step": 850 + }, + { + "epoch": 2.06, + "grad_norm": 3.720825898154855, + "importance_ratio": 0.890625, + "kl_div": -0.13176052272319794, + "kl_div_neg": -0.2837873101234436, + "kl_div_pos": 0.020266273990273476, + "learning_rate": 6.296900489396411e-07, + "loss": -0.0339, + "ppo_loss": -0.11023649573326111, + "step": 851 + }, + { + "epoch": 2.07, + "grad_norm": 0.805438607777615, + "kl_div": 0.014838488772511482, + "kl_div_sft": 0.014838488772511482, + "learning_rate": 6.28058727569331e-07, + "loss": -0.1113, + "sft_loss": 0.054096221923828125, + "step": 852 + }, + { + "epoch": 2.07, + "grad_norm": 1.6791321302069588, + "importance_ratio": 0.8515625, + "kl_div": -0.17815786600112915, + "kl_div_neg": -0.3900858759880066, + "kl_div_pos": 0.03377014398574829, + "learning_rate": 6.264274061990211e-07, + "loss": -0.0931, + "ppo_loss": -0.11717340350151062, + "step": 853 + }, + { + "epoch": 2.07, + "grad_norm": 1.4237608926438445, + "importance_ratio": 0.7265625, + "kl_div": -0.3291565179824829, + "kl_div_neg": -0.3291565179824829, + "learning_rate": 6.247960848287112e-07, + "loss": -0.0117, + "ppo_loss": 0.8069021701812744, + "step": 854 + }, + { + "epoch": 2.07, + "grad_norm": 1.0107552200435455, + "importance_ratio": 1.0390625, + "kl_div": 0.028783971443772316, + "kl_div_pos": 0.03633904084563255, + "kl_div_sft": 0.02122890204191208, + "learning_rate": 6.231647634584013e-07, + "loss": -0.1386, + "ppo_loss": -1.0370073318481445, + "sft_loss": 0.021868525072932243, + "step": 855 + }, + { + "epoch": 2.08, + "grad_norm": 0.5036693873875817, + "importance_ratio": 0.6015625, + "kl_div": -0.5148600935935974, + "kl_div_neg": -0.5148600935935974, + "learning_rate": 6.215334420880913e-07, + "loss": -0.0459, + "ppo_loss": 0.800000011920929, + "step": 856 + }, + { + "epoch": 2.08, + "grad_norm": 1.3641401061259197, + "importance_ratio": 1.046875, + "kl_div": 0.0433039590716362, + "kl_div_pos": 0.0433039590716362, + "learning_rate": 6.199021207177814e-07, + "loss": -0.0362, + "ppo_loss": -1.0442602634429932, + "step": 857 + }, + { + "epoch": 2.08, + "grad_norm": 0.6578019728010779, + "importance_ratio": 1.0234375, + "kl_div": 0.025171402841806412, + "kl_div_pos": 0.022367173805832863, + "kl_div_sft": 0.02797563001513481, + "learning_rate": 6.182707993474714e-07, + "loss": -0.0778, + "ppo_loss": -1.0226192474365234, + "sft_loss": 0.03281998261809349, + "step": 858 + }, + { + "epoch": 2.08, + "grad_norm": 0.9822804404163674, + "kl_div": 0.012662078253924847, + "kl_div_sft": 0.012662078253924847, + "learning_rate": 6.166394779771615e-07, + "loss": -0.1125, + "sft_loss": 0.03483930230140686, + "step": 859 + }, + { + "epoch": 2.08, + "grad_norm": 1.866113649204854, + "importance_ratio": 0.53125, + "kl_div": -0.30700746178627014, + "kl_div_neg": -0.6296558380126953, + "kl_div_sft": 0.015640929341316223, + "learning_rate": 6.150081566068515e-07, + "loss": -0.1434, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03329959884285927, + "step": 860 + }, + { + "epoch": 2.09, + "grad_norm": 1.8112362369201367, + "importance_ratio": 0.8125, + "kl_div": -0.10016106814146042, + "kl_div_neg": -0.20947012305259705, + "kl_div_sft": 0.009147980250418186, + "learning_rate": 6.133768352365416e-07, + "loss": 0.0362, + "ppo_loss": 0.81101393699646, + "sft_loss": 0.030194612219929695, + "step": 861 + }, + { + "epoch": 2.09, + "grad_norm": 0.9537721531846218, + "importance_ratio": 0.96875, + "kl_div": -0.029792316257953644, + "kl_div_pos": -0.029792316257953644, + "learning_rate": 6.117455138662316e-07, + "loss": -0.1717, + "ppo_loss": -0.9717547297477722, + "step": 862 + }, + { + "epoch": 2.09, + "grad_norm": 0.9043738907053905, + "kl_div": -0.06411665678024292, + "kl_div_sft": -0.06411665678024292, + "learning_rate": 6.101141924959217e-07, + "loss": -0.1035, + "sft_loss": 0.2276521474123001, + "step": 863 + }, + { + "epoch": 2.09, + "grad_norm": 2.0284484323251943, + "importance_ratio": 0.71875, + "kl_div": -0.3353831470012665, + "kl_div_neg": -0.3353831470012665, + "learning_rate": 6.084828711256117e-07, + "loss": 0.0273, + "ppo_loss": 0.800000011920929, + "step": 864 + }, + { + "epoch": 2.1, + "grad_norm": 0.8652953391200562, + "importance_ratio": 1.0625, + "kl_div": 0.03596498817205429, + "kl_div_pos": 0.057203471660614014, + "kl_div_sft": 0.014726504683494568, + "learning_rate": 6.068515497553017e-07, + "loss": -0.0778, + "ppo_loss": -1.0588712692260742, + "sft_loss": 0.03289495408535004, + "step": 865 + }, + { + "epoch": 2.1, + "grad_norm": 0.8463662171117285, + "importance_ratio": 0.7578125, + "kl_div": -0.27933812141418457, + "kl_div_neg": -0.27933812141418457, + "learning_rate": 6.052202283849919e-07, + "loss": -0.0995, + "ppo_loss": 0.800000011920929, + "step": 866 + }, + { + "epoch": 2.1, + "grad_norm": 0.48484302797303697, + "importance_ratio": 0.6953125, + "kl_div": -0.3646080493927002, + "kl_div_neg": -0.3646080493927002, + "learning_rate": 6.035889070146819e-07, + "loss": 0.0461, + "ppo_loss": 0.800000011920929, + "step": 867 + }, + { + "epoch": 2.1, + "grad_norm": 0.7968388717539745, + "importance_ratio": 0.82421875, + "kl_div": -0.21529999375343323, + "kl_div_neg": -0.4309925436973572, + "kl_div_pos": 0.0003925491473637521, + "learning_rate": 6.01957585644372e-07, + "loss": 0.0053, + "ppo_loss": -0.10019633173942566, + "step": 868 + }, + { + "epoch": 2.11, + "grad_norm": 0.8793106844007031, + "kl_div": 0.017604706808924675, + "kl_div_sft": 0.017604706808924675, + "learning_rate": 6.003262642740619e-07, + "loss": -0.0568, + "sft_loss": 0.05003702640533447, + "step": 869 + }, + { + "epoch": 2.11, + "grad_norm": 0.5997317807386668, + "kl_div": 0.013879947364330292, + "kl_div_sft": 0.013879947364330292, + "learning_rate": 5.98694942903752e-07, + "loss": 0.1225, + "sft_loss": 0.043585263192653656, + "step": 870 + }, + { + "epoch": 2.11, + "grad_norm": 0.7998335277681405, + "importance_ratio": 1.03125, + "kl_div": 0.02605769783258438, + "kl_div_pos": 0.027240796014666557, + "kl_div_sft": 0.024874601513147354, + "learning_rate": 5.97063621533442e-07, + "loss": -0.0505, + "ppo_loss": -1.0276151895523071, + "sft_loss": 0.031028786674141884, + "step": 871 + }, + { + "epoch": 2.11, + "grad_norm": 1.1196540333939047, + "kl_div": 0.00864154938608408, + "kl_div_sft": 0.00864154938608408, + "learning_rate": 5.954323001631321e-07, + "loss": -0.1298, + "sft_loss": 0.04518849402666092, + "step": 872 + }, + { + "epoch": 2.12, + "grad_norm": 1.0285343479954419, + "importance_ratio": 0.890625, + "kl_div": -0.13256366550922394, + "kl_div_neg": -0.31670084595680237, + "kl_div_pos": 0.05157352238893509, + "learning_rate": 5.938009787928222e-07, + "loss": -0.053, + "ppo_loss": -0.12646332383155823, + "step": 873 + }, + { + "epoch": 2.12, + "grad_norm": 1.8676648666960494, + "importance_ratio": 1.015625, + "kl_div": 0.00921483151614666, + "kl_div_pos": 0.00921483151614666, + "learning_rate": 5.921696574225123e-07, + "loss": -0.084, + "ppo_loss": -1.0095914602279663, + "step": 874 + }, + { + "epoch": 2.12, + "grad_norm": 1.6134732111411296, + "importance_ratio": 1.0390625, + "kl_div": 0.01806982234120369, + "kl_div_pos": 0.04134603589773178, + "kl_div_sft": -0.005206390284001827, + "learning_rate": 5.905383360522023e-07, + "loss": -0.0232, + "ppo_loss": -1.042212724685669, + "sft_loss": 0.03829171508550644, + "step": 875 + }, + { + "epoch": 2.12, + "grad_norm": 0.9257044378035186, + "importance_ratio": 0.86328125, + "kl_div": -0.08369235694408417, + "kl_div_neg": -0.14850135147571564, + "kl_div_sft": -0.018883369863033295, + "learning_rate": 5.889070146818924e-07, + "loss": 0.0204, + "ppo_loss": 0.8619988560676575, + "sft_loss": 0.10338804125785828, + "step": 876 + }, + { + "epoch": 2.13, + "grad_norm": 3.690614064302966, + "kl_div": 0.0038330499082803726, + "kl_div_sft": 0.0038330499082803726, + "learning_rate": 5.872756933115823e-07, + "loss": -0.2202, + "sft_loss": 0.03799348697066307, + "step": 877 + }, + { + "epoch": 2.13, + "grad_norm": 0.9830280049488119, + "kl_div": 0.004340812098234892, + "kl_div_sft": 0.004340812098234892, + "learning_rate": 5.856443719412723e-07, + "loss": -0.0818, + "sft_loss": 0.08931317925453186, + "step": 878 + }, + { + "epoch": 2.13, + "grad_norm": 0.8184240493164358, + "importance_ratio": 0.490234375, + "kl_div": -0.34959229826927185, + "kl_div_neg": -0.7119632363319397, + "kl_div_sft": 0.012778629548847675, + "learning_rate": 5.840130505709625e-07, + "loss": -0.0543, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029221320524811745, + "step": 879 + }, + { + "epoch": 2.13, + "grad_norm": 1.0711299252957196, + "importance_ratio": 0.625, + "kl_div": -0.25115224719047546, + "kl_div_neg": -0.4688381254673004, + "kl_div_sft": -0.0334663949906826, + "learning_rate": 5.823817292006525e-07, + "loss": -0.0823, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08490065485239029, + "step": 880 + }, + { + "epoch": 2.14, + "grad_norm": 0.6690764806748656, + "importance_ratio": 1.078125, + "kl_div": 0.04035523161292076, + "kl_div_pos": 0.073147714138031, + "kl_div_sft": 0.007562749553471804, + "learning_rate": 5.807504078303426e-07, + "loss": -0.0222, + "ppo_loss": -1.0758894681930542, + "sft_loss": 0.041387416422367096, + "step": 881 + }, + { + "epoch": 2.14, + "grad_norm": 0.5771412340673167, + "importance_ratio": 1.0390625, + "kl_div": 0.016257621347904205, + "kl_div_pos": 0.03585897013545036, + "kl_div_sft": -0.003343727672472596, + "learning_rate": 5.791190864600326e-07, + "loss": -0.0872, + "ppo_loss": -1.03650963306427, + "sft_loss": 0.07778169214725494, + "step": 882 + }, + { + "epoch": 2.14, + "grad_norm": 0.639651279588748, + "importance_ratio": 0.7109375, + "kl_div": -0.3393230438232422, + "kl_div_neg": -0.3393230438232422, + "learning_rate": 5.774877650897227e-07, + "loss": -0.0315, + "ppo_loss": 0.800000011920929, + "step": 883 + }, + { + "epoch": 2.14, + "grad_norm": 0.7186821615298014, + "importance_ratio": 1.0625, + "kl_div": 0.056774161756038666, + "kl_div_pos": 0.056774161756038666, + "learning_rate": 5.758564437194126e-07, + "loss": -0.0984, + "ppo_loss": -1.0584173202514648, + "step": 884 + }, + { + "epoch": 2.15, + "grad_norm": 0.7896946178578934, + "importance_ratio": 0.91796875, + "kl_div": -0.029687166213989258, + "kl_div_neg": -0.08769184350967407, + "kl_div_sft": 0.028317509219050407, + "learning_rate": 5.742251223491027e-07, + "loss": 0.0452, + "ppo_loss": 0.9160431027412415, + "sft_loss": 0.0370275042951107, + "step": 885 + }, + { + "epoch": 2.15, + "grad_norm": 0.5964225552143704, + "kl_div": -0.007856599055230618, + "kl_div_sft": -0.007856599055230618, + "learning_rate": 5.725938009787928e-07, + "loss": 0.0325, + "sft_loss": 0.09961707890033722, + "step": 886 + }, + { + "epoch": 2.15, + "grad_norm": 2.6939401864801154, + "importance_ratio": 0.63671875, + "kl_div": -0.22561708092689514, + "kl_div_neg": -0.4499557614326477, + "kl_div_sft": -0.0012784095015376806, + "learning_rate": 5.709624796084829e-07, + "loss": -0.0235, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.061579037457704544, + "step": 887 + }, + { + "epoch": 2.15, + "grad_norm": 0.7136453056713985, + "importance_ratio": 0.67578125, + "kl_div": -0.3997284173965454, + "kl_div_neg": -0.3997284173965454, + "learning_rate": 5.693311582381729e-07, + "loss": 0.029, + "ppo_loss": 0.800000011920929, + "step": 888 + }, + { + "epoch": 2.16, + "grad_norm": 0.5778132706549846, + "kl_div": 0.007670292165130377, + "kl_div_sft": 0.007670292165130377, + "learning_rate": 5.676998368678629e-07, + "loss": -0.0585, + "sft_loss": 0.04167616367340088, + "step": 889 + }, + { + "epoch": 2.16, + "grad_norm": 1.6287109111507299, + "kl_div": 0.012486516498029232, + "kl_div_sft": 0.012486516498029232, + "learning_rate": 5.66068515497553e-07, + "loss": -0.0287, + "sft_loss": 0.10539884120225906, + "step": 890 + }, + { + "epoch": 2.16, + "grad_norm": 0.5478193032189899, + "importance_ratio": 1.0859375, + "kl_div": 0.02673727460205555, + "kl_div_pos": 0.08578943461179733, + "kl_div_sft": -0.032314885407686234, + "learning_rate": 5.64437194127243e-07, + "loss": -0.0571, + "ppo_loss": -1.0895768404006958, + "sft_loss": 0.1281270980834961, + "step": 891 + }, + { + "epoch": 2.16, + "grad_norm": 0.8703841667870453, + "importance_ratio": 0.62890625, + "kl_div": -0.22016043961048126, + "kl_div_neg": -0.46085503697395325, + "kl_div_sft": 0.020534159615635872, + "learning_rate": 5.628058727569332e-07, + "loss": -0.1792, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0323028638958931, + "step": 892 + }, + { + "epoch": 2.16, + "grad_norm": 3.2286924116468385, + "importance_ratio": 1.125, + "kl_div": 0.06784869730472565, + "kl_div_pos": 0.11585932970046997, + "kl_div_sft": 0.01983807049691677, + "learning_rate": 5.611745513866231e-07, + "loss": -0.1086, + "ppo_loss": -1.1228379011154175, + "sft_loss": 0.03183257207274437, + "step": 893 + }, + { + "epoch": 2.17, + "grad_norm": 0.5008777424275753, + "importance_ratio": 1.0546875, + "kl_div": 0.03559987619519234, + "kl_div_pos": 0.05102997273206711, + "kl_div_sft": 0.020169777795672417, + "learning_rate": 5.595432300163132e-07, + "loss": -0.0837, + "ppo_loss": -1.0523544549942017, + "sft_loss": 0.049543678760528564, + "step": 894 + }, + { + "epoch": 2.17, + "grad_norm": 1.9827208345235616, + "importance_ratio": 0.734375, + "kl_div": -0.31260108947753906, + "kl_div_neg": -0.31260108947753906, + "learning_rate": 5.579119086460032e-07, + "loss": -0.0038, + "ppo_loss": 0.800000011920929, + "step": 895 + }, + { + "epoch": 2.17, + "grad_norm": 0.6935491812510783, + "importance_ratio": 0.765625, + "kl_div": -0.1380336731672287, + "kl_div_neg": -0.2684652507305145, + "kl_div_sft": -0.007602086756378412, + "learning_rate": 5.562805872756933e-07, + "loss": -0.0853, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06231473386287689, + "step": 896 + }, + { + "epoch": 2.17, + "grad_norm": 0.9387845457910657, + "kl_div": 0.025464002043008804, + "kl_div_sft": 0.025464002043008804, + "learning_rate": 5.546492659053833e-07, + "loss": -0.0815, + "sft_loss": 0.06399422883987427, + "step": 897 + }, + { + "epoch": 2.18, + "grad_norm": 0.6214152529656375, + "importance_ratio": 1.0234375, + "kl_div": 0.022999495267868042, + "kl_div_pos": 0.022714493796229362, + "kl_div_sft": 0.02328449860215187, + "learning_rate": 5.530179445350734e-07, + "loss": 0.1013, + "ppo_loss": -1.0229744911193848, + "sft_loss": 0.04827561601996422, + "step": 898 + }, + { + "epoch": 2.18, + "grad_norm": 0.8544614252275191, + "importance_ratio": 0.88671875, + "kl_div": -0.1337304562330246, + "kl_div_neg": -0.30951204895973206, + "kl_div_pos": 0.04205113649368286, + "learning_rate": 5.513866231647635e-07, + "loss": 0.0048, + "ppo_loss": -0.12147393822669983, + "step": 899 + }, + { + "epoch": 2.18, + "grad_norm": 2.1178801694981506, + "importance_ratio": 0.7734375, + "kl_div": -0.12312136590480804, + "kl_div_neg": -0.2545829713344574, + "kl_div_sft": 0.008340239524841309, + "learning_rate": 5.497553017944536e-07, + "loss": -0.0033, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06270673125982285, + "step": 900 + }, + { + "epoch": 2.18, + "grad_norm": 1.019347075565765, + "importance_ratio": 1.1328125, + "kl_div": 0.07899191230535507, + "kl_div_pos": 0.126931294798851, + "kl_div_sft": 0.03105252794921398, + "learning_rate": 5.481239804241435e-07, + "loss": -0.162, + "ppo_loss": -1.1353390216827393, + "sft_loss": 0.05711635947227478, + "step": 901 + }, + { + "epoch": 2.19, + "grad_norm": 0.5945166106057107, + "importance_ratio": 0.75390625, + "kl_div": -0.13973702490329742, + "kl_div_neg": -0.2828894853591919, + "kl_div_sft": 0.0034154406748712063, + "learning_rate": 5.464926590538335e-07, + "loss": -0.1883, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07432715594768524, + "step": 902 + }, + { + "epoch": 2.19, + "grad_norm": 1.8871254176644539, + "importance_ratio": 1.0234375, + "kl_div": 0.028098221868276596, + "kl_div_pos": 0.02675572969019413, + "kl_div_sft": 0.029440714046359062, + "learning_rate": 5.448613376835236e-07, + "loss": -0.0609, + "ppo_loss": -1.0271168947219849, + "sft_loss": 0.1034909188747406, + "step": 903 + }, + { + "epoch": 2.19, + "grad_norm": 0.9504609947007345, + "importance_ratio": 0.63671875, + "kl_div": -0.45120978355407715, + "kl_div_neg": -0.45120978355407715, + "learning_rate": 5.432300163132136e-07, + "loss": -0.1774, + "ppo_loss": 0.800000011920929, + "step": 904 + }, + { + "epoch": 2.19, + "grad_norm": 0.74247535343558, + "importance_ratio": 1.0703125, + "kl_div": 0.04448920488357544, + "kl_div_pos": 0.07105836272239685, + "kl_div_sft": 0.01792004518210888, + "learning_rate": 5.415986949429038e-07, + "loss": -0.2217, + "ppo_loss": -1.0736439228057861, + "sft_loss": 0.028560511767864227, + "step": 905 + }, + { + "epoch": 2.2, + "grad_norm": 1.131370415785853, + "importance_ratio": 0.6796875, + "kl_div": -0.38727688789367676, + "kl_div_neg": -0.38727688789367676, + "learning_rate": 5.399673735725938e-07, + "loss": 0.0092, + "ppo_loss": 0.800000011920929, + "step": 906 + }, + { + "epoch": 2.2, + "grad_norm": 0.9877402894496916, + "importance_ratio": 0.90625, + "kl_div": -0.11487214267253876, + "kl_div_neg": -0.27314162254333496, + "kl_div_pos": 0.043397340923547745, + "learning_rate": 5.383360522022839e-07, + "loss": -0.001, + "ppo_loss": -0.12217637896537781, + "step": 907 + }, + { + "epoch": 2.2, + "grad_norm": 1.3842466858928706, + "importance_ratio": 0.73828125, + "kl_div": -0.14413465559482574, + "kl_div_neg": -0.30117878317832947, + "kl_div_sft": 0.012909467332065105, + "learning_rate": 5.367047308319739e-07, + "loss": -0.0474, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06256598234176636, + "step": 908 + }, + { + "epoch": 2.2, + "grad_norm": 0.758970850079431, + "importance_ratio": 1.0703125, + "kl_div": 0.0683857724070549, + "kl_div_pos": 0.0683857724070549, + "learning_rate": 5.350734094616639e-07, + "loss": -0.2193, + "ppo_loss": -1.0707976818084717, + "step": 909 + }, + { + "epoch": 2.21, + "grad_norm": 1.1457802268928465, + "importance_ratio": 0.75390625, + "kl_div": -0.15916511416435242, + "kl_div_neg": -0.2836153507232666, + "kl_div_sft": -0.03471486642956734, + "learning_rate": 5.334420880913539e-07, + "loss": -0.0862, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08879987895488739, + "step": 910 + }, + { + "epoch": 2.21, + "grad_norm": 0.9460655554705513, + "kl_div": 0.021832624450325966, + "kl_div_sft": 0.021832624450325966, + "learning_rate": 5.31810766721044e-07, + "loss": -0.234, + "sft_loss": 0.039057567715644836, + "step": 911 + }, + { + "epoch": 2.21, + "grad_norm": 0.7130222146852393, + "importance_ratio": 0.671875, + "kl_div": -0.20167236030101776, + "kl_div_neg": -0.3981626033782959, + "kl_div_sft": -0.005182111170142889, + "learning_rate": 5.301794453507341e-07, + "loss": 0.0799, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.11809124052524567, + "step": 912 + }, + { + "epoch": 2.21, + "grad_norm": 1.105397953421926, + "importance_ratio": 1.03125, + "kl_div": 0.025202281773090363, + "kl_div_pos": 0.03070555254817009, + "kl_div_sft": 0.019699012860655785, + "learning_rate": 5.285481239804241e-07, + "loss": -0.0665, + "ppo_loss": -1.031181812286377, + "sft_loss": 0.005819185636937618, + "step": 913 + }, + { + "epoch": 2.22, + "grad_norm": 1.443877874218789, + "importance_ratio": 0.7109375, + "kl_div": -0.16923661530017853, + "kl_div_neg": -0.34361013770103455, + "kl_div_sft": 0.005136905238032341, + "learning_rate": 5.269168026101142e-07, + "loss": -0.0395, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03511352464556694, + "step": 914 + }, + { + "epoch": 2.22, + "grad_norm": 0.9870923999795622, + "importance_ratio": 0.65625, + "kl_div": -0.4261021018028259, + "kl_div_neg": -0.4261021018028259, + "learning_rate": 5.252854812398042e-07, + "loss": -0.2032, + "ppo_loss": 0.800000011920929, + "step": 915 + }, + { + "epoch": 2.22, + "grad_norm": 0.7466714429986411, + "kl_div": 0.016338294371962547, + "kl_div_sft": 0.016338294371962547, + "learning_rate": 5.236541598694943e-07, + "loss": 0.0584, + "sft_loss": 0.06320355087518692, + "step": 916 + }, + { + "epoch": 2.22, + "grad_norm": 0.8658166996738293, + "importance_ratio": 1.0703125, + "kl_div": 0.033583857119083405, + "kl_div_pos": 0.06586972624063492, + "kl_div_sft": 0.0012979895109310746, + "learning_rate": 5.220228384991842e-07, + "loss": 0.026, + "ppo_loss": -1.0680874586105347, + "sft_loss": 0.05195733532309532, + "step": 917 + }, + { + "epoch": 2.23, + "grad_norm": 1.9082970181248398, + "importance_ratio": 0.953125, + "kl_div": -0.05385783314704895, + "kl_div_neg": -0.15265466272830963, + "kl_div_pos": 0.04493899643421173, + "learning_rate": 5.203915171288744e-07, + "loss": -0.0781, + "ppo_loss": -0.09376892447471619, + "step": 918 + }, + { + "epoch": 2.23, + "grad_norm": 4.148648761319739, + "importance_ratio": 0.69921875, + "kl_div": -0.16899904608726501, + "kl_div_neg": -0.3573715388774872, + "kl_div_sft": 0.019373446702957153, + "learning_rate": 5.187601957585644e-07, + "loss": -0.0843, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07252917438745499, + "step": 919 + }, + { + "epoch": 2.23, + "grad_norm": 2.7364635229817815, + "importance_ratio": 0.8046875, + "kl_div": -0.240549236536026, + "kl_div_neg": -0.47470220923423767, + "kl_div_pos": -0.006396274082362652, + "learning_rate": 5.171288743882545e-07, + "loss": -0.1051, + "ppo_loss": -0.09681206941604614, + "step": 920 + }, + { + "epoch": 2.23, + "grad_norm": 0.8553304691120556, + "importance_ratio": 0.8828125, + "kl_div": -0.14235982298851013, + "kl_div_neg": -0.34726372361183167, + "kl_div_pos": 0.062544085085392, + "learning_rate": 5.154975530179445e-07, + "loss": 0.0691, + "ppo_loss": -0.1322707235813141, + "step": 921 + }, + { + "epoch": 2.24, + "grad_norm": 1.9997982877101181, + "importance_ratio": 1.046875, + "kl_div": 0.00859884824603796, + "kl_div_pos": 0.0443231463432312, + "kl_div_sft": -0.02712544985115528, + "learning_rate": 5.138662316476346e-07, + "loss": -0.0244, + "ppo_loss": -1.0453200340270996, + "sft_loss": 0.1878640204668045, + "step": 922 + }, + { + "epoch": 2.24, + "grad_norm": 0.9587369704573453, + "kl_div": 0.010656565427780151, + "kl_div_sft": 0.010656565427780151, + "learning_rate": 5.122349102773246e-07, + "loss": -0.0024, + "sft_loss": 0.04456965997815132, + "step": 923 + }, + { + "epoch": 2.24, + "grad_norm": 1.2664087011936005, + "kl_div": 0.010272424668073654, + "kl_div_sft": 0.010272424668073654, + "learning_rate": 5.106035889070146e-07, + "loss": 0.0762, + "sft_loss": 0.09669818729162216, + "step": 924 + }, + { + "epoch": 2.24, + "grad_norm": 4.297439815505914, + "importance_ratio": 0.58984375, + "kl_div": -0.25953978300094604, + "kl_div_neg": -0.5252647995948792, + "kl_div_sft": 0.006185252219438553, + "learning_rate": 5.089722675367047e-07, + "loss": -0.1525, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0307835154235363, + "step": 925 + }, + { + "epoch": 2.24, + "grad_norm": 1.709687285690838, + "importance_ratio": 0.76171875, + "kl_div": -0.1354270726442337, + "kl_div_neg": -0.2699092924594879, + "kl_div_sft": -0.0009448446216993034, + "learning_rate": 5.073409461663947e-07, + "loss": -0.1006, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03979117423295975, + "step": 926 + }, + { + "epoch": 2.25, + "grad_norm": 1.733235990407424, + "importance_ratio": 1.0625, + "kl_div": 0.06101357191801071, + "kl_div_pos": 0.06101357191801071, + "learning_rate": 5.057096247960848e-07, + "loss": -0.1685, + "ppo_loss": -1.0629152059555054, + "step": 927 + }, + { + "epoch": 2.25, + "grad_norm": 1.0768405616111052, + "importance_ratio": 0.8046875, + "kl_div": -0.1038859635591507, + "kl_div_neg": -0.21953067183494568, + "kl_div_sft": 0.011758743785321712, + "learning_rate": 5.040783034257748e-07, + "loss": -0.1842, + "ppo_loss": 0.8028955459594727, + "sft_loss": 0.02808184176683426, + "step": 928 + }, + { + "epoch": 2.25, + "grad_norm": 1.0042719193588434, + "importance_ratio": 0.734375, + "kl_div": -0.3099173307418823, + "kl_div_neg": -0.3099173307418823, + "learning_rate": 5.024469820554649e-07, + "loss": -0.1502, + "ppo_loss": 0.800000011920929, + "step": 929 + }, + { + "epoch": 2.25, + "grad_norm": 0.6970447222415244, + "importance_ratio": 0.58203125, + "kl_div": -0.25831127166748047, + "kl_div_neg": -0.5409226417541504, + "kl_div_sft": 0.024300090968608856, + "learning_rate": 5.008156606851549e-07, + "loss": -0.0752, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.057892248034477234, + "step": 930 + }, + { + "epoch": 2.26, + "grad_norm": 0.6302256516107879, + "importance_ratio": 0.65625, + "kl_div": -0.21493804454803467, + "kl_div_neg": -0.4216420352458954, + "kl_div_sft": -0.008234056644141674, + "learning_rate": 4.99184339314845e-07, + "loss": 0.0039, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.1816994994878769, + "step": 931 + }, + { + "epoch": 2.26, + "grad_norm": 0.9363894560663912, + "importance_ratio": 0.87890625, + "kl_div": -0.15596584975719452, + "kl_div_neg": -0.38737794756889343, + "kl_div_pos": 0.0754462480545044, + "learning_rate": 4.975530179445351e-07, + "loss": -0.1136, + "ppo_loss": -0.13918259739875793, + "step": 932 + }, + { + "epoch": 2.26, + "grad_norm": 0.9081506529385803, + "importance_ratio": 0.64453125, + "kl_div": -0.2149648517370224, + "kl_div_neg": -0.4411203861236572, + "kl_div_sft": 0.011190692894160748, + "learning_rate": 4.959216965742251e-07, + "loss": -0.065, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04278969392180443, + "step": 933 + }, + { + "epoch": 2.26, + "grad_norm": 1.1448285234307398, + "importance_ratio": 0.56640625, + "kl_div": -0.5740979909896851, + "kl_div_neg": -0.5740979909896851, + "learning_rate": 4.942903752039151e-07, + "loss": 0.0292, + "ppo_loss": 0.800000011920929, + "step": 934 + }, + { + "epoch": 2.27, + "grad_norm": 0.9074364492269864, + "importance_ratio": 0.6484375, + "kl_div": -0.22268356382846832, + "kl_div_neg": -0.43390703201293945, + "kl_div_sft": -0.011460098437964916, + "learning_rate": 4.926590538336052e-07, + "loss": -0.0228, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06586343795061111, + "step": 935 + }, + { + "epoch": 2.27, + "grad_norm": 0.9816613106724512, + "importance_ratio": 0.8828125, + "kl_div": -0.13887540996074677, + "kl_div_neg": -0.3009295165538788, + "kl_div_pos": 0.02317870408296585, + "learning_rate": 4.910277324632953e-07, + "loss": -0.0443, + "ppo_loss": -0.11172470450401306, + "step": 936 + }, + { + "epoch": 2.27, + "grad_norm": 1.1341153249581237, + "kl_div": 0.009604312479496002, + "kl_div_sft": 0.009604312479496002, + "learning_rate": 4.893964110929853e-07, + "loss": 0.0384, + "sft_loss": 0.021207697689533234, + "step": 937 + }, + { + "epoch": 2.27, + "grad_norm": 3.626492817746304, + "importance_ratio": 0.81640625, + "kl_div": -0.0927228033542633, + "kl_div_neg": -0.20264843106269836, + "kl_div_sft": 0.017202824354171753, + "learning_rate": 4.877650897226753e-07, + "loss": -0.1958, + "ppo_loss": 0.816565215587616, + "sft_loss": 0.06965680420398712, + "step": 938 + }, + { + "epoch": 2.28, + "grad_norm": 0.8807685280788413, + "importance_ratio": 1.0625, + "kl_div": 0.0319003164768219, + "kl_div_pos": 0.06287212669849396, + "kl_div_sft": 0.0009285045089200139, + "learning_rate": 4.861337683523654e-07, + "loss": 0.0382, + "ppo_loss": -1.064890742301941, + "sft_loss": 0.09779992699623108, + "step": 939 + }, + { + "epoch": 2.28, + "grad_norm": 0.6507075685918899, + "importance_ratio": 0.61328125, + "kl_div": -0.22991566359996796, + "kl_div_neg": -0.48832497000694275, + "kl_div_sft": 0.028493640944361687, + "learning_rate": 4.845024469820555e-07, + "loss": -0.0257, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04952215030789375, + "step": 940 + }, + { + "epoch": 2.28, + "grad_norm": 0.7804789743908849, + "importance_ratio": 0.671875, + "kl_div": -0.40713441371917725, + "kl_div_neg": -0.40713441371917725, + "learning_rate": 4.828711256117454e-07, + "loss": -0.0677, + "ppo_loss": 0.800000011920929, + "step": 941 + }, + { + "epoch": 2.28, + "grad_norm": 0.5817969958081254, + "importance_ratio": 0.875, + "kl_div": -0.1548028588294983, + "kl_div_neg": -0.3525158166885376, + "kl_div_pos": 0.04291009157896042, + "learning_rate": 4.812398042414355e-07, + "loss": -0.0212, + "ppo_loss": -0.12192204594612122, + "step": 942 + }, + { + "epoch": 2.29, + "grad_norm": 1.223707825476559, + "kl_div": 0.02316691353917122, + "kl_div_sft": 0.02316691353917122, + "learning_rate": 4.796084828711256e-07, + "loss": 0.0506, + "sft_loss": 0.10178062319755554, + "step": 943 + }, + { + "epoch": 2.29, + "grad_norm": 1.2030046576221893, + "importance_ratio": 0.8203125, + "kl_div": -0.08802450448274612, + "kl_div_pos": -0.19667044281959534, + "kl_div_sft": 0.020621435716748238, + "learning_rate": 4.779771615008156e-07, + "loss": -0.1252, + "ppo_loss": -0.8214613199234009, + "sft_loss": 0.08459824323654175, + "step": 944 + }, + { + "epoch": 2.29, + "grad_norm": 1.4469212831546836, + "importance_ratio": 1.0234375, + "kl_div": 0.013117531314492226, + "kl_div_pos": 0.024006225168704987, + "kl_div_sft": 0.0022288374602794647, + "learning_rate": 4.7634584013050565e-07, + "loss": -0.1127, + "ppo_loss": -1.024296760559082, + "sft_loss": 0.0502149797976017, + "step": 945 + }, + { + "epoch": 2.29, + "grad_norm": 1.606855475753341, + "importance_ratio": 0.7890625, + "kl_div": -0.23702040314674377, + "kl_div_neg": -0.23702040314674377, + "learning_rate": 4.7471451876019574e-07, + "loss": -0.0082, + "ppo_loss": 0.8120083808898926, + "step": 946 + }, + { + "epoch": 2.3, + "grad_norm": 1.3974691266793597, + "importance_ratio": 0.9296875, + "kl_div": -0.08570179343223572, + "kl_div_neg": -0.24382445216178894, + "kl_div_pos": 0.0724208727478981, + "learning_rate": 4.7308319738988577e-07, + "loss": -0.1494, + "ppo_loss": -0.13755390048027039, + "step": 947 + }, + { + "epoch": 2.3, + "grad_norm": 1.7760453061004071, + "importance_ratio": 0.734375, + "kl_div": -0.14622443914413452, + "kl_div_neg": -0.3102128803730011, + "kl_div_sft": 0.01776399277150631, + "learning_rate": 4.714518760195758e-07, + "loss": 0.075, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.020431511104106903, + "step": 948 + }, + { + "epoch": 2.3, + "grad_norm": 0.9016649740018722, + "importance_ratio": 1.0703125, + "kl_div": 0.03968197479844093, + "kl_div_pos": 0.0646844133734703, + "kl_div_sft": 0.01467953436076641, + "learning_rate": 4.698205546492659e-07, + "loss": -0.0747, + "ppo_loss": -1.0668222904205322, + "sft_loss": 0.028933806344866753, + "step": 949 + }, + { + "epoch": 2.3, + "grad_norm": 1.6019154229572639, + "importance_ratio": 1.1171875, + "kl_div": 0.06266265362501144, + "kl_div_pos": 0.10981543362140656, + "kl_div_sft": 0.01550986710935831, + "learning_rate": 4.6818923327895594e-07, + "loss": -0.0774, + "ppo_loss": -1.1160720586776733, + "sft_loss": 0.1458381563425064, + "step": 950 + }, + { + "epoch": 2.31, + "grad_norm": 1.6397811899523762, + "importance_ratio": 0.75, + "kl_div": -0.13818220794200897, + "kl_div_neg": -0.2875550389289856, + "kl_div_sft": 0.01119061280041933, + "learning_rate": 4.66557911908646e-07, + "loss": -0.1208, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0633588433265686, + "step": 951 + }, + { + "epoch": 2.31, + "grad_norm": 0.4302715147359019, + "importance_ratio": 0.76171875, + "kl_div": -0.11986764520406723, + "kl_div_neg": -0.27146387100219727, + "kl_div_sft": 0.031728580594062805, + "learning_rate": 4.6492659053833606e-07, + "loss": 0.0822, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.030951837077736855, + "step": 952 + }, + { + "epoch": 2.31, + "grad_norm": 0.8463842455091792, + "kl_div": 0.02023821882903576, + "kl_div_sft": 0.02023821882903576, + "learning_rate": 4.632952691680261e-07, + "loss": -0.1321, + "sft_loss": 0.022637102752923965, + "step": 953 + }, + { + "epoch": 2.31, + "grad_norm": 2.163751132200089, + "importance_ratio": 1.0625, + "kl_div": 0.03594045341014862, + "kl_div_pos": 0.06286430358886719, + "kl_div_sft": 0.009016606025397778, + "learning_rate": 4.6166394779771614e-07, + "loss": -0.2679, + "ppo_loss": -1.0648823976516724, + "sft_loss": 0.04815494269132614, + "step": 954 + }, + { + "epoch": 2.32, + "grad_norm": 1.224781857744496, + "importance_ratio": 0.4921875, + "kl_div": -0.35339614748954773, + "kl_div_neg": -0.7081003189086914, + "kl_div_sft": 0.0013080260250717402, + "learning_rate": 4.600326264274062e-07, + "loss": 0.108, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07929542660713196, + "step": 955 + }, + { + "epoch": 2.32, + "grad_norm": 1.1772556628894841, + "importance_ratio": 0.64453125, + "kl_div": -0.44552016258239746, + "kl_div_neg": -0.44552016258239746, + "learning_rate": 4.584013050570962e-07, + "loss": 0.001, + "ppo_loss": 0.800000011920929, + "step": 956 + }, + { + "epoch": 2.32, + "grad_norm": 0.5728572612365842, + "kl_div": 0.020160946995019913, + "kl_div_sft": 0.020160946995019913, + "learning_rate": 4.5676998368678625e-07, + "loss": -0.0046, + "sft_loss": 0.0823456272482872, + "step": 957 + }, + { + "epoch": 2.32, + "grad_norm": 0.7880825340452763, + "importance_ratio": 1.0546875, + "kl_div": 0.022984405979514122, + "kl_div_pos": 0.055373843759298325, + "kl_div_sft": -0.009405032731592655, + "learning_rate": 4.5513866231647634e-07, + "loss": -0.0303, + "ppo_loss": -1.0569356679916382, + "sft_loss": 0.09787919372320175, + "step": 958 + }, + { + "epoch": 2.32, + "grad_norm": 0.59643939435095, + "importance_ratio": 1.078125, + "kl_div": 0.04588526487350464, + "kl_div_pos": 0.07213117927312851, + "kl_div_sft": 0.01963934861123562, + "learning_rate": 4.535073409461664e-07, + "loss": -0.1171, + "ppo_loss": -1.0747963190078735, + "sft_loss": 0.03999362513422966, + "step": 959 + }, + { + "epoch": 2.33, + "grad_norm": 0.7536200340791993, + "importance_ratio": 0.7421875, + "kl_div": -0.13872717320919037, + "kl_div_neg": -0.2981773614883423, + "kl_div_sft": 0.020723015069961548, + "learning_rate": 4.518760195758564e-07, + "loss": 0.0493, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.054769787937402725, + "step": 960 + }, + { + "epoch": 2.33, + "grad_norm": 0.9574148226866949, + "importance_ratio": 1.0625, + "kl_div": 0.03498686105012894, + "kl_div_pos": 0.058754608035087585, + "kl_div_sft": 0.011219117790460587, + "learning_rate": 4.5024469820554645e-07, + "loss": 0.0674, + "ppo_loss": -1.06051504611969, + "sft_loss": 0.07081925123929977, + "step": 961 + }, + { + "epoch": 2.33, + "grad_norm": 1.7923191863604815, + "importance_ratio": 0.69921875, + "kl_div": -0.18562382459640503, + "kl_div_neg": -0.3601547181606293, + "kl_div_sft": -0.011092942208051682, + "learning_rate": 4.4861337683523654e-07, + "loss": -0.1316, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.052693869918584824, + "step": 962 + }, + { + "epoch": 2.33, + "grad_norm": 1.6299272808867897, + "importance_ratio": 0.71484375, + "kl_div": -0.1514943540096283, + "kl_div_neg": -0.3330497741699219, + "kl_div_sft": 0.03006105124950409, + "learning_rate": 4.469820554649266e-07, + "loss": -0.1489, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.017525294795632362, + "step": 963 + }, + { + "epoch": 2.34, + "grad_norm": 0.7107717299814393, + "importance_ratio": 0.7734375, + "kl_div": -0.13310004770755768, + "kl_div_neg": -0.25532862544059753, + "kl_div_sft": -0.010871472768485546, + "learning_rate": 4.453507340946166e-07, + "loss": -0.0232, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.12232664972543716, + "step": 964 + }, + { + "epoch": 2.34, + "grad_norm": 0.665382845354885, + "importance_ratio": 0.78125, + "kl_div": -0.3159344792366028, + "kl_div_neg": -0.6828457117080688, + "kl_div_pos": 0.05097677558660507, + "learning_rate": 4.437194127243067e-07, + "loss": -0.0329, + "ppo_loss": -0.12614920735359192, + "step": 965 + }, + { + "epoch": 2.34, + "grad_norm": 3.6051859801594266, + "kl_div": 0.020080842077732086, + "kl_div_sft": 0.020080842077732086, + "learning_rate": 4.4208809135399674e-07, + "loss": -0.1174, + "sft_loss": 0.07262429594993591, + "step": 966 + }, + { + "epoch": 2.34, + "grad_norm": 0.9211671342162377, + "importance_ratio": 0.8671875, + "kl_div": -0.15479469299316406, + "kl_div_neg": -0.31137746572494507, + "kl_div_pos": 0.0017880933592095971, + "learning_rate": 4.404567699836867e-07, + "loss": 0.1204, + "ppo_loss": -0.10089483857154846, + "step": 967 + }, + { + "epoch": 2.35, + "grad_norm": 0.8084663991109114, + "importance_ratio": 0.89453125, + "kl_div": -0.053698454052209854, + "kl_div_pos": -0.1113470196723938, + "kl_div_sft": 0.003950112033635378, + "learning_rate": 4.388254486133768e-07, + "loss": -0.0354, + "ppo_loss": -0.8946282863616943, + "sft_loss": 0.05712110549211502, + "step": 968 + }, + { + "epoch": 2.35, + "grad_norm": 0.9693344568152478, + "importance_ratio": 0.95703125, + "kl_div": -0.04780184105038643, + "kl_div_pos": -0.04780184105038643, + "learning_rate": 4.3719412724306685e-07, + "loss": -0.0423, + "ppo_loss": -0.9561375975608826, + "step": 969 + }, + { + "epoch": 2.35, + "grad_norm": 1.8175507252159182, + "importance_ratio": 0.6640625, + "kl_div": -0.2046709656715393, + "kl_div_neg": -0.40851375460624695, + "kl_div_sft": -0.0008281635236926377, + "learning_rate": 4.355628058727569e-07, + "loss": 0.0932, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04318346455693245, + "step": 970 + }, + { + "epoch": 2.35, + "grad_norm": 0.6528844362965035, + "kl_div": 0.014403751119971275, + "kl_div_sft": 0.014403751119971275, + "learning_rate": 4.33931484502447e-07, + "loss": -0.0675, + "sft_loss": 0.05766864866018295, + "step": 971 + }, + { + "epoch": 2.36, + "grad_norm": 1.578720584695023, + "importance_ratio": 1.0234375, + "kl_div": 0.020572490990161896, + "kl_div_pos": 0.025049732998013496, + "kl_div_sft": 0.016095248982310295, + "learning_rate": 4.32300163132137e-07, + "loss": -0.2578, + "ppo_loss": -1.025366187095642, + "sft_loss": 0.09186916053295135, + "step": 972 + }, + { + "epoch": 2.36, + "grad_norm": 1.0925616757160808, + "importance_ratio": 0.85546875, + "kl_div": -0.1612037867307663, + "kl_div_neg": -0.2649836242198944, + "kl_div_pos": -0.05742394179105759, + "learning_rate": 4.3066884176182705e-07, + "loss": -0.032, + "ppo_loss": -0.0720968246459961, + "step": 973 + }, + { + "epoch": 2.36, + "grad_norm": 1.4997664905626473, + "importance_ratio": 0.82421875, + "kl_div": -0.2255844920873642, + "kl_div_neg": -0.4845183491706848, + "kl_div_pos": 0.03334937244653702, + "learning_rate": 4.290375203915171e-07, + "loss": 0.0245, + "ppo_loss": -0.11695578694343567, + "step": 974 + }, + { + "epoch": 2.36, + "grad_norm": 1.6696932809091711, + "importance_ratio": 0.69921875, + "kl_div": -0.17535261809825897, + "kl_div_neg": -0.3550969362258911, + "kl_div_sft": 0.004391703754663467, + "learning_rate": 4.274061990212072e-07, + "loss": -0.0693, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04276962950825691, + "step": 975 + }, + { + "epoch": 2.37, + "grad_norm": 0.9145076678409796, + "importance_ratio": 1.0625, + "kl_div": 0.05917125195264816, + "kl_div_pos": 0.05917125195264816, + "learning_rate": 4.257748776508972e-07, + "loss": -0.1431, + "ppo_loss": -1.0609924793243408, + "step": 976 + }, + { + "epoch": 2.37, + "grad_norm": 0.7626321646793529, + "importance_ratio": 1.046875, + "kl_div": 0.025566166266798973, + "kl_div_pos": 0.043650005012750626, + "kl_div_sft": 0.007482328452169895, + "learning_rate": 4.2414355628058725e-07, + "loss": -0.0223, + "ppo_loss": -1.04461669921875, + "sft_loss": 0.06351668387651443, + "step": 977 + }, + { + "epoch": 2.37, + "grad_norm": 0.8729367453390997, + "importance_ratio": 0.80078125, + "kl_div": -0.22225841879844666, + "kl_div_neg": -0.22804026305675507, + "kl_div_pos": -0.21647655963897705, + "learning_rate": 4.2251223491027734e-07, + "loss": -0.0014, + "ppo_loss": -0.0026757121086120605, + "step": 978 + }, + { + "epoch": 2.37, + "grad_norm": 1.016882133060283, + "importance_ratio": 0.875, + "kl_div": -0.15614619851112366, + "kl_div_neg": -0.3702617883682251, + "kl_div_pos": 0.05796937644481659, + "learning_rate": 4.2088091353996733e-07, + "loss": -0.0115, + "ppo_loss": -0.12984129786491394, + "step": 979 + }, + { + "epoch": 2.38, + "grad_norm": 0.7823746025595179, + "importance_ratio": 0.54296875, + "kl_div": -0.30497926473617554, + "kl_div_neg": -0.6085367798805237, + "kl_div_sft": -0.0014217606512829661, + "learning_rate": 4.1924959216965737e-07, + "loss": 0.0263, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.021424463018774986, + "step": 980 + }, + { + "epoch": 2.38, + "grad_norm": 0.8102134294481221, + "kl_div": 0.012248167768120766, + "kl_div_sft": 0.012248167768120766, + "learning_rate": 4.1761827079934746e-07, + "loss": -0.0891, + "sft_loss": 0.04203151911497116, + "step": 981 + }, + { + "epoch": 2.38, + "grad_norm": 0.678118110656901, + "importance_ratio": 0.72265625, + "kl_div": -0.15509912371635437, + "kl_div_neg": -0.32470399141311646, + "kl_div_sft": 0.014505734667181969, + "learning_rate": 4.159869494290375e-07, + "loss": -0.1236, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0410248227417469, + "step": 982 + }, + { + "epoch": 2.38, + "grad_norm": 0.9372623142299381, + "importance_ratio": 1.0859375, + "kl_div": 0.04024956002831459, + "kl_div_pos": 0.08440776914358139, + "kl_div_sft": -0.00390864722430706, + "learning_rate": 4.1435562805872753e-07, + "loss": -0.0291, + "ppo_loss": -1.088072419166565, + "sft_loss": 0.04991704225540161, + "step": 983 + }, + { + "epoch": 2.39, + "grad_norm": 0.817519309839498, + "importance_ratio": 0.765625, + "kl_div": -0.1263846457004547, + "kl_div_neg": -0.2686377465724945, + "kl_div_sft": 0.01586846634745598, + "learning_rate": 4.127243066884176e-07, + "loss": -0.0655, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03322933614253998, + "step": 984 + }, + { + "epoch": 2.39, + "grad_norm": 0.9612455727087088, + "importance_ratio": 1.0546875, + "kl_div": 0.03542941063642502, + "kl_div_pos": 0.05257461965084076, + "kl_div_sft": 0.018284201622009277, + "learning_rate": 4.1109298531810766e-07, + "loss": -0.1122, + "ppo_loss": -1.0539811849594116, + "sft_loss": 0.04410533607006073, + "step": 985 + }, + { + "epoch": 2.39, + "grad_norm": 0.8610901103654892, + "importance_ratio": 1.0390625, + "kl_div": 0.01973757892847061, + "kl_div_pos": 0.03967132791876793, + "kl_div_sft": -0.00019617100770119578, + "learning_rate": 4.094616639477977e-07, + "loss": 0.0131, + "ppo_loss": -1.0404688119888306, + "sft_loss": 0.04647931084036827, + "step": 986 + }, + { + "epoch": 2.39, + "grad_norm": 0.9737974771817556, + "importance_ratio": 1.03125, + "kl_div": 0.025196190923452377, + "kl_div_pos": 0.027397962287068367, + "kl_div_sft": 0.022994421422481537, + "learning_rate": 4.078303425774878e-07, + "loss": -0.1519, + "ppo_loss": -1.0277767181396484, + "sft_loss": 0.03373304381966591, + "step": 987 + }, + { + "epoch": 2.4, + "grad_norm": 1.075184613277687, + "importance_ratio": 0.984375, + "kl_div": -0.001032222993671894, + "kl_div_pos": -0.01664351485669613, + "kl_div_sft": 0.01457906886935234, + "learning_rate": 4.061990212071778e-07, + "loss": -0.1073, + "ppo_loss": -0.983494222164154, + "sft_loss": 0.012189331464469433, + "step": 988 + }, + { + "epoch": 2.4, + "grad_norm": 0.8159042713378422, + "importance_ratio": 0.671875, + "kl_div": -0.18853308260440826, + "kl_div_neg": -0.3979014456272125, + "kl_div_sft": 0.02083529159426689, + "learning_rate": 4.0456769983686786e-07, + "loss": 0.0038, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.046729568392038345, + "step": 989 + }, + { + "epoch": 2.4, + "grad_norm": 0.7626515081625942, + "kl_div": 0.019913557916879654, + "kl_div_sft": 0.019913557916879654, + "learning_rate": 4.029363784665579e-07, + "loss": 0.04, + "sft_loss": 0.09075594693422318, + "step": 990 + }, + { + "epoch": 2.4, + "grad_norm": 0.945492120497141, + "kl_div": 0.015917502343654633, + "kl_div_sft": 0.015917502343654633, + "learning_rate": 4.0130505709624793e-07, + "loss": -0.1035, + "sft_loss": 0.0952652245759964, + "step": 991 + }, + { + "epoch": 2.4, + "grad_norm": 0.9477742510837185, + "importance_ratio": 0.6484375, + "kl_div": -0.21832098066806793, + "kl_div_neg": -0.43150076270103455, + "kl_div_sft": -0.005141206085681915, + "learning_rate": 3.9967373572593797e-07, + "loss": 0.0258, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13462142646312714, + "step": 992 + }, + { + "epoch": 2.41, + "grad_norm": 0.4865183936814369, + "importance_ratio": 0.69140625, + "kl_div": -0.1786474585533142, + "kl_div_neg": -0.369859904050827, + "kl_div_sft": 0.012564979493618011, + "learning_rate": 3.98042414355628e-07, + "loss": -0.0915, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.041070904582738876, + "step": 993 + }, + { + "epoch": 2.41, + "grad_norm": 1.2837001819100236, + "importance_ratio": 0.796875, + "kl_div": -0.1052025631070137, + "kl_div_neg": -0.2293727695941925, + "kl_div_sft": 0.0189676433801651, + "learning_rate": 3.964110929853181e-07, + "loss": 0.0865, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.031797245144844055, + "step": 994 + }, + { + "epoch": 2.41, + "grad_norm": 0.6600711022715781, + "importance_ratio": 0.80078125, + "kl_div": -0.10373440384864807, + "kl_div_neg": -0.22047407925128937, + "kl_div_sft": 0.013005265966057777, + "learning_rate": 3.9477977161500813e-07, + "loss": -0.168, + "ppo_loss": 0.8021385073661804, + "sft_loss": 0.042339082807302475, + "step": 995 + }, + { + "epoch": 2.41, + "grad_norm": 1.0313396992831134, + "importance_ratio": 1.03125, + "kl_div": 0.01869693025946617, + "kl_div_pos": 0.03447434678673744, + "kl_div_sft": 0.002919515362009406, + "learning_rate": 3.9314845024469817e-07, + "loss": -0.0382, + "ppo_loss": -1.0350754261016846, + "sft_loss": 0.06383674591779709, + "step": 996 + }, + { + "epoch": 2.42, + "grad_norm": 1.0082966432648142, + "importance_ratio": 0.60546875, + "kl_div": -0.23840777575969696, + "kl_div_neg": -0.5040940642356873, + "kl_div_sft": 0.027278510853648186, + "learning_rate": 3.9151712887438826e-07, + "loss": -0.1549, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.028871726244688034, + "step": 997 + }, + { + "epoch": 2.42, + "grad_norm": 0.8104968687511205, + "kl_div": 0.005753981880843639, + "kl_div_sft": 0.005753981880843639, + "learning_rate": 3.898858075040783e-07, + "loss": -0.011, + "sft_loss": 0.058913350105285645, + "step": 998 + }, + { + "epoch": 2.42, + "grad_norm": 1.005984222663792, + "importance_ratio": 0.78125, + "kl_div": -0.2435922473669052, + "kl_div_neg": -0.2435922473669052, + "learning_rate": 3.8825448613376833e-07, + "loss": 0.0712, + "ppo_loss": 0.803253173828125, + "step": 999 + }, + { + "epoch": 2.42, + "grad_norm": 0.8981189660683943, + "kl_div": 0.013707583770155907, + "kl_div_sft": 0.013707583770155907, + "learning_rate": 3.866231647634584e-07, + "loss": 0.0675, + "sft_loss": 0.06317710131406784, + "step": 1000 + }, + { + "epoch": 2.43, + "grad_norm": 3.1582724257528825, + "importance_ratio": 1.0625, + "kl_div": 0.03967162221670151, + "kl_div_pos": 0.057782385498285294, + "kl_div_sft": 0.02156086266040802, + "learning_rate": 3.8499184339314846e-07, + "loss": 0.1217, + "ppo_loss": -1.0594844818115234, + "sft_loss": 0.05303584784269333, + "step": 1001 + }, + { + "epoch": 2.43, + "grad_norm": 0.8169442188499416, + "importance_ratio": 0.6171875, + "kl_div": -0.2412409782409668, + "kl_div_neg": -0.48035579919815063, + "kl_div_sft": -0.0021261433139443398, + "learning_rate": 3.833605220228385e-07, + "loss": -0.1994, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.051242586225271225, + "step": 1002 + }, + { + "epoch": 2.43, + "grad_norm": 0.8600466790949012, + "kl_div": 0.00573944766074419, + "kl_div_sft": 0.00573944766074419, + "learning_rate": 3.817292006525285e-07, + "loss": 0.0015, + "sft_loss": 0.12035956233739853, + "step": 1003 + }, + { + "epoch": 2.43, + "grad_norm": 0.7545500067336446, + "kl_div": 0.003994662780314684, + "kl_div_sft": 0.003994662780314684, + "learning_rate": 3.8009787928221857e-07, + "loss": 0.0088, + "sft_loss": 0.09922191500663757, + "step": 1004 + }, + { + "epoch": 2.44, + "grad_norm": 0.9927936414561115, + "importance_ratio": 1.0546875, + "kl_div": 0.028940679505467415, + "kl_div_pos": 0.05019991099834442, + "kl_div_sft": 0.007681448478251696, + "learning_rate": 3.784665579119086e-07, + "loss": -0.0966, + "ppo_loss": -1.0514812469482422, + "sft_loss": 0.03251166641712189, + "step": 1005 + }, + { + "epoch": 2.44, + "grad_norm": 0.5373069261166618, + "kl_div": 0.0025871756952255964, + "kl_div_sft": 0.0025871756952255964, + "learning_rate": 3.7683523654159865e-07, + "loss": -0.0637, + "sft_loss": 0.07286077737808228, + "step": 1006 + }, + { + "epoch": 2.44, + "grad_norm": 1.1399458862310774, + "kl_div": -0.01982680708169937, + "kl_div_sft": -0.01982680708169937, + "learning_rate": 3.7520391517128874e-07, + "loss": -0.0391, + "sft_loss": 0.10723453760147095, + "step": 1007 + }, + { + "epoch": 2.44, + "grad_norm": 0.7950012017486894, + "importance_ratio": 1.0546875, + "kl_div": 0.023592008277773857, + "kl_div_pos": 0.049902159720659256, + "kl_div_sft": -0.00271814176812768, + "learning_rate": 3.7357259380097877e-07, + "loss": -0.0217, + "ppo_loss": -1.0511683225631714, + "sft_loss": 0.08355196565389633, + "step": 1008 + }, + { + "epoch": 2.45, + "grad_norm": 0.9301867707171533, + "importance_ratio": 0.7890625, + "kl_div": -0.11879640072584152, + "kl_div_neg": -0.2389979511499405, + "kl_div_sft": 0.0014051523758098483, + "learning_rate": 3.719412724306688e-07, + "loss": -0.1237, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.060970794409513474, + "step": 1009 + }, + { + "epoch": 2.45, + "grad_norm": 0.8832422071513172, + "importance_ratio": 0.70703125, + "kl_div": -0.1883264183998108, + "kl_div_neg": -0.34558987617492676, + "kl_div_sft": -0.031062960624694824, + "learning_rate": 3.703099510603589e-07, + "loss": -0.1216, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09838508069515228, + "step": 1010 + }, + { + "epoch": 2.45, + "grad_norm": 1.5865526932298137, + "kl_div": 0.014200937002897263, + "kl_div_sft": 0.014200937002897263, + "learning_rate": 3.6867862969004894e-07, + "loss": -0.1554, + "sft_loss": 0.0974261462688446, + "step": 1011 + }, + { + "epoch": 2.45, + "grad_norm": 0.6722549650421021, + "importance_ratio": 0.7109375, + "kl_div": -0.16757439076900482, + "kl_div_neg": -0.3384281098842621, + "kl_div_sft": 0.003279315773397684, + "learning_rate": 3.6704730831973897e-07, + "loss": 0.0471, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0659431666135788, + "step": 1012 + }, + { + "epoch": 2.46, + "grad_norm": 0.9591359253759136, + "importance_ratio": 0.8359375, + "kl_div": -0.21374891698360443, + "kl_div_neg": -0.48649612069129944, + "kl_div_pos": 0.05899827927350998, + "learning_rate": 3.6541598694942906e-07, + "loss": -0.075, + "ppo_loss": -0.13038673996925354, + "step": 1013 + }, + { + "epoch": 2.46, + "grad_norm": 0.8733932183629812, + "importance_ratio": 0.5625, + "kl_div": -0.6281610131263733, + "kl_div_neg": -0.6281610131263733, + "learning_rate": 3.637846655791191e-07, + "loss": -0.0703, + "ppo_loss": 0.800000011920929, + "step": 1014 + }, + { + "epoch": 2.46, + "grad_norm": 1.792351510509404, + "importance_ratio": 0.828125, + "kl_div": -0.2242492288351059, + "kl_div_neg": -0.4955662488937378, + "kl_div_pos": 0.047067780047655106, + "learning_rate": 3.621533442088091e-07, + "loss": 0.0967, + "ppo_loss": -0.12409648299217224, + "step": 1015 + }, + { + "epoch": 2.46, + "grad_norm": 1.0900571333934244, + "kl_div": -0.003447586204856634, + "kl_div_sft": -0.003447586204856634, + "learning_rate": 3.605220228384992e-07, + "loss": -0.0276, + "sft_loss": 0.0526358038187027, + "step": 1016 + }, + { + "epoch": 2.47, + "grad_norm": 0.7597449920127731, + "importance_ratio": 0.9140625, + "kl_div": -0.10268957912921906, + "kl_div_neg": -0.257598876953125, + "kl_div_pos": 0.05221971124410629, + "learning_rate": 3.588907014681892e-07, + "loss": 0.0907, + "ppo_loss": -0.12680360674858093, + "step": 1017 + }, + { + "epoch": 2.47, + "grad_norm": 1.5183983038447628, + "importance_ratio": 0.8515625, + "kl_div": -0.18091082572937012, + "kl_div_neg": -0.38382551074028015, + "kl_div_pos": 0.022003866732120514, + "learning_rate": 3.5725938009787925e-07, + "loss": -0.0696, + "ppo_loss": -0.11112388968467712, + "step": 1018 + }, + { + "epoch": 2.47, + "grad_norm": 0.6231943033796007, + "importance_ratio": 0.609375, + "kl_div": -0.49936342239379883, + "kl_div_neg": -0.49936342239379883, + "learning_rate": 3.556280587275693e-07, + "loss": -0.0034, + "ppo_loss": 0.800000011920929, + "step": 1019 + }, + { + "epoch": 2.47, + "grad_norm": 1.3095852412227011, + "kl_div": 0.010510137304663658, + "kl_div_sft": 0.010510137304663658, + "learning_rate": 3.539967373572594e-07, + "loss": -0.2417, + "sft_loss": 0.10299015045166016, + "step": 1020 + }, + { + "epoch": 2.48, + "grad_norm": 1.312973572807166, + "importance_ratio": 1.0078125, + "kl_div": 0.01436000969260931, + "kl_div_pos": 0.00994068942964077, + "kl_div_sft": 0.01877932995557785, + "learning_rate": 3.523654159869494e-07, + "loss": 0.0862, + "ppo_loss": -1.0099903345108032, + "sft_loss": 0.010615055449306965, + "step": 1021 + }, + { + "epoch": 2.48, + "grad_norm": 0.8203770929972757, + "kl_div": 0.018396597355604172, + "kl_div_sft": 0.018396597355604172, + "learning_rate": 3.5073409461663945e-07, + "loss": -0.0526, + "sft_loss": 0.06409763544797897, + "step": 1022 + }, + { + "epoch": 2.48, + "grad_norm": 1.5235168436369382, + "importance_ratio": 0.78515625, + "kl_div": -0.11560414731502533, + "kl_div_neg": -0.24418854713439941, + "kl_div_sft": 0.012980245985090733, + "learning_rate": 3.4910277324632954e-07, + "loss": 0.0121, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.16275613009929657, + "step": 1023 + }, + { + "epoch": 2.48, + "grad_norm": 0.8458514006490623, + "importance_ratio": 0.75390625, + "kl_div": -0.13325250148773193, + "kl_div_neg": -0.2842373847961426, + "kl_div_sft": 0.017732389271259308, + "learning_rate": 3.474714518760196e-07, + "loss": 0.0991, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029525436460971832, + "step": 1024 + }, + { + "epoch": 2.48, + "grad_norm": 0.8619704500585501, + "kl_div": 0.031707629561424255, + "kl_div_sft": 0.031707629561424255, + "learning_rate": 3.458401305057096e-07, + "loss": -0.1025, + "sft_loss": 0.028296634554862976, + "step": 1025 + }, + { + "epoch": 2.49, + "grad_norm": 0.7322028481099582, + "importance_ratio": 1.0625, + "kl_div": 0.05670029670000076, + "kl_div_pos": 0.05670029670000076, + "learning_rate": 3.442088091353997e-07, + "loss": -0.0422, + "ppo_loss": -1.058358907699585, + "step": 1026 + }, + { + "epoch": 2.49, + "grad_norm": 1.0721780154043024, + "importance_ratio": 1.0546875, + "kl_div": 0.031154815107584, + "kl_div_pos": 0.053050797432661057, + "kl_div_sft": 0.009258833713829517, + "learning_rate": 3.425774877650897e-07, + "loss": -0.0309, + "ppo_loss": -1.05448317527771, + "sft_loss": 0.05470652133226395, + "step": 1027 + }, + { + "epoch": 2.49, + "grad_norm": 1.0051672470840838, + "kl_div": 0.010653335601091385, + "kl_div_sft": 0.010653335601091385, + "learning_rate": 3.409461663947797e-07, + "loss": -0.0996, + "sft_loss": 0.0735035166144371, + "step": 1028 + }, + { + "epoch": 2.49, + "grad_norm": 0.8605183018716919, + "importance_ratio": 0.7890625, + "kl_div": -0.28868263959884644, + "kl_div_neg": -0.6043957471847534, + "kl_div_pos": 0.0270304586738348, + "learning_rate": 3.393148450244698e-07, + "loss": -0.014, + "ppo_loss": -0.11369958519935608, + "step": 1029 + }, + { + "epoch": 2.5, + "grad_norm": 0.7443510944361741, + "kl_div": 0.005600485950708389, + "kl_div_sft": 0.005600485950708389, + "learning_rate": 3.3768352365415985e-07, + "loss": -0.1211, + "sft_loss": 0.0647912248969078, + "step": 1030 + }, + { + "epoch": 2.5, + "grad_norm": 0.5907332427125761, + "importance_ratio": 0.703125, + "kl_div": -0.16875949501991272, + "kl_div_neg": -0.3517286479473114, + "kl_div_sft": 0.014209664426743984, + "learning_rate": 3.360522022838499e-07, + "loss": -0.0071, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.055921465158462524, + "step": 1031 + }, + { + "epoch": 2.5, + "grad_norm": 0.9959272058419946, + "importance_ratio": 1.0703125, + "kl_div": 0.04712774604558945, + "kl_div_pos": 0.06734217703342438, + "kl_div_sft": 0.026913316920399666, + "learning_rate": 3.344208809135399e-07, + "loss": -0.1393, + "ppo_loss": -1.0696613788604736, + "sft_loss": 0.022098220884799957, + "step": 1032 + }, + { + "epoch": 2.5, + "grad_norm": 0.8081967028606536, + "kl_div": 0.024792511016130447, + "kl_div_sft": 0.024792511016130447, + "learning_rate": 3.3278955954323e-07, + "loss": -0.0203, + "sft_loss": 0.060120031237602234, + "step": 1033 + }, + { + "epoch": 2.51, + "grad_norm": 0.7387214438249187, + "importance_ratio": 1.0, + "kl_div": 0.012671315111219883, + "kl_div_pos": 0.0017745542572811246, + "kl_div_sft": 0.02356807515025139, + "learning_rate": 3.3115823817292005e-07, + "loss": -0.1071, + "ppo_loss": -1.0017762184143066, + "sft_loss": 0.015665153041481972, + "step": 1034 + }, + { + "epoch": 2.51, + "grad_norm": 2.323499757488017, + "importance_ratio": 1.046875, + "kl_div": 0.03274444863200188, + "kl_div_pos": 0.04232431575655937, + "kl_div_sft": 0.023164579644799232, + "learning_rate": 3.295269168026101e-07, + "loss": -0.0592, + "ppo_loss": -1.043232798576355, + "sft_loss": 0.058782611042261124, + "step": 1035 + }, + { + "epoch": 2.51, + "grad_norm": 2.0824484916993877, + "importance_ratio": 0.765625, + "kl_div": -0.13460354506969452, + "kl_div_neg": -0.2660696506500244, + "kl_div_sft": -0.0031374366953969, + "learning_rate": 3.278955954323002e-07, + "loss": -0.0027, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.1042267307639122, + "step": 1036 + }, + { + "epoch": 2.51, + "grad_norm": 1.555489951218678, + "importance_ratio": 0.73828125, + "kl_div": -0.16115102171897888, + "kl_div_neg": -0.30402418971061707, + "kl_div_sft": -0.018277853727340698, + "learning_rate": 3.262642740619902e-07, + "loss": 0.0736, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.1204533502459526, + "step": 1037 + }, + { + "epoch": 2.52, + "grad_norm": 1.556094812216053, + "importance_ratio": 1.0546875, + "kl_div": 0.04028111696243286, + "kl_div_pos": 0.05016597732901573, + "kl_div_sft": 0.03039625659584999, + "learning_rate": 3.2463295269168025e-07, + "loss": -0.0944, + "ppo_loss": -1.0514456033706665, + "sft_loss": 0.013909861445426941, + "step": 1038 + }, + { + "epoch": 2.52, + "grad_norm": 0.755530707295099, + "importance_ratio": 0.8515625, + "kl_div": -0.0725010484457016, + "kl_div_neg": -0.16224679350852966, + "kl_div_sft": 0.017244696617126465, + "learning_rate": 3.230016313213703e-07, + "loss": 0.0109, + "ppo_loss": 0.8502313494682312, + "sft_loss": 0.06347573548555374, + "step": 1039 + }, + { + "epoch": 2.52, + "grad_norm": 0.5475334563114244, + "kl_div": 0.009469101205468178, + "kl_div_sft": 0.009469101205468178, + "learning_rate": 3.2137030995106033e-07, + "loss": -0.0214, + "sft_loss": 0.12747272849082947, + "step": 1040 + }, + { + "epoch": 2.52, + "grad_norm": 1.2012607448545887, + "importance_ratio": 1.0390625, + "kl_div": 0.01814403384923935, + "kl_div_pos": 0.039489831775426865, + "kl_div_sft": -0.0032017657067626715, + "learning_rate": 3.1973898858075036e-07, + "loss": 0.0054, + "ppo_loss": -1.0402798652648926, + "sft_loss": 0.07425745576620102, + "step": 1041 + }, + { + "epoch": 2.53, + "grad_norm": 1.2494062921098452, + "importance_ratio": 0.5546875, + "kl_div": -0.2863028049468994, + "kl_div_neg": -0.5887343883514404, + "kl_div_sft": 0.01612876169383526, + "learning_rate": 3.1810766721044045e-07, + "loss": -0.0079, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.013732396066188812, + "step": 1042 + }, + { + "epoch": 2.53, + "grad_norm": 0.9357681809049531, + "importance_ratio": 0.91796875, + "kl_div": -0.0974334329366684, + "kl_div_neg": -0.24848219752311707, + "kl_div_pos": 0.05361533910036087, + "learning_rate": 3.164763458401305e-07, + "loss": -0.0336, + "ppo_loss": -0.12753930687904358, + "step": 1043 + }, + { + "epoch": 2.53, + "grad_norm": 1.8262376742824988, + "importance_ratio": 1.0625, + "kl_div": 0.05723012983798981, + "kl_div_pos": 0.05723012983798981, + "learning_rate": 3.1484502446982053e-07, + "loss": -0.1627, + "ppo_loss": -1.058899998664856, + "step": 1044 + }, + { + "epoch": 2.53, + "grad_norm": 1.1505770313680412, + "importance_ratio": 1.0546875, + "kl_div": 0.010374104604125023, + "kl_div_pos": 0.05278032273054123, + "kl_div_sft": -0.032032113522291183, + "learning_rate": 3.1321370309951056e-07, + "loss": -0.0909, + "ppo_loss": -1.0541980266571045, + "sft_loss": 0.12327804416418076, + "step": 1045 + }, + { + "epoch": 2.54, + "grad_norm": 1.1151452772154664, + "kl_div": 0.012791233137249947, + "kl_div_sft": 0.012791233137249947, + "learning_rate": 3.1158238172920065e-07, + "loss": -0.0217, + "sft_loss": 0.03297749534249306, + "step": 1046 + }, + { + "epoch": 2.54, + "grad_norm": 1.2764277075361212, + "importance_ratio": 0.7265625, + "kl_div": -0.15004746615886688, + "kl_div_neg": -0.32086265087127686, + "kl_div_sft": 0.020767726004123688, + "learning_rate": 3.099510603588907e-07, + "loss": -0.0545, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.007463144138455391, + "step": 1047 + }, + { + "epoch": 2.54, + "grad_norm": 0.6707315251197906, + "importance_ratio": 1.015625, + "kl_div": 0.013251978904008865, + "kl_div_pos": 0.017632799223065376, + "kl_div_sft": 0.008871159516274929, + "learning_rate": 3.0831973898858073e-07, + "loss": -0.0922, + "ppo_loss": -1.0177891254425049, + "sft_loss": 0.09840093553066254, + "step": 1048 + }, + { + "epoch": 2.54, + "grad_norm": 0.8942643812802545, + "importance_ratio": 0.75, + "kl_div": -0.13627244532108307, + "kl_div_neg": -0.28906428813934326, + "kl_div_sft": 0.016519390046596527, + "learning_rate": 3.066884176182708e-07, + "loss": -0.1519, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.028626246377825737, + "step": 1049 + }, + { + "epoch": 2.55, + "grad_norm": 0.9543023888845963, + "importance_ratio": 0.6875, + "kl_div": -0.3724897503852844, + "kl_div_neg": -0.3724897503852844, + "learning_rate": 3.0505709624796086e-07, + "loss": -0.0181, + "ppo_loss": 0.800000011920929, + "step": 1050 + }, + { + "epoch": 2.55, + "grad_norm": 0.8109235141261069, + "importance_ratio": 1.0546875, + "kl_div": 0.04136640205979347, + "kl_div_pos": 0.05685226619243622, + "kl_div_sft": 0.025880537927150726, + "learning_rate": 3.0342577487765084e-07, + "loss": -0.1933, + "ppo_loss": -1.0584994554519653, + "sft_loss": 0.010072030127048492, + "step": 1051 + }, + { + "epoch": 2.55, + "grad_norm": 1.123338319859818, + "importance_ratio": 0.69921875, + "kl_div": -0.17182566225528717, + "kl_div_neg": -0.3587484657764435, + "kl_div_sft": 0.01509714126586914, + "learning_rate": 3.0179445350734093e-07, + "loss": -0.2156, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04835548251867294, + "step": 1052 + }, + { + "epoch": 2.55, + "grad_norm": 0.7801343963453246, + "importance_ratio": 1.0390625, + "kl_div": 0.015188152901828289, + "kl_div_pos": 0.041032060980796814, + "kl_div_sft": -0.010655755177140236, + "learning_rate": 3.0016313213703097e-07, + "loss": -0.1176, + "ppo_loss": -1.041885495185852, + "sft_loss": 0.08199844509363174, + "step": 1053 + }, + { + "epoch": 2.56, + "grad_norm": 0.7988252150055876, + "importance_ratio": 0.7265625, + "kl_div": -0.3288571238517761, + "kl_div_neg": -0.3288571238517761, + "learning_rate": 2.98531810766721e-07, + "loss": -0.0802, + "ppo_loss": 0.8001587390899658, + "step": 1054 + }, + { + "epoch": 2.56, + "grad_norm": 1.2844207421392433, + "importance_ratio": 1.0625, + "kl_div": 0.03747943416237831, + "kl_div_pos": 0.06310385465621948, + "kl_div_sft": 0.011855010874569416, + "learning_rate": 2.969004893964111e-07, + "loss": -0.1941, + "ppo_loss": -1.065137505531311, + "sft_loss": 0.03686026483774185, + "step": 1055 + }, + { + "epoch": 2.56, + "grad_norm": 1.375427916437548, + "importance_ratio": 0.68359375, + "kl_div": -0.18899478018283844, + "kl_div_neg": -0.3808421492576599, + "kl_div_sft": 0.002852577017620206, + "learning_rate": 2.9526916802610113e-07, + "loss": -0.0285, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.045148205012083054, + "step": 1056 + }, + { + "epoch": 2.56, + "grad_norm": 0.9253707812763302, + "importance_ratio": 0.60546875, + "kl_div": -0.24174775183200836, + "kl_div_neg": -0.5002955794334412, + "kl_div_sft": 0.01680006831884384, + "learning_rate": 2.9363784665579117e-07, + "loss": -0.0444, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.08801782131195068, + "step": 1057 + }, + { + "epoch": 2.56, + "grad_norm": 0.8836541341559392, + "importance_ratio": 1.0625, + "kl_div": 0.025106048211455345, + "kl_div_pos": 0.057671915739774704, + "kl_div_sft": -0.007459820713847876, + "learning_rate": 2.9200652528548126e-07, + "loss": 0.1365, + "ppo_loss": -1.0593674182891846, + "sft_loss": 0.13191211223602295, + "step": 1058 + }, + { + "epoch": 2.57, + "grad_norm": 1.2667644693027453, + "importance_ratio": 0.6796875, + "kl_div": -0.3990887999534607, + "kl_div_neg": -0.3990887999534607, + "learning_rate": 2.903752039151713e-07, + "loss": -0.0013, + "ppo_loss": 0.800000011920929, + "step": 1059 + }, + { + "epoch": 2.57, + "grad_norm": 0.9833057590971988, + "importance_ratio": 1.15625, + "kl_div": 0.06241554394364357, + "kl_div_pos": 0.14272110164165497, + "kl_div_sft": -0.017890015617012978, + "learning_rate": 2.8874388254486133e-07, + "loss": -0.0021, + "ppo_loss": -1.1534080505371094, + "sft_loss": 0.14776094257831573, + "step": 1060 + }, + { + "epoch": 2.57, + "grad_norm": 1.0756397627992749, + "importance_ratio": 0.66796875, + "kl_div": -0.207151859998703, + "kl_div_neg": -0.40292197465896606, + "kl_div_sft": -0.01138173695653677, + "learning_rate": 2.8711256117455137e-07, + "loss": -0.0806, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05718496814370155, + "step": 1061 + }, + { + "epoch": 2.57, + "grad_norm": 1.2466782302286152, + "importance_ratio": 1.0078125, + "kl_div": 0.00763517152518034, + "kl_div_pos": 0.00763517152518034, + "learning_rate": 2.8548123980424146e-07, + "loss": -0.0701, + "ppo_loss": -1.007724404335022, + "step": 1062 + }, + { + "epoch": 2.58, + "grad_norm": 0.9938324336283981, + "importance_ratio": 0.66796875, + "kl_div": -0.19250355660915375, + "kl_div_neg": -0.4044356346130371, + "kl_div_sft": 0.019428521394729614, + "learning_rate": 2.8384991843393144e-07, + "loss": 0.0077, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03204822912812233, + "step": 1063 + }, + { + "epoch": 2.58, + "grad_norm": 0.8122684442240617, + "kl_div": -0.011821310967206955, + "kl_div_sft": -0.011821310967206955, + "learning_rate": 2.822185970636215e-07, + "loss": -0.1022, + "sft_loss": 0.11833268404006958, + "step": 1064 + }, + { + "epoch": 2.58, + "grad_norm": 1.0927651467234645, + "importance_ratio": 0.796875, + "kl_div": -0.09760934859514236, + "kl_div_neg": -0.22511491179466248, + "kl_div_sft": 0.0298962090164423, + "learning_rate": 2.8058727569331157e-07, + "loss": -0.0521, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.047688040882349014, + "step": 1065 + }, + { + "epoch": 2.58, + "grad_norm": 0.9446047190323583, + "kl_div": 0.001321147195994854, + "kl_div_sft": 0.001321147195994854, + "learning_rate": 2.789559543230016e-07, + "loss": -0.0115, + "sft_loss": 0.051537930965423584, + "step": 1066 + }, + { + "epoch": 2.59, + "grad_norm": 0.7045568087018107, + "importance_ratio": 1.03125, + "kl_div": 0.03264131397008896, + "kl_div_pos": 0.03413437679409981, + "kl_div_sft": 0.03114825300872326, + "learning_rate": 2.7732463295269164e-07, + "loss": -0.0013, + "ppo_loss": -1.0347236394882202, + "sft_loss": 0.025318821892142296, + "step": 1067 + }, + { + "epoch": 2.59, + "grad_norm": 0.7338484641819885, + "importance_ratio": 1.0390625, + "kl_div": 0.030414361506700516, + "kl_div_pos": 0.04188847914338112, + "kl_div_sft": 0.018940245732665062, + "learning_rate": 2.7569331158238173e-07, + "loss": -0.115, + "ppo_loss": -1.0427781343460083, + "sft_loss": 0.05671351030468941, + "step": 1068 + }, + { + "epoch": 2.59, + "grad_norm": 0.7268600674880552, + "kl_div": 0.007879311218857765, + "kl_div_sft": 0.007879311218857765, + "learning_rate": 2.7406199021207177e-07, + "loss": -0.1696, + "sft_loss": 0.06098397448658943, + "step": 1069 + }, + { + "epoch": 2.59, + "grad_norm": 1.1483685738776235, + "importance_ratio": 1.015625, + "kl_div": 0.007197917439043522, + "kl_div_pos": 0.017390882596373558, + "kl_div_sft": -0.0029950477182865143, + "learning_rate": 2.724306688417618e-07, + "loss": -0.1216, + "ppo_loss": -1.0175429582595825, + "sft_loss": 0.1088942214846611, + "step": 1070 + }, + { + "epoch": 2.6, + "grad_norm": 0.8431710977020026, + "importance_ratio": 1.0546875, + "kl_div": 0.03426618501543999, + "kl_div_pos": 0.04973319172859192, + "kl_div_sft": 0.018799176439642906, + "learning_rate": 2.707993474714519e-07, + "loss": -0.0724, + "ppo_loss": -1.0509907007217407, + "sft_loss": 0.04126371070742607, + "step": 1071 + }, + { + "epoch": 2.6, + "grad_norm": 1.4190722498656456, + "importance_ratio": 0.625, + "kl_div": -0.23403041064739227, + "kl_div_neg": -0.47112327814102173, + "kl_div_sft": 0.0030624489299952984, + "learning_rate": 2.6916802610114193e-07, + "loss": -0.0026, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13048440217971802, + "step": 1072 + }, + { + "epoch": 2.6, + "grad_norm": 0.7858382342466547, + "importance_ratio": 0.93359375, + "kl_div": -0.07867449522018433, + "kl_div_neg": -0.21523505449295044, + "kl_div_pos": 0.05788605660200119, + "learning_rate": 2.6753670473083197e-07, + "loss": -0.0918, + "ppo_loss": -0.12662121653556824, + "step": 1073 + }, + { + "epoch": 2.6, + "grad_norm": 1.6482051305925476, + "kl_div": 0.002539373002946377, + "kl_div_sft": 0.002539373002946377, + "learning_rate": 2.65905383360522e-07, + "loss": -0.1219, + "sft_loss": 0.08029313385486603, + "step": 1074 + }, + { + "epoch": 2.61, + "grad_norm": 1.0317026936449962, + "importance_ratio": 1.0390625, + "kl_div": 0.025399165228009224, + "kl_div_pos": 0.04176368936896324, + "kl_div_sft": 0.009034640155732632, + "learning_rate": 2.6427406199021205e-07, + "loss": -0.0674, + "ppo_loss": -1.0426480770111084, + "sft_loss": 0.03283477947115898, + "step": 1075 + }, + { + "epoch": 2.61, + "grad_norm": 1.0097162762209868, + "importance_ratio": 1.046875, + "kl_div": 0.038147445768117905, + "kl_div_pos": 0.04772398620843887, + "kl_div_sft": 0.028570905327796936, + "learning_rate": 2.626427406199021e-07, + "loss": -0.113, + "ppo_loss": -1.0488810539245605, + "sft_loss": 0.06863260269165039, + "step": 1076 + }, + { + "epoch": 2.61, + "grad_norm": 0.719903352264217, + "kl_div": -0.0003017587587237358, + "kl_div_sft": -0.0003017587587237358, + "learning_rate": 2.610114192495921e-07, + "loss": 0.0407, + "sft_loss": 0.06353778392076492, + "step": 1077 + }, + { + "epoch": 2.61, + "grad_norm": 0.8777904973000497, + "importance_ratio": 0.66796875, + "kl_div": -0.1905461549758911, + "kl_div_neg": -0.4016769528388977, + "kl_div_sft": 0.020584631711244583, + "learning_rate": 2.593800978792822e-07, + "loss": -0.0008, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06284645944833755, + "step": 1078 + }, + { + "epoch": 2.62, + "grad_norm": 1.0830187646502971, + "importance_ratio": 0.82421875, + "kl_div": -0.08450426906347275, + "kl_div_neg": -0.191858172416687, + "kl_div_sft": 0.022849630564451218, + "learning_rate": 2.5774877650897225e-07, + "loss": 0.0971, + "ppo_loss": 0.8254238963127136, + "sft_loss": 0.10808595269918442, + "step": 1079 + }, + { + "epoch": 2.62, + "grad_norm": 0.7372952694322946, + "importance_ratio": 0.73046875, + "kl_div": -0.15658007562160492, + "kl_div_neg": -0.31368288397789, + "kl_div_sft": 0.000522738613653928, + "learning_rate": 2.561174551386623e-07, + "loss": -0.1206, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.053254082798957825, + "step": 1080 + }, + { + "epoch": 2.62, + "grad_norm": 1.3891624732241223, + "importance_ratio": 0.78515625, + "kl_div": -0.24391339719295502, + "kl_div_neg": -0.24391339719295502, + "learning_rate": 2.544861337683524e-07, + "loss": 0.0536, + "ppo_loss": 0.8239413499832153, + "step": 1081 + }, + { + "epoch": 2.62, + "grad_norm": 1.0054647616030914, + "kl_div": 0.019152436405420303, + "kl_div_sft": 0.019152436405420303, + "learning_rate": 2.528548123980424e-07, + "loss": -0.0373, + "sft_loss": 0.0417667031288147, + "step": 1082 + }, + { + "epoch": 2.63, + "grad_norm": 0.7566992851682453, + "importance_ratio": 1.0546875, + "kl_div": 0.04358925670385361, + "kl_div_pos": 0.05337366834282875, + "kl_div_sft": 0.03380484879016876, + "learning_rate": 2.5122349102773245e-07, + "loss": -0.0952, + "ppo_loss": -1.0548237562179565, + "sft_loss": 0.025316337123513222, + "step": 1083 + }, + { + "epoch": 2.63, + "grad_norm": 1.101774046585049, + "importance_ratio": 0.7890625, + "kl_div": -0.11028125882148743, + "kl_div_neg": -0.23718704283237457, + "kl_div_sft": 0.01662452146410942, + "learning_rate": 2.495921696574225e-07, + "loss": 0.0535, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.023163825273513794, + "step": 1084 + }, + { + "epoch": 2.63, + "grad_norm": 3.775540541796825, + "importance_ratio": 1.03125, + "kl_div": 0.03306008130311966, + "kl_div_pos": 0.03152577951550484, + "kl_div_sft": 0.03459438309073448, + "learning_rate": 2.479608482871126e-07, + "loss": -0.0828, + "ppo_loss": -1.0320279598236084, + "sft_loss": 0.06501305103302002, + "step": 1085 + }, + { + "epoch": 2.63, + "grad_norm": 1.3828568101238494, + "importance_ratio": 0.53515625, + "kl_div": -0.3016151189804077, + "kl_div_neg": -0.6263535022735596, + "kl_div_sft": 0.023123271763324738, + "learning_rate": 2.463295269168026e-07, + "loss": 0.0381, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.024348098784685135, + "step": 1086 + }, + { + "epoch": 2.64, + "grad_norm": 0.886695424583937, + "importance_ratio": 0.63671875, + "kl_div": -0.22029554843902588, + "kl_div_neg": -0.45359790325164795, + "kl_div_sft": 0.013006805442273617, + "learning_rate": 2.4469820554649265e-07, + "loss": -0.0932, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.028095100075006485, + "step": 1087 + }, + { + "epoch": 2.64, + "grad_norm": 1.9189425169946241, + "importance_ratio": 1.0546875, + "kl_div": 0.030528422445058823, + "kl_div_pos": 0.05292826145887375, + "kl_div_sft": 0.008128583431243896, + "learning_rate": 2.430668841761827e-07, + "loss": -0.012, + "ppo_loss": -1.0543540716171265, + "sft_loss": 0.0633447915315628, + "step": 1088 + }, + { + "epoch": 2.64, + "grad_norm": 0.5676929460125901, + "kl_div": 0.0012860526330769062, + "kl_div_sft": 0.0012860526330769062, + "learning_rate": 2.414355628058727e-07, + "loss": -0.0232, + "sft_loss": 0.0832691341638565, + "step": 1089 + }, + { + "epoch": 2.64, + "grad_norm": 0.6963825136367197, + "importance_ratio": 0.71484375, + "kl_div": -0.15527908504009247, + "kl_div_neg": -0.3331499993801117, + "kl_div_sft": 0.02259182743728161, + "learning_rate": 2.398042414355628e-07, + "loss": -0.1482, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03868754953145981, + "step": 1090 + }, + { + "epoch": 2.64, + "grad_norm": 0.9400337629646386, + "importance_ratio": 0.765625, + "kl_div": -0.1231977567076683, + "kl_div_neg": -0.2665286362171173, + "kl_div_sft": 0.020133126527071, + "learning_rate": 2.3817292006525282e-07, + "loss": 0.006, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0708237737417221, + "step": 1091 + }, + { + "epoch": 2.65, + "grad_norm": 0.925268812040375, + "kl_div": 0.008872696198523045, + "kl_div_sft": 0.008872696198523045, + "learning_rate": 2.3654159869494289e-07, + "loss": -0.1638, + "sft_loss": 0.05794968456029892, + "step": 1092 + }, + { + "epoch": 2.65, + "grad_norm": 1.1531785430469386, + "importance_ratio": 0.7890625, + "kl_div": -0.3047209680080414, + "kl_div_neg": -0.6697598099708557, + "kl_div_pos": 0.06031789258122444, + "learning_rate": 2.3491027732463295e-07, + "loss": -0.1041, + "ppo_loss": -0.13108709454536438, + "step": 1093 + }, + { + "epoch": 2.65, + "grad_norm": 0.7584735700520077, + "importance_ratio": 0.78515625, + "kl_div": -0.12317772209644318, + "kl_div_neg": -0.24293836951255798, + "kl_div_sft": -0.0034170723520219326, + "learning_rate": 2.33278955954323e-07, + "loss": -0.0657, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06922072917222977, + "step": 1094 + }, + { + "epoch": 2.65, + "grad_norm": 0.6423110406852908, + "importance_ratio": 0.6953125, + "kl_div": -0.3657418191432953, + "kl_div_neg": -0.3657418191432953, + "learning_rate": 2.3164763458401305e-07, + "loss": 0.098, + "ppo_loss": 0.800000011920929, + "step": 1095 + }, + { + "epoch": 2.66, + "grad_norm": 0.870868807774345, + "kl_div": 0.016433026641607285, + "kl_div_sft": 0.016433026641607285, + "learning_rate": 2.300163132137031e-07, + "loss": -0.0642, + "sft_loss": 0.029268991202116013, + "step": 1096 + }, + { + "epoch": 2.66, + "grad_norm": 0.8481605541448505, + "importance_ratio": 1.046875, + "kl_div": 0.03731922805309296, + "kl_div_pos": 0.043029073625802994, + "kl_div_sft": 0.03160938620567322, + "learning_rate": 2.2838499184339312e-07, + "loss": -0.0741, + "ppo_loss": -1.0439682006835938, + "sft_loss": 0.012241056188941002, + "step": 1097 + }, + { + "epoch": 2.66, + "grad_norm": 1.1204147675255658, + "importance_ratio": 0.98046875, + "kl_div": 0.0020015956833958626, + "kl_div_pos": -0.018063034862279892, + "kl_div_sft": 0.022066226229071617, + "learning_rate": 2.267536704730832e-07, + "loss": 0.0903, + "ppo_loss": -0.9820991158485413, + "sft_loss": 0.12652084231376648, + "step": 1098 + }, + { + "epoch": 2.66, + "grad_norm": 1.0467702898637867, + "importance_ratio": 0.875, + "kl_div": -0.15396293997764587, + "kl_div_neg": -0.36044037342071533, + "kl_div_pos": 0.05251449719071388, + "learning_rate": 2.2512234910277323e-07, + "loss": -0.0469, + "ppo_loss": -0.1269589364528656, + "step": 1099 + }, + { + "epoch": 2.67, + "grad_norm": 1.2571962161784893, + "importance_ratio": 0.74609375, + "kl_div": -0.160195454955101, + "kl_div_neg": -0.2913189232349396, + "kl_div_sft": -0.029071999713778496, + "learning_rate": 2.234910277324633e-07, + "loss": -0.0143, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.09783211350440979, + "step": 1100 + }, + { + "epoch": 2.67, + "grad_norm": 1.1750347010581415, + "kl_div": 0.014914116822183132, + "kl_div_sft": 0.014914116822183132, + "learning_rate": 2.2185970636215335e-07, + "loss": -0.0397, + "sft_loss": 0.04734306409955025, + "step": 1101 + }, + { + "epoch": 2.67, + "grad_norm": 0.8239253457495405, + "importance_ratio": 1.078125, + "kl_div": 0.05389084294438362, + "kl_div_pos": 0.07796481251716614, + "kl_div_sft": 0.029816875234246254, + "learning_rate": 2.2022838499184336e-07, + "loss": -0.0211, + "ppo_loss": -1.0810847282409668, + "sft_loss": 0.018815357238054276, + "step": 1102 + }, + { + "epoch": 2.67, + "grad_norm": 0.8150976178735944, + "importance_ratio": 0.7578125, + "kl_div": -0.2764035165309906, + "kl_div_neg": -0.2764035165309906, + "learning_rate": 2.1859706362153343e-07, + "loss": 0.0045, + "ppo_loss": 0.8088388442993164, + "step": 1103 + }, + { + "epoch": 2.68, + "grad_norm": 0.6921501252910377, + "kl_div": 0.01970837637782097, + "kl_div_sft": 0.01970837637782097, + "learning_rate": 2.169657422512235e-07, + "loss": -0.0667, + "sft_loss": 0.04228641837835312, + "step": 1104 + }, + { + "epoch": 2.68, + "grad_norm": 0.665915428522638, + "importance_ratio": 1.0390625, + "kl_div": 0.00600157305598259, + "kl_div_pos": 0.036812517791986465, + "kl_div_sft": -0.024809371680021286, + "learning_rate": 2.1533442088091353e-07, + "loss": -0.0151, + "ppo_loss": -1.0374984741210938, + "sft_loss": 0.1946043074131012, + "step": 1105 + }, + { + "epoch": 2.68, + "grad_norm": 1.555084484540579, + "importance_ratio": 0.66015625, + "kl_div": -0.21379652619361877, + "kl_div_neg": -0.4146406650543213, + "kl_div_sft": -0.012952383607625961, + "learning_rate": 2.137030995106036e-07, + "loss": -0.205, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.11993511021137238, + "step": 1106 + }, + { + "epoch": 2.68, + "grad_norm": 0.9577427301930038, + "importance_ratio": 0.8984375, + "kl_div": -0.1193065196275711, + "kl_div_neg": -0.26879850029945374, + "kl_div_pos": 0.03018546849489212, + "learning_rate": 2.1207177814029363e-07, + "loss": -0.1508, + "ppo_loss": -0.11532279849052429, + "step": 1107 + }, + { + "epoch": 2.69, + "grad_norm": 0.9782622841235084, + "kl_div": 0.007449622265994549, + "kl_div_sft": 0.007449622265994549, + "learning_rate": 2.1044045676998366e-07, + "loss": -0.0685, + "sft_loss": 0.06773808598518372, + "step": 1108 + }, + { + "epoch": 2.69, + "grad_norm": 0.9088608195909383, + "importance_ratio": 0.75390625, + "kl_div": -0.28366923332214355, + "kl_div_neg": -0.28366923332214355, + "learning_rate": 2.0880913539967373e-07, + "loss": -0.0054, + "ppo_loss": 0.800000011920929, + "step": 1109 + }, + { + "epoch": 2.69, + "grad_norm": 1.9121598732877862, + "importance_ratio": 0.8984375, + "kl_div": -0.12701398134231567, + "kl_div_neg": -0.345787912607193, + "kl_div_pos": 0.09175995737314224, + "learning_rate": 2.0717781402936376e-07, + "loss": -0.1056, + "ppo_loss": -0.14805081486701965, + "step": 1110 + }, + { + "epoch": 2.69, + "grad_norm": 0.9042432204739338, + "importance_ratio": 1.0625, + "kl_div": 0.05639778822660446, + "kl_div_pos": 0.05639778822660446, + "learning_rate": 2.0554649265905383e-07, + "loss": -0.0823, + "ppo_loss": -1.0584442615509033, + "step": 1111 + }, + { + "epoch": 2.7, + "grad_norm": 0.8756164014504375, + "importance_ratio": 0.765625, + "kl_div": -0.12315031886100769, + "kl_div_neg": -0.26553070545196533, + "kl_div_sft": 0.0192300695925951, + "learning_rate": 2.039151712887439e-07, + "loss": -0.1084, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.049898453056812286, + "step": 1112 + }, + { + "epoch": 2.7, + "grad_norm": 1.6528490524880572, + "importance_ratio": 1.0234375, + "kl_div": 0.02359095588326454, + "kl_div_pos": 0.02359095588326454, + "learning_rate": 2.0228384991843393e-07, + "loss": -0.0684, + "ppo_loss": -1.0239702463150024, + "step": 1113 + }, + { + "epoch": 2.7, + "grad_norm": 0.8415497596362785, + "kl_div": -0.02142334170639515, + "kl_div_sft": -0.02142334170639515, + "learning_rate": 2.0065252854812397e-07, + "loss": -0.1327, + "sft_loss": 0.08445389568805695, + "step": 1114 + }, + { + "epoch": 2.7, + "grad_norm": 1.3523021724662885, + "importance_ratio": 1.0625, + "kl_div": 0.03919368237257004, + "kl_div_pos": 0.062300436198711395, + "kl_div_sft": 0.016086924821138382, + "learning_rate": 1.99021207177814e-07, + "loss": -0.0498, + "ppo_loss": -1.0642820596694946, + "sft_loss": 0.03224463015794754, + "step": 1115 + }, + { + "epoch": 2.71, + "grad_norm": 1.5138710333950391, + "importance_ratio": 1.09375, + "kl_div": 0.04405112564563751, + "kl_div_pos": 0.08768067508935928, + "kl_div_sft": 0.00042157687130384147, + "learning_rate": 1.9738988580750407e-07, + "loss": 0.0862, + "ppo_loss": -1.091639518737793, + "sft_loss": 0.12167085707187653, + "step": 1116 + }, + { + "epoch": 2.71, + "grad_norm": 0.9421302575583053, + "importance_ratio": 0.90625, + "kl_div": -0.11986620724201202, + "kl_div_neg": -0.3113791048526764, + "kl_div_pos": 0.07164669781923294, + "learning_rate": 1.9575856443719413e-07, + "loss": 0.0002, + "ppo_loss": -0.13713786005973816, + "step": 1117 + }, + { + "epoch": 2.71, + "grad_norm": 1.4199391695881942, + "kl_div": 0.003954947926104069, + "kl_div_sft": 0.003954947926104069, + "learning_rate": 1.9412724306688417e-07, + "loss": -0.0118, + "sft_loss": 0.044476695358753204, + "step": 1118 + }, + { + "epoch": 2.71, + "grad_norm": 0.6698292481917263, + "kl_div": 0.01963566057384014, + "kl_div_sft": 0.01963566057384014, + "learning_rate": 1.9249592169657423e-07, + "loss": -0.0074, + "sft_loss": 0.04536540061235428, + "step": 1119 + }, + { + "epoch": 2.72, + "grad_norm": 0.6886097880573838, + "kl_div": -0.004337035119533539, + "kl_div_sft": -0.004337035119533539, + "learning_rate": 1.9086460032626424e-07, + "loss": 0.0435, + "sft_loss": 0.08166567981243134, + "step": 1120 + }, + { + "epoch": 2.72, + "grad_norm": 1.5491112622221588, + "importance_ratio": 0.796875, + "kl_div": -0.227139413356781, + "kl_div_neg": -0.227139413356781, + "learning_rate": 1.892332789559543e-07, + "loss": -0.1024, + "ppo_loss": 0.8080419898033142, + "step": 1121 + }, + { + "epoch": 2.72, + "grad_norm": 0.9438899662739387, + "importance_ratio": 1.0625, + "kl_div": 0.05969712883234024, + "kl_div_pos": 0.05969712883234024, + "learning_rate": 1.8760195758564437e-07, + "loss": -0.0487, + "ppo_loss": -1.0615639686584473, + "step": 1122 + }, + { + "epoch": 2.72, + "grad_norm": 1.2169227596211318, + "importance_ratio": 0.5390625, + "kl_div": -0.30277031660079956, + "kl_div_neg": -0.6170855760574341, + "kl_div_sft": 0.011544971726834774, + "learning_rate": 1.859706362153344e-07, + "loss": 0.0772, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.040595829486846924, + "step": 1123 + }, + { + "epoch": 2.72, + "grad_norm": 1.4201241917097076, + "importance_ratio": 1.046875, + "kl_div": 0.03995908051729202, + "kl_div_pos": 0.03995908051729202, + "learning_rate": 1.8433931484502447e-07, + "loss": -0.0486, + "ppo_loss": -1.0409839153289795, + "step": 1124 + }, + { + "epoch": 2.73, + "grad_norm": 0.9788433447473673, + "kl_div": -0.01730550080537796, + "kl_div_sft": -0.01730550080537796, + "learning_rate": 1.8270799347471453e-07, + "loss": -0.0301, + "sft_loss": 0.08677445352077484, + "step": 1125 + }, + { + "epoch": 2.73, + "grad_norm": 0.5361937330199169, + "kl_div": -0.005295942537486553, + "kl_div_sft": -0.005295942537486553, + "learning_rate": 1.8107667210440454e-07, + "loss": 0.0529, + "sft_loss": 0.06039096787571907, + "step": 1126 + }, + { + "epoch": 2.73, + "grad_norm": 0.724901659642847, + "importance_ratio": 0.703125, + "kl_div": -0.35365772247314453, + "kl_div_neg": -0.35365772247314453, + "learning_rate": 1.794453507340946e-07, + "loss": -0.0044, + "ppo_loss": 0.800000011920929, + "step": 1127 + }, + { + "epoch": 2.73, + "grad_norm": 1.3223456805067588, + "importance_ratio": 1.0234375, + "kl_div": 0.022419404238462448, + "kl_div_pos": 0.026006117463111877, + "kl_div_sft": 0.01883268915116787, + "learning_rate": 1.7781402936378464e-07, + "loss": -0.0079, + "ppo_loss": -1.026347279548645, + "sft_loss": 0.11715763062238693, + "step": 1128 + }, + { + "epoch": 2.74, + "grad_norm": 1.3684236006674422, + "importance_ratio": 1.046875, + "kl_div": 0.04095806926488876, + "kl_div_pos": 0.04559039697051048, + "kl_div_sft": 0.03632574528455734, + "learning_rate": 1.761827079934747e-07, + "loss": -0.2562, + "ppo_loss": -1.0466456413269043, + "sft_loss": 0.033348001539707184, + "step": 1129 + }, + { + "epoch": 2.74, + "grad_norm": 2.0941610359857985, + "importance_ratio": 0.796875, + "kl_div": -0.10481736809015274, + "kl_div_neg": -0.22943493723869324, + "kl_div_sft": 0.019800204783678055, + "learning_rate": 1.7455138662316477e-07, + "loss": -0.0176, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07383913546800613, + "step": 1130 + }, + { + "epoch": 2.74, + "grad_norm": 1.869183610603224, + "importance_ratio": 0.82421875, + "kl_div": -0.08802182972431183, + "kl_div_neg": -0.19563773274421692, + "kl_div_sft": 0.019594065845012665, + "learning_rate": 1.729200652528548e-07, + "loss": -0.1363, + "ppo_loss": 0.8223100900650024, + "sft_loss": 0.026904089376330376, + "step": 1131 + }, + { + "epoch": 2.74, + "grad_norm": 1.0702346996237544, + "kl_div": 0.005095012951642275, + "kl_div_sft": 0.005095012951642275, + "learning_rate": 1.7128874388254484e-07, + "loss": 0.0549, + "sft_loss": 0.055672984570264816, + "step": 1132 + }, + { + "epoch": 2.75, + "grad_norm": 0.6563738978457855, + "importance_ratio": 1.0234375, + "kl_div": 0.01523815467953682, + "kl_div_pos": 0.02387329190969467, + "kl_div_sft": 0.006603018380701542, + "learning_rate": 1.696574225122349e-07, + "loss": 0.0573, + "ppo_loss": -1.0241605043411255, + "sft_loss": 0.07252994179725647, + "step": 1133 + }, + { + "epoch": 2.75, + "grad_norm": 0.9554710905930678, + "importance_ratio": 1.0625, + "kl_div": 0.04103770852088928, + "kl_div_pos": 0.05817992612719536, + "kl_div_sft": 0.023895489051938057, + "learning_rate": 1.6802610114192494e-07, + "loss": -0.1059, + "ppo_loss": -1.059905767440796, + "sft_loss": 0.02852977253496647, + "step": 1134 + }, + { + "epoch": 2.75, + "grad_norm": 1.332576114491777, + "importance_ratio": 0.76171875, + "kl_div": -0.27210551500320435, + "kl_div_neg": -0.27210551500320435, + "learning_rate": 1.66394779771615e-07, + "loss": 0.0088, + "ppo_loss": 0.800000011920929, + "step": 1135 + }, + { + "epoch": 2.75, + "grad_norm": 1.3744154467839322, + "importance_ratio": 0.890625, + "kl_div": -0.12652842700481415, + "kl_div_neg": -0.26943251490592957, + "kl_div_pos": 0.01637565903365612, + "learning_rate": 1.6476345840130504e-07, + "loss": 0.0322, + "ppo_loss": -0.10825523734092712, + "step": 1136 + }, + { + "epoch": 2.76, + "grad_norm": 0.6300766284449495, + "kl_div": 0.012872161343693733, + "kl_div_sft": 0.012872161343693733, + "learning_rate": 1.631321370309951e-07, + "loss": 0.009, + "sft_loss": 0.09325136244297028, + "step": 1137 + }, + { + "epoch": 2.76, + "grad_norm": 0.7215131224154683, + "importance_ratio": 1.0390625, + "kl_div": 0.03919503092765808, + "kl_div_pos": 0.03919503092765808, + "learning_rate": 1.6150081566068514e-07, + "loss": -0.1359, + "ppo_loss": -1.0400731563568115, + "step": 1138 + }, + { + "epoch": 2.76, + "grad_norm": 0.5619121234583224, + "importance_ratio": 0.6328125, + "kl_div": -0.46213510632514954, + "kl_div_neg": -0.46213510632514954, + "learning_rate": 1.5986949429037518e-07, + "loss": -0.0543, + "ppo_loss": 0.800000011920929, + "step": 1139 + }, + { + "epoch": 2.76, + "grad_norm": 0.734594961461299, + "importance_ratio": 0.44140625, + "kl_div": -0.40974512696266174, + "kl_div_neg": -0.8175477385520935, + "kl_div_sft": -0.001942517701536417, + "learning_rate": 1.5823817292006525e-07, + "loss": -0.035, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06181861087679863, + "step": 1140 + }, + { + "epoch": 2.77, + "grad_norm": 0.7942525309137227, + "importance_ratio": 0.76953125, + "kl_div": -0.12769439816474915, + "kl_div_neg": -0.2644917666912079, + "kl_div_sft": 0.009102970361709595, + "learning_rate": 1.5660685154975528e-07, + "loss": -0.0329, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03120669350028038, + "step": 1141 + }, + { + "epoch": 2.77, + "grad_norm": 0.9624103950306931, + "importance_ratio": 0.73828125, + "kl_div": -0.14747409522533417, + "kl_div_neg": -0.3031984865665436, + "kl_div_sft": 0.008250309154391289, + "learning_rate": 1.5497553017944535e-07, + "loss": 0.0108, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05386658012866974, + "step": 1142 + }, + { + "epoch": 2.77, + "grad_norm": 1.5596566077849165, + "kl_div": 0.01757156103849411, + "kl_div_sft": 0.01757156103849411, + "learning_rate": 1.533442088091354e-07, + "loss": 0.0472, + "sft_loss": 0.0313275121152401, + "step": 1143 + }, + { + "epoch": 2.77, + "grad_norm": 1.224527943411857, + "kl_div": -0.005035087466239929, + "kl_div_sft": -0.005035087466239929, + "learning_rate": 1.5171288743882542e-07, + "loss": 0.0123, + "sft_loss": 0.08055819571018219, + "step": 1144 + }, + { + "epoch": 2.78, + "grad_norm": 0.7722177617045625, + "kl_div": 0.0151774100959301, + "kl_div_sft": 0.0151774100959301, + "learning_rate": 1.5008156606851548e-07, + "loss": -0.0686, + "sft_loss": 0.038552433252334595, + "step": 1145 + }, + { + "epoch": 2.78, + "grad_norm": 0.8718422381080544, + "importance_ratio": 0.73046875, + "kl_div": -0.15404218435287476, + "kl_div_neg": -0.31161925196647644, + "kl_div_sft": 0.003534871619194746, + "learning_rate": 1.4845024469820555e-07, + "loss": 0.0104, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07242981344461441, + "step": 1146 + }, + { + "epoch": 2.78, + "grad_norm": 2.0345420569579686, + "importance_ratio": 1.03125, + "kl_div": 0.031925879418849945, + "kl_div_pos": 0.031925879418849945, + "learning_rate": 1.4681892332789558e-07, + "loss": 0.0601, + "ppo_loss": -1.0334405899047852, + "step": 1147 + }, + { + "epoch": 2.78, + "grad_norm": 1.0711314833969363, + "importance_ratio": 1.046875, + "kl_div": 0.04982160031795502, + "kl_div_pos": 0.04982160031795502, + "learning_rate": 1.4518760195758565e-07, + "loss": -0.119, + "ppo_loss": -1.051084041595459, + "step": 1148 + }, + { + "epoch": 2.79, + "grad_norm": 1.0689223234219718, + "importance_ratio": 0.703125, + "kl_div": -0.16888612508773804, + "kl_div_neg": -0.35433754324913025, + "kl_div_sft": 0.01656527817249298, + "learning_rate": 1.4355628058727568e-07, + "loss": -0.0845, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.03223303705453873, + "step": 1149 + }, + { + "epoch": 2.79, + "grad_norm": 1.0808209973888399, + "kl_div": -0.01934581622481346, + "kl_div_sft": -0.01934581622481346, + "learning_rate": 1.4192495921696572e-07, + "loss": -0.2052, + "sft_loss": 0.07561028003692627, + "step": 1150 + }, + { + "epoch": 2.79, + "grad_norm": 1.2129563269273218, + "kl_div": 0.01769608072936535, + "kl_div_sft": 0.01769608072936535, + "learning_rate": 1.4029363784665578e-07, + "loss": -0.0735, + "sft_loss": 0.04179126024246216, + "step": 1151 + }, + { + "epoch": 2.79, + "grad_norm": 0.6347059955465805, + "kl_div": 0.027414832264184952, + "kl_div_sft": 0.027414832264184952, + "learning_rate": 1.3866231647634582e-07, + "loss": 0.0408, + "sft_loss": 0.03668251633644104, + "step": 1152 + }, + { + "epoch": 2.8, + "grad_norm": 0.7479490609622703, + "importance_ratio": 0.82421875, + "kl_div": -0.11562100797891617, + "kl_div_neg": -0.1932021677494049, + "kl_div_sft": -0.03803984820842743, + "learning_rate": 1.3703099510603589e-07, + "loss": -0.0478, + "ppo_loss": 0.8243153095245361, + "sft_loss": 0.0878666415810585, + "step": 1153 + }, + { + "epoch": 2.8, + "grad_norm": 0.8725099213809392, + "importance_ratio": 0.72265625, + "kl_div": -0.15855799615383148, + "kl_div_neg": -0.32498425245285034, + "kl_div_sft": 0.00786825455725193, + "learning_rate": 1.3539967373572595e-07, + "loss": -0.0438, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.044809021055698395, + "step": 1154 + }, + { + "epoch": 2.8, + "grad_norm": 0.8801756783170777, + "importance_ratio": 1.0390625, + "kl_div": 0.009981222450733185, + "kl_div_pos": 0.040560025721788406, + "kl_div_sft": -0.020597580820322037, + "learning_rate": 1.3376835236541599e-07, + "loss": -0.0567, + "ppo_loss": -1.0413938760757446, + "sft_loss": 0.06672003120183945, + "step": 1155 + }, + { + "epoch": 2.8, + "grad_norm": 1.125348355083032, + "importance_ratio": 0.703125, + "kl_div": -0.16469106078147888, + "kl_div_neg": -0.35337555408477783, + "kl_div_sft": 0.02399342507123947, + "learning_rate": 1.3213703099510602e-07, + "loss": 0.0195, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.022216780111193657, + "step": 1156 + }, + { + "epoch": 2.8, + "grad_norm": 0.9578812541834655, + "importance_ratio": 0.578125, + "kl_div": -0.27082908153533936, + "kl_div_neg": -0.5504145622253418, + "kl_div_sft": 0.008756392635405064, + "learning_rate": 1.3050570962479606e-07, + "loss": -0.1345, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.022599216550588608, + "step": 1157 + }, + { + "epoch": 2.81, + "grad_norm": 2.131705419482927, + "importance_ratio": 1.0859375, + "kl_div": 0.04747091233730316, + "kl_div_pos": 0.08328525722026825, + "kl_div_sft": 0.011656570248305798, + "learning_rate": 1.2887438825448612e-07, + "loss": -0.0991, + "ppo_loss": -1.0868518352508545, + "sft_loss": 0.04350946098566055, + "step": 1158 + }, + { + "epoch": 2.81, + "grad_norm": 0.6816832083705758, + "kl_div": 0.005396461579948664, + "kl_div_sft": 0.005396461579948664, + "learning_rate": 1.272430668841762e-07, + "loss": -0.101, + "sft_loss": 0.049411237239837646, + "step": 1159 + }, + { + "epoch": 2.81, + "grad_norm": 0.830705401201232, + "importance_ratio": 0.91015625, + "kl_div": -0.060057319700717926, + "kl_div_pos": -0.09451054036617279, + "kl_div_sft": -0.02560410276055336, + "learning_rate": 1.2561174551386622e-07, + "loss": -0.0946, + "ppo_loss": -0.909818172454834, + "sft_loss": 0.13856370747089386, + "step": 1160 + }, + { + "epoch": 2.81, + "grad_norm": 0.9586862384229029, + "importance_ratio": 0.8671875, + "kl_div": -0.14406529068946838, + "kl_div_neg": -0.2500915229320526, + "kl_div_pos": -0.03803904354572296, + "learning_rate": 1.239804241435563e-07, + "loss": 0.0083, + "ppo_loss": -0.08133766055107117, + "step": 1161 + }, + { + "epoch": 2.82, + "grad_norm": 1.770973947963446, + "importance_ratio": 1.015625, + "kl_div": 0.02230958268046379, + "kl_div_pos": 0.012497110292315483, + "kl_div_sft": 0.03212205320596695, + "learning_rate": 1.2234910277324632e-07, + "loss": -0.0917, + "ppo_loss": -1.0125755071640015, + "sft_loss": 0.05473034456372261, + "step": 1162 + }, + { + "epoch": 2.82, + "grad_norm": 1.893070562366472, + "kl_div": 0.00027730176225304604, + "kl_div_sft": 0.00027730176225304604, + "learning_rate": 1.2071778140293636e-07, + "loss": 0.0764, + "sft_loss": 0.06012003496289253, + "step": 1163 + }, + { + "epoch": 2.82, + "grad_norm": 1.9890327757047541, + "importance_ratio": 1.0703125, + "kl_div": 0.039140187203884125, + "kl_div_pos": 0.06697124987840652, + "kl_div_sft": 0.011309120804071426, + "learning_rate": 1.1908646003262641e-07, + "loss": 0.0611, + "ppo_loss": -1.0692646503448486, + "sft_loss": 0.051590412855148315, + "step": 1164 + }, + { + "epoch": 2.82, + "grad_norm": 1.3363702084433386, + "importance_ratio": 0.66796875, + "kl_div": -0.23694896697998047, + "kl_div_neg": -0.40421345829963684, + "kl_div_sft": -0.0696844831109047, + "learning_rate": 1.1745513866231648e-07, + "loss": 0.0501, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.17454583942890167, + "step": 1165 + }, + { + "epoch": 2.83, + "grad_norm": 1.1762943605658298, + "importance_ratio": 0.78515625, + "kl_div": -0.1164015606045723, + "kl_div_neg": -0.23999521136283875, + "kl_div_sft": 0.007192092947661877, + "learning_rate": 1.1582381729200653e-07, + "loss": -0.1663, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07211599498987198, + "step": 1166 + }, + { + "epoch": 2.83, + "grad_norm": 1.150266682759822, + "kl_div": -0.01800680160522461, + "kl_div_sft": -0.01800680160522461, + "learning_rate": 1.1419249592169656e-07, + "loss": -0.1317, + "sft_loss": 0.0823834165930748, + "step": 1167 + }, + { + "epoch": 2.83, + "grad_norm": 0.8969832883317888, + "importance_ratio": 0.875, + "kl_div": -0.1475740224123001, + "kl_div_neg": -0.3291796147823334, + "kl_div_pos": 0.034031569957733154, + "learning_rate": 1.1256117455138661e-07, + "loss": 0.0277, + "ppo_loss": -0.1173085868358612, + "step": 1168 + }, + { + "epoch": 2.83, + "grad_norm": 1.3658282689545083, + "importance_ratio": 1.03125, + "kl_div": 0.017019763588905334, + "kl_div_pos": 0.028323249891400337, + "kl_div_sft": 0.005716277752071619, + "learning_rate": 1.1092985318107668e-07, + "loss": -0.0856, + "ppo_loss": -1.0287281274795532, + "sft_loss": 0.05883674696087837, + "step": 1169 + }, + { + "epoch": 2.84, + "grad_norm": 1.0474936521970561, + "importance_ratio": 0.875, + "kl_div": -0.15186944603919983, + "kl_div_neg": -0.3259763717651367, + "kl_div_pos": 0.022237488999962807, + "learning_rate": 1.0929853181076671e-07, + "loss": -0.0053, + "ppo_loss": -0.11124327778816223, + "step": 1170 + }, + { + "epoch": 2.84, + "grad_norm": 1.2800057114533314, + "importance_ratio": 0.87109375, + "kl_div": -0.15982398390769958, + "kl_div_neg": -0.36433860659599304, + "kl_div_pos": 0.04469062760472298, + "learning_rate": 1.0766721044045676e-07, + "loss": -0.0283, + "ppo_loss": -0.12285217642784119, + "step": 1171 + }, + { + "epoch": 2.84, + "grad_norm": 0.7458908444258102, + "importance_ratio": 1.0390625, + "kl_div": 0.028081998229026794, + "kl_div_pos": 0.04125192388892174, + "kl_div_sft": 0.014912070706486702, + "learning_rate": 1.0603588907014681e-07, + "loss": -0.1004, + "ppo_loss": -1.0421146154403687, + "sft_loss": 0.02104533091187477, + "step": 1172 + }, + { + "epoch": 2.84, + "grad_norm": 1.7487067484875245, + "importance_ratio": 0.96484375, + "kl_div": -0.03644672781229019, + "kl_div_pos": -0.03644672781229019, + "learning_rate": 1.0440456769983686e-07, + "loss": 0.0135, + "ppo_loss": -0.9646996259689331, + "step": 1173 + }, + { + "epoch": 2.85, + "grad_norm": 1.228644092920269, + "importance_ratio": 0.6640625, + "kl_div": -0.19775894284248352, + "kl_div_neg": -0.4072263538837433, + "kl_div_sft": 0.011708474718034267, + "learning_rate": 1.0277324632952691e-07, + "loss": -0.0193, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.029304752126336098, + "step": 1174 + }, + { + "epoch": 2.85, + "grad_norm": 0.9653485788288345, + "kl_div": 0.010237861424684525, + "kl_div_sft": 0.010237861424684525, + "learning_rate": 1.0114192495921696e-07, + "loss": -0.0696, + "sft_loss": 0.05024181306362152, + "step": 1175 + }, + { + "epoch": 2.85, + "grad_norm": 0.709317341952596, + "kl_div": 0.011810576543211937, + "kl_div_sft": 0.011810576543211937, + "learning_rate": 9.9510603588907e-08, + "loss": 0.0471, + "sft_loss": 0.04906405881047249, + "step": 1176 + }, + { + "epoch": 2.85, + "grad_norm": 1.3330104506470002, + "importance_ratio": 0.8671875, + "kl_div": -0.16925135254859924, + "kl_div_neg": -0.38317427039146423, + "kl_div_pos": 0.04467155039310455, + "learning_rate": 9.787928221859706e-08, + "loss": -0.1224, + "ppo_loss": -0.12284216284751892, + "step": 1177 + }, + { + "epoch": 2.86, + "grad_norm": 0.8009218418169928, + "importance_ratio": 0.67578125, + "kl_div": -0.1921180933713913, + "kl_div_neg": -0.392837256193161, + "kl_div_sft": 0.008601064793765545, + "learning_rate": 9.624796084828712e-08, + "loss": -0.1025, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.009927182458341122, + "step": 1178 + }, + { + "epoch": 2.86, + "grad_norm": 1.1503104164127465, + "importance_ratio": 1.0234375, + "kl_div": 0.020346002653241158, + "kl_div_pos": 0.024633346125483513, + "kl_div_sft": 0.016058659180998802, + "learning_rate": 9.461663947797715e-08, + "loss": -0.1081, + "ppo_loss": -1.0249391794204712, + "sft_loss": 0.03225059062242508, + "step": 1179 + }, + { + "epoch": 2.86, + "grad_norm": 0.639340648418175, + "kl_div": 0.011185074225068092, + "kl_div_sft": 0.011185074225068092, + "learning_rate": 9.29853181076672e-08, + "loss": 0.0115, + "sft_loss": 0.06356431543827057, + "step": 1180 + }, + { + "epoch": 2.86, + "grad_norm": 1.3975395004613398, + "importance_ratio": 0.8203125, + "kl_div": -0.08465030789375305, + "kl_div_neg": -0.2001633197069168, + "kl_div_sft": 0.030862705782055855, + "learning_rate": 9.135399673735727e-08, + "loss": -0.1217, + "ppo_loss": 0.8185970187187195, + "sft_loss": 0.060302335768938065, + "step": 1181 + }, + { + "epoch": 2.87, + "grad_norm": 0.7561947573827391, + "importance_ratio": 0.9921875, + "kl_div": -0.007614566013216972, + "kl_div_pos": -0.007614566013216972, + "learning_rate": 8.97226753670473e-08, + "loss": -0.0295, + "ppo_loss": -0.9930227994918823, + "step": 1182 + }, + { + "epoch": 2.87, + "grad_norm": 1.567095217360077, + "importance_ratio": 0.7421875, + "kl_div": -0.13958537578582764, + "kl_div_neg": -0.30015498399734497, + "kl_div_sft": 0.020984219387173653, + "learning_rate": 8.809135399673735e-08, + "loss": -0.0063, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04172850027680397, + "step": 1183 + }, + { + "epoch": 2.87, + "grad_norm": 0.8954740661452641, + "importance_ratio": 0.73828125, + "kl_div": -0.14187641441822052, + "kl_div_neg": -0.3050064742565155, + "kl_div_sft": 0.021253643557429314, + "learning_rate": 8.64600326264274e-08, + "loss": -0.1237, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.05894254520535469, + "step": 1184 + }, + { + "epoch": 2.87, + "grad_norm": 1.0250840082865402, + "importance_ratio": 1.03125, + "kl_div": 0.01920795626938343, + "kl_div_pos": 0.030632779002189636, + "kl_div_sft": 0.007783134467899799, + "learning_rate": 8.482871125611745e-08, + "loss": -0.0527, + "ppo_loss": -1.0311068296432495, + "sft_loss": 0.08948075771331787, + "step": 1185 + }, + { + "epoch": 2.88, + "grad_norm": 0.9286371917807487, + "importance_ratio": 0.8359375, + "kl_div": -0.2051847279071808, + "kl_div_neg": -0.43950650095939636, + "kl_div_pos": 0.029137054458260536, + "learning_rate": 8.31973898858075e-08, + "loss": -0.0859, + "ppo_loss": -0.11478284001350403, + "step": 1186 + }, + { + "epoch": 2.88, + "grad_norm": 0.9022022483156672, + "importance_ratio": 0.76953125, + "kl_div": -0.13229966163635254, + "kl_div_neg": -0.2619016170501709, + "kl_div_sft": -0.0026977057568728924, + "learning_rate": 8.156606851549755e-08, + "loss": 0.0617, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.0610504224896431, + "step": 1187 + }, + { + "epoch": 2.88, + "grad_norm": 1.2345106798762973, + "importance_ratio": 0.7421875, + "kl_div": -0.2970220446586609, + "kl_div_neg": -0.2970220446586609, + "learning_rate": 7.993474714518759e-08, + "loss": -0.019, + "ppo_loss": 0.800000011920929, + "step": 1188 + }, + { + "epoch": 2.88, + "grad_norm": 0.6406334667111523, + "importance_ratio": 1.0703125, + "kl_div": 0.04250887781381607, + "kl_div_pos": 0.06475761532783508, + "kl_div_sft": 0.020260144025087357, + "learning_rate": 7.830342577487764e-08, + "loss": -0.1853, + "ppo_loss": -1.066900372505188, + "sft_loss": 0.05050653591752052, + "step": 1189 + }, + { + "epoch": 2.88, + "grad_norm": 0.6454048940357756, + "importance_ratio": 0.78125, + "kl_div": -0.1214267909526825, + "kl_div_neg": -0.24888797104358673, + "kl_div_sft": 0.0060343933291733265, + "learning_rate": 7.66721044045677e-08, + "loss": 0.0587, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.07454501837491989, + "step": 1190 + }, + { + "epoch": 2.89, + "grad_norm": 0.6479468040162593, + "importance_ratio": 1.03125, + "kl_div": 0.018266797065734863, + "kl_div_pos": 0.028083480894565582, + "kl_div_sft": 0.008450115099549294, + "learning_rate": 7.504078303425774e-08, + "loss": 0.0932, + "ppo_loss": -1.0284816026687622, + "sft_loss": 0.033145107328891754, + "step": 1191 + }, + { + "epoch": 2.89, + "grad_norm": 0.9432509460687126, + "importance_ratio": 1.046875, + "kl_div": 0.03311854973435402, + "kl_div_pos": 0.04216361045837402, + "kl_div_sft": 0.024073489010334015, + "learning_rate": 7.340946166394779e-08, + "loss": -0.0441, + "ppo_loss": -1.0430651903152466, + "sft_loss": 0.02739499695599079, + "step": 1192 + }, + { + "epoch": 2.89, + "grad_norm": 1.4257255647837281, + "importance_ratio": 0.8984375, + "kl_div": -0.12912996113300323, + "kl_div_neg": -0.341534823179245, + "kl_div_pos": 0.08327490836381912, + "learning_rate": 7.177814029363784e-08, + "loss": 0.0506, + "ppo_loss": -0.1434202492237091, + "step": 1193 + }, + { + "epoch": 2.89, + "grad_norm": 0.9792687721313257, + "importance_ratio": 0.859375, + "kl_div": -0.16656380891799927, + "kl_div_neg": -0.3459399938583374, + "kl_div_pos": 0.012812378816306591, + "learning_rate": 7.014681892332789e-08, + "loss": -0.09, + "ppo_loss": -0.10644736886024475, + "step": 1194 + }, + { + "epoch": 2.9, + "grad_norm": 0.9705135229368407, + "importance_ratio": 1.0390625, + "kl_div": 0.0385335236787796, + "kl_div_pos": 0.0385335236787796, + "learning_rate": 6.851549755301794e-08, + "loss": -0.0764, + "ppo_loss": -1.0395358800888062, + "step": 1195 + }, + { + "epoch": 2.9, + "grad_norm": 0.9032225849796234, + "kl_div": 0.015084541402757168, + "kl_div_sft": 0.015084541402757168, + "learning_rate": 6.688417618270799e-08, + "loss": -0.0185, + "sft_loss": 0.08808805048465729, + "step": 1196 + }, + { + "epoch": 2.9, + "grad_norm": 1.0286034812739275, + "importance_ratio": 1.046875, + "kl_div": 0.043941110372543335, + "kl_div_pos": 0.047929711639881134, + "kl_div_sft": 0.03995250537991524, + "learning_rate": 6.525285481239803e-08, + "loss": -0.1544, + "ppo_loss": -1.049096941947937, + "sft_loss": 0.08652502298355103, + "step": 1197 + }, + { + "epoch": 2.9, + "grad_norm": 2.1520358936833315, + "kl_div": -0.003481715451925993, + "kl_div_sft": -0.003481715451925993, + "learning_rate": 6.36215334420881e-08, + "loss": -0.1079, + "sft_loss": 0.06440573185682297, + "step": 1198 + }, + { + "epoch": 2.91, + "grad_norm": 0.9205021738858639, + "kl_div": 0.01033596321940422, + "kl_div_sft": 0.01033596321940422, + "learning_rate": 6.199021207177814e-08, + "loss": -0.1552, + "sft_loss": 0.06722992658615112, + "step": 1199 + }, + { + "epoch": 2.91, + "grad_norm": 1.7470641714449784, + "kl_div": 0.005851946771144867, + "kl_div_sft": 0.005851946771144867, + "learning_rate": 6.035889070146818e-08, + "loss": -0.0059, + "sft_loss": 0.033399973064661026, + "step": 1200 + }, + { + "epoch": 2.91, + "grad_norm": 0.6236003462276024, + "kl_div": 0.014642293564975262, + "kl_div_sft": 0.014642293564975262, + "learning_rate": 5.872756933115824e-08, + "loss": 0.0587, + "sft_loss": 0.07679164409637451, + "step": 1201 + }, + { + "epoch": 2.91, + "grad_norm": 1.2955535222683352, + "importance_ratio": 0.9375, + "kl_div": -0.07583994418382645, + "kl_div_neg": -0.19484470784664154, + "kl_div_pos": 0.043164823204278946, + "learning_rate": 5.709624796084828e-08, + "loss": -0.0482, + "ppo_loss": -0.11057373881340027, + "step": 1202 + }, + { + "epoch": 2.92, + "grad_norm": 1.074403115837226, + "kl_div": -0.00038597348611801863, + "kl_div_sft": -0.00038597348611801863, + "learning_rate": 5.546492659053834e-08, + "loss": 0.0345, + "sft_loss": 0.0903572291135788, + "step": 1203 + }, + { + "epoch": 2.92, + "grad_norm": 0.7845828491455248, + "kl_div": 0.019885778427124023, + "kl_div_sft": 0.019885778427124023, + "learning_rate": 5.383360522022838e-08, + "loss": -0.0814, + "sft_loss": 0.0418141670525074, + "step": 1204 + }, + { + "epoch": 2.92, + "grad_norm": 2.3863401221896203, + "importance_ratio": 1.0546875, + "kl_div": 0.032779138535261154, + "kl_div_pos": 0.051422979682683945, + "kl_div_sft": 0.014135295525193214, + "learning_rate": 5.220228384991843e-08, + "loss": 0.0782, + "ppo_loss": -1.0527681112289429, + "sft_loss": 0.04892640933394432, + "step": 1205 + }, + { + "epoch": 2.92, + "grad_norm": 0.771678504180194, + "kl_div": 0.01562921702861786, + "kl_div_sft": 0.01562921702861786, + "learning_rate": 5.057096247960848e-08, + "loss": -0.1562, + "sft_loss": 0.02873784676194191, + "step": 1206 + }, + { + "epoch": 2.93, + "grad_norm": 0.941453734660512, + "kl_div": -0.0010949738789349794, + "kl_div_sft": -0.0010949738789349794, + "learning_rate": 4.893964110929853e-08, + "loss": -0.0747, + "sft_loss": 0.08132661879062653, + "step": 1207 + }, + { + "epoch": 2.93, + "grad_norm": 1.0532282023135735, + "importance_ratio": 0.8984375, + "kl_div": -0.05000169575214386, + "kl_div_pos": -0.10559673607349396, + "kl_div_sft": 0.005593346897512674, + "learning_rate": 4.7308319738988576e-08, + "loss": -0.2808, + "ppo_loss": -0.8997873663902283, + "sft_loss": 0.11342725902795792, + "step": 1208 + }, + { + "epoch": 2.93, + "grad_norm": 1.1130491315891495, + "kl_div": 0.005920859519392252, + "kl_div_sft": 0.005920859519392252, + "learning_rate": 4.567699836867863e-08, + "loss": -0.0926, + "sft_loss": 0.04379911348223686, + "step": 1209 + }, + { + "epoch": 2.93, + "grad_norm": 1.3022770343390342, + "importance_ratio": 1.1015625, + "kl_div": 0.04200161620974541, + "kl_div_pos": 0.09724302589893341, + "kl_div_sft": -0.013239794410765171, + "learning_rate": 4.4045676998368676e-08, + "loss": -0.088, + "ppo_loss": -1.102128267288208, + "sft_loss": 0.09195703268051147, + "step": 1210 + }, + { + "epoch": 2.94, + "grad_norm": 1.2349028605509618, + "importance_ratio": 0.76953125, + "kl_div": -0.12613043189048767, + "kl_div_neg": -0.2621628940105438, + "kl_div_sft": 0.009902019053697586, + "learning_rate": 4.241435562805873e-08, + "loss": 0.0338, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.13322798907756805, + "step": 1211 + }, + { + "epoch": 2.94, + "grad_norm": 0.8190554478591701, + "kl_div": 0.016479335725307465, + "kl_div_sft": 0.016479335725307465, + "learning_rate": 4.078303425774878e-08, + "loss": -0.0813, + "sft_loss": 0.06707711517810822, + "step": 1212 + }, + { + "epoch": 2.94, + "grad_norm": 0.6853244344539269, + "kl_div": 0.002904551802203059, + "kl_div_sft": 0.002904551802203059, + "learning_rate": 3.915171288743882e-08, + "loss": 0.0193, + "sft_loss": 0.07321298122406006, + "step": 1213 + }, + { + "epoch": 2.94, + "grad_norm": 0.9767238025964826, + "kl_div": 0.015007298439741135, + "kl_div_sft": 0.015007298439741135, + "learning_rate": 3.752039151712887e-08, + "loss": -0.1253, + "sft_loss": 0.04647599905729294, + "step": 1214 + }, + { + "epoch": 2.95, + "grad_norm": 1.302545033296922, + "importance_ratio": 1.0234375, + "kl_div": 0.013774197548627853, + "kl_div_pos": 0.02181210182607174, + "kl_div_sft": 0.005736292339861393, + "learning_rate": 3.588907014681892e-08, + "loss": -0.11, + "ppo_loss": -1.0220516920089722, + "sft_loss": 0.03027566708624363, + "step": 1215 + }, + { + "epoch": 2.95, + "grad_norm": 0.6969580733387959, + "importance_ratio": 0.96875, + "kl_div": -0.022010929882526398, + "kl_div_pos": -0.031074170023202896, + "kl_div_sft": -0.01294768787920475, + "learning_rate": 3.425774877650897e-08, + "loss": 0.0919, + "ppo_loss": -0.9694036841392517, + "sft_loss": 0.085136279463768, + "step": 1216 + }, + { + "epoch": 2.95, + "grad_norm": 1.0893446350242888, + "importance_ratio": 0.796875, + "kl_div": -0.22950898110866547, + "kl_div_neg": -0.22950898110866547, + "learning_rate": 3.2626427406199015e-08, + "loss": -0.124, + "ppo_loss": 0.8365697264671326, + "step": 1217 + }, + { + "epoch": 2.95, + "grad_norm": 1.0154642418031865, + "kl_div": 0.017310921102762222, + "kl_div_sft": 0.017310921102762222, + "learning_rate": 3.099510603588907e-08, + "loss": -0.0204, + "sft_loss": 0.05979524925351143, + "step": 1218 + }, + { + "epoch": 2.96, + "grad_norm": 1.3089672865018278, + "importance_ratio": 1.0703125, + "kl_div": 0.023883214220404625, + "kl_div_pos": 0.06660286337137222, + "kl_div_sft": -0.018836434930562973, + "learning_rate": 2.936378466557912e-08, + "loss": 0.0179, + "ppo_loss": -1.0688709020614624, + "sft_loss": 0.06836844235658646, + "step": 1219 + }, + { + "epoch": 2.96, + "grad_norm": 0.7735948498894707, + "importance_ratio": 1.0625, + "kl_div": 0.04178933799266815, + "kl_div_pos": 0.061134010553359985, + "kl_div_sft": 0.02244466543197632, + "learning_rate": 2.773246329526917e-08, + "loss": 0.0265, + "ppo_loss": -1.0630414485931396, + "sft_loss": 0.029229391366243362, + "step": 1220 + }, + { + "epoch": 2.96, + "grad_norm": 0.8287853990592418, + "importance_ratio": 0.78125, + "kl_div": -0.12434131652116776, + "kl_div_neg": -0.24875983595848083, + "kl_div_sft": 7.720127905486152e-05, + "learning_rate": 2.6101141924959216e-08, + "loss": -0.0963, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04770297557115555, + "step": 1221 + }, + { + "epoch": 2.96, + "grad_norm": 0.6735727904476042, + "importance_ratio": 1.109375, + "kl_div": 0.06096731498837471, + "kl_div_pos": 0.105230912566185, + "kl_div_sft": 0.016703717410564423, + "learning_rate": 2.4469820554649266e-08, + "loss": -0.1107, + "ppo_loss": -1.1109670400619507, + "sft_loss": 0.02639000490307808, + "step": 1222 + }, + { + "epoch": 2.96, + "grad_norm": 4.27384140644358, + "importance_ratio": 1.078125, + "kl_div": 0.047324467450380325, + "kl_div_pos": 0.07398180663585663, + "kl_div_sft": 0.02066713012754917, + "learning_rate": 2.2838499184339316e-08, + "loss": -0.1533, + "ppo_loss": -1.0767872333526611, + "sft_loss": 0.015478396788239479, + "step": 1223 + }, + { + "epoch": 2.97, + "grad_norm": 0.7995789597116523, + "importance_ratio": 1.0078125, + "kl_div": 0.01401783712208271, + "kl_div_pos": 0.010602614842355251, + "kl_div_sft": 0.017433058470487595, + "learning_rate": 2.1207177814029363e-08, + "loss": -0.1075, + "ppo_loss": -1.010659098625183, + "sft_loss": 0.024448391050100327, + "step": 1224 + }, + { + "epoch": 2.97, + "grad_norm": 0.6576333678713244, + "kl_div": 0.009036296978592873, + "kl_div_sft": 0.009036296978592873, + "learning_rate": 1.957585644371941e-08, + "loss": -0.0222, + "sft_loss": 0.024351729080080986, + "step": 1225 + }, + { + "epoch": 2.97, + "grad_norm": 1.0430261831864822, + "importance_ratio": 0.6328125, + "kl_div": -0.4564090967178345, + "kl_div_neg": -0.4564090967178345, + "learning_rate": 1.794453507340946e-08, + "loss": -0.0877, + "ppo_loss": 0.800000011920929, + "step": 1226 + }, + { + "epoch": 2.97, + "grad_norm": 1.187129816279101, + "importance_ratio": 0.7734375, + "kl_div": -0.18304017186164856, + "kl_div_neg": -0.25575414299964905, + "kl_div_sft": -0.11032620817422867, + "learning_rate": 1.6313213703099507e-08, + "loss": -0.145, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.17107929289340973, + "step": 1227 + }, + { + "epoch": 2.98, + "grad_norm": 0.6751320868637353, + "importance_ratio": 1.0625, + "kl_div": 0.03507830947637558, + "kl_div_pos": 0.05915962904691696, + "kl_div_sft": 0.0109969861805439, + "learning_rate": 1.468189233278956e-08, + "loss": -0.1551, + "ppo_loss": -1.0609445571899414, + "sft_loss": 0.011611356399953365, + "step": 1228 + }, + { + "epoch": 2.98, + "grad_norm": 0.8545330626384282, + "importance_ratio": 1.0859375, + "kl_div": 0.04665626212954521, + "kl_div_pos": 0.08197905123233795, + "kl_div_sft": 0.011333473958075047, + "learning_rate": 1.3050570962479608e-08, + "loss": -0.301, + "ppo_loss": -1.0854331254959106, + "sft_loss": 0.021651491522789, + "step": 1229 + }, + { + "epoch": 2.98, + "grad_norm": 2.1311360572046136, + "importance_ratio": 0.8125, + "kl_div": -0.25613871216773987, + "kl_div_neg": -0.563224196434021, + "kl_div_pos": 0.05094676464796066, + "learning_rate": 1.1419249592169658e-08, + "loss": -0.1842, + "ppo_loss": -0.12613347172737122, + "step": 1230 + }, + { + "epoch": 2.98, + "grad_norm": 0.6622900252208386, + "importance_ratio": 0.3671875, + "kl_div": -0.49855610728263855, + "kl_div_neg": -1.0007708072662354, + "kl_div_sft": 0.0036585640627890825, + "learning_rate": 9.787928221859705e-09, + "loss": -0.0537, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.06895247101783752, + "step": 1231 + }, + { + "epoch": 2.99, + "grad_norm": 1.181454326981826, + "importance_ratio": 1.109375, + "kl_div": 0.06376396119594574, + "kl_div_pos": 0.10577323287725449, + "kl_div_sft": 0.021754683926701546, + "learning_rate": 8.156606851549754e-09, + "loss": -0.0258, + "ppo_loss": -1.1115697622299194, + "sft_loss": 0.01212720200419426, + "step": 1232 + }, + { + "epoch": 2.99, + "grad_norm": 1.5721435499483776, + "importance_ratio": 0.640625, + "kl_div": -0.4436643123626709, + "kl_div_neg": -0.4436643123626709, + "learning_rate": 6.525285481239804e-09, + "loss": -0.0093, + "ppo_loss": 0.800000011920929, + "step": 1233 + }, + { + "epoch": 2.99, + "grad_norm": 0.7975132947668616, + "importance_ratio": 0.671875, + "kl_div": -0.19387556612491608, + "kl_div_neg": -0.4003700315952301, + "kl_div_sft": 0.012618891894817352, + "learning_rate": 4.8939641109298526e-09, + "loss": -0.0591, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.04209718108177185, + "step": 1234 + }, + { + "epoch": 2.99, + "grad_norm": 0.5783795492283123, + "importance_ratio": 0.74609375, + "kl_div": -0.13893654942512512, + "kl_div_neg": -0.2953203022480011, + "kl_div_sft": 0.01744719222187996, + "learning_rate": 3.262642740619902e-09, + "loss": 0.0643, + "ppo_loss": 0.800000011920929, + "sft_loss": 0.006301300600171089, + "step": 1235 + }, + { + "epoch": 3.0, + "grad_norm": 0.843983582553917, + "importance_ratio": 0.80078125, + "kl_div": -0.10064101964235306, + "kl_div_neg": -0.22288663685321808, + "kl_div_sft": 0.021604593843221664, + "learning_rate": 1.631321370309951e-09, + "loss": -0.1822, + "ppo_loss": 0.8002055287361145, + "sft_loss": 0.057481780648231506, + "step": 1236 + } + ], + "logging_steps": 1.0, + "max_steps": 1236, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100.0, + "total_flos": 120210122113024.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}