diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8049 @@ +{ + "best_metric": 0.32666667327284815, + "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/intern2.5vl-7b-grpo_v2/v8-20250328-093218/checkpoint-2475", + "epoch": 1.0, + "eval_steps": 250, + "global_step": 2475, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 409.79168701171875, + "epoch": 0.00040404040404040404, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 1.6129032258064515e-09, + "loss": 0.0, + "memory(GiB)": 53.97, + "response_clip_ratio": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/MultiModalAccuracyORM": 0.0, + "step": 1, + "train_speed(iter/s)": 0.01394 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.04167318344116, + "epoch": 0.00202020202020202, + "grad_norm": 0.8363032478879351, + "kl": 0.0014553563960362226, + "learning_rate": 8.064516129032257e-09, + "loss": 0.02412133663892746, + "memory(GiB)": 66.4, + "response_clip_ratio": 0.0, + "reward": 0.13541666977107525, + "reward_std": 0.2260890230536461, + "rewards/MultiModalAccuracyORM": 0.13541666977107525, + "step": 5, + "train_speed(iter/s)": 0.028028 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.8333435058594, + "epoch": 0.00404040404040404, + "grad_norm": 0.6139540103661346, + "kl": 0.0016570288338698448, + "learning_rate": 1.6129032258064514e-08, + "loss": -0.01782941520214081, + "memory(GiB)": 66.57, + "response_clip_ratio": 0.0, + "reward": 0.08333333507180214, + "reward_std": 0.20967912971973418, + "rewards/MultiModalAccuracyORM": 0.08333333507180214, + "step": 10, + "train_speed(iter/s)": 0.031112 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.75834503173826, + "epoch": 0.006060606060606061, + "grad_norm": 0.40096093890994094, + "kl": 0.001698582514654845, + "learning_rate": 2.4193548387096773e-08, + "loss": 0.026962581276893615, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.2833333432674408, + "reward_std": 0.3393357157707214, + "rewards/MultiModalAccuracyORM": 0.2833333432674408, + "step": 15, + "train_speed(iter/s)": 0.031299 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.6416748046875, + "epoch": 0.00808080808080808, + "grad_norm": 1.4739542997288975, + "kl": 0.0019028475042432546, + "learning_rate": 3.225806451612903e-08, + "loss": 0.038644880056381226, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.2666666753590107, + "reward_std": 0.2996539086103439, + "rewards/MultiModalAccuracyORM": 0.2666666753590107, + "step": 20, + "train_speed(iter/s)": 0.031377 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.5166717529297, + "epoch": 0.010101010101010102, + "grad_norm": 0.7741728453598145, + "kl": 0.0016279776813462377, + "learning_rate": 4.032258064516129e-08, + "loss": 0.004998515546321869, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.2416666716337204, + "reward_std": 0.3144540905952454, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 25, + "train_speed(iter/s)": 0.030159 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.8333404541016, + "epoch": 0.012121212121212121, + "grad_norm": 0.9701247553439714, + "kl": 0.0015773880179040134, + "learning_rate": 4.8387096774193546e-08, + "loss": -0.0044724434614181515, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.03333333432674408, + "reward": 0.2500000067055225, + "reward_std": 0.35868159830570223, + "rewards/MultiModalAccuracyORM": 0.2500000067055225, + "step": 30, + "train_speed(iter/s)": 0.030283 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.30834197998047, + "epoch": 0.014141414141414142, + "grad_norm": 1.1959341136654718, + "kl": 0.001618355477694422, + "learning_rate": 5.645161290322581e-08, + "loss": 0.023464329540729523, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.03333333432674408, + "reward": 0.20833333805203438, + "reward_std": 0.2963388442993164, + "rewards/MultiModalAccuracyORM": 0.20833333805203438, + "step": 35, + "train_speed(iter/s)": 0.030014 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.2083457946777, + "epoch": 0.01616161616161616, + "grad_norm": 1.410798812581158, + "kl": 0.0019752797903493046, + "learning_rate": 6.451612903225806e-08, + "loss": -0.0007259666919708252, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.19166667088866235, + "reward_std": 0.33526378870010376, + "rewards/MultiModalAccuracyORM": 0.19166667088866235, + "step": 40, + "train_speed(iter/s)": 0.031276 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.8916778564453, + "epoch": 0.01818181818181818, + "grad_norm": 0.7834115303183283, + "kl": 0.0015114007983356714, + "learning_rate": 7.258064516129032e-08, + "loss": 0.02698530852794647, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.13333333879709244, + "reward_std": 0.2323044866323471, + "rewards/MultiModalAccuracyORM": 0.13333333879709244, + "step": 45, + "train_speed(iter/s)": 0.030831 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.0833435058594, + "epoch": 0.020202020202020204, + "grad_norm": 0.5949445172203021, + "kl": 0.0016400692868046463, + "learning_rate": 8.064516129032257e-08, + "loss": 0.00902385413646698, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.1166666679084301, + "reward_std": 0.22297748029232026, + "rewards/MultiModalAccuracyORM": 0.1166666679084301, + "step": 50, + "train_speed(iter/s)": 0.030983 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.9916702270508, + "epoch": 0.022222222222222223, + "grad_norm": 0.8683855913813096, + "kl": 0.001812657283153385, + "learning_rate": 8.870967741935484e-08, + "loss": 0.020981660485267638, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.1916666693985462, + "reward_std": 0.30718872845172884, + "rewards/MultiModalAccuracyORM": 0.1916666693985462, + "step": 55, + "train_speed(iter/s)": 0.031708 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.82500686645508, + "epoch": 0.024242424242424242, + "grad_norm": 0.7530936756741837, + "kl": 0.001994300523074344, + "learning_rate": 9.677419354838709e-08, + "loss": -0.0005262017250061036, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.2583333432674408, + "reward_std": 0.3782250702381134, + "rewards/MultiModalAccuracyORM": 0.2583333432674408, + "step": 60, + "train_speed(iter/s)": 0.03245 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.90834655761716, + "epoch": 0.026262626262626262, + "grad_norm": 1.067432975947567, + "kl": 0.0014999201346654444, + "learning_rate": 1.0483870967741934e-07, + "loss": 0.016191774606704713, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.041666668653488156, + "reward": 0.21666667386889457, + "reward_std": 0.3534030467271805, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 65, + "train_speed(iter/s)": 0.032048 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.74167785644534, + "epoch": 0.028282828282828285, + "grad_norm": 0.6557989849641392, + "kl": 0.0015661009470932185, + "learning_rate": 1.1290322580645162e-07, + "loss": 0.00013453364372253417, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.12500000223517418, + "reward_std": 0.2367905855178833, + "rewards/MultiModalAccuracyORM": 0.12500000223517418, + "step": 70, + "train_speed(iter/s)": 0.032349 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.53333625793456, + "epoch": 0.030303030303030304, + "grad_norm": 1.4990729586396685, + "kl": 0.001931124395923689, + "learning_rate": 1.2096774193548387e-07, + "loss": -0.014107623696327209, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.16666667237877847, + "reward_std": 0.28758862614631653, + "rewards/MultiModalAccuracyORM": 0.16666667237877847, + "step": 75, + "train_speed(iter/s)": 0.033073 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6750129699707, + "epoch": 0.03232323232323232, + "grad_norm": 0.009474604451451558, + "kl": 0.0016073725128080696, + "learning_rate": 1.2903225806451611e-07, + "loss": 0.02208370268344879, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.13333333507180214, + "reward_std": 0.214479061961174, + "rewards/MultiModalAccuracyORM": 0.13333333507180214, + "step": 80, + "train_speed(iter/s)": 0.032802 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.858341217041, + "epoch": 0.03434343434343434, + "grad_norm": 0.8129277425663378, + "kl": 0.0015607591019943356, + "learning_rate": 1.3709677419354838e-07, + "loss": 0.03339255452156067, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.2083333395421505, + "reward_std": 0.27393454909324644, + "rewards/MultiModalAccuracyORM": 0.2083333395421505, + "step": 85, + "train_speed(iter/s)": 0.033288 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.29166717529296, + "epoch": 0.03636363636363636, + "grad_norm": 1.0237992843062258, + "kl": 0.0018175460281781852, + "learning_rate": 1.4516129032258064e-07, + "loss": -0.0047673434019088745, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.19166667237877846, + "reward_std": 0.31222184002399445, + "rewards/MultiModalAccuracyORM": 0.19166667237877846, + "step": 90, + "train_speed(iter/s)": 0.02855 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.2666778564453, + "epoch": 0.03838383838383838, + "grad_norm": 1.0063561945582478, + "kl": 0.0016310916107613593, + "learning_rate": 1.5322580645161288e-07, + "loss": 0.012225335836410523, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.15833333805203437, + "reward_std": 0.31040860116481783, + "rewards/MultiModalAccuracyORM": 0.15833333805203437, + "step": 95, + "train_speed(iter/s)": 0.028934 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.8333480834961, + "epoch": 0.04040404040404041, + "grad_norm": 0.6258205936576121, + "kl": 0.0016133204102516175, + "learning_rate": 1.6129032258064515e-07, + "loss": -0.03874449729919434, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.21666667386889457, + "reward_std": 0.24935851097106934, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 100, + "train_speed(iter/s)": 0.029069 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.62500915527346, + "epoch": 0.04242424242424243, + "grad_norm": 0.7939711873436608, + "kl": 0.0018501532729715108, + "learning_rate": 1.6935483870967741e-07, + "loss": -0.03681076169013977, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.14166667088866233, + "reward_std": 0.30009694397449493, + "rewards/MultiModalAccuracyORM": 0.14166667088866233, + "step": 105, + "train_speed(iter/s)": 0.029156 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.25000762939453, + "epoch": 0.044444444444444446, + "grad_norm": 0.8077469947790702, + "kl": 0.001749929750803858, + "learning_rate": 1.7741935483870968e-07, + "loss": 0.0030364990234375, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.1916666716337204, + "reward_std": 0.36744636595249175, + "rewards/MultiModalAccuracyORM": 0.1916666716337204, + "step": 110, + "train_speed(iter/s)": 0.029419 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.3250114440918, + "epoch": 0.046464646464646465, + "grad_norm": 0.6490437761753438, + "kl": 0.0021277177263982596, + "learning_rate": 1.8548387096774192e-07, + "loss": 0.0014735162258148193, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.20000000298023224, + "reward_std": 0.3026430279016495, + "rewards/MultiModalAccuracyORM": 0.20000000298023224, + "step": 115, + "train_speed(iter/s)": 0.029726 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.07501220703125, + "epoch": 0.048484848484848485, + "grad_norm": 0.7766724930460716, + "kl": 0.0019528187229298055, + "learning_rate": 1.9354838709677418e-07, + "loss": 0.006993652880191803, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.1916666693985462, + "reward_std": 0.3041424334049225, + "rewards/MultiModalAccuracyORM": 0.1916666693985462, + "step": 120, + "train_speed(iter/s)": 0.030043 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.8666763305664, + "epoch": 0.050505050505050504, + "grad_norm": 1.7360957995530275, + "kl": 0.0021437739836983384, + "learning_rate": 2e-07, + "loss": 0.07869662046432495, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.2500000067055225, + "reward_std": 0.2706790864467621, + "rewards/MultiModalAccuracyORM": 0.2500000067055225, + "step": 125, + "train_speed(iter/s)": 0.030399 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.4416778564453, + "epoch": 0.052525252525252523, + "grad_norm": 0.7239088304012476, + "kl": 0.001710877218283713, + "learning_rate": 2e-07, + "loss": 0.009808599948883057, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.2666666768491268, + "reward_std": 0.40890581607818605, + "rewards/MultiModalAccuracyORM": 0.2666666768491268, + "step": 130, + "train_speed(iter/s)": 0.030332 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.34167633056643, + "epoch": 0.05454545454545454, + "grad_norm": 0.8447168227598457, + "kl": 0.002082096762023866, + "learning_rate": 2e-07, + "loss": -0.03216500878334046, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.2083333395421505, + "reward_std": 0.3019101768732071, + "rewards/MultiModalAccuracyORM": 0.2083333395421505, + "step": 135, + "train_speed(iter/s)": 0.030718 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.9166748046875, + "epoch": 0.05656565656565657, + "grad_norm": 1.0324727992148468, + "kl": 0.0018370800535194576, + "learning_rate": 2e-07, + "loss": 0.023554784059524537, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.20833333805203438, + "reward_std": 0.33395901322364807, + "rewards/MultiModalAccuracyORM": 0.20833333805203438, + "step": 140, + "train_speed(iter/s)": 0.030908 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.10834045410155, + "epoch": 0.05858585858585859, + "grad_norm": 1.3871585996389504, + "kl": 0.0021007918752729894, + "learning_rate": 2e-07, + "loss": 0.026919734477996827, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.21666666865348816, + "reward_std": 0.21753989458084105, + "rewards/MultiModalAccuracyORM": 0.21666666865348816, + "step": 145, + "train_speed(iter/s)": 0.031304 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.16667556762695, + "epoch": 0.06060606060606061, + "grad_norm": 0.8132545647619184, + "kl": 0.002322551829274744, + "learning_rate": 2e-07, + "loss": 0.07373623847961426, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.3083333447575569, + "reward_std": 0.3354848504066467, + "rewards/MultiModalAccuracyORM": 0.3083333447575569, + "step": 150, + "train_speed(iter/s)": 0.031477 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.5250114440918, + "epoch": 0.06262626262626263, + "grad_norm": 1.3519945848264368, + "kl": 0.002200189605355263, + "learning_rate": 2e-07, + "loss": 0.04400811195373535, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.13333333656191826, + "reward_std": 0.29784067571163175, + "rewards/MultiModalAccuracyORM": 0.13333333656191826, + "step": 155, + "train_speed(iter/s)": 0.031607 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.05834312438964, + "epoch": 0.06464646464646465, + "grad_norm": 0.6312437669339288, + "kl": 0.0019409565313253552, + "learning_rate": 2e-07, + "loss": -0.00390947014093399, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.09166666865348816, + "reward_std": 0.23854664266109465, + "rewards/MultiModalAccuracyORM": 0.09166666865348816, + "step": 160, + "train_speed(iter/s)": 0.031439 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.1583427429199, + "epoch": 0.06666666666666667, + "grad_norm": 0.9922303390769169, + "kl": 0.0020790058420971035, + "learning_rate": 2e-07, + "loss": 0.02683091163635254, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.11666666939854622, + "reward_std": 0.24010564982891083, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 165, + "train_speed(iter/s)": 0.031658 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.6333381652832, + "epoch": 0.06868686868686869, + "grad_norm": 1.8736558792915283, + "kl": 0.0021017327206209304, + "learning_rate": 2e-07, + "loss": 0.053074592351913454, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.2416666753590107, + "reward_std": 0.31345489621162415, + "rewards/MultiModalAccuracyORM": 0.2416666753590107, + "step": 170, + "train_speed(iter/s)": 0.031906 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.34167938232423, + "epoch": 0.0707070707070707, + "grad_norm": 0.369038153671658, + "kl": 0.0021058263606391846, + "learning_rate": 2e-07, + "loss": -0.015150085091590881, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.19166667014360428, + "reward_std": 0.2506715327501297, + "rewards/MultiModalAccuracyORM": 0.19166667014360428, + "step": 175, + "train_speed(iter/s)": 0.032115 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.1916778564453, + "epoch": 0.07272727272727272, + "grad_norm": 1.9587979389448584, + "kl": 0.002187371510080993, + "learning_rate": 2e-07, + "loss": -0.07978157997131348, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.14166666939854622, + "reward_std": 0.3041424334049225, + "rewards/MultiModalAccuracyORM": 0.14166666939854622, + "step": 180, + "train_speed(iter/s)": 0.032328 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.3166763305664, + "epoch": 0.07474747474747474, + "grad_norm": 1.8461501284258892, + "kl": 0.002373928390443325, + "learning_rate": 2e-07, + "loss": 0.0032023414969444275, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.20833334028720857, + "reward_std": 0.3078981190919876, + "rewards/MultiModalAccuracyORM": 0.20833334028720857, + "step": 185, + "train_speed(iter/s)": 0.032577 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.09167327880857, + "epoch": 0.07676767676767676, + "grad_norm": 1.161955111140227, + "kl": 0.0020143969799391926, + "learning_rate": 2e-07, + "loss": 0.015145952999591827, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.0, + "reward": 0.14166667088866233, + "reward_std": 0.22400068640708923, + "rewards/MultiModalAccuracyORM": 0.14166667088866233, + "step": 190, + "train_speed(iter/s)": 0.032624 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.4333442687988, + "epoch": 0.07878787878787878, + "grad_norm": 0.6821896150480984, + "kl": 0.002065828931517899, + "learning_rate": 2e-07, + "loss": -0.008565062284469604, + "memory(GiB)": 67.01, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.20000000298023224, + "reward_std": 0.21394325494766236, + "rewards/MultiModalAccuracyORM": 0.20000000298023224, + "step": 195, + "train_speed(iter/s)": 0.032656 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.93334503173827, + "epoch": 0.08080808080808081, + "grad_norm": 0.9729957971818891, + "kl": 0.002064543019514531, + "learning_rate": 2e-07, + "loss": 0.05269354581832886, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.02500000074505806, + "reward": 0.21666667684912683, + "reward_std": 0.27379952669143676, + "rewards/MultiModalAccuracyORM": 0.21666667684912683, + "step": 200, + "train_speed(iter/s)": 0.032498 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.07500915527345, + "epoch": 0.08282828282828283, + "grad_norm": 1.771257702172847, + "kl": 0.0021668279776349665, + "learning_rate": 2e-07, + "loss": 0.05926605463027954, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.14166666865348815, + "reward_std": 0.29076993763446807, + "rewards/MultiModalAccuracyORM": 0.14166666865348815, + "step": 205, + "train_speed(iter/s)": 0.032513 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.17501373291014, + "epoch": 0.08484848484848485, + "grad_norm": 1.1847534793771106, + "kl": 0.002231467212550342, + "learning_rate": 2e-07, + "loss": -0.04730735421180725, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000596046447, + "reward_std": 0.30665292739868166, + "rewards/MultiModalAccuracyORM": 0.17500000596046447, + "step": 210, + "train_speed(iter/s)": 0.032602 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.25001068115233, + "epoch": 0.08686868686868687, + "grad_norm": 1.0162794234808679, + "kl": 0.0021812492050230503, + "learning_rate": 2e-07, + "loss": -0.012527593970298767, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.25000000596046446, + "reward_std": 0.2488823115825653, + "rewards/MultiModalAccuracyORM": 0.25000000596046446, + "step": 215, + "train_speed(iter/s)": 0.032672 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.2083404541016, + "epoch": 0.08888888888888889, + "grad_norm": 1.6052431990433658, + "kl": 0.002427070902194828, + "learning_rate": 2e-07, + "loss": 0.04005146026611328, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333410322666, + "reward_std": 0.31740519404411316, + "rewards/MultiModalAccuracyORM": 0.2333333410322666, + "step": 220, + "train_speed(iter/s)": 0.032834 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.5166763305664, + "epoch": 0.09090909090909091, + "grad_norm": 1.2096008908286064, + "kl": 0.002002272638492286, + "learning_rate": 2e-07, + "loss": 0.07909151315689086, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2250000096857548, + "reward_std": 0.3757145762443542, + "rewards/MultiModalAccuracyORM": 0.2250000096857548, + "step": 225, + "train_speed(iter/s)": 0.032966 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.65000610351564, + "epoch": 0.09292929292929293, + "grad_norm": 1.2828051744936808, + "kl": 0.002529059338849038, + "learning_rate": 2e-07, + "loss": 0.03620143532752991, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000447034836, + "reward_std": 0.2792848199605942, + "rewards/MultiModalAccuracyORM": 0.17500000447034836, + "step": 230, + "train_speed(iter/s)": 0.033148 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.3000091552734, + "epoch": 0.09494949494949495, + "grad_norm": 1.085440875767882, + "kl": 0.0019764827913604675, + "learning_rate": 2e-07, + "loss": 0.03204571008682251, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667461395264, + "reward_std": 0.34560188353061677, + "rewards/MultiModalAccuracyORM": 0.21666667461395264, + "step": 235, + "train_speed(iter/s)": 0.033179 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.416674041748, + "epoch": 0.09696969696969697, + "grad_norm": 0.49748232708825735, + "kl": 0.002392634970601648, + "learning_rate": 2e-07, + "loss": -9.850338101387024e-05, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.1666666679084301, + "reward_std": 0.3134308844804764, + "rewards/MultiModalAccuracyORM": 0.1666666679084301, + "step": 240, + "train_speed(iter/s)": 0.033246 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.1333465576172, + "epoch": 0.09898989898989899, + "grad_norm": 1.2151595435030045, + "kl": 0.0020633480802644045, + "learning_rate": 2e-07, + "loss": 0.04106523394584656, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.02500000074505806, + "reward": 0.3750000111758709, + "reward_std": 0.3597048044204712, + "rewards/MultiModalAccuracyORM": 0.3750000111758709, + "step": 245, + "train_speed(iter/s)": 0.033114 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.203412703075323, + "learning_rate": 2e-07, + "loss": -0.009947558492422104, + "memory(GiB)": 67.41, + "step": 250, + "train_speed(iter/s)": 0.033255 + }, + { + "epoch": 0.10101010101010101, + "eval_clip_ratio": 0.0, + "eval_completion_length": 313.73167510986326, + "eval_kl": 0.002364178735297173, + "eval_loss": 0.011137718334794044, + "eval_response_clip_ratio": 0.001666666716337204, + "eval_reward": 0.1716666702926159, + "eval_reward_std": 0.30018057823181155, + "eval_rewards/MultiModalAccuracyORM": 0.1716666702926159, + "eval_runtime": 611.4996, + "eval_samples_per_second": 0.082, + "eval_steps_per_second": 0.008, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.2083404541016, + "epoch": 0.10303030303030303, + "grad_norm": 0.6772835244109896, + "kl": 0.0024091241066344082, + "learning_rate": 2e-07, + "loss": -0.014972110092639924, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.28962627798318863, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 255, + "train_speed(iter/s)": 0.029399 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.6, + "epoch": 0.10505050505050505, + "grad_norm": 1.2025473761483534, + "kl": 0.0028577180579304694, + "learning_rate": 2e-07, + "loss": 0.04020859003067016, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666716337204, + "reward_std": 0.34929499626159666, + "rewards/MultiModalAccuracyORM": 0.3166666716337204, + "step": 260, + "train_speed(iter/s)": 0.029617 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.6, + "epoch": 0.10707070707070707, + "grad_norm": 0.4120260278240223, + "kl": 0.0021677285199984907, + "learning_rate": 2e-07, + "loss": -0.0139850914478302, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000521540643, + "reward_std": 0.30015655159950255, + "rewards/MultiModalAccuracyORM": 0.17500000521540643, + "step": 265, + "train_speed(iter/s)": 0.029704 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.85, + "epoch": 0.10909090909090909, + "grad_norm": 0.8561435874110225, + "kl": 0.0018193130497820675, + "learning_rate": 2e-07, + "loss": 0.08367395997047425, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.07500000223517418, + "reward_std": 0.22218745350837707, + "rewards/MultiModalAccuracyORM": 0.07500000223517418, + "step": 270, + "train_speed(iter/s)": 0.029758 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.2, + "epoch": 0.1111111111111111, + "grad_norm": 0.9014771216720453, + "kl": 0.0027342547429725526, + "learning_rate": 2e-07, + "loss": -0.017020440101623534, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25000000447034837, + "reward_std": 0.30515109598636625, + "rewards/MultiModalAccuracyORM": 0.25000000447034837, + "step": 275, + "train_speed(iter/s)": 0.029979 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.9, + "epoch": 0.11313131313131314, + "grad_norm": 0.8605819537524286, + "kl": 0.002405107906088233, + "learning_rate": 2e-07, + "loss": -0.021587955951690673, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667610406875, + "reward_std": 0.2674381673336029, + "rewards/MultiModalAccuracyORM": 0.24166667610406875, + "step": 280, + "train_speed(iter/s)": 0.030012 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.35, + "epoch": 0.11515151515151516, + "grad_norm": 0.8251215931120387, + "kl": 0.0022269786451943217, + "learning_rate": 2e-07, + "loss": -0.009860058128833771, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333380520344, + "reward_std": 0.24239750802516938, + "rewards/MultiModalAccuracyORM": 0.1833333380520344, + "step": 285, + "train_speed(iter/s)": 0.030091 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7, + "epoch": 0.11717171717171718, + "grad_norm": 1.2527152485469888, + "kl": 0.0021428745938465, + "learning_rate": 2e-07, + "loss": -0.023031486570835112, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.16666667014360428, + "reward_std": 0.32451152205467226, + "rewards/MultiModalAccuracyORM": 0.16666667014360428, + "step": 290, + "train_speed(iter/s)": 0.030252 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.55, + "epoch": 0.1191919191919192, + "grad_norm": 1.6313717819183706, + "kl": 0.0029145212611183524, + "learning_rate": 2e-07, + "loss": 0.010453201830387115, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1916666716337204, + "reward_std": 0.36569273471832275, + "rewards/MultiModalAccuracyORM": 0.1916666716337204, + "step": 295, + "train_speed(iter/s)": 0.030281 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.95, + "epoch": 0.12121212121212122, + "grad_norm": 0.9763637277897765, + "kl": 0.002011374046560377, + "learning_rate": 2e-07, + "loss": -0.004069572687149048, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.1416666716337204, + "reward_std": 0.2574163258075714, + "rewards/MultiModalAccuracyORM": 0.1416666716337204, + "step": 300, + "train_speed(iter/s)": 0.030274 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.9, + "epoch": 0.12323232323232323, + "grad_norm": 0.6068062718184686, + "kl": 0.002677905629388988, + "learning_rate": 2e-07, + "loss": 0.029740142822265624, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.3315081149339676, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 305, + "train_speed(iter/s)": 0.030352 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.2, + "epoch": 0.12525252525252525, + "grad_norm": 1.6191575053592702, + "kl": 0.002568906731903553, + "learning_rate": 2e-07, + "loss": 0.014141106605529785, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.275000012665987, + "reward_std": 0.3184880018234253, + "rewards/MultiModalAccuracyORM": 0.275000012665987, + "step": 310, + "train_speed(iter/s)": 0.030382 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.05, + "epoch": 0.12727272727272726, + "grad_norm": 0.8100491404341938, + "kl": 0.0023127400781959295, + "learning_rate": 2e-07, + "loss": 0.03490907847881317, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.10833333656191826, + "reward_std": 0.20343697369098662, + "rewards/MultiModalAccuracyORM": 0.10833333656191826, + "step": 315, + "train_speed(iter/s)": 0.030396 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.35, + "epoch": 0.1292929292929293, + "grad_norm": 0.013915602281916708, + "kl": 0.00306427797768265, + "learning_rate": 2e-07, + "loss": -0.017041555047035216, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833333507180214, + "reward_std": 0.23609575033187866, + "rewards/MultiModalAccuracyORM": 0.15833333507180214, + "step": 320, + "train_speed(iter/s)": 0.030583 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.05, + "epoch": 0.13131313131313133, + "grad_norm": 1.1033620456072635, + "kl": 0.002842709410469979, + "learning_rate": 2e-07, + "loss": 0.011531709134578705, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2416666716337204, + "reward_std": 0.3182693660259247, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 325, + "train_speed(iter/s)": 0.030665 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.55, + "epoch": 0.13333333333333333, + "grad_norm": 0.9537144351366293, + "kl": 0.002481410140171647, + "learning_rate": 2e-07, + "loss": 0.0630490779876709, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666693985462, + "reward_std": 0.26196202635765076, + "rewards/MultiModalAccuracyORM": 0.1666666693985462, + "step": 330, + "train_speed(iter/s)": 0.030751 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.6, + "epoch": 0.13535353535353536, + "grad_norm": 1.8155099532753467, + "kl": 0.0024575040792115034, + "learning_rate": 2e-07, + "loss": 0.0058914005756378176, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000670552253, + "reward_std": 0.3784552842378616, + "rewards/MultiModalAccuracyORM": 0.22500000670552253, + "step": 335, + "train_speed(iter/s)": 0.030852 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.25, + "epoch": 0.13737373737373737, + "grad_norm": 0.8060655227590112, + "kl": 0.0027357690036296845, + "learning_rate": 2e-07, + "loss": 0.015557366609573364, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2750000089406967, + "reward_std": 0.2752989321947098, + "rewards/MultiModalAccuracyORM": 0.2750000089406967, + "step": 340, + "train_speed(iter/s)": 0.030911 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.25, + "epoch": 0.1393939393939394, + "grad_norm": 0.866681581232229, + "kl": 0.0027076376718468964, + "learning_rate": 2e-07, + "loss": 0.04877374768257141, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000223517418, + "reward_std": 0.2750207006931305, + "rewards/MultiModalAccuracyORM": 0.12500000223517418, + "step": 345, + "train_speed(iter/s)": 0.030975 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.7, + "epoch": 0.1414141414141414, + "grad_norm": 1.2612207291126878, + "kl": 0.0021965037449263036, + "learning_rate": 2e-07, + "loss": 0.01168801486492157, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667237877845, + "reward_std": 0.33686081171035764, + "rewards/MultiModalAccuracyORM": 0.24166667237877845, + "step": 350, + "train_speed(iter/s)": 0.030946 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.15, + "epoch": 0.14343434343434344, + "grad_norm": 1.4058521539207838, + "kl": 0.0031185435480438175, + "learning_rate": 2e-07, + "loss": 0.04595511555671692, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.0916666679084301, + "reward_std": 0.19717080593109132, + "rewards/MultiModalAccuracyORM": 0.0916666679084301, + "step": 355, + "train_speed(iter/s)": 0.031022 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.05, + "epoch": 0.14545454545454545, + "grad_norm": 1.2063444833151329, + "kl": 0.002893015928566456, + "learning_rate": 2e-07, + "loss": 0.05137801170349121, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.1666666716337204, + "reward_std": 0.3794462442398071, + "rewards/MultiModalAccuracyORM": 0.1666666716337204, + "step": 360, + "train_speed(iter/s)": 0.031042 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.3, + "epoch": 0.14747474747474748, + "grad_norm": 0.004092609073173449, + "kl": 0.002910976018756628, + "learning_rate": 2e-07, + "loss": -0.07540136575698853, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333343267441, + "reward_std": 0.23933666944503784, + "rewards/MultiModalAccuracyORM": 0.1833333343267441, + "step": 365, + "train_speed(iter/s)": 0.031092 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.45, + "epoch": 0.1494949494949495, + "grad_norm": 1.9513061817753958, + "kl": 0.002679864503443241, + "learning_rate": 2e-07, + "loss": -0.023768115043640136, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334252238275, + "reward_std": 0.28758862614631653, + "rewards/MultiModalAccuracyORM": 0.23333334252238275, + "step": 370, + "train_speed(iter/s)": 0.031131 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.4, + "epoch": 0.15151515151515152, + "grad_norm": 1.418865700350339, + "kl": 0.002804583264514804, + "learning_rate": 2e-07, + "loss": -0.020401501655578615, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2166666716337204, + "reward_std": 0.3634364664554596, + "rewards/MultiModalAccuracyORM": 0.2166666716337204, + "step": 375, + "train_speed(iter/s)": 0.031222 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.95, + "epoch": 0.15353535353535352, + "grad_norm": 0.5418336734188505, + "kl": 0.002608964138198644, + "learning_rate": 2e-07, + "loss": 4.297494888305664e-05, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000521540642, + "reward_std": 0.2323044866323471, + "rewards/MultiModalAccuracyORM": 0.20000000521540642, + "step": 380, + "train_speed(iter/s)": 0.031331 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.3, + "epoch": 0.15555555555555556, + "grad_norm": 1.2938746411839903, + "kl": 0.003260041878093034, + "learning_rate": 2e-07, + "loss": 0.004776376485824585, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3000000104308128, + "reward_std": 0.42925089299678804, + "rewards/MultiModalAccuracyORM": 0.3000000104308128, + "step": 385, + "train_speed(iter/s)": 0.031397 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.25, + "epoch": 0.15757575757575756, + "grad_norm": 0.5646363772035449, + "kl": 0.003275243751704693, + "learning_rate": 2e-07, + "loss": -0.038579174876213075, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833334028720855, + "reward_std": 0.27523933053016664, + "rewards/MultiModalAccuracyORM": 0.15833334028720855, + "step": 390, + "train_speed(iter/s)": 0.031443 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.15, + "epoch": 0.1595959595959596, + "grad_norm": 1.1555077391898336, + "kl": 0.0027449760818853974, + "learning_rate": 2e-07, + "loss": 0.010327178239822387, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333333805203437, + "reward_std": 0.4030417025089264, + "rewards/MultiModalAccuracyORM": 0.28333333805203437, + "step": 395, + "train_speed(iter/s)": 0.031462 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.0, + "epoch": 0.16161616161616163, + "grad_norm": 1.0521916035915964, + "kl": 0.0038477353053167464, + "learning_rate": 2e-07, + "loss": -0.0054982278496026995, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.08333333432674409, + "reward_std": 0.16830329298973085, + "rewards/MultiModalAccuracyORM": 0.08333333432674409, + "step": 400, + "train_speed(iter/s)": 0.03148 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.8, + "epoch": 0.16363636363636364, + "grad_norm": 1.407684002477615, + "kl": 0.0036190941464155912, + "learning_rate": 2e-07, + "loss": -0.018236428499221802, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.40632360279560087, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 405, + "train_speed(iter/s)": 0.031569 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.35, + "epoch": 0.16565656565656567, + "grad_norm": 0.013270915639793456, + "kl": 0.0037169228307902813, + "learning_rate": 2e-07, + "loss": 0.009006601572036744, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667237877845, + "reward_std": 0.21775853037834167, + "rewards/MultiModalAccuracyORM": 0.26666667237877845, + "step": 410, + "train_speed(iter/s)": 0.03167 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.25, + "epoch": 0.16767676767676767, + "grad_norm": 0.018447250902146762, + "kl": 0.0025411285692825913, + "learning_rate": 2e-07, + "loss": -0.013655924797058105, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833333805203437, + "reward_std": 0.16696292161941528, + "rewards/MultiModalAccuracyORM": 0.15833333805203437, + "step": 415, + "train_speed(iter/s)": 0.031744 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.8, + "epoch": 0.1696969696969697, + "grad_norm": 0.9508473836871834, + "kl": 0.004386395937763155, + "learning_rate": 2e-07, + "loss": 0.01687029004096985, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333380520344, + "reward_std": 0.2925831705331802, + "rewards/MultiModalAccuracyORM": 0.2583333380520344, + "step": 420, + "train_speed(iter/s)": 0.031886 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.25, + "epoch": 0.1717171717171717, + "grad_norm": 0.6441750873812842, + "kl": 0.0029796794056892394, + "learning_rate": 2e-07, + "loss": 0.03889042139053345, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.31666667461395265, + "reward_std": 0.32771685123443606, + "rewards/MultiModalAccuracyORM": 0.31666667461395265, + "step": 425, + "train_speed(iter/s)": 0.03187 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.05, + "epoch": 0.17373737373737375, + "grad_norm": 1.178565169897863, + "kl": 0.004055350879207253, + "learning_rate": 2e-07, + "loss": 0.024070069193840027, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166666865348816, + "reward_std": 0.2719598561525345, + "rewards/MultiModalAccuracyORM": 0.19166666865348816, + "step": 430, + "train_speed(iter/s)": 0.03191 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.65, + "epoch": 0.17575757575757575, + "grad_norm": 1.4718997092641302, + "kl": 0.00319979356136173, + "learning_rate": 2e-07, + "loss": 0.0438249945640564, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2166666716337204, + "reward_std": 0.26502286493778227, + "rewards/MultiModalAccuracyORM": 0.2166666716337204, + "step": 435, + "train_speed(iter/s)": 0.031962 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.25, + "epoch": 0.17777777777777778, + "grad_norm": 1.2499621790542323, + "kl": 0.003690016525797546, + "learning_rate": 2e-07, + "loss": 0.02377350926399231, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333656191826, + "reward_std": 0.3626674860715866, + "rewards/MultiModalAccuracyORM": 0.23333333656191826, + "step": 440, + "train_speed(iter/s)": 0.032048 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.65, + "epoch": 0.1797979797979798, + "grad_norm": 1.9383757905012418, + "kl": 0.0035172241390682758, + "learning_rate": 2e-07, + "loss": -0.000668191909790039, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000111758709, + "reward_std": 0.2651819050312042, + "rewards/MultiModalAccuracyORM": 0.2500000111758709, + "step": 445, + "train_speed(iter/s)": 0.032084 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.05, + "epoch": 0.18181818181818182, + "grad_norm": 1.3408808376367802, + "kl": 0.004776520561426878, + "learning_rate": 2e-07, + "loss": 0.07315102815628052, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2250000074505806, + "reward_std": 0.41088385283946993, + "rewards/MultiModalAccuracyORM": 0.2250000074505806, + "step": 450, + "train_speed(iter/s)": 0.03213 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.25, + "epoch": 0.18383838383838383, + "grad_norm": 0.7424759470710323, + "kl": 0.002804637746885419, + "learning_rate": 2e-07, + "loss": -0.001922774314880371, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.18333333879709243, + "reward_std": 0.32902405858039857, + "rewards/MultiModalAccuracyORM": 0.18333333879709243, + "step": 455, + "train_speed(iter/s)": 0.03208 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.6, + "epoch": 0.18585858585858586, + "grad_norm": 1.8334324625115481, + "kl": 0.004147664201445878, + "learning_rate": 2e-07, + "loss": 0.021799880266189575, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.3004107713699341, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 460, + "train_speed(iter/s)": 0.032139 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.55, + "epoch": 0.18787878787878787, + "grad_norm": 1.041487897521913, + "kl": 0.0034352200804278255, + "learning_rate": 2e-07, + "loss": -0.004481983184814453, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667088866234, + "reward_std": 0.2996539086103439, + "rewards/MultiModalAccuracyORM": 0.21666667088866234, + "step": 465, + "train_speed(iter/s)": 0.032211 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.45, + "epoch": 0.1898989898989899, + "grad_norm": 0.9482855673770706, + "kl": 0.003838365920819342, + "learning_rate": 2e-07, + "loss": -0.013571098446846008, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000223517418, + "reward_std": 0.24640740752220153, + "rewards/MultiModalAccuracyORM": 0.12500000223517418, + "step": 470, + "train_speed(iter/s)": 0.032291 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.4, + "epoch": 0.1919191919191919, + "grad_norm": 1.3313266716504006, + "kl": 0.004062736709602177, + "learning_rate": 2e-07, + "loss": 0.052185094356536864, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.36666667833924294, + "reward_std": 0.350342208147049, + "rewards/MultiModalAccuracyORM": 0.36666667833924294, + "step": 475, + "train_speed(iter/s)": 0.032317 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.75, + "epoch": 0.19393939393939394, + "grad_norm": 0.006691860039934013, + "kl": 0.003972473449539393, + "learning_rate": 2e-07, + "loss": 0.013308031857013703, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833333656191826, + "reward_std": 0.2652770906686783, + "rewards/MultiModalAccuracyORM": 0.25833333656191826, + "step": 480, + "train_speed(iter/s)": 0.032297 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.95, + "epoch": 0.19595959595959597, + "grad_norm": 1.858828735648231, + "kl": 0.0045673437649384144, + "learning_rate": 2e-07, + "loss": -0.08343450427055359, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333334252238274, + "reward_std": 0.30661733746528624, + "rewards/MultiModalAccuracyORM": 0.28333334252238274, + "step": 485, + "train_speed(iter/s)": 0.03234 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.2, + "epoch": 0.19797979797979798, + "grad_norm": 0.9385618148678871, + "kl": 0.0037567693390883504, + "learning_rate": 2e-07, + "loss": -0.016573160886764526, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333730697633, + "reward_std": 0.21394325494766236, + "rewards/MultiModalAccuracyORM": 0.13333333730697633, + "step": 490, + "train_speed(iter/s)": 0.032454 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6, + "epoch": 0.2, + "grad_norm": 0.8533415655351878, + "kl": 0.003521406790241599, + "learning_rate": 2e-07, + "loss": -0.019848501682281493, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333395421505, + "reward_std": 0.3541334718465805, + "rewards/MultiModalAccuracyORM": 0.2583333395421505, + "step": 495, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 10.76251214333233, + "learning_rate": 2e-07, + "loss": 0.010219329595565796, + "memory(GiB)": 67.41, + "step": 500, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.20202020202020202, + "eval_clip_ratio": 0.0, + "eval_completion_length": 343.88834259033206, + "eval_kl": 0.0037221815134398637, + "eval_loss": 0.033297207206487656, + "eval_response_clip_ratio": 0.013333333432674408, + "eval_reward": 0.2283333396911621, + "eval_reward_std": 0.3360080027580261, + "eval_rewards/MultiModalAccuracyORM": 0.2283333396911621, + "eval_runtime": 614.5158, + "eval_samples_per_second": 0.081, + "eval_steps_per_second": 0.008, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.325, + "epoch": 0.20404040404040405, + "grad_norm": 1.0491762446720787, + "kl": 0.0045048539526760575, + "learning_rate": 2e-07, + "loss": -0.03126291036605835, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2125000037252903, + "reward_std": 0.29145624935626985, + "rewards/MultiModalAccuracyORM": 0.2125000037252903, + "step": 505, + "train_speed(iter/s)": 0.030627 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.65, + "epoch": 0.20606060606060606, + "grad_norm": 1.2154663404881314, + "kl": 0.005191830382682383, + "learning_rate": 2e-07, + "loss": 0.009676572680473328, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.1, + "reward": 0.2416666753590107, + "reward_std": 0.26368249356746676, + "rewards/MultiModalAccuracyORM": 0.2416666753590107, + "step": 510, + "train_speed(iter/s)": 0.030645 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.7, + "epoch": 0.2080808080808081, + "grad_norm": 0.5041268253271711, + "kl": 0.003948929556645453, + "learning_rate": 2e-07, + "loss": -0.03559762239456177, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1416666679084301, + "reward_std": 0.2159808874130249, + "rewards/MultiModalAccuracyORM": 0.1416666679084301, + "step": 515, + "train_speed(iter/s)": 0.030611 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.1, + "epoch": 0.2101010101010101, + "grad_norm": 0.29702395537873283, + "kl": 0.00434970180504024, + "learning_rate": 2e-07, + "loss": -0.09598699808120728, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000059604645, + "reward_std": 0.3237069517374039, + "rewards/MultiModalAccuracyORM": 0.2750000059604645, + "step": 520, + "train_speed(iter/s)": 0.030675 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.9, + "epoch": 0.21212121212121213, + "grad_norm": 1.4484696850763847, + "kl": 0.0041591078508645294, + "learning_rate": 2e-07, + "loss": -0.06923834681510925, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333358168602, + "reward_std": 0.3533048897981644, + "rewards/MultiModalAccuracyORM": 0.2583333358168602, + "step": 525, + "train_speed(iter/s)": 0.030707 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.0, + "epoch": 0.21414141414141413, + "grad_norm": 1.2132868650103246, + "kl": 0.0032755408203229306, + "learning_rate": 2e-07, + "loss": -0.012829649448394775, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667237877846, + "reward_std": 0.24261614382267, + "rewards/MultiModalAccuracyORM": 0.21666667237877846, + "step": 530, + "train_speed(iter/s)": 0.030765 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.15, + "epoch": 0.21616161616161617, + "grad_norm": 1.3471895483550291, + "kl": 0.004648885619826615, + "learning_rate": 2e-07, + "loss": -0.015677666664123534, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833334028720856, + "reward_std": 0.3315081149339676, + "rewards/MultiModalAccuracyORM": 0.25833334028720856, + "step": 535, + "train_speed(iter/s)": 0.030826 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.25, + "epoch": 0.21818181818181817, + "grad_norm": 0.6603883596764876, + "kl": 0.003572591207921505, + "learning_rate": 2e-07, + "loss": 0.0794254183769226, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000298023225, + "reward_std": 0.17775078415870665, + "rewards/MultiModalAccuracyORM": 0.27500000298023225, + "step": 540, + "train_speed(iter/s)": 0.030827 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.15, + "epoch": 0.2202020202020202, + "grad_norm": 1.1298566902251597, + "kl": 0.0045941169140860435, + "learning_rate": 2e-07, + "loss": 0.021821698546409606, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3833333432674408, + "reward_std": 0.3860022217035294, + "rewards/MultiModalAccuracyORM": 0.3833333432674408, + "step": 545, + "train_speed(iter/s)": 0.03093 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.6, + "epoch": 0.2222222222222222, + "grad_norm": 1.9661316551950794, + "kl": 0.004029618808999658, + "learning_rate": 2e-07, + "loss": -0.09334349632263184, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333395421505, + "reward_std": 0.30636311769485475, + "rewards/MultiModalAccuracyORM": 0.1583333395421505, + "step": 550, + "train_speed(iter/s)": 0.030967 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.95, + "epoch": 0.22424242424242424, + "grad_norm": 1.551239261425344, + "kl": 0.0035113503108732402, + "learning_rate": 2e-07, + "loss": -0.020345258712768554, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1416666701436043, + "reward_std": 0.30789810717105864, + "rewards/MultiModalAccuracyORM": 0.1416666701436043, + "step": 555, + "train_speed(iter/s)": 0.030967 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.7, + "epoch": 0.22626262626262628, + "grad_norm": 0.6662733706778937, + "kl": 0.003866145922802389, + "learning_rate": 2e-07, + "loss": 0.119044029712677, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333358168602, + "reward_std": 0.24265173375606536, + "rewards/MultiModalAccuracyORM": 0.2083333358168602, + "step": 560, + "train_speed(iter/s)": 0.031003 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.85, + "epoch": 0.22828282828282828, + "grad_norm": 1.107656830931071, + "kl": 0.0037969154422171415, + "learning_rate": 2e-07, + "loss": -0.007524615526199341, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000298023223, + "reward_std": 0.33081327974796293, + "rewards/MultiModalAccuracyORM": 0.22500000298023223, + "step": 565, + "train_speed(iter/s)": 0.03102 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.25, + "epoch": 0.23030303030303031, + "grad_norm": 1.7816660571542655, + "kl": 0.00519141077529639, + "learning_rate": 2e-07, + "loss": 0.04047863185405731, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333373069763, + "reward_std": 0.31963745057582854, + "rewards/MultiModalAccuracyORM": 0.2333333373069763, + "step": 570, + "train_speed(iter/s)": 0.031108 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.75, + "epoch": 0.23232323232323232, + "grad_norm": 1.2214979642804986, + "kl": 0.004031882807612419, + "learning_rate": 2e-07, + "loss": 0.020588791370391844, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1916666716337204, + "reward_std": 0.38450281620025634, + "rewards/MultiModalAccuracyORM": 0.1916666716337204, + "step": 575, + "train_speed(iter/s)": 0.031083 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.75, + "epoch": 0.23434343434343435, + "grad_norm": 1.5078231957808115, + "kl": 0.0030958396266214548, + "learning_rate": 2e-07, + "loss": 0.04003850221633911, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.266666679084301, + "reward_std": 0.3370438635349274, + "rewards/MultiModalAccuracyORM": 0.266666679084301, + "step": 580, + "train_speed(iter/s)": 0.031079 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.9, + "epoch": 0.23636363636363636, + "grad_norm": 1.1609668181534438, + "kl": 0.003774796542711556, + "learning_rate": 2e-07, + "loss": 0.03895624876022339, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1250000014901161, + "reward_std": 0.2518449932336807, + "rewards/MultiModalAccuracyORM": 0.1250000014901161, + "step": 585, + "train_speed(iter/s)": 0.03114 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.75, + "epoch": 0.2383838383838384, + "grad_norm": 1.3664975542154603, + "kl": 0.0038781519746407867, + "learning_rate": 2e-07, + "loss": -0.012830546498298645, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000014901161, + "reward_std": 0.20442162454128265, + "rewards/MultiModalAccuracyORM": 0.2750000014901161, + "step": 590, + "train_speed(iter/s)": 0.031195 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.25, + "epoch": 0.2404040404040404, + "grad_norm": 1.0360518178054594, + "kl": 0.004115447495132684, + "learning_rate": 2e-07, + "loss": 0.042417135834693906, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2333333395421505, + "reward_std": 0.3581433713436127, + "rewards/MultiModalAccuracyORM": 0.2333333395421505, + "step": 595, + "train_speed(iter/s)": 0.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.2, + "epoch": 0.24242424242424243, + "grad_norm": 1.3913775959095787, + "kl": 0.0037250344757921994, + "learning_rate": 2e-07, + "loss": 0.00183790922164917, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1416666701436043, + "reward_std": 0.2775311887264252, + "rewards/MultiModalAccuracyORM": 0.1416666701436043, + "step": 600, + "train_speed(iter/s)": 0.031261 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.55, + "epoch": 0.24444444444444444, + "grad_norm": 0.440329365041974, + "kl": 0.004550268652383238, + "learning_rate": 2e-07, + "loss": 0.005285969376564026, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333387970924, + "reward_std": 0.3297544836997986, + "rewards/MultiModalAccuracyORM": 0.2583333387970924, + "step": 605, + "train_speed(iter/s)": 0.031263 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.9, + "epoch": 0.24646464646464647, + "grad_norm": 1.2624826502631048, + "kl": 0.005133295292034745, + "learning_rate": 2e-07, + "loss": -0.055149185657501223, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833333730697633, + "reward_std": 0.24640740752220153, + "rewards/MultiModalAccuracyORM": 0.25833333730697633, + "step": 610, + "train_speed(iter/s)": 0.031347 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.2, + "epoch": 0.24848484848484848, + "grad_norm": 0.012452326444885307, + "kl": 0.003399366606026888, + "learning_rate": 2e-07, + "loss": 0.029164138436317443, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1416666716337204, + "reward_std": 0.19744904339313507, + "rewards/MultiModalAccuracyORM": 0.1416666716337204, + "step": 615, + "train_speed(iter/s)": 0.031388 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.55, + "epoch": 0.2505050505050505, + "grad_norm": 1.3165129641085085, + "kl": 0.004156474373303354, + "learning_rate": 2e-07, + "loss": 0.0376417338848114, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25000000894069674, + "reward_std": 0.30035116970539094, + "rewards/MultiModalAccuracyORM": 0.25000000894069674, + "step": 620, + "train_speed(iter/s)": 0.031401 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.85, + "epoch": 0.25252525252525254, + "grad_norm": 1.4029841920807011, + "kl": 0.003737919870764017, + "learning_rate": 2e-07, + "loss": 0.006714335083961487, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333358168602, + "reward_std": 0.28227151930332184, + "rewards/MultiModalAccuracyORM": 0.1583333358168602, + "step": 625, + "train_speed(iter/s)": 0.031402 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7, + "epoch": 0.2545454545454545, + "grad_norm": 1.9019291244494156, + "kl": 0.005253740306943655, + "learning_rate": 2e-07, + "loss": 0.013242076337337493, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166667088866235, + "reward_std": 0.286027193069458, + "rewards/MultiModalAccuracyORM": 0.19166667088866235, + "step": 630, + "train_speed(iter/s)": 0.031476 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.75, + "epoch": 0.25656565656565655, + "grad_norm": 1.5813011213273676, + "kl": 0.003963836142793298, + "learning_rate": 2e-07, + "loss": 0.03190605342388153, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833333656191826, + "reward_std": 0.4000905990600586, + "rewards/MultiModalAccuracyORM": 0.25833333656191826, + "step": 635, + "train_speed(iter/s)": 0.031501 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.15, + "epoch": 0.2585858585858586, + "grad_norm": 0.9990236313380018, + "kl": 0.004905425664037466, + "learning_rate": 2e-07, + "loss": 0.01101228892803192, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.16666667088866233, + "reward_std": 0.3167103588581085, + "rewards/MultiModalAccuracyORM": 0.16666667088866233, + "step": 640, + "train_speed(iter/s)": 0.031543 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.3, + "epoch": 0.2606060606060606, + "grad_norm": 0.8236059940973481, + "kl": 0.004417796130292117, + "learning_rate": 2e-07, + "loss": 0.021716611087322236, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.11666666865348815, + "reward_std": 0.2551600575447083, + "rewards/MultiModalAccuracyORM": 0.11666666865348815, + "step": 645, + "train_speed(iter/s)": 0.031613 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.3, + "epoch": 0.26262626262626265, + "grad_norm": 1.208141693935316, + "kl": 0.0037754237418994308, + "learning_rate": 2e-07, + "loss": 0.05538809299468994, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166667088866235, + "reward_std": 0.35232023894786835, + "rewards/MultiModalAccuracyORM": 0.19166667088866235, + "step": 650, + "train_speed(iter/s)": 0.031638 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.6, + "epoch": 0.26464646464646463, + "grad_norm": 0.9531663350118769, + "kl": 0.0037441954482346773, + "learning_rate": 2e-07, + "loss": -0.0026717036962509155, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000223517418, + "reward_std": 0.29383077621459963, + "rewards/MultiModalAccuracyORM": 0.12500000223517418, + "step": 655, + "train_speed(iter/s)": 0.031701 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.25, + "epoch": 0.26666666666666666, + "grad_norm": 0.45156032406611, + "kl": 0.0036298127146437765, + "learning_rate": 2e-07, + "loss": 0.004486371576786041, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666693985462, + "reward_std": 0.26371566355228426, + "rewards/MultiModalAccuracyORM": 0.1666666693985462, + "step": 660, + "train_speed(iter/s)": 0.031747 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.6, + "epoch": 0.2686868686868687, + "grad_norm": 1.2354602142887612, + "kl": 0.005091256252489984, + "learning_rate": 2e-07, + "loss": -0.01994211971759796, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666679084301, + "reward_std": 0.24239750802516938, + "rewards/MultiModalAccuracyORM": 0.1666666679084301, + "step": 665, + "train_speed(iter/s)": 0.031774 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.65, + "epoch": 0.27070707070707073, + "grad_norm": 1.5995488899211916, + "kl": 0.004296229011379183, + "learning_rate": 2e-07, + "loss": -0.02723192870616913, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3583333417773247, + "reward_std": 0.32669364511966703, + "rewards/MultiModalAccuracyORM": 0.3583333417773247, + "step": 670, + "train_speed(iter/s)": 0.031837 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.65, + "epoch": 0.2727272727272727, + "grad_norm": 0.8793351325351834, + "kl": 0.003925298724789172, + "learning_rate": 2e-07, + "loss": 0.01873851418495178, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000819563865, + "reward_std": 0.29159853160381316, + "rewards/MultiModalAccuracyORM": 0.22500000819563865, + "step": 675, + "train_speed(iter/s)": 0.031854 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.9, + "epoch": 0.27474747474747474, + "grad_norm": 1.2196841930405988, + "kl": 0.0043221796862781044, + "learning_rate": 2e-07, + "loss": -0.0018929451704025268, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334177732468, + "reward_std": 0.36864383816719054, + "rewards/MultiModalAccuracyORM": 0.23333334177732468, + "step": 680, + "train_speed(iter/s)": 0.031875 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.4, + "epoch": 0.2767676767676768, + "grad_norm": 0.578110919194848, + "kl": 0.003840261767618358, + "learning_rate": 2e-07, + "loss": 0.001986941695213318, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.14166666865348815, + "reward_std": 0.2719598561525345, + "rewards/MultiModalAccuracyORM": 0.14166666865348815, + "step": 685, + "train_speed(iter/s)": 0.031873 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.5, + "epoch": 0.2787878787878788, + "grad_norm": 1.7045589714757738, + "kl": 0.004626664402894676, + "learning_rate": 2e-07, + "loss": 0.012319982051849365, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1916666679084301, + "reward_std": 0.33984750509262085, + "rewards/MultiModalAccuracyORM": 0.1916666679084301, + "step": 690, + "train_speed(iter/s)": 0.031928 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.9, + "epoch": 0.2808080808080808, + "grad_norm": 1.3591693418225999, + "kl": 0.004481176193803549, + "learning_rate": 2e-07, + "loss": -0.01004476472735405, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.24560284316539766, + "rewards/MultiModalAccuracyORM": 0.1666666716337204, + "step": 695, + "train_speed(iter/s)": 0.031967 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.35, + "epoch": 0.2828282828282828, + "grad_norm": 0.8510617258462263, + "kl": 0.00467616633977741, + "learning_rate": 2e-07, + "loss": 0.029875683784484863, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166667684912684, + "reward_std": 0.3390218883752823, + "rewards/MultiModalAccuracyORM": 0.29166667684912684, + "step": 700, + "train_speed(iter/s)": 0.031964 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.4, + "epoch": 0.28484848484848485, + "grad_norm": 1.4378845483189635, + "kl": 0.005385997367557138, + "learning_rate": 2e-07, + "loss": 0.015697968006134034, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.44166668206453324, + "reward_std": 0.39862974882125857, + "rewards/MultiModalAccuracyORM": 0.44166668206453324, + "step": 705, + "train_speed(iter/s)": 0.032034 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.8, + "epoch": 0.2868686868686869, + "grad_norm": 1.258370491031708, + "kl": 0.0045306324027478695, + "learning_rate": 2e-07, + "loss": -0.013608846068382262, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667610406876, + "reward_std": 0.23224488496780396, + "rewards/MultiModalAccuracyORM": 0.21666667610406876, + "step": 710, + "train_speed(iter/s)": 0.032068 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.55, + "epoch": 0.28888888888888886, + "grad_norm": 2.4400677744024937, + "kl": 0.005214189388789236, + "learning_rate": 2e-07, + "loss": -0.01957079768180847, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.3274982154369354, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 715, + "train_speed(iter/s)": 0.032114 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.45, + "epoch": 0.2909090909090909, + "grad_norm": 1.5673843849964653, + "kl": 0.004570033040363341, + "learning_rate": 2e-07, + "loss": -0.018536585569381713, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.28472240567207335, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 720, + "train_speed(iter/s)": 0.03214 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.6, + "epoch": 0.29292929292929293, + "grad_norm": 0.9935300145018874, + "kl": 0.0055370709858834745, + "learning_rate": 2e-07, + "loss": 0.03206640779972077, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000447034835, + "reward_std": 0.3719944924116135, + "rewards/MultiModalAccuracyORM": 0.20000000447034835, + "step": 725, + "train_speed(iter/s)": 0.032199 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.3, + "epoch": 0.29494949494949496, + "grad_norm": 1.7757797332994796, + "kl": 0.005048377229832113, + "learning_rate": 2e-07, + "loss": -0.023146471381187438, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.11666666939854622, + "reward_std": 0.277725812792778, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 730, + "train_speed(iter/s)": 0.032226 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.3, + "epoch": 0.296969696969697, + "grad_norm": 0.47809660757769357, + "kl": 0.006107103615067899, + "learning_rate": 2e-07, + "loss": -0.03393080234527588, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.1250000037252903, + "reward_std": 0.21374863088130952, + "rewards/MultiModalAccuracyORM": 0.1250000037252903, + "step": 735, + "train_speed(iter/s)": 0.032213 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.9, + "epoch": 0.298989898989899, + "grad_norm": 1.0436844507348098, + "kl": 0.004702468903269618, + "learning_rate": 2e-07, + "loss": -0.021845155954360963, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000670552253, + "reward_std": 0.32425729632377626, + "rewards/MultiModalAccuracyORM": 0.22500000670552253, + "step": 740, + "train_speed(iter/s)": 0.032209 + }, + { + "clip_ratio": 0.0, + "completion_length": 524.05, + "epoch": 0.301010101010101, + "grad_norm": 1.049067442546249, + "kl": 0.0037612170912325383, + "learning_rate": 2e-07, + "loss": 0.021030843257904053, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.1, + "reward": 0.15833333656191825, + "reward_std": 0.3127004593610764, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 745, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.9001583844372758, + "learning_rate": 2e-07, + "loss": -0.02579028606414795, + "memory(GiB)": 67.41, + "step": 750, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.30303030303030304, + "eval_clip_ratio": 0.0, + "eval_completion_length": 340.4366758728027, + "eval_kl": 0.004551883968524635, + "eval_loss": 0.0018110970268025994, + "eval_response_clip_ratio": 0.015000000149011612, + "eval_reward": 0.20166667073965072, + "eval_reward_std": 0.2683356386423111, + "eval_rewards/MultiModalAccuracyORM": 0.20166667073965072, + "eval_runtime": 643.2616, + "eval_samples_per_second": 0.078, + "eval_steps_per_second": 0.008, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.525, + "epoch": 0.30505050505050507, + "grad_norm": 1.176274790320748, + "kl": 0.004291673714760691, + "learning_rate": 2e-07, + "loss": -0.014118121564388275, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1625000022351742, + "reward_std": 0.2608040913939476, + "rewards/MultiModalAccuracyORM": 0.1625000022351742, + "step": 755, + "train_speed(iter/s)": 0.030899 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.5, + "epoch": 0.30707070707070705, + "grad_norm": 1.6795792962236686, + "kl": 0.0048645576927810906, + "learning_rate": 2e-07, + "loss": 0.0353985846042633, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.14166667088866233, + "reward_std": 0.27148365676403047, + "rewards/MultiModalAccuracyORM": 0.14166667088866233, + "step": 760, + "train_speed(iter/s)": 0.030904 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.6, + "epoch": 0.3090909090909091, + "grad_norm": 1.5837851108447996, + "kl": 0.005048908712342382, + "learning_rate": 2e-07, + "loss": 0.0012214839458465575, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.325000012665987, + "reward_std": 0.3860262334346771, + "rewards/MultiModalAccuracyORM": 0.325000012665987, + "step": 765, + "train_speed(iter/s)": 0.030943 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.85, + "epoch": 0.3111111111111111, + "grad_norm": 1.1108515543557964, + "kl": 0.19770307638682424, + "learning_rate": 2e-07, + "loss": 0.033725738525390625, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2250000022351742, + "reward_std": 0.35085399746894835, + "rewards/MultiModalAccuracyORM": 0.2250000022351742, + "step": 770, + "train_speed(iter/s)": 0.030941 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.05, + "epoch": 0.31313131313131315, + "grad_norm": 1.1567261301470195, + "kl": 0.005782892415300012, + "learning_rate": 2e-07, + "loss": 0.011248695850372314, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2166666716337204, + "reward_std": 0.344626384973526, + "rewards/MultiModalAccuracyORM": 0.2166666716337204, + "step": 775, + "train_speed(iter/s)": 0.031007 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.6, + "epoch": 0.3151515151515151, + "grad_norm": 0.8996969813314127, + "kl": 0.004383829329162836, + "learning_rate": 2e-07, + "loss": 0.032080155611038205, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.08333333507180214, + "reward_std": 0.18106584250926971, + "rewards/MultiModalAccuracyORM": 0.08333333507180214, + "step": 780, + "train_speed(iter/s)": 0.03104 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.55, + "epoch": 0.31717171717171716, + "grad_norm": 0.8366413858450646, + "kl": 0.00416933981468901, + "learning_rate": 2e-07, + "loss": 0.01989735960960388, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000894069672, + "reward_std": 0.2323044866323471, + "rewards/MultiModalAccuracyORM": 0.20000000894069672, + "step": 785, + "train_speed(iter/s)": 0.031043 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.75, + "epoch": 0.3191919191919192, + "grad_norm": 1.2338416774729999, + "kl": 0.0057875648839399215, + "learning_rate": 2e-07, + "loss": 0.04425770938396454, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.2900991141796112, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 790, + "train_speed(iter/s)": 0.031075 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.55, + "epoch": 0.3212121212121212, + "grad_norm": 0.6085363702164241, + "kl": 0.005086203385144472, + "learning_rate": 2e-07, + "loss": 0.07262378931045532, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166666939854624, + "reward_std": 0.39939576387405396, + "rewards/MultiModalAccuracyORM": 0.29166666939854624, + "step": 795, + "train_speed(iter/s)": 0.031082 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.05, + "epoch": 0.32323232323232326, + "grad_norm": 1.710902967582431, + "kl": 0.0056509776040911674, + "learning_rate": 2e-07, + "loss": 0.05898982286453247, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666805744171, + "reward_std": 0.40188278555870055, + "rewards/MultiModalAccuracyORM": 0.3666666805744171, + "step": 800, + "train_speed(iter/s)": 0.031108 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.25, + "epoch": 0.32525252525252524, + "grad_norm": 1.4124668864516894, + "kl": 0.004830094543285668, + "learning_rate": 2e-07, + "loss": 0.04988533854484558, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000894069672, + "reward_std": 0.3737125337123871, + "rewards/MultiModalAccuracyORM": 0.22500000894069672, + "step": 805, + "train_speed(iter/s)": 0.031118 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.2, + "epoch": 0.32727272727272727, + "grad_norm": 0.8398251404363962, + "kl": 0.005614466220140457, + "learning_rate": 2e-07, + "loss": -0.02921849489212036, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333358168602, + "reward_std": 0.3322981417179108, + "rewards/MultiModalAccuracyORM": 0.2333333358168602, + "step": 810, + "train_speed(iter/s)": 0.031188 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.85, + "epoch": 0.3292929292929293, + "grad_norm": 1.4062519243001712, + "kl": 0.005611171037890017, + "learning_rate": 2e-07, + "loss": 0.027082645893096925, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667088866234, + "reward_std": 0.37593025267124175, + "rewards/MultiModalAccuracyORM": 0.24166667088866234, + "step": 815, + "train_speed(iter/s)": 0.031214 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.55, + "epoch": 0.33131313131313134, + "grad_norm": 1.535732791217238, + "kl": 0.006293811020441353, + "learning_rate": 2e-07, + "loss": -0.004323112964630127, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166667014360428, + "reward_std": 0.36012140214443206, + "rewards/MultiModalAccuracyORM": 0.19166667014360428, + "step": 820, + "train_speed(iter/s)": 0.031248 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.3, + "epoch": 0.3333333333333333, + "grad_norm": 1.4122524484024275, + "kl": 0.005569443246349693, + "learning_rate": 2e-07, + "loss": 0.03137176036834717, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15000000223517418, + "reward_std": 0.27221408784389495, + "rewards/MultiModalAccuracyORM": 0.15000000223517418, + "step": 825, + "train_speed(iter/s)": 0.031262 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.3, + "epoch": 0.33535353535353535, + "grad_norm": 0.847697597752372, + "kl": 0.006561408983543515, + "learning_rate": 2e-07, + "loss": 0.001510709524154663, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833333805203437, + "reward_std": 0.3202118068933487, + "rewards/MultiModalAccuracyORM": 0.15833333805203437, + "step": 830, + "train_speed(iter/s)": 0.031276 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.1, + "epoch": 0.3373737373737374, + "grad_norm": 1.0077148325129925, + "kl": 0.0051434833323583005, + "learning_rate": 2e-07, + "loss": -0.046033868193626405, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000819563865, + "reward_std": 0.3932794779539108, + "rewards/MultiModalAccuracyORM": 0.22500000819563865, + "step": 835, + "train_speed(iter/s)": 0.031306 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.4, + "epoch": 0.3393939393939394, + "grad_norm": 1.0924856088353982, + "kl": 0.006390028609894216, + "learning_rate": 2e-07, + "loss": 0.017022347450256346, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666775941849, + "reward_std": 0.34735551476478577, + "rewards/MultiModalAccuracyORM": 0.2666666775941849, + "step": 840, + "train_speed(iter/s)": 0.031335 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.1, + "epoch": 0.3414141414141414, + "grad_norm": 1.2514996858658436, + "kl": 0.0053161653922870755, + "learning_rate": 2e-07, + "loss": 0.017437267303466796, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25000000819563867, + "reward_std": 0.2893422573804855, + "rewards/MultiModalAccuracyORM": 0.25000000819563867, + "step": 845, + "train_speed(iter/s)": 0.031345 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.0, + "epoch": 0.3434343434343434, + "grad_norm": 1.0839879351711734, + "kl": 0.005381654878146946, + "learning_rate": 2e-07, + "loss": 0.013951669633388519, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333730697633, + "reward_std": 0.29003951251506804, + "rewards/MultiModalAccuracyORM": 0.13333333730697633, + "step": 850, + "train_speed(iter/s)": 0.031349 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.4, + "epoch": 0.34545454545454546, + "grad_norm": 1.5570693343632969, + "kl": 0.008254527021199465, + "learning_rate": 2e-07, + "loss": -0.0007428258657455444, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166667759418485, + "reward_std": 0.3415919840335846, + "rewards/MultiModalAccuracyORM": 0.29166667759418485, + "step": 855, + "train_speed(iter/s)": 0.031395 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.0, + "epoch": 0.3474747474747475, + "grad_norm": 1.545061824643743, + "kl": 0.006796046695671976, + "learning_rate": 2e-07, + "loss": 0.01094520315527916, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.38901292681694033, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 860, + "train_speed(iter/s)": 0.031457 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.3, + "epoch": 0.34949494949494947, + "grad_norm": 1.868618439314485, + "kl": 0.00691440338268876, + "learning_rate": 2e-07, + "loss": -0.029064083099365236, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.3083333410322666, + "reward_std": 0.4548985332250595, + "rewards/MultiModalAccuracyORM": 0.3083333410322666, + "step": 865, + "train_speed(iter/s)": 0.031484 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.65, + "epoch": 0.3515151515151515, + "grad_norm": 1.3127307464437807, + "kl": 0.005565014760941267, + "learning_rate": 2e-07, + "loss": 0.019167789816856386, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000037252903, + "reward_std": 0.26816859245300295, + "rewards/MultiModalAccuracyORM": 0.1500000037252903, + "step": 870, + "train_speed(iter/s)": 0.031519 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.05, + "epoch": 0.35353535353535354, + "grad_norm": 1.5008568373381221, + "kl": 0.005994554329663515, + "learning_rate": 2e-07, + "loss": 0.023988738656044006, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000059604645, + "reward_std": 0.38748950958251954, + "rewards/MultiModalAccuracyORM": 0.2750000059604645, + "step": 875, + "train_speed(iter/s)": 0.031537 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.35, + "epoch": 0.35555555555555557, + "grad_norm": 0.8858552851817257, + "kl": 0.0050561846233904365, + "learning_rate": 2e-07, + "loss": 0.001196683943271637, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.27500000819563863, + "reward_std": 0.28755303025245665, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 880, + "train_speed(iter/s)": 0.031564 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.9, + "epoch": 0.3575757575757576, + "grad_norm": 0.5620687272407863, + "kl": 0.006028561620041728, + "learning_rate": 2e-07, + "loss": -0.013837304711341859, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000014901161, + "reward_std": 0.25664491951465607, + "rewards/MultiModalAccuracyORM": 0.2750000014901161, + "step": 885, + "train_speed(iter/s)": 0.031616 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.8, + "epoch": 0.3595959595959596, + "grad_norm": 2.985697887769574, + "kl": 0.007074238453060389, + "learning_rate": 2e-07, + "loss": 0.019273641705513, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333879709245, + "reward_std": 0.2681685984134674, + "rewards/MultiModalAccuracyORM": 0.23333333879709245, + "step": 890, + "train_speed(iter/s)": 0.031645 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.55, + "epoch": 0.3616161616161616, + "grad_norm": 1.4558424844518882, + "kl": 0.003922113939188421, + "learning_rate": 2e-07, + "loss": 0.06525606513023377, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2500000074505806, + "reward_std": 0.3988839745521545, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 895, + "train_speed(iter/s)": 0.031624 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.4, + "epoch": 0.36363636363636365, + "grad_norm": 0.656120438814147, + "kl": 0.006284803117159754, + "learning_rate": 2e-07, + "loss": -0.0007577657699584961, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000521540642, + "reward_std": 0.25811116099357606, + "rewards/MultiModalAccuracyORM": 0.22500000521540642, + "step": 900, + "train_speed(iter/s)": 0.031648 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.0, + "epoch": 0.3656565656565657, + "grad_norm": 1.5386640162242566, + "kl": 0.006484637362882495, + "learning_rate": 2e-07, + "loss": 0.03939504027366638, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667312383652, + "reward_std": 0.2964008718729019, + "rewards/MultiModalAccuracyORM": 0.24166667312383652, + "step": 905, + "train_speed(iter/s)": 0.031684 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.45, + "epoch": 0.36767676767676766, + "grad_norm": 1.636331464172546, + "kl": 0.006949460273608566, + "learning_rate": 2e-07, + "loss": -0.07270481586456298, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666738688946, + "reward_std": 0.4011138051748276, + "rewards/MultiModalAccuracyORM": 0.3166666738688946, + "step": 910, + "train_speed(iter/s)": 0.031729 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.65, + "epoch": 0.3696969696969697, + "grad_norm": 0.6925500168851378, + "kl": 0.006146807945333422, + "learning_rate": 2e-07, + "loss": 0.0035164892673492433, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334773778917, + "reward_std": 0.32052563428878783, + "rewards/MultiModalAccuracyORM": 0.33333334773778917, + "step": 915, + "train_speed(iter/s)": 0.03175 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.1, + "epoch": 0.3717171717171717, + "grad_norm": 1.8970854173810114, + "kl": 0.005729123065248132, + "learning_rate": 2e-07, + "loss": 0.05737735033035278, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35000001043081286, + "reward_std": 0.4226803660392761, + "rewards/MultiModalAccuracyORM": 0.35000001043081286, + "step": 920, + "train_speed(iter/s)": 0.031768 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.85, + "epoch": 0.37373737373737376, + "grad_norm": 1.1898661364371217, + "kl": 0.0061120831873267886, + "learning_rate": 2e-07, + "loss": 0.011839108169078827, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333410322666, + "reward_std": 0.3144781023263931, + "rewards/MultiModalAccuracyORM": 0.2333333410322666, + "step": 925, + "train_speed(iter/s)": 0.031823 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.4, + "epoch": 0.37575757575757573, + "grad_norm": 1.3632550964844283, + "kl": 0.006334329699166119, + "learning_rate": 2e-07, + "loss": -0.046709203720092775, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.18333333730697632, + "reward_std": 0.21999078691005708, + "rewards/MultiModalAccuracyORM": 0.18333333730697632, + "step": 930, + "train_speed(iter/s)": 0.031864 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.85, + "epoch": 0.37777777777777777, + "grad_norm": 1.0159613386349218, + "kl": 0.008165232185274363, + "learning_rate": 2e-07, + "loss": 0.03819341957569122, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166666939854624, + "reward_std": 0.27071225047111513, + "rewards/MultiModalAccuracyORM": 0.29166666939854624, + "step": 935, + "train_speed(iter/s)": 0.031907 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.2, + "epoch": 0.3797979797979798, + "grad_norm": 1.7106230008719308, + "kl": 0.006773473136126995, + "learning_rate": 2e-07, + "loss": -0.07135199308395386, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.34166667610406876, + "reward_std": 0.287842845916748, + "rewards/MultiModalAccuracyORM": 0.34166667610406876, + "step": 940, + "train_speed(iter/s)": 0.031944 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.0, + "epoch": 0.38181818181818183, + "grad_norm": 0.9204904271016048, + "kl": 0.00620469048153609, + "learning_rate": 2e-07, + "loss": 0.0036203682422637938, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30000000447034836, + "reward_std": 0.33299540281295775, + "rewards/MultiModalAccuracyORM": 0.30000000447034836, + "step": 945, + "train_speed(iter/s)": 0.031985 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.5, + "epoch": 0.3838383838383838, + "grad_norm": 1.9449992630577924, + "kl": 0.0057474728906527165, + "learning_rate": 2e-07, + "loss": 0.010283425450325012, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3916666693985462, + "reward_std": 0.28456337153911593, + "rewards/MultiModalAccuracyORM": 0.3916666693985462, + "step": 950, + "train_speed(iter/s)": 0.031997 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.65, + "epoch": 0.38585858585858585, + "grad_norm": 1.1872114495400206, + "kl": 0.0066348537104204295, + "learning_rate": 2e-07, + "loss": -0.08897682428359985, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.30833334252238276, + "reward_std": 0.3064227133989334, + "rewards/MultiModalAccuracyORM": 0.30833334252238276, + "step": 955, + "train_speed(iter/s)": 0.031973 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.85, + "epoch": 0.3878787878787879, + "grad_norm": 1.4160066184361069, + "kl": 0.00627009014133364, + "learning_rate": 2e-07, + "loss": 0.024894729256629944, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.33303396999835966, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 960, + "train_speed(iter/s)": 0.031975 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.55, + "epoch": 0.3898989898989899, + "grad_norm": 1.2362614663841194, + "kl": 0.006092234468087554, + "learning_rate": 2e-07, + "loss": 0.0033513441681861877, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.33333334103226664, + "reward_std": 0.3174647957086563, + "rewards/MultiModalAccuracyORM": 0.33333334103226664, + "step": 965, + "train_speed(iter/s)": 0.031996 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.45, + "epoch": 0.39191919191919194, + "grad_norm": 0.6212974432181537, + "kl": 0.005540155991911888, + "learning_rate": 2e-07, + "loss": 0.0035649120807647707, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.18333333879709243, + "reward_std": 0.24615318179130555, + "rewards/MultiModalAccuracyORM": 0.18333333879709243, + "step": 970, + "train_speed(iter/s)": 0.031961 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.1, + "epoch": 0.3939393939393939, + "grad_norm": 0.5947339601867617, + "kl": 0.005397630413062871, + "learning_rate": 2e-07, + "loss": -0.07813270688056946, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000521540643, + "reward_std": 0.30191018283367155, + "rewards/MultiModalAccuracyORM": 0.17500000521540643, + "step": 975, + "train_speed(iter/s)": 0.031954 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.25, + "epoch": 0.39595959595959596, + "grad_norm": 0.5020572049064443, + "kl": 0.005718397395685315, + "learning_rate": 2e-07, + "loss": 0.02026384472846985, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000298023223, + "reward_std": 0.1911232739686966, + "rewards/MultiModalAccuracyORM": 0.12500000298023223, + "step": 980, + "train_speed(iter/s)": 0.031951 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.65, + "epoch": 0.397979797979798, + "grad_norm": 0.9401973082771917, + "kl": 0.006880732695572078, + "learning_rate": 2e-07, + "loss": 0.033180487155914304, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.16666667386889458, + "reward_std": 0.2074468731880188, + "rewards/MultiModalAccuracyORM": 0.16666667386889458, + "step": 985, + "train_speed(iter/s)": 0.031964 + }, + { + "clip_ratio": 0.0, + "completion_length": 494.3, + "epoch": 0.4, + "grad_norm": 83.40224473063842, + "kl": 0.12589137610048057, + "learning_rate": 2e-07, + "loss": 0.013488560914993286, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15000000447034836, + "reward_std": 0.2832947254180908, + "rewards/MultiModalAccuracyORM": 0.15000000447034836, + "step": 990, + "train_speed(iter/s)": 0.031966 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.9, + "epoch": 0.402020202020202, + "grad_norm": 1.4324831057895324, + "kl": 0.0055825527058914306, + "learning_rate": 2e-07, + "loss": -0.03936474025249481, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333395421505, + "reward_std": 0.4111736625432968, + "rewards/MultiModalAccuracyORM": 0.2583333395421505, + "step": 995, + "train_speed(iter/s)": 0.03201 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 1.2517397120109381, + "learning_rate": 2e-07, + "loss": -0.015787112712860107, + "memory(GiB)": 67.41, + "step": 1000, + "train_speed(iter/s)": 0.031987 + }, + { + "epoch": 0.40404040404040403, + "eval_clip_ratio": 0.0, + "eval_completion_length": 325.82667709350585, + "eval_kl": 0.005815695002675056, + "eval_loss": 0.004047422204166651, + "eval_response_clip_ratio": 0.001666666716337204, + "eval_reward": 0.22833334043622017, + "eval_reward_std": 0.31840195894241335, + "eval_rewards/MultiModalAccuracyORM": 0.22833334043622017, + "eval_runtime": 636.093, + "eval_samples_per_second": 0.079, + "eval_steps_per_second": 0.008, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.1, + "epoch": 0.40606060606060607, + "grad_norm": 0.013788807580567233, + "kl": 0.0054129053140059115, + "learning_rate": 2e-07, + "loss": 0.0037709444761276243, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.025, + "reward": 0.1833333358168602, + "reward_std": 0.3083831608295441, + "rewards/MultiModalAccuracyORM": 0.1833333358168602, + "step": 1005, + "train_speed(iter/s)": 0.030995 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.5, + "epoch": 0.4080808080808081, + "grad_norm": 0.7274819878482078, + "kl": 0.00596827978733927, + "learning_rate": 2e-07, + "loss": 0.05422252416610718, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667088866234, + "reward_std": 0.3408707112073898, + "rewards/MultiModalAccuracyORM": 0.21666667088866234, + "step": 1010, + "train_speed(iter/s)": 0.031034 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.4, + "epoch": 0.4101010101010101, + "grad_norm": 0.48477183540520113, + "kl": 0.0054684164701029655, + "learning_rate": 2e-07, + "loss": 0.037825629115104675, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3000000096857548, + "reward_std": 0.2979002833366394, + "rewards/MultiModalAccuracyORM": 0.3000000096857548, + "step": 1015, + "train_speed(iter/s)": 0.031065 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.4, + "epoch": 0.4121212121212121, + "grad_norm": 2.4295423623484362, + "kl": 0.005641359637957066, + "learning_rate": 2e-07, + "loss": -0.046464985609054564, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000037252903, + "reward_std": 0.16470665335655213, + "rewards/MultiModalAccuracyORM": 0.1500000037252903, + "step": 1020, + "train_speed(iter/s)": 0.031083 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.6, + "epoch": 0.41414141414141414, + "grad_norm": 1.2390331767029386, + "kl": 0.005399754224345088, + "learning_rate": 2e-07, + "loss": 0.030136501789093016, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.10833333656191826, + "reward_std": 0.20343697369098662, + "rewards/MultiModalAccuracyORM": 0.10833333656191826, + "step": 1025, + "train_speed(iter/s)": 0.031111 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.25, + "epoch": 0.4161616161616162, + "grad_norm": 0.9468249386621901, + "kl": 0.006285157660022378, + "learning_rate": 2e-07, + "loss": 0.023849096894264222, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333432674408, + "reward_std": 0.2488823115825653, + "rewards/MultiModalAccuracyORM": 0.1833333432674408, + "step": 1030, + "train_speed(iter/s)": 0.031124 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.6, + "epoch": 0.41818181818181815, + "grad_norm": 1.1115466247036063, + "kl": 0.004610971501097083, + "learning_rate": 2e-07, + "loss": 0.0053513914346694945, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667386889457, + "reward_std": 0.23930107951164245, + "rewards/MultiModalAccuracyORM": 0.24166667386889457, + "step": 1035, + "train_speed(iter/s)": 0.031106 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.1, + "epoch": 0.4202020202020202, + "grad_norm": 0.02105150606730856, + "kl": 0.006059326883405447, + "learning_rate": 2e-07, + "loss": 0.003586888313293457, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334550261495, + "reward_std": 0.324789759516716, + "rewards/MultiModalAccuracyORM": 0.33333334550261495, + "step": 1040, + "train_speed(iter/s)": 0.031118 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.0, + "epoch": 0.4222222222222222, + "grad_norm": 1.3465295426814468, + "kl": 0.005017468379810453, + "learning_rate": 2e-07, + "loss": 0.01884058117866516, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.15833333730697632, + "reward_std": 0.33526621460914613, + "rewards/MultiModalAccuracyORM": 0.15833333730697632, + "step": 1045, + "train_speed(iter/s)": 0.031118 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.25, + "epoch": 0.42424242424242425, + "grad_norm": 0.01829834451184037, + "kl": 0.00570887109497562, + "learning_rate": 2e-07, + "loss": -0.04955781400203705, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000149011613, + "reward_std": 0.1808116167783737, + "rewards/MultiModalAccuracyORM": 0.17500000149011613, + "step": 1050, + "train_speed(iter/s)": 0.031106 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.55, + "epoch": 0.4262626262626263, + "grad_norm": 1.1139595382513947, + "kl": 0.0063067243434488775, + "learning_rate": 2e-07, + "loss": 0.037534278631210324, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666666939854623, + "reward_std": 0.2923289448022842, + "rewards/MultiModalAccuracyORM": 0.21666666939854623, + "step": 1055, + "train_speed(iter/s)": 0.031146 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.5, + "epoch": 0.42828282828282827, + "grad_norm": 1.2445691938767505, + "kl": 0.006325511611066759, + "learning_rate": 2e-07, + "loss": -0.044334182143211366, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.3978011727333069, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 1060, + "train_speed(iter/s)": 0.031171 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.7, + "epoch": 0.4303030303030303, + "grad_norm": 0.9566673579166692, + "kl": 0.0070721972035244106, + "learning_rate": 2e-07, + "loss": -0.005390632152557373, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000081956387, + "reward_std": 0.3438218146562576, + "rewards/MultiModalAccuracyORM": 0.3250000081956387, + "step": 1065, + "train_speed(iter/s)": 0.031187 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.65, + "epoch": 0.43232323232323233, + "grad_norm": 0.6080597174926101, + "kl": 0.005873536411672831, + "learning_rate": 2e-07, + "loss": 0.02115156948566437, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666783392429, + "reward_std": 0.2885376811027527, + "rewards/MultiModalAccuracyORM": 0.3416666783392429, + "step": 1070, + "train_speed(iter/s)": 0.03122 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.05, + "epoch": 0.43434343434343436, + "grad_norm": 1.1890985376722285, + "kl": 0.005225225887261331, + "learning_rate": 2e-07, + "loss": 0.033620885014533995, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.2730426698923111, + "rewards/MultiModalAccuracyORM": 0.1666666716337204, + "step": 1075, + "train_speed(iter/s)": 0.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8, + "epoch": 0.43636363636363634, + "grad_norm": 0.9920368386170019, + "kl": 0.007723887427709996, + "learning_rate": 2e-07, + "loss": -0.01428629457950592, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666738688946, + "reward_std": 0.3488905102014542, + "rewards/MultiModalAccuracyORM": 0.3166666738688946, + "step": 1080, + "train_speed(iter/s)": 0.031278 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.95, + "epoch": 0.4383838383838384, + "grad_norm": 0.8633228611517588, + "kl": 0.006215728004463017, + "learning_rate": 2e-07, + "loss": 0.009860965609550475, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.10833333879709243, + "reward_std": 0.20588786602020265, + "rewards/MultiModalAccuracyORM": 0.10833333879709243, + "step": 1085, + "train_speed(iter/s)": 0.031299 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.4, + "epoch": 0.4404040404040404, + "grad_norm": 1.1078043273889853, + "kl": 0.008284115185961127, + "learning_rate": 2e-07, + "loss": -0.027253830432891847, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833334028720857, + "reward_std": 0.31467272639274596, + "rewards/MultiModalAccuracyORM": 0.20833334028720857, + "step": 1090, + "train_speed(iter/s)": 0.031335 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.9, + "epoch": 0.44242424242424244, + "grad_norm": 1.3316514075181503, + "kl": 0.007131563685834408, + "learning_rate": 2e-07, + "loss": 0.01606808602809906, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.39166667461395266, + "reward_std": 0.3697382241487503, + "rewards/MultiModalAccuracyORM": 0.39166667461395266, + "step": 1095, + "train_speed(iter/s)": 0.031356 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.3, + "epoch": 0.4444444444444444, + "grad_norm": 1.5007656151975992, + "kl": 0.005109827104024589, + "learning_rate": 2e-07, + "loss": 0.012760597467422485, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30000000819563866, + "reward_std": 0.4181702554225922, + "rewards/MultiModalAccuracyORM": 0.30000000819563866, + "step": 1100, + "train_speed(iter/s)": 0.031388 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.8, + "epoch": 0.44646464646464645, + "grad_norm": 1.1822162392393358, + "kl": 0.006911608600057661, + "learning_rate": 2e-07, + "loss": -0.004604104161262512, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333380520344, + "reward_std": 0.37523541152477263, + "rewards/MultiModalAccuracyORM": 0.2583333380520344, + "step": 1105, + "train_speed(iter/s)": 0.031425 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.1, + "epoch": 0.4484848484848485, + "grad_norm": 1.0513525935612356, + "kl": 0.007250142516568303, + "learning_rate": 2e-07, + "loss": 0.011294081062078475, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667088866236, + "reward_std": 0.2692273885011673, + "rewards/MultiModalAccuracyORM": 0.26666667088866236, + "step": 1110, + "train_speed(iter/s)": 0.031461 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.4, + "epoch": 0.4505050505050505, + "grad_norm": 0.588254196095547, + "kl": 0.0058827483095228675, + "learning_rate": 2e-07, + "loss": 0.008113735914230346, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1083333358168602, + "reward_std": 0.24885829985141755, + "rewards/MultiModalAccuracyORM": 0.1083333358168602, + "step": 1115, + "train_speed(iter/s)": 0.031497 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.8, + "epoch": 0.45252525252525255, + "grad_norm": 0.9984948999076526, + "kl": 0.007338272430934012, + "learning_rate": 2e-07, + "loss": 0.05758116841316223, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.3659469664096832, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 1120, + "train_speed(iter/s)": 0.031502 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.15, + "epoch": 0.45454545454545453, + "grad_norm": 0.509714512716735, + "kl": 0.0060618318850174545, + "learning_rate": 2e-07, + "loss": -0.0008696913719177246, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.43333335071802137, + "reward_std": 0.4385633558034897, + "rewards/MultiModalAccuracyORM": 0.43333335071802137, + "step": 1125, + "train_speed(iter/s)": 0.0315 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.15, + "epoch": 0.45656565656565656, + "grad_norm": 12.152246394116803, + "kl": 0.010060751531273126, + "learning_rate": 2e-07, + "loss": 0.044137763977050784, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667461395266, + "reward_std": 0.3440760403871536, + "rewards/MultiModalAccuracyORM": 0.26666667461395266, + "step": 1130, + "train_speed(iter/s)": 0.031529 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.0, + "epoch": 0.4585858585858586, + "grad_norm": 0.9173177729995868, + "kl": 0.005417682533152402, + "learning_rate": 2e-07, + "loss": -0.0018961310386657714, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.07500000298023224, + "reward_std": 0.1481528401374817, + "rewards/MultiModalAccuracyORM": 0.07500000298023224, + "step": 1135, + "train_speed(iter/s)": 0.031537 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.7, + "epoch": 0.46060606060606063, + "grad_norm": 1.2629928855399732, + "kl": 0.007898857281543315, + "learning_rate": 2e-07, + "loss": 0.0265865683555603, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333879709245, + "reward_std": 0.28697867393493653, + "rewards/MultiModalAccuracyORM": 0.23333333879709245, + "step": 1140, + "train_speed(iter/s)": 0.031512 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.35, + "epoch": 0.4626262626262626, + "grad_norm": 0.012765946285130914, + "kl": 0.005864207935519517, + "learning_rate": 2e-07, + "loss": 0.005378928780555725, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.10833333507180214, + "reward_std": 0.22629254460334777, + "rewards/MultiModalAccuracyORM": 0.10833333507180214, + "step": 1145, + "train_speed(iter/s)": 0.031533 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.75, + "epoch": 0.46464646464646464, + "grad_norm": 1.2497788736637212, + "kl": 0.00878450043965131, + "learning_rate": 2e-07, + "loss": 0.04154196977615356, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2583333417773247, + "reward_std": 0.33478758931159974, + "rewards/MultiModalAccuracyORM": 0.2583333417773247, + "step": 1150, + "train_speed(iter/s)": 0.031545 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.6, + "epoch": 0.4666666666666667, + "grad_norm": 1.144170406383162, + "kl": 0.00682174377143383, + "learning_rate": 2e-07, + "loss": 0.054303860664367674, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.291666679084301, + "reward_std": 0.31667476892471313, + "rewards/MultiModalAccuracyORM": 0.291666679084301, + "step": 1155, + "train_speed(iter/s)": 0.031578 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.75, + "epoch": 0.4686868686868687, + "grad_norm": 1.706239065161423, + "kl": 0.007611270109191537, + "learning_rate": 2e-07, + "loss": 0.05665465593338013, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666738688946, + "reward_std": 0.3656185895204544, + "rewards/MultiModalAccuracyORM": 0.3416666738688946, + "step": 1160, + "train_speed(iter/s)": 0.031587 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.9, + "epoch": 0.4707070707070707, + "grad_norm": 0.6591730740554306, + "kl": 0.006474771653302014, + "learning_rate": 2e-07, + "loss": 0.0037678249180316927, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333730697633, + "reward_std": 0.24255654215812683, + "rewards/MultiModalAccuracyORM": 0.13333333730697633, + "step": 1165, + "train_speed(iter/s)": 0.031615 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.85, + "epoch": 0.4727272727272727, + "grad_norm": 0.8135071730864046, + "kl": 0.007919127470813692, + "learning_rate": 2e-07, + "loss": 0.042950406670570374, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334177732465, + "reward_std": 0.4546443074941635, + "rewards/MultiModalAccuracyORM": 0.33333334177732465, + "step": 1170, + "train_speed(iter/s)": 0.031644 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.35, + "epoch": 0.47474747474747475, + "grad_norm": 0.735952514646633, + "kl": 0.00741737331263721, + "learning_rate": 2e-07, + "loss": -0.020889997482299805, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.34076098203659055, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 1175, + "train_speed(iter/s)": 0.031661 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.25, + "epoch": 0.4767676767676768, + "grad_norm": 2.0420078916899143, + "kl": 0.007773328572511673, + "learning_rate": 2e-07, + "loss": -0.020136108994483946, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667386889457, + "reward_std": 0.37396675944328306, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 1180, + "train_speed(iter/s)": 0.031682 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.3, + "epoch": 0.47878787878787876, + "grad_norm": 1.50872882008361, + "kl": 0.006335928500629961, + "learning_rate": 2e-07, + "loss": 0.06880509257316589, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500001043081285, + "reward_std": 0.3852572590112686, + "rewards/MultiModalAccuracyORM": 0.27500001043081285, + "step": 1185, + "train_speed(iter/s)": 0.031678 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.45, + "epoch": 0.4808080808080808, + "grad_norm": 0.7395112570215242, + "kl": 0.007393318344838917, + "learning_rate": 2e-07, + "loss": 0.02349342405796051, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666731238365, + "reward_std": 0.2574403375387192, + "rewards/MultiModalAccuracyORM": 0.1666666731238365, + "step": 1190, + "train_speed(iter/s)": 0.031694 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.9, + "epoch": 0.48282828282828283, + "grad_norm": 0.8770660085210343, + "kl": 0.008547824015840888, + "learning_rate": 2e-07, + "loss": 0.00840257853269577, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333380520344, + "reward_std": 0.3003867596387863, + "rewards/MultiModalAccuracyORM": 0.2583333380520344, + "step": 1195, + "train_speed(iter/s)": 0.031694 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.55, + "epoch": 0.48484848484848486, + "grad_norm": 4.837760223432283, + "kl": 0.007676198193803429, + "learning_rate": 2e-07, + "loss": 0.04577964842319489, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000298023225, + "reward_std": 0.41469616293907163, + "rewards/MultiModalAccuracyORM": 0.27500000298023225, + "step": 1200, + "train_speed(iter/s)": 0.031731 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.1, + "epoch": 0.4868686868686869, + "grad_norm": 0.8131117130602656, + "kl": 0.006847620429471135, + "learning_rate": 2e-07, + "loss": 0.038221675157547, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.46666667610406876, + "reward_std": 0.44790194034576414, + "rewards/MultiModalAccuracyORM": 0.46666667610406876, + "step": 1205, + "train_speed(iter/s)": 0.031756 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.2, + "epoch": 0.4888888888888889, + "grad_norm": 1.0083613540747984, + "kl": 0.009360355604439975, + "learning_rate": 2e-07, + "loss": 0.04207033514976501, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.44166667461395265, + "reward_std": 0.30734776258468627, + "rewards/MultiModalAccuracyORM": 0.44166667461395265, + "step": 1210, + "train_speed(iter/s)": 0.031802 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.75, + "epoch": 0.4909090909090909, + "grad_norm": 0.8535636422021001, + "kl": 0.007888032216578722, + "learning_rate": 2e-07, + "loss": 0.02074309587478638, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3750000029802322, + "reward_std": 0.36721318662166597, + "rewards/MultiModalAccuracyORM": 0.3750000029802322, + "step": 1215, + "train_speed(iter/s)": 0.031817 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.45, + "epoch": 0.49292929292929294, + "grad_norm": 1.1747245735311718, + "kl": 0.005809159600175917, + "learning_rate": 2e-07, + "loss": 0.030560284852981567, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.10000000223517418, + "reward_std": 0.24860407412052155, + "rewards/MultiModalAccuracyORM": 0.10000000223517418, + "step": 1220, + "train_speed(iter/s)": 0.031788 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.7, + "epoch": 0.494949494949495, + "grad_norm": 1.3342180404809851, + "kl": 0.007361576543189585, + "learning_rate": 2e-07, + "loss": 0.005729189515113831, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666805744171, + "reward_std": 0.42749726176261904, + "rewards/MultiModalAccuracyORM": 0.3666666805744171, + "step": 1225, + "train_speed(iter/s)": 0.03183 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.4, + "epoch": 0.49696969696969695, + "grad_norm": 1.3354967765672678, + "kl": 0.0077354055363684894, + "learning_rate": 2e-07, + "loss": 0.011936230957508088, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000037252903, + "reward_std": 0.28934225142002107, + "rewards/MultiModalAccuracyORM": 0.2500000037252903, + "step": 1230, + "train_speed(iter/s)": 0.031862 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.85, + "epoch": 0.498989898989899, + "grad_norm": 1.7029900631069643, + "kl": 0.009492517588660121, + "learning_rate": 2e-07, + "loss": 0.026520213484764098, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667088866234, + "reward_std": 0.39930057227611543, + "rewards/MultiModalAccuracyORM": 0.21666667088866234, + "step": 1235, + "train_speed(iter/s)": 0.03189 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.5, + "epoch": 0.501010101010101, + "grad_norm": 0.8836311467778365, + "kl": 0.007044275873340666, + "learning_rate": 2e-07, + "loss": 0.02124674618244171, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334177732468, + "reward_std": 0.26518189907073975, + "rewards/MultiModalAccuracyORM": 0.23333334177732468, + "step": 1240, + "train_speed(iter/s)": 0.031898 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.55, + "epoch": 0.503030303030303, + "grad_norm": 0.713932604855569, + "kl": 0.007297229184769094, + "learning_rate": 2e-07, + "loss": -0.049902528524398804, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.21666667535901069, + "reward_std": 0.3636551022529602, + "rewards/MultiModalAccuracyORM": 0.21666667535901069, + "step": 1245, + "train_speed(iter/s)": 0.031896 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 1.0772405355167842, + "learning_rate": 2e-07, + "loss": 0.023000609874725342, + "memory(GiB)": 67.41, + "step": 1250, + "train_speed(iter/s)": 0.031918 + }, + { + "epoch": 0.5050505050505051, + "eval_clip_ratio": 0.0, + "eval_completion_length": 317.3300076293945, + "eval_kl": 0.008607916957698762, + "eval_loss": 0.04203889146447182, + "eval_response_clip_ratio": 0.005000000149011612, + "eval_reward": 0.26166667401790616, + "eval_reward_std": 0.33101949989795687, + "eval_rewards/MultiModalAccuracyORM": 0.26166667401790616, + "eval_runtime": 649.7206, + "eval_samples_per_second": 0.077, + "eval_steps_per_second": 0.008, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.7, + "epoch": 0.5070707070707071, + "grad_norm": 0.8869623944003157, + "kl": 0.007809204491786658, + "learning_rate": 2e-07, + "loss": 0.08510025143623352, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.3126992493867874, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 1255, + "train_speed(iter/s)": 0.031124 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.6, + "epoch": 0.509090909090909, + "grad_norm": 1.5782361460355463, + "kl": 0.009718046616762876, + "learning_rate": 2e-07, + "loss": 0.06461310386657715, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666783392429, + "reward_std": 0.39375568330287936, + "rewards/MultiModalAccuracyORM": 0.3416666783392429, + "step": 1260, + "train_speed(iter/s)": 0.031165 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.85, + "epoch": 0.5111111111111111, + "grad_norm": 2.199476894866435, + "kl": 0.007806334691122174, + "learning_rate": 2e-07, + "loss": 0.006014569103717804, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000149011612, + "reward_std": 0.24265173375606536, + "rewards/MultiModalAccuracyORM": 0.22500000149011612, + "step": 1265, + "train_speed(iter/s)": 0.031162 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.35, + "epoch": 0.5131313131313131, + "grad_norm": 0.7012273951483338, + "kl": 0.009944566525518894, + "learning_rate": 2e-07, + "loss": 0.08870444297790528, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.39166668206453326, + "reward_std": 0.407406410574913, + "rewards/MultiModalAccuracyORM": 0.39166668206453326, + "step": 1270, + "train_speed(iter/s)": 0.031181 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.9, + "epoch": 0.5151515151515151, + "grad_norm": 0.8594993279946802, + "kl": 0.009327950514853, + "learning_rate": 2e-07, + "loss": -0.0274441659450531, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.31666667610406873, + "reward_std": 0.38369824588298795, + "rewards/MultiModalAccuracyORM": 0.31666667610406873, + "step": 1275, + "train_speed(iter/s)": 0.031192 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5, + "epoch": 0.5171717171717172, + "grad_norm": 1.605471507408993, + "kl": 0.008260847954079508, + "learning_rate": 2e-07, + "loss": 0.01983429193496704, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000037252903, + "reward_std": 0.21149236261844634, + "rewards/MultiModalAccuracyORM": 0.1500000037252903, + "step": 1280, + "train_speed(iter/s)": 0.03122 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.75, + "epoch": 0.5191919191919192, + "grad_norm": 0.7970718049407819, + "kl": 0.009291452821344137, + "learning_rate": 2e-07, + "loss": -0.0878964126110077, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667461395264, + "reward_std": 0.3267081886529922, + "rewards/MultiModalAccuracyORM": 0.24166667461395264, + "step": 1285, + "train_speed(iter/s)": 0.031244 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.7, + "epoch": 0.5212121212121212, + "grad_norm": 0.03636319780256794, + "kl": 0.010186967998743057, + "learning_rate": 2e-07, + "loss": 0.014943599700927734, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.14166666865348815, + "reward_std": 0.2621566504240036, + "rewards/MultiModalAccuracyORM": 0.14166666865348815, + "step": 1290, + "train_speed(iter/s)": 0.031292 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.85, + "epoch": 0.5232323232323233, + "grad_norm": 0.5560985576302439, + "kl": 0.008923888113349676, + "learning_rate": 2e-07, + "loss": 0.01636694073677063, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.3833333387970924, + "reward_std": 0.379781112074852, + "rewards/MultiModalAccuracyORM": 0.3833333387970924, + "step": 1295, + "train_speed(iter/s)": 0.031311 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.65, + "epoch": 0.5252525252525253, + "grad_norm": 1.0969162898318026, + "kl": 0.007647776743397117, + "learning_rate": 2e-07, + "loss": 0.011600933969020844, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2916666679084301, + "reward_std": 0.33984750509262085, + "rewards/MultiModalAccuracyORM": 0.2916666679084301, + "step": 1300, + "train_speed(iter/s)": 0.031311 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.3, + "epoch": 0.5272727272727272, + "grad_norm": 1.0609203809073788, + "kl": 0.009041132358834147, + "learning_rate": 2e-07, + "loss": 0.05796287655830383, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.37500000968575475, + "reward_std": 0.395568910241127, + "rewards/MultiModalAccuracyORM": 0.37500000968575475, + "step": 1305, + "train_speed(iter/s)": 0.031317 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.35, + "epoch": 0.5292929292929293, + "grad_norm": 1.2425047304213874, + "kl": 0.0067569724516943095, + "learning_rate": 2e-07, + "loss": 0.0740867018699646, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30000000447034836, + "reward_std": 0.35792473554611204, + "rewards/MultiModalAccuracyORM": 0.30000000447034836, + "step": 1310, + "train_speed(iter/s)": 0.031328 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.15, + "epoch": 0.5313131313131313, + "grad_norm": 0.9808794955952097, + "kl": 0.010406963923014701, + "learning_rate": 2e-07, + "loss": 0.01893787384033203, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667237877845, + "reward_std": 0.3398119151592255, + "rewards/MultiModalAccuracyORM": 0.26666667237877845, + "step": 1315, + "train_speed(iter/s)": 0.031339 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.9, + "epoch": 0.5333333333333333, + "grad_norm": 1.2456162412938805, + "kl": 0.011189991328865289, + "learning_rate": 2e-07, + "loss": -0.014865723252296448, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166667237877847, + "reward_std": 0.29258317649364474, + "rewards/MultiModalAccuracyORM": 0.29166667237877847, + "step": 1320, + "train_speed(iter/s)": 0.03136 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.65, + "epoch": 0.5353535353535354, + "grad_norm": 2.0692124425676828, + "kl": 0.008971794368699193, + "learning_rate": 2e-07, + "loss": -0.0039320230484008786, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3000000089406967, + "reward_std": 0.350342208147049, + "rewards/MultiModalAccuracyORM": 0.3000000089406967, + "step": 1325, + "train_speed(iter/s)": 0.03138 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.95, + "epoch": 0.5373737373737374, + "grad_norm": 0.019727773431517187, + "kl": 0.008127374900504946, + "learning_rate": 2e-07, + "loss": 0.014944207668304444, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.2855509877204895, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 1330, + "train_speed(iter/s)": 0.031384 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.5, + "epoch": 0.5393939393939394, + "grad_norm": 1.8758979222370529, + "kl": 0.011889316607266665, + "learning_rate": 2e-07, + "loss": 0.016926100850105284, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667610406875, + "reward_std": 0.35766717195510866, + "rewards/MultiModalAccuracyORM": 0.26666667610406875, + "step": 1335, + "train_speed(iter/s)": 0.031406 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.65, + "epoch": 0.5414141414141415, + "grad_norm": 1.7504504529530354, + "kl": 0.009696374088525772, + "learning_rate": 2e-07, + "loss": 0.0762328028678894, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333334177732467, + "reward_std": 0.36349606812000274, + "rewards/MultiModalAccuracyORM": 0.28333334177732467, + "step": 1340, + "train_speed(iter/s)": 0.031433 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.1, + "epoch": 0.5434343434343434, + "grad_norm": 1.0790393664453202, + "kl": 0.01006975807249546, + "learning_rate": 2e-07, + "loss": 0.054473668336868286, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000111758709, + "reward_std": 0.3104086071252823, + "rewards/MultiModalAccuracyORM": 0.2750000111758709, + "step": 1345, + "train_speed(iter/s)": 0.03143 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.4, + "epoch": 0.5454545454545454, + "grad_norm": 0.03144888010820615, + "kl": 0.009962662309408187, + "learning_rate": 2e-07, + "loss": 0.04643962681293488, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1166666716337204, + "reward_std": 0.18083563446998596, + "rewards/MultiModalAccuracyORM": 0.1166666716337204, + "step": 1350, + "train_speed(iter/s)": 0.031451 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.45, + "epoch": 0.5474747474747474, + "grad_norm": 0.8833335261829609, + "kl": 0.012588053662329911, + "learning_rate": 2e-07, + "loss": 0.009816545248031616, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333333805203436, + "reward_std": 0.2777854144573212, + "rewards/MultiModalAccuracyORM": 0.33333333805203436, + "step": 1355, + "train_speed(iter/s)": 0.03148 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.8, + "epoch": 0.5494949494949495, + "grad_norm": 1.2021920029810926, + "kl": 0.010357017442584038, + "learning_rate": 2e-07, + "loss": 0.0067857176065444945, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000819563865, + "reward_std": 0.28529676198959353, + "rewards/MultiModalAccuracyORM": 0.20000000819563865, + "step": 1360, + "train_speed(iter/s)": 0.031501 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.05, + "epoch": 0.5515151515151515, + "grad_norm": 1.0531405923571842, + "kl": 0.009836095664650202, + "learning_rate": 2e-07, + "loss": -0.004297977685928345, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667461395264, + "reward_std": 0.2815410941839218, + "rewards/MultiModalAccuracyORM": 0.21666667461395264, + "step": 1365, + "train_speed(iter/s)": 0.031525 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.5, + "epoch": 0.5535353535353535, + "grad_norm": 0.6717065174636488, + "kl": 0.00997301978059113, + "learning_rate": 2e-07, + "loss": -0.06482144594192504, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2416666753590107, + "reward_std": 0.3402847766876221, + "rewards/MultiModalAccuracyORM": 0.2416666753590107, + "step": 1370, + "train_speed(iter/s)": 0.03155 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.2, + "epoch": 0.5555555555555556, + "grad_norm": 1.3753758040643629, + "kl": 0.012827477231621743, + "learning_rate": 2e-07, + "loss": 0.002781185507774353, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.308333333581686, + "reward_std": 0.17154421210289, + "rewards/MultiModalAccuracyORM": 0.308333333581686, + "step": 1375, + "train_speed(iter/s)": 0.031568 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.3, + "epoch": 0.5575757575757576, + "grad_norm": 1.182226370113768, + "kl": 0.011787687614560127, + "learning_rate": 2e-07, + "loss": 0.047075501084327696, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.28803746998310087, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 1380, + "train_speed(iter/s)": 0.031595 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.9, + "epoch": 0.5595959595959596, + "grad_norm": 0.7310584791826561, + "kl": 0.009143536072224378, + "learning_rate": 2e-07, + "loss": 0.02444952130317688, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333334177732467, + "reward_std": 0.39707074761390687, + "rewards/MultiModalAccuracyORM": 0.28333334177732467, + "step": 1385, + "train_speed(iter/s)": 0.031597 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.25, + "epoch": 0.5616161616161616, + "grad_norm": 0.07084639511270675, + "kl": 0.014022548403590917, + "learning_rate": 2e-07, + "loss": 0.010427016019821166, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667386889457, + "reward_std": 0.3523798406124115, + "rewards/MultiModalAccuracyORM": 0.24166667386889457, + "step": 1390, + "train_speed(iter/s)": 0.031647 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.95, + "epoch": 0.5636363636363636, + "grad_norm": 1.0165143719005905, + "kl": 0.00867614927701652, + "learning_rate": 2e-07, + "loss": 0.013519459962844848, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35833334028720853, + "reward_std": 0.4385393440723419, + "rewards/MultiModalAccuracyORM": 0.35833334028720853, + "step": 1395, + "train_speed(iter/s)": 0.031669 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.65, + "epoch": 0.5656565656565656, + "grad_norm": 1.4260416543225247, + "kl": 0.012046672217547894, + "learning_rate": 2e-07, + "loss": 0.052398312091827395, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.23631438612937927, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 1400, + "train_speed(iter/s)": 0.031689 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.95, + "epoch": 0.5676767676767677, + "grad_norm": 0.9876833902717693, + "kl": 0.011473514698445797, + "learning_rate": 2e-07, + "loss": 0.012492635846138, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000298023223, + "reward_std": 0.2682041823863983, + "rewards/MultiModalAccuracyORM": 0.22500000298023223, + "step": 1405, + "train_speed(iter/s)": 0.031712 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.5, + "epoch": 0.5696969696969697, + "grad_norm": 1.3575244353002172, + "kl": 0.00927637224085629, + "learning_rate": 2e-07, + "loss": -0.0041919216513633725, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.308333345502615, + "reward_std": 0.3556593209505081, + "rewards/MultiModalAccuracyORM": 0.308333345502615, + "step": 1410, + "train_speed(iter/s)": 0.031729 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.25, + "epoch": 0.5717171717171717, + "grad_norm": 0.47740004196130253, + "kl": 0.011411032918840647, + "learning_rate": 2e-07, + "loss": 0.042676869034767154, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2250000074505806, + "reward_std": 0.31645613312721255, + "rewards/MultiModalAccuracyORM": 0.2250000074505806, + "step": 1415, + "train_speed(iter/s)": 0.031733 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.7, + "epoch": 0.5737373737373738, + "grad_norm": 0.8014116239502665, + "kl": 0.008727412531152367, + "learning_rate": 2e-07, + "loss": 0.008877889811992645, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000447034835, + "reward_std": 0.20973873138427734, + "rewards/MultiModalAccuracyORM": 0.20000000447034835, + "step": 1420, + "train_speed(iter/s)": 0.031744 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.8, + "epoch": 0.5757575757575758, + "grad_norm": 0.5960886338338734, + "kl": 0.01004549846984446, + "learning_rate": 2e-07, + "loss": 0.07170453071594238, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666753590107, + "reward_std": 0.3089091956615448, + "rewards/MultiModalAccuracyORM": 0.3666666753590107, + "step": 1425, + "train_speed(iter/s)": 0.031773 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.7, + "epoch": 0.5777777777777777, + "grad_norm": 0.8709949111887233, + "kl": 0.01054220967926085, + "learning_rate": 2e-07, + "loss": 0.0183966726064682, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000298023224, + "reward_std": 0.24255654215812683, + "rewards/MultiModalAccuracyORM": 0.20000000298023224, + "step": 1430, + "train_speed(iter/s)": 0.031768 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.5, + "epoch": 0.5797979797979798, + "grad_norm": 0.4149003361127432, + "kl": 0.00967580354772508, + "learning_rate": 2e-07, + "loss": 0.01183580830693245, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2833333402872086, + "reward_std": 0.3471368789672852, + "rewards/MultiModalAccuracyORM": 0.2833333402872086, + "step": 1435, + "train_speed(iter/s)": 0.031771 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.05, + "epoch": 0.5818181818181818, + "grad_norm": 1.8127646049079418, + "kl": 0.011693871626630426, + "learning_rate": 2e-07, + "loss": -0.017411130666732787, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000074505805, + "reward_std": 0.36495934426784515, + "rewards/MultiModalAccuracyORM": 0.20000000074505805, + "step": 1440, + "train_speed(iter/s)": 0.031798 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.7, + "epoch": 0.5838383838383838, + "grad_norm": 1.0167459169189659, + "kl": 0.011920861806720496, + "learning_rate": 2e-07, + "loss": -0.002490566670894623, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333656191826, + "reward_std": 0.3312538951635361, + "rewards/MultiModalAccuracyORM": 0.23333333656191826, + "step": 1445, + "train_speed(iter/s)": 0.031834 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.8, + "epoch": 0.5858585858585859, + "grad_norm": 0.43569174707905806, + "kl": 0.010208403388969601, + "learning_rate": 2e-07, + "loss": 0.004625104367733002, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.18333334103226662, + "reward_std": 0.31266486942768096, + "rewards/MultiModalAccuracyORM": 0.18333334103226662, + "step": 1450, + "train_speed(iter/s)": 0.031851 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.55, + "epoch": 0.5878787878787879, + "grad_norm": 1.2047141815062223, + "kl": 0.012308929720893503, + "learning_rate": 2e-07, + "loss": 0.013238468766212463, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.05000000149011612, + "reward_std": 0.13558491468429565, + "rewards/MultiModalAccuracyORM": 0.05000000149011612, + "step": 1455, + "train_speed(iter/s)": 0.031866 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.75, + "epoch": 0.5898989898989899, + "grad_norm": 1.3779114361595524, + "kl": 0.009801013302057982, + "learning_rate": 2e-07, + "loss": 0.02388697862625122, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000111758709, + "reward_std": 0.43350920975208285, + "rewards/MultiModalAccuracyORM": 0.3250000111758709, + "step": 1460, + "train_speed(iter/s)": 0.031876 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.7, + "epoch": 0.591919191919192, + "grad_norm": 1.8826359120192269, + "kl": 0.011588224535807967, + "learning_rate": 2e-07, + "loss": 0.0029266417026519776, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667237877845, + "reward_std": 0.3345689594745636, + "rewards/MultiModalAccuracyORM": 0.24166667237877845, + "step": 1465, + "train_speed(iter/s)": 0.031906 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.7, + "epoch": 0.593939393939394, + "grad_norm": 0.9727598907618097, + "kl": 0.012582354433834552, + "learning_rate": 2e-07, + "loss": 0.04956952333450317, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.24166667386889457, + "reward_std": 0.27148365676403047, + "rewards/MultiModalAccuracyORM": 0.24166667386889457, + "step": 1470, + "train_speed(iter/s)": 0.03192 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.0, + "epoch": 0.5959595959595959, + "grad_norm": 0.986971470103617, + "kl": 0.009621695009991526, + "learning_rate": 2e-07, + "loss": 0.02806570827960968, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666666939854623, + "reward_std": 0.28752902448177337, + "rewards/MultiModalAccuracyORM": 0.21666666939854623, + "step": 1475, + "train_speed(iter/s)": 0.031935 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.65, + "epoch": 0.597979797979798, + "grad_norm": 2.281672700874799, + "kl": 0.014227775321342052, + "learning_rate": 2e-07, + "loss": -0.016278558969497682, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3000000067055225, + "reward_std": 0.3494287371635437, + "rewards/MultiModalAccuracyORM": 0.3000000067055225, + "step": 1480, + "train_speed(iter/s)": 0.031979 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.4, + "epoch": 0.6, + "grad_norm": 0.4294218493479977, + "kl": 0.010013082064688206, + "learning_rate": 2e-07, + "loss": 0.020896130800247194, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334550261497, + "reward_std": 0.2818193256855011, + "rewards/MultiModalAccuracyORM": 0.23333334550261497, + "step": 1485, + "train_speed(iter/s)": 0.031988 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.2, + "epoch": 0.602020202020202, + "grad_norm": 1.4950104429019768, + "kl": 0.010283974278718234, + "learning_rate": 2e-07, + "loss": -0.016252765059471132, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1250000037252903, + "reward_std": 0.26292563080787656, + "rewards/MultiModalAccuracyORM": 0.1250000037252903, + "step": 1490, + "train_speed(iter/s)": 0.032019 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.35, + "epoch": 0.604040404040404, + "grad_norm": 1.0324610637121567, + "kl": 0.012730671325698495, + "learning_rate": 2e-07, + "loss": 0.07304045557975769, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000074505806, + "reward_std": 0.42524099349975586, + "rewards/MultiModalAccuracyORM": 0.3250000074505806, + "step": 1495, + "train_speed(iter/s)": 0.032051 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.0986021903349583, + "learning_rate": 2e-07, + "loss": 0.02047921419143677, + "memory(GiB)": 67.41, + "step": 1500, + "train_speed(iter/s)": 0.032077 + }, + { + "epoch": 0.6060606060606061, + "eval_clip_ratio": 0.0, + "eval_completion_length": 311.67167556762695, + "eval_kl": 0.013875643741339445, + "eval_loss": 0.022496523335576057, + "eval_response_clip_ratio": 0.005000000149011612, + "eval_reward": 0.27333333969116214, + "eval_reward_std": 0.3327353143692017, + "eval_rewards/MultiModalAccuracyORM": 0.27333333969116214, + "eval_runtime": 606.8844, + "eval_samples_per_second": 0.082, + "eval_steps_per_second": 0.008, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4, + "epoch": 0.6080808080808081, + "grad_norm": 1.6347943339220123, + "kl": 0.012441246653907001, + "learning_rate": 2e-07, + "loss": -0.0039748698472976685, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25416667349636557, + "reward_std": 0.26936939507722857, + "rewards/MultiModalAccuracyORM": 0.25416667349636557, + "step": 1505, + "train_speed(iter/s)": 0.031443 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.85, + "epoch": 0.6101010101010101, + "grad_norm": 0.7590195039303175, + "kl": 0.012008609343320131, + "learning_rate": 2e-07, + "loss": 0.027225631475448608, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.36666668206453323, + "reward_std": 0.32297652661800386, + "rewards/MultiModalAccuracyORM": 0.36666668206453323, + "step": 1510, + "train_speed(iter/s)": 0.031453 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.05, + "epoch": 0.6121212121212121, + "grad_norm": 1.4989100416458765, + "kl": 0.013079424249008298, + "learning_rate": 2e-07, + "loss": 0.020418940484523772, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30833334028720855, + "reward_std": 0.4385393440723419, + "rewards/MultiModalAccuracyORM": 0.30833334028720855, + "step": 1515, + "train_speed(iter/s)": 0.031473 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.65, + "epoch": 0.6141414141414141, + "grad_norm": 0.5238837736009042, + "kl": 0.010060561215505004, + "learning_rate": 2e-07, + "loss": 0.00045015439391136167, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.10000000298023223, + "reward_std": 0.19337954223155976, + "rewards/MultiModalAccuracyORM": 0.10000000298023223, + "step": 1520, + "train_speed(iter/s)": 0.031458 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.45, + "epoch": 0.6161616161616161, + "grad_norm": 1.393953370005201, + "kl": 0.013004821306094528, + "learning_rate": 2e-07, + "loss": 0.032045644521713254, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333805203438, + "reward_std": 0.33676562607288363, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 1525, + "train_speed(iter/s)": 0.03147 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.35, + "epoch": 0.6181818181818182, + "grad_norm": 0.9011779447383061, + "kl": 0.012558170035481453, + "learning_rate": 2e-07, + "loss": 0.03226361274719238, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3750000104308128, + "reward_std": 0.31765941977500917, + "rewards/MultiModalAccuracyORM": 0.3750000104308128, + "step": 1530, + "train_speed(iter/s)": 0.031486 + }, + { + "clip_ratio": 0.0, + "completion_length": 436.35, + "epoch": 0.6202020202020202, + "grad_norm": 1.1053902425907327, + "kl": 0.011504510790109635, + "learning_rate": 2e-07, + "loss": 0.006062358617782593, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000037252903, + "reward_std": 0.21149236261844634, + "rewards/MultiModalAccuracyORM": 0.1500000037252903, + "step": 1535, + "train_speed(iter/s)": 0.031502 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.95, + "epoch": 0.6222222222222222, + "grad_norm": 1.5149361621486646, + "kl": 0.011535796569660306, + "learning_rate": 2e-07, + "loss": -0.05787181854248047, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666746139526, + "reward_std": 0.3985701471567154, + "rewards/MultiModalAccuracyORM": 0.3416666746139526, + "step": 1540, + "train_speed(iter/s)": 0.031507 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.75, + "epoch": 0.6242424242424243, + "grad_norm": 0.6317974358994652, + "kl": 0.011278041498735547, + "learning_rate": 2e-07, + "loss": -0.008586804568767547, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000223517417, + "reward_std": 0.24640740752220153, + "rewards/MultiModalAccuracyORM": 0.17500000223517417, + "step": 1545, + "train_speed(iter/s)": 0.03152 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.8, + "epoch": 0.6262626262626263, + "grad_norm": 1.1228824311930454, + "kl": 0.012291358271613716, + "learning_rate": 2e-07, + "loss": 0.07681397199630738, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2416666753590107, + "reward_std": 0.41868501007556913, + "rewards/MultiModalAccuracyORM": 0.2416666753590107, + "step": 1550, + "train_speed(iter/s)": 0.031545 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.1, + "epoch": 0.6282828282828283, + "grad_norm": 1.263695035441367, + "kl": 0.016078970720991494, + "learning_rate": 2e-07, + "loss": 0.00948096513748169, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666738688946, + "reward_std": 0.3677601933479309, + "rewards/MultiModalAccuracyORM": 0.3166666738688946, + "step": 1555, + "train_speed(iter/s)": 0.03156 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.05, + "epoch": 0.6303030303030303, + "grad_norm": 1.3808232047869045, + "kl": 0.016249435674399137, + "learning_rate": 2e-07, + "loss": -0.009509658813476563, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333373069763, + "reward_std": 0.31645613312721255, + "rewards/MultiModalAccuracyORM": 0.2083333373069763, + "step": 1560, + "train_speed(iter/s)": 0.031583 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.7, + "epoch": 0.6323232323232323, + "grad_norm": 4.203818809917105, + "kl": 0.015438845753669739, + "learning_rate": 2e-07, + "loss": 0.056962573528289796, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667312383652, + "reward_std": 0.273775514960289, + "rewards/MultiModalAccuracyORM": 0.24166667312383652, + "step": 1565, + "train_speed(iter/s)": 0.031611 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.6, + "epoch": 0.6343434343434343, + "grad_norm": 1.1171108239340608, + "kl": 0.009527648240327835, + "learning_rate": 2e-07, + "loss": 0.043929648399353025, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833334028720855, + "reward_std": 0.2940494179725647, + "rewards/MultiModalAccuracyORM": 0.15833334028720855, + "step": 1570, + "train_speed(iter/s)": 0.03162 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.95, + "epoch": 0.6363636363636364, + "grad_norm": 1.2373925332541635, + "kl": 0.014681565202772617, + "learning_rate": 2e-07, + "loss": 0.0012456446886062623, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333395421505, + "reward_std": 0.27148365676403047, + "rewards/MultiModalAccuracyORM": 0.1583333395421505, + "step": 1575, + "train_speed(iter/s)": 0.031648 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.7, + "epoch": 0.6383838383838384, + "grad_norm": 2.6797021271180217, + "kl": 0.017097664810717107, + "learning_rate": 2e-07, + "loss": -0.02114928364753723, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333805203438, + "reward_std": 0.33300994634628295, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 1580, + "train_speed(iter/s)": 0.031683 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.7, + "epoch": 0.6404040404040404, + "grad_norm": 0.9201592598726605, + "kl": 0.012181163858622312, + "learning_rate": 2e-07, + "loss": 0.04073759019374847, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333358168602, + "reward_std": 0.2822715133428574, + "rewards/MultiModalAccuracyORM": 0.1583333358168602, + "step": 1585, + "train_speed(iter/s)": 0.031693 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.55, + "epoch": 0.6424242424242425, + "grad_norm": 0.8369270384527866, + "kl": 0.012538785161450506, + "learning_rate": 2e-07, + "loss": 0.007032622396945953, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2250000022351742, + "reward_std": 0.20817729830741882, + "rewards/MultiModalAccuracyORM": 0.2250000022351742, + "step": 1590, + "train_speed(iter/s)": 0.03169 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.45, + "epoch": 0.6444444444444445, + "grad_norm": 1.6359484517192477, + "kl": 0.016188242752105, + "learning_rate": 2e-07, + "loss": 0.018440821766853334, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000819563865, + "reward_std": 0.24637181758880616, + "rewards/MultiModalAccuracyORM": 0.20000000819563865, + "step": 1595, + "train_speed(iter/s)": 0.031718 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.65, + "epoch": 0.6464646464646465, + "grad_norm": 0.8089885473247238, + "kl": 0.016708724852651357, + "learning_rate": 2e-07, + "loss": -0.05247594714164734, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2833333402872086, + "reward_std": 0.34691824316978453, + "rewards/MultiModalAccuracyORM": 0.2833333402872086, + "step": 1600, + "train_speed(iter/s)": 0.03174 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.2, + "epoch": 0.6484848484848484, + "grad_norm": 0.9909588366261658, + "kl": 0.01558589404448867, + "learning_rate": 2e-07, + "loss": 0.019981113076210023, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000447034835, + "reward_std": 0.2862814128398895, + "rewards/MultiModalAccuracyORM": 0.20000000447034835, + "step": 1605, + "train_speed(iter/s)": 0.031763 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.55, + "epoch": 0.6505050505050505, + "grad_norm": 0.6306196270122503, + "kl": 0.012664367025718094, + "learning_rate": 2e-07, + "loss": -0.022629472613334655, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.08333333507180214, + "reward_std": 0.20967912971973418, + "rewards/MultiModalAccuracyORM": 0.08333333507180214, + "step": 1610, + "train_speed(iter/s)": 0.031768 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.95, + "epoch": 0.6525252525252525, + "grad_norm": 0.7277318686643045, + "kl": 0.012076504435390234, + "learning_rate": 2e-07, + "loss": 0.010481297969818115, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35000000670552256, + "reward_std": 0.29784067571163175, + "rewards/MultiModalAccuracyORM": 0.35000000670552256, + "step": 1615, + "train_speed(iter/s)": 0.031784 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.85, + "epoch": 0.6545454545454545, + "grad_norm": 0.02891952012425108, + "kl": 0.014772931393235923, + "learning_rate": 2e-07, + "loss": 0.04711937606334686, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3083333447575569, + "reward_std": 0.3480859398841858, + "rewards/MultiModalAccuracyORM": 0.3083333447575569, + "step": 1620, + "train_speed(iter/s)": 0.0318 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.95, + "epoch": 0.6565656565656566, + "grad_norm": 0.8446774152498904, + "kl": 0.029250213177874684, + "learning_rate": 2e-07, + "loss": -0.022804903984069824, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666738688946, + "reward_std": 0.29079394936561587, + "rewards/MultiModalAccuracyORM": 0.2666666738688946, + "step": 1625, + "train_speed(iter/s)": 0.031824 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.8, + "epoch": 0.6585858585858586, + "grad_norm": 1.2388651468227543, + "kl": 0.012895361986011267, + "learning_rate": 2e-07, + "loss": 0.013838109374046326, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500000521540642, + "reward_std": 0.32376655340194704, + "rewards/MultiModalAccuracyORM": 0.22500000521540642, + "step": 1630, + "train_speed(iter/s)": 0.031847 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.05, + "epoch": 0.6606060606060606, + "grad_norm": 1.6308326246006755, + "kl": 0.01930234730243683, + "learning_rate": 2e-07, + "loss": 0.030217719078063966, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333334252238274, + "reward_std": 0.2985951125621796, + "rewards/MultiModalAccuracyORM": 0.28333334252238274, + "step": 1635, + "train_speed(iter/s)": 0.031884 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.65, + "epoch": 0.6626262626262627, + "grad_norm": 0.6347971168876615, + "kl": 0.013101364299654961, + "learning_rate": 2e-07, + "loss": 0.04347882270812988, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.43333334252238276, + "reward_std": 0.36113006472587583, + "rewards/MultiModalAccuracyORM": 0.43333334252238276, + "step": 1640, + "train_speed(iter/s)": 0.031887 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.25, + "epoch": 0.6646464646464646, + "grad_norm": 2.8764533574893005, + "kl": 0.022802903782576323, + "learning_rate": 2e-07, + "loss": 0.008072008192539216, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334401249887, + "reward_std": 0.38835368156433103, + "rewards/MultiModalAccuracyORM": 0.33333334401249887, + "step": 1645, + "train_speed(iter/s)": 0.031918 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.45, + "epoch": 0.6666666666666666, + "grad_norm": 0.027226074705726515, + "kl": 0.012724117608740926, + "learning_rate": 2e-07, + "loss": 0.008971738815307616, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833333730697633, + "reward_std": 0.2652174890041351, + "rewards/MultiModalAccuracyORM": 0.25833333730697633, + "step": 1650, + "train_speed(iter/s)": 0.031934 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.9, + "epoch": 0.6686868686868687, + "grad_norm": 1.0372426479480876, + "kl": 0.011012718360871077, + "learning_rate": 2e-07, + "loss": 0.046323955059051514, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333395421505, + "reward_std": 0.3229169249534607, + "rewards/MultiModalAccuracyORM": 0.1833333395421505, + "step": 1655, + "train_speed(iter/s)": 0.03195 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.55, + "epoch": 0.6707070707070707, + "grad_norm": 0.7414666681358184, + "kl": 0.01629993673413992, + "learning_rate": 2e-07, + "loss": 0.037484565377235414, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333507180215, + "reward_std": 0.2813224524259567, + "rewards/MultiModalAccuracyORM": 0.23333333507180215, + "step": 1660, + "train_speed(iter/s)": 0.031968 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.0, + "epoch": 0.6727272727272727, + "grad_norm": 0.7494737035564301, + "kl": 0.014933030121028423, + "learning_rate": 2e-07, + "loss": -0.010917484760284424, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833333656191827, + "reward_std": 0.2526139736175537, + "rewards/MultiModalAccuracyORM": 0.20833333656191827, + "step": 1665, + "train_speed(iter/s)": 0.03199 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.15, + "epoch": 0.6747474747474748, + "grad_norm": 0.8825460973521991, + "kl": 0.020983812306076287, + "learning_rate": 2e-07, + "loss": 0.029893827438354493, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833334028720856, + "reward_std": 0.36012140214443206, + "rewards/MultiModalAccuracyORM": 0.25833334028720856, + "step": 1670, + "train_speed(iter/s)": 0.032 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.4, + "epoch": 0.6767676767676768, + "grad_norm": 0.850414025178336, + "kl": 0.013601220259442926, + "learning_rate": 2e-07, + "loss": 0.06674546003341675, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333410322666, + "reward_std": 0.2629852324724197, + "rewards/MultiModalAccuracyORM": 0.2083333410322666, + "step": 1675, + "train_speed(iter/s)": 0.032024 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.45, + "epoch": 0.6787878787878788, + "grad_norm": 1.1643033783932533, + "kl": 0.01739194723777473, + "learning_rate": 2e-07, + "loss": -0.02118738889694214, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2250000037252903, + "reward_std": 0.35642533004283905, + "rewards/MultiModalAccuracyORM": 0.2250000037252903, + "step": 1680, + "train_speed(iter/s)": 0.032041 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.45, + "epoch": 0.6808080808080809, + "grad_norm": 1.1839236469929488, + "kl": 0.015266428608447314, + "learning_rate": 2e-07, + "loss": 0.0050781965255737305, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.350000012665987, + "reward_std": 0.28154108822345736, + "rewards/MultiModalAccuracyORM": 0.350000012665987, + "step": 1685, + "train_speed(iter/s)": 0.032082 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.2, + "epoch": 0.6828282828282828, + "grad_norm": 0.02924478206081596, + "kl": 0.026798779796808957, + "learning_rate": 2e-07, + "loss": 0.01288943737745285, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833334177732468, + "reward_std": 0.2581707626581192, + "rewards/MultiModalAccuracyORM": 0.20833334177732468, + "step": 1690, + "train_speed(iter/s)": 0.032119 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.9, + "epoch": 0.6848484848484848, + "grad_norm": 0.4842656141561725, + "kl": 0.07169700982049107, + "learning_rate": 2e-07, + "loss": 0.0162178635597229, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666798293591, + "reward_std": 0.3265135705471039, + "rewards/MultiModalAccuracyORM": 0.3666666798293591, + "step": 1695, + "train_speed(iter/s)": 0.032135 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.1, + "epoch": 0.6868686868686869, + "grad_norm": 1.6517445880602681, + "kl": 0.014678607648238539, + "learning_rate": 2e-07, + "loss": -0.0174052894115448, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833333656191827, + "reward_std": 0.33326417207717896, + "rewards/MultiModalAccuracyORM": 0.20833333656191827, + "step": 1700, + "train_speed(iter/s)": 0.032154 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.55, + "epoch": 0.6888888888888889, + "grad_norm": 1.3895903493056987, + "kl": 0.023576964903622866, + "learning_rate": 2e-07, + "loss": -0.044988250732421874, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333333879709244, + "reward_std": 0.28252573907375333, + "rewards/MultiModalAccuracyORM": 0.28333333879709244, + "step": 1705, + "train_speed(iter/s)": 0.032181 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.2, + "epoch": 0.6909090909090909, + "grad_norm": 1.0976488092373375, + "kl": 0.013535353261977435, + "learning_rate": 2e-07, + "loss": 0.02341702878475189, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.32902405858039857, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 1710, + "train_speed(iter/s)": 0.032206 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.5, + "epoch": 0.692929292929293, + "grad_norm": 0.8889266103149475, + "kl": 0.019961224216967822, + "learning_rate": 2e-07, + "loss": 0.019682276248931884, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35833333879709245, + "reward_std": 0.27934442162513734, + "rewards/MultiModalAccuracyORM": 0.35833333879709245, + "step": 1715, + "train_speed(iter/s)": 0.032218 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.75, + "epoch": 0.694949494949495, + "grad_norm": 0.8073289767388547, + "kl": 0.015237010596320034, + "learning_rate": 2e-07, + "loss": 0.017612373828887938, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2416666716337204, + "reward_std": 0.38450281620025634, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 1720, + "train_speed(iter/s)": 0.032235 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.85, + "epoch": 0.696969696969697, + "grad_norm": 1.204716243957595, + "kl": 0.015058515965938568, + "learning_rate": 2e-07, + "loss": 0.02674557566642761, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000089406967, + "reward_std": 0.3480859398841858, + "rewards/MultiModalAccuracyORM": 0.2750000089406967, + "step": 1725, + "train_speed(iter/s)": 0.032253 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.3, + "epoch": 0.6989898989898989, + "grad_norm": 0.8965654466446099, + "kl": 0.011863613128662109, + "learning_rate": 2e-07, + "loss": 0.055030471086502074, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833333805203438, + "reward_std": 0.35343863666057584, + "rewards/MultiModalAccuracyORM": 0.20833333805203438, + "step": 1730, + "train_speed(iter/s)": 0.032269 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.8, + "epoch": 0.701010101010101, + "grad_norm": 0.5431864929728417, + "kl": 0.015647308621555566, + "learning_rate": 2e-07, + "loss": -0.039025521278381346, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333358168602, + "reward_std": 0.23004821836948394, + "rewards/MultiModalAccuracyORM": 0.1583333358168602, + "step": 1735, + "train_speed(iter/s)": 0.032274 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.75, + "epoch": 0.703030303030303, + "grad_norm": 1.7430511187722826, + "kl": 0.019717163406312466, + "learning_rate": 2e-07, + "loss": -0.02261778712272644, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333395421505, + "reward_std": 0.3041424334049225, + "rewards/MultiModalAccuracyORM": 0.2083333395421505, + "step": 1740, + "train_speed(iter/s)": 0.032306 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.05, + "epoch": 0.705050505050505, + "grad_norm": 0.036996154308780546, + "kl": 0.02013384862802923, + "learning_rate": 2e-07, + "loss": 0.032172924280166625, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30000000074505806, + "reward_std": 0.2362758308649063, + "rewards/MultiModalAccuracyORM": 0.30000000074505806, + "step": 1745, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 1.092887250386922, + "learning_rate": 2e-07, + "loss": -0.023990578949451447, + "memory(GiB)": 67.41, + "step": 1750, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.7070707070707071, + "eval_clip_ratio": 0.0, + "eval_completion_length": 294.09001091003415, + "eval_kl": 0.023105102032423018, + "eval_loss": 0.036663174629211426, + "eval_response_clip_ratio": 0.006666666865348816, + "eval_reward": 0.2633333384990692, + "eval_reward_std": 0.31118109107017516, + "eval_rewards/MultiModalAccuracyORM": 0.2633333384990692, + "eval_runtime": 620.9868, + "eval_samples_per_second": 0.081, + "eval_steps_per_second": 0.008, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0, + "epoch": 0.7090909090909091, + "grad_norm": 1.0538874052505927, + "kl": 0.016379984514787792, + "learning_rate": 2e-07, + "loss": -0.005869853496551514, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000596046447, + "reward_std": 0.3115545302629471, + "rewards/MultiModalAccuracyORM": 0.20000000596046447, + "step": 1755, + "train_speed(iter/s)": 0.031772 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.25, + "epoch": 0.7111111111111111, + "grad_norm": 1.3903102724055967, + "kl": 0.013312125299125909, + "learning_rate": 2e-07, + "loss": 0.02017918825149536, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666666865348815, + "reward_std": 0.29776653051376345, + "rewards/MultiModalAccuracyORM": 0.26666666865348815, + "step": 1760, + "train_speed(iter/s)": 0.031786 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.8, + "epoch": 0.7131313131313132, + "grad_norm": 1.3836373367153505, + "kl": 0.0138115500099957, + "learning_rate": 2e-07, + "loss": 0.03059466779232025, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1333333358168602, + "reward_std": 0.2370448112487793, + "rewards/MultiModalAccuracyORM": 0.1333333358168602, + "step": 1765, + "train_speed(iter/s)": 0.031799 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.15, + "epoch": 0.7151515151515152, + "grad_norm": 1.4216966273148588, + "kl": 0.01580625809729099, + "learning_rate": 2e-07, + "loss": -0.019823677837848663, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000447034837, + "reward_std": 0.23631439208984376, + "rewards/MultiModalAccuracyORM": 0.12500000447034837, + "step": 1770, + "train_speed(iter/s)": 0.031824 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.55, + "epoch": 0.7171717171717171, + "grad_norm": 1.1462808808604095, + "kl": 0.05410000858828425, + "learning_rate": 2e-07, + "loss": 0.040007442235946655, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2916666783392429, + "reward_std": 0.3511467784643173, + "rewards/MultiModalAccuracyORM": 0.2916666783392429, + "step": 1775, + "train_speed(iter/s)": 0.03186 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.8, + "epoch": 0.7191919191919192, + "grad_norm": 1.1009164808291978, + "kl": 0.012898495933040977, + "learning_rate": 2e-07, + "loss": 0.03641944527626038, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.30410684943199157, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 1780, + "train_speed(iter/s)": 0.031851 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.9, + "epoch": 0.7212121212121212, + "grad_norm": 2.3163685194934924, + "kl": 0.016784476628527046, + "learning_rate": 2e-07, + "loss": 0.03075094223022461, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.1833333380520344, + "reward_std": 0.30815233290195465, + "rewards/MultiModalAccuracyORM": 0.1833333380520344, + "step": 1785, + "train_speed(iter/s)": 0.031872 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.8, + "epoch": 0.7232323232323232, + "grad_norm": 1.7199397374779446, + "kl": 0.01555022168904543, + "learning_rate": 2e-07, + "loss": -0.009939193725585938, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833334028720857, + "reward_std": 0.29634127020835876, + "rewards/MultiModalAccuracyORM": 0.20833334028720857, + "step": 1790, + "train_speed(iter/s)": 0.031884 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.85, + "epoch": 0.7252525252525253, + "grad_norm": 3.3750611418125196, + "kl": 0.017106020543724298, + "learning_rate": 2e-07, + "loss": 0.01690548360347748, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.31666667610406873, + "reward_std": 0.33083729147911073, + "rewards/MultiModalAccuracyORM": 0.31666667610406873, + "step": 1795, + "train_speed(iter/s)": 0.031895 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0, + "epoch": 0.7272727272727273, + "grad_norm": 0.9074495991885635, + "kl": 0.02303459094837308, + "learning_rate": 2e-07, + "loss": 0.013132384419441223, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3750000111758709, + "reward_std": 0.35964520275592804, + "rewards/MultiModalAccuracyORM": 0.3750000111758709, + "step": 1800, + "train_speed(iter/s)": 0.031921 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.65, + "epoch": 0.7292929292929293, + "grad_norm": 1.5259661253214234, + "kl": 0.01673535956069827, + "learning_rate": 2e-07, + "loss": -0.02494005113840103, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000037252903, + "reward_std": 0.19487895369529723, + "rewards/MultiModalAccuracyORM": 0.3250000037252903, + "step": 1805, + "train_speed(iter/s)": 0.03194 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.1, + "epoch": 0.7313131313131314, + "grad_norm": 23.765604936964085, + "kl": 0.0391254379414022, + "learning_rate": 2e-07, + "loss": 0.07751191854476928, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333410322666, + "reward_std": 0.34710128903388976, + "rewards/MultiModalAccuracyORM": 0.2083333410322666, + "step": 1810, + "train_speed(iter/s)": 0.031961 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.05, + "epoch": 0.7333333333333333, + "grad_norm": 1.844663010011029, + "kl": 0.018137864442542194, + "learning_rate": 2e-07, + "loss": 0.006373977661132813, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4500000163912773, + "reward_std": 0.4026396483182907, + "rewards/MultiModalAccuracyORM": 0.4500000163912773, + "step": 1815, + "train_speed(iter/s)": 0.031995 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.55, + "epoch": 0.7353535353535353, + "grad_norm": 3.1736096732843753, + "kl": 0.01654947120696306, + "learning_rate": 2e-07, + "loss": -0.03625679612159729, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35000000819563865, + "reward_std": 0.33621527552604674, + "rewards/MultiModalAccuracyORM": 0.35000000819563865, + "step": 1820, + "train_speed(iter/s)": 0.032006 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.95, + "epoch": 0.7373737373737373, + "grad_norm": 0.8446499426276284, + "kl": 0.01737216175533831, + "learning_rate": 2e-07, + "loss": 0.01487790048122406, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333334028720856, + "reward_std": 0.18083562850952148, + "rewards/MultiModalAccuracyORM": 0.13333334028720856, + "step": 1825, + "train_speed(iter/s)": 0.032029 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.2, + "epoch": 0.7393939393939394, + "grad_norm": 12.81365368807463, + "kl": 0.017870889231562614, + "learning_rate": 2e-07, + "loss": -0.04711695909500122, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333507180214, + "reward_std": 0.20967913568019866, + "rewards/MultiModalAccuracyORM": 0.13333333507180214, + "step": 1830, + "train_speed(iter/s)": 0.032037 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.2, + "epoch": 0.7414141414141414, + "grad_norm": 0.8602503566731583, + "kl": 0.06428629895672203, + "learning_rate": 2e-07, + "loss": 0.0755260705947876, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666738688946, + "reward_std": 0.3033378630876541, + "rewards/MultiModalAccuracyORM": 0.2666666738688946, + "step": 1835, + "train_speed(iter/s)": 0.032056 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.2, + "epoch": 0.7434343434343434, + "grad_norm": 4.0474296652864865, + "kl": 0.025846300972625615, + "learning_rate": 2e-07, + "loss": 0.03730224370956421, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4250000067055225, + "reward_std": 0.4119280993938446, + "rewards/MultiModalAccuracyORM": 0.4250000067055225, + "step": 1840, + "train_speed(iter/s)": 0.032071 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.95, + "epoch": 0.7454545454545455, + "grad_norm": 2.232537088059296, + "kl": 0.01461884556338191, + "learning_rate": 2e-07, + "loss": -0.004309023916721344, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666667237877845, + "reward_std": 0.313649520277977, + "rewards/MultiModalAccuracyORM": 0.26666667237877845, + "step": 1845, + "train_speed(iter/s)": 0.032079 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.05, + "epoch": 0.7474747474747475, + "grad_norm": 2.3016578929204963, + "kl": 0.021660260390490294, + "learning_rate": 2e-07, + "loss": -0.03592326939105987, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3083333395421505, + "reward_std": 0.26591232419013977, + "rewards/MultiModalAccuracyORM": 0.3083333395421505, + "step": 1850, + "train_speed(iter/s)": 0.032095 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.8, + "epoch": 0.7494949494949495, + "grad_norm": 0.040878648566390896, + "kl": 0.016377491503953935, + "learning_rate": 2e-07, + "loss": -0.007729291915893555, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2916666746139526, + "reward_std": 0.2752989321947098, + "rewards/MultiModalAccuracyORM": 0.2916666746139526, + "step": 1855, + "train_speed(iter/s)": 0.03211 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.6, + "epoch": 0.7515151515151515, + "grad_norm": 1.745735435957987, + "kl": 0.01565658366307616, + "learning_rate": 2e-07, + "loss": 0.03239756226539612, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333395421505, + "reward_std": 0.34933354556560514, + "rewards/MultiModalAccuracyORM": 0.2083333395421505, + "step": 1860, + "train_speed(iter/s)": 0.03212 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.15, + "epoch": 0.7535353535353535, + "grad_norm": 1.1867439188156674, + "kl": 0.013316378556191921, + "learning_rate": 2e-07, + "loss": 0.02554565668106079, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.19166667386889458, + "reward_std": 0.3019101768732071, + "rewards/MultiModalAccuracyORM": 0.19166667386889458, + "step": 1865, + "train_speed(iter/s)": 0.032117 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8, + "epoch": 0.7555555555555555, + "grad_norm": 1.6710102974959882, + "kl": 0.0185435910243541, + "learning_rate": 2e-07, + "loss": 0.0040223002433776855, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166666865348816, + "reward_std": 0.32418315708637235, + "rewards/MultiModalAccuracyORM": 0.19166666865348816, + "step": 1870, + "train_speed(iter/s)": 0.032131 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.95, + "epoch": 0.7575757575757576, + "grad_norm": 1.6067423711502076, + "kl": 0.019126034528017043, + "learning_rate": 2e-07, + "loss": 0.044132769107818604, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.22500001192092894, + "reward_std": 0.2815650999546051, + "rewards/MultiModalAccuracyORM": 0.22500001192092894, + "step": 1875, + "train_speed(iter/s)": 0.032154 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.55, + "epoch": 0.7595959595959596, + "grad_norm": 0.028316390333514352, + "kl": 0.013678487855941057, + "learning_rate": 2e-07, + "loss": 0.011262473464012147, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.0916666679084301, + "reward_std": 0.16855751872062683, + "rewards/MultiModalAccuracyORM": 0.0916666679084301, + "step": 1880, + "train_speed(iter/s)": 0.032167 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.7, + "epoch": 0.7616161616161616, + "grad_norm": 1.2321236683558325, + "kl": 0.016270547499880196, + "learning_rate": 2e-07, + "loss": -0.05071290135383606, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000223517417, + "reward_std": 0.3174407839775085, + "rewards/MultiModalAccuracyORM": 0.17500000223517417, + "step": 1885, + "train_speed(iter/s)": 0.032165 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.25, + "epoch": 0.7636363636363637, + "grad_norm": 1.7168712559722545, + "kl": 0.020080643892288207, + "learning_rate": 2e-07, + "loss": 0.014709633588790894, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166667014360426, + "reward_std": 0.27371591329574585, + "rewards/MultiModalAccuracyORM": 0.29166667014360426, + "step": 1890, + "train_speed(iter/s)": 0.032193 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.8, + "epoch": 0.7656565656565657, + "grad_norm": 1.4396153411408321, + "kl": 0.015443798806518316, + "learning_rate": 2e-07, + "loss": -0.03242262601852417, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833333879709243, + "reward_std": 0.3297544836997986, + "rewards/MultiModalAccuracyORM": 0.20833333879709243, + "step": 1895, + "train_speed(iter/s)": 0.032196 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.0, + "epoch": 0.7676767676767676, + "grad_norm": 2.153471273550742, + "kl": 0.019460227340459824, + "learning_rate": 2e-07, + "loss": 0.06560848355293274, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3083333380520344, + "reward_std": 0.23860624432563782, + "rewards/MultiModalAccuracyORM": 0.3083333380520344, + "step": 1900, + "train_speed(iter/s)": 0.032197 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.55, + "epoch": 0.7696969696969697, + "grad_norm": 1.0432229656314325, + "kl": 0.017312650848180056, + "learning_rate": 2e-07, + "loss": 0.031227093935012818, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4583333432674408, + "reward_std": 0.31520852744579314, + "rewards/MultiModalAccuracyORM": 0.4583333432674408, + "step": 1905, + "train_speed(iter/s)": 0.032215 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.75, + "epoch": 0.7717171717171717, + "grad_norm": 1.180310161050895, + "kl": 0.015469088219106197, + "learning_rate": 2e-07, + "loss": -0.015459638833999634, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666716337204, + "reward_std": 0.36887405812740326, + "rewards/MultiModalAccuracyORM": 0.2666666716337204, + "step": 1910, + "train_speed(iter/s)": 0.032224 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.3, + "epoch": 0.7737373737373737, + "grad_norm": 0.06901782029330109, + "kl": 0.020954974088817836, + "learning_rate": 2e-07, + "loss": 0.023259681463241578, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666761040688, + "reward_std": 0.3164801448583603, + "rewards/MultiModalAccuracyORM": 0.3666666761040688, + "step": 1915, + "train_speed(iter/s)": 0.032258 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.35, + "epoch": 0.7757575757575758, + "grad_norm": 1.2725867918795215, + "kl": 0.023106640204787254, + "learning_rate": 2e-07, + "loss": 0.05665600299835205, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3500000096857548, + "reward_std": 0.3811877518892288, + "rewards/MultiModalAccuracyORM": 0.3500000096857548, + "step": 1920, + "train_speed(iter/s)": 0.032278 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.0, + "epoch": 0.7777777777777778, + "grad_norm": 1.9367541891791467, + "kl": 0.05693813692778349, + "learning_rate": 2e-07, + "loss": 0.016718414425849915, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4583333395421505, + "reward_std": 0.343126979470253, + "rewards/MultiModalAccuracyORM": 0.4583333395421505, + "step": 1925, + "train_speed(iter/s)": 0.032298 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.8, + "epoch": 0.7797979797979798, + "grad_norm": 1.8056180584349435, + "kl": 0.018875516019761562, + "learning_rate": 2e-07, + "loss": 0.026608097553253173, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666746139526, + "reward_std": 0.2745299518108368, + "rewards/MultiModalAccuracyORM": 0.3416666746139526, + "step": 1930, + "train_speed(iter/s)": 0.032307 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.75, + "epoch": 0.7818181818181819, + "grad_norm": 0.9474431031282609, + "kl": 0.017304270621389152, + "learning_rate": 2e-07, + "loss": 0.054594576358795166, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833334028720855, + "reward_std": 0.2940494120121002, + "rewards/MultiModalAccuracyORM": 0.15833334028720855, + "step": 1935, + "train_speed(iter/s)": 0.032311 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.95, + "epoch": 0.7838383838383839, + "grad_norm": 0.31279661376062395, + "kl": 0.015814543049782515, + "learning_rate": 2e-07, + "loss": 0.022085633873939515, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.2250000111758709, + "reward_std": 0.2674381732940674, + "rewards/MultiModalAccuracyORM": 0.2250000111758709, + "step": 1940, + "train_speed(iter/s)": 0.032322 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.15, + "epoch": 0.7858585858585858, + "grad_norm": 1.153254924584669, + "kl": 0.022455749101936817, + "learning_rate": 2e-07, + "loss": -0.005599388480186462, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4583333432674408, + "reward_std": 0.32981408536434176, + "rewards/MultiModalAccuracyORM": 0.4583333432674408, + "step": 1945, + "train_speed(iter/s)": 0.032335 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.8, + "epoch": 0.7878787878787878, + "grad_norm": 1.477534719499969, + "kl": 0.015763588808476926, + "learning_rate": 2e-07, + "loss": 0.00195084810256958, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000089406967, + "reward_std": 0.3789910912513733, + "rewards/MultiModalAccuracyORM": 0.3250000089406967, + "step": 1950, + "train_speed(iter/s)": 0.032341 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.25, + "epoch": 0.7898989898989899, + "grad_norm": 2.285698337644553, + "kl": 0.02175712687894702, + "learning_rate": 2e-07, + "loss": -0.00021601170301437377, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666775941849, + "reward_std": 0.3043610692024231, + "rewards/MultiModalAccuracyORM": 0.3416666775941849, + "step": 1955, + "train_speed(iter/s)": 0.032362 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.6, + "epoch": 0.7919191919191919, + "grad_norm": 1.2460067583966314, + "kl": 0.019805201794952154, + "learning_rate": 2e-07, + "loss": 0.016333769261837005, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.12500000074505807, + "reward_std": 0.24869927167892455, + "rewards/MultiModalAccuracyORM": 0.12500000074505807, + "step": 1960, + "train_speed(iter/s)": 0.032387 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.15, + "epoch": 0.793939393939394, + "grad_norm": 1.2892695908066818, + "kl": 0.016942942142486574, + "learning_rate": 2e-07, + "loss": 0.05399552583694458, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.32777645289897916, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 1965, + "train_speed(iter/s)": 0.032394 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5, + "epoch": 0.795959595959596, + "grad_norm": 1.15365791846424, + "kl": 0.015720244217664003, + "learning_rate": 2e-07, + "loss": 0.03231356143951416, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.31666667461395265, + "reward_std": 0.35792473554611204, + "rewards/MultiModalAccuracyORM": 0.31666667461395265, + "step": 1970, + "train_speed(iter/s)": 0.032404 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.7, + "epoch": 0.797979797979798, + "grad_norm": 2.0846813962071784, + "kl": 0.02078899824991822, + "learning_rate": 2e-07, + "loss": 0.0010771095752716065, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.36666667759418486, + "reward_std": 0.34783414006233215, + "rewards/MultiModalAccuracyORM": 0.36666667759418486, + "step": 1975, + "train_speed(iter/s)": 0.032425 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.3, + "epoch": 0.8, + "grad_norm": 1.8113050624363713, + "kl": 0.02985860425978899, + "learning_rate": 2e-07, + "loss": -0.02335626631975174, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2416666716337204, + "reward_std": 0.2940494120121002, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 1980, + "train_speed(iter/s)": 0.032446 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.5, + "epoch": 0.802020202020202, + "grad_norm": 0.564590741108868, + "kl": 0.023125759046524762, + "learning_rate": 2e-07, + "loss": -0.003053317964076996, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000074505806, + "reward_std": 0.2599243938922882, + "rewards/MultiModalAccuracyORM": 0.3250000074505806, + "step": 1985, + "train_speed(iter/s)": 0.032463 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.45, + "epoch": 0.804040404040404, + "grad_norm": 1.8493252896290147, + "kl": 0.026984267216175795, + "learning_rate": 2e-07, + "loss": -0.06052778363227844, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.3063987076282501, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 1990, + "train_speed(iter/s)": 0.032484 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.1, + "epoch": 0.806060606060606, + "grad_norm": 1.0923781003773394, + "kl": 0.01942981770262122, + "learning_rate": 2e-07, + "loss": 0.013067781925201416, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166667014360428, + "reward_std": 0.37893148958683015, + "rewards/MultiModalAccuracyORM": 0.19166667014360428, + "step": 1995, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 1.528791414674691, + "learning_rate": 2e-07, + "loss": -0.023299628496170045, + "memory(GiB)": 67.41, + "step": 2000, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.8080808080808081, + "eval_clip_ratio": 0.0, + "eval_completion_length": 260.0916757965088, + "eval_kl": 0.03659271189942956, + "eval_loss": 0.022981010377407074, + "eval_response_clip_ratio": 0.001666666716337204, + "eval_reward": 0.3116666758060455, + "eval_reward_std": 0.3461023557186127, + "eval_rewards/MultiModalAccuracyORM": 0.3116666758060455, + "eval_runtime": 577.7727, + "eval_samples_per_second": 0.087, + "eval_steps_per_second": 0.009, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.1, + "epoch": 0.8101010101010101, + "grad_norm": 0.6858703005476166, + "kl": 0.021803660970181226, + "learning_rate": 2e-07, + "loss": -0.01412125825881958, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17916667126119137, + "reward_std": 0.27865810990333556, + "rewards/MultiModalAccuracyORM": 0.17916667126119137, + "step": 2005, + "train_speed(iter/s)": 0.032026 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.3, + "epoch": 0.8121212121212121, + "grad_norm": 1.240645294227265, + "kl": 0.021775086782872675, + "learning_rate": 2e-07, + "loss": -0.009345543384552003, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666738688946, + "reward_std": 0.25591449439525604, + "rewards/MultiModalAccuracyORM": 0.2666666738688946, + "step": 2010, + "train_speed(iter/s)": 0.032055 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.75, + "epoch": 0.8141414141414142, + "grad_norm": 0.8866953662012617, + "kl": 0.025209260638803244, + "learning_rate": 2e-07, + "loss": -0.007123380899429321, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334103226664, + "reward_std": 0.38077115416526797, + "rewards/MultiModalAccuracyORM": 0.33333334103226664, + "step": 2015, + "train_speed(iter/s)": 0.032076 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.65, + "epoch": 0.8161616161616162, + "grad_norm": 2.421458976490517, + "kl": 0.019219990959390996, + "learning_rate": 2e-07, + "loss": -0.046216756105422974, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25833334028720856, + "reward_std": 0.3353258162736893, + "rewards/MultiModalAccuracyORM": 0.25833334028720856, + "step": 2020, + "train_speed(iter/s)": 0.032083 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.7, + "epoch": 0.8181818181818182, + "grad_norm": 1.323113651332219, + "kl": 0.0228486392647028, + "learning_rate": 2e-07, + "loss": 0.018266260623931885, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4083333402872086, + "reward_std": 0.22224705517292023, + "rewards/MultiModalAccuracyORM": 0.4083333402872086, + "step": 2025, + "train_speed(iter/s)": 0.032101 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.3, + "epoch": 0.8202020202020202, + "grad_norm": 1.351077195712627, + "kl": 0.020902801770716906, + "learning_rate": 2e-07, + "loss": 0.061953216791152954, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3666666768491268, + "reward_std": 0.39707074165344236, + "rewards/MultiModalAccuracyORM": 0.3666666768491268, + "step": 2030, + "train_speed(iter/s)": 0.032117 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.7, + "epoch": 0.8222222222222222, + "grad_norm": 1.2938426116281756, + "kl": 0.016236740909516812, + "learning_rate": 2e-07, + "loss": 0.06566336154937744, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25000000596046446, + "reward_std": 0.32052563428878783, + "rewards/MultiModalAccuracyORM": 0.25000000596046446, + "step": 2035, + "train_speed(iter/s)": 0.032133 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.7, + "epoch": 0.8242424242424242, + "grad_norm": 0.037993510667949114, + "kl": 0.018295575771480797, + "learning_rate": 2e-07, + "loss": -0.04348133802413941, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.16666667014360428, + "reward_std": 0.3227578908205032, + "rewards/MultiModalAccuracyORM": 0.16666667014360428, + "step": 2040, + "train_speed(iter/s)": 0.032143 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.4, + "epoch": 0.8262626262626263, + "grad_norm": 1.7568452367664746, + "kl": 0.022255995497107505, + "learning_rate": 2e-07, + "loss": -0.010665307939052581, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.32500000670552254, + "reward_std": 0.36062985360622407, + "rewards/MultiModalAccuracyORM": 0.32500000670552254, + "step": 2045, + "train_speed(iter/s)": 0.032157 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.9, + "epoch": 0.8282828282828283, + "grad_norm": 0.3584645693525935, + "kl": 0.026058319211006164, + "learning_rate": 2e-07, + "loss": 0.008829855918884277, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333805203437, + "reward_std": 0.26518189907073975, + "rewards/MultiModalAccuracyORM": 0.13333333805203437, + "step": 2050, + "train_speed(iter/s)": 0.032167 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.7, + "epoch": 0.8303030303030303, + "grad_norm": 1.0414808041792807, + "kl": 0.024912268854677677, + "learning_rate": 2e-07, + "loss": -0.0169498473405838, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666731238365, + "reward_std": 0.3943714380264282, + "rewards/MultiModalAccuracyORM": 0.2666666731238365, + "step": 2055, + "train_speed(iter/s)": 0.032187 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.25, + "epoch": 0.8323232323232324, + "grad_norm": 0.8330776026520852, + "kl": 0.017706521693617104, + "learning_rate": 2e-07, + "loss": -0.0015785574913024902, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.33333334475755694, + "reward_std": 0.3066769391298294, + "rewards/MultiModalAccuracyORM": 0.33333334475755694, + "step": 2060, + "train_speed(iter/s)": 0.032184 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.8, + "epoch": 0.8343434343434344, + "grad_norm": 1.286067527047748, + "kl": 0.016120643261820077, + "learning_rate": 2e-07, + "loss": 0.010149773955345155, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1583333395421505, + "reward_std": 0.30636311173439024, + "rewards/MultiModalAccuracyORM": 0.1583333395421505, + "step": 2065, + "train_speed(iter/s)": 0.032193 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.9, + "epoch": 0.8363636363636363, + "grad_norm": 0.8716186001419639, + "kl": 0.025367347244173288, + "learning_rate": 2e-07, + "loss": -0.00466080904006958, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35000001192092894, + "reward_std": 0.29784067571163175, + "rewards/MultiModalAccuracyORM": 0.35000001192092894, + "step": 2070, + "train_speed(iter/s)": 0.032191 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.15, + "epoch": 0.8383838383838383, + "grad_norm": 2.4195845756657106, + "kl": 0.027777089178562163, + "learning_rate": 2e-07, + "loss": 0.0020249992609024047, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30833333879709246, + "reward_std": 0.2885376811027527, + "rewards/MultiModalAccuracyORM": 0.30833333879709246, + "step": 2075, + "train_speed(iter/s)": 0.032214 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.1, + "epoch": 0.8404040404040404, + "grad_norm": 1.3049618894030401, + "kl": 0.055896259006112815, + "learning_rate": 2e-07, + "loss": 0.02880297303199768, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000037252903, + "reward_std": 0.2853323519229889, + "rewards/MultiModalAccuracyORM": 0.3250000037252903, + "step": 2080, + "train_speed(iter/s)": 0.032232 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.05, + "epoch": 0.8424242424242424, + "grad_norm": 0.8244280245530798, + "kl": 0.02422601543366909, + "learning_rate": 2e-07, + "loss": -0.006161260604858399, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.29560841917991637, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 2085, + "train_speed(iter/s)": 0.032237 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.9, + "epoch": 0.8444444444444444, + "grad_norm": 1.3048496117538315, + "kl": 0.01891004741191864, + "learning_rate": 2e-07, + "loss": -0.049727249145507815, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.14166666865348815, + "reward_std": 0.3290000528097153, + "rewards/MultiModalAccuracyORM": 0.14166666865348815, + "step": 2090, + "train_speed(iter/s)": 0.032254 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.05, + "epoch": 0.8464646464646465, + "grad_norm": 1.1403945225556158, + "kl": 0.01645102323964238, + "learning_rate": 2e-07, + "loss": -0.013469058275222778, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666716337204, + "reward_std": 0.21999078691005708, + "rewards/MultiModalAccuracyORM": 0.3166666716337204, + "step": 2095, + "train_speed(iter/s)": 0.032269 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.45, + "epoch": 0.8484848484848485, + "grad_norm": 0.9707324672728811, + "kl": 0.029299197159707545, + "learning_rate": 2e-07, + "loss": 0.03684330582618713, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.37500000521540644, + "reward_std": 0.2792848199605942, + "rewards/MultiModalAccuracyORM": 0.37500000521540644, + "step": 2100, + "train_speed(iter/s)": 0.032273 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.2, + "epoch": 0.8505050505050505, + "grad_norm": 1.218289932401983, + "kl": 0.02226941576227546, + "learning_rate": 2e-07, + "loss": 0.049808406829833986, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000521540642, + "reward_std": 0.3563301384449005, + "rewards/MultiModalAccuracyORM": 0.20000000521540642, + "step": 2105, + "train_speed(iter/s)": 0.032281 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.25, + "epoch": 0.8525252525252526, + "grad_norm": 0.5870668188140559, + "kl": 0.07989006163552403, + "learning_rate": 2e-07, + "loss": 0.03518458604812622, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.5250000037252903, + "reward_std": 0.21374863088130952, + "rewards/MultiModalAccuracyORM": 0.5250000037252903, + "step": 2110, + "train_speed(iter/s)": 0.032307 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3, + "epoch": 0.8545454545454545, + "grad_norm": 0.020336894429096215, + "kl": 0.037888656742870806, + "learning_rate": 2e-07, + "loss": -0.014006611704826356, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000447034836, + "reward_std": 0.2792848199605942, + "rewards/MultiModalAccuracyORM": 0.17500000447034836, + "step": 2115, + "train_speed(iter/s)": 0.032314 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.15, + "epoch": 0.8565656565656565, + "grad_norm": 1.21835723660335, + "kl": 0.017908666748553514, + "learning_rate": 2e-07, + "loss": -0.01713634133338928, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.28333333879709244, + "reward_std": 0.40485736131668093, + "rewards/MultiModalAccuracyORM": 0.28333333879709244, + "step": 2120, + "train_speed(iter/s)": 0.032326 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.75, + "epoch": 0.8585858585858586, + "grad_norm": 2.0067295271647985, + "kl": 0.026275785733014347, + "learning_rate": 2e-07, + "loss": 0.05382862687110901, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667684912682, + "reward_std": 0.37440980076789854, + "rewards/MultiModalAccuracyORM": 0.24166667684912682, + "step": 2125, + "train_speed(iter/s)": 0.032342 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.4, + "epoch": 0.8606060606060606, + "grad_norm": 1.6449849094427824, + "kl": 0.026703681144863368, + "learning_rate": 2e-07, + "loss": 0.07567849159240722, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666775941849, + "reward_std": 0.383000984787941, + "rewards/MultiModalAccuracyORM": 0.3166666775941849, + "step": 2130, + "train_speed(iter/s)": 0.032348 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.25, + "epoch": 0.8626262626262626, + "grad_norm": 1.015338618446623, + "kl": 0.02266251090914011, + "learning_rate": 2e-07, + "loss": 0.0509304940700531, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1666666679084301, + "reward_std": 0.29462080299854276, + "rewards/MultiModalAccuracyORM": 0.1666666679084301, + "step": 2135, + "train_speed(iter/s)": 0.032357 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.85, + "epoch": 0.8646464646464647, + "grad_norm": 1.431447102184734, + "kl": 0.05701554603874683, + "learning_rate": 2e-07, + "loss": -0.008281412720680236, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3166666693985462, + "reward_std": 0.23030244410037995, + "rewards/MultiModalAccuracyORM": 0.3166666693985462, + "step": 2140, + "train_speed(iter/s)": 0.032369 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.3, + "epoch": 0.8666666666666667, + "grad_norm": 1.6811314333402767, + "kl": 0.06945961127057672, + "learning_rate": 2e-07, + "loss": 0.04042296409606934, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3333333387970924, + "reward_std": 0.28154108822345736, + "rewards/MultiModalAccuracyORM": 0.3333333387970924, + "step": 2145, + "train_speed(iter/s)": 0.032395 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.2, + "epoch": 0.8686868686868687, + "grad_norm": 1.7106832961427025, + "kl": 0.02610405897721648, + "learning_rate": 2e-07, + "loss": 0.018969109654426573, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333395421505, + "reward_std": 0.316710364818573, + "rewards/MultiModalAccuracyORM": 0.2333333395421505, + "step": 2150, + "train_speed(iter/s)": 0.032414 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.85, + "epoch": 0.8707070707070707, + "grad_norm": 2.2093533026187493, + "kl": 0.027732077380642296, + "learning_rate": 2e-07, + "loss": -0.04979143142700195, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000014901161, + "reward_std": 0.2782616138458252, + "rewards/MultiModalAccuracyORM": 0.1500000014901161, + "step": 2155, + "train_speed(iter/s)": 0.032415 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.4, + "epoch": 0.8727272727272727, + "grad_norm": 1.4196000317723967, + "kl": 0.017831438966095448, + "learning_rate": 2e-07, + "loss": -0.016922876238822937, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1000000037252903, + "reward_std": 0.18733201026916504, + "rewards/MultiModalAccuracyORM": 0.1000000037252903, + "step": 2160, + "train_speed(iter/s)": 0.032428 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.35, + "epoch": 0.8747474747474747, + "grad_norm": 1.2738059173790044, + "kl": 0.027828316576778887, + "learning_rate": 2e-07, + "loss": 0.0020169079303741454, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.43333334401249884, + "reward_std": 0.3993601739406586, + "rewards/MultiModalAccuracyORM": 0.43333334401249884, + "step": 2165, + "train_speed(iter/s)": 0.032444 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.65, + "epoch": 0.8767676767676768, + "grad_norm": 0.5711871148221249, + "kl": 0.021445599384605885, + "learning_rate": 2e-07, + "loss": -0.021869242191314697, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2083333373069763, + "reward_std": 0.2652174890041351, + "rewards/MultiModalAccuracyORM": 0.2083333373069763, + "step": 2170, + "train_speed(iter/s)": 0.032448 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.3, + "epoch": 0.8787878787878788, + "grad_norm": 0.7831111399049216, + "kl": 0.02290212018415332, + "learning_rate": 2e-07, + "loss": 0.0058018617331981655, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20833333805203438, + "reward_std": 0.2191862165927887, + "rewards/MultiModalAccuracyORM": 0.20833333805203438, + "step": 2175, + "train_speed(iter/s)": 0.032457 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.4, + "epoch": 0.8808080808080808, + "grad_norm": 0.8065748585811442, + "kl": 0.020074152015149595, + "learning_rate": 2e-07, + "loss": -0.01390417218208313, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2333333410322666, + "reward_std": 0.355694904923439, + "rewards/MultiModalAccuracyORM": 0.2333333410322666, + "step": 2180, + "train_speed(iter/s)": 0.032459 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.9, + "epoch": 0.8828282828282829, + "grad_norm": 1.4489205967938936, + "kl": 0.023838929925113918, + "learning_rate": 2e-07, + "loss": -0.004918675124645233, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2666666775941849, + "reward_std": 0.350342208147049, + "rewards/MultiModalAccuracyORM": 0.2666666775941849, + "step": 2185, + "train_speed(iter/s)": 0.032473 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.05, + "epoch": 0.8848484848484849, + "grad_norm": 0.02924850303466511, + "kl": 0.022593512199819088, + "learning_rate": 2e-07, + "loss": 0.023534037172794342, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3250000037252903, + "reward_std": 0.2567190647125244, + "rewards/MultiModalAccuracyORM": 0.3250000037252903, + "step": 2190, + "train_speed(iter/s)": 0.032483 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.65, + "epoch": 0.8868686868686869, + "grad_norm": 0.03836674408051373, + "kl": 0.02357559949159622, + "learning_rate": 2e-07, + "loss": -0.039435860514640805, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.14166667088866233, + "reward_std": 0.20519061088562013, + "rewards/MultiModalAccuracyORM": 0.14166667088866233, + "step": 2195, + "train_speed(iter/s)": 0.032503 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.5, + "epoch": 0.8888888888888888, + "grad_norm": 0.38821860650162454, + "kl": 0.018242907989770175, + "learning_rate": 2e-07, + "loss": 0.012925875186920167, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333805203437, + "reward_std": 0.23481498062610626, + "rewards/MultiModalAccuracyORM": 0.13333333805203437, + "step": 2200, + "train_speed(iter/s)": 0.032493 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.85, + "epoch": 0.8909090909090909, + "grad_norm": 0.5940318264351889, + "kl": 0.017311586905270814, + "learning_rate": 2e-07, + "loss": 0.029576906561851503, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.05000000149011612, + "reward_std": 0.13558491468429565, + "rewards/MultiModalAccuracyORM": 0.05000000149011612, + "step": 2205, + "train_speed(iter/s)": 0.03248 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.3, + "epoch": 0.8929292929292929, + "grad_norm": 0.8492665899886147, + "kl": 0.019003557693213224, + "learning_rate": 2e-07, + "loss": 0.07017003893852233, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333395421505, + "reward_std": 0.3041664451360703, + "rewards/MultiModalAccuracyORM": 0.1833333395421505, + "step": 2210, + "train_speed(iter/s)": 0.032484 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.35, + "epoch": 0.8949494949494949, + "grad_norm": 1.0856523861671716, + "kl": 0.027655008435249328, + "learning_rate": 2e-07, + "loss": -0.004737144708633423, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2833333365619183, + "reward_std": 0.2260383188724518, + "rewards/MultiModalAccuracyORM": 0.2833333365619183, + "step": 2215, + "train_speed(iter/s)": 0.032493 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.9, + "epoch": 0.896969696969697, + "grad_norm": 1.448236523400996, + "kl": 0.018335943669080736, + "learning_rate": 2e-07, + "loss": -0.03629968166351318, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333380520344, + "reward_std": 0.3113932520151138, + "rewards/MultiModalAccuracyORM": 0.2583333380520344, + "step": 2220, + "train_speed(iter/s)": 0.032499 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.55, + "epoch": 0.898989898989899, + "grad_norm": 7.847983245982503, + "kl": 0.049338278640061614, + "learning_rate": 2e-07, + "loss": -0.000819157063961029, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.19166666865348816, + "reward_std": 0.3003867596387863, + "rewards/MultiModalAccuracyORM": 0.19166666865348816, + "step": 2225, + "train_speed(iter/s)": 0.032517 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.65, + "epoch": 0.901010101010101, + "grad_norm": 1.8170235507507255, + "kl": 0.018048797827214004, + "learning_rate": 2e-07, + "loss": -0.07969279289245605, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1833333395421505, + "reward_std": 0.3167103588581085, + "rewards/MultiModalAccuracyORM": 0.1833333395421505, + "step": 2230, + "train_speed(iter/s)": 0.032523 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.15, + "epoch": 0.9030303030303031, + "grad_norm": 1.609249526669931, + "kl": 0.026530137099325658, + "learning_rate": 2e-07, + "loss": -0.008447715640068054, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.31666667461395265, + "reward_std": 0.3106628268957138, + "rewards/MultiModalAccuracyORM": 0.31666667461395265, + "step": 2235, + "train_speed(iter/s)": 0.032541 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.8, + "epoch": 0.9050505050505051, + "grad_norm": 0.7699373347527835, + "kl": 0.021872828295454384, + "learning_rate": 2e-07, + "loss": 0.034666317701339724, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15833334028720855, + "reward_std": 0.24487241208553315, + "rewards/MultiModalAccuracyORM": 0.15833334028720855, + "step": 2240, + "train_speed(iter/s)": 0.032555 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.45, + "epoch": 0.907070707070707, + "grad_norm": 1.464770865938276, + "kl": 0.018026039376854895, + "learning_rate": 2e-07, + "loss": 0.048737600445747375, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000596046447, + "reward_std": 0.25866150557994844, + "rewards/MultiModalAccuracyORM": 0.17500000596046447, + "step": 2245, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.7098872390259656, + "learning_rate": 2e-07, + "loss": 0.09721781015396118, + "memory(GiB)": 67.41, + "step": 2250, + "train_speed(iter/s)": 0.032576 + }, + { + "epoch": 0.9090909090909091, + "eval_clip_ratio": 0.0, + "eval_completion_length": 274.43000770568847, + "eval_kl": 0.027916996125131845, + "eval_loss": 0.03821293264627457, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.3183333395421505, + "eval_reward_std": 0.3168588674068451, + "eval_rewards/MultiModalAccuracyORM": 0.3183333395421505, + "eval_runtime": 541.6774, + "eval_samples_per_second": 0.092, + "eval_steps_per_second": 0.009, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.35, + "epoch": 0.9111111111111111, + "grad_norm": 2.547618884699712, + "kl": 0.022893223259598017, + "learning_rate": 2e-07, + "loss": 0.025194990634918212, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.025, + "reward": 0.4291666805744171, + "reward_std": 0.3938093319535255, + "rewards/MultiModalAccuracyORM": 0.4291666805744171, + "step": 2255, + "train_speed(iter/s)": 0.032148 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.35, + "epoch": 0.9131313131313131, + "grad_norm": 1.5269748079222623, + "kl": 0.08079174058511854, + "learning_rate": 2e-07, + "loss": 0.030228087306022645, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.40833334252238274, + "reward_std": 0.21374863088130952, + "rewards/MultiModalAccuracyORM": 0.40833334252238274, + "step": 2260, + "train_speed(iter/s)": 0.032172 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.15, + "epoch": 0.9151515151515152, + "grad_norm": 1.4348651198084164, + "kl": 0.022139840014278888, + "learning_rate": 2e-07, + "loss": -0.012193611264228821, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.32500001341104506, + "reward_std": 0.4274616837501526, + "rewards/MultiModalAccuracyORM": 0.32500001341104506, + "step": 2265, + "train_speed(iter/s)": 0.032191 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.9, + "epoch": 0.9171717171717172, + "grad_norm": 0.05197357590854231, + "kl": 0.021040867920964955, + "learning_rate": 2e-07, + "loss": 0.011036497354507447, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.11666666939854622, + "reward_std": 0.0966599702835083, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 2270, + "train_speed(iter/s)": 0.0322 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.75, + "epoch": 0.9191919191919192, + "grad_norm": 0.4699960941829414, + "kl": 0.0629787240177393, + "learning_rate": 2e-07, + "loss": 0.06589013934135438, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.05, + "reward": 0.30833334252238276, + "reward_std": 0.2877832442522049, + "rewards/MultiModalAccuracyORM": 0.30833334252238276, + "step": 2275, + "train_speed(iter/s)": 0.03221 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.9, + "epoch": 0.9212121212121213, + "grad_norm": 1.3008238080628904, + "kl": 0.026494850823655724, + "learning_rate": 2e-07, + "loss": -0.00757303386926651, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3833333425223827, + "reward_std": 0.33621527552604674, + "rewards/MultiModalAccuracyORM": 0.3833333425223827, + "step": 2280, + "train_speed(iter/s)": 0.032219 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8, + "epoch": 0.9232323232323232, + "grad_norm": 1.524415453755262, + "kl": 0.017740064300596714, + "learning_rate": 2e-07, + "loss": 0.04480080604553223, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.2940850019454956, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 2285, + "train_speed(iter/s)": 0.032228 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.9, + "epoch": 0.9252525252525252, + "grad_norm": 1.1770888642120725, + "kl": 0.020085165183991192, + "learning_rate": 2e-07, + "loss": 0.003685349225997925, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.30000001192092896, + "reward_std": 0.394838485121727, + "rewards/MultiModalAccuracyORM": 0.30000001192092896, + "step": 2290, + "train_speed(iter/s)": 0.032233 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.95, + "epoch": 0.9272727272727272, + "grad_norm": 0.6172113323574312, + "kl": 0.025718586426228283, + "learning_rate": 2e-07, + "loss": -0.038769152760505673, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.15000000223517418, + "reward_std": 0.21378422081470488, + "rewards/MultiModalAccuracyORM": 0.15000000223517418, + "step": 2295, + "train_speed(iter/s)": 0.032258 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.15, + "epoch": 0.9292929292929293, + "grad_norm": 0.9460784361294594, + "kl": 0.02194049907848239, + "learning_rate": 2e-07, + "loss": 0.003476354479789734, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.20000000596046447, + "reward_std": 0.2918527454137802, + "rewards/MultiModalAccuracyORM": 0.20000000596046447, + "step": 2300, + "train_speed(iter/s)": 0.032265 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.1, + "epoch": 0.9313131313131313, + "grad_norm": 0.531311987853284, + "kl": 0.01724575264379382, + "learning_rate": 2e-07, + "loss": 0.026836919784545898, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.21666666939854623, + "reward_std": 0.3141853272914886, + "rewards/MultiModalAccuracyORM": 0.21666666939854623, + "step": 2305, + "train_speed(iter/s)": 0.03226 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.25, + "epoch": 0.9333333333333333, + "grad_norm": 0.4613731219877519, + "kl": 0.019346164539456367, + "learning_rate": 2e-07, + "loss": -0.0063665717840194706, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334252238275, + "reward_std": 0.22979399859905242, + "rewards/MultiModalAccuracyORM": 0.23333334252238275, + "step": 2310, + "train_speed(iter/s)": 0.032264 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.95, + "epoch": 0.9353535353535354, + "grad_norm": 0.5351247883145774, + "kl": 0.036143379751592875, + "learning_rate": 2e-07, + "loss": 0.0059957727789878845, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.34166666939854623, + "reward_std": 0.273775514960289, + "rewards/MultiModalAccuracyORM": 0.34166666939854623, + "step": 2315, + "train_speed(iter/s)": 0.032281 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.95, + "epoch": 0.9373737373737374, + "grad_norm": 1.006473375720635, + "kl": 0.022489387728273868, + "learning_rate": 2e-07, + "loss": -0.02273874878883362, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333333805203438, + "reward_std": 0.2651819050312042, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 2320, + "train_speed(iter/s)": 0.032281 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.75, + "epoch": 0.9393939393939394, + "grad_norm": 1.0081193199484457, + "kl": 0.01994982697069645, + "learning_rate": 2e-07, + "loss": 0.06933374404907226, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3083333380520344, + "reward_std": 0.4342752188444138, + "rewards/MultiModalAccuracyORM": 0.3083333380520344, + "step": 2325, + "train_speed(iter/s)": 0.032295 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.55, + "epoch": 0.9414141414141414, + "grad_norm": 0.05459849756184899, + "kl": 0.022601721994578838, + "learning_rate": 2e-07, + "loss": 0.018863174319267272, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2833333440124989, + "reward_std": 0.2410811483860016, + "rewards/MultiModalAccuracyORM": 0.2833333440124989, + "step": 2330, + "train_speed(iter/s)": 0.032314 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.8, + "epoch": 0.9434343434343434, + "grad_norm": 1.710345471585661, + "kl": 0.020212457934394478, + "learning_rate": 2e-07, + "loss": 0.10661859512329101, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.23333334177732468, + "reward_std": 0.244369775056839, + "rewards/MultiModalAccuracyORM": 0.23333334177732468, + "step": 2335, + "train_speed(iter/s)": 0.032312 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.35, + "epoch": 0.9454545454545454, + "grad_norm": 1.207560090272438, + "kl": 0.020538910292088985, + "learning_rate": 2e-07, + "loss": 0.00968976616859436, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.11666666939854622, + "reward_std": 0.21149236261844634, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 2340, + "train_speed(iter/s)": 0.032319 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.7, + "epoch": 0.9474747474747475, + "grad_norm": 0.5661864643216962, + "kl": 0.02179541252553463, + "learning_rate": 2e-07, + "loss": 0.03451942503452301, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2833333410322666, + "reward_std": 0.3026406019926071, + "rewards/MultiModalAccuracyORM": 0.2833333410322666, + "step": 2345, + "train_speed(iter/s)": 0.032337 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.85, + "epoch": 0.9494949494949495, + "grad_norm": 0.6878755464268733, + "kl": 0.02421591989696026, + "learning_rate": 2e-07, + "loss": 0.05174432992935181, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.24166667088866234, + "reward_std": 0.279270276427269, + "rewards/MultiModalAccuracyORM": 0.24166667088866234, + "step": 2350, + "train_speed(iter/s)": 0.032341 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.45, + "epoch": 0.9515151515151515, + "grad_norm": 1.50264742942701, + "kl": 0.02172108683735132, + "learning_rate": 2e-07, + "loss": -0.019475644826889037, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3416666753590107, + "reward_std": 0.42271838188171384, + "rewards/MultiModalAccuracyORM": 0.3416666753590107, + "step": 2355, + "train_speed(iter/s)": 0.032343 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.65, + "epoch": 0.9535353535353536, + "grad_norm": 1.224629573838151, + "kl": 0.03754720762372017, + "learning_rate": 2e-07, + "loss": 0.08559540510177613, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.5083333402872086, + "reward_std": 0.2744703501462936, + "rewards/MultiModalAccuracyORM": 0.5083333402872086, + "step": 2360, + "train_speed(iter/s)": 0.032358 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.5, + "epoch": 0.9555555555555556, + "grad_norm": 1.4630602233812633, + "kl": 0.021476354077458383, + "learning_rate": 2e-07, + "loss": -0.08055483698844909, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4083333395421505, + "reward_std": 0.40558778643608095, + "rewards/MultiModalAccuracyORM": 0.4083333395421505, + "step": 2365, + "train_speed(iter/s)": 0.03237 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.1, + "epoch": 0.9575757575757575, + "grad_norm": 2.857198218490812, + "kl": 0.021784471347928047, + "learning_rate": 2e-07, + "loss": -0.02553858757019043, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.29166667982935907, + "reward_std": 0.35789157152175904, + "rewards/MultiModalAccuracyORM": 0.29166667982935907, + "step": 2370, + "train_speed(iter/s)": 0.03238 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.85, + "epoch": 0.9595959595959596, + "grad_norm": 1.6238009179768014, + "kl": 0.03373121190816164, + "learning_rate": 2e-07, + "loss": 0.0017300590872764588, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.27500000670552255, + "reward_std": 0.40231128334999083, + "rewards/MultiModalAccuracyORM": 0.27500000670552255, + "step": 2375, + "train_speed(iter/s)": 0.032396 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.95, + "epoch": 0.9616161616161616, + "grad_norm": 0.5732843040643582, + "kl": 0.020048757642507554, + "learning_rate": 2e-07, + "loss": 0.03029699921607971, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.45000001341104506, + "reward_std": 0.3471368789672852, + "rewards/MultiModalAccuracyORM": 0.45000001341104506, + "step": 2380, + "train_speed(iter/s)": 0.032405 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.35, + "epoch": 0.9636363636363636, + "grad_norm": 1.1293323918314548, + "kl": 0.024607629235833883, + "learning_rate": 2e-07, + "loss": 0.0067908987402915955, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.25000000223517416, + "reward_std": 0.3292373031377792, + "rewards/MultiModalAccuracyORM": 0.25000000223517416, + "step": 2385, + "train_speed(iter/s)": 0.032423 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.45, + "epoch": 0.9656565656565657, + "grad_norm": 1.039492421088282, + "kl": 0.025942530203610658, + "learning_rate": 2e-07, + "loss": 0.059793722629547116, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2750000089406967, + "reward_std": 0.3285214215517044, + "rewards/MultiModalAccuracyORM": 0.2750000089406967, + "step": 2390, + "train_speed(iter/s)": 0.032437 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.3, + "epoch": 0.9676767676767677, + "grad_norm": 1.2125676700182542, + "kl": 0.024488268233835698, + "learning_rate": 2e-07, + "loss": 0.05103383660316467, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.3750000074505806, + "reward_std": 0.39961439967155454, + "rewards/MultiModalAccuracyORM": 0.3750000074505806, + "step": 2395, + "train_speed(iter/s)": 0.032441 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.4, + "epoch": 0.9696969696969697, + "grad_norm": 0.09761626886432172, + "kl": 0.02428424544632435, + "learning_rate": 2e-07, + "loss": 0.00439504086971283, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1500000014901161, + "reward_std": 0.2260383188724518, + "rewards/MultiModalAccuracyORM": 0.1500000014901161, + "step": 2400, + "train_speed(iter/s)": 0.03246 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.25, + "epoch": 0.9717171717171718, + "grad_norm": 2.1810913319134038, + "kl": 0.03959659710526466, + "learning_rate": 2e-07, + "loss": 0.02262794375419617, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.26666666865348815, + "reward_std": 0.21753989458084105, + "rewards/MultiModalAccuracyORM": 0.26666666865348815, + "step": 2405, + "train_speed(iter/s)": 0.032482 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.15, + "epoch": 0.9737373737373738, + "grad_norm": 1.7428069378966462, + "kl": 0.02672185152769089, + "learning_rate": 2e-07, + "loss": -0.00538158118724823, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.13333333879709244, + "reward_std": 0.2323044866323471, + "rewards/MultiModalAccuracyORM": 0.13333333879709244, + "step": 2410, + "train_speed(iter/s)": 0.032487 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.5, + "epoch": 0.9757575757575757, + "grad_norm": 0.7544250223135323, + "kl": 0.027272729855030774, + "learning_rate": 2e-07, + "loss": 0.03573228120803833, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.40833334624767303, + "reward_std": 0.27523933053016664, + "rewards/MultiModalAccuracyORM": 0.40833334624767303, + "step": 2415, + "train_speed(iter/s)": 0.032504 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.15, + "epoch": 0.9777777777777777, + "grad_norm": 1.866177647075409, + "kl": 0.036007688101381066, + "learning_rate": 2e-07, + "loss": 0.003173720836639404, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.45000000596046447, + "reward_std": 0.3689336538314819, + "rewards/MultiModalAccuracyORM": 0.45000000596046447, + "step": 2420, + "train_speed(iter/s)": 0.032526 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.6, + "epoch": 0.9797979797979798, + "grad_norm": 1.2725371241295251, + "kl": 0.02483037244528532, + "learning_rate": 2e-07, + "loss": -0.025305929780006408, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.14166667237877845, + "reward_std": 0.21374863088130952, + "rewards/MultiModalAccuracyORM": 0.14166667237877845, + "step": 2425, + "train_speed(iter/s)": 0.032544 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.65, + "epoch": 0.9818181818181818, + "grad_norm": 0.5577365367508015, + "kl": 0.037827163096517326, + "learning_rate": 2e-07, + "loss": 0.09373842477798462, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.4333333417773247, + "reward_std": 0.38300099074840543, + "rewards/MultiModalAccuracyORM": 0.4333333417773247, + "step": 2430, + "train_speed(iter/s)": 0.032563 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.65, + "epoch": 0.9838383838383838, + "grad_norm": 2.1936674243123027, + "kl": 0.02673042882233858, + "learning_rate": 2e-07, + "loss": -0.004708817601203919, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.2712294369935989, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 2435, + "train_speed(iter/s)": 0.032575 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.1, + "epoch": 0.9858585858585859, + "grad_norm": 1.2081108167008505, + "kl": 0.03199390545487404, + "learning_rate": 2e-07, + "loss": 0.08453056812286378, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.17500000447034836, + "reward_std": 0.2323400765657425, + "rewards/MultiModalAccuracyORM": 0.17500000447034836, + "step": 2440, + "train_speed(iter/s)": 0.032589 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.6, + "epoch": 0.9878787878787879, + "grad_norm": 1.3157103877648637, + "kl": 0.033728963509202, + "learning_rate": 2e-07, + "loss": 0.025054675340652467, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.35000001192092894, + "reward_std": 0.323036128282547, + "rewards/MultiModalAccuracyORM": 0.35000001192092894, + "step": 2445, + "train_speed(iter/s)": 0.032609 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.85, + "epoch": 0.98989898989899, + "grad_norm": 0.07041901111619411, + "kl": 0.029867130145430566, + "learning_rate": 2e-07, + "loss": 0.008186718821525574, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.1250000037252903, + "reward_std": 0.21550226211547852, + "rewards/MultiModalAccuracyORM": 0.1250000037252903, + "step": 2450, + "train_speed(iter/s)": 0.032618 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.0, + "epoch": 0.9919191919191919, + "grad_norm": 1.672464403846531, + "kl": 0.032038337737321856, + "learning_rate": 2e-07, + "loss": -0.020856915414333342, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2916666708886623, + "reward_std": 0.28227151930332184, + "rewards/MultiModalAccuracyORM": 0.2916666708886623, + "step": 2455, + "train_speed(iter/s)": 0.032636 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.5, + "epoch": 0.9939393939393939, + "grad_norm": 1.1129720769248492, + "kl": 0.02438914030790329, + "learning_rate": 2e-07, + "loss": 0.06876440644264221, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.450000012665987, + "reward_std": 0.33704385757446287, + "rewards/MultiModalAccuracyORM": 0.450000012665987, + "step": 2460, + "train_speed(iter/s)": 0.032645 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.7, + "epoch": 0.9959595959595959, + "grad_norm": 1.6345111959053422, + "kl": 0.03214533980935812, + "learning_rate": 2e-07, + "loss": 0.008031123876571655, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.5250000104308128, + "reward_std": 0.3142238765954971, + "rewards/MultiModalAccuracyORM": 0.5250000104308128, + "step": 2465, + "train_speed(iter/s)": 0.032661 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.95, + "epoch": 0.997979797979798, + "grad_norm": 1.663569908963529, + "kl": 0.0534836488775909, + "learning_rate": 2e-07, + "loss": 0.05174955129623413, + "memory(GiB)": 67.41, + "response_clip_ratio": 0.0, + "reward": 0.2583333417773247, + "reward_std": 0.33303395807743075, + "rewards/MultiModalAccuracyORM": 0.2583333417773247, + "step": 2470, + "train_speed(iter/s)": 0.032676 + }, + { + "epoch": 1.0, + "grad_norm": 1.93613237990153, + "learning_rate": 2e-07, + "loss": -0.011667436361312867, + "memory(GiB)": 67.41, + "step": 2475, + "train_speed(iter/s)": 0.03268 + }, + { + "epoch": 1.0, + "eval_clip_ratio": 0.0, + "eval_completion_length": 257.6600067901611, + "eval_kl": 0.04240348171442747, + "eval_loss": -0.001080758636817336, + "eval_response_clip_ratio": 0.003333333432674408, + "eval_reward": 0.32666667327284815, + "eval_reward_std": 0.29233356595039367, + "eval_rewards/MultiModalAccuracyORM": 0.32666667327284815, + "eval_runtime": 586.4987, + "eval_samples_per_second": 0.085, + "eval_steps_per_second": 0.009, + "step": 2475 + } + ], + "logging_steps": 5, + "max_steps": 2475, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}