{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9987642455032266, "eval_steps": 500, "global_step": 2730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010984484415762735, "grad_norm": 0.13173329830169678, "learning_rate": 1.0989010989010988e-06, "loss": 0.8751, "step": 1 }, { "epoch": 0.002196896883152547, "grad_norm": 0.19401921331882477, "learning_rate": 2.1978021978021976e-06, "loss": 1.3488, "step": 2 }, { "epoch": 0.0032953453247288205, "grad_norm": 0.142131969332695, "learning_rate": 3.2967032967032968e-06, "loss": 0.8371, "step": 3 }, { "epoch": 0.004393793766305094, "grad_norm": 0.1124999076128006, "learning_rate": 4.395604395604395e-06, "loss": 1.0039, "step": 4 }, { "epoch": 0.005492242207881368, "grad_norm": 0.20683947205543518, "learning_rate": 5.494505494505494e-06, "loss": 1.4423, "step": 5 }, { "epoch": 0.006590690649457641, "grad_norm": 0.2007640153169632, "learning_rate": 6.5934065934065935e-06, "loss": 0.9797, "step": 6 }, { "epoch": 0.007689139091033915, "grad_norm": 0.1362670361995697, "learning_rate": 7.692307692307692e-06, "loss": 1.0443, "step": 7 }, { "epoch": 0.008787587532610188, "grad_norm": 0.21512511372566223, "learning_rate": 8.79120879120879e-06, "loss": 1.2888, "step": 8 }, { "epoch": 0.009886035974186462, "grad_norm": 0.13403186202049255, "learning_rate": 9.89010989010989e-06, "loss": 0.9637, "step": 9 }, { "epoch": 0.010984484415762736, "grad_norm": 0.16911157965660095, "learning_rate": 1.0989010989010989e-05, "loss": 0.8824, "step": 10 }, { "epoch": 0.012082932857339008, "grad_norm": 0.19280359148979187, "learning_rate": 1.2087912087912087e-05, "loss": 0.9843, "step": 11 }, { "epoch": 0.013181381298915282, "grad_norm": 0.15720519423484802, "learning_rate": 1.3186813186813187e-05, "loss": 0.9769, "step": 12 }, { "epoch": 0.014279829740491556, "grad_norm": 0.18622402846813202, "learning_rate": 1.4285714285714284e-05, "loss": 0.903, "step": 13 }, { "epoch": 0.01537827818206783, "grad_norm": 0.1491895169019699, "learning_rate": 1.5384615384615384e-05, "loss": 1.065, "step": 14 }, { "epoch": 0.016476726623644102, "grad_norm": 0.16883142292499542, "learning_rate": 1.6483516483516482e-05, "loss": 0.9916, "step": 15 }, { "epoch": 0.017575175065220376, "grad_norm": 0.155453160405159, "learning_rate": 1.758241758241758e-05, "loss": 1.1048, "step": 16 }, { "epoch": 0.01867362350679665, "grad_norm": 0.12869666516780853, "learning_rate": 1.868131868131868e-05, "loss": 0.9355, "step": 17 }, { "epoch": 0.019772071948372924, "grad_norm": 0.18860433995723724, "learning_rate": 1.978021978021978e-05, "loss": 1.1779, "step": 18 }, { "epoch": 0.020870520389949198, "grad_norm": 0.30738529562950134, "learning_rate": 2.087912087912088e-05, "loss": 0.905, "step": 19 }, { "epoch": 0.021968968831525472, "grad_norm": 0.30248674750328064, "learning_rate": 2.1978021978021977e-05, "loss": 1.0749, "step": 20 }, { "epoch": 0.023067417273101742, "grad_norm": 0.17005079984664917, "learning_rate": 2.3076923076923076e-05, "loss": 1.0141, "step": 21 }, { "epoch": 0.024165865714678016, "grad_norm": 0.5497377514839172, "learning_rate": 2.4175824175824174e-05, "loss": 0.804, "step": 22 }, { "epoch": 0.02526431415625429, "grad_norm": 0.23464925587177277, "learning_rate": 2.5274725274725276e-05, "loss": 1.0592, "step": 23 }, { "epoch": 0.026362762597830564, "grad_norm": 0.2906591594219208, "learning_rate": 2.6373626373626374e-05, "loss": 1.4096, "step": 24 }, { "epoch": 0.027461211039406838, "grad_norm": 0.14552968740463257, "learning_rate": 2.747252747252747e-05, "loss": 0.8827, "step": 25 }, { "epoch": 0.028559659480983112, "grad_norm": 0.26139914989471436, "learning_rate": 2.8571428571428567e-05, "loss": 1.1081, "step": 26 }, { "epoch": 0.029658107922559386, "grad_norm": 0.16122505068778992, "learning_rate": 2.9670329670329666e-05, "loss": 0.8967, "step": 27 }, { "epoch": 0.03075655636413566, "grad_norm": 0.19174647331237793, "learning_rate": 3.076923076923077e-05, "loss": 0.7527, "step": 28 }, { "epoch": 0.031855004805711934, "grad_norm": 0.24506032466888428, "learning_rate": 3.1868131868131866e-05, "loss": 1.0981, "step": 29 }, { "epoch": 0.032953453247288204, "grad_norm": 0.18928349018096924, "learning_rate": 3.2967032967032964e-05, "loss": 1.2955, "step": 30 }, { "epoch": 0.03405190168886448, "grad_norm": 0.20482106506824493, "learning_rate": 3.406593406593406e-05, "loss": 0.886, "step": 31 }, { "epoch": 0.03515035013044075, "grad_norm": 0.17304010689258575, "learning_rate": 3.516483516483516e-05, "loss": 1.0062, "step": 32 }, { "epoch": 0.03624879857201702, "grad_norm": 0.17006444931030273, "learning_rate": 3.626373626373626e-05, "loss": 0.76, "step": 33 }, { "epoch": 0.0373472470135933, "grad_norm": 0.16570955514907837, "learning_rate": 3.736263736263736e-05, "loss": 0.7512, "step": 34 }, { "epoch": 0.03844569545516957, "grad_norm": 0.4470347464084625, "learning_rate": 3.8461538461538456e-05, "loss": 1.051, "step": 35 }, { "epoch": 0.03954414389674585, "grad_norm": 0.3013080060482025, "learning_rate": 3.956043956043956e-05, "loss": 1.1269, "step": 36 }, { "epoch": 0.04064259233832212, "grad_norm": 0.33114469051361084, "learning_rate": 4.065934065934065e-05, "loss": 1.046, "step": 37 }, { "epoch": 0.041741040779898396, "grad_norm": 0.3496829867362976, "learning_rate": 4.175824175824176e-05, "loss": 0.9139, "step": 38 }, { "epoch": 0.042839489221474666, "grad_norm": 0.36173877120018005, "learning_rate": 4.285714285714285e-05, "loss": 1.16, "step": 39 }, { "epoch": 0.043937937663050944, "grad_norm": 0.23047995567321777, "learning_rate": 4.3956043956043955e-05, "loss": 0.8623, "step": 40 }, { "epoch": 0.045036386104627214, "grad_norm": 0.33733946084976196, "learning_rate": 4.5054945054945046e-05, "loss": 0.873, "step": 41 }, { "epoch": 0.046134834546203485, "grad_norm": 0.43975624442100525, "learning_rate": 4.615384615384615e-05, "loss": 0.9374, "step": 42 }, { "epoch": 0.04723328298777976, "grad_norm": 0.5429202318191528, "learning_rate": 4.725274725274725e-05, "loss": 1.0699, "step": 43 }, { "epoch": 0.04833173142935603, "grad_norm": 0.39317595958709717, "learning_rate": 4.835164835164835e-05, "loss": 0.7719, "step": 44 }, { "epoch": 0.04943017987093231, "grad_norm": 0.41328710317611694, "learning_rate": 4.9450549450549446e-05, "loss": 1.112, "step": 45 }, { "epoch": 0.05052862831250858, "grad_norm": 0.5977774858474731, "learning_rate": 5.054945054945055e-05, "loss": 0.9408, "step": 46 }, { "epoch": 0.05162707675408486, "grad_norm": 0.6984797716140747, "learning_rate": 5.164835164835164e-05, "loss": 0.9766, "step": 47 }, { "epoch": 0.05272552519566113, "grad_norm": 0.5161548256874084, "learning_rate": 5.274725274725275e-05, "loss": 1.3705, "step": 48 }, { "epoch": 0.0538239736372374, "grad_norm": 0.5750108361244202, "learning_rate": 5.384615384615384e-05, "loss": 0.9492, "step": 49 }, { "epoch": 0.054922422078813676, "grad_norm": 0.7861920595169067, "learning_rate": 5.494505494505494e-05, "loss": 1.1495, "step": 50 }, { "epoch": 0.05602087052038995, "grad_norm": 0.5992287993431091, "learning_rate": 5.6043956043956037e-05, "loss": 1.2818, "step": 51 }, { "epoch": 0.057119318961966224, "grad_norm": 0.5470016598701477, "learning_rate": 5.7142857142857135e-05, "loss": 1.0385, "step": 52 }, { "epoch": 0.058217767403542495, "grad_norm": 0.7035269141197205, "learning_rate": 5.824175824175824e-05, "loss": 0.785, "step": 53 }, { "epoch": 0.05931621584511877, "grad_norm": 0.5253639817237854, "learning_rate": 5.934065934065933e-05, "loss": 0.6092, "step": 54 }, { "epoch": 0.06041466428669504, "grad_norm": 0.5233064293861389, "learning_rate": 6.043956043956044e-05, "loss": 0.7853, "step": 55 }, { "epoch": 0.06151311272827132, "grad_norm": 0.4508589804172516, "learning_rate": 6.153846153846154e-05, "loss": 0.5737, "step": 56 }, { "epoch": 0.06261156116984759, "grad_norm": 1.0521594285964966, "learning_rate": 6.263736263736263e-05, "loss": 1.0132, "step": 57 }, { "epoch": 0.06371000961142387, "grad_norm": 0.3572557866573334, "learning_rate": 6.373626373626373e-05, "loss": 0.655, "step": 58 }, { "epoch": 0.06480845805300013, "grad_norm": 0.600371241569519, "learning_rate": 6.483516483516483e-05, "loss": 0.8897, "step": 59 }, { "epoch": 0.06590690649457641, "grad_norm": 0.6430579423904419, "learning_rate": 6.593406593406593e-05, "loss": 0.8058, "step": 60 }, { "epoch": 0.06700535493615269, "grad_norm": 0.5309410095214844, "learning_rate": 6.703296703296703e-05, "loss": 0.7312, "step": 61 }, { "epoch": 0.06810380337772896, "grad_norm": 0.46225860714912415, "learning_rate": 6.813186813186813e-05, "loss": 0.8607, "step": 62 }, { "epoch": 0.06920225181930523, "grad_norm": 0.8889493346214294, "learning_rate": 6.923076923076922e-05, "loss": 0.7791, "step": 63 }, { "epoch": 0.0703007002608815, "grad_norm": 0.5721575617790222, "learning_rate": 7.032967032967032e-05, "loss": 0.9426, "step": 64 }, { "epoch": 0.07139914870245778, "grad_norm": 0.8355056047439575, "learning_rate": 7.142857142857142e-05, "loss": 0.621, "step": 65 }, { "epoch": 0.07249759714403405, "grad_norm": 1.3048707246780396, "learning_rate": 7.252747252747252e-05, "loss": 0.8869, "step": 66 }, { "epoch": 0.07359604558561032, "grad_norm": 0.5817797183990479, "learning_rate": 7.362637362637362e-05, "loss": 0.8385, "step": 67 }, { "epoch": 0.0746944940271866, "grad_norm": 1.2051454782485962, "learning_rate": 7.472527472527472e-05, "loss": 0.7566, "step": 68 }, { "epoch": 0.07579294246876288, "grad_norm": 0.8565987944602966, "learning_rate": 7.582417582417581e-05, "loss": 0.8374, "step": 69 }, { "epoch": 0.07689139091033914, "grad_norm": 0.7503894567489624, "learning_rate": 7.692307692307691e-05, "loss": 0.6749, "step": 70 }, { "epoch": 0.07798983935191542, "grad_norm": 0.6298589706420898, "learning_rate": 7.802197802197802e-05, "loss": 0.9096, "step": 71 }, { "epoch": 0.0790882877934917, "grad_norm": 0.8327789306640625, "learning_rate": 7.912087912087912e-05, "loss": 0.9836, "step": 72 }, { "epoch": 0.08018673623506796, "grad_norm": 1.0001461505889893, "learning_rate": 8.021978021978021e-05, "loss": 0.6917, "step": 73 }, { "epoch": 0.08128518467664424, "grad_norm": 0.8373435735702515, "learning_rate": 8.13186813186813e-05, "loss": 0.7703, "step": 74 }, { "epoch": 0.08238363311822051, "grad_norm": 0.9785758256912231, "learning_rate": 8.241758241758242e-05, "loss": 0.8004, "step": 75 }, { "epoch": 0.08348208155979679, "grad_norm": 0.8900540471076965, "learning_rate": 8.351648351648352e-05, "loss": 0.8238, "step": 76 }, { "epoch": 0.08458053000137306, "grad_norm": 0.7411159873008728, "learning_rate": 8.46153846153846e-05, "loss": 1.0364, "step": 77 }, { "epoch": 0.08567897844294933, "grad_norm": 0.4975040555000305, "learning_rate": 8.57142857142857e-05, "loss": 0.4814, "step": 78 }, { "epoch": 0.08677742688452561, "grad_norm": 0.6698398590087891, "learning_rate": 8.681318681318681e-05, "loss": 0.6828, "step": 79 }, { "epoch": 0.08787587532610189, "grad_norm": 0.5883696675300598, "learning_rate": 8.791208791208791e-05, "loss": 0.92, "step": 80 }, { "epoch": 0.08897432376767815, "grad_norm": 0.9050906896591187, "learning_rate": 8.901098901098901e-05, "loss": 0.7229, "step": 81 }, { "epoch": 0.09007277220925443, "grad_norm": 0.5996706485748291, "learning_rate": 9.010989010989009e-05, "loss": 0.699, "step": 82 }, { "epoch": 0.0911712206508307, "grad_norm": 2.0782630443573, "learning_rate": 9.120879120879119e-05, "loss": 1.2118, "step": 83 }, { "epoch": 0.09226966909240697, "grad_norm": 0.759730875492096, "learning_rate": 9.23076923076923e-05, "loss": 0.6397, "step": 84 }, { "epoch": 0.09336811753398325, "grad_norm": 1.1138097047805786, "learning_rate": 9.34065934065934e-05, "loss": 0.8973, "step": 85 }, { "epoch": 0.09446656597555952, "grad_norm": 0.9852680563926697, "learning_rate": 9.45054945054945e-05, "loss": 1.0733, "step": 86 }, { "epoch": 0.0955650144171358, "grad_norm": 0.8435002565383911, "learning_rate": 9.560439560439558e-05, "loss": 0.8977, "step": 87 }, { "epoch": 0.09666346285871207, "grad_norm": 1.3031998872756958, "learning_rate": 9.67032967032967e-05, "loss": 0.9852, "step": 88 }, { "epoch": 0.09776191130028834, "grad_norm": 0.6343463063240051, "learning_rate": 9.78021978021978e-05, "loss": 0.6147, "step": 89 }, { "epoch": 0.09886035974186462, "grad_norm": 0.7061794996261597, "learning_rate": 9.890109890109889e-05, "loss": 0.7437, "step": 90 }, { "epoch": 0.09995880818344088, "grad_norm": 1.2231422662734985, "learning_rate": 9.999999999999999e-05, "loss": 0.7944, "step": 91 }, { "epoch": 0.10105725662501716, "grad_norm": 0.7199704647064209, "learning_rate": 0.0001010989010989011, "loss": 0.7355, "step": 92 }, { "epoch": 0.10215570506659344, "grad_norm": 1.2740516662597656, "learning_rate": 0.00010219780219780219, "loss": 0.7622, "step": 93 }, { "epoch": 0.10325415350816972, "grad_norm": 0.7762659788131714, "learning_rate": 0.00010329670329670329, "loss": 0.7074, "step": 94 }, { "epoch": 0.10435260194974598, "grad_norm": 0.6618936061859131, "learning_rate": 0.00010439560439560438, "loss": 0.7667, "step": 95 }, { "epoch": 0.10545105039132226, "grad_norm": 0.7244533896446228, "learning_rate": 0.0001054945054945055, "loss": 0.6451, "step": 96 }, { "epoch": 0.10654949883289853, "grad_norm": 0.6391953229904175, "learning_rate": 0.0001065934065934066, "loss": 0.5637, "step": 97 }, { "epoch": 0.1076479472744748, "grad_norm": 0.6992442607879639, "learning_rate": 0.00010769230769230768, "loss": 0.7112, "step": 98 }, { "epoch": 0.10874639571605108, "grad_norm": 1.0820791721343994, "learning_rate": 0.00010879120879120878, "loss": 0.9199, "step": 99 }, { "epoch": 0.10984484415762735, "grad_norm": 0.6012185215950012, "learning_rate": 0.00010989010989010988, "loss": 0.5574, "step": 100 }, { "epoch": 0.11094329259920363, "grad_norm": 0.822455644607544, "learning_rate": 0.00011098901098901099, "loss": 0.5185, "step": 101 }, { "epoch": 0.1120417410407799, "grad_norm": 0.9417555332183838, "learning_rate": 0.00011208791208791207, "loss": 0.6883, "step": 102 }, { "epoch": 0.11314018948235617, "grad_norm": 1.0258208513259888, "learning_rate": 0.00011318681318681317, "loss": 0.7588, "step": 103 }, { "epoch": 0.11423863792393245, "grad_norm": 1.904179573059082, "learning_rate": 0.00011428571428571427, "loss": 0.7425, "step": 104 }, { "epoch": 0.11533708636550873, "grad_norm": 1.5453238487243652, "learning_rate": 0.00011538461538461538, "loss": 0.658, "step": 105 }, { "epoch": 0.11643553480708499, "grad_norm": 0.8801619410514832, "learning_rate": 0.00011648351648351648, "loss": 0.8432, "step": 106 }, { "epoch": 0.11753398324866127, "grad_norm": 0.8567579388618469, "learning_rate": 0.00011758241758241756, "loss": 0.5904, "step": 107 }, { "epoch": 0.11863243169023754, "grad_norm": 0.9351131319999695, "learning_rate": 0.00011868131868131866, "loss": 0.7228, "step": 108 }, { "epoch": 0.11973088013181381, "grad_norm": 0.8817545175552368, "learning_rate": 0.00011978021978021978, "loss": 0.7853, "step": 109 }, { "epoch": 0.12082932857339009, "grad_norm": 1.0484094619750977, "learning_rate": 0.00012087912087912087, "loss": 0.7049, "step": 110 }, { "epoch": 0.12192777701496636, "grad_norm": 1.80658757686615, "learning_rate": 0.00012197802197802197, "loss": 0.669, "step": 111 }, { "epoch": 0.12302622545654264, "grad_norm": 1.5311473608016968, "learning_rate": 0.00012307692307692307, "loss": 0.8342, "step": 112 }, { "epoch": 0.1241246738981189, "grad_norm": 0.8968105912208557, "learning_rate": 0.00012417582417582416, "loss": 0.7199, "step": 113 }, { "epoch": 0.12522312233969518, "grad_norm": 0.6149659156799316, "learning_rate": 0.00012527472527472527, "loss": 0.4961, "step": 114 }, { "epoch": 0.12632157078127146, "grad_norm": 8.04592227935791, "learning_rate": 0.00012637362637362635, "loss": 0.7515, "step": 115 }, { "epoch": 0.12742001922284774, "grad_norm": 0.7797659039497375, "learning_rate": 0.00012747252747252746, "loss": 0.7281, "step": 116 }, { "epoch": 0.128518467664424, "grad_norm": 0.6414046883583069, "learning_rate": 0.00012857142857142855, "loss": 0.6655, "step": 117 }, { "epoch": 0.12961691610600026, "grad_norm": 4.678529262542725, "learning_rate": 0.00012967032967032966, "loss": 0.9165, "step": 118 }, { "epoch": 0.13071536454757654, "grad_norm": 0.8540724515914917, "learning_rate": 0.00013076923076923077, "loss": 0.7064, "step": 119 }, { "epoch": 0.13181381298915282, "grad_norm": 1.057844638824463, "learning_rate": 0.00013186813186813186, "loss": 0.6617, "step": 120 }, { "epoch": 0.1329122614307291, "grad_norm": 0.8429140448570251, "learning_rate": 0.00013296703296703294, "loss": 0.8156, "step": 121 }, { "epoch": 0.13401070987230537, "grad_norm": 0.9944230914115906, "learning_rate": 0.00013406593406593405, "loss": 0.5851, "step": 122 }, { "epoch": 0.13510915831388165, "grad_norm": 0.6582810878753662, "learning_rate": 0.00013516483516483517, "loss": 0.5819, "step": 123 }, { "epoch": 0.13620760675545793, "grad_norm": 1.3106951713562012, "learning_rate": 0.00013626373626373625, "loss": 0.7598, "step": 124 }, { "epoch": 0.13730605519703418, "grad_norm": 1.0464080572128296, "learning_rate": 0.00013736263736263734, "loss": 0.7241, "step": 125 }, { "epoch": 0.13840450363861045, "grad_norm": 0.8519262075424194, "learning_rate": 0.00013846153846153845, "loss": 0.7001, "step": 126 }, { "epoch": 0.13950295208018673, "grad_norm": 1.2764228582382202, "learning_rate": 0.00013956043956043956, "loss": 0.7152, "step": 127 }, { "epoch": 0.140601400521763, "grad_norm": 1.157472014427185, "learning_rate": 0.00014065934065934064, "loss": 0.697, "step": 128 }, { "epoch": 0.1416998489633393, "grad_norm": 0.7153847813606262, "learning_rate": 0.00014175824175824173, "loss": 0.6897, "step": 129 }, { "epoch": 0.14279829740491556, "grad_norm": 0.7254152297973633, "learning_rate": 0.00014285714285714284, "loss": 0.5263, "step": 130 }, { "epoch": 0.14389674584649184, "grad_norm": 1.3370522260665894, "learning_rate": 0.00014395604395604395, "loss": 0.7587, "step": 131 }, { "epoch": 0.1449951942880681, "grad_norm": 1.092029333114624, "learning_rate": 0.00014505494505494504, "loss": 0.8674, "step": 132 }, { "epoch": 0.14609364272964437, "grad_norm": 0.6123655438423157, "learning_rate": 0.00014615384615384615, "loss": 0.7163, "step": 133 }, { "epoch": 0.14719209117122065, "grad_norm": 0.8476639986038208, "learning_rate": 0.00014725274725274723, "loss": 0.7241, "step": 134 }, { "epoch": 0.14829053961279692, "grad_norm": 0.9986979961395264, "learning_rate": 0.00014835164835164835, "loss": 0.6229, "step": 135 }, { "epoch": 0.1493889880543732, "grad_norm": 0.8208728432655334, "learning_rate": 0.00014945054945054943, "loss": 0.5441, "step": 136 }, { "epoch": 0.15048743649594948, "grad_norm": 0.742091953754425, "learning_rate": 0.00015054945054945054, "loss": 0.6047, "step": 137 }, { "epoch": 0.15158588493752576, "grad_norm": 1.6566306352615356, "learning_rate": 0.00015164835164835163, "loss": 0.6381, "step": 138 }, { "epoch": 0.152684333379102, "grad_norm": 0.7735741138458252, "learning_rate": 0.0001527472527472527, "loss": 0.5842, "step": 139 }, { "epoch": 0.15378278182067828, "grad_norm": 0.7116795778274536, "learning_rate": 0.00015384615384615382, "loss": 0.7117, "step": 140 }, { "epoch": 0.15488123026225456, "grad_norm": 0.6912885904312134, "learning_rate": 0.00015494505494505494, "loss": 0.763, "step": 141 }, { "epoch": 0.15597967870383084, "grad_norm": 1.0789505243301392, "learning_rate": 0.00015604395604395605, "loss": 0.5534, "step": 142 }, { "epoch": 0.15707812714540711, "grad_norm": 1.0304033756256104, "learning_rate": 0.00015714285714285713, "loss": 0.4961, "step": 143 }, { "epoch": 0.1581765755869834, "grad_norm": 1.0216940641403198, "learning_rate": 0.00015824175824175824, "loss": 0.8167, "step": 144 }, { "epoch": 0.15927502402855967, "grad_norm": 0.7767283916473389, "learning_rate": 0.00015934065934065933, "loss": 0.649, "step": 145 }, { "epoch": 0.16037347247013592, "grad_norm": 0.6125204563140869, "learning_rate": 0.00016043956043956041, "loss": 0.6596, "step": 146 }, { "epoch": 0.1614719209117122, "grad_norm": 2.113314390182495, "learning_rate": 0.00016153846153846153, "loss": 0.6825, "step": 147 }, { "epoch": 0.16257036935328847, "grad_norm": 1.3892889022827148, "learning_rate": 0.0001626373626373626, "loss": 0.5162, "step": 148 }, { "epoch": 0.16366881779486475, "grad_norm": 1.2544710636138916, "learning_rate": 0.0001637362637362637, "loss": 0.5992, "step": 149 }, { "epoch": 0.16476726623644103, "grad_norm": 1.2952786684036255, "learning_rate": 0.00016483516483516484, "loss": 0.5968, "step": 150 }, { "epoch": 0.1658657146780173, "grad_norm": 0.9910382628440857, "learning_rate": 0.00016593406593406592, "loss": 0.6138, "step": 151 }, { "epoch": 0.16696416311959358, "grad_norm": 0.7291635870933533, "learning_rate": 0.00016703296703296703, "loss": 0.8957, "step": 152 }, { "epoch": 0.16806261156116986, "grad_norm": 0.7290105819702148, "learning_rate": 0.00016813186813186812, "loss": 0.4864, "step": 153 }, { "epoch": 0.1691610600027461, "grad_norm": 1.1888444423675537, "learning_rate": 0.0001692307692307692, "loss": 0.913, "step": 154 }, { "epoch": 0.1702595084443224, "grad_norm": 0.8183659315109253, "learning_rate": 0.0001703296703296703, "loss": 0.6405, "step": 155 }, { "epoch": 0.17135795688589867, "grad_norm": 0.8549530506134033, "learning_rate": 0.0001714285714285714, "loss": 0.7019, "step": 156 }, { "epoch": 0.17245640532747494, "grad_norm": 0.5960697531700134, "learning_rate": 0.0001725274725274725, "loss": 0.6728, "step": 157 }, { "epoch": 0.17355485376905122, "grad_norm": 0.6802973747253418, "learning_rate": 0.00017362637362637362, "loss": 0.6462, "step": 158 }, { "epoch": 0.1746533022106275, "grad_norm": 0.5056049823760986, "learning_rate": 0.00017472527472527473, "loss": 0.5155, "step": 159 }, { "epoch": 0.17575175065220378, "grad_norm": 0.8181887865066528, "learning_rate": 0.00017582417582417582, "loss": 0.6631, "step": 160 }, { "epoch": 0.17685019909378003, "grad_norm": 0.5748574137687683, "learning_rate": 0.0001769230769230769, "loss": 0.5807, "step": 161 }, { "epoch": 0.1779486475353563, "grad_norm": 0.8585043549537659, "learning_rate": 0.00017802197802197802, "loss": 0.5412, "step": 162 }, { "epoch": 0.17904709597693258, "grad_norm": 0.8763203620910645, "learning_rate": 0.0001791208791208791, "loss": 1.0859, "step": 163 }, { "epoch": 0.18014554441850886, "grad_norm": 0.7327267527580261, "learning_rate": 0.00018021978021978018, "loss": 0.8034, "step": 164 }, { "epoch": 0.18124399286008513, "grad_norm": 0.6813991665840149, "learning_rate": 0.0001813186813186813, "loss": 0.9236, "step": 165 }, { "epoch": 0.1823424413016614, "grad_norm": 2.9234185218811035, "learning_rate": 0.00018241758241758238, "loss": 0.9148, "step": 166 }, { "epoch": 0.1834408897432377, "grad_norm": 0.8117207884788513, "learning_rate": 0.00018351648351648352, "loss": 1.0514, "step": 167 }, { "epoch": 0.18453933818481394, "grad_norm": 0.6485300064086914, "learning_rate": 0.0001846153846153846, "loss": 0.4764, "step": 168 }, { "epoch": 0.18563778662639022, "grad_norm": 0.43059054017066956, "learning_rate": 0.00018571428571428572, "loss": 0.6289, "step": 169 }, { "epoch": 0.1867362350679665, "grad_norm": 1.007095456123352, "learning_rate": 0.0001868131868131868, "loss": 0.5889, "step": 170 }, { "epoch": 0.18783468350954277, "grad_norm": 1.6733218431472778, "learning_rate": 0.0001879120879120879, "loss": 0.8036, "step": 171 }, { "epoch": 0.18893313195111905, "grad_norm": 0.7533760666847229, "learning_rate": 0.000189010989010989, "loss": 0.7282, "step": 172 }, { "epoch": 0.19003158039269533, "grad_norm": 0.45892444252967834, "learning_rate": 0.00019010989010989008, "loss": 0.6273, "step": 173 }, { "epoch": 0.1911300288342716, "grad_norm": 0.54690021276474, "learning_rate": 0.00019120879120879117, "loss": 0.669, "step": 174 }, { "epoch": 0.19222847727584785, "grad_norm": 0.7361836433410645, "learning_rate": 0.0001923076923076923, "loss": 0.8945, "step": 175 }, { "epoch": 0.19332692571742413, "grad_norm": 0.5876324772834778, "learning_rate": 0.0001934065934065934, "loss": 0.7557, "step": 176 }, { "epoch": 0.1944253741590004, "grad_norm": 0.7753897309303284, "learning_rate": 0.0001945054945054945, "loss": 0.7904, "step": 177 }, { "epoch": 0.19552382260057669, "grad_norm": 0.6244968771934509, "learning_rate": 0.0001956043956043956, "loss": 0.7617, "step": 178 }, { "epoch": 0.19662227104215296, "grad_norm": 0.6300948262214661, "learning_rate": 0.00019670329670329667, "loss": 0.5884, "step": 179 }, { "epoch": 0.19772071948372924, "grad_norm": 0.5845354795455933, "learning_rate": 0.00019780219780219779, "loss": 0.8034, "step": 180 }, { "epoch": 0.19881916792530552, "grad_norm": 0.5231277942657471, "learning_rate": 0.00019890109890109887, "loss": 0.5302, "step": 181 }, { "epoch": 0.19991761636688177, "grad_norm": 0.8393481969833374, "learning_rate": 0.00019999999999999998, "loss": 0.6376, "step": 182 }, { "epoch": 0.20101606480845804, "grad_norm": 0.5777038335800171, "learning_rate": 0.00020109890109890107, "loss": 0.5777, "step": 183 }, { "epoch": 0.20211451325003432, "grad_norm": 0.7751956582069397, "learning_rate": 0.0002021978021978022, "loss": 0.8368, "step": 184 }, { "epoch": 0.2032129616916106, "grad_norm": 1.5582187175750732, "learning_rate": 0.0002032967032967033, "loss": 0.5087, "step": 185 }, { "epoch": 0.20431141013318688, "grad_norm": 0.8304231762886047, "learning_rate": 0.00020439560439560438, "loss": 0.5512, "step": 186 }, { "epoch": 0.20540985857476315, "grad_norm": 0.8545000553131104, "learning_rate": 0.0002054945054945055, "loss": 1.2533, "step": 187 }, { "epoch": 0.20650830701633943, "grad_norm": 0.4891647696495056, "learning_rate": 0.00020659340659340657, "loss": 0.5738, "step": 188 }, { "epoch": 0.20760675545791568, "grad_norm": 0.7159665822982788, "learning_rate": 0.00020769230769230766, "loss": 0.9266, "step": 189 }, { "epoch": 0.20870520389949196, "grad_norm": 0.5053237080574036, "learning_rate": 0.00020879120879120877, "loss": 0.4574, "step": 190 }, { "epoch": 0.20980365234106824, "grad_norm": 0.728336751461029, "learning_rate": 0.00020989010989010985, "loss": 0.6871, "step": 191 }, { "epoch": 0.2109021007826445, "grad_norm": 0.8593311309814453, "learning_rate": 0.000210989010989011, "loss": 0.6788, "step": 192 }, { "epoch": 0.2120005492242208, "grad_norm": 1.247111201286316, "learning_rate": 0.00021208791208791208, "loss": 0.5428, "step": 193 }, { "epoch": 0.21309899766579707, "grad_norm": 0.6636946201324463, "learning_rate": 0.0002131868131868132, "loss": 0.7935, "step": 194 }, { "epoch": 0.21419744610737335, "grad_norm": 0.5811622738838196, "learning_rate": 0.00021428571428571427, "loss": 0.4322, "step": 195 }, { "epoch": 0.2152958945489496, "grad_norm": 0.5329126715660095, "learning_rate": 0.00021538461538461536, "loss": 0.7037, "step": 196 }, { "epoch": 0.21639434299052587, "grad_norm": 1.730969786643982, "learning_rate": 0.00021648351648351647, "loss": 1.0315, "step": 197 }, { "epoch": 0.21749279143210215, "grad_norm": 0.5242175459861755, "learning_rate": 0.00021758241758241756, "loss": 0.9285, "step": 198 }, { "epoch": 0.21859123987367843, "grad_norm": 0.4745596945285797, "learning_rate": 0.00021868131868131864, "loss": 0.5414, "step": 199 }, { "epoch": 0.2196896883152547, "grad_norm": 0.8693228363990784, "learning_rate": 0.00021978021978021975, "loss": 0.4576, "step": 200 }, { "epoch": 0.22078813675683098, "grad_norm": 0.7073357105255127, "learning_rate": 0.00022087912087912086, "loss": 0.778, "step": 201 }, { "epoch": 0.22188658519840726, "grad_norm": 0.535009503364563, "learning_rate": 0.00022197802197802198, "loss": 0.7734, "step": 202 }, { "epoch": 0.2229850336399835, "grad_norm": 0.5862578749656677, "learning_rate": 0.00022307692307692306, "loss": 0.8612, "step": 203 }, { "epoch": 0.2240834820815598, "grad_norm": 0.5167233943939209, "learning_rate": 0.00022417582417582415, "loss": 0.6122, "step": 204 }, { "epoch": 0.22518193052313606, "grad_norm": 0.8982027769088745, "learning_rate": 0.00022527472527472526, "loss": 0.8905, "step": 205 }, { "epoch": 0.22628037896471234, "grad_norm": 0.7311340570449829, "learning_rate": 0.00022637362637362634, "loss": 1.0151, "step": 206 }, { "epoch": 0.22737882740628862, "grad_norm": 0.45674124360084534, "learning_rate": 0.00022747252747252745, "loss": 0.7056, "step": 207 }, { "epoch": 0.2284772758478649, "grad_norm": 0.6916844844818115, "learning_rate": 0.00022857142857142854, "loss": 0.5977, "step": 208 }, { "epoch": 0.22957572428944117, "grad_norm": 0.6632958650588989, "learning_rate": 0.00022967032967032962, "loss": 0.8228, "step": 209 }, { "epoch": 0.23067417273101745, "grad_norm": 0.3243491053581238, "learning_rate": 0.00023076923076923076, "loss": 0.4823, "step": 210 }, { "epoch": 0.2317726211725937, "grad_norm": 0.45630499720573425, "learning_rate": 0.00023186813186813185, "loss": 0.7206, "step": 211 }, { "epoch": 0.23287106961416998, "grad_norm": 0.6726184487342834, "learning_rate": 0.00023296703296703296, "loss": 0.8211, "step": 212 }, { "epoch": 0.23396951805574626, "grad_norm": 0.45092982053756714, "learning_rate": 0.00023406593406593405, "loss": 0.6812, "step": 213 }, { "epoch": 0.23506796649732253, "grad_norm": 0.5624651312828064, "learning_rate": 0.00023516483516483513, "loss": 0.726, "step": 214 }, { "epoch": 0.2361664149388988, "grad_norm": 1.1685765981674194, "learning_rate": 0.00023626373626373624, "loss": 0.7906, "step": 215 }, { "epoch": 0.2372648633804751, "grad_norm": 0.581599771976471, "learning_rate": 0.00023736263736263733, "loss": 0.7049, "step": 216 }, { "epoch": 0.23836331182205137, "grad_norm": 0.7660847902297974, "learning_rate": 0.00023846153846153844, "loss": 0.6105, "step": 217 }, { "epoch": 0.23946176026362762, "grad_norm": 0.5126472115516663, "learning_rate": 0.00023956043956043955, "loss": 0.7134, "step": 218 }, { "epoch": 0.2405602087052039, "grad_norm": 0.48460498452186584, "learning_rate": 0.00024065934065934066, "loss": 0.5578, "step": 219 }, { "epoch": 0.24165865714678017, "grad_norm": 0.41463029384613037, "learning_rate": 0.00024175824175824175, "loss": 0.5589, "step": 220 }, { "epoch": 0.24275710558835645, "grad_norm": 2.0703623294830322, "learning_rate": 0.00024285714285714283, "loss": 0.7128, "step": 221 }, { "epoch": 0.24385555402993273, "grad_norm": 1.5641820430755615, "learning_rate": 0.00024395604395604394, "loss": 0.4439, "step": 222 }, { "epoch": 0.244954002471509, "grad_norm": 0.34634652733802795, "learning_rate": 0.00024505494505494503, "loss": 0.5389, "step": 223 }, { "epoch": 0.24605245091308528, "grad_norm": 0.5669183135032654, "learning_rate": 0.00024615384615384614, "loss": 0.5699, "step": 224 }, { "epoch": 0.24715089935466153, "grad_norm": 0.6459633111953735, "learning_rate": 0.0002472527472527472, "loss": 0.7904, "step": 225 }, { "epoch": 0.2482493477962378, "grad_norm": 0.9719502925872803, "learning_rate": 0.0002483516483516483, "loss": 0.7354, "step": 226 }, { "epoch": 0.24934779623781408, "grad_norm": 0.7433357834815979, "learning_rate": 0.0002494505494505494, "loss": 0.5772, "step": 227 }, { "epoch": 0.25044624467939036, "grad_norm": 0.42272481322288513, "learning_rate": 0.00025054945054945053, "loss": 0.5609, "step": 228 }, { "epoch": 0.2515446931209666, "grad_norm": 1.2868828773498535, "learning_rate": 0.00025164835164835165, "loss": 0.5775, "step": 229 }, { "epoch": 0.2526431415625429, "grad_norm": 0.40398430824279785, "learning_rate": 0.0002527472527472527, "loss": 0.742, "step": 230 }, { "epoch": 0.25374159000411917, "grad_norm": 0.46501678228378296, "learning_rate": 0.0002538461538461538, "loss": 0.69, "step": 231 }, { "epoch": 0.25484003844569547, "grad_norm": 0.46631869673728943, "learning_rate": 0.00025494505494505493, "loss": 0.7712, "step": 232 }, { "epoch": 0.2559384868872717, "grad_norm": 0.6761367321014404, "learning_rate": 0.000256043956043956, "loss": 0.64, "step": 233 }, { "epoch": 0.257036935328848, "grad_norm": 0.6253519654273987, "learning_rate": 0.0002571428571428571, "loss": 0.5499, "step": 234 }, { "epoch": 0.2581353837704243, "grad_norm": 1.0556268692016602, "learning_rate": 0.0002582417582417582, "loss": 0.869, "step": 235 }, { "epoch": 0.2592338322120005, "grad_norm": 0.4816044867038727, "learning_rate": 0.0002593406593406593, "loss": 0.6061, "step": 236 }, { "epoch": 0.26033228065357683, "grad_norm": 1.1049383878707886, "learning_rate": 0.00026043956043956043, "loss": 0.7695, "step": 237 }, { "epoch": 0.2614307290951531, "grad_norm": 0.44643181562423706, "learning_rate": 0.00026153846153846154, "loss": 0.7849, "step": 238 }, { "epoch": 0.2625291775367294, "grad_norm": 0.5231640338897705, "learning_rate": 0.0002626373626373626, "loss": 0.8033, "step": 239 }, { "epoch": 0.26362762597830564, "grad_norm": 0.5537316799163818, "learning_rate": 0.0002637362637362637, "loss": 0.7317, "step": 240 }, { "epoch": 0.26472607441988194, "grad_norm": 0.42069998383522034, "learning_rate": 0.0002648351648351648, "loss": 0.6325, "step": 241 }, { "epoch": 0.2658245228614582, "grad_norm": 0.8009732365608215, "learning_rate": 0.0002659340659340659, "loss": 0.6589, "step": 242 }, { "epoch": 0.26692297130303444, "grad_norm": 1.2626444101333618, "learning_rate": 0.000267032967032967, "loss": 0.5845, "step": 243 }, { "epoch": 0.26802141974461074, "grad_norm": 0.4783913195133209, "learning_rate": 0.0002681318681318681, "loss": 0.8844, "step": 244 }, { "epoch": 0.269119868186187, "grad_norm": 1.098160982131958, "learning_rate": 0.0002692307692307692, "loss": 0.6134, "step": 245 }, { "epoch": 0.2702183166277633, "grad_norm": 1.0397273302078247, "learning_rate": 0.00027032967032967033, "loss": 0.7861, "step": 246 }, { "epoch": 0.27131676506933955, "grad_norm": 0.9729229807853699, "learning_rate": 0.0002714285714285714, "loss": 0.7691, "step": 247 }, { "epoch": 0.27241521351091585, "grad_norm": 0.44837963581085205, "learning_rate": 0.0002725274725274725, "loss": 0.9414, "step": 248 }, { "epoch": 0.2735136619524921, "grad_norm": 1.4863499402999878, "learning_rate": 0.0002736263736263736, "loss": 0.5825, "step": 249 }, { "epoch": 0.27461211039406835, "grad_norm": 0.5948237180709839, "learning_rate": 0.00027472527472527467, "loss": 0.4934, "step": 250 }, { "epoch": 0.27571055883564466, "grad_norm": 0.5448721051216125, "learning_rate": 0.0002758241758241758, "loss": 0.6295, "step": 251 }, { "epoch": 0.2768090072772209, "grad_norm": 0.4309394657611847, "learning_rate": 0.0002769230769230769, "loss": 0.6561, "step": 252 }, { "epoch": 0.2779074557187972, "grad_norm": 0.7659335136413574, "learning_rate": 0.000278021978021978, "loss": 0.7588, "step": 253 }, { "epoch": 0.27900590416037346, "grad_norm": 0.45655715465545654, "learning_rate": 0.0002791208791208791, "loss": 0.5257, "step": 254 }, { "epoch": 0.28010435260194977, "grad_norm": 0.5390630960464478, "learning_rate": 0.0002802197802197802, "loss": 0.7051, "step": 255 }, { "epoch": 0.281202801043526, "grad_norm": 0.39703306555747986, "learning_rate": 0.0002813186813186813, "loss": 0.6137, "step": 256 }, { "epoch": 0.28230124948510227, "grad_norm": 0.4662924110889435, "learning_rate": 0.0002824175824175824, "loss": 0.4897, "step": 257 }, { "epoch": 0.2833996979266786, "grad_norm": 0.39399877190589905, "learning_rate": 0.00028351648351648346, "loss": 0.6235, "step": 258 }, { "epoch": 0.2844981463682548, "grad_norm": 0.497549444437027, "learning_rate": 0.00028461538461538457, "loss": 0.5134, "step": 259 }, { "epoch": 0.28559659480983113, "grad_norm": 0.6597803235054016, "learning_rate": 0.0002857142857142857, "loss": 0.7955, "step": 260 }, { "epoch": 0.2866950432514074, "grad_norm": 0.5545711517333984, "learning_rate": 0.0002868131868131868, "loss": 0.833, "step": 261 }, { "epoch": 0.2877934916929837, "grad_norm": 1.0227786302566528, "learning_rate": 0.0002879120879120879, "loss": 0.5249, "step": 262 }, { "epoch": 0.28889194013455993, "grad_norm": 0.5727143883705139, "learning_rate": 0.000289010989010989, "loss": 0.6319, "step": 263 }, { "epoch": 0.2899903885761362, "grad_norm": 0.39322397112846375, "learning_rate": 0.0002901098901098901, "loss": 0.7003, "step": 264 }, { "epoch": 0.2910888370177125, "grad_norm": 0.5657737851142883, "learning_rate": 0.0002912087912087912, "loss": 0.7085, "step": 265 }, { "epoch": 0.29218728545928874, "grad_norm": 0.4305976927280426, "learning_rate": 0.0002923076923076923, "loss": 0.5931, "step": 266 }, { "epoch": 0.29328573390086504, "grad_norm": 0.5300284624099731, "learning_rate": 0.00029340659340659336, "loss": 0.7881, "step": 267 }, { "epoch": 0.2943841823424413, "grad_norm": 0.5922349095344543, "learning_rate": 0.00029450549450549447, "loss": 0.8688, "step": 268 }, { "epoch": 0.2954826307840176, "grad_norm": 0.5700828433036804, "learning_rate": 0.0002956043956043956, "loss": 1.1328, "step": 269 }, { "epoch": 0.29658107922559385, "grad_norm": 0.6773694753646851, "learning_rate": 0.0002967032967032967, "loss": 0.7821, "step": 270 }, { "epoch": 0.2976795276671701, "grad_norm": 0.5200739502906799, "learning_rate": 0.0002978021978021978, "loss": 0.8775, "step": 271 }, { "epoch": 0.2987779761087464, "grad_norm": 0.9860020875930786, "learning_rate": 0.00029890109890109886, "loss": 0.9141, "step": 272 }, { "epoch": 0.29987642455032265, "grad_norm": 0.7012956142425537, "learning_rate": 0.0003, "loss": 0.7672, "step": 273 }, { "epoch": 0.30097487299189896, "grad_norm": 0.4128098785877228, "learning_rate": 0.0002998778998778999, "loss": 0.3969, "step": 274 }, { "epoch": 0.3020733214334752, "grad_norm": 0.366597980260849, "learning_rate": 0.00029975579975579974, "loss": 0.639, "step": 275 }, { "epoch": 0.3031717698750515, "grad_norm": 0.5208033919334412, "learning_rate": 0.0002996336996336996, "loss": 0.664, "step": 276 }, { "epoch": 0.30427021831662776, "grad_norm": 0.45519202947616577, "learning_rate": 0.0002995115995115995, "loss": 0.8495, "step": 277 }, { "epoch": 0.305368666758204, "grad_norm": 0.6617010831832886, "learning_rate": 0.0002993894993894994, "loss": 1.0204, "step": 278 }, { "epoch": 0.3064671151997803, "grad_norm": 1.4151723384857178, "learning_rate": 0.00029926739926739923, "loss": 0.8289, "step": 279 }, { "epoch": 0.30756556364135657, "grad_norm": 0.6531035900115967, "learning_rate": 0.00029914529914529915, "loss": 0.7571, "step": 280 }, { "epoch": 0.30866401208293287, "grad_norm": 0.8595600724220276, "learning_rate": 0.000299023199023199, "loss": 0.9668, "step": 281 }, { "epoch": 0.3097624605245091, "grad_norm": 0.50210040807724, "learning_rate": 0.00029890109890109886, "loss": 0.6662, "step": 282 }, { "epoch": 0.3108609089660854, "grad_norm": 0.6004669666290283, "learning_rate": 0.0002987789987789988, "loss": 0.7127, "step": 283 }, { "epoch": 0.3119593574076617, "grad_norm": 0.8085057139396667, "learning_rate": 0.00029865689865689863, "loss": 0.9266, "step": 284 }, { "epoch": 0.3130578058492379, "grad_norm": 0.44965627789497375, "learning_rate": 0.0002985347985347985, "loss": 0.7118, "step": 285 }, { "epoch": 0.31415625429081423, "grad_norm": 0.5758265852928162, "learning_rate": 0.00029841269841269835, "loss": 0.6915, "step": 286 }, { "epoch": 0.3152547027323905, "grad_norm": 0.5623393058776855, "learning_rate": 0.00029829059829059826, "loss": 0.6962, "step": 287 }, { "epoch": 0.3163531511739668, "grad_norm": 0.857796311378479, "learning_rate": 0.0002981684981684982, "loss": 0.676, "step": 288 }, { "epoch": 0.31745159961554303, "grad_norm": 0.36431241035461426, "learning_rate": 0.000298046398046398, "loss": 0.5475, "step": 289 }, { "epoch": 0.31855004805711934, "grad_norm": 0.4778802692890167, "learning_rate": 0.0002979242979242979, "loss": 0.7198, "step": 290 }, { "epoch": 0.3196484964986956, "grad_norm": 0.4887610673904419, "learning_rate": 0.0002978021978021978, "loss": 0.5559, "step": 291 }, { "epoch": 0.32074694494027184, "grad_norm": 0.745379626750946, "learning_rate": 0.00029768009768009766, "loss": 1.0509, "step": 292 }, { "epoch": 0.32184539338184814, "grad_norm": 0.40081167221069336, "learning_rate": 0.0002975579975579975, "loss": 0.6564, "step": 293 }, { "epoch": 0.3229438418234244, "grad_norm": 0.5133034586906433, "learning_rate": 0.00029743589743589743, "loss": 0.6765, "step": 294 }, { "epoch": 0.3240422902650007, "grad_norm": 0.5123881697654724, "learning_rate": 0.0002973137973137973, "loss": 0.8001, "step": 295 }, { "epoch": 0.32514073870657695, "grad_norm": 0.3771597743034363, "learning_rate": 0.00029719169719169715, "loss": 0.785, "step": 296 }, { "epoch": 0.32623918714815325, "grad_norm": 0.38929086923599243, "learning_rate": 0.00029706959706959706, "loss": 0.7273, "step": 297 }, { "epoch": 0.3273376355897295, "grad_norm": 0.47761446237564087, "learning_rate": 0.0002969474969474969, "loss": 0.6997, "step": 298 }, { "epoch": 0.3284360840313058, "grad_norm": 0.4798452854156494, "learning_rate": 0.0002968253968253968, "loss": 0.7171, "step": 299 }, { "epoch": 0.32953453247288206, "grad_norm": 0.5864073038101196, "learning_rate": 0.0002967032967032967, "loss": 0.7075, "step": 300 }, { "epoch": 0.3306329809144583, "grad_norm": 0.6298258900642395, "learning_rate": 0.00029658119658119655, "loss": 0.8659, "step": 301 }, { "epoch": 0.3317314293560346, "grad_norm": 0.9764651656150818, "learning_rate": 0.0002964590964590964, "loss": 0.7451, "step": 302 }, { "epoch": 0.33282987779761086, "grad_norm": 0.7084535360336304, "learning_rate": 0.0002963369963369963, "loss": 0.7896, "step": 303 }, { "epoch": 0.33392832623918717, "grad_norm": 0.3226016163825989, "learning_rate": 0.0002962148962148962, "loss": 0.5614, "step": 304 }, { "epoch": 0.3350267746807634, "grad_norm": 0.5515668988227844, "learning_rate": 0.0002960927960927961, "loss": 0.6981, "step": 305 }, { "epoch": 0.3361252231223397, "grad_norm": 0.42776307463645935, "learning_rate": 0.00029597069597069595, "loss": 0.5911, "step": 306 }, { "epoch": 0.33722367156391597, "grad_norm": 0.36645814776420593, "learning_rate": 0.0002958485958485958, "loss": 0.5584, "step": 307 }, { "epoch": 0.3383221200054922, "grad_norm": 0.4089672565460205, "learning_rate": 0.0002957264957264957, "loss": 0.6814, "step": 308 }, { "epoch": 0.3394205684470685, "grad_norm": 0.4406324326992035, "learning_rate": 0.0002956043956043956, "loss": 0.5426, "step": 309 }, { "epoch": 0.3405190168886448, "grad_norm": 0.4138193726539612, "learning_rate": 0.00029548229548229544, "loss": 0.7554, "step": 310 }, { "epoch": 0.3416174653302211, "grad_norm": 0.45647338032722473, "learning_rate": 0.00029536019536019535, "loss": 0.4871, "step": 311 }, { "epoch": 0.34271591377179733, "grad_norm": 0.44362974166870117, "learning_rate": 0.0002952380952380952, "loss": 0.7254, "step": 312 }, { "epoch": 0.34381436221337364, "grad_norm": 0.5832559466362, "learning_rate": 0.00029511599511599507, "loss": 0.64, "step": 313 }, { "epoch": 0.3449128106549499, "grad_norm": 0.6754651665687561, "learning_rate": 0.000294993894993895, "loss": 0.7046, "step": 314 }, { "epoch": 0.34601125909652614, "grad_norm": 0.6487123370170593, "learning_rate": 0.00029487179487179484, "loss": 0.5934, "step": 315 }, { "epoch": 0.34710970753810244, "grad_norm": 0.24118930101394653, "learning_rate": 0.0002947496947496947, "loss": 0.5241, "step": 316 }, { "epoch": 0.3482081559796787, "grad_norm": 0.4580494165420532, "learning_rate": 0.0002946275946275946, "loss": 0.6733, "step": 317 }, { "epoch": 0.349306604421255, "grad_norm": 0.4770609736442566, "learning_rate": 0.00029450549450549447, "loss": 0.5758, "step": 318 }, { "epoch": 0.35040505286283125, "grad_norm": 0.40334221720695496, "learning_rate": 0.0002943833943833944, "loss": 0.5365, "step": 319 }, { "epoch": 0.35150350130440755, "grad_norm": 0.5605480074882507, "learning_rate": 0.00029426129426129424, "loss": 0.5967, "step": 320 }, { "epoch": 0.3526019497459838, "grad_norm": 0.6031836271286011, "learning_rate": 0.0002941391941391941, "loss": 0.6397, "step": 321 }, { "epoch": 0.35370039818756005, "grad_norm": 0.5602075457572937, "learning_rate": 0.000294017094017094, "loss": 0.7253, "step": 322 }, { "epoch": 0.35479884662913636, "grad_norm": 1.5055879354476929, "learning_rate": 0.00029389499389499387, "loss": 0.6066, "step": 323 }, { "epoch": 0.3558972950707126, "grad_norm": 1.969072699546814, "learning_rate": 0.0002937728937728937, "loss": 0.9263, "step": 324 }, { "epoch": 0.3569957435122889, "grad_norm": 0.43139147758483887, "learning_rate": 0.00029365079365079364, "loss": 0.6462, "step": 325 }, { "epoch": 0.35809419195386516, "grad_norm": 0.40423595905303955, "learning_rate": 0.0002935286935286935, "loss": 0.4278, "step": 326 }, { "epoch": 0.35919264039544146, "grad_norm": 0.41983166337013245, "learning_rate": 0.00029340659340659336, "loss": 0.7527, "step": 327 }, { "epoch": 0.3602910888370177, "grad_norm": 0.6624807715415955, "learning_rate": 0.00029328449328449327, "loss": 0.7381, "step": 328 }, { "epoch": 0.36138953727859396, "grad_norm": 0.6173990964889526, "learning_rate": 0.00029316239316239313, "loss": 0.6838, "step": 329 }, { "epoch": 0.36248798572017027, "grad_norm": 1.1278433799743652, "learning_rate": 0.000293040293040293, "loss": 0.8439, "step": 330 }, { "epoch": 0.3635864341617465, "grad_norm": 0.3453993797302246, "learning_rate": 0.0002929181929181929, "loss": 0.5324, "step": 331 }, { "epoch": 0.3646848826033228, "grad_norm": 0.4151187241077423, "learning_rate": 0.0002927960927960928, "loss": 0.7019, "step": 332 }, { "epoch": 0.3657833310448991, "grad_norm": 0.4247313439846039, "learning_rate": 0.0002926739926739926, "loss": 0.6362, "step": 333 }, { "epoch": 0.3668817794864754, "grad_norm": 1.5250136852264404, "learning_rate": 0.00029255189255189253, "loss": 0.5885, "step": 334 }, { "epoch": 0.36798022792805163, "grad_norm": 0.43669968843460083, "learning_rate": 0.00029242979242979244, "loss": 0.9191, "step": 335 }, { "epoch": 0.3690786763696279, "grad_norm": 0.8063925504684448, "learning_rate": 0.0002923076923076923, "loss": 0.6813, "step": 336 }, { "epoch": 0.3701771248112042, "grad_norm": 0.6002399325370789, "learning_rate": 0.00029218559218559216, "loss": 0.5859, "step": 337 }, { "epoch": 0.37127557325278043, "grad_norm": 0.9405462145805359, "learning_rate": 0.000292063492063492, "loss": 0.7476, "step": 338 }, { "epoch": 0.37237402169435674, "grad_norm": 0.5050615072250366, "learning_rate": 0.00029194139194139193, "loss": 0.5172, "step": 339 }, { "epoch": 0.373472470135933, "grad_norm": 0.4593801200389862, "learning_rate": 0.0002918192918192918, "loss": 0.5405, "step": 340 }, { "epoch": 0.3745709185775093, "grad_norm": 0.5275060534477234, "learning_rate": 0.00029169719169719164, "loss": 0.4537, "step": 341 }, { "epoch": 0.37566936701908554, "grad_norm": 0.8907522559165955, "learning_rate": 0.00029157509157509156, "loss": 0.6826, "step": 342 }, { "epoch": 0.3767678154606618, "grad_norm": 0.7229670882225037, "learning_rate": 0.0002914529914529914, "loss": 0.6072, "step": 343 }, { "epoch": 0.3778662639022381, "grad_norm": 1.7154827117919922, "learning_rate": 0.0002913308913308913, "loss": 0.6956, "step": 344 }, { "epoch": 0.37896471234381435, "grad_norm": 1.012902021408081, "learning_rate": 0.0002912087912087912, "loss": 0.5337, "step": 345 }, { "epoch": 0.38006316078539065, "grad_norm": 0.6467313170433044, "learning_rate": 0.00029108669108669105, "loss": 0.7652, "step": 346 }, { "epoch": 0.3811616092269669, "grad_norm": 0.5594947338104248, "learning_rate": 0.0002909645909645909, "loss": 0.578, "step": 347 }, { "epoch": 0.3822600576685432, "grad_norm": 0.5808854699134827, "learning_rate": 0.0002908424908424908, "loss": 0.6142, "step": 348 }, { "epoch": 0.38335850611011946, "grad_norm": 0.6067795157432556, "learning_rate": 0.00029072039072039073, "loss": 0.7682, "step": 349 }, { "epoch": 0.3844569545516957, "grad_norm": 0.392993301153183, "learning_rate": 0.0002905982905982906, "loss": 0.6599, "step": 350 }, { "epoch": 0.385555402993272, "grad_norm": 0.3963404893875122, "learning_rate": 0.00029047619047619045, "loss": 0.7079, "step": 351 }, { "epoch": 0.38665385143484826, "grad_norm": 0.3471222221851349, "learning_rate": 0.00029035409035409036, "loss": 0.463, "step": 352 }, { "epoch": 0.38775229987642457, "grad_norm": 0.5496531128883362, "learning_rate": 0.0002902319902319902, "loss": 0.7639, "step": 353 }, { "epoch": 0.3888507483180008, "grad_norm": 0.5482885241508484, "learning_rate": 0.0002901098901098901, "loss": 0.4198, "step": 354 }, { "epoch": 0.3899491967595771, "grad_norm": 0.7329181432723999, "learning_rate": 0.00028998778998779, "loss": 0.6057, "step": 355 }, { "epoch": 0.39104764520115337, "grad_norm": 0.41850918531417847, "learning_rate": 0.00028986568986568985, "loss": 0.605, "step": 356 }, { "epoch": 0.3921460936427296, "grad_norm": 0.4463609457015991, "learning_rate": 0.0002897435897435897, "loss": 0.7381, "step": 357 }, { "epoch": 0.3932445420843059, "grad_norm": 0.7207491397857666, "learning_rate": 0.0002896214896214896, "loss": 0.6892, "step": 358 }, { "epoch": 0.3943429905258822, "grad_norm": 0.3715958595275879, "learning_rate": 0.0002894993894993895, "loss": 0.5426, "step": 359 }, { "epoch": 0.3954414389674585, "grad_norm": 0.7077822685241699, "learning_rate": 0.00028937728937728933, "loss": 0.5923, "step": 360 }, { "epoch": 0.39653988740903473, "grad_norm": 0.5109585523605347, "learning_rate": 0.00028925518925518925, "loss": 0.5939, "step": 361 }, { "epoch": 0.39763833585061104, "grad_norm": 0.6105355024337769, "learning_rate": 0.0002891330891330891, "loss": 1.0345, "step": 362 }, { "epoch": 0.3987367842921873, "grad_norm": 0.479732871055603, "learning_rate": 0.000289010989010989, "loss": 0.71, "step": 363 }, { "epoch": 0.39983523273376353, "grad_norm": 0.8600007891654968, "learning_rate": 0.0002888888888888888, "loss": 0.7406, "step": 364 }, { "epoch": 0.40093368117533984, "grad_norm": 0.6584550738334656, "learning_rate": 0.00028876678876678873, "loss": 0.6658, "step": 365 }, { "epoch": 0.4020321296169161, "grad_norm": 0.7251041531562805, "learning_rate": 0.00028864468864468865, "loss": 0.8425, "step": 366 }, { "epoch": 0.4031305780584924, "grad_norm": 0.5729238390922546, "learning_rate": 0.0002885225885225885, "loss": 0.9054, "step": 367 }, { "epoch": 0.40422902650006864, "grad_norm": 1.1829932928085327, "learning_rate": 0.00028840048840048836, "loss": 0.9232, "step": 368 }, { "epoch": 0.40532747494164495, "grad_norm": 0.37746721506118774, "learning_rate": 0.0002882783882783883, "loss": 0.9619, "step": 369 }, { "epoch": 0.4064259233832212, "grad_norm": 0.5653749108314514, "learning_rate": 0.00028815628815628813, "loss": 0.7182, "step": 370 }, { "epoch": 0.40752437182479745, "grad_norm": 0.6024563312530518, "learning_rate": 0.000288034188034188, "loss": 0.6881, "step": 371 }, { "epoch": 0.40862282026637375, "grad_norm": 0.485350102186203, "learning_rate": 0.0002879120879120879, "loss": 0.6451, "step": 372 }, { "epoch": 0.40972126870795, "grad_norm": 0.5762611627578735, "learning_rate": 0.00028778998778998776, "loss": 0.7818, "step": 373 }, { "epoch": 0.4108197171495263, "grad_norm": 0.7961844801902771, "learning_rate": 0.0002876678876678876, "loss": 0.6682, "step": 374 }, { "epoch": 0.41191816559110256, "grad_norm": 0.4630587697029114, "learning_rate": 0.00028754578754578753, "loss": 0.9015, "step": 375 }, { "epoch": 0.41301661403267886, "grad_norm": 0.6592808961868286, "learning_rate": 0.0002874236874236874, "loss": 0.5738, "step": 376 }, { "epoch": 0.4141150624742551, "grad_norm": 0.4788278639316559, "learning_rate": 0.00028730158730158725, "loss": 0.7022, "step": 377 }, { "epoch": 0.41521351091583136, "grad_norm": 0.5041861534118652, "learning_rate": 0.00028717948717948716, "loss": 0.6137, "step": 378 }, { "epoch": 0.41631195935740767, "grad_norm": 0.5436013340950012, "learning_rate": 0.000287057387057387, "loss": 0.6621, "step": 379 }, { "epoch": 0.4174104077989839, "grad_norm": 0.5102400183677673, "learning_rate": 0.00028693528693528694, "loss": 0.6627, "step": 380 }, { "epoch": 0.4185088562405602, "grad_norm": 0.43655040860176086, "learning_rate": 0.0002868131868131868, "loss": 0.6475, "step": 381 }, { "epoch": 0.4196073046821365, "grad_norm": 0.3989826738834381, "learning_rate": 0.00028669108669108665, "loss": 0.5483, "step": 382 }, { "epoch": 0.4207057531237128, "grad_norm": 0.7781158685684204, "learning_rate": 0.00028656898656898656, "loss": 0.6475, "step": 383 }, { "epoch": 0.421804201565289, "grad_norm": 0.8119930624961853, "learning_rate": 0.0002864468864468864, "loss": 0.8122, "step": 384 }, { "epoch": 0.4229026500068653, "grad_norm": 0.7233585119247437, "learning_rate": 0.0002863247863247863, "loss": 0.7837, "step": 385 }, { "epoch": 0.4240010984484416, "grad_norm": 0.41249507665634155, "learning_rate": 0.0002862026862026862, "loss": 0.6916, "step": 386 }, { "epoch": 0.42509954689001783, "grad_norm": 0.4865298867225647, "learning_rate": 0.00028608058608058605, "loss": 0.595, "step": 387 }, { "epoch": 0.42619799533159414, "grad_norm": 0.6057963371276855, "learning_rate": 0.0002859584859584859, "loss": 0.7214, "step": 388 }, { "epoch": 0.4272964437731704, "grad_norm": 0.5390968918800354, "learning_rate": 0.0002858363858363858, "loss": 0.805, "step": 389 }, { "epoch": 0.4283948922147467, "grad_norm": 0.5944109559059143, "learning_rate": 0.0002857142857142857, "loss": 0.9953, "step": 390 }, { "epoch": 0.42949334065632294, "grad_norm": 0.5480278134346008, "learning_rate": 0.00028559218559218554, "loss": 0.8406, "step": 391 }, { "epoch": 0.4305917890978992, "grad_norm": 0.5168552994728088, "learning_rate": 0.00028547008547008545, "loss": 0.9715, "step": 392 }, { "epoch": 0.4316902375394755, "grad_norm": 0.4859452247619629, "learning_rate": 0.0002853479853479853, "loss": 0.7368, "step": 393 }, { "epoch": 0.43278868598105175, "grad_norm": 0.4697234034538269, "learning_rate": 0.0002852258852258852, "loss": 0.4801, "step": 394 }, { "epoch": 0.43388713442262805, "grad_norm": 0.6198891401290894, "learning_rate": 0.0002851037851037851, "loss": 0.5184, "step": 395 }, { "epoch": 0.4349855828642043, "grad_norm": 0.531563401222229, "learning_rate": 0.00028498168498168494, "loss": 0.8047, "step": 396 }, { "epoch": 0.4360840313057806, "grad_norm": 0.4610724449157715, "learning_rate": 0.00028485958485958485, "loss": 0.4583, "step": 397 }, { "epoch": 0.43718247974735686, "grad_norm": 0.5609697699546814, "learning_rate": 0.0002847374847374847, "loss": 0.7362, "step": 398 }, { "epoch": 0.4382809281889331, "grad_norm": 0.5257968306541443, "learning_rate": 0.00028461538461538457, "loss": 0.8173, "step": 399 }, { "epoch": 0.4393793766305094, "grad_norm": 0.8307009339332581, "learning_rate": 0.0002844932844932845, "loss": 0.5507, "step": 400 }, { "epoch": 0.44047782507208566, "grad_norm": 0.36615508794784546, "learning_rate": 0.00028437118437118434, "loss": 0.6605, "step": 401 }, { "epoch": 0.44157627351366197, "grad_norm": 0.35138362646102905, "learning_rate": 0.0002842490842490842, "loss": 0.6614, "step": 402 }, { "epoch": 0.4426747219552382, "grad_norm": 0.5054494738578796, "learning_rate": 0.0002841269841269841, "loss": 0.799, "step": 403 }, { "epoch": 0.4437731703968145, "grad_norm": 0.4711816608905792, "learning_rate": 0.00028400488400488397, "loss": 0.8892, "step": 404 }, { "epoch": 0.44487161883839077, "grad_norm": 0.5073884725570679, "learning_rate": 0.00028388278388278383, "loss": 0.8156, "step": 405 }, { "epoch": 0.445970067279967, "grad_norm": 0.29938632249832153, "learning_rate": 0.00028376068376068374, "loss": 0.7598, "step": 406 }, { "epoch": 0.4470685157215433, "grad_norm": 1.745937466621399, "learning_rate": 0.00028363858363858365, "loss": 0.7829, "step": 407 }, { "epoch": 0.4481669641631196, "grad_norm": 0.46887943148612976, "learning_rate": 0.00028351648351648346, "loss": 0.7798, "step": 408 }, { "epoch": 0.4492654126046959, "grad_norm": 0.4274987280368805, "learning_rate": 0.00028339438339438337, "loss": 0.8407, "step": 409 }, { "epoch": 0.45036386104627213, "grad_norm": 0.4445902109146118, "learning_rate": 0.0002832722832722833, "loss": 0.7394, "step": 410 }, { "epoch": 0.45146230948784843, "grad_norm": 0.3842466175556183, "learning_rate": 0.00028315018315018314, "loss": 0.7781, "step": 411 }, { "epoch": 0.4525607579294247, "grad_norm": 0.5660600066184998, "learning_rate": 0.000283028083028083, "loss": 0.8058, "step": 412 }, { "epoch": 0.45365920637100093, "grad_norm": 0.442911297082901, "learning_rate": 0.0002829059829059829, "loss": 0.808, "step": 413 }, { "epoch": 0.45475765481257724, "grad_norm": 0.9051260352134705, "learning_rate": 0.00028278388278388277, "loss": 0.9427, "step": 414 }, { "epoch": 0.4558561032541535, "grad_norm": 0.8027593493461609, "learning_rate": 0.00028266178266178263, "loss": 0.531, "step": 415 }, { "epoch": 0.4569545516957298, "grad_norm": 0.36242446303367615, "learning_rate": 0.0002825396825396825, "loss": 0.5609, "step": 416 }, { "epoch": 0.45805300013730604, "grad_norm": 0.6095871925354004, "learning_rate": 0.0002824175824175824, "loss": 0.7424, "step": 417 }, { "epoch": 0.45915144857888235, "grad_norm": 0.5102814435958862, "learning_rate": 0.00028229548229548226, "loss": 0.8861, "step": 418 }, { "epoch": 0.4602498970204586, "grad_norm": 0.375265896320343, "learning_rate": 0.0002821733821733821, "loss": 0.6235, "step": 419 }, { "epoch": 0.4613483454620349, "grad_norm": 0.4506315588951111, "learning_rate": 0.00028205128205128203, "loss": 0.6059, "step": 420 }, { "epoch": 0.46244679390361115, "grad_norm": 0.8119642734527588, "learning_rate": 0.0002819291819291819, "loss": 0.7821, "step": 421 }, { "epoch": 0.4635452423451874, "grad_norm": 0.42945513129234314, "learning_rate": 0.00028180708180708175, "loss": 0.9503, "step": 422 }, { "epoch": 0.4646436907867637, "grad_norm": 0.35567665100097656, "learning_rate": 0.00028168498168498166, "loss": 0.5243, "step": 423 }, { "epoch": 0.46574213922833996, "grad_norm": 0.5160343647003174, "learning_rate": 0.00028156288156288157, "loss": 0.5767, "step": 424 }, { "epoch": 0.46684058766991626, "grad_norm": 0.37530624866485596, "learning_rate": 0.00028144078144078143, "loss": 1.2016, "step": 425 }, { "epoch": 0.4679390361114925, "grad_norm": 0.5283146500587463, "learning_rate": 0.0002813186813186813, "loss": 0.5958, "step": 426 }, { "epoch": 0.4690374845530688, "grad_norm": 0.5217192769050598, "learning_rate": 0.0002811965811965812, "loss": 0.715, "step": 427 }, { "epoch": 0.47013593299464507, "grad_norm": 0.5092077851295471, "learning_rate": 0.00028107448107448106, "loss": 0.6942, "step": 428 }, { "epoch": 0.4712343814362213, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002809523809523809, "loss": 1.0185, "step": 429 }, { "epoch": 0.4723328298777976, "grad_norm": 0.3117397725582123, "learning_rate": 0.00028083028083028083, "loss": 0.6949, "step": 430 }, { "epoch": 0.47343127831937387, "grad_norm": 0.3218965232372284, "learning_rate": 0.0002807081807081807, "loss": 0.6872, "step": 431 }, { "epoch": 0.4745297267609502, "grad_norm": 1.104121446609497, "learning_rate": 0.00028058608058608055, "loss": 0.6628, "step": 432 }, { "epoch": 0.4756281752025264, "grad_norm": 0.3224816620349884, "learning_rate": 0.00028046398046398046, "loss": 0.5974, "step": 433 }, { "epoch": 0.47672662364410273, "grad_norm": 0.5742220878601074, "learning_rate": 0.0002803418803418803, "loss": 0.7248, "step": 434 }, { "epoch": 0.477825072085679, "grad_norm": 0.5449275374412537, "learning_rate": 0.0002802197802197802, "loss": 0.8552, "step": 435 }, { "epoch": 0.47892352052725523, "grad_norm": 0.44660067558288574, "learning_rate": 0.0002800976800976801, "loss": 0.6968, "step": 436 }, { "epoch": 0.48002196896883154, "grad_norm": 0.4287508428096771, "learning_rate": 0.00027997557997557995, "loss": 0.8101, "step": 437 }, { "epoch": 0.4811204174104078, "grad_norm": 0.4142225384712219, "learning_rate": 0.00027985347985347986, "loss": 0.5379, "step": 438 }, { "epoch": 0.4822188658519841, "grad_norm": 1.246833324432373, "learning_rate": 0.0002797313797313797, "loss": 0.7116, "step": 439 }, { "epoch": 0.48331731429356034, "grad_norm": 0.3845030963420868, "learning_rate": 0.0002796092796092796, "loss": 0.8088, "step": 440 }, { "epoch": 0.48441576273513665, "grad_norm": 1.4492995738983154, "learning_rate": 0.0002794871794871795, "loss": 0.7358, "step": 441 }, { "epoch": 0.4855142111767129, "grad_norm": 0.40994521975517273, "learning_rate": 0.00027936507936507935, "loss": 0.6228, "step": 442 }, { "epoch": 0.48661265961828915, "grad_norm": 0.4782777428627014, "learning_rate": 0.0002792429792429792, "loss": 0.4944, "step": 443 }, { "epoch": 0.48771110805986545, "grad_norm": 0.47269922494888306, "learning_rate": 0.0002791208791208791, "loss": 0.7023, "step": 444 }, { "epoch": 0.4888095565014417, "grad_norm": 0.5529118776321411, "learning_rate": 0.000278998778998779, "loss": 0.7717, "step": 445 }, { "epoch": 0.489908004943018, "grad_norm": 0.4244072139263153, "learning_rate": 0.00027887667887667884, "loss": 0.7902, "step": 446 }, { "epoch": 0.49100645338459425, "grad_norm": 1.4737539291381836, "learning_rate": 0.00027875457875457875, "loss": 0.5784, "step": 447 }, { "epoch": 0.49210490182617056, "grad_norm": 0.40120208263397217, "learning_rate": 0.0002786324786324786, "loss": 0.7974, "step": 448 }, { "epoch": 0.4932033502677468, "grad_norm": 0.5481031537055969, "learning_rate": 0.00027851037851037846, "loss": 0.7867, "step": 449 }, { "epoch": 0.49430179870932306, "grad_norm": 0.36719343066215515, "learning_rate": 0.0002783882783882784, "loss": 0.6543, "step": 450 }, { "epoch": 0.49540024715089936, "grad_norm": 0.3980066776275635, "learning_rate": 0.00027826617826617824, "loss": 0.5395, "step": 451 }, { "epoch": 0.4964986955924756, "grad_norm": 0.45570313930511475, "learning_rate": 0.0002781440781440781, "loss": 0.7908, "step": 452 }, { "epoch": 0.4975971440340519, "grad_norm": 0.41858601570129395, "learning_rate": 0.000278021978021978, "loss": 0.5248, "step": 453 }, { "epoch": 0.49869559247562817, "grad_norm": 0.5019702315330505, "learning_rate": 0.00027789987789987786, "loss": 0.8006, "step": 454 }, { "epoch": 0.4997940409172045, "grad_norm": 0.4589880108833313, "learning_rate": 0.0002777777777777778, "loss": 0.7294, "step": 455 }, { "epoch": 0.5008924893587807, "grad_norm": 0.5679266452789307, "learning_rate": 0.00027765567765567764, "loss": 0.651, "step": 456 }, { "epoch": 0.501990937800357, "grad_norm": 0.4854479134082794, "learning_rate": 0.0002775335775335775, "loss": 0.9908, "step": 457 }, { "epoch": 0.5030893862419332, "grad_norm": 0.4964112341403961, "learning_rate": 0.0002774114774114774, "loss": 0.8084, "step": 458 }, { "epoch": 0.5041878346835096, "grad_norm": 0.5130513906478882, "learning_rate": 0.00027728937728937727, "loss": 0.8389, "step": 459 }, { "epoch": 0.5052862831250858, "grad_norm": 0.4784137010574341, "learning_rate": 0.0002771672771672771, "loss": 0.5497, "step": 460 }, { "epoch": 0.5063847315666621, "grad_norm": 0.28685998916625977, "learning_rate": 0.00027704517704517704, "loss": 0.491, "step": 461 }, { "epoch": 0.5074831800082383, "grad_norm": 0.5337100625038147, "learning_rate": 0.0002769230769230769, "loss": 0.8315, "step": 462 }, { "epoch": 0.5085816284498146, "grad_norm": 0.5431344509124756, "learning_rate": 0.00027680097680097675, "loss": 0.5996, "step": 463 }, { "epoch": 0.5096800768913909, "grad_norm": 0.4546130299568176, "learning_rate": 0.00027667887667887667, "loss": 0.5647, "step": 464 }, { "epoch": 0.5107785253329672, "grad_norm": 0.6298655271530151, "learning_rate": 0.0002765567765567765, "loss": 0.7684, "step": 465 }, { "epoch": 0.5118769737745434, "grad_norm": 0.44330841302871704, "learning_rate": 0.0002764346764346764, "loss": 0.4906, "step": 466 }, { "epoch": 0.5129754222161197, "grad_norm": 0.3824306130409241, "learning_rate": 0.0002763125763125763, "loss": 0.6123, "step": 467 }, { "epoch": 0.514073870657696, "grad_norm": 0.3225514590740204, "learning_rate": 0.00027619047619047615, "loss": 0.7535, "step": 468 }, { "epoch": 0.5151723190992723, "grad_norm": 0.701239824295044, "learning_rate": 0.00027606837606837607, "loss": 0.9643, "step": 469 }, { "epoch": 0.5162707675408486, "grad_norm": 0.37800920009613037, "learning_rate": 0.0002759462759462759, "loss": 0.543, "step": 470 }, { "epoch": 0.5173692159824248, "grad_norm": 0.3521328568458557, "learning_rate": 0.0002758241758241758, "loss": 0.7157, "step": 471 }, { "epoch": 0.518467664424001, "grad_norm": 0.2659924626350403, "learning_rate": 0.0002757020757020757, "loss": 0.7334, "step": 472 }, { "epoch": 0.5195661128655774, "grad_norm": 0.42815065383911133, "learning_rate": 0.00027557997557997555, "loss": 1.2015, "step": 473 }, { "epoch": 0.5206645613071537, "grad_norm": 0.7758998870849609, "learning_rate": 0.0002754578754578754, "loss": 0.9493, "step": 474 }, { "epoch": 0.5217630097487299, "grad_norm": 0.46281251311302185, "learning_rate": 0.0002753357753357753, "loss": 0.9159, "step": 475 }, { "epoch": 0.5228614581903062, "grad_norm": 0.3668971061706543, "learning_rate": 0.0002752136752136752, "loss": 0.4869, "step": 476 }, { "epoch": 0.5239599066318824, "grad_norm": 0.462534099817276, "learning_rate": 0.00027509157509157504, "loss": 0.6439, "step": 477 }, { "epoch": 0.5250583550734588, "grad_norm": 0.6341688632965088, "learning_rate": 0.00027496947496947495, "loss": 0.6948, "step": 478 }, { "epoch": 0.526156803515035, "grad_norm": 0.5469139814376831, "learning_rate": 0.0002748473748473748, "loss": 1.016, "step": 479 }, { "epoch": 0.5272552519566113, "grad_norm": 0.438204288482666, "learning_rate": 0.00027472527472527467, "loss": 0.6941, "step": 480 }, { "epoch": 0.5283537003981875, "grad_norm": 0.586700975894928, "learning_rate": 0.0002746031746031746, "loss": 0.6649, "step": 481 }, { "epoch": 0.5294521488397639, "grad_norm": 0.4077949523925781, "learning_rate": 0.0002744810744810745, "loss": 0.5948, "step": 482 }, { "epoch": 0.5305505972813401, "grad_norm": 0.3756411373615265, "learning_rate": 0.0002743589743589743, "loss": 0.4915, "step": 483 }, { "epoch": 0.5316490457229164, "grad_norm": 1.2067008018493652, "learning_rate": 0.0002742368742368742, "loss": 0.8795, "step": 484 }, { "epoch": 0.5327474941644926, "grad_norm": 0.3097778260707855, "learning_rate": 0.0002741147741147741, "loss": 0.5478, "step": 485 }, { "epoch": 0.5338459426060689, "grad_norm": 0.5536866188049316, "learning_rate": 0.000273992673992674, "loss": 0.7042, "step": 486 }, { "epoch": 0.5349443910476452, "grad_norm": 0.5930231809616089, "learning_rate": 0.00027387057387057384, "loss": 0.7108, "step": 487 }, { "epoch": 0.5360428394892215, "grad_norm": 0.39304253458976746, "learning_rate": 0.00027374847374847375, "loss": 0.788, "step": 488 }, { "epoch": 0.5371412879307977, "grad_norm": 0.5238274335861206, "learning_rate": 0.0002736263736263736, "loss": 0.9887, "step": 489 }, { "epoch": 0.538239736372374, "grad_norm": 0.5993770956993103, "learning_rate": 0.00027350427350427347, "loss": 0.7819, "step": 490 }, { "epoch": 0.5393381848139503, "grad_norm": 0.4601563811302185, "learning_rate": 0.00027338217338217333, "loss": 0.4347, "step": 491 }, { "epoch": 0.5404366332555266, "grad_norm": 0.5292415022850037, "learning_rate": 0.00027326007326007324, "loss": 0.5248, "step": 492 }, { "epoch": 0.5415350816971028, "grad_norm": 0.37247565388679504, "learning_rate": 0.0002731379731379731, "loss": 0.5412, "step": 493 }, { "epoch": 0.5426335301386791, "grad_norm": 0.6865994930267334, "learning_rate": 0.00027301587301587296, "loss": 0.8263, "step": 494 }, { "epoch": 0.5437319785802553, "grad_norm": 0.5019715428352356, "learning_rate": 0.00027289377289377287, "loss": 0.7084, "step": 495 }, { "epoch": 0.5448304270218317, "grad_norm": 0.8432828783988953, "learning_rate": 0.00027277167277167273, "loss": 0.6188, "step": 496 }, { "epoch": 0.545928875463408, "grad_norm": 0.594881534576416, "learning_rate": 0.0002726495726495726, "loss": 0.8923, "step": 497 }, { "epoch": 0.5470273239049842, "grad_norm": 0.5573694705963135, "learning_rate": 0.0002725274725274725, "loss": 0.6351, "step": 498 }, { "epoch": 0.5481257723465605, "grad_norm": 0.30426710844039917, "learning_rate": 0.0002724053724053724, "loss": 0.6359, "step": 499 }, { "epoch": 0.5492242207881367, "grad_norm": 0.759385883808136, "learning_rate": 0.00027228327228327227, "loss": 0.6131, "step": 500 }, { "epoch": 0.5503226692297131, "grad_norm": 0.5436901450157166, "learning_rate": 0.00027216117216117213, "loss": 0.5232, "step": 501 }, { "epoch": 0.5514211176712893, "grad_norm": 0.5924163460731506, "learning_rate": 0.00027203907203907204, "loss": 0.9594, "step": 502 }, { "epoch": 0.5525195661128656, "grad_norm": 0.49177658557891846, "learning_rate": 0.0002719169719169719, "loss": 0.842, "step": 503 }, { "epoch": 0.5536180145544418, "grad_norm": 0.4437295198440552, "learning_rate": 0.00027179487179487176, "loss": 1.0338, "step": 504 }, { "epoch": 0.5547164629960182, "grad_norm": 0.426213800907135, "learning_rate": 0.00027167277167277167, "loss": 0.6375, "step": 505 }, { "epoch": 0.5558149114375944, "grad_norm": 0.4599516689777374, "learning_rate": 0.00027155067155067153, "loss": 0.5005, "step": 506 }, { "epoch": 0.5569133598791707, "grad_norm": 0.647957980632782, "learning_rate": 0.0002714285714285714, "loss": 0.6292, "step": 507 }, { "epoch": 0.5580118083207469, "grad_norm": 0.7891755104064941, "learning_rate": 0.0002713064713064713, "loss": 0.697, "step": 508 }, { "epoch": 0.5591102567623232, "grad_norm": 0.5290817618370056, "learning_rate": 0.00027118437118437116, "loss": 0.4547, "step": 509 }, { "epoch": 0.5602087052038995, "grad_norm": 0.4025941789150238, "learning_rate": 0.000271062271062271, "loss": 0.6299, "step": 510 }, { "epoch": 0.5613071536454758, "grad_norm": 0.7768287658691406, "learning_rate": 0.00027094017094017093, "loss": 0.6813, "step": 511 }, { "epoch": 0.562405602087052, "grad_norm": 0.6977662444114685, "learning_rate": 0.0002708180708180708, "loss": 0.8217, "step": 512 }, { "epoch": 0.5635040505286283, "grad_norm": 0.5238949060440063, "learning_rate": 0.0002706959706959707, "loss": 0.7348, "step": 513 }, { "epoch": 0.5646024989702045, "grad_norm": 0.5099830627441406, "learning_rate": 0.00027057387057387056, "loss": 0.9894, "step": 514 }, { "epoch": 0.5657009474117809, "grad_norm": 0.6254756450653076, "learning_rate": 0.0002704517704517704, "loss": 0.9258, "step": 515 }, { "epoch": 0.5667993958533571, "grad_norm": 0.40313196182250977, "learning_rate": 0.00027032967032967033, "loss": 0.8115, "step": 516 }, { "epoch": 0.5678978442949334, "grad_norm": 0.9706575274467468, "learning_rate": 0.0002702075702075702, "loss": 0.5204, "step": 517 }, { "epoch": 0.5689962927365096, "grad_norm": 0.36777085065841675, "learning_rate": 0.00027008547008547005, "loss": 0.7716, "step": 518 }, { "epoch": 0.570094741178086, "grad_norm": 0.48726886510849, "learning_rate": 0.00026996336996336996, "loss": 0.7745, "step": 519 }, { "epoch": 0.5711931896196623, "grad_norm": 0.3590470850467682, "learning_rate": 0.0002698412698412698, "loss": 0.7038, "step": 520 }, { "epoch": 0.5722916380612385, "grad_norm": 0.7103118896484375, "learning_rate": 0.0002697191697191697, "loss": 0.8368, "step": 521 }, { "epoch": 0.5733900865028148, "grad_norm": 0.5503933429718018, "learning_rate": 0.0002695970695970696, "loss": 0.6164, "step": 522 }, { "epoch": 0.574488534944391, "grad_norm": 0.5255150198936462, "learning_rate": 0.00026947496947496945, "loss": 0.8886, "step": 523 }, { "epoch": 0.5755869833859674, "grad_norm": 0.4872569739818573, "learning_rate": 0.0002693528693528693, "loss": 0.6277, "step": 524 }, { "epoch": 0.5766854318275436, "grad_norm": 0.3748464584350586, "learning_rate": 0.0002692307692307692, "loss": 0.6471, "step": 525 }, { "epoch": 0.5777838802691199, "grad_norm": 0.4401276111602783, "learning_rate": 0.0002691086691086691, "loss": 0.9846, "step": 526 }, { "epoch": 0.5788823287106961, "grad_norm": 0.9565305709838867, "learning_rate": 0.00026898656898656894, "loss": 0.9471, "step": 527 }, { "epoch": 0.5799807771522724, "grad_norm": 0.6307245492935181, "learning_rate": 0.00026886446886446885, "loss": 0.9168, "step": 528 }, { "epoch": 0.5810792255938487, "grad_norm": 0.49177634716033936, "learning_rate": 0.0002687423687423687, "loss": 0.5464, "step": 529 }, { "epoch": 0.582177674035425, "grad_norm": 0.68553626537323, "learning_rate": 0.0002686202686202686, "loss": 0.5874, "step": 530 }, { "epoch": 0.5832761224770012, "grad_norm": 0.3811597228050232, "learning_rate": 0.0002684981684981685, "loss": 0.766, "step": 531 }, { "epoch": 0.5843745709185775, "grad_norm": 0.6634503602981567, "learning_rate": 0.00026837606837606834, "loss": 0.6438, "step": 532 }, { "epoch": 0.5854730193601538, "grad_norm": 0.6115571856498718, "learning_rate": 0.00026825396825396825, "loss": 0.8757, "step": 533 }, { "epoch": 0.5865714678017301, "grad_norm": 0.3011985719203949, "learning_rate": 0.0002681318681318681, "loss": 0.6188, "step": 534 }, { "epoch": 0.5876699162433063, "grad_norm": 0.7029386162757874, "learning_rate": 0.00026800976800976797, "loss": 0.8681, "step": 535 }, { "epoch": 0.5887683646848826, "grad_norm": 0.4796508550643921, "learning_rate": 0.0002678876678876679, "loss": 0.7207, "step": 536 }, { "epoch": 0.5898668131264588, "grad_norm": 0.542948842048645, "learning_rate": 0.00026776556776556774, "loss": 0.5587, "step": 537 }, { "epoch": 0.5909652615680352, "grad_norm": 0.7566731572151184, "learning_rate": 0.0002676434676434676, "loss": 0.8562, "step": 538 }, { "epoch": 0.5920637100096114, "grad_norm": 0.6411837339401245, "learning_rate": 0.0002675213675213675, "loss": 0.4516, "step": 539 }, { "epoch": 0.5931621584511877, "grad_norm": 0.41434159874916077, "learning_rate": 0.00026739926739926737, "loss": 0.7069, "step": 540 }, { "epoch": 0.5942606068927639, "grad_norm": 0.29941752552986145, "learning_rate": 0.0002672771672771672, "loss": 0.7444, "step": 541 }, { "epoch": 0.5953590553343402, "grad_norm": 1.8168927431106567, "learning_rate": 0.00026715506715506714, "loss": 0.4947, "step": 542 }, { "epoch": 0.5964575037759166, "grad_norm": 0.5639868974685669, "learning_rate": 0.000267032967032967, "loss": 0.6749, "step": 543 }, { "epoch": 0.5975559522174928, "grad_norm": 0.5054119229316711, "learning_rate": 0.0002669108669108669, "loss": 0.8075, "step": 544 }, { "epoch": 0.598654400659069, "grad_norm": 0.3531246483325958, "learning_rate": 0.00026678876678876677, "loss": 0.6986, "step": 545 }, { "epoch": 0.5997528491006453, "grad_norm": 0.36428287625312805, "learning_rate": 0.0002666666666666666, "loss": 0.6496, "step": 546 }, { "epoch": 0.6008512975422217, "grad_norm": 0.45706960558891296, "learning_rate": 0.00026654456654456654, "loss": 0.5646, "step": 547 }, { "epoch": 0.6019497459837979, "grad_norm": 0.39326363801956177, "learning_rate": 0.0002664224664224664, "loss": 0.5037, "step": 548 }, { "epoch": 0.6030481944253742, "grad_norm": 0.7158151268959045, "learning_rate": 0.00026630036630036625, "loss": 0.5643, "step": 549 }, { "epoch": 0.6041466428669504, "grad_norm": 0.398335337638855, "learning_rate": 0.00026617826617826617, "loss": 0.5462, "step": 550 }, { "epoch": 0.6052450913085267, "grad_norm": 0.8625812530517578, "learning_rate": 0.000266056166056166, "loss": 0.7898, "step": 551 }, { "epoch": 0.606343539750103, "grad_norm": 0.5558099150657654, "learning_rate": 0.0002659340659340659, "loss": 0.7968, "step": 552 }, { "epoch": 0.6074419881916793, "grad_norm": 0.6244741678237915, "learning_rate": 0.0002658119658119658, "loss": 0.9085, "step": 553 }, { "epoch": 0.6085404366332555, "grad_norm": 0.4907127916812897, "learning_rate": 0.00026568986568986565, "loss": 0.5683, "step": 554 }, { "epoch": 0.6096388850748318, "grad_norm": 0.6140159964561462, "learning_rate": 0.0002655677655677655, "loss": 0.5693, "step": 555 }, { "epoch": 0.610737333516408, "grad_norm": 0.41251274943351746, "learning_rate": 0.0002654456654456654, "loss": 0.728, "step": 556 }, { "epoch": 0.6118357819579844, "grad_norm": 0.43427684903144836, "learning_rate": 0.00026532356532356534, "loss": 0.5692, "step": 557 }, { "epoch": 0.6129342303995606, "grad_norm": 0.41471078991889954, "learning_rate": 0.00026520146520146514, "loss": 0.6616, "step": 558 }, { "epoch": 0.6140326788411369, "grad_norm": 0.4406953752040863, "learning_rate": 0.00026507936507936506, "loss": 0.4764, "step": 559 }, { "epoch": 0.6151311272827131, "grad_norm": 7.233060359954834, "learning_rate": 0.00026495726495726497, "loss": 0.6111, "step": 560 }, { "epoch": 0.6162295757242895, "grad_norm": 0.47008857131004333, "learning_rate": 0.0002648351648351648, "loss": 0.8145, "step": 561 }, { "epoch": 0.6173280241658657, "grad_norm": 0.47636717557907104, "learning_rate": 0.0002647130647130647, "loss": 0.8036, "step": 562 }, { "epoch": 0.618426472607442, "grad_norm": 0.526971161365509, "learning_rate": 0.0002645909645909646, "loss": 0.7559, "step": 563 }, { "epoch": 0.6195249210490182, "grad_norm": 0.5027382373809814, "learning_rate": 0.00026446886446886446, "loss": 0.7765, "step": 564 }, { "epoch": 0.6206233694905945, "grad_norm": 0.4222506284713745, "learning_rate": 0.0002643467643467643, "loss": 0.6376, "step": 565 }, { "epoch": 0.6217218179321709, "grad_norm": 0.6390372514724731, "learning_rate": 0.0002642246642246642, "loss": 0.8224, "step": 566 }, { "epoch": 0.6228202663737471, "grad_norm": 0.44495514035224915, "learning_rate": 0.0002641025641025641, "loss": 0.5995, "step": 567 }, { "epoch": 0.6239187148153233, "grad_norm": 0.7005137205123901, "learning_rate": 0.00026398046398046394, "loss": 0.4986, "step": 568 }, { "epoch": 0.6250171632568996, "grad_norm": 0.40745365619659424, "learning_rate": 0.0002638583638583638, "loss": 0.608, "step": 569 }, { "epoch": 0.6261156116984758, "grad_norm": 0.3449142277240753, "learning_rate": 0.0002637362637362637, "loss": 0.6253, "step": 570 }, { "epoch": 0.6272140601400522, "grad_norm": 0.4318457841873169, "learning_rate": 0.00026361416361416357, "loss": 0.6376, "step": 571 }, { "epoch": 0.6283125085816285, "grad_norm": 2.2202258110046387, "learning_rate": 0.00026349206349206343, "loss": 0.5477, "step": 572 }, { "epoch": 0.6294109570232047, "grad_norm": 0.6759721040725708, "learning_rate": 0.00026336996336996334, "loss": 1.1176, "step": 573 }, { "epoch": 0.630509405464781, "grad_norm": 1.7796927690505981, "learning_rate": 0.00026324786324786326, "loss": 0.8713, "step": 574 }, { "epoch": 0.6316078539063573, "grad_norm": 0.32952558994293213, "learning_rate": 0.0002631257631257631, "loss": 0.4711, "step": 575 }, { "epoch": 0.6327063023479336, "grad_norm": 0.40390628576278687, "learning_rate": 0.000263003663003663, "loss": 0.5412, "step": 576 }, { "epoch": 0.6338047507895098, "grad_norm": 0.7439208030700684, "learning_rate": 0.0002628815628815629, "loss": 0.7094, "step": 577 }, { "epoch": 0.6349031992310861, "grad_norm": 0.34505775570869446, "learning_rate": 0.00026275946275946274, "loss": 0.5939, "step": 578 }, { "epoch": 0.6360016476726623, "grad_norm": 0.9452011585235596, "learning_rate": 0.0002626373626373626, "loss": 0.5108, "step": 579 }, { "epoch": 0.6371000961142387, "grad_norm": 0.42789551615715027, "learning_rate": 0.0002625152625152625, "loss": 0.5661, "step": 580 }, { "epoch": 0.6381985445558149, "grad_norm": 0.3460575044155121, "learning_rate": 0.0002623931623931624, "loss": 0.8333, "step": 581 }, { "epoch": 0.6392969929973912, "grad_norm": 0.8932168483734131, "learning_rate": 0.00026227106227106223, "loss": 0.7058, "step": 582 }, { "epoch": 0.6403954414389674, "grad_norm": 0.8588842749595642, "learning_rate": 0.00026214896214896214, "loss": 0.6905, "step": 583 }, { "epoch": 0.6414938898805437, "grad_norm": 0.5097251534461975, "learning_rate": 0.000262026862026862, "loss": 0.8189, "step": 584 }, { "epoch": 0.64259233832212, "grad_norm": 0.45746755599975586, "learning_rate": 0.00026190476190476186, "loss": 0.7212, "step": 585 }, { "epoch": 0.6436907867636963, "grad_norm": 0.9576689600944519, "learning_rate": 0.0002617826617826618, "loss": 0.6159, "step": 586 }, { "epoch": 0.6447892352052725, "grad_norm": 0.5721899271011353, "learning_rate": 0.00026166056166056163, "loss": 0.6083, "step": 587 }, { "epoch": 0.6458876836468488, "grad_norm": 0.4851115942001343, "learning_rate": 0.00026153846153846154, "loss": 0.7678, "step": 588 }, { "epoch": 0.6469861320884251, "grad_norm": 0.6631761193275452, "learning_rate": 0.0002614163614163614, "loss": 0.7068, "step": 589 }, { "epoch": 0.6480845805300014, "grad_norm": 0.6862382292747498, "learning_rate": 0.00026129426129426126, "loss": 0.5766, "step": 590 }, { "epoch": 0.6491830289715776, "grad_norm": 0.3754968047142029, "learning_rate": 0.0002611721611721612, "loss": 0.7254, "step": 591 }, { "epoch": 0.6502814774131539, "grad_norm": 0.5239700078964233, "learning_rate": 0.00026105006105006103, "loss": 0.5777, "step": 592 }, { "epoch": 0.6513799258547301, "grad_norm": 0.5103443264961243, "learning_rate": 0.0002609279609279609, "loss": 1.0006, "step": 593 }, { "epoch": 0.6524783742963065, "grad_norm": 0.4733884632587433, "learning_rate": 0.0002608058608058608, "loss": 0.6851, "step": 594 }, { "epoch": 0.6535768227378828, "grad_norm": 0.5982065796852112, "learning_rate": 0.00026068376068376066, "loss": 0.6295, "step": 595 }, { "epoch": 0.654675271179459, "grad_norm": 1.2408190965652466, "learning_rate": 0.0002605616605616605, "loss": 0.8806, "step": 596 }, { "epoch": 0.6557737196210353, "grad_norm": 0.6005455851554871, "learning_rate": 0.00026043956043956043, "loss": 0.7186, "step": 597 }, { "epoch": 0.6568721680626116, "grad_norm": 0.33777105808258057, "learning_rate": 0.0002603174603174603, "loss": 0.4599, "step": 598 }, { "epoch": 0.6579706165041879, "grad_norm": 0.5336529612541199, "learning_rate": 0.00026019536019536015, "loss": 0.553, "step": 599 }, { "epoch": 0.6590690649457641, "grad_norm": 0.6930931806564331, "learning_rate": 0.00026007326007326006, "loss": 0.5686, "step": 600 }, { "epoch": 0.6601675133873404, "grad_norm": 1.1340439319610596, "learning_rate": 0.0002599511599511599, "loss": 0.5886, "step": 601 }, { "epoch": 0.6612659618289166, "grad_norm": 0.9833797812461853, "learning_rate": 0.0002598290598290598, "loss": 0.7109, "step": 602 }, { "epoch": 0.662364410270493, "grad_norm": 0.9305315017700195, "learning_rate": 0.0002597069597069597, "loss": 0.8341, "step": 603 }, { "epoch": 0.6634628587120692, "grad_norm": 0.9753265380859375, "learning_rate": 0.00025958485958485955, "loss": 0.7102, "step": 604 }, { "epoch": 0.6645613071536455, "grad_norm": 2.2342822551727295, "learning_rate": 0.00025946275946275946, "loss": 0.6784, "step": 605 }, { "epoch": 0.6656597555952217, "grad_norm": 0.6815157532691956, "learning_rate": 0.0002593406593406593, "loss": 0.7689, "step": 606 }, { "epoch": 0.666758204036798, "grad_norm": 0.7792591452598572, "learning_rate": 0.0002592185592185592, "loss": 0.9444, "step": 607 }, { "epoch": 0.6678566524783743, "grad_norm": 0.668251097202301, "learning_rate": 0.0002590964590964591, "loss": 0.6899, "step": 608 }, { "epoch": 0.6689551009199506, "grad_norm": 0.5041349530220032, "learning_rate": 0.00025897435897435895, "loss": 0.652, "step": 609 }, { "epoch": 0.6700535493615268, "grad_norm": 0.35069939494132996, "learning_rate": 0.0002588522588522588, "loss": 0.8102, "step": 610 }, { "epoch": 0.6711519978031031, "grad_norm": 3.324793577194214, "learning_rate": 0.0002587301587301587, "loss": 0.7936, "step": 611 }, { "epoch": 0.6722504462446794, "grad_norm": 0.6778903007507324, "learning_rate": 0.0002586080586080586, "loss": 0.6258, "step": 612 }, { "epoch": 0.6733488946862557, "grad_norm": 3.034745454788208, "learning_rate": 0.00025848595848595844, "loss": 0.697, "step": 613 }, { "epoch": 0.6744473431278319, "grad_norm": 2.563870429992676, "learning_rate": 0.00025836385836385835, "loss": 0.7596, "step": 614 }, { "epoch": 0.6755457915694082, "grad_norm": 0.45592913031578064, "learning_rate": 0.0002582417582417582, "loss": 0.7753, "step": 615 }, { "epoch": 0.6766442400109844, "grad_norm": 0.7209720015525818, "learning_rate": 0.00025811965811965807, "loss": 0.6907, "step": 616 }, { "epoch": 0.6777426884525608, "grad_norm": 0.4611949026584625, "learning_rate": 0.000257997557997558, "loss": 0.5896, "step": 617 }, { "epoch": 0.678841136894137, "grad_norm": 1.3885395526885986, "learning_rate": 0.0002578754578754579, "loss": 0.6344, "step": 618 }, { "epoch": 0.6799395853357133, "grad_norm": 0.544572651386261, "learning_rate": 0.00025775335775335775, "loss": 0.586, "step": 619 }, { "epoch": 0.6810380337772896, "grad_norm": 0.5637034177780151, "learning_rate": 0.0002576312576312576, "loss": 0.8284, "step": 620 }, { "epoch": 0.6821364822188658, "grad_norm": 1.170779824256897, "learning_rate": 0.00025750915750915747, "loss": 0.8818, "step": 621 }, { "epoch": 0.6832349306604422, "grad_norm": 0.4877263605594635, "learning_rate": 0.0002573870573870574, "loss": 0.9179, "step": 622 }, { "epoch": 0.6843333791020184, "grad_norm": 0.6684415340423584, "learning_rate": 0.00025726495726495724, "loss": 0.7358, "step": 623 }, { "epoch": 0.6854318275435947, "grad_norm": 0.6679075956344604, "learning_rate": 0.0002571428571428571, "loss": 0.6342, "step": 624 }, { "epoch": 0.6865302759851709, "grad_norm": 0.65242600440979, "learning_rate": 0.000257020757020757, "loss": 0.4762, "step": 625 }, { "epoch": 0.6876287244267473, "grad_norm": 0.806523859500885, "learning_rate": 0.00025689865689865687, "loss": 0.7621, "step": 626 }, { "epoch": 0.6887271728683235, "grad_norm": 1.09652578830719, "learning_rate": 0.0002567765567765567, "loss": 0.6594, "step": 627 }, { "epoch": 0.6898256213098998, "grad_norm": 0.412505179643631, "learning_rate": 0.00025665445665445664, "loss": 0.8026, "step": 628 }, { "epoch": 0.690924069751476, "grad_norm": 0.5801676511764526, "learning_rate": 0.0002565323565323565, "loss": 0.7026, "step": 629 }, { "epoch": 0.6920225181930523, "grad_norm": 0.6822883486747742, "learning_rate": 0.00025641025641025636, "loss": 0.4372, "step": 630 }, { "epoch": 0.6931209666346286, "grad_norm": 0.3455508351325989, "learning_rate": 0.00025628815628815627, "loss": 0.5624, "step": 631 }, { "epoch": 0.6942194150762049, "grad_norm": 0.3533216714859009, "learning_rate": 0.0002561660561660562, "loss": 0.7493, "step": 632 }, { "epoch": 0.6953178635177811, "grad_norm": 1.4306656122207642, "learning_rate": 0.000256043956043956, "loss": 0.7537, "step": 633 }, { "epoch": 0.6964163119593574, "grad_norm": 0.336393266916275, "learning_rate": 0.0002559218559218559, "loss": 0.787, "step": 634 }, { "epoch": 0.6975147604009336, "grad_norm": 0.5303547382354736, "learning_rate": 0.0002557997557997558, "loss": 0.5604, "step": 635 }, { "epoch": 0.69861320884251, "grad_norm": 0.5421821475028992, "learning_rate": 0.00025567765567765567, "loss": 0.6905, "step": 636 }, { "epoch": 0.6997116572840862, "grad_norm": 0.5445061922073364, "learning_rate": 0.00025555555555555553, "loss": 0.6389, "step": 637 }, { "epoch": 0.7008101057256625, "grad_norm": 0.42832881212234497, "learning_rate": 0.00025543345543345544, "loss": 0.7825, "step": 638 }, { "epoch": 0.7019085541672387, "grad_norm": 1.4624862670898438, "learning_rate": 0.0002553113553113553, "loss": 0.4964, "step": 639 }, { "epoch": 0.7030070026088151, "grad_norm": 0.38657426834106445, "learning_rate": 0.00025518925518925516, "loss": 0.5299, "step": 640 }, { "epoch": 0.7041054510503914, "grad_norm": 14.422834396362305, "learning_rate": 0.00025506715506715507, "loss": 0.5008, "step": 641 }, { "epoch": 0.7052038994919676, "grad_norm": 0.591106653213501, "learning_rate": 0.00025494505494505493, "loss": 0.6732, "step": 642 }, { "epoch": 0.7063023479335439, "grad_norm": 1.6697375774383545, "learning_rate": 0.0002548229548229548, "loss": 0.6782, "step": 643 }, { "epoch": 0.7074007963751201, "grad_norm": 1.670777678489685, "learning_rate": 0.0002547008547008547, "loss": 0.5275, "step": 644 }, { "epoch": 0.7084992448166965, "grad_norm": 2.3361563682556152, "learning_rate": 0.00025457875457875456, "loss": 0.4177, "step": 645 }, { "epoch": 0.7095976932582727, "grad_norm": 1.823844313621521, "learning_rate": 0.0002544566544566544, "loss": 0.5438, "step": 646 }, { "epoch": 0.710696141699849, "grad_norm": 0.5374146699905396, "learning_rate": 0.0002543345543345543, "loss": 0.6704, "step": 647 }, { "epoch": 0.7117945901414252, "grad_norm": 0.9709361791610718, "learning_rate": 0.0002542124542124542, "loss": 0.8896, "step": 648 }, { "epoch": 0.7128930385830015, "grad_norm": 0.7118197083473206, "learning_rate": 0.0002540903540903541, "loss": 0.766, "step": 649 }, { "epoch": 0.7139914870245778, "grad_norm": 0.4597225487232208, "learning_rate": 0.00025396825396825396, "loss": 0.7498, "step": 650 }, { "epoch": 0.7150899354661541, "grad_norm": 0.9708977937698364, "learning_rate": 0.0002538461538461538, "loss": 0.7602, "step": 651 }, { "epoch": 0.7161883839077303, "grad_norm": 0.8156960606575012, "learning_rate": 0.00025372405372405373, "loss": 1.1105, "step": 652 }, { "epoch": 0.7172868323493066, "grad_norm": 1.4135644435882568, "learning_rate": 0.0002536019536019536, "loss": 0.9203, "step": 653 }, { "epoch": 0.7183852807908829, "grad_norm": 0.5754226446151733, "learning_rate": 0.00025347985347985344, "loss": 0.5368, "step": 654 }, { "epoch": 0.7194837292324592, "grad_norm": 1.7644588947296143, "learning_rate": 0.00025335775335775336, "loss": 0.6451, "step": 655 }, { "epoch": 0.7205821776740354, "grad_norm": 4.35576868057251, "learning_rate": 0.0002532356532356532, "loss": 0.6732, "step": 656 }, { "epoch": 0.7216806261156117, "grad_norm": 1.1072558164596558, "learning_rate": 0.0002531135531135531, "loss": 0.7901, "step": 657 }, { "epoch": 0.7227790745571879, "grad_norm": 0.3916113078594208, "learning_rate": 0.000252991452991453, "loss": 0.7153, "step": 658 }, { "epoch": 0.7238775229987643, "grad_norm": 1.055137276649475, "learning_rate": 0.00025286935286935285, "loss": 0.8664, "step": 659 }, { "epoch": 0.7249759714403405, "grad_norm": 0.5966087579727173, "learning_rate": 0.0002527472527472527, "loss": 0.933, "step": 660 }, { "epoch": 0.7260744198819168, "grad_norm": 0.40958529710769653, "learning_rate": 0.0002526251526251526, "loss": 0.7196, "step": 661 }, { "epoch": 0.727172868323493, "grad_norm": 0.4636710584163666, "learning_rate": 0.0002525030525030525, "loss": 0.7039, "step": 662 }, { "epoch": 0.7282713167650693, "grad_norm": 0.6967337131500244, "learning_rate": 0.0002523809523809524, "loss": 0.8981, "step": 663 }, { "epoch": 0.7293697652066456, "grad_norm": 0.49781784415245056, "learning_rate": 0.00025225885225885225, "loss": 0.7239, "step": 664 }, { "epoch": 0.7304682136482219, "grad_norm": 0.940851628780365, "learning_rate": 0.0002521367521367521, "loss": 0.8199, "step": 665 }, { "epoch": 0.7315666620897981, "grad_norm": 1.0271226167678833, "learning_rate": 0.000252014652014652, "loss": 0.6757, "step": 666 }, { "epoch": 0.7326651105313744, "grad_norm": 0.5299912095069885, "learning_rate": 0.0002518925518925519, "loss": 0.8464, "step": 667 }, { "epoch": 0.7337635589729508, "grad_norm": 0.7060052156448364, "learning_rate": 0.00025177045177045173, "loss": 0.6541, "step": 668 }, { "epoch": 0.734862007414527, "grad_norm": 0.5419691205024719, "learning_rate": 0.00025164835164835165, "loss": 0.8741, "step": 669 }, { "epoch": 0.7359604558561033, "grad_norm": 0.6363463401794434, "learning_rate": 0.0002515262515262515, "loss": 0.7224, "step": 670 }, { "epoch": 0.7370589042976795, "grad_norm": 0.7622922658920288, "learning_rate": 0.00025140415140415136, "loss": 0.9402, "step": 671 }, { "epoch": 0.7381573527392558, "grad_norm": 0.7477490305900574, "learning_rate": 0.0002512820512820513, "loss": 0.6036, "step": 672 }, { "epoch": 0.7392558011808321, "grad_norm": 0.4813562333583832, "learning_rate": 0.00025115995115995113, "loss": 0.5982, "step": 673 }, { "epoch": 0.7403542496224084, "grad_norm": 3.112766981124878, "learning_rate": 0.000251037851037851, "loss": 0.5825, "step": 674 }, { "epoch": 0.7414526980639846, "grad_norm": 0.9523088932037354, "learning_rate": 0.0002509157509157509, "loss": 0.5698, "step": 675 }, { "epoch": 0.7425511465055609, "grad_norm": 0.3426001965999603, "learning_rate": 0.00025079365079365076, "loss": 0.5516, "step": 676 }, { "epoch": 0.7436495949471371, "grad_norm": 0.4866350591182709, "learning_rate": 0.0002506715506715506, "loss": 0.5466, "step": 677 }, { "epoch": 0.7447480433887135, "grad_norm": 0.6590595245361328, "learning_rate": 0.00025054945054945053, "loss": 0.7579, "step": 678 }, { "epoch": 0.7458464918302897, "grad_norm": 0.36733704805374146, "learning_rate": 0.0002504273504273504, "loss": 0.5114, "step": 679 }, { "epoch": 0.746944940271866, "grad_norm": 0.5890951156616211, "learning_rate": 0.0002503052503052503, "loss": 0.7196, "step": 680 }, { "epoch": 0.7480433887134422, "grad_norm": 0.8393438458442688, "learning_rate": 0.00025018315018315016, "loss": 0.6291, "step": 681 }, { "epoch": 0.7491418371550186, "grad_norm": 0.9745636582374573, "learning_rate": 0.00025006105006105, "loss": 0.8675, "step": 682 }, { "epoch": 0.7502402855965948, "grad_norm": 1.1764310598373413, "learning_rate": 0.00024993894993894993, "loss": 0.9384, "step": 683 }, { "epoch": 0.7513387340381711, "grad_norm": 0.6199970245361328, "learning_rate": 0.0002498168498168498, "loss": 0.5984, "step": 684 }, { "epoch": 0.7524371824797473, "grad_norm": 2.2708802223205566, "learning_rate": 0.00024969474969474965, "loss": 0.7867, "step": 685 }, { "epoch": 0.7535356309213236, "grad_norm": 0.6731462478637695, "learning_rate": 0.00024957264957264956, "loss": 0.5377, "step": 686 }, { "epoch": 0.7546340793629, "grad_norm": 0.991669774055481, "learning_rate": 0.0002494505494505494, "loss": 0.7015, "step": 687 }, { "epoch": 0.7557325278044762, "grad_norm": 0.5873506665229797, "learning_rate": 0.0002493284493284493, "loss": 0.567, "step": 688 }, { "epoch": 0.7568309762460524, "grad_norm": 1.5025473833084106, "learning_rate": 0.0002492063492063492, "loss": 0.6264, "step": 689 }, { "epoch": 0.7579294246876287, "grad_norm": 0.4942665696144104, "learning_rate": 0.00024908424908424905, "loss": 0.7623, "step": 690 }, { "epoch": 0.7590278731292049, "grad_norm": 0.5522105693817139, "learning_rate": 0.0002489621489621489, "loss": 0.6192, "step": 691 }, { "epoch": 0.7601263215707813, "grad_norm": 1.25243079662323, "learning_rate": 0.0002488400488400488, "loss": 0.8547, "step": 692 }, { "epoch": 0.7612247700123576, "grad_norm": 0.5228685140609741, "learning_rate": 0.00024871794871794874, "loss": 0.7365, "step": 693 }, { "epoch": 0.7623232184539338, "grad_norm": 1.5090827941894531, "learning_rate": 0.0002485958485958486, "loss": 0.9226, "step": 694 }, { "epoch": 0.76342166689551, "grad_norm": 3.3617379665374756, "learning_rate": 0.00024847374847374845, "loss": 0.7942, "step": 695 }, { "epoch": 0.7645201153370864, "grad_norm": 0.5350137948989868, "learning_rate": 0.0002483516483516483, "loss": 0.6254, "step": 696 }, { "epoch": 0.7656185637786627, "grad_norm": 0.8871312141418457, "learning_rate": 0.0002482295482295482, "loss": 0.8241, "step": 697 }, { "epoch": 0.7667170122202389, "grad_norm": 0.48593926429748535, "learning_rate": 0.0002481074481074481, "loss": 0.5707, "step": 698 }, { "epoch": 0.7678154606618152, "grad_norm": 0.7460000514984131, "learning_rate": 0.00024798534798534794, "loss": 0.9521, "step": 699 }, { "epoch": 0.7689139091033914, "grad_norm": 0.7105034589767456, "learning_rate": 0.00024786324786324785, "loss": 0.7513, "step": 700 }, { "epoch": 0.7700123575449678, "grad_norm": 0.40251481533050537, "learning_rate": 0.0002477411477411477, "loss": 0.6067, "step": 701 }, { "epoch": 0.771110805986544, "grad_norm": 0.452709436416626, "learning_rate": 0.00024761904761904757, "loss": 0.671, "step": 702 }, { "epoch": 0.7722092544281203, "grad_norm": 0.581453263759613, "learning_rate": 0.0002474969474969475, "loss": 0.5356, "step": 703 }, { "epoch": 0.7733077028696965, "grad_norm": 0.8013669848442078, "learning_rate": 0.00024737484737484734, "loss": 0.6889, "step": 704 }, { "epoch": 0.7744061513112728, "grad_norm": 1.1480565071105957, "learning_rate": 0.0002472527472527472, "loss": 0.7456, "step": 705 }, { "epoch": 0.7755045997528491, "grad_norm": 0.7568329572677612, "learning_rate": 0.0002471306471306471, "loss": 0.7455, "step": 706 }, { "epoch": 0.7766030481944254, "grad_norm": 0.4223226308822632, "learning_rate": 0.000247008547008547, "loss": 0.7138, "step": 707 }, { "epoch": 0.7777014966360016, "grad_norm": 0.372872531414032, "learning_rate": 0.00024688644688644683, "loss": 0.8037, "step": 708 }, { "epoch": 0.7787999450775779, "grad_norm": 0.968614399433136, "learning_rate": 0.00024676434676434674, "loss": 0.5943, "step": 709 }, { "epoch": 0.7798983935191542, "grad_norm": 0.801157534122467, "learning_rate": 0.00024664224664224665, "loss": 0.9467, "step": 710 }, { "epoch": 0.7809968419607305, "grad_norm": 0.7115808129310608, "learning_rate": 0.0002465201465201465, "loss": 0.7828, "step": 711 }, { "epoch": 0.7820952904023067, "grad_norm": 1.2951349020004272, "learning_rate": 0.00024639804639804637, "loss": 0.6221, "step": 712 }, { "epoch": 0.783193738843883, "grad_norm": 0.47706693410873413, "learning_rate": 0.0002462759462759463, "loss": 0.3641, "step": 713 }, { "epoch": 0.7842921872854592, "grad_norm": 0.8871097564697266, "learning_rate": 0.00024615384615384614, "loss": 0.6177, "step": 714 }, { "epoch": 0.7853906357270356, "grad_norm": 0.7920973896980286, "learning_rate": 0.000246031746031746, "loss": 0.5858, "step": 715 }, { "epoch": 0.7864890841686119, "grad_norm": 0.49732694029808044, "learning_rate": 0.0002459096459096459, "loss": 0.5176, "step": 716 }, { "epoch": 0.7875875326101881, "grad_norm": 0.34965720772743225, "learning_rate": 0.00024578754578754577, "loss": 0.4983, "step": 717 }, { "epoch": 0.7886859810517644, "grad_norm": 0.45963025093078613, "learning_rate": 0.00024566544566544563, "loss": 0.7756, "step": 718 }, { "epoch": 0.7897844294933407, "grad_norm": 0.5802373290061951, "learning_rate": 0.00024554334554334554, "loss": 0.5773, "step": 719 }, { "epoch": 0.790882877934917, "grad_norm": 1.8482742309570312, "learning_rate": 0.0002454212454212454, "loss": 0.7978, "step": 720 }, { "epoch": 0.7919813263764932, "grad_norm": 0.5821959972381592, "learning_rate": 0.00024529914529914526, "loss": 0.7483, "step": 721 }, { "epoch": 0.7930797748180695, "grad_norm": 0.9352701306343079, "learning_rate": 0.0002451770451770451, "loss": 0.6979, "step": 722 }, { "epoch": 0.7941782232596457, "grad_norm": 0.554032564163208, "learning_rate": 0.00024505494505494503, "loss": 0.6773, "step": 723 }, { "epoch": 0.7952766717012221, "grad_norm": 0.6914504766464233, "learning_rate": 0.00024493284493284494, "loss": 0.6548, "step": 724 }, { "epoch": 0.7963751201427983, "grad_norm": 0.40804949402809143, "learning_rate": 0.0002448107448107448, "loss": 0.4634, "step": 725 }, { "epoch": 0.7974735685843746, "grad_norm": 0.4965716302394867, "learning_rate": 0.00024468864468864466, "loss": 0.4879, "step": 726 }, { "epoch": 0.7985720170259508, "grad_norm": 0.48798999190330505, "learning_rate": 0.00024456654456654457, "loss": 0.7003, "step": 727 }, { "epoch": 0.7996704654675271, "grad_norm": 0.6946013569831848, "learning_rate": 0.00024444444444444443, "loss": 0.7508, "step": 728 }, { "epoch": 0.8007689139091034, "grad_norm": 0.4310678243637085, "learning_rate": 0.0002443223443223443, "loss": 0.5765, "step": 729 }, { "epoch": 0.8018673623506797, "grad_norm": 0.5407636761665344, "learning_rate": 0.0002442002442002442, "loss": 0.5445, "step": 730 }, { "epoch": 0.8029658107922559, "grad_norm": 0.6281490921974182, "learning_rate": 0.00024407814407814403, "loss": 0.9319, "step": 731 }, { "epoch": 0.8040642592338322, "grad_norm": 1.2027008533477783, "learning_rate": 0.00024395604395604394, "loss": 0.3957, "step": 732 }, { "epoch": 0.8051627076754085, "grad_norm": 0.543230414390564, "learning_rate": 0.00024383394383394383, "loss": 0.7919, "step": 733 }, { "epoch": 0.8062611561169848, "grad_norm": 0.4269828498363495, "learning_rate": 0.0002437118437118437, "loss": 0.6081, "step": 734 }, { "epoch": 0.807359604558561, "grad_norm": 1.2857966423034668, "learning_rate": 0.00024358974358974357, "loss": 0.8654, "step": 735 }, { "epoch": 0.8084580530001373, "grad_norm": 0.6370485424995422, "learning_rate": 0.00024346764346764346, "loss": 0.8053, "step": 736 }, { "epoch": 0.8095565014417135, "grad_norm": 1.1288559436798096, "learning_rate": 0.00024334554334554332, "loss": 0.8709, "step": 737 }, { "epoch": 0.8106549498832899, "grad_norm": 0.5601497292518616, "learning_rate": 0.0002432234432234432, "loss": 0.7982, "step": 738 }, { "epoch": 0.8117533983248661, "grad_norm": 0.476745069026947, "learning_rate": 0.0002431013431013431, "loss": 0.7372, "step": 739 }, { "epoch": 0.8128518467664424, "grad_norm": 0.4287762939929962, "learning_rate": 0.00024297924297924295, "loss": 0.5686, "step": 740 }, { "epoch": 0.8139502952080186, "grad_norm": 0.7039306163787842, "learning_rate": 0.00024285714285714283, "loss": 0.7976, "step": 741 }, { "epoch": 0.8150487436495949, "grad_norm": 0.47433528304100037, "learning_rate": 0.00024273504273504272, "loss": 0.6375, "step": 742 }, { "epoch": 0.8161471920911713, "grad_norm": 0.5443944931030273, "learning_rate": 0.00024261294261294258, "loss": 0.6793, "step": 743 }, { "epoch": 0.8172456405327475, "grad_norm": 0.516094982624054, "learning_rate": 0.00024249084249084246, "loss": 0.785, "step": 744 }, { "epoch": 0.8183440889743238, "grad_norm": 0.6694304347038269, "learning_rate": 0.00024236874236874237, "loss": 0.5431, "step": 745 }, { "epoch": 0.8194425374159, "grad_norm": 0.5309669375419617, "learning_rate": 0.00024224664224664223, "loss": 0.5806, "step": 746 }, { "epoch": 0.8205409858574764, "grad_norm": 0.5502971410751343, "learning_rate": 0.00024212454212454212, "loss": 0.5053, "step": 747 }, { "epoch": 0.8216394342990526, "grad_norm": 0.5242869853973389, "learning_rate": 0.00024200244200244198, "loss": 0.8189, "step": 748 }, { "epoch": 0.8227378827406289, "grad_norm": 0.4131311774253845, "learning_rate": 0.00024188034188034186, "loss": 0.7074, "step": 749 }, { "epoch": 0.8238363311822051, "grad_norm": 0.599915087223053, "learning_rate": 0.00024175824175824175, "loss": 0.9408, "step": 750 }, { "epoch": 0.8249347796237814, "grad_norm": 0.3683515191078186, "learning_rate": 0.0002416361416361416, "loss": 0.6675, "step": 751 }, { "epoch": 0.8260332280653577, "grad_norm": 1.633415699005127, "learning_rate": 0.0002415140415140415, "loss": 0.6768, "step": 752 }, { "epoch": 0.827131676506934, "grad_norm": 0.3848377764225006, "learning_rate": 0.00024139194139194138, "loss": 0.485, "step": 753 }, { "epoch": 0.8282301249485102, "grad_norm": 0.4116027355194092, "learning_rate": 0.00024126984126984123, "loss": 0.8253, "step": 754 }, { "epoch": 0.8293285733900865, "grad_norm": 0.5805407762527466, "learning_rate": 0.00024114774114774112, "loss": 0.825, "step": 755 }, { "epoch": 0.8304270218316627, "grad_norm": 1.2401742935180664, "learning_rate": 0.000241025641025641, "loss": 0.6394, "step": 756 }, { "epoch": 0.8315254702732391, "grad_norm": 0.42345038056373596, "learning_rate": 0.00024090354090354086, "loss": 0.6958, "step": 757 }, { "epoch": 0.8326239187148153, "grad_norm": 1.3758116960525513, "learning_rate": 0.00024078144078144075, "loss": 0.6997, "step": 758 }, { "epoch": 0.8337223671563916, "grad_norm": 1.1826672554016113, "learning_rate": 0.00024065934065934066, "loss": 0.7908, "step": 759 }, { "epoch": 0.8348208155979678, "grad_norm": 1.0752373933792114, "learning_rate": 0.0002405372405372405, "loss": 0.8896, "step": 760 }, { "epoch": 0.8359192640395442, "grad_norm": 0.3347112834453583, "learning_rate": 0.0002404151404151404, "loss": 0.8202, "step": 761 }, { "epoch": 0.8370177124811204, "grad_norm": 0.5837082266807556, "learning_rate": 0.0002402930402930403, "loss": 0.7502, "step": 762 }, { "epoch": 0.8381161609226967, "grad_norm": 0.5439388751983643, "learning_rate": 0.00024017094017094015, "loss": 0.6928, "step": 763 }, { "epoch": 0.839214609364273, "grad_norm": 0.35348060727119446, "learning_rate": 0.00024004884004884004, "loss": 0.5495, "step": 764 }, { "epoch": 0.8403130578058492, "grad_norm": 0.4943974018096924, "learning_rate": 0.00023992673992673992, "loss": 0.9218, "step": 765 }, { "epoch": 0.8414115062474256, "grad_norm": 0.628667414188385, "learning_rate": 0.00023980463980463978, "loss": 0.6266, "step": 766 }, { "epoch": 0.8425099546890018, "grad_norm": 0.822575032711029, "learning_rate": 0.00023968253968253966, "loss": 0.791, "step": 767 }, { "epoch": 0.843608403130578, "grad_norm": 0.3044184446334839, "learning_rate": 0.00023956043956043955, "loss": 0.6048, "step": 768 }, { "epoch": 0.8447068515721543, "grad_norm": 0.40807369351387024, "learning_rate": 0.0002394383394383394, "loss": 0.6286, "step": 769 }, { "epoch": 0.8458053000137306, "grad_norm": 1.2373838424682617, "learning_rate": 0.0002393162393162393, "loss": 0.5133, "step": 770 }, { "epoch": 0.8469037484553069, "grad_norm": 0.5104987025260925, "learning_rate": 0.00023919413919413918, "loss": 0.591, "step": 771 }, { "epoch": 0.8480021968968832, "grad_norm": 0.6644220352172852, "learning_rate": 0.00023907203907203904, "loss": 0.7039, "step": 772 }, { "epoch": 0.8491006453384594, "grad_norm": 0.5887960195541382, "learning_rate": 0.00023894993894993892, "loss": 0.7017, "step": 773 }, { "epoch": 0.8501990937800357, "grad_norm": 0.6568577885627747, "learning_rate": 0.00023882783882783878, "loss": 0.6131, "step": 774 }, { "epoch": 0.851297542221612, "grad_norm": 0.6594721674919128, "learning_rate": 0.00023870573870573867, "loss": 0.6079, "step": 775 }, { "epoch": 0.8523959906631883, "grad_norm": 12.29937744140625, "learning_rate": 0.00023858363858363858, "loss": 1.1068, "step": 776 }, { "epoch": 0.8534944391047645, "grad_norm": 1.175355315208435, "learning_rate": 0.00023846153846153844, "loss": 0.734, "step": 777 }, { "epoch": 0.8545928875463408, "grad_norm": 1.7128019332885742, "learning_rate": 0.00023833943833943832, "loss": 0.6395, "step": 778 }, { "epoch": 0.855691335987917, "grad_norm": 0.6479717493057251, "learning_rate": 0.0002382173382173382, "loss": 0.8572, "step": 779 }, { "epoch": 0.8567897844294934, "grad_norm": 0.9646544456481934, "learning_rate": 0.00023809523809523807, "loss": 1.1168, "step": 780 }, { "epoch": 0.8578882328710696, "grad_norm": 0.8290930986404419, "learning_rate": 0.00023797313797313795, "loss": 0.4413, "step": 781 }, { "epoch": 0.8589866813126459, "grad_norm": 0.6690389513969421, "learning_rate": 0.00023785103785103784, "loss": 1.1878, "step": 782 }, { "epoch": 0.8600851297542221, "grad_norm": 0.6602356433868408, "learning_rate": 0.0002377289377289377, "loss": 0.5862, "step": 783 }, { "epoch": 0.8611835781957984, "grad_norm": 0.612316370010376, "learning_rate": 0.00023760683760683758, "loss": 0.7971, "step": 784 }, { "epoch": 0.8622820266373747, "grad_norm": 0.7429434657096863, "learning_rate": 0.00023748473748473747, "loss": 0.6265, "step": 785 }, { "epoch": 0.863380475078951, "grad_norm": 0.40107640624046326, "learning_rate": 0.00023736263736263733, "loss": 0.6697, "step": 786 }, { "epoch": 0.8644789235205272, "grad_norm": 0.45808035135269165, "learning_rate": 0.0002372405372405372, "loss": 0.7443, "step": 787 }, { "epoch": 0.8655773719621035, "grad_norm": 0.36327049136161804, "learning_rate": 0.0002371184371184371, "loss": 0.6518, "step": 788 }, { "epoch": 0.8666758204036799, "grad_norm": 0.45617833733558655, "learning_rate": 0.00023699633699633696, "loss": 0.792, "step": 789 }, { "epoch": 0.8677742688452561, "grad_norm": 0.5354835391044617, "learning_rate": 0.00023687423687423687, "loss": 0.7788, "step": 790 }, { "epoch": 0.8688727172868324, "grad_norm": 0.9770327210426331, "learning_rate": 0.00023675213675213675, "loss": 0.7267, "step": 791 }, { "epoch": 0.8699711657284086, "grad_norm": 0.646757960319519, "learning_rate": 0.0002366300366300366, "loss": 0.7234, "step": 792 }, { "epoch": 0.8710696141699849, "grad_norm": 0.4694693982601166, "learning_rate": 0.0002365079365079365, "loss": 0.8261, "step": 793 }, { "epoch": 0.8721680626115612, "grad_norm": 0.9923954606056213, "learning_rate": 0.00023638583638583638, "loss": 0.703, "step": 794 }, { "epoch": 0.8732665110531375, "grad_norm": 1.6440534591674805, "learning_rate": 0.00023626373626373624, "loss": 0.7654, "step": 795 }, { "epoch": 0.8743649594947137, "grad_norm": 0.3947128653526306, "learning_rate": 0.00023614163614163613, "loss": 0.637, "step": 796 }, { "epoch": 0.87546340793629, "grad_norm": 3.4264323711395264, "learning_rate": 0.000236019536019536, "loss": 0.7325, "step": 797 }, { "epoch": 0.8765618563778662, "grad_norm": 0.5469256043434143, "learning_rate": 0.00023589743589743587, "loss": 0.8203, "step": 798 }, { "epoch": 0.8776603048194426, "grad_norm": 0.5184471011161804, "learning_rate": 0.00023577533577533576, "loss": 0.7895, "step": 799 }, { "epoch": 0.8787587532610188, "grad_norm": 0.8231347799301147, "learning_rate": 0.00023565323565323562, "loss": 0.7888, "step": 800 }, { "epoch": 0.8798572017025951, "grad_norm": 14.826855659484863, "learning_rate": 0.0002355311355311355, "loss": 0.7564, "step": 801 }, { "epoch": 0.8809556501441713, "grad_norm": 0.5809927582740784, "learning_rate": 0.00023540903540903539, "loss": 0.6702, "step": 802 }, { "epoch": 0.8820540985857477, "grad_norm": 0.7244674563407898, "learning_rate": 0.00023528693528693524, "loss": 0.6475, "step": 803 }, { "epoch": 0.8831525470273239, "grad_norm": 0.8071272373199463, "learning_rate": 0.00023516483516483513, "loss": 0.7434, "step": 804 }, { "epoch": 0.8842509954689002, "grad_norm": 0.6872429847717285, "learning_rate": 0.00023504273504273504, "loss": 0.5968, "step": 805 }, { "epoch": 0.8853494439104764, "grad_norm": 9.353965759277344, "learning_rate": 0.00023492063492063487, "loss": 0.4228, "step": 806 }, { "epoch": 0.8864478923520527, "grad_norm": 0.47151222825050354, "learning_rate": 0.00023479853479853479, "loss": 0.6832, "step": 807 }, { "epoch": 0.887546340793629, "grad_norm": 1.4599422216415405, "learning_rate": 0.00023467643467643467, "loss": 0.6692, "step": 808 }, { "epoch": 0.8886447892352053, "grad_norm": 0.45811519026756287, "learning_rate": 0.00023455433455433453, "loss": 0.787, "step": 809 }, { "epoch": 0.8897432376767815, "grad_norm": 1.077709674835205, "learning_rate": 0.00023443223443223442, "loss": 0.6695, "step": 810 }, { "epoch": 0.8908416861183578, "grad_norm": 0.5702061057090759, "learning_rate": 0.0002343101343101343, "loss": 0.5858, "step": 811 }, { "epoch": 0.891940134559934, "grad_norm": 2.2391059398651123, "learning_rate": 0.00023418803418803416, "loss": 0.6688, "step": 812 }, { "epoch": 0.8930385830015104, "grad_norm": 1.6974279880523682, "learning_rate": 0.00023406593406593405, "loss": 0.8545, "step": 813 }, { "epoch": 0.8941370314430866, "grad_norm": 0.983435869216919, "learning_rate": 0.00023394383394383393, "loss": 0.8128, "step": 814 }, { "epoch": 0.8952354798846629, "grad_norm": 0.44103240966796875, "learning_rate": 0.0002338217338217338, "loss": 0.7968, "step": 815 }, { "epoch": 0.8963339283262391, "grad_norm": 1.0707038640975952, "learning_rate": 0.00023369963369963367, "loss": 0.6996, "step": 816 }, { "epoch": 0.8974323767678155, "grad_norm": 0.8029122352600098, "learning_rate": 0.00023357753357753356, "loss": 0.7911, "step": 817 }, { "epoch": 0.8985308252093918, "grad_norm": 0.46339499950408936, "learning_rate": 0.00023345543345543342, "loss": 0.7712, "step": 818 }, { "epoch": 0.899629273650968, "grad_norm": 1.020947813987732, "learning_rate": 0.0002333333333333333, "loss": 0.6865, "step": 819 }, { "epoch": 0.9007277220925443, "grad_norm": 0.5332039594650269, "learning_rate": 0.00023321123321123322, "loss": 0.8352, "step": 820 }, { "epoch": 0.9018261705341205, "grad_norm": 0.40052923560142517, "learning_rate": 0.00023308913308913307, "loss": 0.5435, "step": 821 }, { "epoch": 0.9029246189756969, "grad_norm": 0.6643521189689636, "learning_rate": 0.00023296703296703296, "loss": 0.7406, "step": 822 }, { "epoch": 0.9040230674172731, "grad_norm": 0.7514997720718384, "learning_rate": 0.00023284493284493285, "loss": 0.7595, "step": 823 }, { "epoch": 0.9051215158588494, "grad_norm": 0.7124571204185486, "learning_rate": 0.0002327228327228327, "loss": 0.5736, "step": 824 }, { "epoch": 0.9062199643004256, "grad_norm": 0.6757075786590576, "learning_rate": 0.0002326007326007326, "loss": 0.6275, "step": 825 }, { "epoch": 0.9073184127420019, "grad_norm": 0.4200783669948578, "learning_rate": 0.00023247863247863245, "loss": 0.6267, "step": 826 }, { "epoch": 0.9084168611835782, "grad_norm": 0.5442836284637451, "learning_rate": 0.00023235653235653233, "loss": 0.6814, "step": 827 }, { "epoch": 0.9095153096251545, "grad_norm": 0.4859601557254791, "learning_rate": 0.00023223443223443222, "loss": 0.6451, "step": 828 }, { "epoch": 0.9106137580667307, "grad_norm": 0.7353097200393677, "learning_rate": 0.00023211233211233208, "loss": 0.6723, "step": 829 }, { "epoch": 0.911712206508307, "grad_norm": 0.6389304995536804, "learning_rate": 0.00023199023199023196, "loss": 0.9429, "step": 830 }, { "epoch": 0.9128106549498833, "grad_norm": 0.6813933849334717, "learning_rate": 0.00023186813186813185, "loss": 0.5319, "step": 831 }, { "epoch": 0.9139091033914596, "grad_norm": 0.40023690462112427, "learning_rate": 0.0002317460317460317, "loss": 0.5808, "step": 832 }, { "epoch": 0.9150075518330358, "grad_norm": 0.5327205657958984, "learning_rate": 0.0002316239316239316, "loss": 0.6666, "step": 833 }, { "epoch": 0.9161060002746121, "grad_norm": 1.672450065612793, "learning_rate": 0.0002315018315018315, "loss": 0.7758, "step": 834 }, { "epoch": 0.9172044487161883, "grad_norm": 0.5022990703582764, "learning_rate": 0.00023137973137973134, "loss": 0.6309, "step": 835 }, { "epoch": 0.9183028971577647, "grad_norm": 0.43023642897605896, "learning_rate": 0.00023125763125763125, "loss": 0.5343, "step": 836 }, { "epoch": 0.919401345599341, "grad_norm": 0.6878641843795776, "learning_rate": 0.00023113553113553113, "loss": 0.7268, "step": 837 }, { "epoch": 0.9204997940409172, "grad_norm": 0.40551453828811646, "learning_rate": 0.000231013431013431, "loss": 0.5784, "step": 838 }, { "epoch": 0.9215982424824934, "grad_norm": 0.412356436252594, "learning_rate": 0.00023089133089133088, "loss": 0.7685, "step": 839 }, { "epoch": 0.9226966909240698, "grad_norm": 1.1603305339813232, "learning_rate": 0.00023076923076923076, "loss": 0.518, "step": 840 }, { "epoch": 0.9237951393656461, "grad_norm": 0.6733229756355286, "learning_rate": 0.00023064713064713062, "loss": 0.5883, "step": 841 }, { "epoch": 0.9248935878072223, "grad_norm": 0.619434654712677, "learning_rate": 0.0002305250305250305, "loss": 0.6244, "step": 842 }, { "epoch": 0.9259920362487986, "grad_norm": 0.6989772319793701, "learning_rate": 0.0002304029304029304, "loss": 0.5763, "step": 843 }, { "epoch": 0.9270904846903748, "grad_norm": 0.6276418566703796, "learning_rate": 0.00023028083028083025, "loss": 0.4762, "step": 844 }, { "epoch": 0.9281889331319512, "grad_norm": 0.5577360987663269, "learning_rate": 0.00023015873015873014, "loss": 0.6254, "step": 845 }, { "epoch": 0.9292873815735274, "grad_norm": 0.6185848116874695, "learning_rate": 0.00023003663003663002, "loss": 1.0182, "step": 846 }, { "epoch": 0.9303858300151037, "grad_norm": 1.2415262460708618, "learning_rate": 0.00022991452991452988, "loss": 0.4677, "step": 847 }, { "epoch": 0.9314842784566799, "grad_norm": 0.4582594335079193, "learning_rate": 0.00022979242979242977, "loss": 0.6308, "step": 848 }, { "epoch": 0.9325827268982562, "grad_norm": 0.4749620258808136, "learning_rate": 0.00022967032967032962, "loss": 0.6217, "step": 849 }, { "epoch": 0.9336811753398325, "grad_norm": 0.48614588379859924, "learning_rate": 0.0002295482295482295, "loss": 0.7469, "step": 850 }, { "epoch": 0.9347796237814088, "grad_norm": 0.7357453107833862, "learning_rate": 0.00022942612942612942, "loss": 0.5978, "step": 851 }, { "epoch": 0.935878072222985, "grad_norm": 0.53326815366745, "learning_rate": 0.00022930402930402928, "loss": 0.7678, "step": 852 }, { "epoch": 0.9369765206645613, "grad_norm": 0.4853271245956421, "learning_rate": 0.00022918192918192917, "loss": 0.4888, "step": 853 }, { "epoch": 0.9380749691061376, "grad_norm": 1.6529743671417236, "learning_rate": 0.00022905982905982905, "loss": 0.6103, "step": 854 }, { "epoch": 0.9391734175477139, "grad_norm": 0.8255143165588379, "learning_rate": 0.0002289377289377289, "loss": 0.6977, "step": 855 }, { "epoch": 0.9402718659892901, "grad_norm": 0.3999016284942627, "learning_rate": 0.0002288156288156288, "loss": 0.5398, "step": 856 }, { "epoch": 0.9413703144308664, "grad_norm": 1.933090329170227, "learning_rate": 0.00022869352869352868, "loss": 1.0827, "step": 857 }, { "epoch": 0.9424687628724426, "grad_norm": 0.8884105682373047, "learning_rate": 0.00022857142857142854, "loss": 0.702, "step": 858 }, { "epoch": 0.943567211314019, "grad_norm": 0.4555901885032654, "learning_rate": 0.00022844932844932843, "loss": 0.8737, "step": 859 }, { "epoch": 0.9446656597555952, "grad_norm": 0.535915732383728, "learning_rate": 0.0002283272283272283, "loss": 0.7036, "step": 860 }, { "epoch": 0.9457641081971715, "grad_norm": 0.7607597708702087, "learning_rate": 0.00022820512820512817, "loss": 0.8707, "step": 861 }, { "epoch": 0.9468625566387477, "grad_norm": 0.4056457579135895, "learning_rate": 0.00022808302808302805, "loss": 0.6658, "step": 862 }, { "epoch": 0.947961005080324, "grad_norm": 0.5472984313964844, "learning_rate": 0.00022796092796092794, "loss": 0.5429, "step": 863 }, { "epoch": 0.9490594535219004, "grad_norm": 0.6866592764854431, "learning_rate": 0.0002278388278388278, "loss": 0.7343, "step": 864 }, { "epoch": 0.9501579019634766, "grad_norm": 0.5244406461715698, "learning_rate": 0.0002277167277167277, "loss": 0.669, "step": 865 }, { "epoch": 0.9512563504050529, "grad_norm": 0.45024383068084717, "learning_rate": 0.0002275946275946276, "loss": 0.9062, "step": 866 }, { "epoch": 0.9523547988466291, "grad_norm": 0.4252873659133911, "learning_rate": 0.00022747252747252745, "loss": 0.6109, "step": 867 }, { "epoch": 0.9534532472882055, "grad_norm": 0.50081467628479, "learning_rate": 0.00022735042735042734, "loss": 0.5266, "step": 868 }, { "epoch": 0.9545516957297817, "grad_norm": 0.9674072861671448, "learning_rate": 0.00022722832722832723, "loss": 0.7197, "step": 869 }, { "epoch": 0.955650144171358, "grad_norm": 1.572348952293396, "learning_rate": 0.00022710622710622708, "loss": 0.4728, "step": 870 }, { "epoch": 0.9567485926129342, "grad_norm": 0.6033158898353577, "learning_rate": 0.00022698412698412697, "loss": 0.6394, "step": 871 }, { "epoch": 0.9578470410545105, "grad_norm": 0.5810523629188538, "learning_rate": 0.00022686202686202686, "loss": 0.8813, "step": 872 }, { "epoch": 0.9589454894960868, "grad_norm": 0.46345213055610657, "learning_rate": 0.00022673992673992671, "loss": 0.5828, "step": 873 }, { "epoch": 0.9600439379376631, "grad_norm": 0.5414748191833496, "learning_rate": 0.0002266178266178266, "loss": 0.6311, "step": 874 }, { "epoch": 0.9611423863792393, "grad_norm": 0.9083818197250366, "learning_rate": 0.00022649572649572646, "loss": 0.961, "step": 875 }, { "epoch": 0.9622408348208156, "grad_norm": 0.786993145942688, "learning_rate": 0.00022637362637362634, "loss": 0.7825, "step": 876 }, { "epoch": 0.9633392832623918, "grad_norm": 0.7639968991279602, "learning_rate": 0.00022625152625152623, "loss": 0.8989, "step": 877 }, { "epoch": 0.9644377317039682, "grad_norm": 0.43360400199890137, "learning_rate": 0.0002261294261294261, "loss": 0.6747, "step": 878 }, { "epoch": 0.9655361801455444, "grad_norm": 0.8512898683547974, "learning_rate": 0.00022600732600732597, "loss": 0.7152, "step": 879 }, { "epoch": 0.9666346285871207, "grad_norm": 0.46903684735298157, "learning_rate": 0.00022588522588522589, "loss": 0.7594, "step": 880 }, { "epoch": 0.9677330770286969, "grad_norm": 1.9560080766677856, "learning_rate": 0.00022576312576312572, "loss": 0.598, "step": 881 }, { "epoch": 0.9688315254702733, "grad_norm": 1.1595470905303955, "learning_rate": 0.00022564102564102563, "loss": 0.6005, "step": 882 }, { "epoch": 0.9699299739118495, "grad_norm": 0.7318668365478516, "learning_rate": 0.00022551892551892551, "loss": 0.7327, "step": 883 }, { "epoch": 0.9710284223534258, "grad_norm": 0.6557647585868835, "learning_rate": 0.00022539682539682537, "loss": 0.5858, "step": 884 }, { "epoch": 0.972126870795002, "grad_norm": 0.5645928382873535, "learning_rate": 0.00022527472527472526, "loss": 0.5818, "step": 885 }, { "epoch": 0.9732253192365783, "grad_norm": 0.4630253314971924, "learning_rate": 0.00022515262515262514, "loss": 0.8363, "step": 886 }, { "epoch": 0.9743237676781547, "grad_norm": 0.6750912666320801, "learning_rate": 0.000225030525030525, "loss": 0.8865, "step": 887 }, { "epoch": 0.9754222161197309, "grad_norm": 0.6309487819671631, "learning_rate": 0.0002249084249084249, "loss": 0.5596, "step": 888 }, { "epoch": 0.9765206645613072, "grad_norm": 0.9696050882339478, "learning_rate": 0.00022478632478632477, "loss": 0.7752, "step": 889 }, { "epoch": 0.9776191130028834, "grad_norm": 0.7614735960960388, "learning_rate": 0.00022466422466422463, "loss": 0.7131, "step": 890 }, { "epoch": 0.9787175614444596, "grad_norm": 0.4971006214618683, "learning_rate": 0.00022454212454212452, "loss": 0.6218, "step": 891 }, { "epoch": 0.979816009886036, "grad_norm": 0.47809773683547974, "learning_rate": 0.0002244200244200244, "loss": 0.5678, "step": 892 }, { "epoch": 0.9809144583276123, "grad_norm": 0.5959337949752808, "learning_rate": 0.00022429792429792426, "loss": 1.0002, "step": 893 }, { "epoch": 0.9820129067691885, "grad_norm": 0.45277753472328186, "learning_rate": 0.00022417582417582415, "loss": 0.7321, "step": 894 }, { "epoch": 0.9831113552107648, "grad_norm": 1.279405951499939, "learning_rate": 0.00022405372405372406, "loss": 0.7912, "step": 895 }, { "epoch": 0.9842098036523411, "grad_norm": 0.49885687232017517, "learning_rate": 0.00022393162393162392, "loss": 0.5558, "step": 896 }, { "epoch": 0.9853082520939174, "grad_norm": 0.474979430437088, "learning_rate": 0.0002238095238095238, "loss": 0.7095, "step": 897 }, { "epoch": 0.9864067005354936, "grad_norm": 0.3826389014720917, "learning_rate": 0.0002236874236874237, "loss": 0.5695, "step": 898 }, { "epoch": 0.9875051489770699, "grad_norm": 0.33514517545700073, "learning_rate": 0.00022356532356532355, "loss": 0.6341, "step": 899 }, { "epoch": 0.9886035974186461, "grad_norm": 0.5049251914024353, "learning_rate": 0.00022344322344322343, "loss": 0.5577, "step": 900 }, { "epoch": 0.9897020458602225, "grad_norm": 0.5179988145828247, "learning_rate": 0.0002233211233211233, "loss": 0.5769, "step": 901 }, { "epoch": 0.9908004943017987, "grad_norm": 0.5194469094276428, "learning_rate": 0.00022319902319902318, "loss": 0.5466, "step": 902 }, { "epoch": 0.991898942743375, "grad_norm": 0.46941491961479187, "learning_rate": 0.00022307692307692306, "loss": 0.642, "step": 903 }, { "epoch": 0.9929973911849512, "grad_norm": 0.379682719707489, "learning_rate": 0.00022295482295482292, "loss": 0.5508, "step": 904 }, { "epoch": 0.9940958396265275, "grad_norm": 1.3844119310379028, "learning_rate": 0.0002228327228327228, "loss": 0.8814, "step": 905 }, { "epoch": 0.9951942880681038, "grad_norm": 2.497697114944458, "learning_rate": 0.0002227106227106227, "loss": 0.8116, "step": 906 }, { "epoch": 0.9962927365096801, "grad_norm": 0.36689239740371704, "learning_rate": 0.00022258852258852255, "loss": 0.5001, "step": 907 }, { "epoch": 0.9973911849512563, "grad_norm": 0.39868447184562683, "learning_rate": 0.00022246642246642243, "loss": 0.6913, "step": 908 }, { "epoch": 0.9984896333928326, "grad_norm": 0.5270336270332336, "learning_rate": 0.00022234432234432235, "loss": 0.5401, "step": 909 }, { "epoch": 0.999588081834409, "grad_norm": 0.4079851508140564, "learning_rate": 0.00022222222222222218, "loss": 0.471, "step": 910 }, { "epoch": 1.000686530275985, "grad_norm": 0.43189048767089844, "learning_rate": 0.0002221001221001221, "loss": 0.8237, "step": 911 }, { "epoch": 1.0017849787175614, "grad_norm": 0.52342289686203, "learning_rate": 0.00022197802197802198, "loss": 0.6363, "step": 912 }, { "epoch": 1.0028834271591378, "grad_norm": 0.38078904151916504, "learning_rate": 0.00022185592185592184, "loss": 0.4411, "step": 913 }, { "epoch": 1.003981875600714, "grad_norm": 0.5302817821502686, "learning_rate": 0.00022173382173382172, "loss": 0.858, "step": 914 }, { "epoch": 1.0050803240422903, "grad_norm": 0.3696751892566681, "learning_rate": 0.0002216117216117216, "loss": 0.8766, "step": 915 }, { "epoch": 1.0061787724838664, "grad_norm": 0.7566766738891602, "learning_rate": 0.00022148962148962146, "loss": 1.067, "step": 916 }, { "epoch": 1.0072772209254428, "grad_norm": 0.7399318218231201, "learning_rate": 0.00022136752136752135, "loss": 0.6683, "step": 917 }, { "epoch": 1.0083756693670192, "grad_norm": 0.5435899496078491, "learning_rate": 0.00022124542124542124, "loss": 0.6045, "step": 918 }, { "epoch": 1.0094741178085953, "grad_norm": 0.9680571556091309, "learning_rate": 0.0002211233211233211, "loss": 0.7546, "step": 919 }, { "epoch": 1.0105725662501717, "grad_norm": 0.6131067872047424, "learning_rate": 0.00022100122100122098, "loss": 0.6655, "step": 920 }, { "epoch": 1.0116710146917478, "grad_norm": 0.8093316555023193, "learning_rate": 0.00022087912087912086, "loss": 0.4812, "step": 921 }, { "epoch": 1.0127694631333242, "grad_norm": 0.5077763199806213, "learning_rate": 0.00022075702075702072, "loss": 0.5357, "step": 922 }, { "epoch": 1.0138679115749005, "grad_norm": 0.4767695963382721, "learning_rate": 0.0002206349206349206, "loss": 0.5807, "step": 923 }, { "epoch": 1.0149663600164767, "grad_norm": 0.3215581178665161, "learning_rate": 0.00022051282051282052, "loss": 0.5773, "step": 924 }, { "epoch": 1.016064808458053, "grad_norm": 0.425603985786438, "learning_rate": 0.00022039072039072035, "loss": 0.5441, "step": 925 }, { "epoch": 1.0171632568996292, "grad_norm": 0.6131730079650879, "learning_rate": 0.00022026862026862027, "loss": 0.856, "step": 926 }, { "epoch": 1.0182617053412055, "grad_norm": 0.5472941398620605, "learning_rate": 0.00022014652014652012, "loss": 0.8228, "step": 927 }, { "epoch": 1.0193601537827819, "grad_norm": 0.46728211641311646, "learning_rate": 0.00022002442002442, "loss": 0.7615, "step": 928 }, { "epoch": 1.020458602224358, "grad_norm": 0.39919501543045044, "learning_rate": 0.0002199023199023199, "loss": 0.709, "step": 929 }, { "epoch": 1.0215570506659344, "grad_norm": 0.564400315284729, "learning_rate": 0.00021978021978021975, "loss": 0.5941, "step": 930 }, { "epoch": 1.0226554991075107, "grad_norm": 0.39073804020881653, "learning_rate": 0.00021965811965811964, "loss": 0.6386, "step": 931 }, { "epoch": 1.0237539475490869, "grad_norm": 0.3725563585758209, "learning_rate": 0.00021953601953601952, "loss": 0.4766, "step": 932 }, { "epoch": 1.0248523959906632, "grad_norm": 1.319197654724121, "learning_rate": 0.00021941391941391938, "loss": 0.8465, "step": 933 }, { "epoch": 1.0259508444322394, "grad_norm": 0.5126785635948181, "learning_rate": 0.00021929181929181927, "loss": 0.5103, "step": 934 }, { "epoch": 1.0270492928738157, "grad_norm": 0.5401897430419922, "learning_rate": 0.00021916971916971915, "loss": 0.5879, "step": 935 }, { "epoch": 1.028147741315392, "grad_norm": 0.47014057636260986, "learning_rate": 0.000219047619047619, "loss": 0.658, "step": 936 }, { "epoch": 1.0292461897569682, "grad_norm": 0.49227291345596313, "learning_rate": 0.0002189255189255189, "loss": 0.5271, "step": 937 }, { "epoch": 1.0303446381985446, "grad_norm": 0.8186778426170349, "learning_rate": 0.00021880341880341878, "loss": 0.6491, "step": 938 }, { "epoch": 1.0314430866401207, "grad_norm": 0.46345674991607666, "learning_rate": 0.00021868131868131864, "loss": 0.7935, "step": 939 }, { "epoch": 1.032541535081697, "grad_norm": 1.7300915718078613, "learning_rate": 0.00021855921855921855, "loss": 0.516, "step": 940 }, { "epoch": 1.0336399835232735, "grad_norm": 0.5100822448730469, "learning_rate": 0.00021843711843711844, "loss": 0.8286, "step": 941 }, { "epoch": 1.0347384319648496, "grad_norm": 0.42278483510017395, "learning_rate": 0.0002183150183150183, "loss": 0.7312, "step": 942 }, { "epoch": 1.035836880406426, "grad_norm": 0.42105185985565186, "learning_rate": 0.00021819291819291818, "loss": 0.5729, "step": 943 }, { "epoch": 1.036935328848002, "grad_norm": 0.5117312669754028, "learning_rate": 0.00021807081807081807, "loss": 0.7688, "step": 944 }, { "epoch": 1.0380337772895785, "grad_norm": 0.4982740879058838, "learning_rate": 0.00021794871794871793, "loss": 0.5746, "step": 945 }, { "epoch": 1.0391322257311548, "grad_norm": 0.5181052684783936, "learning_rate": 0.0002178266178266178, "loss": 0.8446, "step": 946 }, { "epoch": 1.040230674172731, "grad_norm": 5.104315757751465, "learning_rate": 0.0002177045177045177, "loss": 0.9641, "step": 947 }, { "epoch": 1.0413291226143073, "grad_norm": 0.7384645938873291, "learning_rate": 0.00021758241758241756, "loss": 0.7168, "step": 948 }, { "epoch": 1.0424275710558835, "grad_norm": 0.4367550313472748, "learning_rate": 0.00021746031746031744, "loss": 0.7139, "step": 949 }, { "epoch": 1.0435260194974598, "grad_norm": 0.7332566380500793, "learning_rate": 0.00021733821733821733, "loss": 0.7082, "step": 950 }, { "epoch": 1.0446244679390362, "grad_norm": 0.4191775918006897, "learning_rate": 0.00021721611721611719, "loss": 0.7986, "step": 951 }, { "epoch": 1.0457229163806123, "grad_norm": 0.33929941058158875, "learning_rate": 0.00021709401709401707, "loss": 0.3784, "step": 952 }, { "epoch": 1.0468213648221887, "grad_norm": 0.5255181789398193, "learning_rate": 0.00021697191697191693, "loss": 0.5842, "step": 953 }, { "epoch": 1.047919813263765, "grad_norm": 0.5401780605316162, "learning_rate": 0.00021684981684981681, "loss": 0.7939, "step": 954 }, { "epoch": 1.0490182617053412, "grad_norm": 0.34873855113983154, "learning_rate": 0.00021672771672771673, "loss": 0.7957, "step": 955 }, { "epoch": 1.0501167101469175, "grad_norm": 0.33418160676956177, "learning_rate": 0.00021660561660561656, "loss": 0.6037, "step": 956 }, { "epoch": 1.0512151585884937, "grad_norm": 0.3197249174118042, "learning_rate": 0.00021648351648351647, "loss": 0.5223, "step": 957 }, { "epoch": 1.05231360703007, "grad_norm": 0.5962835550308228, "learning_rate": 0.00021636141636141636, "loss": 0.5213, "step": 958 }, { "epoch": 1.0534120554716464, "grad_norm": 1.3891643285751343, "learning_rate": 0.00021623931623931622, "loss": 0.6781, "step": 959 }, { "epoch": 1.0545105039132225, "grad_norm": 0.42117932438850403, "learning_rate": 0.0002161172161172161, "loss": 0.6363, "step": 960 }, { "epoch": 1.055608952354799, "grad_norm": 0.4514491558074951, "learning_rate": 0.00021599511599511599, "loss": 0.6904, "step": 961 }, { "epoch": 1.056707400796375, "grad_norm": 0.4863387644290924, "learning_rate": 0.00021587301587301584, "loss": 0.6595, "step": 962 }, { "epoch": 1.0578058492379514, "grad_norm": 0.6178450584411621, "learning_rate": 0.00021575091575091573, "loss": 0.8412, "step": 963 }, { "epoch": 1.0589042976795278, "grad_norm": 0.3728642761707306, "learning_rate": 0.00021562881562881562, "loss": 0.629, "step": 964 }, { "epoch": 1.060002746121104, "grad_norm": 0.7554892301559448, "learning_rate": 0.00021550671550671547, "loss": 0.5804, "step": 965 }, { "epoch": 1.0611011945626803, "grad_norm": 0.550298273563385, "learning_rate": 0.00021538461538461536, "loss": 0.476, "step": 966 }, { "epoch": 1.0621996430042564, "grad_norm": 0.4082244336605072, "learning_rate": 0.00021526251526251524, "loss": 0.4001, "step": 967 }, { "epoch": 1.0632980914458328, "grad_norm": 1.2327499389648438, "learning_rate": 0.0002151404151404151, "loss": 0.4583, "step": 968 }, { "epoch": 1.0643965398874091, "grad_norm": 0.860550045967102, "learning_rate": 0.000215018315018315, "loss": 0.6415, "step": 969 }, { "epoch": 1.0654949883289853, "grad_norm": 0.558860182762146, "learning_rate": 0.0002148962148962149, "loss": 0.6215, "step": 970 }, { "epoch": 1.0665934367705616, "grad_norm": 0.7794890403747559, "learning_rate": 0.00021477411477411476, "loss": 0.5094, "step": 971 }, { "epoch": 1.0676918852121378, "grad_norm": 0.48574942350387573, "learning_rate": 0.00021465201465201465, "loss": 0.7385, "step": 972 }, { "epoch": 1.0687903336537141, "grad_norm": 0.4496791660785675, "learning_rate": 0.00021452991452991453, "loss": 0.5036, "step": 973 }, { "epoch": 1.0698887820952905, "grad_norm": 0.5360952615737915, "learning_rate": 0.0002144078144078144, "loss": 0.6825, "step": 974 }, { "epoch": 1.0709872305368666, "grad_norm": 0.5783904194831848, "learning_rate": 0.00021428571428571427, "loss": 0.6736, "step": 975 }, { "epoch": 1.072085678978443, "grad_norm": 2.290815830230713, "learning_rate": 0.00021416361416361416, "loss": 0.696, "step": 976 }, { "epoch": 1.0731841274200193, "grad_norm": 1.3432899713516235, "learning_rate": 0.00021404151404151402, "loss": 0.5296, "step": 977 }, { "epoch": 1.0742825758615955, "grad_norm": 0.5308722257614136, "learning_rate": 0.0002139194139194139, "loss": 0.6642, "step": 978 }, { "epoch": 1.0753810243031718, "grad_norm": 0.7245768904685974, "learning_rate": 0.00021379731379731376, "loss": 0.6811, "step": 979 }, { "epoch": 1.076479472744748, "grad_norm": 0.3873349726200104, "learning_rate": 0.00021367521367521365, "loss": 0.8503, "step": 980 }, { "epoch": 1.0775779211863243, "grad_norm": 0.5792405605316162, "learning_rate": 0.00021355311355311353, "loss": 0.4543, "step": 981 }, { "epoch": 1.0786763696279005, "grad_norm": 0.6543241143226624, "learning_rate": 0.0002134310134310134, "loss": 0.7778, "step": 982 }, { "epoch": 1.0797748180694768, "grad_norm": 0.5572071075439453, "learning_rate": 0.00021330891330891328, "loss": 0.8446, "step": 983 }, { "epoch": 1.0808732665110532, "grad_norm": 0.5798014402389526, "learning_rate": 0.0002131868131868132, "loss": 0.7461, "step": 984 }, { "epoch": 1.0819717149526293, "grad_norm": 0.8282085657119751, "learning_rate": 0.00021306471306471302, "loss": 0.612, "step": 985 }, { "epoch": 1.0830701633942057, "grad_norm": 0.5782580971717834, "learning_rate": 0.00021294261294261293, "loss": 0.5506, "step": 986 }, { "epoch": 1.084168611835782, "grad_norm": 0.3826775848865509, "learning_rate": 0.00021282051282051282, "loss": 0.7859, "step": 987 }, { "epoch": 1.0852670602773582, "grad_norm": 0.534752368927002, "learning_rate": 0.00021269841269841268, "loss": 0.8835, "step": 988 }, { "epoch": 1.0863655087189346, "grad_norm": 0.45931264758110046, "learning_rate": 0.00021257631257631256, "loss": 0.6694, "step": 989 }, { "epoch": 1.0874639571605107, "grad_norm": 0.6106250286102295, "learning_rate": 0.00021245421245421245, "loss": 0.8274, "step": 990 }, { "epoch": 1.088562405602087, "grad_norm": 0.3704061806201935, "learning_rate": 0.0002123321123321123, "loss": 0.7449, "step": 991 }, { "epoch": 1.0896608540436634, "grad_norm": 0.3922840356826782, "learning_rate": 0.0002122100122100122, "loss": 0.5845, "step": 992 }, { "epoch": 1.0907593024852396, "grad_norm": 0.48152726888656616, "learning_rate": 0.00021208791208791208, "loss": 0.6608, "step": 993 }, { "epoch": 1.091857750926816, "grad_norm": 0.42257216572761536, "learning_rate": 0.00021196581196581194, "loss": 0.6379, "step": 994 }, { "epoch": 1.092956199368392, "grad_norm": 0.4746345579624176, "learning_rate": 0.00021184371184371182, "loss": 0.6467, "step": 995 }, { "epoch": 1.0940546478099684, "grad_norm": 0.3915644884109497, "learning_rate": 0.0002117216117216117, "loss": 0.9699, "step": 996 }, { "epoch": 1.0951530962515448, "grad_norm": 0.5957880020141602, "learning_rate": 0.00021159951159951157, "loss": 0.6917, "step": 997 }, { "epoch": 1.096251544693121, "grad_norm": 0.4327985942363739, "learning_rate": 0.00021147741147741145, "loss": 0.8091, "step": 998 }, { "epoch": 1.0973499931346973, "grad_norm": 0.42600274085998535, "learning_rate": 0.00021135531135531136, "loss": 0.7685, "step": 999 }, { "epoch": 1.0984484415762734, "grad_norm": 0.7165039777755737, "learning_rate": 0.0002112332112332112, "loss": 0.8646, "step": 1000 }, { "epoch": 1.0995468900178498, "grad_norm": 0.447652131319046, "learning_rate": 0.0002111111111111111, "loss": 0.521, "step": 1001 }, { "epoch": 1.1006453384594261, "grad_norm": 0.3022591769695282, "learning_rate": 0.000210989010989011, "loss": 0.6099, "step": 1002 }, { "epoch": 1.1017437869010023, "grad_norm": 0.32764387130737305, "learning_rate": 0.00021086691086691085, "loss": 0.5624, "step": 1003 }, { "epoch": 1.1028422353425786, "grad_norm": 0.7301959991455078, "learning_rate": 0.00021074481074481074, "loss": 0.6091, "step": 1004 }, { "epoch": 1.1039406837841548, "grad_norm": 0.4734131097793579, "learning_rate": 0.0002106227106227106, "loss": 0.6849, "step": 1005 }, { "epoch": 1.1050391322257311, "grad_norm": 0.7214820384979248, "learning_rate": 0.00021050061050061048, "loss": 0.789, "step": 1006 }, { "epoch": 1.1061375806673075, "grad_norm": 0.31265702843666077, "learning_rate": 0.00021037851037851037, "loss": 0.5176, "step": 1007 }, { "epoch": 1.1072360291088836, "grad_norm": 0.5804157257080078, "learning_rate": 0.00021025641025641022, "loss": 1.0152, "step": 1008 }, { "epoch": 1.10833447755046, "grad_norm": 0.3624595105648041, "learning_rate": 0.0002101343101343101, "loss": 0.6843, "step": 1009 }, { "epoch": 1.1094329259920364, "grad_norm": 0.5099515318870544, "learning_rate": 0.00021001221001221, "loss": 0.5568, "step": 1010 }, { "epoch": 1.1105313744336125, "grad_norm": 0.46201249957084656, "learning_rate": 0.00020989010989010985, "loss": 0.5883, "step": 1011 }, { "epoch": 1.1116298228751889, "grad_norm": 0.4493483603000641, "learning_rate": 0.00020976800976800974, "loss": 0.8338, "step": 1012 }, { "epoch": 1.112728271316765, "grad_norm": 0.4771614968776703, "learning_rate": 0.00020964590964590963, "loss": 0.7251, "step": 1013 }, { "epoch": 1.1138267197583414, "grad_norm": 2.073347806930542, "learning_rate": 0.00020952380952380948, "loss": 0.8921, "step": 1014 }, { "epoch": 1.1149251681999177, "grad_norm": 0.435680091381073, "learning_rate": 0.0002094017094017094, "loss": 0.5444, "step": 1015 }, { "epoch": 1.1160236166414939, "grad_norm": 0.46824783086776733, "learning_rate": 0.00020927960927960928, "loss": 0.5591, "step": 1016 }, { "epoch": 1.1171220650830702, "grad_norm": 0.43938374519348145, "learning_rate": 0.00020915750915750914, "loss": 0.7476, "step": 1017 }, { "epoch": 1.1182205135246464, "grad_norm": 0.3620377779006958, "learning_rate": 0.00020903540903540903, "loss": 0.5763, "step": 1018 }, { "epoch": 1.1193189619662227, "grad_norm": 0.612406313419342, "learning_rate": 0.0002089133089133089, "loss": 0.706, "step": 1019 }, { "epoch": 1.120417410407799, "grad_norm": 0.5045173168182373, "learning_rate": 0.00020879120879120877, "loss": 0.6799, "step": 1020 }, { "epoch": 1.1215158588493752, "grad_norm": 0.4815331995487213, "learning_rate": 0.00020866910866910865, "loss": 0.8845, "step": 1021 }, { "epoch": 1.1226143072909516, "grad_norm": 0.3756159245967865, "learning_rate": 0.00020854700854700854, "loss": 0.5545, "step": 1022 }, { "epoch": 1.1237127557325277, "grad_norm": 0.3184347152709961, "learning_rate": 0.0002084249084249084, "loss": 0.5109, "step": 1023 }, { "epoch": 1.124811204174104, "grad_norm": 0.4000808298587799, "learning_rate": 0.00020830280830280828, "loss": 0.8363, "step": 1024 }, { "epoch": 1.1259096526156804, "grad_norm": 0.3930743336677551, "learning_rate": 0.00020818070818070817, "loss": 0.6183, "step": 1025 }, { "epoch": 1.1270081010572566, "grad_norm": 0.7536817789077759, "learning_rate": 0.00020805860805860803, "loss": 0.7511, "step": 1026 }, { "epoch": 1.128106549498833, "grad_norm": 0.5012079477310181, "learning_rate": 0.00020793650793650791, "loss": 0.6346, "step": 1027 }, { "epoch": 1.129204997940409, "grad_norm": 0.9914690852165222, "learning_rate": 0.00020781440781440783, "loss": 0.5827, "step": 1028 }, { "epoch": 1.1303034463819854, "grad_norm": 0.9096476435661316, "learning_rate": 0.00020769230769230766, "loss": 1.0235, "step": 1029 }, { "epoch": 1.1314018948235618, "grad_norm": 0.6668229699134827, "learning_rate": 0.00020757020757020757, "loss": 0.741, "step": 1030 }, { "epoch": 1.132500343265138, "grad_norm": 0.3232771158218384, "learning_rate": 0.0002074481074481074, "loss": 0.6206, "step": 1031 }, { "epoch": 1.1335987917067143, "grad_norm": 0.278003990650177, "learning_rate": 0.00020732600732600731, "loss": 0.5661, "step": 1032 }, { "epoch": 1.1346972401482907, "grad_norm": 1.481213927268982, "learning_rate": 0.0002072039072039072, "loss": 0.6422, "step": 1033 }, { "epoch": 1.1357956885898668, "grad_norm": 0.4688512682914734, "learning_rate": 0.00020708180708180706, "loss": 0.4163, "step": 1034 }, { "epoch": 1.1368941370314432, "grad_norm": 0.6438425779342651, "learning_rate": 0.00020695970695970694, "loss": 0.6241, "step": 1035 }, { "epoch": 1.1379925854730193, "grad_norm": 0.5013176798820496, "learning_rate": 0.00020683760683760683, "loss": 0.6273, "step": 1036 }, { "epoch": 1.1390910339145957, "grad_norm": 0.5178597569465637, "learning_rate": 0.0002067155067155067, "loss": 0.7489, "step": 1037 }, { "epoch": 1.1401894823561718, "grad_norm": 0.5804840922355652, "learning_rate": 0.00020659340659340657, "loss": 0.9142, "step": 1038 }, { "epoch": 1.1412879307977482, "grad_norm": 0.47613444924354553, "learning_rate": 0.00020647130647130646, "loss": 0.9531, "step": 1039 }, { "epoch": 1.1423863792393245, "grad_norm": 0.4835624694824219, "learning_rate": 0.00020634920634920632, "loss": 0.6349, "step": 1040 }, { "epoch": 1.1434848276809007, "grad_norm": 0.38351112604141235, "learning_rate": 0.0002062271062271062, "loss": 0.4726, "step": 1041 }, { "epoch": 1.144583276122477, "grad_norm": 0.5533854365348816, "learning_rate": 0.0002061050061050061, "loss": 0.5108, "step": 1042 }, { "epoch": 1.1456817245640534, "grad_norm": 0.4842824637889862, "learning_rate": 0.00020598290598290595, "loss": 0.6038, "step": 1043 }, { "epoch": 1.1467801730056295, "grad_norm": 0.552798330783844, "learning_rate": 0.00020586080586080583, "loss": 0.8056, "step": 1044 }, { "epoch": 1.1478786214472059, "grad_norm": 0.40466025471687317, "learning_rate": 0.00020573870573870574, "loss": 0.6234, "step": 1045 }, { "epoch": 1.148977069888782, "grad_norm": 0.6988784074783325, "learning_rate": 0.0002056166056166056, "loss": 0.7721, "step": 1046 }, { "epoch": 1.1500755183303584, "grad_norm": 0.4852863550186157, "learning_rate": 0.0002054945054945055, "loss": 0.6074, "step": 1047 }, { "epoch": 1.1511739667719347, "grad_norm": 0.4548696279525757, "learning_rate": 0.00020537240537240537, "loss": 0.5592, "step": 1048 }, { "epoch": 1.1522724152135109, "grad_norm": 0.9355410933494568, "learning_rate": 0.00020525030525030523, "loss": 0.8618, "step": 1049 }, { "epoch": 1.1533708636550872, "grad_norm": 0.5641398429870605, "learning_rate": 0.00020512820512820512, "loss": 0.704, "step": 1050 }, { "epoch": 1.1544693120966634, "grad_norm": 0.48187771439552307, "learning_rate": 0.000205006105006105, "loss": 0.6008, "step": 1051 }, { "epoch": 1.1555677605382397, "grad_norm": 0.41609904170036316, "learning_rate": 0.00020488400488400486, "loss": 0.8812, "step": 1052 }, { "epoch": 1.156666208979816, "grad_norm": 0.919477105140686, "learning_rate": 0.00020476190476190475, "loss": 0.6597, "step": 1053 }, { "epoch": 1.1577646574213922, "grad_norm": 0.5008611083030701, "learning_rate": 0.0002046398046398046, "loss": 0.6501, "step": 1054 }, { "epoch": 1.1588631058629686, "grad_norm": 0.39832696318626404, "learning_rate": 0.0002045177045177045, "loss": 0.6232, "step": 1055 }, { "epoch": 1.159961554304545, "grad_norm": 0.5290446281433105, "learning_rate": 0.00020439560439560438, "loss": 0.6123, "step": 1056 }, { "epoch": 1.161060002746121, "grad_norm": 0.40837669372558594, "learning_rate": 0.00020427350427350423, "loss": 0.4989, "step": 1057 }, { "epoch": 1.1621584511876974, "grad_norm": 0.43407055735588074, "learning_rate": 0.00020415140415140412, "loss": 0.6961, "step": 1058 }, { "epoch": 1.1632568996292736, "grad_norm": 0.7601787447929382, "learning_rate": 0.00020402930402930403, "loss": 0.9308, "step": 1059 }, { "epoch": 1.16435534807085, "grad_norm": 0.452628493309021, "learning_rate": 0.00020390720390720386, "loss": 0.6478, "step": 1060 }, { "epoch": 1.165453796512426, "grad_norm": 0.4524000287055969, "learning_rate": 0.00020378510378510378, "loss": 0.4499, "step": 1061 }, { "epoch": 1.1665522449540024, "grad_norm": 0.5971822142601013, "learning_rate": 0.00020366300366300366, "loss": 0.6402, "step": 1062 }, { "epoch": 1.1676506933955788, "grad_norm": 0.36858659982681274, "learning_rate": 0.00020354090354090352, "loss": 0.6511, "step": 1063 }, { "epoch": 1.168749141837155, "grad_norm": 0.47295433282852173, "learning_rate": 0.0002034188034188034, "loss": 0.5977, "step": 1064 }, { "epoch": 1.1698475902787313, "grad_norm": 0.4402971565723419, "learning_rate": 0.0002032967032967033, "loss": 0.4824, "step": 1065 }, { "epoch": 1.1709460387203077, "grad_norm": 0.3752620816230774, "learning_rate": 0.00020317460317460315, "loss": 0.6519, "step": 1066 }, { "epoch": 1.1720444871618838, "grad_norm": 0.45207279920578003, "learning_rate": 0.00020305250305250303, "loss": 0.6869, "step": 1067 }, { "epoch": 1.1731429356034602, "grad_norm": 0.4255804121494293, "learning_rate": 0.00020293040293040292, "loss": 0.7289, "step": 1068 }, { "epoch": 1.1742413840450363, "grad_norm": 0.48725178837776184, "learning_rate": 0.00020280830280830278, "loss": 0.5472, "step": 1069 }, { "epoch": 1.1753398324866127, "grad_norm": 0.37094470858573914, "learning_rate": 0.00020268620268620266, "loss": 0.558, "step": 1070 }, { "epoch": 1.176438280928189, "grad_norm": 0.4191375970840454, "learning_rate": 0.00020256410256410255, "loss": 0.6422, "step": 1071 }, { "epoch": 1.1775367293697652, "grad_norm": 0.4091531038284302, "learning_rate": 0.0002024420024420024, "loss": 0.6705, "step": 1072 }, { "epoch": 1.1786351778113415, "grad_norm": 0.4876718521118164, "learning_rate": 0.0002023199023199023, "loss": 0.8265, "step": 1073 }, { "epoch": 1.1797336262529177, "grad_norm": 0.43008798360824585, "learning_rate": 0.0002021978021978022, "loss": 0.5159, "step": 1074 }, { "epoch": 1.180832074694494, "grad_norm": 0.47896140813827515, "learning_rate": 0.00020207570207570204, "loss": 0.5455, "step": 1075 }, { "epoch": 1.1819305231360704, "grad_norm": 0.5313389301300049, "learning_rate": 0.00020195360195360195, "loss": 0.7628, "step": 1076 }, { "epoch": 1.1830289715776465, "grad_norm": 0.46337512135505676, "learning_rate": 0.00020183150183150184, "loss": 0.6661, "step": 1077 }, { "epoch": 1.1841274200192229, "grad_norm": 0.4304458498954773, "learning_rate": 0.0002017094017094017, "loss": 0.7019, "step": 1078 }, { "epoch": 1.185225868460799, "grad_norm": 0.638445258140564, "learning_rate": 0.00020158730158730158, "loss": 0.6972, "step": 1079 }, { "epoch": 1.1863243169023754, "grad_norm": 1.8217968940734863, "learning_rate": 0.00020146520146520144, "loss": 0.5217, "step": 1080 }, { "epoch": 1.1874227653439517, "grad_norm": 0.4996611773967743, "learning_rate": 0.00020134310134310132, "loss": 0.6767, "step": 1081 }, { "epoch": 1.1885212137855279, "grad_norm": 0.43705832958221436, "learning_rate": 0.0002012210012210012, "loss": 0.7364, "step": 1082 }, { "epoch": 1.1896196622271042, "grad_norm": 0.4148736596107483, "learning_rate": 0.00020109890109890107, "loss": 0.7544, "step": 1083 }, { "epoch": 1.1907181106686804, "grad_norm": 0.5772218108177185, "learning_rate": 0.00020097680097680095, "loss": 0.6349, "step": 1084 }, { "epoch": 1.1918165591102567, "grad_norm": 0.9127015471458435, "learning_rate": 0.00020085470085470084, "loss": 0.4772, "step": 1085 }, { "epoch": 1.192915007551833, "grad_norm": 0.46906840801239014, "learning_rate": 0.0002007326007326007, "loss": 0.6184, "step": 1086 }, { "epoch": 1.1940134559934092, "grad_norm": 0.38405168056488037, "learning_rate": 0.00020061050061050058, "loss": 0.5027, "step": 1087 }, { "epoch": 1.1951119044349856, "grad_norm": 0.6352836489677429, "learning_rate": 0.00020048840048840047, "loss": 0.6674, "step": 1088 }, { "epoch": 1.196210352876562, "grad_norm": 0.6750807762145996, "learning_rate": 0.00020036630036630033, "loss": 0.5707, "step": 1089 }, { "epoch": 1.197308801318138, "grad_norm": 0.5661985874176025, "learning_rate": 0.00020024420024420024, "loss": 0.8298, "step": 1090 }, { "epoch": 1.1984072497597145, "grad_norm": 0.6393309831619263, "learning_rate": 0.00020012210012210012, "loss": 0.7397, "step": 1091 }, { "epoch": 1.1995056982012906, "grad_norm": 0.5442856550216675, "learning_rate": 0.00019999999999999998, "loss": 0.7176, "step": 1092 }, { "epoch": 1.200604146642867, "grad_norm": 1.0100654363632202, "learning_rate": 0.00019987789987789987, "loss": 0.8052, "step": 1093 }, { "epoch": 1.201702595084443, "grad_norm": 0.3916209936141968, "learning_rate": 0.00019975579975579975, "loss": 0.5951, "step": 1094 }, { "epoch": 1.2028010435260195, "grad_norm": 0.3890608847141266, "learning_rate": 0.0001996336996336996, "loss": 0.8129, "step": 1095 }, { "epoch": 1.2038994919675958, "grad_norm": 0.4267507493495941, "learning_rate": 0.0001995115995115995, "loss": 0.8741, "step": 1096 }, { "epoch": 1.204997940409172, "grad_norm": 0.49055561423301697, "learning_rate": 0.00019938949938949938, "loss": 0.901, "step": 1097 }, { "epoch": 1.2060963888507483, "grad_norm": 0.6662428379058838, "learning_rate": 0.00019926739926739924, "loss": 0.4971, "step": 1098 }, { "epoch": 1.2071948372923247, "grad_norm": 0.4469052255153656, "learning_rate": 0.00019914529914529913, "loss": 0.6593, "step": 1099 }, { "epoch": 1.2082932857339008, "grad_norm": 0.5514255166053772, "learning_rate": 0.000199023199023199, "loss": 0.8033, "step": 1100 }, { "epoch": 1.2093917341754772, "grad_norm": 0.4838184714317322, "learning_rate": 0.00019890109890109887, "loss": 0.5533, "step": 1101 }, { "epoch": 1.2104901826170533, "grad_norm": 0.6061891913414001, "learning_rate": 0.00019877899877899876, "loss": 0.5837, "step": 1102 }, { "epoch": 1.2115886310586297, "grad_norm": 0.3387523889541626, "learning_rate": 0.00019865689865689867, "loss": 0.455, "step": 1103 }, { "epoch": 1.212687079500206, "grad_norm": 0.5204731225967407, "learning_rate": 0.0001985347985347985, "loss": 0.6869, "step": 1104 }, { "epoch": 1.2137855279417822, "grad_norm": 0.5747571587562561, "learning_rate": 0.0001984126984126984, "loss": 0.7208, "step": 1105 }, { "epoch": 1.2148839763833585, "grad_norm": 0.5382461547851562, "learning_rate": 0.00019829059829059824, "loss": 0.6035, "step": 1106 }, { "epoch": 1.2159824248249347, "grad_norm": 0.44335421919822693, "learning_rate": 0.00019816849816849816, "loss": 0.8563, "step": 1107 }, { "epoch": 1.217080873266511, "grad_norm": 0.3059934675693512, "learning_rate": 0.00019804639804639804, "loss": 0.6422, "step": 1108 }, { "epoch": 1.2181793217080874, "grad_norm": 0.4306177794933319, "learning_rate": 0.0001979242979242979, "loss": 0.5347, "step": 1109 }, { "epoch": 1.2192777701496635, "grad_norm": 0.5196095705032349, "learning_rate": 0.00019780219780219779, "loss": 0.5996, "step": 1110 }, { "epoch": 1.22037621859124, "grad_norm": 0.4814283549785614, "learning_rate": 0.00019768009768009767, "loss": 0.6782, "step": 1111 }, { "epoch": 1.2214746670328163, "grad_norm": 0.2287791222333908, "learning_rate": 0.00019755799755799753, "loss": 0.5908, "step": 1112 }, { "epoch": 1.2225731154743924, "grad_norm": 0.43044313788414, "learning_rate": 0.00019743589743589742, "loss": 0.6554, "step": 1113 }, { "epoch": 1.2236715639159688, "grad_norm": 0.390874445438385, "learning_rate": 0.0001973137973137973, "loss": 0.5777, "step": 1114 }, { "epoch": 1.224770012357545, "grad_norm": 0.5380458235740662, "learning_rate": 0.00019719169719169716, "loss": 0.467, "step": 1115 }, { "epoch": 1.2258684607991213, "grad_norm": 0.6176440119743347, "learning_rate": 0.00019706959706959704, "loss": 0.5625, "step": 1116 }, { "epoch": 1.2269669092406974, "grad_norm": 0.4321332275867462, "learning_rate": 0.00019694749694749693, "loss": 0.7262, "step": 1117 }, { "epoch": 1.2280653576822738, "grad_norm": 0.5679623484611511, "learning_rate": 0.0001968253968253968, "loss": 0.8216, "step": 1118 }, { "epoch": 1.2291638061238501, "grad_norm": 0.4741218686103821, "learning_rate": 0.00019670329670329667, "loss": 0.7164, "step": 1119 }, { "epoch": 1.2302622545654263, "grad_norm": 0.6570267677307129, "learning_rate": 0.00019658119658119659, "loss": 0.7606, "step": 1120 }, { "epoch": 1.2313607030070026, "grad_norm": 0.4256306290626526, "learning_rate": 0.00019645909645909644, "loss": 0.5137, "step": 1121 }, { "epoch": 1.232459151448579, "grad_norm": 0.4444984793663025, "learning_rate": 0.00019633699633699633, "loss": 0.8863, "step": 1122 }, { "epoch": 1.2335575998901551, "grad_norm": 0.458133339881897, "learning_rate": 0.00019621489621489622, "loss": 0.6445, "step": 1123 }, { "epoch": 1.2346560483317315, "grad_norm": 0.6087627410888672, "learning_rate": 0.00019609279609279607, "loss": 0.5625, "step": 1124 }, { "epoch": 1.2357544967733076, "grad_norm": 0.42782312631607056, "learning_rate": 0.00019597069597069596, "loss": 0.6321, "step": 1125 }, { "epoch": 1.236852945214884, "grad_norm": 0.49623987078666687, "learning_rate": 0.00019584859584859585, "loss": 0.6473, "step": 1126 }, { "epoch": 1.2379513936564603, "grad_norm": 0.5348198413848877, "learning_rate": 0.0001957264957264957, "loss": 0.6948, "step": 1127 }, { "epoch": 1.2390498420980365, "grad_norm": 0.44476062059402466, "learning_rate": 0.0001956043956043956, "loss": 0.5917, "step": 1128 }, { "epoch": 1.2401482905396128, "grad_norm": 0.5777286291122437, "learning_rate": 0.00019548229548229547, "loss": 0.7474, "step": 1129 }, { "epoch": 1.241246738981189, "grad_norm": 0.3132689893245697, "learning_rate": 0.00019536019536019533, "loss": 0.5827, "step": 1130 }, { "epoch": 1.2423451874227653, "grad_norm": 0.3898192346096039, "learning_rate": 0.00019523809523809522, "loss": 0.5469, "step": 1131 }, { "epoch": 1.2434436358643417, "grad_norm": 0.338693767786026, "learning_rate": 0.00019511599511599508, "loss": 0.704, "step": 1132 }, { "epoch": 1.2445420843059178, "grad_norm": 0.4276609718799591, "learning_rate": 0.00019499389499389496, "loss": 0.7269, "step": 1133 }, { "epoch": 1.2456405327474942, "grad_norm": 0.7320281863212585, "learning_rate": 0.00019487179487179487, "loss": 0.62, "step": 1134 }, { "epoch": 1.2467389811890706, "grad_norm": 0.4023820757865906, "learning_rate": 0.0001947496947496947, "loss": 0.4234, "step": 1135 }, { "epoch": 1.2478374296306467, "grad_norm": 0.3218212425708771, "learning_rate": 0.00019462759462759462, "loss": 0.5325, "step": 1136 }, { "epoch": 1.248935878072223, "grad_norm": 0.45131513476371765, "learning_rate": 0.0001945054945054945, "loss": 0.5667, "step": 1137 }, { "epoch": 1.2500343265137992, "grad_norm": 0.604475200176239, "learning_rate": 0.00019438339438339436, "loss": 0.9018, "step": 1138 }, { "epoch": 1.2511327749553756, "grad_norm": 0.46968311071395874, "learning_rate": 0.00019426129426129425, "loss": 0.7946, "step": 1139 }, { "epoch": 1.2522312233969517, "grad_norm": 0.3960346281528473, "learning_rate": 0.00019413919413919413, "loss": 0.7719, "step": 1140 }, { "epoch": 1.253329671838528, "grad_norm": 0.5146461129188538, "learning_rate": 0.000194017094017094, "loss": 0.8946, "step": 1141 }, { "epoch": 1.2544281202801044, "grad_norm": 0.6343802809715271, "learning_rate": 0.00019389499389499388, "loss": 0.7822, "step": 1142 }, { "epoch": 1.2555265687216806, "grad_norm": 0.4646434485912323, "learning_rate": 0.00019377289377289376, "loss": 0.6722, "step": 1143 }, { "epoch": 1.256625017163257, "grad_norm": 0.48127877712249756, "learning_rate": 0.00019365079365079362, "loss": 0.9059, "step": 1144 }, { "epoch": 1.2577234656048333, "grad_norm": 0.4040716290473938, "learning_rate": 0.0001935286935286935, "loss": 0.7288, "step": 1145 }, { "epoch": 1.2588219140464094, "grad_norm": 0.43992865085601807, "learning_rate": 0.0001934065934065934, "loss": 0.5804, "step": 1146 }, { "epoch": 1.2599203624879858, "grad_norm": 0.41578513383865356, "learning_rate": 0.00019328449328449325, "loss": 0.5459, "step": 1147 }, { "epoch": 1.261018810929562, "grad_norm": 0.40165719389915466, "learning_rate": 0.00019316239316239314, "loss": 0.6001, "step": 1148 }, { "epoch": 1.2621172593711383, "grad_norm": 0.43200212717056274, "learning_rate": 0.00019304029304029305, "loss": 0.8712, "step": 1149 }, { "epoch": 1.2632157078127144, "grad_norm": 0.3217264413833618, "learning_rate": 0.00019291819291819288, "loss": 0.6074, "step": 1150 }, { "epoch": 1.2643141562542908, "grad_norm": 0.3964528441429138, "learning_rate": 0.0001927960927960928, "loss": 0.6131, "step": 1151 }, { "epoch": 1.2654126046958671, "grad_norm": 0.5151070952415466, "learning_rate": 0.00019267399267399268, "loss": 0.6992, "step": 1152 }, { "epoch": 1.2665110531374433, "grad_norm": 0.5902129411697388, "learning_rate": 0.00019255189255189254, "loss": 0.7311, "step": 1153 }, { "epoch": 1.2676095015790196, "grad_norm": 0.5386108160018921, "learning_rate": 0.00019242979242979242, "loss": 0.6469, "step": 1154 }, { "epoch": 1.268707950020596, "grad_norm": 0.384093701839447, "learning_rate": 0.0001923076923076923, "loss": 0.7111, "step": 1155 }, { "epoch": 1.2698063984621721, "grad_norm": 0.34160250425338745, "learning_rate": 0.00019218559218559217, "loss": 0.5396, "step": 1156 }, { "epoch": 1.2709048469037485, "grad_norm": 0.6590912938117981, "learning_rate": 0.00019206349206349205, "loss": 1.1613, "step": 1157 }, { "epoch": 1.2720032953453249, "grad_norm": 0.6230842471122742, "learning_rate": 0.0001919413919413919, "loss": 0.7701, "step": 1158 }, { "epoch": 1.273101743786901, "grad_norm": 0.3881864547729492, "learning_rate": 0.0001918192918192918, "loss": 0.633, "step": 1159 }, { "epoch": 1.2742001922284774, "grad_norm": 0.4538264274597168, "learning_rate": 0.00019169719169719168, "loss": 0.451, "step": 1160 }, { "epoch": 1.2752986406700535, "grad_norm": 0.6188018321990967, "learning_rate": 0.00019157509157509154, "loss": 0.9563, "step": 1161 }, { "epoch": 1.2763970891116299, "grad_norm": 0.4172852039337158, "learning_rate": 0.00019145299145299142, "loss": 0.8284, "step": 1162 }, { "epoch": 1.277495537553206, "grad_norm": 0.338623583316803, "learning_rate": 0.0001913308913308913, "loss": 0.6745, "step": 1163 }, { "epoch": 1.2785939859947824, "grad_norm": 0.3960900902748108, "learning_rate": 0.00019120879120879117, "loss": 0.6508, "step": 1164 }, { "epoch": 1.2796924344363587, "grad_norm": 0.37232962250709534, "learning_rate": 0.00019108669108669108, "loss": 0.7347, "step": 1165 }, { "epoch": 1.2807908828779349, "grad_norm": 0.47092223167419434, "learning_rate": 0.00019096459096459097, "loss": 0.8251, "step": 1166 }, { "epoch": 1.2818893313195112, "grad_norm": 0.4647108316421509, "learning_rate": 0.00019084249084249082, "loss": 0.556, "step": 1167 }, { "epoch": 1.2829877797610876, "grad_norm": 0.5812810659408569, "learning_rate": 0.0001907203907203907, "loss": 0.6802, "step": 1168 }, { "epoch": 1.2840862282026637, "grad_norm": 0.3731052279472351, "learning_rate": 0.0001905982905982906, "loss": 0.6384, "step": 1169 }, { "epoch": 1.28518467664424, "grad_norm": 0.47995856404304504, "learning_rate": 0.00019047619047619045, "loss": 0.4914, "step": 1170 }, { "epoch": 1.2862831250858162, "grad_norm": 0.3223705589771271, "learning_rate": 0.00019035409035409034, "loss": 0.6676, "step": 1171 }, { "epoch": 1.2873815735273926, "grad_norm": 0.5643377304077148, "learning_rate": 0.00019023199023199023, "loss": 0.8224, "step": 1172 }, { "epoch": 1.2884800219689687, "grad_norm": 0.48324450850486755, "learning_rate": 0.00019010989010989008, "loss": 0.8005, "step": 1173 }, { "epoch": 1.289578470410545, "grad_norm": 0.40516728162765503, "learning_rate": 0.00018998778998778997, "loss": 0.5463, "step": 1174 }, { "epoch": 1.2906769188521214, "grad_norm": 0.45521625876426697, "learning_rate": 0.00018986568986568985, "loss": 0.7562, "step": 1175 }, { "epoch": 1.2917753672936976, "grad_norm": 0.38747909665107727, "learning_rate": 0.0001897435897435897, "loss": 0.5074, "step": 1176 }, { "epoch": 1.292873815735274, "grad_norm": 0.39688000082969666, "learning_rate": 0.0001896214896214896, "loss": 0.3551, "step": 1177 }, { "epoch": 1.2939722641768503, "grad_norm": 0.6891604065895081, "learning_rate": 0.0001894993894993895, "loss": 0.601, "step": 1178 }, { "epoch": 1.2950707126184264, "grad_norm": 0.5177300572395325, "learning_rate": 0.00018937728937728934, "loss": 0.5188, "step": 1179 }, { "epoch": 1.2961691610600028, "grad_norm": 0.3166979253292084, "learning_rate": 0.00018925518925518926, "loss": 0.8411, "step": 1180 }, { "epoch": 1.2972676095015792, "grad_norm": 0.6637437343597412, "learning_rate": 0.00018913308913308914, "loss": 0.7256, "step": 1181 }, { "epoch": 1.2983660579431553, "grad_norm": 0.424932599067688, "learning_rate": 0.000189010989010989, "loss": 0.783, "step": 1182 }, { "epoch": 1.2994645063847314, "grad_norm": 0.47751033306121826, "learning_rate": 0.00018888888888888888, "loss": 0.7039, "step": 1183 }, { "epoch": 1.3005629548263078, "grad_norm": 0.4332704544067383, "learning_rate": 0.00018876678876678874, "loss": 0.4797, "step": 1184 }, { "epoch": 1.3016614032678842, "grad_norm": 0.439431756734848, "learning_rate": 0.00018864468864468863, "loss": 0.6256, "step": 1185 }, { "epoch": 1.3027598517094603, "grad_norm": 0.4334176480770111, "learning_rate": 0.00018852258852258851, "loss": 0.5583, "step": 1186 }, { "epoch": 1.3038583001510367, "grad_norm": 0.42080724239349365, "learning_rate": 0.00018840048840048837, "loss": 0.461, "step": 1187 }, { "epoch": 1.304956748592613, "grad_norm": 0.41007399559020996, "learning_rate": 0.00018827838827838826, "loss": 0.4746, "step": 1188 }, { "epoch": 1.3060551970341892, "grad_norm": 0.3763822019100189, "learning_rate": 0.00018815628815628814, "loss": 0.5352, "step": 1189 }, { "epoch": 1.3071536454757655, "grad_norm": 0.5557730197906494, "learning_rate": 0.000188034188034188, "loss": 0.5404, "step": 1190 }, { "epoch": 1.3082520939173419, "grad_norm": 0.43677788972854614, "learning_rate": 0.0001879120879120879, "loss": 0.7111, "step": 1191 }, { "epoch": 1.309350542358918, "grad_norm": 0.6084219217300415, "learning_rate": 0.00018778998778998777, "loss": 0.7524, "step": 1192 }, { "epoch": 1.3104489908004944, "grad_norm": 0.7219144701957703, "learning_rate": 0.00018766788766788763, "loss": 0.6182, "step": 1193 }, { "epoch": 1.3115474392420705, "grad_norm": 0.5280331969261169, "learning_rate": 0.00018754578754578752, "loss": 0.8023, "step": 1194 }, { "epoch": 1.3126458876836469, "grad_norm": 0.42130032181739807, "learning_rate": 0.00018742368742368743, "loss": 0.5673, "step": 1195 }, { "epoch": 1.313744336125223, "grad_norm": 0.6063292026519775, "learning_rate": 0.0001873015873015873, "loss": 0.6438, "step": 1196 }, { "epoch": 1.3148427845667994, "grad_norm": 0.4073690176010132, "learning_rate": 0.00018717948717948717, "loss": 0.7099, "step": 1197 }, { "epoch": 1.3159412330083757, "grad_norm": 0.5419113636016846, "learning_rate": 0.00018705738705738706, "loss": 0.6451, "step": 1198 }, { "epoch": 1.3170396814499519, "grad_norm": 0.4489867091178894, "learning_rate": 0.00018693528693528692, "loss": 0.7522, "step": 1199 }, { "epoch": 1.3181381298915282, "grad_norm": 0.3536837697029114, "learning_rate": 0.0001868131868131868, "loss": 0.6201, "step": 1200 }, { "epoch": 1.3192365783331046, "grad_norm": 0.42462313175201416, "learning_rate": 0.0001866910866910867, "loss": 0.4804, "step": 1201 }, { "epoch": 1.3203350267746807, "grad_norm": 0.612319827079773, "learning_rate": 0.00018656898656898655, "loss": 0.8546, "step": 1202 }, { "epoch": 1.321433475216257, "grad_norm": 0.5242000222206116, "learning_rate": 0.00018644688644688643, "loss": 0.7577, "step": 1203 }, { "epoch": 1.3225319236578332, "grad_norm": 0.5688628554344177, "learning_rate": 0.00018632478632478632, "loss": 0.6645, "step": 1204 }, { "epoch": 1.3236303720994096, "grad_norm": 0.3695731461048126, "learning_rate": 0.00018620268620268618, "loss": 0.4979, "step": 1205 }, { "epoch": 1.3247288205409857, "grad_norm": 0.44525593519210815, "learning_rate": 0.00018608058608058606, "loss": 0.807, "step": 1206 }, { "epoch": 1.325827268982562, "grad_norm": 0.37627971172332764, "learning_rate": 0.00018595848595848595, "loss": 0.6584, "step": 1207 }, { "epoch": 1.3269257174241385, "grad_norm": 0.39727315306663513, "learning_rate": 0.0001858363858363858, "loss": 0.5565, "step": 1208 }, { "epoch": 1.3280241658657146, "grad_norm": 0.4151424169540405, "learning_rate": 0.00018571428571428572, "loss": 0.81, "step": 1209 }, { "epoch": 1.329122614307291, "grad_norm": 0.37529075145721436, "learning_rate": 0.00018559218559218555, "loss": 0.6188, "step": 1210 }, { "epoch": 1.3302210627488673, "grad_norm": 0.43061408400535583, "learning_rate": 0.00018547008547008546, "loss": 0.814, "step": 1211 }, { "epoch": 1.3313195111904434, "grad_norm": 0.437511682510376, "learning_rate": 0.00018534798534798535, "loss": 0.55, "step": 1212 }, { "epoch": 1.3324179596320198, "grad_norm": 0.5172685980796814, "learning_rate": 0.0001852258852258852, "loss": 0.6551, "step": 1213 }, { "epoch": 1.3335164080735962, "grad_norm": 0.3292716443538666, "learning_rate": 0.0001851037851037851, "loss": 0.5108, "step": 1214 }, { "epoch": 1.3346148565151723, "grad_norm": 0.7129474878311157, "learning_rate": 0.00018498168498168498, "loss": 0.7197, "step": 1215 }, { "epoch": 1.3357133049567487, "grad_norm": 0.46317145228385925, "learning_rate": 0.00018485958485958483, "loss": 0.6553, "step": 1216 }, { "epoch": 1.3368117533983248, "grad_norm": 0.5539398789405823, "learning_rate": 0.00018473748473748472, "loss": 0.7057, "step": 1217 }, { "epoch": 1.3379102018399012, "grad_norm": 0.40555253624916077, "learning_rate": 0.0001846153846153846, "loss": 0.5976, "step": 1218 }, { "epoch": 1.3390086502814773, "grad_norm": 0.462704062461853, "learning_rate": 0.00018449328449328446, "loss": 0.7018, "step": 1219 }, { "epoch": 1.3401070987230537, "grad_norm": 0.407287061214447, "learning_rate": 0.00018437118437118435, "loss": 0.4726, "step": 1220 }, { "epoch": 1.34120554716463, "grad_norm": 0.3654995858669281, "learning_rate": 0.00018424908424908423, "loss": 0.5811, "step": 1221 }, { "epoch": 1.3423039956062062, "grad_norm": 0.46455878019332886, "learning_rate": 0.0001841269841269841, "loss": 0.8998, "step": 1222 }, { "epoch": 1.3434024440477825, "grad_norm": 0.47929346561431885, "learning_rate": 0.00018400488400488398, "loss": 0.7348, "step": 1223 }, { "epoch": 1.344500892489359, "grad_norm": 0.7128652930259705, "learning_rate": 0.0001838827838827839, "loss": 1.2647, "step": 1224 }, { "epoch": 1.345599340930935, "grad_norm": 0.3956572413444519, "learning_rate": 0.00018376068376068372, "loss": 0.6985, "step": 1225 }, { "epoch": 1.3466977893725114, "grad_norm": 0.5585309863090515, "learning_rate": 0.00018363858363858364, "loss": 1.0086, "step": 1226 }, { "epoch": 1.3477962378140875, "grad_norm": 1.5960838794708252, "learning_rate": 0.00018351648351648352, "loss": 0.644, "step": 1227 }, { "epoch": 1.3488946862556639, "grad_norm": 0.6499342322349548, "learning_rate": 0.00018339438339438338, "loss": 0.7698, "step": 1228 }, { "epoch": 1.34999313469724, "grad_norm": 0.42246925830841064, "learning_rate": 0.00018327228327228326, "loss": 0.5614, "step": 1229 }, { "epoch": 1.3510915831388164, "grad_norm": 0.42192572355270386, "learning_rate": 0.00018315018315018315, "loss": 0.7726, "step": 1230 }, { "epoch": 1.3521900315803927, "grad_norm": 0.6409221887588501, "learning_rate": 0.000183028083028083, "loss": 0.5928, "step": 1231 }, { "epoch": 1.3532884800219689, "grad_norm": 1.328852653503418, "learning_rate": 0.0001829059829059829, "loss": 0.7861, "step": 1232 }, { "epoch": 1.3543869284635452, "grad_norm": 0.4519331753253937, "learning_rate": 0.00018278388278388275, "loss": 0.5938, "step": 1233 }, { "epoch": 1.3554853769051216, "grad_norm": 0.3942720592021942, "learning_rate": 0.00018266178266178264, "loss": 0.4781, "step": 1234 }, { "epoch": 1.3565838253466977, "grad_norm": 0.5066869258880615, "learning_rate": 0.00018253968253968252, "loss": 0.8069, "step": 1235 }, { "epoch": 1.357682273788274, "grad_norm": 0.37002792954444885, "learning_rate": 0.00018241758241758238, "loss": 0.5737, "step": 1236 }, { "epoch": 1.3587807222298505, "grad_norm": 0.3738810122013092, "learning_rate": 0.00018229548229548227, "loss": 0.5169, "step": 1237 }, { "epoch": 1.3598791706714266, "grad_norm": 0.44956260919570923, "learning_rate": 0.00018217338217338215, "loss": 0.5614, "step": 1238 }, { "epoch": 1.3609776191130027, "grad_norm": 0.34839004278182983, "learning_rate": 0.000182051282051282, "loss": 0.5783, "step": 1239 }, { "epoch": 1.362076067554579, "grad_norm": 0.30152127146720886, "learning_rate": 0.00018192918192918192, "loss": 0.4321, "step": 1240 }, { "epoch": 1.3631745159961555, "grad_norm": 0.6672345399856567, "learning_rate": 0.0001818070818070818, "loss": 0.6073, "step": 1241 }, { "epoch": 1.3642729644377316, "grad_norm": 0.45652687549591064, "learning_rate": 0.00018168498168498167, "loss": 0.6193, "step": 1242 }, { "epoch": 1.365371412879308, "grad_norm": 0.6392306089401245, "learning_rate": 0.00018156288156288155, "loss": 0.8388, "step": 1243 }, { "epoch": 1.3664698613208843, "grad_norm": 0.5510252714157104, "learning_rate": 0.00018144078144078144, "loss": 0.6512, "step": 1244 }, { "epoch": 1.3675683097624605, "grad_norm": 0.38780227303504944, "learning_rate": 0.0001813186813186813, "loss": 0.6835, "step": 1245 }, { "epoch": 1.3686667582040368, "grad_norm": 0.47472965717315674, "learning_rate": 0.00018119658119658118, "loss": 0.6625, "step": 1246 }, { "epoch": 1.3697652066456132, "grad_norm": 0.3599228262901306, "learning_rate": 0.00018107448107448107, "loss": 0.5063, "step": 1247 }, { "epoch": 1.3708636550871893, "grad_norm": 0.3284567892551422, "learning_rate": 0.00018095238095238093, "loss": 0.7679, "step": 1248 }, { "epoch": 1.3719621035287657, "grad_norm": 0.5258575081825256, "learning_rate": 0.0001808302808302808, "loss": 0.6213, "step": 1249 }, { "epoch": 1.3730605519703418, "grad_norm": 0.3211069405078888, "learning_rate": 0.0001807081807081807, "loss": 0.5306, "step": 1250 }, { "epoch": 1.3741590004119182, "grad_norm": 0.6325588822364807, "learning_rate": 0.00018058608058608056, "loss": 0.8104, "step": 1251 }, { "epoch": 1.3752574488534943, "grad_norm": 0.4994303584098816, "learning_rate": 0.00018046398046398044, "loss": 0.6464, "step": 1252 }, { "epoch": 1.3763558972950707, "grad_norm": 0.3013019263744354, "learning_rate": 0.00018034188034188035, "loss": 0.4749, "step": 1253 }, { "epoch": 1.377454345736647, "grad_norm": 1.0342131853103638, "learning_rate": 0.00018021978021978018, "loss": 0.7995, "step": 1254 }, { "epoch": 1.3785527941782232, "grad_norm": 0.40213823318481445, "learning_rate": 0.0001800976800976801, "loss": 0.8791, "step": 1255 }, { "epoch": 1.3796512426197995, "grad_norm": 0.37126532196998596, "learning_rate": 0.00017997557997557998, "loss": 0.551, "step": 1256 }, { "epoch": 1.380749691061376, "grad_norm": 0.3417685031890869, "learning_rate": 0.00017985347985347984, "loss": 0.583, "step": 1257 }, { "epoch": 1.381848139502952, "grad_norm": 0.33571329712867737, "learning_rate": 0.00017973137973137973, "loss": 0.4927, "step": 1258 }, { "epoch": 1.3829465879445284, "grad_norm": 0.5128073692321777, "learning_rate": 0.00017960927960927959, "loss": 0.5903, "step": 1259 }, { "epoch": 1.3840450363861048, "grad_norm": 0.5345245599746704, "learning_rate": 0.00017948717948717947, "loss": 0.5828, "step": 1260 }, { "epoch": 1.385143484827681, "grad_norm": 0.312639981508255, "learning_rate": 0.00017936507936507936, "loss": 0.6905, "step": 1261 }, { "epoch": 1.386241933269257, "grad_norm": 0.4795394837856293, "learning_rate": 0.00017924297924297921, "loss": 0.6193, "step": 1262 }, { "epoch": 1.3873403817108334, "grad_norm": 0.39672231674194336, "learning_rate": 0.0001791208791208791, "loss": 0.7833, "step": 1263 }, { "epoch": 1.3884388301524098, "grad_norm": 0.46752655506134033, "learning_rate": 0.00017899877899877899, "loss": 0.6385, "step": 1264 }, { "epoch": 1.389537278593986, "grad_norm": 0.5376736521720886, "learning_rate": 0.00017887667887667884, "loss": 0.6362, "step": 1265 }, { "epoch": 1.3906357270355623, "grad_norm": 0.5675904750823975, "learning_rate": 0.00017875457875457873, "loss": 0.7975, "step": 1266 }, { "epoch": 1.3917341754771386, "grad_norm": 0.5429015755653381, "learning_rate": 0.00017863247863247861, "loss": 0.5415, "step": 1267 }, { "epoch": 1.3928326239187148, "grad_norm": 0.3714626729488373, "learning_rate": 0.00017851037851037847, "loss": 0.7104, "step": 1268 }, { "epoch": 1.3939310723602911, "grad_norm": 0.7549324035644531, "learning_rate": 0.00017838827838827836, "loss": 0.698, "step": 1269 }, { "epoch": 1.3950295208018675, "grad_norm": 0.36867257952690125, "learning_rate": 0.00017826617826617827, "loss": 0.6019, "step": 1270 }, { "epoch": 1.3961279692434436, "grad_norm": 0.42439624667167664, "learning_rate": 0.00017814407814407813, "loss": 0.4626, "step": 1271 }, { "epoch": 1.39722641768502, "grad_norm": 0.4768877923488617, "learning_rate": 0.00017802197802197802, "loss": 0.671, "step": 1272 }, { "epoch": 1.3983248661265961, "grad_norm": 0.3415908217430115, "learning_rate": 0.0001778998778998779, "loss": 0.5904, "step": 1273 }, { "epoch": 1.3994233145681725, "grad_norm": 0.5370535850524902, "learning_rate": 0.00017777777777777776, "loss": 0.578, "step": 1274 }, { "epoch": 1.4005217630097486, "grad_norm": 0.61114901304245, "learning_rate": 0.00017765567765567764, "loss": 0.6498, "step": 1275 }, { "epoch": 1.401620211451325, "grad_norm": 0.3491772711277008, "learning_rate": 0.00017753357753357753, "loss": 0.6057, "step": 1276 }, { "epoch": 1.4027186598929013, "grad_norm": 0.4992705285549164, "learning_rate": 0.0001774114774114774, "loss": 0.8541, "step": 1277 }, { "epoch": 1.4038171083344775, "grad_norm": 0.5476379990577698, "learning_rate": 0.00017728937728937727, "loss": 0.5608, "step": 1278 }, { "epoch": 1.4049155567760538, "grad_norm": 0.6107895374298096, "learning_rate": 0.00017716727716727716, "loss": 0.7437, "step": 1279 }, { "epoch": 1.4060140052176302, "grad_norm": 0.510809600353241, "learning_rate": 0.00017704517704517702, "loss": 0.6569, "step": 1280 }, { "epoch": 1.4071124536592063, "grad_norm": 0.5050077438354492, "learning_rate": 0.0001769230769230769, "loss": 0.6566, "step": 1281 }, { "epoch": 1.4082109021007827, "grad_norm": 0.44812703132629395, "learning_rate": 0.0001768009768009768, "loss": 0.6557, "step": 1282 }, { "epoch": 1.4093093505423588, "grad_norm": 0.5216537714004517, "learning_rate": 0.00017667887667887665, "loss": 0.7311, "step": 1283 }, { "epoch": 1.4104077989839352, "grad_norm": 0.5608856081962585, "learning_rate": 0.00017655677655677656, "loss": 0.9001, "step": 1284 }, { "epoch": 1.4115062474255113, "grad_norm": 0.47205066680908203, "learning_rate": 0.0001764346764346764, "loss": 0.5214, "step": 1285 }, { "epoch": 1.4126046958670877, "grad_norm": 0.4073629081249237, "learning_rate": 0.0001763125763125763, "loss": 0.483, "step": 1286 }, { "epoch": 1.413703144308664, "grad_norm": 0.42381593585014343, "learning_rate": 0.0001761904761904762, "loss": 0.4895, "step": 1287 }, { "epoch": 1.4148015927502402, "grad_norm": 0.629356861114502, "learning_rate": 0.00017606837606837605, "loss": 0.4639, "step": 1288 }, { "epoch": 1.4159000411918166, "grad_norm": 0.3123486340045929, "learning_rate": 0.00017594627594627593, "loss": 0.4575, "step": 1289 }, { "epoch": 1.416998489633393, "grad_norm": 0.4163682460784912, "learning_rate": 0.00017582417582417582, "loss": 0.7511, "step": 1290 }, { "epoch": 1.418096938074969, "grad_norm": 0.5697455406188965, "learning_rate": 0.00017570207570207568, "loss": 0.5977, "step": 1291 }, { "epoch": 1.4191953865165454, "grad_norm": 0.39232510328292847, "learning_rate": 0.00017557997557997556, "loss": 0.6133, "step": 1292 }, { "epoch": 1.4202938349581218, "grad_norm": 0.5452993512153625, "learning_rate": 0.00017545787545787545, "loss": 0.6596, "step": 1293 }, { "epoch": 1.421392283399698, "grad_norm": 0.39080601930618286, "learning_rate": 0.0001753357753357753, "loss": 0.7422, "step": 1294 }, { "epoch": 1.4224907318412743, "grad_norm": 0.6513398289680481, "learning_rate": 0.0001752136752136752, "loss": 0.5277, "step": 1295 }, { "epoch": 1.4235891802828504, "grad_norm": 0.4627130329608917, "learning_rate": 0.00017509157509157508, "loss": 0.6296, "step": 1296 }, { "epoch": 1.4246876287244268, "grad_norm": 0.499700129032135, "learning_rate": 0.00017496947496947494, "loss": 0.689, "step": 1297 }, { "epoch": 1.425786077166003, "grad_norm": 0.4668709635734558, "learning_rate": 0.00017484737484737482, "loss": 0.784, "step": 1298 }, { "epoch": 1.4268845256075793, "grad_norm": 0.6378145217895508, "learning_rate": 0.00017472527472527473, "loss": 0.5077, "step": 1299 }, { "epoch": 1.4279829740491556, "grad_norm": 0.6320174336433411, "learning_rate": 0.00017460317460317457, "loss": 1.061, "step": 1300 }, { "epoch": 1.4290814224907318, "grad_norm": 0.48719078302383423, "learning_rate": 0.00017448107448107448, "loss": 0.7181, "step": 1301 }, { "epoch": 1.4301798709323081, "grad_norm": 0.5345287919044495, "learning_rate": 0.00017435897435897436, "loss": 0.5599, "step": 1302 }, { "epoch": 1.4312783193738845, "grad_norm": 0.567857563495636, "learning_rate": 0.00017423687423687422, "loss": 0.6294, "step": 1303 }, { "epoch": 1.4323767678154606, "grad_norm": 0.5715040564537048, "learning_rate": 0.0001741147741147741, "loss": 0.5326, "step": 1304 }, { "epoch": 1.433475216257037, "grad_norm": 0.40048834681510925, "learning_rate": 0.000173992673992674, "loss": 0.687, "step": 1305 }, { "epoch": 1.4345736646986131, "grad_norm": 0.4964540898799896, "learning_rate": 0.00017387057387057385, "loss": 0.6149, "step": 1306 }, { "epoch": 1.4356721131401895, "grad_norm": 0.5018569231033325, "learning_rate": 0.00017374847374847374, "loss": 0.4224, "step": 1307 }, { "epoch": 1.4367705615817656, "grad_norm": 0.6026094555854797, "learning_rate": 0.00017362637362637362, "loss": 0.8934, "step": 1308 }, { "epoch": 1.437869010023342, "grad_norm": 0.33409950137138367, "learning_rate": 0.00017350427350427348, "loss": 0.6725, "step": 1309 }, { "epoch": 1.4389674584649184, "grad_norm": 0.43982234597206116, "learning_rate": 0.00017338217338217337, "loss": 0.9203, "step": 1310 }, { "epoch": 1.4400659069064945, "grad_norm": 0.843877911567688, "learning_rate": 0.00017326007326007322, "loss": 0.6028, "step": 1311 }, { "epoch": 1.4411643553480709, "grad_norm": 0.35148733854293823, "learning_rate": 0.0001731379731379731, "loss": 0.7503, "step": 1312 }, { "epoch": 1.4422628037896472, "grad_norm": 0.4561845362186432, "learning_rate": 0.000173015873015873, "loss": 0.6577, "step": 1313 }, { "epoch": 1.4433612522312234, "grad_norm": 0.47295713424682617, "learning_rate": 0.00017289377289377285, "loss": 0.8013, "step": 1314 }, { "epoch": 1.4444597006727997, "grad_norm": 0.46340033411979675, "learning_rate": 0.00017277167277167277, "loss": 0.73, "step": 1315 }, { "epoch": 1.445558149114376, "grad_norm": 0.49221453070640564, "learning_rate": 0.00017264957264957265, "loss": 0.6735, "step": 1316 }, { "epoch": 1.4466565975559522, "grad_norm": 0.36250925064086914, "learning_rate": 0.0001725274725274725, "loss": 0.7463, "step": 1317 }, { "epoch": 1.4477550459975284, "grad_norm": 0.3832615911960602, "learning_rate": 0.0001724053724053724, "loss": 0.7295, "step": 1318 }, { "epoch": 1.4488534944391047, "grad_norm": 0.7413591742515564, "learning_rate": 0.00017228327228327228, "loss": 0.7627, "step": 1319 }, { "epoch": 1.449951942880681, "grad_norm": 0.45626765489578247, "learning_rate": 0.00017216117216117214, "loss": 0.727, "step": 1320 }, { "epoch": 1.4510503913222572, "grad_norm": 0.3024120330810547, "learning_rate": 0.00017203907203907202, "loss": 0.3986, "step": 1321 }, { "epoch": 1.4521488397638336, "grad_norm": 0.31635284423828125, "learning_rate": 0.0001719169719169719, "loss": 0.3469, "step": 1322 }, { "epoch": 1.45324728820541, "grad_norm": 0.36893391609191895, "learning_rate": 0.00017179487179487177, "loss": 0.7017, "step": 1323 }, { "epoch": 1.454345736646986, "grad_norm": 0.4804024398326874, "learning_rate": 0.00017167277167277165, "loss": 0.8811, "step": 1324 }, { "epoch": 1.4554441850885624, "grad_norm": 0.4446522295475006, "learning_rate": 0.00017155067155067154, "loss": 0.8027, "step": 1325 }, { "epoch": 1.4565426335301388, "grad_norm": 0.27936413884162903, "learning_rate": 0.0001714285714285714, "loss": 0.3846, "step": 1326 }, { "epoch": 1.457641081971715, "grad_norm": 0.3312259316444397, "learning_rate": 0.00017130647130647128, "loss": 0.4852, "step": 1327 }, { "epoch": 1.4587395304132913, "grad_norm": 0.4751642644405365, "learning_rate": 0.0001711843711843712, "loss": 0.7337, "step": 1328 }, { "epoch": 1.4598379788548674, "grad_norm": 0.5365067720413208, "learning_rate": 0.00017106227106227103, "loss": 0.8052, "step": 1329 }, { "epoch": 1.4609364272964438, "grad_norm": 0.5944942831993103, "learning_rate": 0.00017094017094017094, "loss": 0.7673, "step": 1330 }, { "epoch": 1.46203487573802, "grad_norm": 0.48244431614875793, "learning_rate": 0.00017081807081807083, "loss": 0.855, "step": 1331 }, { "epoch": 1.4631333241795963, "grad_norm": 0.32348135113716125, "learning_rate": 0.00017069597069597068, "loss": 0.5133, "step": 1332 }, { "epoch": 1.4642317726211727, "grad_norm": 0.6455866694450378, "learning_rate": 0.00017057387057387057, "loss": 0.6825, "step": 1333 }, { "epoch": 1.4653302210627488, "grad_norm": 0.3937522768974304, "learning_rate": 0.00017045177045177045, "loss": 0.6335, "step": 1334 }, { "epoch": 1.4664286695043252, "grad_norm": 0.33579352498054504, "learning_rate": 0.0001703296703296703, "loss": 0.4711, "step": 1335 }, { "epoch": 1.4675271179459015, "grad_norm": 0.5055533647537231, "learning_rate": 0.0001702075702075702, "loss": 0.6512, "step": 1336 }, { "epoch": 1.4686255663874777, "grad_norm": 0.40702182054519653, "learning_rate": 0.00017008547008547006, "loss": 0.8833, "step": 1337 }, { "epoch": 1.469724014829054, "grad_norm": 0.3574135899543762, "learning_rate": 0.00016996336996336994, "loss": 0.7127, "step": 1338 }, { "epoch": 1.4708224632706302, "grad_norm": 0.45641472935676575, "learning_rate": 0.00016984126984126983, "loss": 0.7258, "step": 1339 }, { "epoch": 1.4719209117122065, "grad_norm": 1.5012352466583252, "learning_rate": 0.0001697191697191697, "loss": 0.8065, "step": 1340 }, { "epoch": 1.4730193601537827, "grad_norm": 0.5025885701179504, "learning_rate": 0.00016959706959706957, "loss": 0.9377, "step": 1341 }, { "epoch": 1.474117808595359, "grad_norm": 0.2942202687263489, "learning_rate": 0.00016947496947496946, "loss": 0.5693, "step": 1342 }, { "epoch": 1.4752162570369354, "grad_norm": 0.48770126700401306, "learning_rate": 0.00016935286935286932, "loss": 0.5483, "step": 1343 }, { "epoch": 1.4763147054785115, "grad_norm": 0.3853349983692169, "learning_rate": 0.0001692307692307692, "loss": 0.5787, "step": 1344 }, { "epoch": 1.4774131539200879, "grad_norm": 0.3593169152736664, "learning_rate": 0.00016910866910866911, "loss": 0.6426, "step": 1345 }, { "epoch": 1.4785116023616642, "grad_norm": 0.5932713150978088, "learning_rate": 0.00016898656898656897, "loss": 0.7543, "step": 1346 }, { "epoch": 1.4796100508032404, "grad_norm": 0.43406638503074646, "learning_rate": 0.00016886446886446886, "loss": 0.7868, "step": 1347 }, { "epoch": 1.4807084992448167, "grad_norm": 0.38596048951148987, "learning_rate": 0.00016874236874236874, "loss": 0.49, "step": 1348 }, { "epoch": 1.481806947686393, "grad_norm": 0.42844533920288086, "learning_rate": 0.0001686202686202686, "loss": 0.6485, "step": 1349 }, { "epoch": 1.4829053961279692, "grad_norm": 0.5165280103683472, "learning_rate": 0.0001684981684981685, "loss": 0.6924, "step": 1350 }, { "epoch": 1.4840038445695456, "grad_norm": 0.5717988610267639, "learning_rate": 0.00016837606837606837, "loss": 0.5624, "step": 1351 }, { "epoch": 1.4851022930111217, "grad_norm": 0.4384293556213379, "learning_rate": 0.00016825396825396823, "loss": 0.7895, "step": 1352 }, { "epoch": 1.486200741452698, "grad_norm": 0.5472243428230286, "learning_rate": 0.00016813186813186812, "loss": 0.8838, "step": 1353 }, { "epoch": 1.4872991898942742, "grad_norm": 0.3903232216835022, "learning_rate": 0.000168009768009768, "loss": 0.5452, "step": 1354 }, { "epoch": 1.4883976383358506, "grad_norm": 0.3799583613872528, "learning_rate": 0.00016788766788766786, "loss": 0.8931, "step": 1355 }, { "epoch": 1.489496086777427, "grad_norm": 0.4481349289417267, "learning_rate": 0.00016776556776556775, "loss": 0.5956, "step": 1356 }, { "epoch": 1.490594535219003, "grad_norm": 0.45875266194343567, "learning_rate": 0.00016764346764346763, "loss": 0.4729, "step": 1357 }, { "epoch": 1.4916929836605795, "grad_norm": 0.494112104177475, "learning_rate": 0.0001675213675213675, "loss": 0.6416, "step": 1358 }, { "epoch": 1.4927914321021558, "grad_norm": 0.3976772725582123, "learning_rate": 0.0001673992673992674, "loss": 0.6601, "step": 1359 }, { "epoch": 1.493889880543732, "grad_norm": 0.29009610414505005, "learning_rate": 0.0001672771672771673, "loss": 0.4261, "step": 1360 }, { "epoch": 1.4949883289853083, "grad_norm": 0.5540419816970825, "learning_rate": 0.00016715506715506715, "loss": 0.8206, "step": 1361 }, { "epoch": 1.4960867774268845, "grad_norm": 0.41308313608169556, "learning_rate": 0.00016703296703296703, "loss": 0.7862, "step": 1362 }, { "epoch": 1.4971852258684608, "grad_norm": 0.6565150618553162, "learning_rate": 0.0001669108669108669, "loss": 0.6963, "step": 1363 }, { "epoch": 1.498283674310037, "grad_norm": 0.4901321530342102, "learning_rate": 0.00016678876678876678, "loss": 0.7063, "step": 1364 }, { "epoch": 1.4993821227516133, "grad_norm": 0.4676086902618408, "learning_rate": 0.00016666666666666666, "loss": 0.5142, "step": 1365 }, { "epoch": 1.5004805711931897, "grad_norm": 0.4745628833770752, "learning_rate": 0.00016654456654456652, "loss": 0.7659, "step": 1366 }, { "epoch": 1.5015790196347658, "grad_norm": 0.42693057656288147, "learning_rate": 0.0001664224664224664, "loss": 0.9233, "step": 1367 }, { "epoch": 1.5026774680763422, "grad_norm": 0.4110391139984131, "learning_rate": 0.0001663003663003663, "loss": 0.5062, "step": 1368 }, { "epoch": 1.5037759165179185, "grad_norm": 0.3090996742248535, "learning_rate": 0.00016617826617826615, "loss": 0.4462, "step": 1369 }, { "epoch": 1.5048743649594947, "grad_norm": 0.42027410864830017, "learning_rate": 0.00016605616605616603, "loss": 0.8589, "step": 1370 }, { "epoch": 1.505972813401071, "grad_norm": 0.38396796584129333, "learning_rate": 0.00016593406593406592, "loss": 0.6609, "step": 1371 }, { "epoch": 1.5070712618426474, "grad_norm": 0.5236012935638428, "learning_rate": 0.00016581196581196578, "loss": 0.6506, "step": 1372 }, { "epoch": 1.5081697102842235, "grad_norm": 0.7232113480567932, "learning_rate": 0.00016568986568986566, "loss": 0.6689, "step": 1373 }, { "epoch": 1.5092681587257997, "grad_norm": 0.4777502417564392, "learning_rate": 0.00016556776556776558, "loss": 0.5701, "step": 1374 }, { "epoch": 1.510366607167376, "grad_norm": 0.39154767990112305, "learning_rate": 0.0001654456654456654, "loss": 0.4906, "step": 1375 }, { "epoch": 1.5114650556089524, "grad_norm": 0.469382107257843, "learning_rate": 0.00016532356532356532, "loss": 0.5768, "step": 1376 }, { "epoch": 1.5125635040505285, "grad_norm": 0.3485945761203766, "learning_rate": 0.0001652014652014652, "loss": 0.7814, "step": 1377 }, { "epoch": 1.513661952492105, "grad_norm": 0.4375949203968048, "learning_rate": 0.00016507936507936506, "loss": 0.6328, "step": 1378 }, { "epoch": 1.5147604009336813, "grad_norm": 0.47778064012527466, "learning_rate": 0.00016495726495726495, "loss": 0.635, "step": 1379 }, { "epoch": 1.5158588493752574, "grad_norm": 0.3515126705169678, "learning_rate": 0.00016483516483516484, "loss": 0.7014, "step": 1380 }, { "epoch": 1.5169572978168337, "grad_norm": 0.3710018992424011, "learning_rate": 0.0001647130647130647, "loss": 0.7903, "step": 1381 }, { "epoch": 1.51805574625841, "grad_norm": 0.37630394101142883, "learning_rate": 0.00016459096459096458, "loss": 0.5446, "step": 1382 }, { "epoch": 1.5191541946999862, "grad_norm": 0.4312807321548462, "learning_rate": 0.00016446886446886446, "loss": 0.6101, "step": 1383 }, { "epoch": 1.5202526431415624, "grad_norm": 0.399384468793869, "learning_rate": 0.00016434676434676432, "loss": 0.5734, "step": 1384 }, { "epoch": 1.521351091583139, "grad_norm": 0.41233471035957336, "learning_rate": 0.0001642246642246642, "loss": 0.6525, "step": 1385 }, { "epoch": 1.522449540024715, "grad_norm": 0.5215228199958801, "learning_rate": 0.0001641025641025641, "loss": 0.4804, "step": 1386 }, { "epoch": 1.5235479884662912, "grad_norm": 0.42069393396377563, "learning_rate": 0.00016398046398046395, "loss": 0.5517, "step": 1387 }, { "epoch": 1.5246464369078676, "grad_norm": 1.7902978658676147, "learning_rate": 0.00016385836385836384, "loss": 0.6295, "step": 1388 }, { "epoch": 1.525744885349444, "grad_norm": 0.7353507280349731, "learning_rate": 0.0001637362637362637, "loss": 1.0585, "step": 1389 }, { "epoch": 1.52684333379102, "grad_norm": 0.45992404222488403, "learning_rate": 0.0001636141636141636, "loss": 0.7671, "step": 1390 }, { "epoch": 1.5279417822325965, "grad_norm": 0.3927334249019623, "learning_rate": 0.0001634920634920635, "loss": 0.7479, "step": 1391 }, { "epoch": 1.5290402306741728, "grad_norm": 0.32833003997802734, "learning_rate": 0.00016336996336996335, "loss": 0.5774, "step": 1392 }, { "epoch": 1.530138679115749, "grad_norm": 0.4306529462337494, "learning_rate": 0.00016324786324786324, "loss": 0.6317, "step": 1393 }, { "epoch": 1.5312371275573253, "grad_norm": 0.5411052703857422, "learning_rate": 0.00016312576312576312, "loss": 0.6637, "step": 1394 }, { "epoch": 1.5323355759989017, "grad_norm": 0.633800745010376, "learning_rate": 0.00016300366300366298, "loss": 0.7145, "step": 1395 }, { "epoch": 1.5334340244404778, "grad_norm": 0.6986578702926636, "learning_rate": 0.00016288156288156287, "loss": 0.7194, "step": 1396 }, { "epoch": 1.534532472882054, "grad_norm": 0.5223686695098877, "learning_rate": 0.00016275946275946275, "loss": 0.7849, "step": 1397 }, { "epoch": 1.5356309213236303, "grad_norm": 0.5342483520507812, "learning_rate": 0.0001626373626373626, "loss": 0.8885, "step": 1398 }, { "epoch": 1.5367293697652067, "grad_norm": 0.5467656850814819, "learning_rate": 0.0001625152625152625, "loss": 0.6265, "step": 1399 }, { "epoch": 1.5378278182067828, "grad_norm": 0.4483658969402313, "learning_rate": 0.00016239316239316238, "loss": 0.7133, "step": 1400 }, { "epoch": 1.5389262666483592, "grad_norm": 0.5714216232299805, "learning_rate": 0.00016227106227106224, "loss": 0.5212, "step": 1401 }, { "epoch": 1.5400247150899355, "grad_norm": 0.5487145781517029, "learning_rate": 0.00016214896214896213, "loss": 0.6276, "step": 1402 }, { "epoch": 1.5411231635315117, "grad_norm": 0.3687078654766083, "learning_rate": 0.00016202686202686204, "loss": 0.7512, "step": 1403 }, { "epoch": 1.542221611973088, "grad_norm": 0.3596762418746948, "learning_rate": 0.00016190476190476187, "loss": 0.7192, "step": 1404 }, { "epoch": 1.5433200604146644, "grad_norm": 0.4092305898666382, "learning_rate": 0.00016178266178266178, "loss": 0.7339, "step": 1405 }, { "epoch": 1.5444185088562405, "grad_norm": 0.4018193483352661, "learning_rate": 0.00016166056166056167, "loss": 0.7213, "step": 1406 }, { "epoch": 1.5455169572978167, "grad_norm": 0.4993208646774292, "learning_rate": 0.00016153846153846153, "loss": 0.6362, "step": 1407 }, { "epoch": 1.5466154057393933, "grad_norm": 0.3958855867385864, "learning_rate": 0.0001614163614163614, "loss": 0.8482, "step": 1408 }, { "epoch": 1.5477138541809694, "grad_norm": 0.32689765095710754, "learning_rate": 0.0001612942612942613, "loss": 0.6583, "step": 1409 }, { "epoch": 1.5488123026225455, "grad_norm": 0.48947611451148987, "learning_rate": 0.00016117216117216116, "loss": 0.6707, "step": 1410 }, { "epoch": 1.549910751064122, "grad_norm": 0.3446139395236969, "learning_rate": 0.00016105006105006104, "loss": 0.8914, "step": 1411 }, { "epoch": 1.5510091995056983, "grad_norm": 0.585746705532074, "learning_rate": 0.0001609279609279609, "loss": 0.5413, "step": 1412 }, { "epoch": 1.5521076479472744, "grad_norm": 0.6561328172683716, "learning_rate": 0.00016080586080586079, "loss": 0.3728, "step": 1413 }, { "epoch": 1.5532060963888508, "grad_norm": 0.47158828377723694, "learning_rate": 0.00016068376068376067, "loss": 0.6525, "step": 1414 }, { "epoch": 1.5543045448304271, "grad_norm": 0.3676914572715759, "learning_rate": 0.00016056166056166053, "loss": 0.7395, "step": 1415 }, { "epoch": 1.5554029932720033, "grad_norm": 0.608076810836792, "learning_rate": 0.00016043956043956041, "loss": 0.5289, "step": 1416 }, { "epoch": 1.5565014417135794, "grad_norm": 0.44940462708473206, "learning_rate": 0.0001603174603174603, "loss": 0.6282, "step": 1417 }, { "epoch": 1.557599890155156, "grad_norm": 0.48062869906425476, "learning_rate": 0.00016019536019536016, "loss": 0.7438, "step": 1418 }, { "epoch": 1.5586983385967321, "grad_norm": 0.43834635615348816, "learning_rate": 0.00016007326007326004, "loss": 0.4248, "step": 1419 }, { "epoch": 1.5597967870383083, "grad_norm": 0.5203731060028076, "learning_rate": 0.00015995115995115996, "loss": 0.91, "step": 1420 }, { "epoch": 1.5608952354798846, "grad_norm": 0.5766960978507996, "learning_rate": 0.00015982905982905981, "loss": 0.7211, "step": 1421 }, { "epoch": 1.561993683921461, "grad_norm": 0.3048666715621948, "learning_rate": 0.0001597069597069597, "loss": 0.5618, "step": 1422 }, { "epoch": 1.5630921323630371, "grad_norm": 0.3916679322719574, "learning_rate": 0.00015958485958485959, "loss": 0.6954, "step": 1423 }, { "epoch": 1.5641905808046135, "grad_norm": 0.6336612105369568, "learning_rate": 0.00015946275946275944, "loss": 0.6368, "step": 1424 }, { "epoch": 1.5652890292461898, "grad_norm": 0.8314816355705261, "learning_rate": 0.00015934065934065933, "loss": 0.7633, "step": 1425 }, { "epoch": 1.566387477687766, "grad_norm": 0.46973487734794617, "learning_rate": 0.00015921855921855922, "loss": 0.6915, "step": 1426 }, { "epoch": 1.5674859261293423, "grad_norm": 0.48737633228302, "learning_rate": 0.00015909645909645907, "loss": 0.5346, "step": 1427 }, { "epoch": 1.5685843745709187, "grad_norm": 0.548876941204071, "learning_rate": 0.00015897435897435896, "loss": 1.0449, "step": 1428 }, { "epoch": 1.5696828230124948, "grad_norm": 0.5039654970169067, "learning_rate": 0.00015885225885225884, "loss": 0.9953, "step": 1429 }, { "epoch": 1.570781271454071, "grad_norm": 0.7233378887176514, "learning_rate": 0.0001587301587301587, "loss": 0.7068, "step": 1430 }, { "epoch": 1.5718797198956473, "grad_norm": 0.5767638683319092, "learning_rate": 0.0001586080586080586, "loss": 0.8055, "step": 1431 }, { "epoch": 1.5729781683372237, "grad_norm": 0.34450021386146545, "learning_rate": 0.00015848595848595847, "loss": 0.726, "step": 1432 }, { "epoch": 1.5740766167787998, "grad_norm": 0.8474962711334229, "learning_rate": 0.00015836385836385833, "loss": 0.6974, "step": 1433 }, { "epoch": 1.5751750652203762, "grad_norm": 1.565746545791626, "learning_rate": 0.00015824175824175824, "loss": 0.7766, "step": 1434 }, { "epoch": 1.5762735136619526, "grad_norm": 0.4393616020679474, "learning_rate": 0.00015811965811965813, "loss": 0.6071, "step": 1435 }, { "epoch": 1.5773719621035287, "grad_norm": 0.5209214091300964, "learning_rate": 0.000157997557997558, "loss": 0.7546, "step": 1436 }, { "epoch": 1.578470410545105, "grad_norm": 0.6069398522377014, "learning_rate": 0.00015787545787545787, "loss": 0.7322, "step": 1437 }, { "epoch": 1.5795688589866814, "grad_norm": 0.6168296337127686, "learning_rate": 0.00015775335775335773, "loss": 0.5169, "step": 1438 }, { "epoch": 1.5806673074282576, "grad_norm": 0.25368016958236694, "learning_rate": 0.00015763125763125762, "loss": 0.4838, "step": 1439 }, { "epoch": 1.5817657558698337, "grad_norm": 0.4165039360523224, "learning_rate": 0.0001575091575091575, "loss": 1.0135, "step": 1440 }, { "epoch": 1.5828642043114103, "grad_norm": 0.4596197307109833, "learning_rate": 0.00015738705738705736, "loss": 0.5545, "step": 1441 }, { "epoch": 1.5839626527529864, "grad_norm": 0.5077592730522156, "learning_rate": 0.00015726495726495725, "loss": 0.7754, "step": 1442 }, { "epoch": 1.5850611011945626, "grad_norm": 0.5041285157203674, "learning_rate": 0.00015714285714285713, "loss": 0.8384, "step": 1443 }, { "epoch": 1.586159549636139, "grad_norm": 0.40924420952796936, "learning_rate": 0.000157020757020757, "loss": 0.5511, "step": 1444 }, { "epoch": 1.5872579980777153, "grad_norm": 0.4800551235675812, "learning_rate": 0.00015689865689865688, "loss": 0.6154, "step": 1445 }, { "epoch": 1.5883564465192914, "grad_norm": 0.433174729347229, "learning_rate": 0.00015677655677655676, "loss": 0.6158, "step": 1446 }, { "epoch": 1.5894548949608678, "grad_norm": 0.29649895429611206, "learning_rate": 0.00015665445665445662, "loss": 0.5729, "step": 1447 }, { "epoch": 1.5905533434024441, "grad_norm": 0.3815969228744507, "learning_rate": 0.0001565323565323565, "loss": 0.6748, "step": 1448 }, { "epoch": 1.5916517918440203, "grad_norm": 0.4933919608592987, "learning_rate": 0.00015641025641025642, "loss": 0.7683, "step": 1449 }, { "epoch": 1.5927502402855966, "grad_norm": 0.5053071975708008, "learning_rate": 0.00015628815628815625, "loss": 0.6779, "step": 1450 }, { "epoch": 1.593848688727173, "grad_norm": 0.3900013566017151, "learning_rate": 0.00015616605616605616, "loss": 0.6326, "step": 1451 }, { "epoch": 1.5949471371687491, "grad_norm": 0.5823982357978821, "learning_rate": 0.00015604395604395605, "loss": 0.6104, "step": 1452 }, { "epoch": 1.5960455856103253, "grad_norm": 0.5277792811393738, "learning_rate": 0.0001559218559218559, "loss": 0.6647, "step": 1453 }, { "epoch": 1.5971440340519016, "grad_norm": 0.32926440238952637, "learning_rate": 0.0001557997557997558, "loss": 0.6064, "step": 1454 }, { "epoch": 1.598242482493478, "grad_norm": 0.7350378036499023, "learning_rate": 0.00015567765567765568, "loss": 0.7951, "step": 1455 }, { "epoch": 1.5993409309350541, "grad_norm": 0.4125807285308838, "learning_rate": 0.00015555555555555554, "loss": 0.7761, "step": 1456 }, { "epoch": 1.6004393793766305, "grad_norm": 0.49707722663879395, "learning_rate": 0.00015543345543345542, "loss": 0.7299, "step": 1457 }, { "epoch": 1.6015378278182069, "grad_norm": 0.3240358829498291, "learning_rate": 0.0001553113553113553, "loss": 0.4832, "step": 1458 }, { "epoch": 1.602636276259783, "grad_norm": 0.44430434703826904, "learning_rate": 0.00015518925518925517, "loss": 0.5968, "step": 1459 }, { "epoch": 1.6037347247013594, "grad_norm": 0.3702992796897888, "learning_rate": 0.00015506715506715505, "loss": 0.7177, "step": 1460 }, { "epoch": 1.6048331731429357, "grad_norm": 0.5001052618026733, "learning_rate": 0.00015494505494505494, "loss": 0.7448, "step": 1461 }, { "epoch": 1.6059316215845119, "grad_norm": 0.45969969034194946, "learning_rate": 0.0001548229548229548, "loss": 0.8292, "step": 1462 }, { "epoch": 1.607030070026088, "grad_norm": 0.46075674891471863, "learning_rate": 0.00015470085470085468, "loss": 0.5624, "step": 1463 }, { "epoch": 1.6081285184676646, "grad_norm": 2.077080488204956, "learning_rate": 0.00015457875457875454, "loss": 0.6643, "step": 1464 }, { "epoch": 1.6092269669092407, "grad_norm": 0.46008172631263733, "learning_rate": 0.00015445665445665445, "loss": 0.6329, "step": 1465 }, { "epoch": 1.6103254153508169, "grad_norm": 0.5016405582427979, "learning_rate": 0.00015433455433455434, "loss": 0.7692, "step": 1466 }, { "epoch": 1.6114238637923932, "grad_norm": 0.46292269229888916, "learning_rate": 0.0001542124542124542, "loss": 0.6485, "step": 1467 }, { "epoch": 1.6125223122339696, "grad_norm": 0.4498538672924042, "learning_rate": 0.00015409035409035408, "loss": 0.598, "step": 1468 }, { "epoch": 1.6136207606755457, "grad_norm": 0.3537295162677765, "learning_rate": 0.00015396825396825397, "loss": 0.6356, "step": 1469 }, { "epoch": 1.614719209117122, "grad_norm": 0.9966747164726257, "learning_rate": 0.00015384615384615382, "loss": 0.6627, "step": 1470 }, { "epoch": 1.6158176575586984, "grad_norm": 0.9386951327323914, "learning_rate": 0.0001537240537240537, "loss": 0.8148, "step": 1471 }, { "epoch": 1.6169161060002746, "grad_norm": 0.3452979028224945, "learning_rate": 0.0001536019536019536, "loss": 0.5778, "step": 1472 }, { "epoch": 1.618014554441851, "grad_norm": 0.3443523049354553, "learning_rate": 0.00015347985347985345, "loss": 0.9228, "step": 1473 }, { "epoch": 1.6191130028834273, "grad_norm": 0.5345872044563293, "learning_rate": 0.00015335775335775334, "loss": 0.4682, "step": 1474 }, { "epoch": 1.6202114513250034, "grad_norm": 0.35112351179122925, "learning_rate": 0.00015323565323565322, "loss": 0.5482, "step": 1475 }, { "epoch": 1.6213098997665796, "grad_norm": 0.39090535044670105, "learning_rate": 0.00015311355311355308, "loss": 0.825, "step": 1476 }, { "epoch": 1.622408348208156, "grad_norm": 1.1684538125991821, "learning_rate": 0.00015299145299145297, "loss": 0.6561, "step": 1477 }, { "epoch": 1.6235067966497323, "grad_norm": 0.4006233513355255, "learning_rate": 0.00015286935286935288, "loss": 0.3647, "step": 1478 }, { "epoch": 1.6246052450913084, "grad_norm": 0.30577126145362854, "learning_rate": 0.0001527472527472527, "loss": 0.4934, "step": 1479 }, { "epoch": 1.6257036935328848, "grad_norm": 0.39927995204925537, "learning_rate": 0.00015262515262515263, "loss": 0.6028, "step": 1480 }, { "epoch": 1.6268021419744612, "grad_norm": 0.49143150448799133, "learning_rate": 0.0001525030525030525, "loss": 0.4595, "step": 1481 }, { "epoch": 1.6279005904160373, "grad_norm": 0.8603225946426392, "learning_rate": 0.00015238095238095237, "loss": 0.8617, "step": 1482 }, { "epoch": 1.6289990388576137, "grad_norm": 0.534269392490387, "learning_rate": 0.00015225885225885225, "loss": 0.6648, "step": 1483 }, { "epoch": 1.63009748729919, "grad_norm": 0.4987354278564453, "learning_rate": 0.00015213675213675214, "loss": 0.5908, "step": 1484 }, { "epoch": 1.6311959357407662, "grad_norm": 0.5739774107933044, "learning_rate": 0.000152014652014652, "loss": 0.7652, "step": 1485 }, { "epoch": 1.6322943841823423, "grad_norm": 0.5343801975250244, "learning_rate": 0.00015189255189255188, "loss": 0.6864, "step": 1486 }, { "epoch": 1.6333928326239189, "grad_norm": 0.45683905482292175, "learning_rate": 0.00015177045177045177, "loss": 0.7179, "step": 1487 }, { "epoch": 1.634491281065495, "grad_norm": 0.5020450949668884, "learning_rate": 0.00015164835164835163, "loss": 0.4356, "step": 1488 }, { "epoch": 1.6355897295070712, "grad_norm": 0.3870914876461029, "learning_rate": 0.0001515262515262515, "loss": 0.692, "step": 1489 }, { "epoch": 1.6366881779486475, "grad_norm": 0.5256255269050598, "learning_rate": 0.00015140415140415137, "loss": 0.7184, "step": 1490 }, { "epoch": 1.6377866263902239, "grad_norm": 0.27588197588920593, "learning_rate": 0.00015128205128205126, "loss": 0.6928, "step": 1491 }, { "epoch": 1.6388850748318, "grad_norm": 0.43336692452430725, "learning_rate": 0.00015115995115995114, "loss": 0.7357, "step": 1492 }, { "epoch": 1.6399835232733764, "grad_norm": 0.7952486872673035, "learning_rate": 0.000151037851037851, "loss": 0.5536, "step": 1493 }, { "epoch": 1.6410819717149527, "grad_norm": 3.8659090995788574, "learning_rate": 0.00015091575091575089, "loss": 0.6409, "step": 1494 }, { "epoch": 1.6421804201565289, "grad_norm": 0.3824027478694916, "learning_rate": 0.0001507936507936508, "loss": 0.5988, "step": 1495 }, { "epoch": 1.643278868598105, "grad_norm": 0.45106491446495056, "learning_rate": 0.00015067155067155066, "loss": 0.7568, "step": 1496 }, { "epoch": 1.6443773170396816, "grad_norm": 0.719417154788971, "learning_rate": 0.00015054945054945054, "loss": 0.8191, "step": 1497 }, { "epoch": 1.6454757654812577, "grad_norm": 0.4702167212963104, "learning_rate": 0.00015042735042735043, "loss": 0.6761, "step": 1498 }, { "epoch": 1.6465742139228339, "grad_norm": 0.49441996216773987, "learning_rate": 0.0001503052503052503, "loss": 0.7323, "step": 1499 }, { "epoch": 1.6476726623644102, "grad_norm": 0.623470664024353, "learning_rate": 0.00015018315018315017, "loss": 0.8384, "step": 1500 }, { "epoch": 1.6487711108059866, "grad_norm": 0.5583334565162659, "learning_rate": 0.00015006105006105006, "loss": 0.8238, "step": 1501 }, { "epoch": 1.6498695592475627, "grad_norm": 0.4803924560546875, "learning_rate": 0.00014993894993894994, "loss": 0.5322, "step": 1502 }, { "epoch": 1.650968007689139, "grad_norm": 0.709605872631073, "learning_rate": 0.0001498168498168498, "loss": 0.8254, "step": 1503 }, { "epoch": 1.6520664561307155, "grad_norm": 0.48047375679016113, "learning_rate": 0.0001496947496947497, "loss": 0.5263, "step": 1504 }, { "epoch": 1.6531649045722916, "grad_norm": 0.41796261072158813, "learning_rate": 0.00014957264957264957, "loss": 0.5803, "step": 1505 }, { "epoch": 1.654263353013868, "grad_norm": 0.7576707601547241, "learning_rate": 0.00014945054945054943, "loss": 0.545, "step": 1506 }, { "epoch": 1.6553618014554443, "grad_norm": 0.4668630063533783, "learning_rate": 0.00014932844932844932, "loss": 0.6213, "step": 1507 }, { "epoch": 1.6564602498970205, "grad_norm": 0.9730806350708008, "learning_rate": 0.00014920634920634917, "loss": 0.5415, "step": 1508 }, { "epoch": 1.6575586983385966, "grad_norm": 0.39670151472091675, "learning_rate": 0.0001490842490842491, "loss": 0.7931, "step": 1509 }, { "epoch": 1.658657146780173, "grad_norm": 0.6003556847572327, "learning_rate": 0.00014896214896214895, "loss": 0.7494, "step": 1510 }, { "epoch": 1.6597555952217493, "grad_norm": 0.4335152506828308, "learning_rate": 0.00014884004884004883, "loss": 0.7003, "step": 1511 }, { "epoch": 1.6608540436633255, "grad_norm": 0.34025630354881287, "learning_rate": 0.00014871794871794872, "loss": 0.9012, "step": 1512 }, { "epoch": 1.6619524921049018, "grad_norm": 0.403934508562088, "learning_rate": 0.00014859584859584858, "loss": 0.717, "step": 1513 }, { "epoch": 1.6630509405464782, "grad_norm": 0.45691147446632385, "learning_rate": 0.00014847374847374846, "loss": 0.4833, "step": 1514 }, { "epoch": 1.6641493889880543, "grad_norm": 0.42266151309013367, "learning_rate": 0.00014835164835164835, "loss": 0.5892, "step": 1515 }, { "epoch": 1.6652478374296307, "grad_norm": 0.392337441444397, "learning_rate": 0.0001482295482295482, "loss": 0.7748, "step": 1516 }, { "epoch": 1.666346285871207, "grad_norm": 0.352081298828125, "learning_rate": 0.0001481074481074481, "loss": 0.6018, "step": 1517 }, { "epoch": 1.6674447343127832, "grad_norm": 0.46293389797210693, "learning_rate": 0.00014798534798534798, "loss": 0.4696, "step": 1518 }, { "epoch": 1.6685431827543593, "grad_norm": 0.6427372097969055, "learning_rate": 0.00014786324786324786, "loss": 0.7279, "step": 1519 }, { "epoch": 1.669641631195936, "grad_norm": 0.500382125377655, "learning_rate": 0.00014774114774114772, "loss": 0.7395, "step": 1520 }, { "epoch": 1.670740079637512, "grad_norm": 0.4410606920719147, "learning_rate": 0.0001476190476190476, "loss": 0.501, "step": 1521 }, { "epoch": 1.6718385280790882, "grad_norm": 0.5587645769119263, "learning_rate": 0.0001474969474969475, "loss": 0.8655, "step": 1522 }, { "epoch": 1.6729369765206645, "grad_norm": 0.4312286376953125, "learning_rate": 0.00014737484737484735, "loss": 0.9578, "step": 1523 }, { "epoch": 1.674035424962241, "grad_norm": 0.48694175481796265, "learning_rate": 0.00014725274725274723, "loss": 0.6806, "step": 1524 }, { "epoch": 1.675133873403817, "grad_norm": 0.39892563223838806, "learning_rate": 0.00014713064713064712, "loss": 0.598, "step": 1525 }, { "epoch": 1.6762323218453934, "grad_norm": 0.4714735150337219, "learning_rate": 0.000147008547008547, "loss": 0.9637, "step": 1526 }, { "epoch": 1.6773307702869698, "grad_norm": 0.8308823108673096, "learning_rate": 0.00014688644688644686, "loss": 0.7886, "step": 1527 }, { "epoch": 1.678429218728546, "grad_norm": 0.5142358541488647, "learning_rate": 0.00014676434676434675, "loss": 0.8028, "step": 1528 }, { "epoch": 1.6795276671701223, "grad_norm": 0.4001234471797943, "learning_rate": 0.00014664224664224663, "loss": 0.59, "step": 1529 }, { "epoch": 1.6806261156116986, "grad_norm": 0.4112735688686371, "learning_rate": 0.0001465201465201465, "loss": 0.6523, "step": 1530 }, { "epoch": 1.6817245640532748, "grad_norm": 0.4391016960144043, "learning_rate": 0.0001463980463980464, "loss": 0.7372, "step": 1531 }, { "epoch": 1.682823012494851, "grad_norm": 0.7199782133102417, "learning_rate": 0.00014627594627594626, "loss": 0.8493, "step": 1532 }, { "epoch": 1.6839214609364273, "grad_norm": 0.42379269003868103, "learning_rate": 0.00014615384615384615, "loss": 0.6609, "step": 1533 }, { "epoch": 1.6850199093780036, "grad_norm": 0.41174909472465515, "learning_rate": 0.000146031746031746, "loss": 0.7021, "step": 1534 }, { "epoch": 1.6861183578195797, "grad_norm": 0.4856640100479126, "learning_rate": 0.0001459096459096459, "loss": 0.6055, "step": 1535 }, { "epoch": 1.687216806261156, "grad_norm": 0.5789656043052673, "learning_rate": 0.00014578754578754578, "loss": 0.7003, "step": 1536 }, { "epoch": 1.6883152547027325, "grad_norm": 0.5711427330970764, "learning_rate": 0.00014566544566544564, "loss": 0.5762, "step": 1537 }, { "epoch": 1.6894137031443086, "grad_norm": 0.3285518288612366, "learning_rate": 0.00014554334554334552, "loss": 0.6232, "step": 1538 }, { "epoch": 1.690512151585885, "grad_norm": 0.48425230383872986, "learning_rate": 0.0001454212454212454, "loss": 0.5515, "step": 1539 }, { "epoch": 1.6916106000274613, "grad_norm": 0.573079526424408, "learning_rate": 0.0001452991452991453, "loss": 0.7776, "step": 1540 }, { "epoch": 1.6927090484690375, "grad_norm": 0.49084943532943726, "learning_rate": 0.00014517704517704518, "loss": 0.6504, "step": 1541 }, { "epoch": 1.6938074969106136, "grad_norm": 0.46472617983818054, "learning_rate": 0.00014505494505494504, "loss": 0.6971, "step": 1542 }, { "epoch": 1.6949059453521902, "grad_norm": 0.4890255033969879, "learning_rate": 0.00014493284493284492, "loss": 0.9292, "step": 1543 }, { "epoch": 1.6960043937937663, "grad_norm": 0.42868301272392273, "learning_rate": 0.0001448107448107448, "loss": 0.6024, "step": 1544 }, { "epoch": 1.6971028422353425, "grad_norm": 0.5118973255157471, "learning_rate": 0.00014468864468864467, "loss": 0.7598, "step": 1545 }, { "epoch": 1.6982012906769188, "grad_norm": 0.40809181332588196, "learning_rate": 0.00014456654456654455, "loss": 0.5157, "step": 1546 }, { "epoch": 1.6992997391184952, "grad_norm": 0.5236404538154602, "learning_rate": 0.0001444444444444444, "loss": 0.84, "step": 1547 }, { "epoch": 1.7003981875600713, "grad_norm": 0.5712966322898865, "learning_rate": 0.00014432234432234432, "loss": 0.7208, "step": 1548 }, { "epoch": 1.7014966360016477, "grad_norm": 0.2910475730895996, "learning_rate": 0.00014420024420024418, "loss": 0.4998, "step": 1549 }, { "epoch": 1.702595084443224, "grad_norm": 0.5326736569404602, "learning_rate": 0.00014407814407814407, "loss": 0.5492, "step": 1550 }, { "epoch": 1.7036935328848002, "grad_norm": 0.5454451441764832, "learning_rate": 0.00014395604395604395, "loss": 0.9016, "step": 1551 }, { "epoch": 1.7047919813263763, "grad_norm": 0.45031625032424927, "learning_rate": 0.0001438339438339438, "loss": 0.671, "step": 1552 }, { "epoch": 1.705890429767953, "grad_norm": 0.5496229529380798, "learning_rate": 0.0001437118437118437, "loss": 0.6333, "step": 1553 }, { "epoch": 1.706988878209529, "grad_norm": 0.4200669825077057, "learning_rate": 0.00014358974358974358, "loss": 0.6158, "step": 1554 }, { "epoch": 1.7080873266511052, "grad_norm": 0.7623536586761475, "learning_rate": 0.00014346764346764347, "loss": 0.686, "step": 1555 }, { "epoch": 1.7091857750926815, "grad_norm": 0.3363445997238159, "learning_rate": 0.00014334554334554333, "loss": 0.305, "step": 1556 }, { "epoch": 1.710284223534258, "grad_norm": 0.5042807459831238, "learning_rate": 0.0001432234432234432, "loss": 0.72, "step": 1557 }, { "epoch": 1.711382671975834, "grad_norm": 0.5264353156089783, "learning_rate": 0.0001431013431013431, "loss": 0.6778, "step": 1558 }, { "epoch": 1.7124811204174104, "grad_norm": 0.48960715532302856, "learning_rate": 0.00014297924297924296, "loss": 0.4935, "step": 1559 }, { "epoch": 1.7135795688589868, "grad_norm": 0.4308861792087555, "learning_rate": 0.00014285714285714284, "loss": 0.6527, "step": 1560 }, { "epoch": 1.714678017300563, "grad_norm": 0.42890703678131104, "learning_rate": 0.00014273504273504273, "loss": 0.4846, "step": 1561 }, { "epoch": 1.7157764657421393, "grad_norm": 0.5222750902175903, "learning_rate": 0.0001426129426129426, "loss": 0.764, "step": 1562 }, { "epoch": 1.7168749141837156, "grad_norm": 0.49664998054504395, "learning_rate": 0.00014249084249084247, "loss": 0.5728, "step": 1563 }, { "epoch": 1.7179733626252918, "grad_norm": 0.3131520748138428, "learning_rate": 0.00014236874236874236, "loss": 0.5089, "step": 1564 }, { "epoch": 1.719071811066868, "grad_norm": 0.5098987221717834, "learning_rate": 0.00014224664224664224, "loss": 0.781, "step": 1565 }, { "epoch": 1.7201702595084445, "grad_norm": 0.4040893316268921, "learning_rate": 0.0001421245421245421, "loss": 0.7358, "step": 1566 }, { "epoch": 1.7212687079500206, "grad_norm": 0.3601396679878235, "learning_rate": 0.00014200244200244198, "loss": 0.5531, "step": 1567 }, { "epoch": 1.7223671563915968, "grad_norm": 0.6634377837181091, "learning_rate": 0.00014188034188034187, "loss": 0.6548, "step": 1568 }, { "epoch": 1.7234656048331731, "grad_norm": 0.35935553908348083, "learning_rate": 0.00014175824175824173, "loss": 0.5653, "step": 1569 }, { "epoch": 1.7245640532747495, "grad_norm": 0.4607802927494049, "learning_rate": 0.00014163614163614164, "loss": 0.9111, "step": 1570 }, { "epoch": 1.7256625017163256, "grad_norm": 1.0116467475891113, "learning_rate": 0.0001415140415140415, "loss": 0.9226, "step": 1571 }, { "epoch": 1.726760950157902, "grad_norm": 0.9484761953353882, "learning_rate": 0.00014139194139194139, "loss": 0.7536, "step": 1572 }, { "epoch": 1.7278593985994783, "grad_norm": 0.3684981167316437, "learning_rate": 0.00014126984126984124, "loss": 0.5013, "step": 1573 }, { "epoch": 1.7289578470410545, "grad_norm": 0.40037083625793457, "learning_rate": 0.00014114774114774113, "loss": 0.8069, "step": 1574 }, { "epoch": 1.7300562954826306, "grad_norm": 0.42828282713890076, "learning_rate": 0.00014102564102564101, "loss": 0.5586, "step": 1575 }, { "epoch": 1.7311547439242072, "grad_norm": 0.3461548686027527, "learning_rate": 0.00014090354090354087, "loss": 0.6045, "step": 1576 }, { "epoch": 1.7322531923657833, "grad_norm": 0.622982919216156, "learning_rate": 0.00014078144078144079, "loss": 0.8943, "step": 1577 }, { "epoch": 1.7333516408073595, "grad_norm": 0.3318479359149933, "learning_rate": 0.00014065934065934064, "loss": 0.4058, "step": 1578 }, { "epoch": 1.7344500892489358, "grad_norm": 0.5178685188293457, "learning_rate": 0.00014053724053724053, "loss": 0.5839, "step": 1579 }, { "epoch": 1.7355485376905122, "grad_norm": 0.44273868203163147, "learning_rate": 0.00014041514041514042, "loss": 0.5394, "step": 1580 }, { "epoch": 1.7366469861320883, "grad_norm": 0.60169517993927, "learning_rate": 0.00014029304029304027, "loss": 0.6753, "step": 1581 }, { "epoch": 1.7377454345736647, "grad_norm": 0.7691718339920044, "learning_rate": 0.00014017094017094016, "loss": 0.9618, "step": 1582 }, { "epoch": 1.738843883015241, "grad_norm": 0.3900390565395355, "learning_rate": 0.00014004884004884004, "loss": 0.5809, "step": 1583 }, { "epoch": 1.7399423314568172, "grad_norm": 0.6272429823875427, "learning_rate": 0.00013992673992673993, "loss": 0.8579, "step": 1584 }, { "epoch": 1.7410407798983936, "grad_norm": 0.30017220973968506, "learning_rate": 0.0001398046398046398, "loss": 0.5335, "step": 1585 }, { "epoch": 1.74213922833997, "grad_norm": 0.4937066435813904, "learning_rate": 0.00013968253968253967, "loss": 0.7941, "step": 1586 }, { "epoch": 1.743237676781546, "grad_norm": 0.47317594289779663, "learning_rate": 0.00013956043956043956, "loss": 0.6013, "step": 1587 }, { "epoch": 1.7443361252231222, "grad_norm": 1.9155733585357666, "learning_rate": 0.00013943833943833942, "loss": 0.6708, "step": 1588 }, { "epoch": 1.7454345736646986, "grad_norm": 0.3844835162162781, "learning_rate": 0.0001393162393162393, "loss": 0.7176, "step": 1589 }, { "epoch": 1.746533022106275, "grad_norm": 0.42810145020484924, "learning_rate": 0.0001391941391941392, "loss": 0.9255, "step": 1590 }, { "epoch": 1.747631470547851, "grad_norm": 3.846015691757202, "learning_rate": 0.00013907203907203905, "loss": 0.6202, "step": 1591 }, { "epoch": 1.7487299189894274, "grad_norm": 0.42783257365226746, "learning_rate": 0.00013894993894993893, "loss": 0.7451, "step": 1592 }, { "epoch": 1.7498283674310038, "grad_norm": 0.5237023234367371, "learning_rate": 0.00013882783882783882, "loss": 0.7961, "step": 1593 }, { "epoch": 1.75092681587258, "grad_norm": 2.5639729499816895, "learning_rate": 0.0001387057387057387, "loss": 0.7026, "step": 1594 }, { "epoch": 1.7520252643141563, "grad_norm": 0.5686498284339905, "learning_rate": 0.00013858363858363856, "loss": 0.4916, "step": 1595 }, { "epoch": 1.7531237127557326, "grad_norm": 0.561611533164978, "learning_rate": 0.00013846153846153845, "loss": 0.772, "step": 1596 }, { "epoch": 1.7542221611973088, "grad_norm": 0.6220077872276306, "learning_rate": 0.00013833943833943833, "loss": 0.5694, "step": 1597 }, { "epoch": 1.755320609638885, "grad_norm": 0.6902570724487305, "learning_rate": 0.0001382173382173382, "loss": 0.7963, "step": 1598 }, { "epoch": 1.7564190580804615, "grad_norm": 2.0417702198028564, "learning_rate": 0.00013809523809523808, "loss": 0.6721, "step": 1599 }, { "epoch": 1.7575175065220376, "grad_norm": 0.36764901876449585, "learning_rate": 0.00013797313797313796, "loss": 0.5714, "step": 1600 }, { "epoch": 1.7586159549636138, "grad_norm": 0.6679022908210754, "learning_rate": 0.00013785103785103785, "loss": 0.7025, "step": 1601 }, { "epoch": 1.7597144034051901, "grad_norm": 0.5749796628952026, "learning_rate": 0.0001377289377289377, "loss": 0.7381, "step": 1602 }, { "epoch": 1.7608128518467665, "grad_norm": 0.9285687208175659, "learning_rate": 0.0001376068376068376, "loss": 0.6, "step": 1603 }, { "epoch": 1.7619113002883426, "grad_norm": 0.8209772706031799, "learning_rate": 0.00013748473748473748, "loss": 0.5701, "step": 1604 }, { "epoch": 1.763009748729919, "grad_norm": 0.7823337912559509, "learning_rate": 0.00013736263736263734, "loss": 0.6695, "step": 1605 }, { "epoch": 1.7641081971714954, "grad_norm": 0.4885605275630951, "learning_rate": 0.00013724053724053725, "loss": 0.6487, "step": 1606 }, { "epoch": 1.7652066456130715, "grad_norm": 0.36517488956451416, "learning_rate": 0.0001371184371184371, "loss": 0.5798, "step": 1607 }, { "epoch": 1.7663050940546479, "grad_norm": 0.49961966276168823, "learning_rate": 0.000136996336996337, "loss": 0.4373, "step": 1608 }, { "epoch": 1.7674035424962242, "grad_norm": 0.495263010263443, "learning_rate": 0.00013687423687423688, "loss": 0.5868, "step": 1609 }, { "epoch": 1.7685019909378004, "grad_norm": 0.7384648323059082, "learning_rate": 0.00013675213675213674, "loss": 0.4957, "step": 1610 }, { "epoch": 1.7696004393793765, "grad_norm": 0.465440034866333, "learning_rate": 0.00013663003663003662, "loss": 0.7424, "step": 1611 }, { "epoch": 1.7706988878209529, "grad_norm": 0.68381667137146, "learning_rate": 0.00013650793650793648, "loss": 1.0421, "step": 1612 }, { "epoch": 1.7717973362625292, "grad_norm": 4.455906867980957, "learning_rate": 0.00013638583638583637, "loss": 0.6626, "step": 1613 }, { "epoch": 1.7728957847041054, "grad_norm": 0.6165801286697388, "learning_rate": 0.00013626373626373625, "loss": 0.6072, "step": 1614 }, { "epoch": 1.7739942331456817, "grad_norm": 0.8296604156494141, "learning_rate": 0.00013614163614163614, "loss": 0.6507, "step": 1615 }, { "epoch": 1.775092681587258, "grad_norm": 0.4678190350532532, "learning_rate": 0.00013601953601953602, "loss": 0.8466, "step": 1616 }, { "epoch": 1.7761911300288342, "grad_norm": 1.2141482830047607, "learning_rate": 0.00013589743589743588, "loss": 0.513, "step": 1617 }, { "epoch": 1.7772895784704106, "grad_norm": 0.4522024691104889, "learning_rate": 0.00013577533577533577, "loss": 0.7571, "step": 1618 }, { "epoch": 1.778388026911987, "grad_norm": 2.0903220176696777, "learning_rate": 0.00013565323565323565, "loss": 0.7359, "step": 1619 }, { "epoch": 1.779486475353563, "grad_norm": 0.5292307734489441, "learning_rate": 0.0001355311355311355, "loss": 0.6526, "step": 1620 }, { "epoch": 1.7805849237951392, "grad_norm": 0.5047786235809326, "learning_rate": 0.0001354090354090354, "loss": 0.7056, "step": 1621 }, { "epoch": 1.7816833722367158, "grad_norm": 0.4102507531642914, "learning_rate": 0.00013528693528693528, "loss": 0.8673, "step": 1622 }, { "epoch": 1.782781820678292, "grad_norm": 0.471556693315506, "learning_rate": 0.00013516483516483517, "loss": 0.9424, "step": 1623 }, { "epoch": 1.783880269119868, "grad_norm": 0.6595687866210938, "learning_rate": 0.00013504273504273502, "loss": 0.661, "step": 1624 }, { "epoch": 1.7849787175614444, "grad_norm": 0.6221860647201538, "learning_rate": 0.0001349206349206349, "loss": 0.5457, "step": 1625 }, { "epoch": 1.7860771660030208, "grad_norm": 0.9256211519241333, "learning_rate": 0.0001347985347985348, "loss": 0.9216, "step": 1626 }, { "epoch": 1.787175614444597, "grad_norm": 0.31376492977142334, "learning_rate": 0.00013467643467643465, "loss": 0.7071, "step": 1627 }, { "epoch": 1.7882740628861733, "grad_norm": 0.5313776135444641, "learning_rate": 0.00013455433455433454, "loss": 0.8111, "step": 1628 }, { "epoch": 1.7893725113277497, "grad_norm": 0.8203330636024475, "learning_rate": 0.00013443223443223442, "loss": 0.5301, "step": 1629 }, { "epoch": 1.7904709597693258, "grad_norm": 0.42774948477745056, "learning_rate": 0.0001343101343101343, "loss": 0.8359, "step": 1630 }, { "epoch": 1.791569408210902, "grad_norm": 0.8165685534477234, "learning_rate": 0.00013418803418803417, "loss": 0.4894, "step": 1631 }, { "epoch": 1.7926678566524785, "grad_norm": 0.5739139318466187, "learning_rate": 0.00013406593406593405, "loss": 0.7009, "step": 1632 }, { "epoch": 1.7937663050940547, "grad_norm": 0.5102986097335815, "learning_rate": 0.00013394383394383394, "loss": 0.7174, "step": 1633 }, { "epoch": 1.7948647535356308, "grad_norm": 1.1377652883529663, "learning_rate": 0.0001338217338217338, "loss": 0.79, "step": 1634 }, { "epoch": 1.7959632019772072, "grad_norm": 0.44272491335868835, "learning_rate": 0.00013369963369963368, "loss": 0.6761, "step": 1635 }, { "epoch": 1.7970616504187835, "grad_norm": 0.5084714889526367, "learning_rate": 0.00013357753357753357, "loss": 0.6848, "step": 1636 }, { "epoch": 1.7981600988603597, "grad_norm": 0.752017080783844, "learning_rate": 0.00013345543345543345, "loss": 0.6107, "step": 1637 }, { "epoch": 1.799258547301936, "grad_norm": 0.4430617690086365, "learning_rate": 0.0001333333333333333, "loss": 0.7639, "step": 1638 }, { "epoch": 1.8003569957435124, "grad_norm": 0.8098049759864807, "learning_rate": 0.0001332112332112332, "loss": 0.8172, "step": 1639 }, { "epoch": 1.8014554441850885, "grad_norm": 0.6817697286605835, "learning_rate": 0.00013308913308913308, "loss": 0.8274, "step": 1640 }, { "epoch": 1.8025538926266649, "grad_norm": 0.5132669806480408, "learning_rate": 0.00013296703296703294, "loss": 0.6269, "step": 1641 }, { "epoch": 1.8036523410682412, "grad_norm": 0.8487284183502197, "learning_rate": 0.00013284493284493283, "loss": 0.6734, "step": 1642 }, { "epoch": 1.8047507895098174, "grad_norm": 0.7084116339683533, "learning_rate": 0.0001327228327228327, "loss": 0.703, "step": 1643 }, { "epoch": 1.8058492379513935, "grad_norm": 0.39045432209968567, "learning_rate": 0.00013260073260073257, "loss": 0.5466, "step": 1644 }, { "epoch": 1.8069476863929699, "grad_norm": 0.4408475160598755, "learning_rate": 0.00013247863247863248, "loss": 0.4998, "step": 1645 }, { "epoch": 1.8080461348345462, "grad_norm": 0.41640380024909973, "learning_rate": 0.00013235653235653234, "loss": 0.49, "step": 1646 }, { "epoch": 1.8091445832761224, "grad_norm": 0.6760729551315308, "learning_rate": 0.00013223443223443223, "loss": 0.4537, "step": 1647 }, { "epoch": 1.8102430317176987, "grad_norm": 0.42953255772590637, "learning_rate": 0.0001321123321123321, "loss": 0.489, "step": 1648 }, { "epoch": 1.811341480159275, "grad_norm": 0.3260825574398041, "learning_rate": 0.00013199023199023197, "loss": 0.6633, "step": 1649 }, { "epoch": 1.8124399286008512, "grad_norm": 0.7073171138763428, "learning_rate": 0.00013186813186813186, "loss": 0.4953, "step": 1650 }, { "epoch": 1.8135383770424276, "grad_norm": 0.36153069138526917, "learning_rate": 0.00013174603174603172, "loss": 0.7641, "step": 1651 }, { "epoch": 1.814636825484004, "grad_norm": 0.4233636260032654, "learning_rate": 0.00013162393162393163, "loss": 0.7119, "step": 1652 }, { "epoch": 1.81573527392558, "grad_norm": 0.5262153148651123, "learning_rate": 0.0001315018315018315, "loss": 0.4516, "step": 1653 }, { "epoch": 1.8168337223671562, "grad_norm": 0.5263295769691467, "learning_rate": 0.00013137973137973137, "loss": 0.7786, "step": 1654 }, { "epoch": 1.8179321708087328, "grad_norm": 0.3681116998195648, "learning_rate": 0.00013125763125763126, "loss": 0.5295, "step": 1655 }, { "epoch": 1.819030619250309, "grad_norm": 0.5075433254241943, "learning_rate": 0.00013113553113553112, "loss": 0.6017, "step": 1656 }, { "epoch": 1.820129067691885, "grad_norm": 0.2960616946220398, "learning_rate": 0.000131013431013431, "loss": 0.4951, "step": 1657 }, { "epoch": 1.8212275161334615, "grad_norm": 0.4010205864906311, "learning_rate": 0.0001308913308913309, "loss": 0.8916, "step": 1658 }, { "epoch": 1.8223259645750378, "grad_norm": 0.9112391471862793, "learning_rate": 0.00013076923076923077, "loss": 0.4978, "step": 1659 }, { "epoch": 1.823424413016614, "grad_norm": 0.7214633226394653, "learning_rate": 0.00013064713064713063, "loss": 0.791, "step": 1660 }, { "epoch": 1.8245228614581903, "grad_norm": 0.4174933433532715, "learning_rate": 0.00013052503052503052, "loss": 0.4099, "step": 1661 }, { "epoch": 1.8256213098997667, "grad_norm": 0.4622137248516083, "learning_rate": 0.0001304029304029304, "loss": 1.1726, "step": 1662 }, { "epoch": 1.8267197583413428, "grad_norm": 0.5991957783699036, "learning_rate": 0.00013028083028083026, "loss": 0.6713, "step": 1663 }, { "epoch": 1.8278182067829192, "grad_norm": 0.43959730863571167, "learning_rate": 0.00013015873015873015, "loss": 0.5676, "step": 1664 }, { "epoch": 1.8289166552244955, "grad_norm": 0.6271671056747437, "learning_rate": 0.00013003663003663003, "loss": 0.7399, "step": 1665 }, { "epoch": 1.8300151036660717, "grad_norm": 0.6412084102630615, "learning_rate": 0.0001299145299145299, "loss": 0.7585, "step": 1666 }, { "epoch": 1.8311135521076478, "grad_norm": 0.4066605269908905, "learning_rate": 0.00012979242979242977, "loss": 0.5756, "step": 1667 }, { "epoch": 1.8322120005492242, "grad_norm": 0.3568172752857208, "learning_rate": 0.00012967032967032966, "loss": 0.968, "step": 1668 }, { "epoch": 1.8333104489908005, "grad_norm": 0.5061100721359253, "learning_rate": 0.00012954822954822955, "loss": 0.5089, "step": 1669 }, { "epoch": 1.8344088974323767, "grad_norm": 3.013622522354126, "learning_rate": 0.0001294261294261294, "loss": 0.5101, "step": 1670 }, { "epoch": 1.835507345873953, "grad_norm": 0.40078219771385193, "learning_rate": 0.0001293040293040293, "loss": 0.5602, "step": 1671 }, { "epoch": 1.8366057943155294, "grad_norm": 0.4108009338378906, "learning_rate": 0.00012918192918192918, "loss": 0.6338, "step": 1672 }, { "epoch": 1.8377042427571055, "grad_norm": 0.5452212691307068, "learning_rate": 0.00012905982905982903, "loss": 0.5358, "step": 1673 }, { "epoch": 1.838802691198682, "grad_norm": 0.4694603979587555, "learning_rate": 0.00012893772893772895, "loss": 0.7031, "step": 1674 }, { "epoch": 1.8399011396402583, "grad_norm": 0.3787671625614166, "learning_rate": 0.0001288156288156288, "loss": 0.5667, "step": 1675 }, { "epoch": 1.8409995880818344, "grad_norm": 0.4842737317085266, "learning_rate": 0.0001286935286935287, "loss": 0.5082, "step": 1676 }, { "epoch": 1.8420980365234105, "grad_norm": 0.7690992951393127, "learning_rate": 0.00012857142857142855, "loss": 0.706, "step": 1677 }, { "epoch": 1.8431964849649871, "grad_norm": 1.0891668796539307, "learning_rate": 0.00012844932844932843, "loss": 0.7162, "step": 1678 }, { "epoch": 1.8442949334065633, "grad_norm": 0.4118032157421112, "learning_rate": 0.00012832722832722832, "loss": 0.7019, "step": 1679 }, { "epoch": 1.8453933818481394, "grad_norm": 0.513157308101654, "learning_rate": 0.00012820512820512818, "loss": 0.4359, "step": 1680 }, { "epoch": 1.8464918302897158, "grad_norm": 1.3229504823684692, "learning_rate": 0.0001280830280830281, "loss": 0.5555, "step": 1681 }, { "epoch": 1.8475902787312921, "grad_norm": 0.6301699876785278, "learning_rate": 0.00012796092796092795, "loss": 0.5211, "step": 1682 }, { "epoch": 1.8486887271728683, "grad_norm": 0.6125632524490356, "learning_rate": 0.00012783882783882783, "loss": 0.6287, "step": 1683 }, { "epoch": 1.8497871756144446, "grad_norm": 1.806593418121338, "learning_rate": 0.00012771672771672772, "loss": 0.5794, "step": 1684 }, { "epoch": 1.850885624056021, "grad_norm": 1.2972358465194702, "learning_rate": 0.00012759462759462758, "loss": 0.9205, "step": 1685 }, { "epoch": 1.8519840724975971, "grad_norm": 1.0519033670425415, "learning_rate": 0.00012747252747252746, "loss": 0.7103, "step": 1686 }, { "epoch": 1.8530825209391735, "grad_norm": 1.6489734649658203, "learning_rate": 0.00012735042735042735, "loss": 0.7585, "step": 1687 }, { "epoch": 1.8541809693807498, "grad_norm": 0.7229527235031128, "learning_rate": 0.0001272283272283272, "loss": 0.8109, "step": 1688 }, { "epoch": 1.855279417822326, "grad_norm": 0.35257261991500854, "learning_rate": 0.0001271062271062271, "loss": 0.8014, "step": 1689 }, { "epoch": 1.856377866263902, "grad_norm": 0.4653327167034149, "learning_rate": 0.00012698412698412698, "loss": 0.6404, "step": 1690 }, { "epoch": 1.8574763147054785, "grad_norm": 0.5230842232704163, "learning_rate": 0.00012686202686202686, "loss": 0.7413, "step": 1691 }, { "epoch": 1.8585747631470548, "grad_norm": 0.42130210995674133, "learning_rate": 0.00012673992673992672, "loss": 0.7283, "step": 1692 }, { "epoch": 1.859673211588631, "grad_norm": 1.4667960405349731, "learning_rate": 0.0001266178266178266, "loss": 0.5656, "step": 1693 }, { "epoch": 1.8607716600302073, "grad_norm": 0.4077359139919281, "learning_rate": 0.0001264957264957265, "loss": 0.5891, "step": 1694 }, { "epoch": 1.8618701084717837, "grad_norm": 0.503654956817627, "learning_rate": 0.00012637362637362635, "loss": 0.5912, "step": 1695 }, { "epoch": 1.8629685569133598, "grad_norm": 1.6315315961837769, "learning_rate": 0.00012625152625152624, "loss": 0.5588, "step": 1696 }, { "epoch": 1.8640670053549362, "grad_norm": 0.783920407295227, "learning_rate": 0.00012612942612942612, "loss": 0.6585, "step": 1697 }, { "epoch": 1.8651654537965126, "grad_norm": 0.7186728715896606, "learning_rate": 0.000126007326007326, "loss": 0.9174, "step": 1698 }, { "epoch": 1.8662639022380887, "grad_norm": 0.8784156441688538, "learning_rate": 0.00012588522588522587, "loss": 0.5835, "step": 1699 }, { "epoch": 1.8673623506796648, "grad_norm": 0.7090787887573242, "learning_rate": 0.00012576312576312575, "loss": 0.7555, "step": 1700 }, { "epoch": 1.8684607991212414, "grad_norm": 0.5508129596710205, "learning_rate": 0.00012564102564102564, "loss": 0.6168, "step": 1701 }, { "epoch": 1.8695592475628175, "grad_norm": 0.40403681993484497, "learning_rate": 0.0001255189255189255, "loss": 0.4528, "step": 1702 }, { "epoch": 1.8706576960043937, "grad_norm": 0.9553635716438293, "learning_rate": 0.00012539682539682538, "loss": 0.654, "step": 1703 }, { "epoch": 1.87175614444597, "grad_norm": 1.0610092878341675, "learning_rate": 0.00012527472527472527, "loss": 0.6115, "step": 1704 }, { "epoch": 1.8728545928875464, "grad_norm": 0.32898634672164917, "learning_rate": 0.00012515262515262515, "loss": 0.5651, "step": 1705 }, { "epoch": 1.8739530413291225, "grad_norm": 0.4018780589103699, "learning_rate": 0.000125030525030525, "loss": 0.5919, "step": 1706 }, { "epoch": 1.875051489770699, "grad_norm": 1.6521873474121094, "learning_rate": 0.0001249084249084249, "loss": 0.7137, "step": 1707 }, { "epoch": 1.8761499382122753, "grad_norm": 0.5515930652618408, "learning_rate": 0.00012478632478632478, "loss": 0.4471, "step": 1708 }, { "epoch": 1.8772483866538514, "grad_norm": 0.4156915545463562, "learning_rate": 0.00012466422466422464, "loss": 0.6575, "step": 1709 }, { "epoch": 1.8783468350954275, "grad_norm": 0.41263312101364136, "learning_rate": 0.00012454212454212453, "loss": 0.542, "step": 1710 }, { "epoch": 1.8794452835370041, "grad_norm": 1.0169517993927002, "learning_rate": 0.0001244200244200244, "loss": 1.1631, "step": 1711 }, { "epoch": 1.8805437319785803, "grad_norm": 0.49169981479644775, "learning_rate": 0.0001242979242979243, "loss": 0.6707, "step": 1712 }, { "epoch": 1.8816421804201564, "grad_norm": 0.44801297783851624, "learning_rate": 0.00012417582417582416, "loss": 1.0036, "step": 1713 }, { "epoch": 1.8827406288617328, "grad_norm": 0.47181040048599243, "learning_rate": 0.00012405372405372404, "loss": 0.6693, "step": 1714 }, { "epoch": 1.8838390773033091, "grad_norm": 0.39900457859039307, "learning_rate": 0.00012393162393162393, "loss": 0.6421, "step": 1715 }, { "epoch": 1.8849375257448853, "grad_norm": 1.1160179376602173, "learning_rate": 0.00012380952380952378, "loss": 0.6599, "step": 1716 }, { "epoch": 1.8860359741864616, "grad_norm": 0.6951555609703064, "learning_rate": 0.00012368742368742367, "loss": 0.743, "step": 1717 }, { "epoch": 1.887134422628038, "grad_norm": 0.5381472706794739, "learning_rate": 0.00012356532356532356, "loss": 0.5051, "step": 1718 }, { "epoch": 1.8882328710696141, "grad_norm": 0.48717793822288513, "learning_rate": 0.00012344322344322341, "loss": 0.7015, "step": 1719 }, { "epoch": 1.8893313195111905, "grad_norm": 0.3720596432685852, "learning_rate": 0.00012332112332112333, "loss": 0.6743, "step": 1720 }, { "epoch": 1.8904297679527668, "grad_norm": 1.1850451231002808, "learning_rate": 0.00012319902319902318, "loss": 0.6132, "step": 1721 }, { "epoch": 1.891528216394343, "grad_norm": 0.4546525180339813, "learning_rate": 0.00012307692307692307, "loss": 0.5465, "step": 1722 }, { "epoch": 1.8926266648359191, "grad_norm": 0.41415080428123474, "learning_rate": 0.00012295482295482296, "loss": 0.7259, "step": 1723 }, { "epoch": 1.8937251132774955, "grad_norm": 0.44278842210769653, "learning_rate": 0.00012283272283272281, "loss": 0.7244, "step": 1724 }, { "epoch": 1.8948235617190718, "grad_norm": 0.3887364864349365, "learning_rate": 0.0001227106227106227, "loss": 0.7124, "step": 1725 }, { "epoch": 1.895922010160648, "grad_norm": 0.5405781269073486, "learning_rate": 0.00012258852258852256, "loss": 0.5153, "step": 1726 }, { "epoch": 1.8970204586022243, "grad_norm": 0.3530559837818146, "learning_rate": 0.00012246642246642247, "loss": 0.5429, "step": 1727 }, { "epoch": 1.8981189070438007, "grad_norm": 0.523621678352356, "learning_rate": 0.00012234432234432233, "loss": 0.5645, "step": 1728 }, { "epoch": 1.8992173554853768, "grad_norm": 0.3893704116344452, "learning_rate": 0.00012222222222222221, "loss": 0.6419, "step": 1729 }, { "epoch": 1.9003158039269532, "grad_norm": 0.7010704278945923, "learning_rate": 0.0001221001221001221, "loss": 0.5202, "step": 1730 }, { "epoch": 1.9014142523685296, "grad_norm": 0.45551490783691406, "learning_rate": 0.00012197802197802197, "loss": 0.8492, "step": 1731 }, { "epoch": 1.9025127008101057, "grad_norm": 1.0112484693527222, "learning_rate": 0.00012185592185592184, "loss": 0.8602, "step": 1732 }, { "epoch": 1.9036111492516818, "grad_norm": 0.4509601294994354, "learning_rate": 0.00012173382173382173, "loss": 0.6138, "step": 1733 }, { "epoch": 1.9047095976932584, "grad_norm": 0.4303388297557831, "learning_rate": 0.0001216117216117216, "loss": 0.4748, "step": 1734 }, { "epoch": 1.9058080461348346, "grad_norm": 0.4452000558376312, "learning_rate": 0.00012148962148962147, "loss": 0.5869, "step": 1735 }, { "epoch": 1.9069064945764107, "grad_norm": 0.5915077924728394, "learning_rate": 0.00012136752136752136, "loss": 0.8057, "step": 1736 }, { "epoch": 1.908004943017987, "grad_norm": 0.38761547207832336, "learning_rate": 0.00012124542124542123, "loss": 0.5772, "step": 1737 }, { "epoch": 1.9091033914595634, "grad_norm": 0.517752468585968, "learning_rate": 0.00012112332112332112, "loss": 0.7865, "step": 1738 }, { "epoch": 1.9102018399011396, "grad_norm": 0.5325546860694885, "learning_rate": 0.00012100122100122099, "loss": 0.5934, "step": 1739 }, { "epoch": 1.911300288342716, "grad_norm": 0.3930620551109314, "learning_rate": 0.00012087912087912087, "loss": 0.5974, "step": 1740 }, { "epoch": 1.9123987367842923, "grad_norm": 1.1001818180084229, "learning_rate": 0.00012075702075702075, "loss": 0.6524, "step": 1741 }, { "epoch": 1.9134971852258684, "grad_norm": 0.3690165877342224, "learning_rate": 0.00012063492063492062, "loss": 0.36, "step": 1742 }, { "epoch": 1.9145956336674448, "grad_norm": 0.4403206408023834, "learning_rate": 0.0001205128205128205, "loss": 0.5737, "step": 1743 }, { "epoch": 1.9156940821090211, "grad_norm": 0.651498019695282, "learning_rate": 0.00012039072039072037, "loss": 0.657, "step": 1744 }, { "epoch": 1.9167925305505973, "grad_norm": 0.6880660057067871, "learning_rate": 0.00012026862026862025, "loss": 0.6891, "step": 1745 }, { "epoch": 1.9178909789921734, "grad_norm": 0.4968664348125458, "learning_rate": 0.00012014652014652015, "loss": 0.841, "step": 1746 }, { "epoch": 1.9189894274337498, "grad_norm": 0.4392407536506653, "learning_rate": 0.00012002442002442002, "loss": 0.7096, "step": 1747 }, { "epoch": 1.9200878758753261, "grad_norm": 0.41028741002082825, "learning_rate": 0.00011990231990231989, "loss": 0.5838, "step": 1748 }, { "epoch": 1.9211863243169023, "grad_norm": 0.7928158640861511, "learning_rate": 0.00011978021978021978, "loss": 0.6633, "step": 1749 }, { "epoch": 1.9222847727584786, "grad_norm": 0.4970681071281433, "learning_rate": 0.00011965811965811965, "loss": 0.7764, "step": 1750 }, { "epoch": 1.923383221200055, "grad_norm": 0.49581378698349, "learning_rate": 0.00011953601953601952, "loss": 0.7204, "step": 1751 }, { "epoch": 1.9244816696416311, "grad_norm": 1.309241771697998, "learning_rate": 0.00011941391941391939, "loss": 0.5859, "step": 1752 }, { "epoch": 1.9255801180832075, "grad_norm": 0.4651016592979431, "learning_rate": 0.00011929181929181929, "loss": 0.6425, "step": 1753 }, { "epoch": 1.9266785665247839, "grad_norm": 0.5377634167671204, "learning_rate": 0.00011916971916971916, "loss": 0.8244, "step": 1754 }, { "epoch": 1.92777701496636, "grad_norm": 0.6809287667274475, "learning_rate": 0.00011904761904761903, "loss": 0.5711, "step": 1755 }, { "epoch": 1.9288754634079361, "grad_norm": 0.650701105594635, "learning_rate": 0.00011892551892551892, "loss": 0.8341, "step": 1756 }, { "epoch": 1.9299739118495127, "grad_norm": 1.1710751056671143, "learning_rate": 0.00011880341880341879, "loss": 0.8093, "step": 1757 }, { "epoch": 1.9310723602910889, "grad_norm": 0.4244484603404999, "learning_rate": 0.00011868131868131866, "loss": 0.5556, "step": 1758 }, { "epoch": 1.932170808732665, "grad_norm": 0.43999040126800537, "learning_rate": 0.00011855921855921855, "loss": 0.4582, "step": 1759 }, { "epoch": 1.9332692571742414, "grad_norm": 0.4197145700454712, "learning_rate": 0.00011843711843711843, "loss": 0.6475, "step": 1760 }, { "epoch": 1.9343677056158177, "grad_norm": 0.36619749665260315, "learning_rate": 0.0001183150183150183, "loss": 0.5804, "step": 1761 }, { "epoch": 1.9354661540573939, "grad_norm": 1.7230706214904785, "learning_rate": 0.00011819291819291819, "loss": 0.7064, "step": 1762 }, { "epoch": 1.9365646024989702, "grad_norm": 0.7621874213218689, "learning_rate": 0.00011807081807081806, "loss": 0.6766, "step": 1763 }, { "epoch": 1.9376630509405466, "grad_norm": 0.5920525789260864, "learning_rate": 0.00011794871794871794, "loss": 0.7092, "step": 1764 }, { "epoch": 1.9387614993821227, "grad_norm": 1.5368432998657227, "learning_rate": 0.00011782661782661781, "loss": 0.3366, "step": 1765 }, { "epoch": 1.9398599478236989, "grad_norm": 0.43197643756866455, "learning_rate": 0.00011770451770451769, "loss": 0.6158, "step": 1766 }, { "epoch": 1.9409583962652754, "grad_norm": 0.4623143970966339, "learning_rate": 0.00011758241758241756, "loss": 0.6574, "step": 1767 }, { "epoch": 1.9420568447068516, "grad_norm": 0.40638601779937744, "learning_rate": 0.00011746031746031744, "loss": 0.4385, "step": 1768 }, { "epoch": 1.9431552931484277, "grad_norm": 0.5941652655601501, "learning_rate": 0.00011733821733821734, "loss": 0.8634, "step": 1769 }, { "epoch": 1.944253741590004, "grad_norm": 0.9646288156509399, "learning_rate": 0.00011721611721611721, "loss": 0.7107, "step": 1770 }, { "epoch": 1.9453521900315804, "grad_norm": 1.6859776973724365, "learning_rate": 0.00011709401709401708, "loss": 0.5544, "step": 1771 }, { "epoch": 1.9464506384731566, "grad_norm": 0.4034999907016754, "learning_rate": 0.00011697191697191697, "loss": 0.559, "step": 1772 }, { "epoch": 1.947549086914733, "grad_norm": 0.3644643723964691, "learning_rate": 0.00011684981684981684, "loss": 0.535, "step": 1773 }, { "epoch": 1.9486475353563093, "grad_norm": 0.5826202034950256, "learning_rate": 0.00011672771672771671, "loss": 0.6405, "step": 1774 }, { "epoch": 1.9497459837978854, "grad_norm": 0.5501505136489868, "learning_rate": 0.00011660561660561661, "loss": 0.5702, "step": 1775 }, { "epoch": 1.9508444322394618, "grad_norm": 0.7928853631019592, "learning_rate": 0.00011648351648351648, "loss": 0.666, "step": 1776 }, { "epoch": 1.9519428806810382, "grad_norm": 0.8168489933013916, "learning_rate": 0.00011636141636141635, "loss": 0.4451, "step": 1777 }, { "epoch": 1.9530413291226143, "grad_norm": 0.3752410113811493, "learning_rate": 0.00011623931623931622, "loss": 0.6552, "step": 1778 }, { "epoch": 1.9541397775641904, "grad_norm": 0.9020218849182129, "learning_rate": 0.00011611721611721611, "loss": 0.5994, "step": 1779 }, { "epoch": 1.9552382260057668, "grad_norm": 0.7668479084968567, "learning_rate": 0.00011599511599511598, "loss": 0.5007, "step": 1780 }, { "epoch": 1.9563366744473432, "grad_norm": 0.5034022331237793, "learning_rate": 0.00011587301587301585, "loss": 0.5211, "step": 1781 }, { "epoch": 1.9574351228889193, "grad_norm": 1.0153850317001343, "learning_rate": 0.00011575091575091575, "loss": 0.5953, "step": 1782 }, { "epoch": 1.9585335713304957, "grad_norm": 0.40088045597076416, "learning_rate": 0.00011562881562881562, "loss": 0.568, "step": 1783 }, { "epoch": 1.959632019772072, "grad_norm": 1.4017099142074585, "learning_rate": 0.0001155067155067155, "loss": 0.7058, "step": 1784 }, { "epoch": 1.9607304682136482, "grad_norm": 0.6009597778320312, "learning_rate": 0.00011538461538461538, "loss": 0.6239, "step": 1785 }, { "epoch": 1.9618289166552245, "grad_norm": 0.5155071020126343, "learning_rate": 0.00011526251526251525, "loss": 0.6089, "step": 1786 }, { "epoch": 1.9629273650968009, "grad_norm": 0.4248057007789612, "learning_rate": 0.00011514041514041513, "loss": 0.6481, "step": 1787 }, { "epoch": 1.964025813538377, "grad_norm": 0.6521177887916565, "learning_rate": 0.00011501831501831501, "loss": 0.6598, "step": 1788 }, { "epoch": 1.9651242619799532, "grad_norm": 0.44697993993759155, "learning_rate": 0.00011489621489621488, "loss": 0.8944, "step": 1789 }, { "epoch": 1.9662227104215297, "grad_norm": 0.41537097096443176, "learning_rate": 0.00011477411477411476, "loss": 0.5304, "step": 1790 }, { "epoch": 1.9673211588631059, "grad_norm": 0.48793885111808777, "learning_rate": 0.00011465201465201464, "loss": 0.7262, "step": 1791 }, { "epoch": 1.968419607304682, "grad_norm": 0.8768893480300903, "learning_rate": 0.00011452991452991453, "loss": 0.6748, "step": 1792 }, { "epoch": 1.9695180557462584, "grad_norm": 0.39224761724472046, "learning_rate": 0.0001144078144078144, "loss": 0.5503, "step": 1793 }, { "epoch": 1.9706165041878347, "grad_norm": 0.5617446899414062, "learning_rate": 0.00011428571428571427, "loss": 0.7329, "step": 1794 }, { "epoch": 1.9717149526294109, "grad_norm": 0.3787171542644501, "learning_rate": 0.00011416361416361416, "loss": 0.545, "step": 1795 }, { "epoch": 1.9728134010709872, "grad_norm": 1.5167701244354248, "learning_rate": 0.00011404151404151403, "loss": 0.492, "step": 1796 }, { "epoch": 1.9739118495125636, "grad_norm": 0.6436883807182312, "learning_rate": 0.0001139194139194139, "loss": 0.5644, "step": 1797 }, { "epoch": 1.9750102979541397, "grad_norm": 0.7104658484458923, "learning_rate": 0.0001137973137973138, "loss": 0.7485, "step": 1798 }, { "epoch": 1.976108746395716, "grad_norm": 0.7996894717216492, "learning_rate": 0.00011367521367521367, "loss": 0.6918, "step": 1799 }, { "epoch": 1.9772071948372925, "grad_norm": 0.6419106721878052, "learning_rate": 0.00011355311355311354, "loss": 0.5945, "step": 1800 }, { "epoch": 1.9783056432788686, "grad_norm": 0.5158131718635559, "learning_rate": 0.00011343101343101343, "loss": 0.6685, "step": 1801 }, { "epoch": 1.9794040917204447, "grad_norm": 1.0825144052505493, "learning_rate": 0.0001133089133089133, "loss": 0.6774, "step": 1802 }, { "epoch": 1.980502540162021, "grad_norm": 0.3999088704586029, "learning_rate": 0.00011318681318681317, "loss": 0.632, "step": 1803 }, { "epoch": 1.9816009886035975, "grad_norm": 0.8866069316864014, "learning_rate": 0.00011306471306471304, "loss": 0.6541, "step": 1804 }, { "epoch": 1.9826994370451736, "grad_norm": 0.3858928978443146, "learning_rate": 0.00011294261294261294, "loss": 0.6608, "step": 1805 }, { "epoch": 1.98379788548675, "grad_norm": 0.513117790222168, "learning_rate": 0.00011282051282051281, "loss": 0.7598, "step": 1806 }, { "epoch": 1.9848963339283263, "grad_norm": 0.3166581392288208, "learning_rate": 0.00011269841269841269, "loss": 0.781, "step": 1807 }, { "epoch": 1.9859947823699025, "grad_norm": 0.3982362151145935, "learning_rate": 0.00011257631257631257, "loss": 0.873, "step": 1808 }, { "epoch": 1.9870932308114788, "grad_norm": 0.3784008026123047, "learning_rate": 0.00011245421245421244, "loss": 0.7286, "step": 1809 }, { "epoch": 1.9881916792530552, "grad_norm": 0.7578315138816833, "learning_rate": 0.00011233211233211232, "loss": 0.5958, "step": 1810 }, { "epoch": 1.9892901276946313, "grad_norm": 0.8509061932563782, "learning_rate": 0.0001122100122100122, "loss": 0.557, "step": 1811 }, { "epoch": 1.9903885761362075, "grad_norm": 0.5107323527336121, "learning_rate": 0.00011208791208791207, "loss": 0.6994, "step": 1812 }, { "epoch": 1.991487024577784, "grad_norm": 0.5421388149261475, "learning_rate": 0.00011196581196581196, "loss": 0.8839, "step": 1813 }, { "epoch": 1.9925854730193602, "grad_norm": 0.7442356944084167, "learning_rate": 0.00011184371184371184, "loss": 0.6676, "step": 1814 }, { "epoch": 1.9936839214609363, "grad_norm": 0.34132111072540283, "learning_rate": 0.00011172161172161172, "loss": 0.5714, "step": 1815 }, { "epoch": 1.9947823699025127, "grad_norm": 0.3995620906352997, "learning_rate": 0.00011159951159951159, "loss": 0.4811, "step": 1816 }, { "epoch": 1.995880818344089, "grad_norm": 0.5613861083984375, "learning_rate": 0.00011147741147741146, "loss": 0.7495, "step": 1817 }, { "epoch": 1.9969792667856652, "grad_norm": 0.4366309642791748, "learning_rate": 0.00011135531135531135, "loss": 0.6512, "step": 1818 }, { "epoch": 1.9980777152272415, "grad_norm": 0.889916718006134, "learning_rate": 0.00011123321123321122, "loss": 0.5544, "step": 1819 }, { "epoch": 1.999176163668818, "grad_norm": 0.512112021446228, "learning_rate": 0.00011111111111111109, "loss": 1.136, "step": 1820 }, { "epoch": 2.000274612110394, "grad_norm": 0.5241844654083252, "learning_rate": 0.00011098901098901099, "loss": 0.5898, "step": 1821 }, { "epoch": 2.00137306055197, "grad_norm": 0.38159477710723877, "learning_rate": 0.00011086691086691086, "loss": 0.5523, "step": 1822 }, { "epoch": 2.0024715089935468, "grad_norm": 1.0415009260177612, "learning_rate": 0.00011074481074481073, "loss": 0.6963, "step": 1823 }, { "epoch": 2.003569957435123, "grad_norm": 0.5349957942962646, "learning_rate": 0.00011062271062271062, "loss": 0.4422, "step": 1824 }, { "epoch": 2.004668405876699, "grad_norm": 0.4512043297290802, "learning_rate": 0.00011050061050061049, "loss": 0.5467, "step": 1825 }, { "epoch": 2.0057668543182756, "grad_norm": 0.8268045783042908, "learning_rate": 0.00011037851037851036, "loss": 0.6931, "step": 1826 }, { "epoch": 2.0068653027598518, "grad_norm": 0.47922319173812866, "learning_rate": 0.00011025641025641026, "loss": 0.707, "step": 1827 }, { "epoch": 2.007963751201428, "grad_norm": 1.352858304977417, "learning_rate": 0.00011013431013431013, "loss": 0.5658, "step": 1828 }, { "epoch": 2.0090621996430045, "grad_norm": 0.6304643154144287, "learning_rate": 0.00011001221001221, "loss": 0.6526, "step": 1829 }, { "epoch": 2.0101606480845806, "grad_norm": 0.3759060502052307, "learning_rate": 0.00010989010989010988, "loss": 0.627, "step": 1830 }, { "epoch": 2.0112590965261568, "grad_norm": 0.5676531195640564, "learning_rate": 0.00010976800976800976, "loss": 0.7568, "step": 1831 }, { "epoch": 2.012357544967733, "grad_norm": 0.7481321692466736, "learning_rate": 0.00010964590964590963, "loss": 0.7304, "step": 1832 }, { "epoch": 2.0134559934093095, "grad_norm": 1.0350905656814575, "learning_rate": 0.0001095238095238095, "loss": 0.7414, "step": 1833 }, { "epoch": 2.0145544418508856, "grad_norm": 0.7817292809486389, "learning_rate": 0.00010940170940170939, "loss": 0.7742, "step": 1834 }, { "epoch": 2.0156528902924618, "grad_norm": 0.44659602642059326, "learning_rate": 0.00010927960927960928, "loss": 0.7872, "step": 1835 }, { "epoch": 2.0167513387340383, "grad_norm": 0.46931198239326477, "learning_rate": 0.00010915750915750915, "loss": 0.5596, "step": 1836 }, { "epoch": 2.0178497871756145, "grad_norm": 0.34634560346603394, "learning_rate": 0.00010903540903540903, "loss": 0.6861, "step": 1837 }, { "epoch": 2.0189482356171906, "grad_norm": 0.36579200625419617, "learning_rate": 0.0001089133089133089, "loss": 0.6586, "step": 1838 }, { "epoch": 2.020046684058767, "grad_norm": 0.9167144894599915, "learning_rate": 0.00010879120879120878, "loss": 0.7125, "step": 1839 }, { "epoch": 2.0211451325003433, "grad_norm": 0.4107789993286133, "learning_rate": 0.00010866910866910866, "loss": 0.6089, "step": 1840 }, { "epoch": 2.0222435809419195, "grad_norm": 1.0845204591751099, "learning_rate": 0.00010854700854700854, "loss": 0.499, "step": 1841 }, { "epoch": 2.0233420293834956, "grad_norm": 0.382376492023468, "learning_rate": 0.00010842490842490841, "loss": 0.5505, "step": 1842 }, { "epoch": 2.024440477825072, "grad_norm": 0.38339781761169434, "learning_rate": 0.00010830280830280828, "loss": 0.4593, "step": 1843 }, { "epoch": 2.0255389262666483, "grad_norm": 0.45328769087791443, "learning_rate": 0.00010818070818070818, "loss": 0.8437, "step": 1844 }, { "epoch": 2.0266373747082245, "grad_norm": 0.3051920533180237, "learning_rate": 0.00010805860805860805, "loss": 0.6096, "step": 1845 }, { "epoch": 2.027735823149801, "grad_norm": 0.4249560236930847, "learning_rate": 0.00010793650793650792, "loss": 0.6441, "step": 1846 }, { "epoch": 2.028834271591377, "grad_norm": 0.6639708280563354, "learning_rate": 0.00010781440781440781, "loss": 0.716, "step": 1847 }, { "epoch": 2.0299327200329533, "grad_norm": 0.4324635863304138, "learning_rate": 0.00010769230769230768, "loss": 0.5288, "step": 1848 }, { "epoch": 2.03103116847453, "grad_norm": 0.46487629413604736, "learning_rate": 0.00010757020757020755, "loss": 0.4908, "step": 1849 }, { "epoch": 2.032129616916106, "grad_norm": 0.5104641318321228, "learning_rate": 0.00010744810744810745, "loss": 0.6367, "step": 1850 }, { "epoch": 2.033228065357682, "grad_norm": 0.4010922312736511, "learning_rate": 0.00010732600732600732, "loss": 0.4266, "step": 1851 }, { "epoch": 2.0343265137992583, "grad_norm": 0.6835510730743408, "learning_rate": 0.0001072039072039072, "loss": 1.0077, "step": 1852 }, { "epoch": 2.035424962240835, "grad_norm": 0.7012602686882019, "learning_rate": 0.00010708180708180708, "loss": 0.7656, "step": 1853 }, { "epoch": 2.036523410682411, "grad_norm": 0.8202001452445984, "learning_rate": 0.00010695970695970695, "loss": 0.9796, "step": 1854 }, { "epoch": 2.037621859123987, "grad_norm": 0.37708353996276855, "learning_rate": 0.00010683760683760682, "loss": 0.3664, "step": 1855 }, { "epoch": 2.0387203075655638, "grad_norm": 0.34818801283836365, "learning_rate": 0.0001067155067155067, "loss": 0.5365, "step": 1856 }, { "epoch": 2.03981875600714, "grad_norm": 0.46427440643310547, "learning_rate": 0.0001065934065934066, "loss": 0.7503, "step": 1857 }, { "epoch": 2.040917204448716, "grad_norm": 0.4782754182815552, "learning_rate": 0.00010647130647130647, "loss": 0.9247, "step": 1858 }, { "epoch": 2.0420156528902926, "grad_norm": 0.6814667582511902, "learning_rate": 0.00010634920634920634, "loss": 0.5365, "step": 1859 }, { "epoch": 2.0431141013318688, "grad_norm": 0.4782056510448456, "learning_rate": 0.00010622710622710622, "loss": 0.7444, "step": 1860 }, { "epoch": 2.044212549773445, "grad_norm": 0.768439769744873, "learning_rate": 0.0001061050061050061, "loss": 0.6386, "step": 1861 }, { "epoch": 2.0453109982150215, "grad_norm": 0.9991740584373474, "learning_rate": 0.00010598290598290597, "loss": 0.4762, "step": 1862 }, { "epoch": 2.0464094466565976, "grad_norm": 0.4244922995567322, "learning_rate": 0.00010586080586080585, "loss": 0.4469, "step": 1863 }, { "epoch": 2.0475078950981738, "grad_norm": 0.4085465371608734, "learning_rate": 0.00010573870573870573, "loss": 0.7215, "step": 1864 }, { "epoch": 2.04860634353975, "grad_norm": 1.3068008422851562, "learning_rate": 0.0001056166056166056, "loss": 0.7781, "step": 1865 }, { "epoch": 2.0497047919813265, "grad_norm": 0.3995974659919739, "learning_rate": 0.0001054945054945055, "loss": 0.6114, "step": 1866 }, { "epoch": 2.0508032404229026, "grad_norm": 0.47944560647010803, "learning_rate": 0.00010537240537240537, "loss": 0.7355, "step": 1867 }, { "epoch": 2.0519016888644788, "grad_norm": 1.6718720197677612, "learning_rate": 0.00010525030525030524, "loss": 0.5987, "step": 1868 }, { "epoch": 2.0530001373060554, "grad_norm": 0.46015220880508423, "learning_rate": 0.00010512820512820511, "loss": 0.481, "step": 1869 }, { "epoch": 2.0540985857476315, "grad_norm": 0.4863795042037964, "learning_rate": 0.000105006105006105, "loss": 0.5877, "step": 1870 }, { "epoch": 2.0551970341892076, "grad_norm": 0.9190402030944824, "learning_rate": 0.00010488400488400487, "loss": 0.7941, "step": 1871 }, { "epoch": 2.056295482630784, "grad_norm": 0.6056554317474365, "learning_rate": 0.00010476190476190474, "loss": 0.5455, "step": 1872 }, { "epoch": 2.0573939310723603, "grad_norm": 0.7070736289024353, "learning_rate": 0.00010463980463980464, "loss": 0.6112, "step": 1873 }, { "epoch": 2.0584923795139365, "grad_norm": 0.5415268540382385, "learning_rate": 0.00010451770451770451, "loss": 0.7141, "step": 1874 }, { "epoch": 2.0595908279555126, "grad_norm": 0.45696091651916504, "learning_rate": 0.00010439560439560438, "loss": 0.7825, "step": 1875 }, { "epoch": 2.060689276397089, "grad_norm": 0.5728979706764221, "learning_rate": 0.00010427350427350427, "loss": 0.5869, "step": 1876 }, { "epoch": 2.0617877248386653, "grad_norm": 0.5910143852233887, "learning_rate": 0.00010415140415140414, "loss": 0.728, "step": 1877 }, { "epoch": 2.0628861732802415, "grad_norm": 0.530915379524231, "learning_rate": 0.00010402930402930401, "loss": 0.6459, "step": 1878 }, { "epoch": 2.063984621721818, "grad_norm": 0.36358964443206787, "learning_rate": 0.00010390720390720391, "loss": 0.7536, "step": 1879 }, { "epoch": 2.065083070163394, "grad_norm": 2.7523410320281982, "learning_rate": 0.00010378510378510379, "loss": 0.6347, "step": 1880 }, { "epoch": 2.0661815186049703, "grad_norm": 0.6842527389526367, "learning_rate": 0.00010366300366300366, "loss": 0.4943, "step": 1881 }, { "epoch": 2.067279967046547, "grad_norm": 0.5830293297767639, "learning_rate": 0.00010354090354090353, "loss": 0.5855, "step": 1882 }, { "epoch": 2.068378415488123, "grad_norm": 0.981920599937439, "learning_rate": 0.00010341880341880341, "loss": 0.4425, "step": 1883 }, { "epoch": 2.069476863929699, "grad_norm": 2.0826029777526855, "learning_rate": 0.00010329670329670329, "loss": 0.5399, "step": 1884 }, { "epoch": 2.0705753123712753, "grad_norm": 0.4648442268371582, "learning_rate": 0.00010317460317460316, "loss": 0.6203, "step": 1885 }, { "epoch": 2.071673760812852, "grad_norm": 0.5086346864700317, "learning_rate": 0.00010305250305250304, "loss": 0.6091, "step": 1886 }, { "epoch": 2.072772209254428, "grad_norm": 0.40404266119003296, "learning_rate": 0.00010293040293040292, "loss": 0.5013, "step": 1887 }, { "epoch": 2.073870657696004, "grad_norm": 2.0507569313049316, "learning_rate": 0.0001028083028083028, "loss": 0.7822, "step": 1888 }, { "epoch": 2.074969106137581, "grad_norm": 0.9318211078643799, "learning_rate": 0.00010268620268620269, "loss": 0.6638, "step": 1889 }, { "epoch": 2.076067554579157, "grad_norm": 0.7601054310798645, "learning_rate": 0.00010256410256410256, "loss": 0.6085, "step": 1890 }, { "epoch": 2.077166003020733, "grad_norm": 1.1299306154251099, "learning_rate": 0.00010244200244200243, "loss": 0.682, "step": 1891 }, { "epoch": 2.0782644514623096, "grad_norm": 0.5009475350379944, "learning_rate": 0.0001023199023199023, "loss": 0.7229, "step": 1892 }, { "epoch": 2.079362899903886, "grad_norm": 0.3432561159133911, "learning_rate": 0.00010219780219780219, "loss": 0.5991, "step": 1893 }, { "epoch": 2.080461348345462, "grad_norm": 0.5224031805992126, "learning_rate": 0.00010207570207570206, "loss": 0.3687, "step": 1894 }, { "epoch": 2.0815597967870385, "grad_norm": 0.4849548935890198, "learning_rate": 0.00010195360195360193, "loss": 0.507, "step": 1895 }, { "epoch": 2.0826582452286146, "grad_norm": 0.6093185544013977, "learning_rate": 0.00010183150183150183, "loss": 0.7019, "step": 1896 }, { "epoch": 2.083756693670191, "grad_norm": 0.7408457398414612, "learning_rate": 0.0001017094017094017, "loss": 0.6331, "step": 1897 }, { "epoch": 2.084855142111767, "grad_norm": 0.67701655626297, "learning_rate": 0.00010158730158730157, "loss": 0.6685, "step": 1898 }, { "epoch": 2.0859535905533435, "grad_norm": 0.2880030870437622, "learning_rate": 0.00010146520146520146, "loss": 0.4043, "step": 1899 }, { "epoch": 2.0870520389949196, "grad_norm": 0.45890796184539795, "learning_rate": 0.00010134310134310133, "loss": 0.3695, "step": 1900 }, { "epoch": 2.088150487436496, "grad_norm": 0.7898344397544861, "learning_rate": 0.0001012210012210012, "loss": 0.7875, "step": 1901 }, { "epoch": 2.0892489358780724, "grad_norm": 0.5648753046989441, "learning_rate": 0.0001010989010989011, "loss": 0.6058, "step": 1902 }, { "epoch": 2.0903473843196485, "grad_norm": 0.7880465984344482, "learning_rate": 0.00010097680097680098, "loss": 0.6403, "step": 1903 }, { "epoch": 2.0914458327612246, "grad_norm": 0.4169737696647644, "learning_rate": 0.00010085470085470085, "loss": 0.71, "step": 1904 }, { "epoch": 2.0925442812028012, "grad_norm": 0.33653560280799866, "learning_rate": 0.00010073260073260072, "loss": 0.6278, "step": 1905 }, { "epoch": 2.0936427296443774, "grad_norm": 0.6861558556556702, "learning_rate": 0.0001006105006105006, "loss": 0.8463, "step": 1906 }, { "epoch": 2.0947411780859535, "grad_norm": 0.29407018423080444, "learning_rate": 0.00010048840048840048, "loss": 0.5644, "step": 1907 }, { "epoch": 2.09583962652753, "grad_norm": 0.673083484172821, "learning_rate": 0.00010036630036630035, "loss": 0.8353, "step": 1908 }, { "epoch": 2.0969380749691062, "grad_norm": 0.429061621427536, "learning_rate": 0.00010024420024420023, "loss": 0.6381, "step": 1909 }, { "epoch": 2.0980365234106824, "grad_norm": 0.5113368630409241, "learning_rate": 0.00010012210012210012, "loss": 0.7603, "step": 1910 }, { "epoch": 2.0991349718522585, "grad_norm": 0.9005820751190186, "learning_rate": 9.999999999999999e-05, "loss": 0.6331, "step": 1911 }, { "epoch": 2.100233420293835, "grad_norm": 0.489851176738739, "learning_rate": 9.987789987789988e-05, "loss": 0.8564, "step": 1912 }, { "epoch": 2.1013318687354112, "grad_norm": 0.42647236585617065, "learning_rate": 9.975579975579975e-05, "loss": 0.5496, "step": 1913 }, { "epoch": 2.1024303171769874, "grad_norm": 0.9061693549156189, "learning_rate": 9.963369963369962e-05, "loss": 0.4478, "step": 1914 }, { "epoch": 2.103528765618564, "grad_norm": 0.4721933901309967, "learning_rate": 9.95115995115995e-05, "loss": 0.6066, "step": 1915 }, { "epoch": 2.10462721406014, "grad_norm": 0.7265921831130981, "learning_rate": 9.938949938949938e-05, "loss": 0.7195, "step": 1916 }, { "epoch": 2.1057256625017162, "grad_norm": 0.4521386921405792, "learning_rate": 9.926739926739925e-05, "loss": 0.6476, "step": 1917 }, { "epoch": 2.106824110943293, "grad_norm": 0.42982912063598633, "learning_rate": 9.914529914529912e-05, "loss": 0.535, "step": 1918 }, { "epoch": 2.107922559384869, "grad_norm": 0.4758259952068329, "learning_rate": 9.902319902319902e-05, "loss": 0.8106, "step": 1919 }, { "epoch": 2.109021007826445, "grad_norm": 0.69195157289505, "learning_rate": 9.890109890109889e-05, "loss": 0.6643, "step": 1920 }, { "epoch": 2.110119456268021, "grad_norm": 0.8207395672798157, "learning_rate": 9.877899877899876e-05, "loss": 0.7535, "step": 1921 }, { "epoch": 2.111217904709598, "grad_norm": 1.4245035648345947, "learning_rate": 9.865689865689865e-05, "loss": 0.6721, "step": 1922 }, { "epoch": 2.112316353151174, "grad_norm": 0.5496362447738647, "learning_rate": 9.853479853479852e-05, "loss": 0.5367, "step": 1923 }, { "epoch": 2.11341480159275, "grad_norm": 0.5466665625572205, "learning_rate": 9.84126984126984e-05, "loss": 0.6083, "step": 1924 }, { "epoch": 2.1145132500343267, "grad_norm": 0.7750464677810669, "learning_rate": 9.829059829059829e-05, "loss": 0.663, "step": 1925 }, { "epoch": 2.115611698475903, "grad_norm": 0.4978208541870117, "learning_rate": 9.816849816849817e-05, "loss": 0.6334, "step": 1926 }, { "epoch": 2.116710146917479, "grad_norm": 0.6415550708770752, "learning_rate": 9.804639804639804e-05, "loss": 0.6477, "step": 1927 }, { "epoch": 2.1178085953590555, "grad_norm": 0.644123911857605, "learning_rate": 9.792429792429792e-05, "loss": 0.668, "step": 1928 }, { "epoch": 2.1189070438006317, "grad_norm": 0.39706236124038696, "learning_rate": 9.78021978021978e-05, "loss": 0.5875, "step": 1929 }, { "epoch": 2.120005492242208, "grad_norm": 1.3733233213424683, "learning_rate": 9.768009768009767e-05, "loss": 0.6023, "step": 1930 }, { "epoch": 2.121103940683784, "grad_norm": 0.48839983344078064, "learning_rate": 9.755799755799754e-05, "loss": 0.5693, "step": 1931 }, { "epoch": 2.1222023891253605, "grad_norm": 0.3107692301273346, "learning_rate": 9.743589743589744e-05, "loss": 0.5822, "step": 1932 }, { "epoch": 2.1233008375669367, "grad_norm": 0.3988654911518097, "learning_rate": 9.731379731379731e-05, "loss": 0.5989, "step": 1933 }, { "epoch": 2.124399286008513, "grad_norm": 1.1887754201889038, "learning_rate": 9.719169719169718e-05, "loss": 0.6382, "step": 1934 }, { "epoch": 2.1254977344500894, "grad_norm": 0.43282651901245117, "learning_rate": 9.706959706959707e-05, "loss": 0.5649, "step": 1935 }, { "epoch": 2.1265961828916655, "grad_norm": 0.39243975281715393, "learning_rate": 9.694749694749694e-05, "loss": 0.7005, "step": 1936 }, { "epoch": 2.1276946313332417, "grad_norm": 0.7401454448699951, "learning_rate": 9.682539682539681e-05, "loss": 1.0632, "step": 1937 }, { "epoch": 2.1287930797748182, "grad_norm": 0.6976983547210693, "learning_rate": 9.67032967032967e-05, "loss": 0.562, "step": 1938 }, { "epoch": 2.1298915282163944, "grad_norm": 0.9784336686134338, "learning_rate": 9.658119658119657e-05, "loss": 0.8115, "step": 1939 }, { "epoch": 2.1309899766579705, "grad_norm": 0.5289125442504883, "learning_rate": 9.645909645909644e-05, "loss": 0.6161, "step": 1940 }, { "epoch": 2.132088425099547, "grad_norm": 1.414559006690979, "learning_rate": 9.633699633699634e-05, "loss": 0.7115, "step": 1941 }, { "epoch": 2.1331868735411232, "grad_norm": 0.5444177389144897, "learning_rate": 9.621489621489621e-05, "loss": 0.6211, "step": 1942 }, { "epoch": 2.1342853219826994, "grad_norm": 0.637030839920044, "learning_rate": 9.609279609279608e-05, "loss": 0.8747, "step": 1943 }, { "epoch": 2.1353837704242755, "grad_norm": 0.5926198363304138, "learning_rate": 9.597069597069595e-05, "loss": 0.8673, "step": 1944 }, { "epoch": 2.136482218865852, "grad_norm": 0.3638801872730255, "learning_rate": 9.584859584859584e-05, "loss": 0.4698, "step": 1945 }, { "epoch": 2.1375806673074282, "grad_norm": 0.5823031067848206, "learning_rate": 9.572649572649571e-05, "loss": 0.6988, "step": 1946 }, { "epoch": 2.1386791157490044, "grad_norm": 0.44348934292793274, "learning_rate": 9.560439560439558e-05, "loss": 0.6667, "step": 1947 }, { "epoch": 2.139777564190581, "grad_norm": 3.177112579345703, "learning_rate": 9.548229548229548e-05, "loss": 0.8738, "step": 1948 }, { "epoch": 2.140876012632157, "grad_norm": 1.3834997415542603, "learning_rate": 9.536019536019536e-05, "loss": 0.528, "step": 1949 }, { "epoch": 2.1419744610737332, "grad_norm": 0.5514722466468811, "learning_rate": 9.523809523809523e-05, "loss": 0.5058, "step": 1950 }, { "epoch": 2.14307290951531, "grad_norm": 0.8795000314712524, "learning_rate": 9.511599511599511e-05, "loss": 0.6368, "step": 1951 }, { "epoch": 2.144171357956886, "grad_norm": 1.0043178796768188, "learning_rate": 9.499389499389498e-05, "loss": 0.5701, "step": 1952 }, { "epoch": 2.145269806398462, "grad_norm": 1.8537780046463013, "learning_rate": 9.487179487179486e-05, "loss": 0.6978, "step": 1953 }, { "epoch": 2.1463682548400387, "grad_norm": 0.5239475965499878, "learning_rate": 9.474969474969476e-05, "loss": 0.7093, "step": 1954 }, { "epoch": 2.147466703281615, "grad_norm": 0.7944377064704895, "learning_rate": 9.462759462759463e-05, "loss": 0.7625, "step": 1955 }, { "epoch": 2.148565151723191, "grad_norm": 0.7356003522872925, "learning_rate": 9.45054945054945e-05, "loss": 0.6845, "step": 1956 }, { "epoch": 2.149663600164767, "grad_norm": 1.3590694665908813, "learning_rate": 9.438339438339437e-05, "loss": 0.6964, "step": 1957 }, { "epoch": 2.1507620486063437, "grad_norm": 0.40889453887939453, "learning_rate": 9.426129426129426e-05, "loss": 0.6643, "step": 1958 }, { "epoch": 2.15186049704792, "grad_norm": 0.6347643136978149, "learning_rate": 9.413919413919413e-05, "loss": 1.0002, "step": 1959 }, { "epoch": 2.152958945489496, "grad_norm": 0.3661377429962158, "learning_rate": 9.4017094017094e-05, "loss": 0.5084, "step": 1960 }, { "epoch": 2.1540573939310725, "grad_norm": 0.8262574672698975, "learning_rate": 9.389499389499389e-05, "loss": 0.5658, "step": 1961 }, { "epoch": 2.1551558423726487, "grad_norm": 0.6054818034172058, "learning_rate": 9.377289377289376e-05, "loss": 0.6349, "step": 1962 }, { "epoch": 2.156254290814225, "grad_norm": 0.3696078658103943, "learning_rate": 9.365079365079364e-05, "loss": 0.5746, "step": 1963 }, { "epoch": 2.157352739255801, "grad_norm": 0.7613049745559692, "learning_rate": 9.352869352869353e-05, "loss": 0.5204, "step": 1964 }, { "epoch": 2.1584511876973775, "grad_norm": 0.6841816306114197, "learning_rate": 9.34065934065934e-05, "loss": 0.813, "step": 1965 }, { "epoch": 2.1595496361389537, "grad_norm": 0.902998685836792, "learning_rate": 9.328449328449327e-05, "loss": 0.6288, "step": 1966 }, { "epoch": 2.16064808458053, "grad_norm": 0.5367470979690552, "learning_rate": 9.316239316239316e-05, "loss": 0.6689, "step": 1967 }, { "epoch": 2.1617465330221064, "grad_norm": 0.9443572163581848, "learning_rate": 9.304029304029303e-05, "loss": 0.6864, "step": 1968 }, { "epoch": 2.1628449814636825, "grad_norm": 0.42191457748413086, "learning_rate": 9.29181929181929e-05, "loss": 0.6509, "step": 1969 }, { "epoch": 2.1639434299052587, "grad_norm": 0.6019404530525208, "learning_rate": 9.279609279609277e-05, "loss": 0.5252, "step": 1970 }, { "epoch": 2.1650418783468353, "grad_norm": 1.9933907985687256, "learning_rate": 9.267399267399267e-05, "loss": 0.6042, "step": 1971 }, { "epoch": 2.1661403267884114, "grad_norm": 0.33075836300849915, "learning_rate": 9.255189255189255e-05, "loss": 0.579, "step": 1972 }, { "epoch": 2.1672387752299875, "grad_norm": 0.37899547815322876, "learning_rate": 9.242979242979242e-05, "loss": 0.5006, "step": 1973 }, { "epoch": 2.168337223671564, "grad_norm": 0.6482734680175781, "learning_rate": 9.23076923076923e-05, "loss": 0.4844, "step": 1974 }, { "epoch": 2.1694356721131403, "grad_norm": 0.47632062435150146, "learning_rate": 9.218559218559217e-05, "loss": 0.5844, "step": 1975 }, { "epoch": 2.1705341205547164, "grad_norm": 0.3402813971042633, "learning_rate": 9.206349206349205e-05, "loss": 0.6397, "step": 1976 }, { "epoch": 2.1716325689962925, "grad_norm": 0.47405871748924255, "learning_rate": 9.194139194139195e-05, "loss": 0.6436, "step": 1977 }, { "epoch": 2.172731017437869, "grad_norm": 0.5474234223365784, "learning_rate": 9.181929181929182e-05, "loss": 0.5758, "step": 1978 }, { "epoch": 2.1738294658794453, "grad_norm": 0.5423378348350525, "learning_rate": 9.169719169719169e-05, "loss": 0.5882, "step": 1979 }, { "epoch": 2.1749279143210214, "grad_norm": 0.32848963141441345, "learning_rate": 9.157509157509158e-05, "loss": 0.5828, "step": 1980 }, { "epoch": 2.176026362762598, "grad_norm": 0.6646802425384521, "learning_rate": 9.145299145299145e-05, "loss": 0.551, "step": 1981 }, { "epoch": 2.177124811204174, "grad_norm": 0.4560980200767517, "learning_rate": 9.133089133089132e-05, "loss": 0.705, "step": 1982 }, { "epoch": 2.1782232596457503, "grad_norm": 0.4531053304672241, "learning_rate": 9.120879120879119e-05, "loss": 0.7471, "step": 1983 }, { "epoch": 2.179321708087327, "grad_norm": 0.5881507992744446, "learning_rate": 9.108669108669108e-05, "loss": 0.7559, "step": 1984 }, { "epoch": 2.180420156528903, "grad_norm": 0.41462886333465576, "learning_rate": 9.096459096459096e-05, "loss": 0.5674, "step": 1985 }, { "epoch": 2.181518604970479, "grad_norm": 0.46718108654022217, "learning_rate": 9.084249084249083e-05, "loss": 0.7149, "step": 1986 }, { "epoch": 2.1826170534120557, "grad_norm": 0.49290111660957336, "learning_rate": 9.072039072039072e-05, "loss": 0.5641, "step": 1987 }, { "epoch": 2.183715501853632, "grad_norm": 0.398296594619751, "learning_rate": 9.059829059829059e-05, "loss": 0.5177, "step": 1988 }, { "epoch": 2.184813950295208, "grad_norm": 0.8241115212440491, "learning_rate": 9.047619047619046e-05, "loss": 0.7864, "step": 1989 }, { "epoch": 2.185912398736784, "grad_norm": 1.1335865259170532, "learning_rate": 9.035409035409035e-05, "loss": 0.6167, "step": 1990 }, { "epoch": 2.1870108471783607, "grad_norm": 0.4479789435863495, "learning_rate": 9.023199023199022e-05, "loss": 0.6365, "step": 1991 }, { "epoch": 2.188109295619937, "grad_norm": 0.4892582297325134, "learning_rate": 9.010989010989009e-05, "loss": 0.6283, "step": 1992 }, { "epoch": 2.189207744061513, "grad_norm": 0.8397974371910095, "learning_rate": 8.998778998778999e-05, "loss": 0.7123, "step": 1993 }, { "epoch": 2.1903061925030896, "grad_norm": 0.5295377969741821, "learning_rate": 8.986568986568986e-05, "loss": 0.4033, "step": 1994 }, { "epoch": 2.1914046409446657, "grad_norm": 0.464832067489624, "learning_rate": 8.974358974358974e-05, "loss": 0.8228, "step": 1995 }, { "epoch": 2.192503089386242, "grad_norm": 0.381369024515152, "learning_rate": 8.962148962148961e-05, "loss": 0.6267, "step": 1996 }, { "epoch": 2.193601537827818, "grad_norm": 0.7176710963249207, "learning_rate": 8.949938949938949e-05, "loss": 0.7008, "step": 1997 }, { "epoch": 2.1946999862693946, "grad_norm": 2.569753885269165, "learning_rate": 8.937728937728936e-05, "loss": 0.6899, "step": 1998 }, { "epoch": 2.1957984347109707, "grad_norm": 0.5020056962966919, "learning_rate": 8.925518925518924e-05, "loss": 0.527, "step": 1999 }, { "epoch": 2.196896883152547, "grad_norm": 1.7054524421691895, "learning_rate": 8.913308913308914e-05, "loss": 0.5455, "step": 2000 }, { "epoch": 2.1979953315941234, "grad_norm": 0.5037225484848022, "learning_rate": 8.901098901098901e-05, "loss": 0.7445, "step": 2001 }, { "epoch": 2.1990937800356996, "grad_norm": 0.8109555840492249, "learning_rate": 8.888888888888888e-05, "loss": 0.624, "step": 2002 }, { "epoch": 2.2001922284772757, "grad_norm": 0.47120043635368347, "learning_rate": 8.876678876678877e-05, "loss": 0.6858, "step": 2003 }, { "epoch": 2.2012906769188523, "grad_norm": 0.6166191101074219, "learning_rate": 8.864468864468864e-05, "loss": 0.4528, "step": 2004 }, { "epoch": 2.2023891253604284, "grad_norm": 0.4999128580093384, "learning_rate": 8.852258852258851e-05, "loss": 0.712, "step": 2005 }, { "epoch": 2.2034875738020046, "grad_norm": 1.1858354806900024, "learning_rate": 8.84004884004884e-05, "loss": 0.7647, "step": 2006 }, { "epoch": 2.204586022243581, "grad_norm": 0.4223528206348419, "learning_rate": 8.827838827838828e-05, "loss": 0.6553, "step": 2007 }, { "epoch": 2.2056844706851573, "grad_norm": 0.41678956151008606, "learning_rate": 8.815628815628815e-05, "loss": 0.6033, "step": 2008 }, { "epoch": 2.2067829191267334, "grad_norm": 0.5812666416168213, "learning_rate": 8.803418803418802e-05, "loss": 0.6016, "step": 2009 }, { "epoch": 2.2078813675683095, "grad_norm": 0.5553560256958008, "learning_rate": 8.791208791208791e-05, "loss": 0.7621, "step": 2010 }, { "epoch": 2.208979816009886, "grad_norm": 0.6392796635627747, "learning_rate": 8.778998778998778e-05, "loss": 0.567, "step": 2011 }, { "epoch": 2.2100782644514623, "grad_norm": 1.0086902379989624, "learning_rate": 8.766788766788765e-05, "loss": 0.9432, "step": 2012 }, { "epoch": 2.2111767128930384, "grad_norm": 1.3578602075576782, "learning_rate": 8.754578754578754e-05, "loss": 0.5107, "step": 2013 }, { "epoch": 2.212275161334615, "grad_norm": 0.5530524849891663, "learning_rate": 8.742368742368741e-05, "loss": 0.6078, "step": 2014 }, { "epoch": 2.213373609776191, "grad_norm": 0.3795104920864105, "learning_rate": 8.730158730158728e-05, "loss": 0.4889, "step": 2015 }, { "epoch": 2.2144720582177673, "grad_norm": 0.40977227687835693, "learning_rate": 8.717948717948718e-05, "loss": 0.6295, "step": 2016 }, { "epoch": 2.215570506659344, "grad_norm": 0.4882934093475342, "learning_rate": 8.705738705738705e-05, "loss": 0.7219, "step": 2017 }, { "epoch": 2.21666895510092, "grad_norm": 0.7966530919075012, "learning_rate": 8.693528693528693e-05, "loss": 0.5342, "step": 2018 }, { "epoch": 2.217767403542496, "grad_norm": 0.6992311477661133, "learning_rate": 8.681318681318681e-05, "loss": 0.5932, "step": 2019 }, { "epoch": 2.2188658519840727, "grad_norm": 0.396427720785141, "learning_rate": 8.669108669108668e-05, "loss": 0.5838, "step": 2020 }, { "epoch": 2.219964300425649, "grad_norm": 0.5625690817832947, "learning_rate": 8.656898656898655e-05, "loss": 0.7605, "step": 2021 }, { "epoch": 2.221062748867225, "grad_norm": 0.6052583456039429, "learning_rate": 8.644688644688643e-05, "loss": 0.6572, "step": 2022 }, { "epoch": 2.222161197308801, "grad_norm": 0.7201973795890808, "learning_rate": 8.632478632478633e-05, "loss": 0.4924, "step": 2023 }, { "epoch": 2.2232596457503777, "grad_norm": 0.4222647249698639, "learning_rate": 8.62026862026862e-05, "loss": 0.7764, "step": 2024 }, { "epoch": 2.224358094191954, "grad_norm": 0.5168121457099915, "learning_rate": 8.608058608058607e-05, "loss": 0.5766, "step": 2025 }, { "epoch": 2.22545654263353, "grad_norm": 0.886203408241272, "learning_rate": 8.595848595848596e-05, "loss": 0.3804, "step": 2026 }, { "epoch": 2.2265549910751066, "grad_norm": 1.7365875244140625, "learning_rate": 8.583638583638583e-05, "loss": 0.6583, "step": 2027 }, { "epoch": 2.2276534395166827, "grad_norm": 0.44519639015197754, "learning_rate": 8.57142857142857e-05, "loss": 0.7322, "step": 2028 }, { "epoch": 2.228751887958259, "grad_norm": 0.4888206422328949, "learning_rate": 8.55921855921856e-05, "loss": 0.6645, "step": 2029 }, { "epoch": 2.2298503363998354, "grad_norm": 0.598225474357605, "learning_rate": 8.547008547008547e-05, "loss": 0.7903, "step": 2030 }, { "epoch": 2.2309487848414116, "grad_norm": 0.8521910905838013, "learning_rate": 8.534798534798534e-05, "loss": 0.8573, "step": 2031 }, { "epoch": 2.2320472332829877, "grad_norm": 1.6346311569213867, "learning_rate": 8.522588522588523e-05, "loss": 0.5653, "step": 2032 }, { "epoch": 2.233145681724564, "grad_norm": 0.6574315428733826, "learning_rate": 8.51037851037851e-05, "loss": 0.5289, "step": 2033 }, { "epoch": 2.2342441301661404, "grad_norm": 0.3821216821670532, "learning_rate": 8.498168498168497e-05, "loss": 0.4627, "step": 2034 }, { "epoch": 2.2353425786077166, "grad_norm": 0.28965023159980774, "learning_rate": 8.485958485958484e-05, "loss": 0.3696, "step": 2035 }, { "epoch": 2.2364410270492927, "grad_norm": 0.8256242275238037, "learning_rate": 8.473748473748473e-05, "loss": 0.6305, "step": 2036 }, { "epoch": 2.2375394754908693, "grad_norm": 0.8374451398849487, "learning_rate": 8.46153846153846e-05, "loss": 0.5038, "step": 2037 }, { "epoch": 2.2386379239324454, "grad_norm": 0.5931464433670044, "learning_rate": 8.449328449328449e-05, "loss": 0.6928, "step": 2038 }, { "epoch": 2.2397363723740216, "grad_norm": 0.5120035409927368, "learning_rate": 8.437118437118437e-05, "loss": 0.6004, "step": 2039 }, { "epoch": 2.240834820815598, "grad_norm": 0.6345282196998596, "learning_rate": 8.424908424908424e-05, "loss": 0.866, "step": 2040 }, { "epoch": 2.2419332692571743, "grad_norm": 0.5632284283638, "learning_rate": 8.412698412698412e-05, "loss": 0.406, "step": 2041 }, { "epoch": 2.2430317176987504, "grad_norm": 0.4784685969352722, "learning_rate": 8.4004884004884e-05, "loss": 0.4732, "step": 2042 }, { "epoch": 2.2441301661403266, "grad_norm": 0.47678086161613464, "learning_rate": 8.388278388278387e-05, "loss": 0.502, "step": 2043 }, { "epoch": 2.245228614581903, "grad_norm": 0.6543307304382324, "learning_rate": 8.376068376068374e-05, "loss": 0.7183, "step": 2044 }, { "epoch": 2.2463270630234793, "grad_norm": 0.6147063374519348, "learning_rate": 8.363858363858364e-05, "loss": 0.618, "step": 2045 }, { "epoch": 2.2474255114650554, "grad_norm": 0.5867168307304382, "learning_rate": 8.351648351648352e-05, "loss": 0.7749, "step": 2046 }, { "epoch": 2.248523959906632, "grad_norm": 1.164838433265686, "learning_rate": 8.339438339438339e-05, "loss": 0.6261, "step": 2047 }, { "epoch": 2.249622408348208, "grad_norm": 0.6695102453231812, "learning_rate": 8.327228327228326e-05, "loss": 0.6172, "step": 2048 }, { "epoch": 2.2507208567897843, "grad_norm": 0.43873751163482666, "learning_rate": 8.315018315018315e-05, "loss": 0.7032, "step": 2049 }, { "epoch": 2.251819305231361, "grad_norm": 0.439897745847702, "learning_rate": 8.302808302808302e-05, "loss": 0.7744, "step": 2050 }, { "epoch": 2.252917753672937, "grad_norm": 0.6671053767204285, "learning_rate": 8.290598290598289e-05, "loss": 0.6877, "step": 2051 }, { "epoch": 2.254016202114513, "grad_norm": 0.37354105710983276, "learning_rate": 8.278388278388279e-05, "loss": 0.5653, "step": 2052 }, { "epoch": 2.2551146505560897, "grad_norm": 0.5615684390068054, "learning_rate": 8.266178266178266e-05, "loss": 0.5961, "step": 2053 }, { "epoch": 2.256213098997666, "grad_norm": 2.0932323932647705, "learning_rate": 8.253968253968253e-05, "loss": 0.6139, "step": 2054 }, { "epoch": 2.257311547439242, "grad_norm": 0.5486952066421509, "learning_rate": 8.241758241758242e-05, "loss": 0.7816, "step": 2055 }, { "epoch": 2.258409995880818, "grad_norm": 0.7377699017524719, "learning_rate": 8.229548229548229e-05, "loss": 0.5036, "step": 2056 }, { "epoch": 2.2595084443223947, "grad_norm": 0.7057545781135559, "learning_rate": 8.217338217338216e-05, "loss": 0.5788, "step": 2057 }, { "epoch": 2.260606892763971, "grad_norm": 0.5388674736022949, "learning_rate": 8.205128205128205e-05, "loss": 0.7079, "step": 2058 }, { "epoch": 2.261705341205547, "grad_norm": 0.620943546295166, "learning_rate": 8.192918192918192e-05, "loss": 0.6223, "step": 2059 }, { "epoch": 2.2628037896471236, "grad_norm": 0.6159489154815674, "learning_rate": 8.18070818070818e-05, "loss": 0.7277, "step": 2060 }, { "epoch": 2.2639022380886997, "grad_norm": 0.5745131373405457, "learning_rate": 8.168498168498168e-05, "loss": 0.6356, "step": 2061 }, { "epoch": 2.265000686530276, "grad_norm": 0.4925720989704132, "learning_rate": 8.156288156288156e-05, "loss": 0.6342, "step": 2062 }, { "epoch": 2.2660991349718524, "grad_norm": 0.410692036151886, "learning_rate": 8.144078144078143e-05, "loss": 0.5903, "step": 2063 }, { "epoch": 2.2671975834134286, "grad_norm": 0.8246005177497864, "learning_rate": 8.13186813186813e-05, "loss": 0.4048, "step": 2064 }, { "epoch": 2.2682960318550047, "grad_norm": 0.5054492950439453, "learning_rate": 8.119658119658119e-05, "loss": 0.5797, "step": 2065 }, { "epoch": 2.2693944802965813, "grad_norm": 0.6249692440032959, "learning_rate": 8.107448107448106e-05, "loss": 0.5434, "step": 2066 }, { "epoch": 2.2704929287381574, "grad_norm": 0.5582659244537354, "learning_rate": 8.095238095238093e-05, "loss": 0.5925, "step": 2067 }, { "epoch": 2.2715913771797336, "grad_norm": 0.38472238183021545, "learning_rate": 8.083028083028083e-05, "loss": 0.7325, "step": 2068 }, { "epoch": 2.2726898256213097, "grad_norm": 0.4649077355861664, "learning_rate": 8.07081807081807e-05, "loss": 0.6244, "step": 2069 }, { "epoch": 2.2737882740628863, "grad_norm": 0.38582849502563477, "learning_rate": 8.058608058608058e-05, "loss": 0.7696, "step": 2070 }, { "epoch": 2.2748867225044624, "grad_norm": 0.4612105190753937, "learning_rate": 8.046398046398045e-05, "loss": 0.6453, "step": 2071 }, { "epoch": 2.2759851709460386, "grad_norm": 0.6572852730751038, "learning_rate": 8.034188034188034e-05, "loss": 0.7417, "step": 2072 }, { "epoch": 2.277083619387615, "grad_norm": 0.6322109699249268, "learning_rate": 8.021978021978021e-05, "loss": 0.2827, "step": 2073 }, { "epoch": 2.2781820678291913, "grad_norm": 1.2452771663665771, "learning_rate": 8.009768009768008e-05, "loss": 0.7441, "step": 2074 }, { "epoch": 2.2792805162707674, "grad_norm": 0.32154834270477295, "learning_rate": 7.997557997557998e-05, "loss": 0.4606, "step": 2075 }, { "epoch": 2.2803789647123436, "grad_norm": 1.0170034170150757, "learning_rate": 7.985347985347985e-05, "loss": 0.7003, "step": 2076 }, { "epoch": 2.28147741315392, "grad_norm": 0.7780435085296631, "learning_rate": 7.973137973137972e-05, "loss": 0.5847, "step": 2077 }, { "epoch": 2.2825758615954963, "grad_norm": 0.6422854661941528, "learning_rate": 7.960927960927961e-05, "loss": 0.6278, "step": 2078 }, { "epoch": 2.2836743100370724, "grad_norm": 0.5440393090248108, "learning_rate": 7.948717948717948e-05, "loss": 0.6313, "step": 2079 }, { "epoch": 2.284772758478649, "grad_norm": 0.5774940848350525, "learning_rate": 7.936507936507935e-05, "loss": 0.7504, "step": 2080 }, { "epoch": 2.285871206920225, "grad_norm": 0.44180789589881897, "learning_rate": 7.924297924297924e-05, "loss": 0.5806, "step": 2081 }, { "epoch": 2.2869696553618013, "grad_norm": 0.8452728390693665, "learning_rate": 7.912087912087912e-05, "loss": 0.5753, "step": 2082 }, { "epoch": 2.288068103803378, "grad_norm": 0.40172943472862244, "learning_rate": 7.8998778998779e-05, "loss": 0.5565, "step": 2083 }, { "epoch": 2.289166552244954, "grad_norm": 0.3919180929660797, "learning_rate": 7.887667887667887e-05, "loss": 0.4951, "step": 2084 }, { "epoch": 2.29026500068653, "grad_norm": 1.0796260833740234, "learning_rate": 7.875457875457875e-05, "loss": 0.733, "step": 2085 }, { "epoch": 2.2913634491281067, "grad_norm": 0.5640047788619995, "learning_rate": 7.863247863247862e-05, "loss": 0.4625, "step": 2086 }, { "epoch": 2.292461897569683, "grad_norm": 0.8736083507537842, "learning_rate": 7.85103785103785e-05, "loss": 0.5532, "step": 2087 }, { "epoch": 2.293560346011259, "grad_norm": 0.5358221530914307, "learning_rate": 7.838827838827838e-05, "loss": 0.6397, "step": 2088 }, { "epoch": 2.294658794452835, "grad_norm": 5.207391262054443, "learning_rate": 7.826617826617825e-05, "loss": 0.6402, "step": 2089 }, { "epoch": 2.2957572428944117, "grad_norm": 0.4122523069381714, "learning_rate": 7.814407814407813e-05, "loss": 0.474, "step": 2090 }, { "epoch": 2.296855691335988, "grad_norm": 2.8296186923980713, "learning_rate": 7.802197802197802e-05, "loss": 0.5197, "step": 2091 }, { "epoch": 2.297954139777564, "grad_norm": 0.6898410320281982, "learning_rate": 7.78998778998779e-05, "loss": 0.782, "step": 2092 }, { "epoch": 2.2990525882191406, "grad_norm": 0.37363025546073914, "learning_rate": 7.777777777777777e-05, "loss": 0.5824, "step": 2093 }, { "epoch": 2.3001510366607167, "grad_norm": 0.5120764374732971, "learning_rate": 7.765567765567765e-05, "loss": 0.7326, "step": 2094 }, { "epoch": 2.301249485102293, "grad_norm": 0.6517985463142395, "learning_rate": 7.753357753357753e-05, "loss": 0.6274, "step": 2095 }, { "epoch": 2.3023479335438695, "grad_norm": 0.8033846020698547, "learning_rate": 7.74114774114774e-05, "loss": 0.7093, "step": 2096 }, { "epoch": 2.3034463819854456, "grad_norm": 0.896397590637207, "learning_rate": 7.728937728937727e-05, "loss": 0.6685, "step": 2097 }, { "epoch": 2.3045448304270217, "grad_norm": 0.4606597423553467, "learning_rate": 7.716727716727717e-05, "loss": 0.5821, "step": 2098 }, { "epoch": 2.3056432788685983, "grad_norm": 0.9286845922470093, "learning_rate": 7.704517704517704e-05, "loss": 0.7537, "step": 2099 }, { "epoch": 2.3067417273101745, "grad_norm": 0.6514043211936951, "learning_rate": 7.692307692307691e-05, "loss": 0.5644, "step": 2100 }, { "epoch": 2.3078401757517506, "grad_norm": 0.4881083369255066, "learning_rate": 7.68009768009768e-05, "loss": 0.5348, "step": 2101 }, { "epoch": 2.3089386241933267, "grad_norm": 2.688716173171997, "learning_rate": 7.667887667887667e-05, "loss": 0.6732, "step": 2102 }, { "epoch": 2.3100370726349033, "grad_norm": 0.4597708582878113, "learning_rate": 7.655677655677654e-05, "loss": 0.6166, "step": 2103 }, { "epoch": 2.3111355210764795, "grad_norm": 0.7629315853118896, "learning_rate": 7.643467643467644e-05, "loss": 0.4677, "step": 2104 }, { "epoch": 2.3122339695180556, "grad_norm": 0.7282788753509521, "learning_rate": 7.631257631257631e-05, "loss": 0.6841, "step": 2105 }, { "epoch": 2.313332417959632, "grad_norm": 0.5421862006187439, "learning_rate": 7.619047619047618e-05, "loss": 0.7274, "step": 2106 }, { "epoch": 2.3144308664012083, "grad_norm": 0.7396867871284485, "learning_rate": 7.606837606837607e-05, "loss": 0.6546, "step": 2107 }, { "epoch": 2.3155293148427845, "grad_norm": 0.34731313586235046, "learning_rate": 7.594627594627594e-05, "loss": 0.72, "step": 2108 }, { "epoch": 2.3166277632843606, "grad_norm": 1.1024978160858154, "learning_rate": 7.582417582417581e-05, "loss": 0.7304, "step": 2109 }, { "epoch": 2.317726211725937, "grad_norm": 0.5866183638572693, "learning_rate": 7.570207570207569e-05, "loss": 0.4912, "step": 2110 }, { "epoch": 2.3188246601675133, "grad_norm": 0.8068836331367493, "learning_rate": 7.557997557997557e-05, "loss": 0.5342, "step": 2111 }, { "epoch": 2.31992310860909, "grad_norm": 0.6417646408081055, "learning_rate": 7.545787545787544e-05, "loss": 0.7642, "step": 2112 }, { "epoch": 2.321021557050666, "grad_norm": 0.4545808434486389, "learning_rate": 7.533577533577533e-05, "loss": 0.5681, "step": 2113 }, { "epoch": 2.322120005492242, "grad_norm": 0.3567211329936981, "learning_rate": 7.521367521367521e-05, "loss": 0.6368, "step": 2114 }, { "epoch": 2.3232184539338183, "grad_norm": 0.5747010707855225, "learning_rate": 7.509157509157509e-05, "loss": 0.5848, "step": 2115 }, { "epoch": 2.324316902375395, "grad_norm": 0.46303555369377136, "learning_rate": 7.496947496947497e-05, "loss": 0.6577, "step": 2116 }, { "epoch": 2.325415350816971, "grad_norm": 0.5343080759048462, "learning_rate": 7.484737484737484e-05, "loss": 0.8531, "step": 2117 }, { "epoch": 2.326513799258547, "grad_norm": 0.9027140736579895, "learning_rate": 7.472527472527472e-05, "loss": 0.6271, "step": 2118 }, { "epoch": 2.3276122477001238, "grad_norm": 0.6390063166618347, "learning_rate": 7.460317460317459e-05, "loss": 0.5669, "step": 2119 }, { "epoch": 2.3287106961417, "grad_norm": 0.4965013563632965, "learning_rate": 7.448107448107447e-05, "loss": 0.6362, "step": 2120 }, { "epoch": 2.329809144583276, "grad_norm": 0.49252766370773315, "learning_rate": 7.435897435897436e-05, "loss": 0.6703, "step": 2121 }, { "epoch": 2.330907593024852, "grad_norm": 0.7043023705482483, "learning_rate": 7.423687423687423e-05, "loss": 0.7114, "step": 2122 }, { "epoch": 2.3320060414664288, "grad_norm": 0.4373185634613037, "learning_rate": 7.41147741147741e-05, "loss": 0.5656, "step": 2123 }, { "epoch": 2.333104489908005, "grad_norm": 1.0036537647247314, "learning_rate": 7.399267399267399e-05, "loss": 0.6652, "step": 2124 }, { "epoch": 2.334202938349581, "grad_norm": 2.06589937210083, "learning_rate": 7.387057387057386e-05, "loss": 0.6502, "step": 2125 }, { "epoch": 2.3353013867911576, "grad_norm": 1.1616554260253906, "learning_rate": 7.374847374847375e-05, "loss": 0.7288, "step": 2126 }, { "epoch": 2.3363998352327338, "grad_norm": 0.4532950520515442, "learning_rate": 7.362637362637362e-05, "loss": 0.7696, "step": 2127 }, { "epoch": 2.33749828367431, "grad_norm": 1.0143449306488037, "learning_rate": 7.35042735042735e-05, "loss": 1.0185, "step": 2128 }, { "epoch": 2.3385967321158865, "grad_norm": 2.2059850692749023, "learning_rate": 7.338217338217337e-05, "loss": 0.6267, "step": 2129 }, { "epoch": 2.3396951805574626, "grad_norm": 0.4883456826210022, "learning_rate": 7.326007326007325e-05, "loss": 0.6081, "step": 2130 }, { "epoch": 2.3407936289990388, "grad_norm": 0.42373138666152954, "learning_rate": 7.313797313797313e-05, "loss": 0.6204, "step": 2131 }, { "epoch": 2.3418920774406153, "grad_norm": 0.43958979845046997, "learning_rate": 7.3015873015873e-05, "loss": 0.7608, "step": 2132 }, { "epoch": 2.3429905258821915, "grad_norm": 0.4493010342121124, "learning_rate": 7.289377289377289e-05, "loss": 0.5985, "step": 2133 }, { "epoch": 2.3440889743237676, "grad_norm": 0.38533085584640503, "learning_rate": 7.277167277167276e-05, "loss": 0.445, "step": 2134 }, { "epoch": 2.3451874227653438, "grad_norm": 0.37900710105895996, "learning_rate": 7.264957264957265e-05, "loss": 0.8466, "step": 2135 }, { "epoch": 2.3462858712069203, "grad_norm": 1.7598285675048828, "learning_rate": 7.252747252747252e-05, "loss": 0.6881, "step": 2136 }, { "epoch": 2.3473843196484965, "grad_norm": 0.5551338791847229, "learning_rate": 7.24053724053724e-05, "loss": 0.5908, "step": 2137 }, { "epoch": 2.3484827680900726, "grad_norm": 0.42995861172676086, "learning_rate": 7.228327228327228e-05, "loss": 0.689, "step": 2138 }, { "epoch": 2.349581216531649, "grad_norm": 0.6428760290145874, "learning_rate": 7.216117216117216e-05, "loss": 0.5879, "step": 2139 }, { "epoch": 2.3506796649732253, "grad_norm": 0.6199445724487305, "learning_rate": 7.203907203907203e-05, "loss": 0.5275, "step": 2140 }, { "epoch": 2.3517781134148015, "grad_norm": 0.4687311053276062, "learning_rate": 7.19169719169719e-05, "loss": 0.7046, "step": 2141 }, { "epoch": 2.352876561856378, "grad_norm": 0.47645121812820435, "learning_rate": 7.179487179487179e-05, "loss": 0.4787, "step": 2142 }, { "epoch": 2.353975010297954, "grad_norm": 1.3774843215942383, "learning_rate": 7.167277167277166e-05, "loss": 0.565, "step": 2143 }, { "epoch": 2.3550734587395303, "grad_norm": 0.9585548043251038, "learning_rate": 7.155067155067155e-05, "loss": 0.7496, "step": 2144 }, { "epoch": 2.356171907181107, "grad_norm": 0.9073938131332397, "learning_rate": 7.142857142857142e-05, "loss": 0.6785, "step": 2145 }, { "epoch": 2.357270355622683, "grad_norm": 1.4543087482452393, "learning_rate": 7.13064713064713e-05, "loss": 0.4827, "step": 2146 }, { "epoch": 2.358368804064259, "grad_norm": 0.49685895442962646, "learning_rate": 7.118437118437118e-05, "loss": 0.5624, "step": 2147 }, { "epoch": 2.3594672525058353, "grad_norm": 0.3820716142654419, "learning_rate": 7.106227106227105e-05, "loss": 0.5326, "step": 2148 }, { "epoch": 2.360565700947412, "grad_norm": 0.6018278002738953, "learning_rate": 7.094017094017094e-05, "loss": 0.7372, "step": 2149 }, { "epoch": 2.361664149388988, "grad_norm": 0.49245381355285645, "learning_rate": 7.081807081807082e-05, "loss": 0.714, "step": 2150 }, { "epoch": 2.362762597830564, "grad_norm": 0.5913417339324951, "learning_rate": 7.069597069597069e-05, "loss": 0.6395, "step": 2151 }, { "epoch": 2.3638610462721408, "grad_norm": 0.3142958879470825, "learning_rate": 7.057387057387056e-05, "loss": 0.4363, "step": 2152 }, { "epoch": 2.364959494713717, "grad_norm": 0.44251006841659546, "learning_rate": 7.045177045177044e-05, "loss": 0.5751, "step": 2153 }, { "epoch": 2.366057943155293, "grad_norm": 0.7642143964767456, "learning_rate": 7.032967032967032e-05, "loss": 0.9707, "step": 2154 }, { "epoch": 2.367156391596869, "grad_norm": 0.3676380217075348, "learning_rate": 7.020757020757021e-05, "loss": 0.6142, "step": 2155 }, { "epoch": 2.3682548400384458, "grad_norm": 0.43112027645111084, "learning_rate": 7.008547008547008e-05, "loss": 0.6194, "step": 2156 }, { "epoch": 2.369353288480022, "grad_norm": 0.5463792681694031, "learning_rate": 6.996336996336996e-05, "loss": 0.5478, "step": 2157 }, { "epoch": 2.370451736921598, "grad_norm": 0.5498053431510925, "learning_rate": 6.984126984126984e-05, "loss": 0.8373, "step": 2158 }, { "epoch": 2.3715501853631746, "grad_norm": 0.5144299268722534, "learning_rate": 6.971916971916971e-05, "loss": 0.7033, "step": 2159 }, { "epoch": 2.3726486338047508, "grad_norm": 0.4049033522605896, "learning_rate": 6.95970695970696e-05, "loss": 0.6257, "step": 2160 }, { "epoch": 2.373747082246327, "grad_norm": 0.8007866740226746, "learning_rate": 6.947496947496947e-05, "loss": 1.1859, "step": 2161 }, { "epoch": 2.3748455306879035, "grad_norm": 0.6302816867828369, "learning_rate": 6.935286935286935e-05, "loss": 0.4972, "step": 2162 }, { "epoch": 2.3759439791294796, "grad_norm": 0.4181542694568634, "learning_rate": 6.923076923076922e-05, "loss": 0.5543, "step": 2163 }, { "epoch": 2.3770424275710558, "grad_norm": 0.45409703254699707, "learning_rate": 6.91086691086691e-05, "loss": 0.6237, "step": 2164 }, { "epoch": 2.3781408760126324, "grad_norm": 0.5172666907310486, "learning_rate": 6.898656898656898e-05, "loss": 0.5798, "step": 2165 }, { "epoch": 2.3792393244542085, "grad_norm": 0.7849127054214478, "learning_rate": 6.886446886446885e-05, "loss": 0.8282, "step": 2166 }, { "epoch": 2.3803377728957846, "grad_norm": 0.4041041135787964, "learning_rate": 6.874236874236874e-05, "loss": 0.5046, "step": 2167 }, { "epoch": 2.3814362213373608, "grad_norm": 0.35880064964294434, "learning_rate": 6.862026862026862e-05, "loss": 0.4096, "step": 2168 }, { "epoch": 2.3825346697789374, "grad_norm": 0.5949457883834839, "learning_rate": 6.84981684981685e-05, "loss": 0.6666, "step": 2169 }, { "epoch": 2.3836331182205135, "grad_norm": 0.6332186460494995, "learning_rate": 6.837606837606837e-05, "loss": 0.9715, "step": 2170 }, { "epoch": 2.3847315666620896, "grad_norm": 0.3173432946205139, "learning_rate": 6.825396825396824e-05, "loss": 0.6792, "step": 2171 }, { "epoch": 2.385830015103666, "grad_norm": 0.7556782364845276, "learning_rate": 6.813186813186813e-05, "loss": 0.7267, "step": 2172 }, { "epoch": 2.3869284635452424, "grad_norm": 0.43191683292388916, "learning_rate": 6.800976800976801e-05, "loss": 0.5841, "step": 2173 }, { "epoch": 2.3880269119868185, "grad_norm": 0.4010660946369171, "learning_rate": 6.788766788766788e-05, "loss": 0.7491, "step": 2174 }, { "epoch": 2.389125360428395, "grad_norm": 0.6889204382896423, "learning_rate": 6.776556776556775e-05, "loss": 0.4539, "step": 2175 }, { "epoch": 2.390223808869971, "grad_norm": 0.4509136974811554, "learning_rate": 6.764346764346764e-05, "loss": 0.7066, "step": 2176 }, { "epoch": 2.3913222573115474, "grad_norm": 0.4313298463821411, "learning_rate": 6.752136752136751e-05, "loss": 0.6292, "step": 2177 }, { "epoch": 2.392420705753124, "grad_norm": 0.7713265419006348, "learning_rate": 6.73992673992674e-05, "loss": 0.8392, "step": 2178 }, { "epoch": 2.3935191541947, "grad_norm": 0.5283428430557251, "learning_rate": 6.727716727716727e-05, "loss": 0.6912, "step": 2179 }, { "epoch": 2.394617602636276, "grad_norm": 0.40429314970970154, "learning_rate": 6.715506715506716e-05, "loss": 0.4335, "step": 2180 }, { "epoch": 2.3957160510778523, "grad_norm": 0.6888754367828369, "learning_rate": 6.703296703296703e-05, "loss": 0.6276, "step": 2181 }, { "epoch": 2.396814499519429, "grad_norm": 0.5595026612281799, "learning_rate": 6.69108669108669e-05, "loss": 0.7806, "step": 2182 }, { "epoch": 2.397912947961005, "grad_norm": 0.32394587993621826, "learning_rate": 6.678876678876678e-05, "loss": 0.5531, "step": 2183 }, { "epoch": 2.399011396402581, "grad_norm": 0.5909039974212646, "learning_rate": 6.666666666666666e-05, "loss": 0.4932, "step": 2184 }, { "epoch": 2.400109844844158, "grad_norm": 0.4148501455783844, "learning_rate": 6.654456654456654e-05, "loss": 0.5637, "step": 2185 }, { "epoch": 2.401208293285734, "grad_norm": 0.558403491973877, "learning_rate": 6.642246642246641e-05, "loss": 0.5733, "step": 2186 }, { "epoch": 2.40230674172731, "grad_norm": 0.5171149373054504, "learning_rate": 6.630036630036629e-05, "loss": 0.6931, "step": 2187 }, { "epoch": 2.403405190168886, "grad_norm": 0.44966164231300354, "learning_rate": 6.617826617826617e-05, "loss": 0.5061, "step": 2188 }, { "epoch": 2.404503638610463, "grad_norm": 0.45499417185783386, "learning_rate": 6.605616605616606e-05, "loss": 0.3726, "step": 2189 }, { "epoch": 2.405602087052039, "grad_norm": 0.5790139436721802, "learning_rate": 6.593406593406593e-05, "loss": 0.6647, "step": 2190 }, { "epoch": 2.4067005354936155, "grad_norm": 0.5948793292045593, "learning_rate": 6.581196581196581e-05, "loss": 0.765, "step": 2191 }, { "epoch": 2.4077989839351917, "grad_norm": 0.5925643444061279, "learning_rate": 6.568986568986569e-05, "loss": 0.889, "step": 2192 }, { "epoch": 2.408897432376768, "grad_norm": 0.5776219964027405, "learning_rate": 6.556776556776556e-05, "loss": 0.5506, "step": 2193 }, { "epoch": 2.409995880818344, "grad_norm": 0.44397997856140137, "learning_rate": 6.544566544566544e-05, "loss": 0.5372, "step": 2194 }, { "epoch": 2.4110943292599205, "grad_norm": 0.45733606815338135, "learning_rate": 6.532356532356532e-05, "loss": 0.7207, "step": 2195 }, { "epoch": 2.4121927777014966, "grad_norm": 0.38223645091056824, "learning_rate": 6.52014652014652e-05, "loss": 0.5888, "step": 2196 }, { "epoch": 2.413291226143073, "grad_norm": 0.3642580211162567, "learning_rate": 6.507936507936507e-05, "loss": 0.5687, "step": 2197 }, { "epoch": 2.4143896745846494, "grad_norm": 0.42435723543167114, "learning_rate": 6.495726495726494e-05, "loss": 0.6056, "step": 2198 }, { "epoch": 2.4154881230262255, "grad_norm": 0.4998740255832672, "learning_rate": 6.483516483516483e-05, "loss": 0.6813, "step": 2199 }, { "epoch": 2.4165865714678016, "grad_norm": 0.47158849239349365, "learning_rate": 6.47130647130647e-05, "loss": 0.5585, "step": 2200 }, { "epoch": 2.417685019909378, "grad_norm": 0.4780612289905548, "learning_rate": 6.459096459096459e-05, "loss": 0.4941, "step": 2201 }, { "epoch": 2.4187834683509544, "grad_norm": 0.5073630809783936, "learning_rate": 6.446886446886447e-05, "loss": 0.4549, "step": 2202 }, { "epoch": 2.4198819167925305, "grad_norm": 0.4311310052871704, "learning_rate": 6.434676434676435e-05, "loss": 0.4419, "step": 2203 }, { "epoch": 2.4209803652341066, "grad_norm": 0.3557896316051483, "learning_rate": 6.422466422466422e-05, "loss": 0.6973, "step": 2204 }, { "epoch": 2.4220788136756832, "grad_norm": 0.6171516180038452, "learning_rate": 6.410256410256409e-05, "loss": 0.7554, "step": 2205 }, { "epoch": 2.4231772621172594, "grad_norm": 0.4687957465648651, "learning_rate": 6.398046398046397e-05, "loss": 0.7429, "step": 2206 }, { "epoch": 2.4242757105588355, "grad_norm": 0.8685696125030518, "learning_rate": 6.385836385836386e-05, "loss": 0.5896, "step": 2207 }, { "epoch": 2.425374159000412, "grad_norm": 0.39599040150642395, "learning_rate": 6.373626373626373e-05, "loss": 0.4744, "step": 2208 }, { "epoch": 2.4264726074419882, "grad_norm": 0.9079630970954895, "learning_rate": 6.36141636141636e-05, "loss": 0.6067, "step": 2209 }, { "epoch": 2.4275710558835644, "grad_norm": 0.5051462054252625, "learning_rate": 6.349206349206349e-05, "loss": 0.7314, "step": 2210 }, { "epoch": 2.428669504325141, "grad_norm": 0.4899844825267792, "learning_rate": 6.336996336996336e-05, "loss": 0.7086, "step": 2211 }, { "epoch": 2.429767952766717, "grad_norm": 0.5135432481765747, "learning_rate": 6.324786324786325e-05, "loss": 0.5261, "step": 2212 }, { "epoch": 2.4308664012082932, "grad_norm": 0.6025048494338989, "learning_rate": 6.312576312576312e-05, "loss": 0.5276, "step": 2213 }, { "epoch": 2.4319648496498694, "grad_norm": 0.6931442022323608, "learning_rate": 6.3003663003663e-05, "loss": 0.6535, "step": 2214 }, { "epoch": 2.433063298091446, "grad_norm": 0.695106565952301, "learning_rate": 6.288156288156288e-05, "loss": 0.9183, "step": 2215 }, { "epoch": 2.434161746533022, "grad_norm": 0.450100302696228, "learning_rate": 6.275946275946275e-05, "loss": 0.5049, "step": 2216 }, { "epoch": 2.4352601949745982, "grad_norm": 0.5539785623550415, "learning_rate": 6.263736263736263e-05, "loss": 0.5735, "step": 2217 }, { "epoch": 2.436358643416175, "grad_norm": 0.5560977458953857, "learning_rate": 6.25152625152625e-05, "loss": 0.7364, "step": 2218 }, { "epoch": 2.437457091857751, "grad_norm": 0.740195095539093, "learning_rate": 6.239316239316239e-05, "loss": 0.7839, "step": 2219 }, { "epoch": 2.438555540299327, "grad_norm": 0.9324271082878113, "learning_rate": 6.227106227106226e-05, "loss": 0.6365, "step": 2220 }, { "epoch": 2.4396539887409037, "grad_norm": 0.5540104508399963, "learning_rate": 6.214896214896215e-05, "loss": 0.6586, "step": 2221 }, { "epoch": 2.44075243718248, "grad_norm": 0.5028054714202881, "learning_rate": 6.202686202686202e-05, "loss": 0.4422, "step": 2222 }, { "epoch": 2.441850885624056, "grad_norm": 0.7052125930786133, "learning_rate": 6.190476190476189e-05, "loss": 0.7248, "step": 2223 }, { "epoch": 2.4429493340656325, "grad_norm": 0.6705207824707031, "learning_rate": 6.178266178266178e-05, "loss": 0.81, "step": 2224 }, { "epoch": 2.4440477825072087, "grad_norm": 0.7996514439582825, "learning_rate": 6.166056166056166e-05, "loss": 0.382, "step": 2225 }, { "epoch": 2.445146230948785, "grad_norm": 1.5169689655303955, "learning_rate": 6.153846153846154e-05, "loss": 0.7373, "step": 2226 }, { "epoch": 2.446244679390361, "grad_norm": 0.8039339780807495, "learning_rate": 6.141636141636141e-05, "loss": 0.8609, "step": 2227 }, { "epoch": 2.4473431278319375, "grad_norm": 0.6489125490188599, "learning_rate": 6.129426129426128e-05, "loss": 0.6309, "step": 2228 }, { "epoch": 2.4484415762735137, "grad_norm": 0.533184826374054, "learning_rate": 6.117216117216116e-05, "loss": 0.5166, "step": 2229 }, { "epoch": 2.44954002471509, "grad_norm": 0.5699225068092346, "learning_rate": 6.105006105006105e-05, "loss": 0.7276, "step": 2230 }, { "epoch": 2.4506384731566664, "grad_norm": 0.5552012324333191, "learning_rate": 6.092796092796092e-05, "loss": 0.636, "step": 2231 }, { "epoch": 2.4517369215982425, "grad_norm": 0.4785599112510681, "learning_rate": 6.08058608058608e-05, "loss": 0.6362, "step": 2232 }, { "epoch": 2.4528353700398187, "grad_norm": 0.740872859954834, "learning_rate": 6.068376068376068e-05, "loss": 0.5603, "step": 2233 }, { "epoch": 2.453933818481395, "grad_norm": 0.5217441916465759, "learning_rate": 6.056166056166056e-05, "loss": 0.6306, "step": 2234 }, { "epoch": 2.4550322669229714, "grad_norm": 0.446481853723526, "learning_rate": 6.043956043956044e-05, "loss": 0.8156, "step": 2235 }, { "epoch": 2.4561307153645475, "grad_norm": 0.6527410745620728, "learning_rate": 6.031746031746031e-05, "loss": 0.7057, "step": 2236 }, { "epoch": 2.4572291638061237, "grad_norm": 0.6801958680152893, "learning_rate": 6.019536019536019e-05, "loss": 0.7718, "step": 2237 }, { "epoch": 2.4583276122477002, "grad_norm": 1.0723007917404175, "learning_rate": 6.007326007326007e-05, "loss": 0.5552, "step": 2238 }, { "epoch": 2.4594260606892764, "grad_norm": 0.4058208763599396, "learning_rate": 5.9951159951159945e-05, "loss": 0.5035, "step": 2239 }, { "epoch": 2.4605245091308525, "grad_norm": 0.5384330153465271, "learning_rate": 5.9829059829059824e-05, "loss": 0.5059, "step": 2240 }, { "epoch": 2.461622957572429, "grad_norm": 0.7797716856002808, "learning_rate": 5.9706959706959696e-05, "loss": 0.5613, "step": 2241 }, { "epoch": 2.4627214060140052, "grad_norm": 2.9689226150512695, "learning_rate": 5.958485958485958e-05, "loss": 0.6219, "step": 2242 }, { "epoch": 2.4638198544555814, "grad_norm": 0.47863152623176575, "learning_rate": 5.946275946275946e-05, "loss": 0.5498, "step": 2243 }, { "epoch": 2.464918302897158, "grad_norm": 0.49707144498825073, "learning_rate": 5.934065934065933e-05, "loss": 0.775, "step": 2244 }, { "epoch": 2.466016751338734, "grad_norm": 0.3437495529651642, "learning_rate": 5.921855921855922e-05, "loss": 0.4592, "step": 2245 }, { "epoch": 2.4671151997803102, "grad_norm": 0.7298309206962585, "learning_rate": 5.9096459096459096e-05, "loss": 0.5374, "step": 2246 }, { "epoch": 2.4682136482218864, "grad_norm": 0.6666691303253174, "learning_rate": 5.897435897435897e-05, "loss": 0.424, "step": 2247 }, { "epoch": 2.469312096663463, "grad_norm": 0.5841661691665649, "learning_rate": 5.8852258852258847e-05, "loss": 0.5316, "step": 2248 }, { "epoch": 2.470410545105039, "grad_norm": 0.4921081066131592, "learning_rate": 5.873015873015872e-05, "loss": 0.6901, "step": 2249 }, { "epoch": 2.4715089935466152, "grad_norm": 0.4779987633228302, "learning_rate": 5.8608058608058604e-05, "loss": 0.8976, "step": 2250 }, { "epoch": 2.472607441988192, "grad_norm": 0.43142780661582947, "learning_rate": 5.848595848595848e-05, "loss": 0.4915, "step": 2251 }, { "epoch": 2.473705890429768, "grad_norm": 1.132870078086853, "learning_rate": 5.8363858363858355e-05, "loss": 0.6633, "step": 2252 }, { "epoch": 2.474804338871344, "grad_norm": 0.5674893856048584, "learning_rate": 5.824175824175824e-05, "loss": 0.5023, "step": 2253 }, { "epoch": 2.4759027873129207, "grad_norm": 0.42495957016944885, "learning_rate": 5.811965811965811e-05, "loss": 0.6544, "step": 2254 }, { "epoch": 2.477001235754497, "grad_norm": 0.8031434416770935, "learning_rate": 5.799755799755799e-05, "loss": 0.892, "step": 2255 }, { "epoch": 2.478099684196073, "grad_norm": 0.7715115547180176, "learning_rate": 5.7875457875457876e-05, "loss": 0.5659, "step": 2256 }, { "epoch": 2.4791981326376495, "grad_norm": 0.6882114410400391, "learning_rate": 5.775335775335775e-05, "loss": 0.5154, "step": 2257 }, { "epoch": 2.4802965810792257, "grad_norm": 0.4994114935398102, "learning_rate": 5.763125763125763e-05, "loss": 0.6001, "step": 2258 }, { "epoch": 2.481395029520802, "grad_norm": 0.45008450746536255, "learning_rate": 5.7509157509157506e-05, "loss": 0.7076, "step": 2259 }, { "epoch": 2.482493477962378, "grad_norm": 0.654270350933075, "learning_rate": 5.738705738705738e-05, "loss": 0.5809, "step": 2260 }, { "epoch": 2.4835919264039545, "grad_norm": 0.6344896554946899, "learning_rate": 5.726495726495726e-05, "loss": 0.6059, "step": 2261 }, { "epoch": 2.4846903748455307, "grad_norm": 0.44090238213539124, "learning_rate": 5.7142857142857135e-05, "loss": 0.7953, "step": 2262 }, { "epoch": 2.485788823287107, "grad_norm": 0.47564128041267395, "learning_rate": 5.7020757020757014e-05, "loss": 0.5062, "step": 2263 }, { "epoch": 2.4868872717286834, "grad_norm": 0.3644583225250244, "learning_rate": 5.68986568986569e-05, "loss": 0.6417, "step": 2264 }, { "epoch": 2.4879857201702595, "grad_norm": 0.5264548659324646, "learning_rate": 5.677655677655677e-05, "loss": 0.5971, "step": 2265 }, { "epoch": 2.4890841686118357, "grad_norm": 0.7300589680671692, "learning_rate": 5.665445665445665e-05, "loss": 0.6249, "step": 2266 }, { "epoch": 2.490182617053412, "grad_norm": 0.9016311764717102, "learning_rate": 5.653235653235652e-05, "loss": 0.5761, "step": 2267 }, { "epoch": 2.4912810654949884, "grad_norm": 0.7480237483978271, "learning_rate": 5.641025641025641e-05, "loss": 0.4026, "step": 2268 }, { "epoch": 2.4923795139365645, "grad_norm": 0.5738864541053772, "learning_rate": 5.6288156288156286e-05, "loss": 0.8657, "step": 2269 }, { "epoch": 2.493477962378141, "grad_norm": 0.7320820093154907, "learning_rate": 5.616605616605616e-05, "loss": 0.7341, "step": 2270 }, { "epoch": 2.4945764108197173, "grad_norm": 0.7029497623443604, "learning_rate": 5.6043956043956037e-05, "loss": 0.7597, "step": 2271 }, { "epoch": 2.4956748592612934, "grad_norm": 0.5160001516342163, "learning_rate": 5.592185592185592e-05, "loss": 0.6488, "step": 2272 }, { "epoch": 2.4967733077028695, "grad_norm": 0.5425933003425598, "learning_rate": 5.5799755799755794e-05, "loss": 0.7102, "step": 2273 }, { "epoch": 2.497871756144446, "grad_norm": 0.5881295204162598, "learning_rate": 5.567765567765567e-05, "loss": 0.8123, "step": 2274 }, { "epoch": 2.4989702045860223, "grad_norm": 0.6021397113800049, "learning_rate": 5.5555555555555545e-05, "loss": 0.8887, "step": 2275 }, { "epoch": 2.5000686530275984, "grad_norm": 0.4754411578178406, "learning_rate": 5.543345543345543e-05, "loss": 0.8162, "step": 2276 }, { "epoch": 2.501167101469175, "grad_norm": 0.46976983547210693, "learning_rate": 5.531135531135531e-05, "loss": 0.4177, "step": 2277 }, { "epoch": 2.502265549910751, "grad_norm": 0.4946482181549072, "learning_rate": 5.518925518925518e-05, "loss": 0.6997, "step": 2278 }, { "epoch": 2.5033639983523273, "grad_norm": 0.49166280031204224, "learning_rate": 5.5067155067155066e-05, "loss": 0.6436, "step": 2279 }, { "epoch": 2.5044624467939034, "grad_norm": 0.40157628059387207, "learning_rate": 5.494505494505494e-05, "loss": 0.6998, "step": 2280 }, { "epoch": 2.50556089523548, "grad_norm": 0.4139937162399292, "learning_rate": 5.482295482295482e-05, "loss": 0.4021, "step": 2281 }, { "epoch": 2.506659343677056, "grad_norm": 3.6814892292022705, "learning_rate": 5.4700854700854696e-05, "loss": 0.6402, "step": 2282 }, { "epoch": 2.5077577921186327, "grad_norm": 0.3136257529258728, "learning_rate": 5.4578754578754574e-05, "loss": 0.5364, "step": 2283 }, { "epoch": 2.508856240560209, "grad_norm": 0.42901432514190674, "learning_rate": 5.445665445665445e-05, "loss": 0.6838, "step": 2284 }, { "epoch": 2.509954689001785, "grad_norm": 0.8462406992912292, "learning_rate": 5.433455433455433e-05, "loss": 0.4232, "step": 2285 }, { "epoch": 2.511053137443361, "grad_norm": 1.244150996208191, "learning_rate": 5.4212454212454204e-05, "loss": 0.6192, "step": 2286 }, { "epoch": 2.5121515858849373, "grad_norm": 0.834296703338623, "learning_rate": 5.409035409035409e-05, "loss": 0.548, "step": 2287 }, { "epoch": 2.513250034326514, "grad_norm": 0.4279276430606842, "learning_rate": 5.396825396825396e-05, "loss": 0.7549, "step": 2288 }, { "epoch": 2.51434848276809, "grad_norm": 0.5770757794380188, "learning_rate": 5.384615384615384e-05, "loss": 0.6156, "step": 2289 }, { "epoch": 2.5154469312096666, "grad_norm": 0.41763821244239807, "learning_rate": 5.3724053724053725e-05, "loss": 0.5019, "step": 2290 }, { "epoch": 2.5165453796512427, "grad_norm": 0.5212944746017456, "learning_rate": 5.36019536019536e-05, "loss": 0.6132, "step": 2291 }, { "epoch": 2.517643828092819, "grad_norm": 0.44493457674980164, "learning_rate": 5.3479853479853476e-05, "loss": 0.4162, "step": 2292 }, { "epoch": 2.518742276534395, "grad_norm": 0.46922022104263306, "learning_rate": 5.335775335775335e-05, "loss": 0.4624, "step": 2293 }, { "epoch": 2.5198407249759716, "grad_norm": 0.41906213760375977, "learning_rate": 5.3235653235653233e-05, "loss": 0.612, "step": 2294 }, { "epoch": 2.5209391734175477, "grad_norm": 0.620276153087616, "learning_rate": 5.311355311355311e-05, "loss": 0.6322, "step": 2295 }, { "epoch": 2.522037621859124, "grad_norm": 0.6597051620483398, "learning_rate": 5.2991452991452984e-05, "loss": 0.7659, "step": 2296 }, { "epoch": 2.5231360703007004, "grad_norm": 4.377660274505615, "learning_rate": 5.286935286935286e-05, "loss": 0.8294, "step": 2297 }, { "epoch": 2.5242345187422766, "grad_norm": 0.6086331009864807, "learning_rate": 5.274725274725275e-05, "loss": 0.5164, "step": 2298 }, { "epoch": 2.5253329671838527, "grad_norm": 0.5100352168083191, "learning_rate": 5.262515262515262e-05, "loss": 0.6319, "step": 2299 }, { "epoch": 2.526431415625429, "grad_norm": 0.6642487049102783, "learning_rate": 5.25030525030525e-05, "loss": 0.533, "step": 2300 }, { "epoch": 2.5275298640670054, "grad_norm": 0.5834927558898926, "learning_rate": 5.238095238095237e-05, "loss": 0.5669, "step": 2301 }, { "epoch": 2.5286283125085816, "grad_norm": 0.530815064907074, "learning_rate": 5.2258852258852256e-05, "loss": 0.6189, "step": 2302 }, { "epoch": 2.529726760950158, "grad_norm": 0.6275864243507385, "learning_rate": 5.2136752136752135e-05, "loss": 0.8403, "step": 2303 }, { "epoch": 2.5308252093917343, "grad_norm": 0.5878366827964783, "learning_rate": 5.201465201465201e-05, "loss": 0.6176, "step": 2304 }, { "epoch": 2.5319236578333104, "grad_norm": 0.37410980463027954, "learning_rate": 5.189255189255189e-05, "loss": 0.6337, "step": 2305 }, { "epoch": 2.5330221062748866, "grad_norm": 0.43912917375564575, "learning_rate": 5.1770451770451764e-05, "loss": 0.5348, "step": 2306 }, { "epoch": 2.534120554716463, "grad_norm": 1.4737471342086792, "learning_rate": 5.164835164835164e-05, "loss": 0.4862, "step": 2307 }, { "epoch": 2.5352190031580393, "grad_norm": 0.3978705108165741, "learning_rate": 5.152625152625152e-05, "loss": 0.7929, "step": 2308 }, { "epoch": 2.5363174515996154, "grad_norm": 0.3852058947086334, "learning_rate": 5.14041514041514e-05, "loss": 0.5895, "step": 2309 }, { "epoch": 2.537415900041192, "grad_norm": 17.968448638916016, "learning_rate": 5.128205128205128e-05, "loss": 0.4661, "step": 2310 }, { "epoch": 2.538514348482768, "grad_norm": 0.9369175434112549, "learning_rate": 5.115995115995115e-05, "loss": 0.5957, "step": 2311 }, { "epoch": 2.5396127969243443, "grad_norm": 0.612750768661499, "learning_rate": 5.103785103785103e-05, "loss": 0.6786, "step": 2312 }, { "epoch": 2.5407112453659204, "grad_norm": 0.588512659072876, "learning_rate": 5.0915750915750915e-05, "loss": 1.0482, "step": 2313 }, { "epoch": 2.541809693807497, "grad_norm": 0.4964143633842468, "learning_rate": 5.079365079365079e-05, "loss": 0.5673, "step": 2314 }, { "epoch": 2.542908142249073, "grad_norm": 0.5807982683181763, "learning_rate": 5.0671550671550666e-05, "loss": 0.5493, "step": 2315 }, { "epoch": 2.5440065906906497, "grad_norm": 0.5131386518478394, "learning_rate": 5.054945054945055e-05, "loss": 0.5947, "step": 2316 }, { "epoch": 2.545105039132226, "grad_norm": 0.4521124064922333, "learning_rate": 5.0427350427350424e-05, "loss": 0.5554, "step": 2317 }, { "epoch": 2.546203487573802, "grad_norm": 0.9441378712654114, "learning_rate": 5.03052503052503e-05, "loss": 0.6991, "step": 2318 }, { "epoch": 2.547301936015378, "grad_norm": 0.6353013515472412, "learning_rate": 5.0183150183150174e-05, "loss": 0.5308, "step": 2319 }, { "epoch": 2.5484003844569547, "grad_norm": 0.5940631628036499, "learning_rate": 5.006105006105006e-05, "loss": 0.6536, "step": 2320 }, { "epoch": 2.549498832898531, "grad_norm": 0.5457591414451599, "learning_rate": 4.993894993894994e-05, "loss": 0.6927, "step": 2321 }, { "epoch": 2.550597281340107, "grad_norm": 0.6265937685966492, "learning_rate": 4.981684981684981e-05, "loss": 0.6341, "step": 2322 }, { "epoch": 2.5516957297816836, "grad_norm": 0.5842925310134888, "learning_rate": 4.969474969474969e-05, "loss": 0.4583, "step": 2323 }, { "epoch": 2.5527941782232597, "grad_norm": 0.5363351106643677, "learning_rate": 4.957264957264956e-05, "loss": 0.6882, "step": 2324 }, { "epoch": 2.553892626664836, "grad_norm": 0.3677682876586914, "learning_rate": 4.9450549450549446e-05, "loss": 0.5671, "step": 2325 }, { "epoch": 2.554991075106412, "grad_norm": 1.222985863685608, "learning_rate": 4.9328449328449325e-05, "loss": 0.4936, "step": 2326 }, { "epoch": 2.5560895235479886, "grad_norm": 1.187898874282837, "learning_rate": 4.92063492063492e-05, "loss": 0.4893, "step": 2327 }, { "epoch": 2.5571879719895647, "grad_norm": 0.38843801617622375, "learning_rate": 4.908424908424908e-05, "loss": 0.6512, "step": 2328 }, { "epoch": 2.558286420431141, "grad_norm": 0.9550036191940308, "learning_rate": 4.896214896214896e-05, "loss": 0.6055, "step": 2329 }, { "epoch": 2.5593848688727174, "grad_norm": 0.80762779712677, "learning_rate": 4.884004884004883e-05, "loss": 0.8852, "step": 2330 }, { "epoch": 2.5604833173142936, "grad_norm": 0.7496643662452698, "learning_rate": 4.871794871794872e-05, "loss": 0.6535, "step": 2331 }, { "epoch": 2.5615817657558697, "grad_norm": 0.5532578825950623, "learning_rate": 4.859584859584859e-05, "loss": 0.6336, "step": 2332 }, { "epoch": 2.562680214197446, "grad_norm": 0.4058012366294861, "learning_rate": 4.847374847374847e-05, "loss": 0.6529, "step": 2333 }, { "epoch": 2.5637786626390224, "grad_norm": 3.1913115978240967, "learning_rate": 4.835164835164835e-05, "loss": 0.548, "step": 2334 }, { "epoch": 2.5648771110805986, "grad_norm": 0.47375988960266113, "learning_rate": 4.822954822954822e-05, "loss": 0.7567, "step": 2335 }, { "epoch": 2.565975559522175, "grad_norm": 0.5287726521492004, "learning_rate": 4.8107448107448106e-05, "loss": 0.6009, "step": 2336 }, { "epoch": 2.5670740079637513, "grad_norm": 0.43966931104660034, "learning_rate": 4.798534798534798e-05, "loss": 0.5538, "step": 2337 }, { "epoch": 2.5681724564053274, "grad_norm": 0.6683239340782166, "learning_rate": 4.7863247863247856e-05, "loss": 0.3999, "step": 2338 }, { "epoch": 2.5692709048469036, "grad_norm": 0.5260687470436096, "learning_rate": 4.774114774114774e-05, "loss": 0.7212, "step": 2339 }, { "epoch": 2.57036935328848, "grad_norm": 1.086850881576538, "learning_rate": 4.7619047619047614e-05, "loss": 0.7439, "step": 2340 }, { "epoch": 2.5714678017300563, "grad_norm": 0.9744517207145691, "learning_rate": 4.749694749694749e-05, "loss": 0.5625, "step": 2341 }, { "epoch": 2.5725662501716324, "grad_norm": 0.6829352974891663, "learning_rate": 4.737484737484738e-05, "loss": 0.5241, "step": 2342 }, { "epoch": 2.573664698613209, "grad_norm": 0.9441612958908081, "learning_rate": 4.725274725274725e-05, "loss": 0.8815, "step": 2343 }, { "epoch": 2.574763147054785, "grad_norm": 0.9406607151031494, "learning_rate": 4.713064713064713e-05, "loss": 0.7176, "step": 2344 }, { "epoch": 2.5758615954963613, "grad_norm": 0.6601364016532898, "learning_rate": 4.7008547008547e-05, "loss": 0.7713, "step": 2345 }, { "epoch": 2.5769600439379374, "grad_norm": 2.5189599990844727, "learning_rate": 4.688644688644688e-05, "loss": 0.5572, "step": 2346 }, { "epoch": 2.578058492379514, "grad_norm": 0.7295210957527161, "learning_rate": 4.6764346764346765e-05, "loss": 0.4431, "step": 2347 }, { "epoch": 2.57915694082109, "grad_norm": 0.5053385496139526, "learning_rate": 4.6642246642246637e-05, "loss": 0.4881, "step": 2348 }, { "epoch": 2.5802553892626667, "grad_norm": 0.6556063890457153, "learning_rate": 4.6520146520146515e-05, "loss": 0.5168, "step": 2349 }, { "epoch": 2.581353837704243, "grad_norm": 0.37052014470100403, "learning_rate": 4.639804639804639e-05, "loss": 0.3954, "step": 2350 }, { "epoch": 2.582452286145819, "grad_norm": 0.5975561738014221, "learning_rate": 4.627594627594627e-05, "loss": 0.5714, "step": 2351 }, { "epoch": 2.583550734587395, "grad_norm": 0.7273014187812805, "learning_rate": 4.615384615384615e-05, "loss": 0.7287, "step": 2352 }, { "epoch": 2.5846491830289717, "grad_norm": 0.566586971282959, "learning_rate": 4.603174603174602e-05, "loss": 0.5589, "step": 2353 }, { "epoch": 2.585747631470548, "grad_norm": 0.5846517086029053, "learning_rate": 4.590964590964591e-05, "loss": 0.5061, "step": 2354 }, { "epoch": 2.586846079912124, "grad_norm": 0.7470859885215759, "learning_rate": 4.578754578754579e-05, "loss": 0.5433, "step": 2355 }, { "epoch": 2.5879445283537006, "grad_norm": 0.5419175624847412, "learning_rate": 4.566544566544566e-05, "loss": 0.5502, "step": 2356 }, { "epoch": 2.5890429767952767, "grad_norm": 1.507851004600525, "learning_rate": 4.554334554334554e-05, "loss": 0.7399, "step": 2357 }, { "epoch": 2.590141425236853, "grad_norm": 1.4420006275177002, "learning_rate": 4.542124542124542e-05, "loss": 0.4233, "step": 2358 }, { "epoch": 2.591239873678429, "grad_norm": 0.6471789479255676, "learning_rate": 4.5299145299145296e-05, "loss": 0.4052, "step": 2359 }, { "epoch": 2.5923383221200056, "grad_norm": 0.5886567831039429, "learning_rate": 4.5177045177045174e-05, "loss": 0.7197, "step": 2360 }, { "epoch": 2.5934367705615817, "grad_norm": 0.843024492263794, "learning_rate": 4.5054945054945046e-05, "loss": 0.7636, "step": 2361 }, { "epoch": 2.5945352190031583, "grad_norm": 0.8689064979553223, "learning_rate": 4.493284493284493e-05, "loss": 0.6694, "step": 2362 }, { "epoch": 2.5956336674447344, "grad_norm": 0.5112485289573669, "learning_rate": 4.4810744810744804e-05, "loss": 0.5338, "step": 2363 }, { "epoch": 2.5967321158863106, "grad_norm": 0.4828614294528961, "learning_rate": 4.468864468864468e-05, "loss": 0.8519, "step": 2364 }, { "epoch": 2.5978305643278867, "grad_norm": 0.5644575357437134, "learning_rate": 4.456654456654457e-05, "loss": 0.5605, "step": 2365 }, { "epoch": 2.598929012769463, "grad_norm": 0.7749584913253784, "learning_rate": 4.444444444444444e-05, "loss": 0.6697, "step": 2366 }, { "epoch": 2.6000274612110394, "grad_norm": 0.9038271307945251, "learning_rate": 4.432234432234432e-05, "loss": 0.7242, "step": 2367 }, { "epoch": 2.6011259096526156, "grad_norm": 0.5102944374084473, "learning_rate": 4.42002442002442e-05, "loss": 0.5841, "step": 2368 }, { "epoch": 2.602224358094192, "grad_norm": 0.5072823762893677, "learning_rate": 4.4078144078144076e-05, "loss": 0.4927, "step": 2369 }, { "epoch": 2.6033228065357683, "grad_norm": 0.3654184341430664, "learning_rate": 4.3956043956043955e-05, "loss": 0.6449, "step": 2370 }, { "epoch": 2.6044212549773444, "grad_norm": 1.7309939861297607, "learning_rate": 4.3833943833943827e-05, "loss": 0.6979, "step": 2371 }, { "epoch": 2.6055197034189206, "grad_norm": 0.7982075214385986, "learning_rate": 4.3711843711843705e-05, "loss": 0.6589, "step": 2372 }, { "epoch": 2.606618151860497, "grad_norm": 0.6989462375640869, "learning_rate": 4.358974358974359e-05, "loss": 0.7104, "step": 2373 }, { "epoch": 2.6077166003020733, "grad_norm": 0.7331676483154297, "learning_rate": 4.346764346764346e-05, "loss": 0.7565, "step": 2374 }, { "epoch": 2.6088150487436494, "grad_norm": 1.0566400289535522, "learning_rate": 4.334554334554334e-05, "loss": 0.6967, "step": 2375 }, { "epoch": 2.609913497185226, "grad_norm": 0.5988017320632935, "learning_rate": 4.322344322344321e-05, "loss": 0.7871, "step": 2376 }, { "epoch": 2.611011945626802, "grad_norm": 0.4248102307319641, "learning_rate": 4.31013431013431e-05, "loss": 0.6891, "step": 2377 }, { "epoch": 2.6121103940683783, "grad_norm": 1.9839611053466797, "learning_rate": 4.297924297924298e-05, "loss": 0.6647, "step": 2378 }, { "epoch": 2.6132088425099544, "grad_norm": 0.4382665455341339, "learning_rate": 4.285714285714285e-05, "loss": 0.5969, "step": 2379 }, { "epoch": 2.614307290951531, "grad_norm": 1.1918715238571167, "learning_rate": 4.2735042735042735e-05, "loss": 0.7788, "step": 2380 }, { "epoch": 2.615405739393107, "grad_norm": 0.38117820024490356, "learning_rate": 4.2612942612942614e-05, "loss": 0.4967, "step": 2381 }, { "epoch": 2.6165041878346837, "grad_norm": 0.6454489827156067, "learning_rate": 4.2490842490842486e-05, "loss": 0.7724, "step": 2382 }, { "epoch": 2.61760263627626, "grad_norm": 1.0696319341659546, "learning_rate": 4.2368742368742364e-05, "loss": 0.5292, "step": 2383 }, { "epoch": 2.618701084717836, "grad_norm": 0.5887579321861267, "learning_rate": 4.224664224664224e-05, "loss": 0.5317, "step": 2384 }, { "epoch": 2.619799533159412, "grad_norm": 0.557188093662262, "learning_rate": 4.212454212454212e-05, "loss": 0.7172, "step": 2385 }, { "epoch": 2.6208979816009887, "grad_norm": 0.5122195482254028, "learning_rate": 4.2002442002442e-05, "loss": 0.6398, "step": 2386 }, { "epoch": 2.621996430042565, "grad_norm": 0.520722508430481, "learning_rate": 4.188034188034187e-05, "loss": 0.3984, "step": 2387 }, { "epoch": 2.623094878484141, "grad_norm": 1.2077422142028809, "learning_rate": 4.175824175824176e-05, "loss": 0.6686, "step": 2388 }, { "epoch": 2.6241933269257176, "grad_norm": 1.1437829732894897, "learning_rate": 4.163614163614163e-05, "loss": 0.6653, "step": 2389 }, { "epoch": 2.6252917753672937, "grad_norm": 0.6157158017158508, "learning_rate": 4.151404151404151e-05, "loss": 0.7074, "step": 2390 }, { "epoch": 2.62639022380887, "grad_norm": 1.8944931030273438, "learning_rate": 4.1391941391941394e-05, "loss": 0.5991, "step": 2391 }, { "epoch": 2.627488672250446, "grad_norm": 0.6598528623580933, "learning_rate": 4.1269841269841266e-05, "loss": 0.6051, "step": 2392 }, { "epoch": 2.6285871206920226, "grad_norm": 0.9341129660606384, "learning_rate": 4.1147741147741145e-05, "loss": 0.3795, "step": 2393 }, { "epoch": 2.6296855691335987, "grad_norm": 0.4246079921722412, "learning_rate": 4.1025641025641023e-05, "loss": 0.4603, "step": 2394 }, { "epoch": 2.6307840175751753, "grad_norm": 0.6639881134033203, "learning_rate": 4.09035409035409e-05, "loss": 0.5862, "step": 2395 }, { "epoch": 2.6318824660167515, "grad_norm": 1.297917366027832, "learning_rate": 4.078144078144078e-05, "loss": 0.6175, "step": 2396 }, { "epoch": 2.6329809144583276, "grad_norm": 0.7880698442459106, "learning_rate": 4.065934065934065e-05, "loss": 0.7034, "step": 2397 }, { "epoch": 2.6340793628999037, "grad_norm": 0.6197066903114319, "learning_rate": 4.053724053724053e-05, "loss": 0.659, "step": 2398 }, { "epoch": 2.6351778113414803, "grad_norm": 0.7560408711433411, "learning_rate": 4.041514041514042e-05, "loss": 0.5543, "step": 2399 }, { "epoch": 2.6362762597830565, "grad_norm": 2.2571635246276855, "learning_rate": 4.029304029304029e-05, "loss": 0.712, "step": 2400 }, { "epoch": 2.6373747082246326, "grad_norm": 0.8119613528251648, "learning_rate": 4.017094017094017e-05, "loss": 0.6407, "step": 2401 }, { "epoch": 2.638473156666209, "grad_norm": 3.9773592948913574, "learning_rate": 4.004884004884004e-05, "loss": 0.6434, "step": 2402 }, { "epoch": 2.6395716051077853, "grad_norm": 1.2648125886917114, "learning_rate": 3.9926739926739925e-05, "loss": 0.689, "step": 2403 }, { "epoch": 2.6406700535493615, "grad_norm": 0.7015364170074463, "learning_rate": 3.9804639804639804e-05, "loss": 0.4175, "step": 2404 }, { "epoch": 2.6417685019909376, "grad_norm": 0.941303551197052, "learning_rate": 3.9682539682539676e-05, "loss": 0.4126, "step": 2405 }, { "epoch": 2.642866950432514, "grad_norm": 0.7533726096153259, "learning_rate": 3.956043956043956e-05, "loss": 0.7401, "step": 2406 }, { "epoch": 2.6439653988740903, "grad_norm": 0.5480525493621826, "learning_rate": 3.943833943833943e-05, "loss": 0.5567, "step": 2407 }, { "epoch": 2.6450638473156665, "grad_norm": 0.6171422004699707, "learning_rate": 3.931623931623931e-05, "loss": 0.721, "step": 2408 }, { "epoch": 2.646162295757243, "grad_norm": 0.6719728708267212, "learning_rate": 3.919413919413919e-05, "loss": 0.5015, "step": 2409 }, { "epoch": 2.647260744198819, "grad_norm": 1.8106555938720703, "learning_rate": 3.907203907203906e-05, "loss": 0.6954, "step": 2410 }, { "epoch": 2.6483591926403953, "grad_norm": 0.42534878849983215, "learning_rate": 3.894993894993895e-05, "loss": 0.5241, "step": 2411 }, { "epoch": 2.6494576410819715, "grad_norm": 0.8733202219009399, "learning_rate": 3.882783882783883e-05, "loss": 0.4485, "step": 2412 }, { "epoch": 2.650556089523548, "grad_norm": 0.9050257802009583, "learning_rate": 3.87057387057387e-05, "loss": 0.6202, "step": 2413 }, { "epoch": 2.651654537965124, "grad_norm": 0.650347888469696, "learning_rate": 3.8583638583638584e-05, "loss": 0.621, "step": 2414 }, { "epoch": 2.6527529864067008, "grad_norm": 6.092042446136475, "learning_rate": 3.8461538461538456e-05, "loss": 0.5143, "step": 2415 }, { "epoch": 2.653851434848277, "grad_norm": 0.7801241874694824, "learning_rate": 3.8339438339438335e-05, "loss": 0.5424, "step": 2416 }, { "epoch": 2.654949883289853, "grad_norm": 0.5492686629295349, "learning_rate": 3.821733821733822e-05, "loss": 0.642, "step": 2417 }, { "epoch": 2.656048331731429, "grad_norm": 0.4257514774799347, "learning_rate": 3.809523809523809e-05, "loss": 0.8273, "step": 2418 }, { "epoch": 2.6571467801730058, "grad_norm": 1.0180964469909668, "learning_rate": 3.797313797313797e-05, "loss": 0.6962, "step": 2419 }, { "epoch": 2.658245228614582, "grad_norm": 0.3844882547855377, "learning_rate": 3.785103785103784e-05, "loss": 0.7315, "step": 2420 }, { "epoch": 2.659343677056158, "grad_norm": 0.46182385087013245, "learning_rate": 3.772893772893772e-05, "loss": 0.3889, "step": 2421 }, { "epoch": 2.6604421254977346, "grad_norm": 0.562627375125885, "learning_rate": 3.760683760683761e-05, "loss": 0.6415, "step": 2422 }, { "epoch": 2.6615405739393108, "grad_norm": 0.3234645128250122, "learning_rate": 3.7484737484737486e-05, "loss": 0.4819, "step": 2423 }, { "epoch": 2.662639022380887, "grad_norm": 0.6804205775260925, "learning_rate": 3.736263736263736e-05, "loss": 0.4248, "step": 2424 }, { "epoch": 2.663737470822463, "grad_norm": 0.5543864369392395, "learning_rate": 3.7240537240537236e-05, "loss": 0.5259, "step": 2425 }, { "epoch": 2.6648359192640396, "grad_norm": 0.8411497473716736, "learning_rate": 3.7118437118437115e-05, "loss": 0.5448, "step": 2426 }, { "epoch": 2.6659343677056158, "grad_norm": 0.4386245608329773, "learning_rate": 3.6996336996336994e-05, "loss": 0.9601, "step": 2427 }, { "epoch": 2.6670328161471923, "grad_norm": 0.773210346698761, "learning_rate": 3.687423687423687e-05, "loss": 0.8601, "step": 2428 }, { "epoch": 2.6681312645887685, "grad_norm": 0.4636232852935791, "learning_rate": 3.675213675213675e-05, "loss": 0.6322, "step": 2429 }, { "epoch": 2.6692297130303446, "grad_norm": 1.6318496465682983, "learning_rate": 3.663003663003662e-05, "loss": 0.4402, "step": 2430 }, { "epoch": 2.6703281614719208, "grad_norm": 0.5299782156944275, "learning_rate": 3.65079365079365e-05, "loss": 0.5622, "step": 2431 }, { "epoch": 2.6714266099134973, "grad_norm": 1.1223825216293335, "learning_rate": 3.638583638583638e-05, "loss": 0.5994, "step": 2432 }, { "epoch": 2.6725250583550735, "grad_norm": 1.8495402336120605, "learning_rate": 3.626373626373626e-05, "loss": 0.669, "step": 2433 }, { "epoch": 2.6736235067966496, "grad_norm": 0.4963383972644806, "learning_rate": 3.614163614163614e-05, "loss": 0.5412, "step": 2434 }, { "epoch": 2.674721955238226, "grad_norm": 0.5644822716712952, "learning_rate": 3.601953601953602e-05, "loss": 0.5768, "step": 2435 }, { "epoch": 2.6758204036798023, "grad_norm": 0.5272318720817566, "learning_rate": 3.5897435897435896e-05, "loss": 0.5909, "step": 2436 }, { "epoch": 2.6769188521213785, "grad_norm": 0.29838863015174866, "learning_rate": 3.5775335775335774e-05, "loss": 0.5625, "step": 2437 }, { "epoch": 2.6780173005629546, "grad_norm": 0.5375344157218933, "learning_rate": 3.565323565323565e-05, "loss": 0.5932, "step": 2438 }, { "epoch": 2.679115749004531, "grad_norm": 0.7850833535194397, "learning_rate": 3.5531135531135525e-05, "loss": 0.6706, "step": 2439 }, { "epoch": 2.6802141974461073, "grad_norm": 0.5286651253700256, "learning_rate": 3.540903540903541e-05, "loss": 0.6865, "step": 2440 }, { "epoch": 2.681312645887684, "grad_norm": 0.9832364320755005, "learning_rate": 3.528693528693528e-05, "loss": 0.7941, "step": 2441 }, { "epoch": 2.68241109432926, "grad_norm": 0.4431805908679962, "learning_rate": 3.516483516483516e-05, "loss": 0.4706, "step": 2442 }, { "epoch": 2.683509542770836, "grad_norm": 1.7264482975006104, "learning_rate": 3.504273504273504e-05, "loss": 0.6308, "step": 2443 }, { "epoch": 2.6846079912124123, "grad_norm": 0.6196084022521973, "learning_rate": 3.492063492063492e-05, "loss": 1.0233, "step": 2444 }, { "epoch": 2.6857064396539885, "grad_norm": 0.855876088142395, "learning_rate": 3.47985347985348e-05, "loss": 0.5522, "step": 2445 }, { "epoch": 2.686804888095565, "grad_norm": 0.45323798060417175, "learning_rate": 3.4676434676434676e-05, "loss": 0.6232, "step": 2446 }, { "epoch": 2.687903336537141, "grad_norm": 0.577273964881897, "learning_rate": 3.455433455433455e-05, "loss": 0.5051, "step": 2447 }, { "epoch": 2.689001784978718, "grad_norm": 0.4999620020389557, "learning_rate": 3.4432234432234427e-05, "loss": 0.4881, "step": 2448 }, { "epoch": 2.690100233420294, "grad_norm": 0.5028046369552612, "learning_rate": 3.431013431013431e-05, "loss": 0.6575, "step": 2449 }, { "epoch": 2.69119868186187, "grad_norm": 2.122028350830078, "learning_rate": 3.4188034188034184e-05, "loss": 0.7226, "step": 2450 }, { "epoch": 2.692297130303446, "grad_norm": 0.4979703426361084, "learning_rate": 3.406593406593406e-05, "loss": 0.5768, "step": 2451 }, { "epoch": 2.693395578745023, "grad_norm": 0.9270527958869934, "learning_rate": 3.394383394383394e-05, "loss": 0.6464, "step": 2452 }, { "epoch": 2.694494027186599, "grad_norm": 1.0739809274673462, "learning_rate": 3.382173382173382e-05, "loss": 0.753, "step": 2453 }, { "epoch": 2.695592475628175, "grad_norm": 0.6039335131645203, "learning_rate": 3.36996336996337e-05, "loss": 0.7909, "step": 2454 }, { "epoch": 2.6966909240697516, "grad_norm": 0.49040424823760986, "learning_rate": 3.357753357753358e-05, "loss": 0.6112, "step": 2455 }, { "epoch": 2.6977893725113278, "grad_norm": 0.6890440583229065, "learning_rate": 3.345543345543345e-05, "loss": 0.6849, "step": 2456 }, { "epoch": 2.698887820952904, "grad_norm": 0.7819212675094604, "learning_rate": 3.333333333333333e-05, "loss": 0.6797, "step": 2457 }, { "epoch": 2.69998626939448, "grad_norm": 1.0147050619125366, "learning_rate": 3.321123321123321e-05, "loss": 0.6867, "step": 2458 }, { "epoch": 2.7010847178360566, "grad_norm": 1.3562036752700806, "learning_rate": 3.3089133089133086e-05, "loss": 0.7811, "step": 2459 }, { "epoch": 2.7021831662776328, "grad_norm": 0.5813838839530945, "learning_rate": 3.2967032967032964e-05, "loss": 0.5405, "step": 2460 }, { "epoch": 2.7032816147192094, "grad_norm": 0.6152640581130981, "learning_rate": 3.284493284493284e-05, "loss": 0.425, "step": 2461 }, { "epoch": 2.7043800631607855, "grad_norm": 1.1984590291976929, "learning_rate": 3.272283272283272e-05, "loss": 0.592, "step": 2462 }, { "epoch": 2.7054785116023616, "grad_norm": 0.48487693071365356, "learning_rate": 3.26007326007326e-05, "loss": 0.5223, "step": 2463 }, { "epoch": 2.7065769600439378, "grad_norm": 0.47191065549850464, "learning_rate": 3.247863247863247e-05, "loss": 0.6479, "step": 2464 }, { "epoch": 2.7076754084855144, "grad_norm": 1.3167297840118408, "learning_rate": 3.235653235653235e-05, "loss": 0.4552, "step": 2465 }, { "epoch": 2.7087738569270905, "grad_norm": 1.3219714164733887, "learning_rate": 3.2234432234432237e-05, "loss": 0.5839, "step": 2466 }, { "epoch": 2.7098723053686666, "grad_norm": 0.8047394752502441, "learning_rate": 3.211233211233211e-05, "loss": 0.795, "step": 2467 }, { "epoch": 2.710970753810243, "grad_norm": 0.6053475737571716, "learning_rate": 3.199023199023199e-05, "loss": 0.743, "step": 2468 }, { "epoch": 2.7120692022518194, "grad_norm": 0.4619985818862915, "learning_rate": 3.1868131868131866e-05, "loss": 0.642, "step": 2469 }, { "epoch": 2.7131676506933955, "grad_norm": 0.8241426944732666, "learning_rate": 3.1746031746031745e-05, "loss": 0.521, "step": 2470 }, { "epoch": 2.7142660991349716, "grad_norm": 0.4344565272331238, "learning_rate": 3.162393162393162e-05, "loss": 0.4615, "step": 2471 }, { "epoch": 2.715364547576548, "grad_norm": 0.9640605449676514, "learning_rate": 3.15018315018315e-05, "loss": 0.4735, "step": 2472 }, { "epoch": 2.7164629960181244, "grad_norm": 0.49423810839653015, "learning_rate": 3.1379731379731374e-05, "loss": 0.7547, "step": 2473 }, { "epoch": 2.717561444459701, "grad_norm": 0.7234408855438232, "learning_rate": 3.125763125763125e-05, "loss": 0.464, "step": 2474 }, { "epoch": 2.718659892901277, "grad_norm": 0.542647123336792, "learning_rate": 3.113553113553113e-05, "loss": 0.5563, "step": 2475 }, { "epoch": 2.719758341342853, "grad_norm": 0.555722177028656, "learning_rate": 3.101343101343101e-05, "loss": 0.6899, "step": 2476 }, { "epoch": 2.7208567897844294, "grad_norm": 0.6171600222587585, "learning_rate": 3.089133089133089e-05, "loss": 0.6088, "step": 2477 }, { "epoch": 2.7219552382260055, "grad_norm": 0.9118738770484924, "learning_rate": 3.076923076923077e-05, "loss": 0.7778, "step": 2478 }, { "epoch": 2.723053686667582, "grad_norm": 0.6610655784606934, "learning_rate": 3.064713064713064e-05, "loss": 0.6935, "step": 2479 }, { "epoch": 2.724152135109158, "grad_norm": 0.6729289889335632, "learning_rate": 3.0525030525030525e-05, "loss": 0.792, "step": 2480 }, { "epoch": 2.725250583550735, "grad_norm": 0.4955647587776184, "learning_rate": 3.04029304029304e-05, "loss": 0.6746, "step": 2481 }, { "epoch": 2.726349031992311, "grad_norm": 0.42975953221321106, "learning_rate": 3.028083028083028e-05, "loss": 0.5318, "step": 2482 }, { "epoch": 2.727447480433887, "grad_norm": 0.3555055856704712, "learning_rate": 3.0158730158730154e-05, "loss": 0.6377, "step": 2483 }, { "epoch": 2.728545928875463, "grad_norm": 3.138209342956543, "learning_rate": 3.0036630036630036e-05, "loss": 0.6296, "step": 2484 }, { "epoch": 2.72964437731704, "grad_norm": 0.5710242390632629, "learning_rate": 2.9914529914529912e-05, "loss": 0.8987, "step": 2485 }, { "epoch": 2.730742825758616, "grad_norm": 0.5200769305229187, "learning_rate": 2.979242979242979e-05, "loss": 0.5154, "step": 2486 }, { "epoch": 2.731841274200192, "grad_norm": 0.797572910785675, "learning_rate": 2.9670329670329666e-05, "loss": 0.8039, "step": 2487 }, { "epoch": 2.7329397226417687, "grad_norm": 0.4667447805404663, "learning_rate": 2.9548229548229548e-05, "loss": 0.586, "step": 2488 }, { "epoch": 2.734038171083345, "grad_norm": 0.5500869154930115, "learning_rate": 2.9426129426129423e-05, "loss": 0.7007, "step": 2489 }, { "epoch": 2.735136619524921, "grad_norm": 0.5311625003814697, "learning_rate": 2.9304029304029302e-05, "loss": 0.4257, "step": 2490 }, { "epoch": 2.736235067966497, "grad_norm": 0.6474941968917847, "learning_rate": 2.9181929181929177e-05, "loss": 0.4747, "step": 2491 }, { "epoch": 2.7373335164080737, "grad_norm": 1.1186646223068237, "learning_rate": 2.9059829059829056e-05, "loss": 0.8177, "step": 2492 }, { "epoch": 2.73843196484965, "grad_norm": 2.455371379852295, "learning_rate": 2.8937728937728938e-05, "loss": 0.6535, "step": 2493 }, { "epoch": 2.7395304132912264, "grad_norm": 0.5033484101295471, "learning_rate": 2.8815628815628813e-05, "loss": 0.525, "step": 2494 }, { "epoch": 2.7406288617328025, "grad_norm": 0.5826357007026672, "learning_rate": 2.869352869352869e-05, "loss": 0.476, "step": 2495 }, { "epoch": 2.7417273101743787, "grad_norm": 0.5875104665756226, "learning_rate": 2.8571428571428567e-05, "loss": 0.6903, "step": 2496 }, { "epoch": 2.742825758615955, "grad_norm": 0.6006028056144714, "learning_rate": 2.844932844932845e-05, "loss": 0.8522, "step": 2497 }, { "epoch": 2.7439242070575314, "grad_norm": 0.5605003833770752, "learning_rate": 2.8327228327228325e-05, "loss": 0.5312, "step": 2498 }, { "epoch": 2.7450226554991075, "grad_norm": 0.7641153931617737, "learning_rate": 2.8205128205128204e-05, "loss": 0.6841, "step": 2499 }, { "epoch": 2.7461211039406836, "grad_norm": 0.5523414015769958, "learning_rate": 2.808302808302808e-05, "loss": 0.6582, "step": 2500 }, { "epoch": 2.7472195523822602, "grad_norm": 0.40714672207832336, "learning_rate": 2.796092796092796e-05, "loss": 0.7493, "step": 2501 }, { "epoch": 2.7483180008238364, "grad_norm": 0.6960926651954651, "learning_rate": 2.7838827838827836e-05, "loss": 0.7104, "step": 2502 }, { "epoch": 2.7494164492654125, "grad_norm": 0.42409783601760864, "learning_rate": 2.7716727716727715e-05, "loss": 0.5643, "step": 2503 }, { "epoch": 2.7505148977069886, "grad_norm": 0.5174455046653748, "learning_rate": 2.759462759462759e-05, "loss": 0.4545, "step": 2504 }, { "epoch": 2.7516133461485652, "grad_norm": 0.6353528499603271, "learning_rate": 2.747252747252747e-05, "loss": 0.5068, "step": 2505 }, { "epoch": 2.7527117945901414, "grad_norm": 0.46814125776290894, "learning_rate": 2.7350427350427348e-05, "loss": 0.7979, "step": 2506 }, { "epoch": 2.753810243031718, "grad_norm": 0.7229417562484741, "learning_rate": 2.7228327228327227e-05, "loss": 0.6212, "step": 2507 }, { "epoch": 2.754908691473294, "grad_norm": 1.2155603170394897, "learning_rate": 2.7106227106227102e-05, "loss": 0.8444, "step": 2508 }, { "epoch": 2.7560071399148702, "grad_norm": 0.462703138589859, "learning_rate": 2.698412698412698e-05, "loss": 0.8263, "step": 2509 }, { "epoch": 2.7571055883564464, "grad_norm": 0.9474642872810364, "learning_rate": 2.6862026862026863e-05, "loss": 0.7586, "step": 2510 }, { "epoch": 2.758204036798023, "grad_norm": 4.502622127532959, "learning_rate": 2.6739926739926738e-05, "loss": 0.5806, "step": 2511 }, { "epoch": 2.759302485239599, "grad_norm": 1.1251213550567627, "learning_rate": 2.6617826617826617e-05, "loss": 0.6333, "step": 2512 }, { "epoch": 2.7604009336811752, "grad_norm": 0.7035579681396484, "learning_rate": 2.6495726495726492e-05, "loss": 0.4739, "step": 2513 }, { "epoch": 2.761499382122752, "grad_norm": 0.5279493927955627, "learning_rate": 2.6373626373626374e-05, "loss": 0.597, "step": 2514 }, { "epoch": 2.762597830564328, "grad_norm": 0.5512554049491882, "learning_rate": 2.625152625152625e-05, "loss": 0.6471, "step": 2515 }, { "epoch": 2.763696279005904, "grad_norm": 0.857778012752533, "learning_rate": 2.6129426129426128e-05, "loss": 0.6172, "step": 2516 }, { "epoch": 2.7647947274474802, "grad_norm": 0.5348466634750366, "learning_rate": 2.6007326007326004e-05, "loss": 0.8074, "step": 2517 }, { "epoch": 2.765893175889057, "grad_norm": 0.5413629412651062, "learning_rate": 2.5885225885225882e-05, "loss": 0.3879, "step": 2518 }, { "epoch": 2.766991624330633, "grad_norm": 0.569411039352417, "learning_rate": 2.576312576312576e-05, "loss": 0.4392, "step": 2519 }, { "epoch": 2.7680900727722095, "grad_norm": 0.5127429962158203, "learning_rate": 2.564102564102564e-05, "loss": 0.6566, "step": 2520 }, { "epoch": 2.7691885212137857, "grad_norm": 0.7328614592552185, "learning_rate": 2.5518925518925515e-05, "loss": 0.6801, "step": 2521 }, { "epoch": 2.770286969655362, "grad_norm": 0.615686297416687, "learning_rate": 2.5396825396825394e-05, "loss": 0.6366, "step": 2522 }, { "epoch": 2.771385418096938, "grad_norm": 0.5250161290168762, "learning_rate": 2.5274725274725276e-05, "loss": 0.5737, "step": 2523 }, { "epoch": 2.772483866538514, "grad_norm": 0.6708832383155823, "learning_rate": 2.515262515262515e-05, "loss": 0.6681, "step": 2524 }, { "epoch": 2.7735823149800907, "grad_norm": 0.6120278835296631, "learning_rate": 2.503052503052503e-05, "loss": 0.4964, "step": 2525 }, { "epoch": 2.774680763421667, "grad_norm": 0.7024976015090942, "learning_rate": 2.4908424908424905e-05, "loss": 0.7984, "step": 2526 }, { "epoch": 2.7757792118632434, "grad_norm": 7.281716823577881, "learning_rate": 2.478632478632478e-05, "loss": 0.7191, "step": 2527 }, { "epoch": 2.7768776603048195, "grad_norm": 0.7347024083137512, "learning_rate": 2.4664224664224663e-05, "loss": 0.8684, "step": 2528 }, { "epoch": 2.7779761087463957, "grad_norm": 1.1338274478912354, "learning_rate": 2.454212454212454e-05, "loss": 0.5936, "step": 2529 }, { "epoch": 2.779074557187972, "grad_norm": 0.4176536202430725, "learning_rate": 2.4420024420024417e-05, "loss": 0.445, "step": 2530 }, { "epoch": 2.7801730056295484, "grad_norm": 0.9390072822570801, "learning_rate": 2.4297924297924295e-05, "loss": 0.5821, "step": 2531 }, { "epoch": 2.7812714540711245, "grad_norm": 1.1045840978622437, "learning_rate": 2.4175824175824174e-05, "loss": 0.7372, "step": 2532 }, { "epoch": 2.7823699025127007, "grad_norm": 0.5568689703941345, "learning_rate": 2.4053724053724053e-05, "loss": 0.5005, "step": 2533 }, { "epoch": 2.7834683509542772, "grad_norm": 0.2747582793235779, "learning_rate": 2.3931623931623928e-05, "loss": 0.5778, "step": 2534 }, { "epoch": 2.7845667993958534, "grad_norm": 1.4027804136276245, "learning_rate": 2.3809523809523807e-05, "loss": 0.5368, "step": 2535 }, { "epoch": 2.7856652478374295, "grad_norm": 0.7523220777511597, "learning_rate": 2.368742368742369e-05, "loss": 0.58, "step": 2536 }, { "epoch": 2.7867636962790057, "grad_norm": 0.33777353167533875, "learning_rate": 2.3565323565323564e-05, "loss": 0.5269, "step": 2537 }, { "epoch": 2.7878621447205822, "grad_norm": 0.5818787217140198, "learning_rate": 2.344322344322344e-05, "loss": 0.4459, "step": 2538 }, { "epoch": 2.7889605931621584, "grad_norm": 0.36858034133911133, "learning_rate": 2.3321123321123318e-05, "loss": 0.712, "step": 2539 }, { "epoch": 2.790059041603735, "grad_norm": 0.5299241542816162, "learning_rate": 2.3199023199023194e-05, "loss": 0.6086, "step": 2540 }, { "epoch": 2.791157490045311, "grad_norm": 2.432325601577759, "learning_rate": 2.3076923076923076e-05, "loss": 1.0386, "step": 2541 }, { "epoch": 2.7922559384868872, "grad_norm": 0.746638834476471, "learning_rate": 2.2954822954822954e-05, "loss": 0.7372, "step": 2542 }, { "epoch": 2.7933543869284634, "grad_norm": 0.6017647981643677, "learning_rate": 2.283272283272283e-05, "loss": 0.9134, "step": 2543 }, { "epoch": 2.79445283537004, "grad_norm": 0.7385385036468506, "learning_rate": 2.271062271062271e-05, "loss": 0.6827, "step": 2544 }, { "epoch": 2.795551283811616, "grad_norm": 0.6607246994972229, "learning_rate": 2.2588522588522587e-05, "loss": 0.6333, "step": 2545 }, { "epoch": 2.7966497322531922, "grad_norm": 0.40185117721557617, "learning_rate": 2.2466422466422466e-05, "loss": 0.6589, "step": 2546 }, { "epoch": 2.797748180694769, "grad_norm": 0.48225662112236023, "learning_rate": 2.234432234432234e-05, "loss": 0.6571, "step": 2547 }, { "epoch": 2.798846629136345, "grad_norm": 0.8996065855026245, "learning_rate": 2.222222222222222e-05, "loss": 0.7518, "step": 2548 }, { "epoch": 2.799945077577921, "grad_norm": 0.7139112949371338, "learning_rate": 2.21001221001221e-05, "loss": 0.6517, "step": 2549 }, { "epoch": 2.8010435260194972, "grad_norm": 0.5433416366577148, "learning_rate": 2.1978021978021977e-05, "loss": 0.3799, "step": 2550 }, { "epoch": 2.802141974461074, "grad_norm": 0.3883088231086731, "learning_rate": 2.1855921855921853e-05, "loss": 0.9269, "step": 2551 }, { "epoch": 2.80324042290265, "grad_norm": 0.5275357961654663, "learning_rate": 2.173382173382173e-05, "loss": 0.6606, "step": 2552 }, { "epoch": 2.8043388713442265, "grad_norm": 0.4666341543197632, "learning_rate": 2.1611721611721607e-05, "loss": 0.6982, "step": 2553 }, { "epoch": 2.8054373197858027, "grad_norm": 0.9221529364585876, "learning_rate": 2.148962148962149e-05, "loss": 0.4769, "step": 2554 }, { "epoch": 2.806535768227379, "grad_norm": 0.7469640374183655, "learning_rate": 2.1367521367521368e-05, "loss": 0.6985, "step": 2555 }, { "epoch": 2.807634216668955, "grad_norm": 0.6858775615692139, "learning_rate": 2.1245421245421243e-05, "loss": 0.4511, "step": 2556 }, { "epoch": 2.808732665110531, "grad_norm": 1.266801357269287, "learning_rate": 2.112332112332112e-05, "loss": 0.421, "step": 2557 }, { "epoch": 2.8098311135521077, "grad_norm": 0.5506262183189392, "learning_rate": 2.1001221001221e-05, "loss": 0.6082, "step": 2558 }, { "epoch": 2.810929561993684, "grad_norm": 0.5359029173851013, "learning_rate": 2.087912087912088e-05, "loss": 0.8111, "step": 2559 }, { "epoch": 2.8120280104352604, "grad_norm": 0.6969206929206848, "learning_rate": 2.0757020757020754e-05, "loss": 0.8331, "step": 2560 }, { "epoch": 2.8131264588768365, "grad_norm": 0.6040379405021667, "learning_rate": 2.0634920634920633e-05, "loss": 0.575, "step": 2561 }, { "epoch": 2.8142249073184127, "grad_norm": 1.3847273588180542, "learning_rate": 2.0512820512820512e-05, "loss": 0.5442, "step": 2562 }, { "epoch": 2.815323355759989, "grad_norm": 0.8050490617752075, "learning_rate": 2.039072039072039e-05, "loss": 0.6267, "step": 2563 }, { "epoch": 2.8164218042015654, "grad_norm": 0.5663136839866638, "learning_rate": 2.0268620268620266e-05, "loss": 0.5246, "step": 2564 }, { "epoch": 2.8175202526431415, "grad_norm": 0.3316130042076111, "learning_rate": 2.0146520146520144e-05, "loss": 0.5175, "step": 2565 }, { "epoch": 2.8186187010847177, "grad_norm": 0.4782855808734894, "learning_rate": 2.002442002442002e-05, "loss": 0.5111, "step": 2566 }, { "epoch": 2.8197171495262943, "grad_norm": 0.44766396284103394, "learning_rate": 1.9902319902319902e-05, "loss": 0.5825, "step": 2567 }, { "epoch": 2.8208155979678704, "grad_norm": 0.6830618977546692, "learning_rate": 1.978021978021978e-05, "loss": 0.5685, "step": 2568 }, { "epoch": 2.8219140464094465, "grad_norm": 0.5860748887062073, "learning_rate": 1.9658119658119656e-05, "loss": 0.7557, "step": 2569 }, { "epoch": 2.8230124948510227, "grad_norm": 0.49533459544181824, "learning_rate": 1.953601953601953e-05, "loss": 0.7326, "step": 2570 }, { "epoch": 2.8241109432925993, "grad_norm": 0.4989941418170929, "learning_rate": 1.9413919413919413e-05, "loss": 0.5757, "step": 2571 }, { "epoch": 2.8252093917341754, "grad_norm": 0.4973461627960205, "learning_rate": 1.9291819291819292e-05, "loss": 0.5357, "step": 2572 }, { "epoch": 2.826307840175752, "grad_norm": 0.7442370057106018, "learning_rate": 1.9169719169719167e-05, "loss": 0.7283, "step": 2573 }, { "epoch": 2.827406288617328, "grad_norm": 1.3321865797042847, "learning_rate": 1.9047619047619046e-05, "loss": 0.5107, "step": 2574 }, { "epoch": 2.8285047370589043, "grad_norm": 0.47394871711730957, "learning_rate": 1.892551892551892e-05, "loss": 0.5495, "step": 2575 }, { "epoch": 2.8296031855004804, "grad_norm": 0.6102151274681091, "learning_rate": 1.8803418803418804e-05, "loss": 0.5983, "step": 2576 }, { "epoch": 2.830701633942057, "grad_norm": 0.4657471179962158, "learning_rate": 1.868131868131868e-05, "loss": 0.5937, "step": 2577 }, { "epoch": 2.831800082383633, "grad_norm": 0.41180238127708435, "learning_rate": 1.8559218559218558e-05, "loss": 0.7775, "step": 2578 }, { "epoch": 2.8328985308252093, "grad_norm": 3.5043845176696777, "learning_rate": 1.8437118437118436e-05, "loss": 0.5304, "step": 2579 }, { "epoch": 2.833996979266786, "grad_norm": 0.4502231776714325, "learning_rate": 1.831501831501831e-05, "loss": 0.6556, "step": 2580 }, { "epoch": 2.835095427708362, "grad_norm": 0.6165898442268372, "learning_rate": 1.819291819291819e-05, "loss": 0.8434, "step": 2581 }, { "epoch": 2.836193876149938, "grad_norm": 0.5112649202346802, "learning_rate": 1.807081807081807e-05, "loss": 0.7429, "step": 2582 }, { "epoch": 2.8372923245915143, "grad_norm": 0.4834790527820587, "learning_rate": 1.7948717948717948e-05, "loss": 0.5772, "step": 2583 }, { "epoch": 2.838390773033091, "grad_norm": 0.4251219630241394, "learning_rate": 1.7826617826617826e-05, "loss": 0.5192, "step": 2584 }, { "epoch": 2.839489221474667, "grad_norm": 0.7645363807678223, "learning_rate": 1.7704517704517705e-05, "loss": 0.6624, "step": 2585 }, { "epoch": 2.8405876699162436, "grad_norm": 0.5651314854621887, "learning_rate": 1.758241758241758e-05, "loss": 0.5829, "step": 2586 }, { "epoch": 2.8416861183578197, "grad_norm": 1.059164047241211, "learning_rate": 1.746031746031746e-05, "loss": 0.6688, "step": 2587 }, { "epoch": 2.842784566799396, "grad_norm": 2.2424001693725586, "learning_rate": 1.7338217338217338e-05, "loss": 0.4515, "step": 2588 }, { "epoch": 2.843883015240972, "grad_norm": 0.6211466789245605, "learning_rate": 1.7216117216117213e-05, "loss": 0.836, "step": 2589 }, { "epoch": 2.8449814636825486, "grad_norm": 0.4224345088005066, "learning_rate": 1.7094017094017092e-05, "loss": 0.536, "step": 2590 }, { "epoch": 2.8460799121241247, "grad_norm": 0.7985780239105225, "learning_rate": 1.697191697191697e-05, "loss": 0.7433, "step": 2591 }, { "epoch": 2.847178360565701, "grad_norm": 1.4033039808273315, "learning_rate": 1.684981684981685e-05, "loss": 0.7479, "step": 2592 }, { "epoch": 2.8482768090072774, "grad_norm": 1.1432255506515503, "learning_rate": 1.6727716727716725e-05, "loss": 0.652, "step": 2593 }, { "epoch": 2.8493752574488536, "grad_norm": 0.9324535727500916, "learning_rate": 1.6605616605616603e-05, "loss": 0.5225, "step": 2594 }, { "epoch": 2.8504737058904297, "grad_norm": 0.5573447942733765, "learning_rate": 1.6483516483516482e-05, "loss": 0.6649, "step": 2595 }, { "epoch": 2.851572154332006, "grad_norm": 0.6875207424163818, "learning_rate": 1.636141636141636e-05, "loss": 0.7334, "step": 2596 }, { "epoch": 2.8526706027735824, "grad_norm": 0.32099124789237976, "learning_rate": 1.6239316239316236e-05, "loss": 0.5732, "step": 2597 }, { "epoch": 2.8537690512151586, "grad_norm": 0.4142940938472748, "learning_rate": 1.6117216117216118e-05, "loss": 0.6605, "step": 2598 }, { "epoch": 2.8548674996567347, "grad_norm": 0.5377205610275269, "learning_rate": 1.5995115995115994e-05, "loss": 0.5556, "step": 2599 }, { "epoch": 2.8559659480983113, "grad_norm": 0.43509960174560547, "learning_rate": 1.5873015873015872e-05, "loss": 0.8321, "step": 2600 }, { "epoch": 2.8570643965398874, "grad_norm": 0.4376494586467743, "learning_rate": 1.575091575091575e-05, "loss": 0.6392, "step": 2601 }, { "epoch": 2.8581628449814636, "grad_norm": 0.507837176322937, "learning_rate": 1.5628815628815626e-05, "loss": 0.5326, "step": 2602 }, { "epoch": 2.8592612934230397, "grad_norm": 29.0502986907959, "learning_rate": 1.5506715506715505e-05, "loss": 0.5478, "step": 2603 }, { "epoch": 2.8603597418646163, "grad_norm": 0.6940420866012573, "learning_rate": 1.5384615384615384e-05, "loss": 1.3063, "step": 2604 }, { "epoch": 2.8614581903061924, "grad_norm": 0.7178813219070435, "learning_rate": 1.5262515262515263e-05, "loss": 0.7447, "step": 2605 }, { "epoch": 2.862556638747769, "grad_norm": 0.6209506392478943, "learning_rate": 1.514041514041514e-05, "loss": 0.5496, "step": 2606 }, { "epoch": 2.863655087189345, "grad_norm": 0.5526819825172424, "learning_rate": 1.5018315018315018e-05, "loss": 0.4224, "step": 2607 }, { "epoch": 2.8647535356309213, "grad_norm": 0.5056405663490295, "learning_rate": 1.4896214896214895e-05, "loss": 0.6248, "step": 2608 }, { "epoch": 2.8658519840724974, "grad_norm": 2.416952610015869, "learning_rate": 1.4774114774114774e-05, "loss": 0.7551, "step": 2609 }, { "epoch": 2.866950432514074, "grad_norm": 0.52223140001297, "learning_rate": 1.4652014652014651e-05, "loss": 1.1146, "step": 2610 }, { "epoch": 2.86804888095565, "grad_norm": 0.685767650604248, "learning_rate": 1.4529914529914528e-05, "loss": 0.715, "step": 2611 }, { "epoch": 2.8691473293972263, "grad_norm": 0.650374174118042, "learning_rate": 1.4407814407814407e-05, "loss": 0.8844, "step": 2612 }, { "epoch": 2.870245777838803, "grad_norm": 0.46946465969085693, "learning_rate": 1.4285714285714284e-05, "loss": 0.9545, "step": 2613 }, { "epoch": 2.871344226280379, "grad_norm": 0.5312052369117737, "learning_rate": 1.4163614163614162e-05, "loss": 0.5204, "step": 2614 }, { "epoch": 2.872442674721955, "grad_norm": 0.41921889781951904, "learning_rate": 1.404151404151404e-05, "loss": 0.4614, "step": 2615 }, { "epoch": 2.8735411231635313, "grad_norm": 0.513203501701355, "learning_rate": 1.3919413919413918e-05, "loss": 0.613, "step": 2616 }, { "epoch": 2.874639571605108, "grad_norm": 1.1020901203155518, "learning_rate": 1.3797313797313795e-05, "loss": 0.525, "step": 2617 }, { "epoch": 2.875738020046684, "grad_norm": 0.39301392436027527, "learning_rate": 1.3675213675213674e-05, "loss": 0.5799, "step": 2618 }, { "epoch": 2.8768364684882606, "grad_norm": 1.576910376548767, "learning_rate": 1.3553113553113551e-05, "loss": 0.6286, "step": 2619 }, { "epoch": 2.8779349169298367, "grad_norm": 0.36711424589157104, "learning_rate": 1.3431013431013431e-05, "loss": 0.7542, "step": 2620 }, { "epoch": 2.879033365371413, "grad_norm": 1.2777636051177979, "learning_rate": 1.3308913308913308e-05, "loss": 0.6269, "step": 2621 }, { "epoch": 2.880131813812989, "grad_norm": 0.5584180355072021, "learning_rate": 1.3186813186813187e-05, "loss": 0.5633, "step": 2622 }, { "epoch": 2.8812302622545656, "grad_norm": 1.2418673038482666, "learning_rate": 1.3064713064713064e-05, "loss": 0.537, "step": 2623 }, { "epoch": 2.8823287106961417, "grad_norm": 0.5850531458854675, "learning_rate": 1.2942612942612941e-05, "loss": 0.595, "step": 2624 }, { "epoch": 2.883427159137718, "grad_norm": 1.054592251777649, "learning_rate": 1.282051282051282e-05, "loss": 0.8308, "step": 2625 }, { "epoch": 2.8845256075792944, "grad_norm": 0.3231412470340729, "learning_rate": 1.2698412698412697e-05, "loss": 0.4044, "step": 2626 }, { "epoch": 2.8856240560208706, "grad_norm": 0.47942933440208435, "learning_rate": 1.2576312576312576e-05, "loss": 0.6299, "step": 2627 }, { "epoch": 2.8867225044624467, "grad_norm": 0.4884187579154968, "learning_rate": 1.2454212454212453e-05, "loss": 0.6606, "step": 2628 }, { "epoch": 2.887820952904023, "grad_norm": 0.6658734083175659, "learning_rate": 1.2332112332112331e-05, "loss": 0.642, "step": 2629 }, { "epoch": 2.8889194013455994, "grad_norm": 0.24990247189998627, "learning_rate": 1.2210012210012208e-05, "loss": 0.4041, "step": 2630 }, { "epoch": 2.8900178497871756, "grad_norm": 0.6446508169174194, "learning_rate": 1.2087912087912087e-05, "loss": 0.7126, "step": 2631 }, { "epoch": 2.891116298228752, "grad_norm": 0.7800988554954529, "learning_rate": 1.1965811965811964e-05, "loss": 0.6733, "step": 2632 }, { "epoch": 2.8922147466703283, "grad_norm": 0.5319482684135437, "learning_rate": 1.1843711843711844e-05, "loss": 0.6445, "step": 2633 }, { "epoch": 2.8933131951119044, "grad_norm": 0.6029678583145142, "learning_rate": 1.172161172161172e-05, "loss": 0.7642, "step": 2634 }, { "epoch": 2.8944116435534806, "grad_norm": 0.9029693007469177, "learning_rate": 1.1599511599511597e-05, "loss": 0.635, "step": 2635 }, { "epoch": 2.8955100919950567, "grad_norm": 0.6022691130638123, "learning_rate": 1.1477411477411477e-05, "loss": 0.5361, "step": 2636 }, { "epoch": 2.8966085404366333, "grad_norm": 0.6777801513671875, "learning_rate": 1.1355311355311354e-05, "loss": 0.5099, "step": 2637 }, { "epoch": 2.8977069888782094, "grad_norm": 0.4157528877258301, "learning_rate": 1.1233211233211233e-05, "loss": 0.5038, "step": 2638 }, { "epoch": 2.898805437319786, "grad_norm": 2.6101133823394775, "learning_rate": 1.111111111111111e-05, "loss": 0.6324, "step": 2639 }, { "epoch": 2.899903885761362, "grad_norm": 0.6885612607002258, "learning_rate": 1.0989010989010989e-05, "loss": 0.4931, "step": 2640 }, { "epoch": 2.9010023342029383, "grad_norm": 0.5510079264640808, "learning_rate": 1.0866910866910866e-05, "loss": 0.5088, "step": 2641 }, { "epoch": 2.9021007826445144, "grad_norm": 0.6099854111671448, "learning_rate": 1.0744810744810744e-05, "loss": 0.4647, "step": 2642 }, { "epoch": 2.903199231086091, "grad_norm": 0.4390881657600403, "learning_rate": 1.0622710622710621e-05, "loss": 0.6787, "step": 2643 }, { "epoch": 2.904297679527667, "grad_norm": 0.46238628029823303, "learning_rate": 1.05006105006105e-05, "loss": 0.5655, "step": 2644 }, { "epoch": 2.9053961279692433, "grad_norm": 0.479106605052948, "learning_rate": 1.0378510378510377e-05, "loss": 0.7833, "step": 2645 }, { "epoch": 2.90649457641082, "grad_norm": 0.4643683135509491, "learning_rate": 1.0256410256410256e-05, "loss": 0.4563, "step": 2646 }, { "epoch": 2.907593024852396, "grad_norm": 0.4173976480960846, "learning_rate": 1.0134310134310133e-05, "loss": 0.6614, "step": 2647 }, { "epoch": 2.908691473293972, "grad_norm": 0.7158990502357483, "learning_rate": 1.001221001221001e-05, "loss": 0.7342, "step": 2648 }, { "epoch": 2.9097899217355483, "grad_norm": 0.7276301980018616, "learning_rate": 9.89010989010989e-06, "loss": 0.6883, "step": 2649 }, { "epoch": 2.910888370177125, "grad_norm": 0.63588947057724, "learning_rate": 9.768009768009766e-06, "loss": 0.7533, "step": 2650 }, { "epoch": 2.911986818618701, "grad_norm": 1.8038127422332764, "learning_rate": 9.645909645909646e-06, "loss": 0.6238, "step": 2651 }, { "epoch": 2.9130852670602776, "grad_norm": 0.7289617657661438, "learning_rate": 9.523809523809523e-06, "loss": 0.4767, "step": 2652 }, { "epoch": 2.9141837155018537, "grad_norm": 0.3828502893447876, "learning_rate": 9.401709401709402e-06, "loss": 0.4812, "step": 2653 }, { "epoch": 2.91528216394343, "grad_norm": 0.5157826542854309, "learning_rate": 9.279609279609279e-06, "loss": 0.703, "step": 2654 }, { "epoch": 2.916380612385006, "grad_norm": 0.6833345890045166, "learning_rate": 9.157509157509156e-06, "loss": 0.7471, "step": 2655 }, { "epoch": 2.9174790608265826, "grad_norm": 1.0189886093139648, "learning_rate": 9.035409035409035e-06, "loss": 0.6065, "step": 2656 }, { "epoch": 2.9185775092681587, "grad_norm": 0.5197221040725708, "learning_rate": 8.913308913308913e-06, "loss": 0.5904, "step": 2657 }, { "epoch": 2.919675957709735, "grad_norm": 0.6265780925750732, "learning_rate": 8.79120879120879e-06, "loss": 0.5622, "step": 2658 }, { "epoch": 2.9207744061513115, "grad_norm": 0.5703533887863159, "learning_rate": 8.669108669108669e-06, "loss": 0.8005, "step": 2659 }, { "epoch": 2.9218728545928876, "grad_norm": 0.8656613230705261, "learning_rate": 8.547008547008546e-06, "loss": 0.4942, "step": 2660 }, { "epoch": 2.9229713030344637, "grad_norm": 0.6180423498153687, "learning_rate": 8.424908424908425e-06, "loss": 0.8163, "step": 2661 }, { "epoch": 2.92406975147604, "grad_norm": 0.7308143377304077, "learning_rate": 8.302808302808302e-06, "loss": 0.7639, "step": 2662 }, { "epoch": 2.9251681999176165, "grad_norm": 0.585617184638977, "learning_rate": 8.18070818070818e-06, "loss": 0.7614, "step": 2663 }, { "epoch": 2.9262666483591926, "grad_norm": 0.5277345776557922, "learning_rate": 8.058608058608059e-06, "loss": 0.6489, "step": 2664 }, { "epoch": 2.927365096800769, "grad_norm": 0.3540293574333191, "learning_rate": 7.936507936507936e-06, "loss": 0.4503, "step": 2665 }, { "epoch": 2.9284635452423453, "grad_norm": 0.554492175579071, "learning_rate": 7.814407814407813e-06, "loss": 0.5785, "step": 2666 }, { "epoch": 2.9295619936839215, "grad_norm": 0.5547875761985779, "learning_rate": 7.692307692307692e-06, "loss": 0.5763, "step": 2667 }, { "epoch": 2.9306604421254976, "grad_norm": 0.745947003364563, "learning_rate": 7.57020757020757e-06, "loss": 0.512, "step": 2668 }, { "epoch": 2.931758890567074, "grad_norm": 0.47691571712493896, "learning_rate": 7.448107448107448e-06, "loss": 0.7018, "step": 2669 }, { "epoch": 2.9328573390086503, "grad_norm": 0.9611607789993286, "learning_rate": 7.3260073260073255e-06, "loss": 0.7419, "step": 2670 }, { "epoch": 2.9339557874502264, "grad_norm": 0.5495268106460571, "learning_rate": 7.203907203907203e-06, "loss": 0.6096, "step": 2671 }, { "epoch": 2.935054235891803, "grad_norm": 0.8863226771354675, "learning_rate": 7.081807081807081e-06, "loss": 0.7149, "step": 2672 }, { "epoch": 2.936152684333379, "grad_norm": 0.4234665334224701, "learning_rate": 6.959706959706959e-06, "loss": 0.6913, "step": 2673 }, { "epoch": 2.9372511327749553, "grad_norm": 0.9667326211929321, "learning_rate": 6.837606837606837e-06, "loss": 0.4181, "step": 2674 }, { "epoch": 2.9383495812165314, "grad_norm": 0.543683648109436, "learning_rate": 6.715506715506716e-06, "loss": 0.6329, "step": 2675 }, { "epoch": 2.939448029658108, "grad_norm": 0.5083779692649841, "learning_rate": 6.5934065934065935e-06, "loss": 0.8742, "step": 2676 }, { "epoch": 2.940546478099684, "grad_norm": 0.7212001085281372, "learning_rate": 6.4713064713064706e-06, "loss": 0.6912, "step": 2677 }, { "epoch": 2.9416449265412603, "grad_norm": 0.9474835991859436, "learning_rate": 6.349206349206348e-06, "loss": 0.649, "step": 2678 }, { "epoch": 2.942743374982837, "grad_norm": 0.8142021298408508, "learning_rate": 6.227106227106226e-06, "loss": 0.6136, "step": 2679 }, { "epoch": 2.943841823424413, "grad_norm": 2.9018187522888184, "learning_rate": 6.105006105006104e-06, "loss": 0.7157, "step": 2680 }, { "epoch": 2.944940271865989, "grad_norm": 0.4023605287075043, "learning_rate": 5.982905982905982e-06, "loss": 0.5675, "step": 2681 }, { "epoch": 2.9460387203075653, "grad_norm": 0.3693840801715851, "learning_rate": 5.86080586080586e-06, "loss": 0.5982, "step": 2682 }, { "epoch": 2.947137168749142, "grad_norm": 0.4298234283924103, "learning_rate": 5.738705738705739e-06, "loss": 0.5379, "step": 2683 }, { "epoch": 2.948235617190718, "grad_norm": 0.6495395302772522, "learning_rate": 5.6166056166056165e-06, "loss": 0.5411, "step": 2684 }, { "epoch": 2.9493340656322946, "grad_norm": 0.44857510924339294, "learning_rate": 5.494505494505494e-06, "loss": 0.5154, "step": 2685 }, { "epoch": 2.9504325140738707, "grad_norm": 0.7485830187797546, "learning_rate": 5.372405372405372e-06, "loss": 0.6595, "step": 2686 }, { "epoch": 2.951530962515447, "grad_norm": 0.5141469836235046, "learning_rate": 5.25030525030525e-06, "loss": 0.6289, "step": 2687 }, { "epoch": 2.952629410957023, "grad_norm": 0.8847435712814331, "learning_rate": 5.128205128205128e-06, "loss": 0.6734, "step": 2688 }, { "epoch": 2.9537278593985996, "grad_norm": 0.570573091506958, "learning_rate": 5.006105006105005e-06, "loss": 0.7013, "step": 2689 }, { "epoch": 2.9548263078401757, "grad_norm": 0.4376991391181946, "learning_rate": 4.884004884004883e-06, "loss": 0.5918, "step": 2690 }, { "epoch": 2.955924756281752, "grad_norm": 0.5480318069458008, "learning_rate": 4.7619047619047615e-06, "loss": 0.6227, "step": 2691 }, { "epoch": 2.9570232047233285, "grad_norm": 0.5831297636032104, "learning_rate": 4.639804639804639e-06, "loss": 0.6264, "step": 2692 }, { "epoch": 2.9581216531649046, "grad_norm": 1.5778921842575073, "learning_rate": 4.517704517704517e-06, "loss": 0.6352, "step": 2693 }, { "epoch": 2.9592201016064807, "grad_norm": 0.9567496180534363, "learning_rate": 4.395604395604395e-06, "loss": 0.6067, "step": 2694 }, { "epoch": 2.960318550048057, "grad_norm": 0.5237869620323181, "learning_rate": 4.273504273504273e-06, "loss": 0.8241, "step": 2695 }, { "epoch": 2.9614169984896335, "grad_norm": 0.3452164828777313, "learning_rate": 4.151404151404151e-06, "loss": 0.5718, "step": 2696 }, { "epoch": 2.9625154469312096, "grad_norm": 0.42237767577171326, "learning_rate": 4.0293040293040296e-06, "loss": 0.5199, "step": 2697 }, { "epoch": 2.963613895372786, "grad_norm": 0.7035055756568909, "learning_rate": 3.907203907203907e-06, "loss": 0.7078, "step": 2698 }, { "epoch": 2.9647123438143623, "grad_norm": 0.39236482977867126, "learning_rate": 3.785103785103785e-06, "loss": 0.59, "step": 2699 }, { "epoch": 2.9658107922559385, "grad_norm": 1.1658680438995361, "learning_rate": 3.6630036630036627e-06, "loss": 0.53, "step": 2700 }, { "epoch": 2.9669092406975146, "grad_norm": 0.6797634363174438, "learning_rate": 3.5409035409035406e-06, "loss": 0.6763, "step": 2701 }, { "epoch": 2.968007689139091, "grad_norm": 1.0421425104141235, "learning_rate": 3.4188034188034185e-06, "loss": 0.4, "step": 2702 }, { "epoch": 2.9691061375806673, "grad_norm": 0.36937475204467773, "learning_rate": 3.2967032967032968e-06, "loss": 0.5401, "step": 2703 }, { "epoch": 2.9702045860222435, "grad_norm": 0.4324638843536377, "learning_rate": 3.174603174603174e-06, "loss": 0.5882, "step": 2704 }, { "epoch": 2.97130303446382, "grad_norm": 1.2700526714324951, "learning_rate": 3.052503052503052e-06, "loss": 0.613, "step": 2705 }, { "epoch": 2.972401482905396, "grad_norm": 0.5261131525039673, "learning_rate": 2.93040293040293e-06, "loss": 0.6279, "step": 2706 }, { "epoch": 2.9734999313469723, "grad_norm": 0.42924660444259644, "learning_rate": 2.8083028083028082e-06, "loss": 1.0058, "step": 2707 }, { "epoch": 2.9745983797885485, "grad_norm": 3.100399971008301, "learning_rate": 2.686202686202686e-06, "loss": 0.5209, "step": 2708 }, { "epoch": 2.975696828230125, "grad_norm": 0.3666403293609619, "learning_rate": 2.564102564102564e-06, "loss": 0.5231, "step": 2709 }, { "epoch": 2.976795276671701, "grad_norm": 1.1315009593963623, "learning_rate": 2.4420024420024414e-06, "loss": 0.4449, "step": 2710 }, { "epoch": 2.9778937251132778, "grad_norm": 0.3323412537574768, "learning_rate": 2.3199023199023197e-06, "loss": 0.4806, "step": 2711 }, { "epoch": 2.978992173554854, "grad_norm": 0.7348967790603638, "learning_rate": 2.1978021978021976e-06, "loss": 0.7521, "step": 2712 }, { "epoch": 2.98009062199643, "grad_norm": 1.018898606300354, "learning_rate": 2.0757020757020754e-06, "loss": 0.8468, "step": 2713 }, { "epoch": 2.981189070438006, "grad_norm": 0.46808505058288574, "learning_rate": 1.9536019536019533e-06, "loss": 0.6992, "step": 2714 }, { "epoch": 2.9822875188795823, "grad_norm": 0.5411276817321777, "learning_rate": 1.8315018315018314e-06, "loss": 0.5949, "step": 2715 }, { "epoch": 2.983385967321159, "grad_norm": 0.45061302185058594, "learning_rate": 1.7094017094017092e-06, "loss": 0.4617, "step": 2716 }, { "epoch": 2.984484415762735, "grad_norm": 0.44529294967651367, "learning_rate": 1.587301587301587e-06, "loss": 0.5811, "step": 2717 }, { "epoch": 2.9855828642043116, "grad_norm": 1.255299687385559, "learning_rate": 1.465201465201465e-06, "loss": 1.1899, "step": 2718 }, { "epoch": 2.9866813126458878, "grad_norm": 0.8325234651565552, "learning_rate": 1.343101343101343e-06, "loss": 0.6344, "step": 2719 }, { "epoch": 2.987779761087464, "grad_norm": 1.0692095756530762, "learning_rate": 1.2210012210012207e-06, "loss": 0.5136, "step": 2720 }, { "epoch": 2.98887820952904, "grad_norm": 0.4980855882167816, "learning_rate": 1.0989010989010988e-06, "loss": 0.6352, "step": 2721 }, { "epoch": 2.9899766579706166, "grad_norm": 0.8502411246299744, "learning_rate": 9.768009768009766e-07, "loss": 0.599, "step": 2722 }, { "epoch": 2.9910751064121928, "grad_norm": 0.4849570691585541, "learning_rate": 8.547008547008546e-07, "loss": 0.5862, "step": 2723 }, { "epoch": 2.992173554853769, "grad_norm": 0.5491626858711243, "learning_rate": 7.326007326007325e-07, "loss": 0.5634, "step": 2724 }, { "epoch": 2.9932720032953455, "grad_norm": 0.7289263606071472, "learning_rate": 6.105006105006104e-07, "loss": 0.6643, "step": 2725 }, { "epoch": 2.9943704517369216, "grad_norm": 1.5343972444534302, "learning_rate": 4.884004884004883e-07, "loss": 0.71, "step": 2726 }, { "epoch": 2.9954689001784978, "grad_norm": 0.5619814395904541, "learning_rate": 3.6630036630036624e-07, "loss": 0.721, "step": 2727 }, { "epoch": 2.996567348620074, "grad_norm": 0.500442624092102, "learning_rate": 2.4420024420024416e-07, "loss": 0.6571, "step": 2728 }, { "epoch": 2.9976657970616505, "grad_norm": 0.42292630672454834, "learning_rate": 1.2210012210012208e-07, "loss": 0.4772, "step": 2729 }, { "epoch": 2.9987642455032266, "grad_norm": 0.4350331425666809, "learning_rate": 0.0, "loss": 0.7493, "step": 2730 }, { "epoch": 2.9987642455032266, "step": 2730, "total_flos": 1.0372510312766669e+18, "train_loss": 0.674373844124022, "train_runtime": 11584.4184, "train_samples_per_second": 1.886, "train_steps_per_second": 0.236 } ], "logging_steps": 1.0, "max_steps": 2730, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0372510312766669e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }