{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.015910898965791568, "eval_steps": 10, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010607265977194378, "grad_norm": 1.647079586982727, "learning_rate": 4.99994960800331e-07, "loss": 2.3252, "step": 1 }, { "epoch": 0.00010607265977194378, "eval_loss": 2.388094663619995, "eval_runtime": 67.0533, "eval_samples_per_second": 1.909, "eval_steps_per_second": 0.477, "step": 1 }, { "epoch": 0.00021214531954388756, "grad_norm": 1.7374595403671265, "learning_rate": 4.999899206382888e-07, "loss": 2.2091, "step": 2 }, { "epoch": 0.00031821797931583137, "grad_norm": 1.4541484117507935, "learning_rate": 4.999848795135978e-07, "loss": 2.4903, "step": 3 }, { "epoch": 0.0004242906390877751, "grad_norm": 1.3729519844055176, "learning_rate": 4.999798374259822e-07, "loss": 2.3478, "step": 4 }, { "epoch": 0.0005303632988597189, "grad_norm": 2.059835433959961, "learning_rate": 4.999747943751658e-07, "loss": 2.2294, "step": 5 }, { "epoch": 0.0006364359586316627, "grad_norm": 1.4303361177444458, "learning_rate": 4.999697503608729e-07, "loss": 2.3702, "step": 6 }, { "epoch": 0.0007425086184036064, "grad_norm": 1.506787657737732, "learning_rate": 4.999647053828272e-07, "loss": 2.2794, "step": 7 }, { "epoch": 0.0008485812781755502, "grad_norm": 1.1569323539733887, "learning_rate": 4.999596594407525e-07, "loss": 2.2048, "step": 8 }, { "epoch": 0.0009546539379474941, "grad_norm": 1.0063825845718384, "learning_rate": 4.999546125343724e-07, "loss": 2.0392, "step": 9 }, { "epoch": 0.0010607265977194379, "grad_norm": 1.2020900249481201, "learning_rate": 4.999495646634105e-07, "loss": 2.1371, "step": 10 }, { "epoch": 0.0010607265977194379, "eval_loss": 1.957827091217041, "eval_runtime": 70.5284, "eval_samples_per_second": 1.815, "eval_steps_per_second": 0.454, "step": 10 }, { "epoch": 0.0011667992574913816, "grad_norm": 0.9959272742271423, "learning_rate": 4.999445158275902e-07, "loss": 2.0418, "step": 11 }, { "epoch": 0.0012728719172633255, "grad_norm": 0.8702138662338257, "learning_rate": 4.999394660266349e-07, "loss": 1.97, "step": 12 }, { "epoch": 0.0013789445770352692, "grad_norm": 0.6772508025169373, "learning_rate": 4.999344152602678e-07, "loss": 1.6033, "step": 13 }, { "epoch": 0.0014850172368072129, "grad_norm": 1.0128448009490967, "learning_rate": 4.99929363528212e-07, "loss": 2.1025, "step": 14 }, { "epoch": 0.0015910898965791568, "grad_norm": 0.9335213899612427, "learning_rate": 4.999243108301906e-07, "loss": 1.9303, "step": 15 }, { "epoch": 0.0016971625563511005, "grad_norm": 0.9497168660163879, "learning_rate": 4.999192571659265e-07, "loss": 1.8389, "step": 16 }, { "epoch": 0.0018032352161230442, "grad_norm": 0.8370901346206665, "learning_rate": 4.999142025351424e-07, "loss": 1.8423, "step": 17 }, { "epoch": 0.0019093078758949881, "grad_norm": 1.1051160097122192, "learning_rate": 4.999091469375611e-07, "loss": 1.9193, "step": 18 }, { "epoch": 0.002015380535666932, "grad_norm": 1.2687193155288696, "learning_rate": 4.999040903729051e-07, "loss": 2.1474, "step": 19 }, { "epoch": 0.0021214531954388757, "grad_norm": 0.9469927549362183, "learning_rate": 4.99899032840897e-07, "loss": 1.6744, "step": 20 }, { "epoch": 0.0021214531954388757, "eval_loss": 1.7858046293258667, "eval_runtime": 68.1877, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 20 }, { "epoch": 0.0022275258552108194, "grad_norm": 0.9844200015068054, "learning_rate": 4.998939743412591e-07, "loss": 1.9173, "step": 21 }, { "epoch": 0.002333598514982763, "grad_norm": 1.1347674131393433, "learning_rate": 4.998889148737137e-07, "loss": 1.8158, "step": 22 }, { "epoch": 0.002439671174754707, "grad_norm": 1.0652062892913818, "learning_rate": 4.99883854437983e-07, "loss": 1.9189, "step": 23 }, { "epoch": 0.002545743834526651, "grad_norm": 1.247801661491394, "learning_rate": 4.998787930337891e-07, "loss": 1.6044, "step": 24 }, { "epoch": 0.0026518164942985947, "grad_norm": 0.856572151184082, "learning_rate": 4.998737306608538e-07, "loss": 1.9368, "step": 25 }, { "epoch": 0.0027578891540705384, "grad_norm": 0.8902915716171265, "learning_rate": 4.998686673188991e-07, "loss": 1.7691, "step": 26 }, { "epoch": 0.002863961813842482, "grad_norm": 0.9449676871299744, "learning_rate": 4.998636030076468e-07, "loss": 1.7605, "step": 27 }, { "epoch": 0.0029700344736144258, "grad_norm": 0.7392516136169434, "learning_rate": 4.998585377268183e-07, "loss": 1.7255, "step": 28 }, { "epoch": 0.0030761071333863695, "grad_norm": 0.8038751482963562, "learning_rate": 4.998534714761353e-07, "loss": 1.7689, "step": 29 }, { "epoch": 0.0031821797931583136, "grad_norm": 0.8447410464286804, "learning_rate": 4.998484042553191e-07, "loss": 2.0482, "step": 30 }, { "epoch": 0.0031821797931583136, "eval_loss": 1.6817430257797241, "eval_runtime": 67.9443, "eval_samples_per_second": 1.884, "eval_steps_per_second": 0.471, "step": 30 }, { "epoch": 0.0032882524529302573, "grad_norm": 0.8365817070007324, "learning_rate": 4.998433360640912e-07, "loss": 1.8075, "step": 31 }, { "epoch": 0.003394325112702201, "grad_norm": 0.7525676488876343, "learning_rate": 4.998382669021727e-07, "loss": 1.6026, "step": 32 }, { "epoch": 0.0035003977724741447, "grad_norm": 0.9553101658821106, "learning_rate": 4.998331967692847e-07, "loss": 1.743, "step": 33 }, { "epoch": 0.0036064704322460884, "grad_norm": 0.966307520866394, "learning_rate": 4.998281256651483e-07, "loss": 1.7075, "step": 34 }, { "epoch": 0.0037125430920180325, "grad_norm": 0.8701184988021851, "learning_rate": 4.998230535894843e-07, "loss": 1.6257, "step": 35 }, { "epoch": 0.0038186157517899762, "grad_norm": 0.8450726866722107, "learning_rate": 4.998179805420135e-07, "loss": 1.8127, "step": 36 }, { "epoch": 0.00392468841156192, "grad_norm": 0.8065881133079529, "learning_rate": 4.998129065224565e-07, "loss": 1.7574, "step": 37 }, { "epoch": 0.004030761071333864, "grad_norm": 0.7674804925918579, "learning_rate": 4.99807831530534e-07, "loss": 1.6796, "step": 38 }, { "epoch": 0.004136833731105807, "grad_norm": 0.8442147970199585, "learning_rate": 4.998027555659665e-07, "loss": 1.5551, "step": 39 }, { "epoch": 0.0042429063908777515, "grad_norm": 0.7327367663383484, "learning_rate": 4.99797678628474e-07, "loss": 1.075, "step": 40 }, { "epoch": 0.0042429063908777515, "eval_loss": 1.6100257635116577, "eval_runtime": 67.9157, "eval_samples_per_second": 1.885, "eval_steps_per_second": 0.471, "step": 40 }, { "epoch": 0.004348979050649695, "grad_norm": 0.8727586269378662, "learning_rate": 4.997926007177772e-07, "loss": 1.6814, "step": 41 }, { "epoch": 0.004455051710421639, "grad_norm": 1.0420920848846436, "learning_rate": 4.99787521833596e-07, "loss": 1.6025, "step": 42 }, { "epoch": 0.004561124370193583, "grad_norm": 0.757056713104248, "learning_rate": 4.997824419756506e-07, "loss": 1.7756, "step": 43 }, { "epoch": 0.004667197029965526, "grad_norm": 0.9350019693374634, "learning_rate": 4.997773611436606e-07, "loss": 1.6165, "step": 44 }, { "epoch": 0.00477326968973747, "grad_norm": 0.7474361062049866, "learning_rate": 4.997722793373462e-07, "loss": 1.8263, "step": 45 }, { "epoch": 0.004879342349509414, "grad_norm": 0.6356221437454224, "learning_rate": 4.997671965564268e-07, "loss": 1.7313, "step": 46 }, { "epoch": 0.004985415009281358, "grad_norm": 0.7225522398948669, "learning_rate": 4.997621128006223e-07, "loss": 1.8336, "step": 47 }, { "epoch": 0.005091487669053302, "grad_norm": 0.7901801466941833, "learning_rate": 4.997570280696519e-07, "loss": 1.5573, "step": 48 }, { "epoch": 0.005197560328825245, "grad_norm": 0.7901318073272705, "learning_rate": 4.997519423632353e-07, "loss": 1.5356, "step": 49 }, { "epoch": 0.005303632988597189, "grad_norm": 0.7905575633049011, "learning_rate": 4.997468556810914e-07, "loss": 1.5592, "step": 50 }, { "epoch": 0.005303632988597189, "eval_loss": 1.565526008605957, "eval_runtime": 68.1883, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 50 }, { "epoch": 0.005409705648369133, "grad_norm": 0.7927577495574951, "learning_rate": 4.997417680229397e-07, "loss": 1.8136, "step": 51 }, { "epoch": 0.005515778308141077, "grad_norm": 0.6780912280082703, "learning_rate": 4.997366793884992e-07, "loss": 1.635, "step": 52 }, { "epoch": 0.005621850967913021, "grad_norm": 0.6629224419593811, "learning_rate": 4.997315897774888e-07, "loss": 1.6814, "step": 53 }, { "epoch": 0.005727923627684964, "grad_norm": 0.6568951606750488, "learning_rate": 4.997264991896272e-07, "loss": 1.7656, "step": 54 }, { "epoch": 0.005833996287456908, "grad_norm": 0.9639095664024353, "learning_rate": 4.997214076246334e-07, "loss": 1.6255, "step": 55 }, { "epoch": 0.0059400689472288515, "grad_norm": 1.7226521968841553, "learning_rate": 4.99716315082226e-07, "loss": 1.6617, "step": 56 }, { "epoch": 0.006046141607000796, "grad_norm": 0.7537712454795837, "learning_rate": 4.997112215621234e-07, "loss": 1.7756, "step": 57 }, { "epoch": 0.006152214266772739, "grad_norm": 0.5914387702941895, "learning_rate": 4.99706127064044e-07, "loss": 1.4576, "step": 58 }, { "epoch": 0.006258286926544683, "grad_norm": 0.5612177848815918, "learning_rate": 4.997010315877063e-07, "loss": 1.6828, "step": 59 }, { "epoch": 0.006364359586316627, "grad_norm": 0.6786366701126099, "learning_rate": 4.996959351328284e-07, "loss": 1.7288, "step": 60 }, { "epoch": 0.006364359586316627, "eval_loss": 1.537359356880188, "eval_runtime": 67.7518, "eval_samples_per_second": 1.889, "eval_steps_per_second": 0.472, "step": 60 }, { "epoch": 0.0064704322460885704, "grad_norm": 0.678156316280365, "learning_rate": 4.996908376991283e-07, "loss": 1.5268, "step": 61 }, { "epoch": 0.006576504905860515, "grad_norm": 0.6791099905967712, "learning_rate": 4.99685739286324e-07, "loss": 1.5498, "step": 62 }, { "epoch": 0.006682577565632458, "grad_norm": 0.702700674533844, "learning_rate": 4.996806398941335e-07, "loss": 1.6741, "step": 63 }, { "epoch": 0.006788650225404402, "grad_norm": 0.8344963788986206, "learning_rate": 4.996755395222746e-07, "loss": 1.6074, "step": 64 }, { "epoch": 0.006894722885176346, "grad_norm": 3.9447405338287354, "learning_rate": 4.996704381704648e-07, "loss": 1.5762, "step": 65 }, { "epoch": 0.007000795544948289, "grad_norm": 0.870587170124054, "learning_rate": 4.996653358384218e-07, "loss": 1.6515, "step": 66 }, { "epoch": 0.0071068682047202335, "grad_norm": 0.5930059552192688, "learning_rate": 4.996602325258629e-07, "loss": 1.7334, "step": 67 }, { "epoch": 0.007212940864492177, "grad_norm": 1.2227728366851807, "learning_rate": 4.996551282325055e-07, "loss": 1.3723, "step": 68 }, { "epoch": 0.007319013524264121, "grad_norm": 1.1175763607025146, "learning_rate": 4.996500229580668e-07, "loss": 1.4476, "step": 69 }, { "epoch": 0.007425086184036065, "grad_norm": 0.591778039932251, "learning_rate": 4.99644916702264e-07, "loss": 1.2612, "step": 70 }, { "epoch": 0.007425086184036065, "eval_loss": 1.5133857727050781, "eval_runtime": 68.2036, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 70 }, { "epoch": 0.007531158843808008, "grad_norm": 0.62836754322052, "learning_rate": 4.99639809464814e-07, "loss": 1.4586, "step": 71 }, { "epoch": 0.0076372315035799524, "grad_norm": 0.779591977596283, "learning_rate": 4.996347012454338e-07, "loss": 1.4937, "step": 72 }, { "epoch": 0.007743304163351896, "grad_norm": 0.7092106342315674, "learning_rate": 4.9962959204384e-07, "loss": 1.74, "step": 73 }, { "epoch": 0.00784937682312384, "grad_norm": 0.5990781188011169, "learning_rate": 4.996244818597496e-07, "loss": 1.3733, "step": 74 }, { "epoch": 0.007955449482895784, "grad_norm": 0.7790846824645996, "learning_rate": 4.996193706928789e-07, "loss": 1.5198, "step": 75 }, { "epoch": 0.008061522142667728, "grad_norm": 0.746094286441803, "learning_rate": 4.996142585429444e-07, "loss": 1.3825, "step": 76 }, { "epoch": 0.00816759480243967, "grad_norm": 1.385066032409668, "learning_rate": 4.996091454096626e-07, "loss": 1.473, "step": 77 }, { "epoch": 0.008273667462211615, "grad_norm": 0.6657389402389526, "learning_rate": 4.996040312927497e-07, "loss": 1.6375, "step": 78 }, { "epoch": 0.008379740121983559, "grad_norm": 1.1722540855407715, "learning_rate": 4.995989161919216e-07, "loss": 1.5355, "step": 79 }, { "epoch": 0.008485812781755503, "grad_norm": 0.7716183066368103, "learning_rate": 4.995938001068947e-07, "loss": 1.6588, "step": 80 }, { "epoch": 0.008485812781755503, "eval_loss": 1.4981476068496704, "eval_runtime": 67.8094, "eval_samples_per_second": 1.888, "eval_steps_per_second": 0.472, "step": 80 }, { "epoch": 0.008591885441527447, "grad_norm": 1.0189088582992554, "learning_rate": 4.995886830373846e-07, "loss": 1.5961, "step": 81 }, { "epoch": 0.00869795810129939, "grad_norm": 0.7427169680595398, "learning_rate": 4.995835649831073e-07, "loss": 1.6456, "step": 82 }, { "epoch": 0.008804030761071334, "grad_norm": 0.8337876796722412, "learning_rate": 4.995784459437785e-07, "loss": 1.2846, "step": 83 }, { "epoch": 0.008910103420843278, "grad_norm": 0.6561726331710815, "learning_rate": 4.995733259191137e-07, "loss": 1.5769, "step": 84 }, { "epoch": 0.009016176080615222, "grad_norm": 0.7599915862083435, "learning_rate": 4.995682049088284e-07, "loss": 1.7303, "step": 85 }, { "epoch": 0.009122248740387166, "grad_norm": 0.7153452038764954, "learning_rate": 4.995630829126379e-07, "loss": 1.5249, "step": 86 }, { "epoch": 0.009228321400159108, "grad_norm": 0.7663532495498657, "learning_rate": 4.995579599302577e-07, "loss": 1.4317, "step": 87 }, { "epoch": 0.009334394059931053, "grad_norm": 0.8668680191040039, "learning_rate": 4.995528359614027e-07, "loss": 1.8046, "step": 88 }, { "epoch": 0.009440466719702997, "grad_norm": 0.684675931930542, "learning_rate": 4.99547711005788e-07, "loss": 1.393, "step": 89 }, { "epoch": 0.00954653937947494, "grad_norm": 0.8086925745010376, "learning_rate": 4.995425850631287e-07, "loss": 1.5898, "step": 90 }, { "epoch": 0.00954653937947494, "eval_loss": 1.4867264032363892, "eval_runtime": 68.2886, "eval_samples_per_second": 1.874, "eval_steps_per_second": 0.469, "step": 90 }, { "epoch": 0.009652612039246885, "grad_norm": 0.6806846261024475, "learning_rate": 4.995374581331393e-07, "loss": 1.5126, "step": 91 }, { "epoch": 0.009758684699018827, "grad_norm": 0.7875713109970093, "learning_rate": 4.995323302155347e-07, "loss": 1.5119, "step": 92 }, { "epoch": 0.009864757358790771, "grad_norm": 0.6719956398010254, "learning_rate": 4.995272013100296e-07, "loss": 1.611, "step": 93 }, { "epoch": 0.009970830018562716, "grad_norm": 0.7365944385528564, "learning_rate": 4.995220714163384e-07, "loss": 1.2394, "step": 94 }, { "epoch": 0.01007690267833466, "grad_norm": 0.6620836853981018, "learning_rate": 4.995169405341754e-07, "loss": 1.6405, "step": 95 }, { "epoch": 0.010182975338106604, "grad_norm": 0.7277278900146484, "learning_rate": 4.995118086632551e-07, "loss": 1.7809, "step": 96 }, { "epoch": 0.010289047997878546, "grad_norm": 0.7061654925346375, "learning_rate": 4.995066758032913e-07, "loss": 1.361, "step": 97 }, { "epoch": 0.01039512065765049, "grad_norm": 0.7945475578308105, "learning_rate": 4.995015419539983e-07, "loss": 1.608, "step": 98 }, { "epoch": 0.010501193317422435, "grad_norm": 0.7080848813056946, "learning_rate": 4.994964071150901e-07, "loss": 1.6869, "step": 99 }, { "epoch": 0.010607265977194379, "grad_norm": 0.6934227347373962, "learning_rate": 4.994912712862803e-07, "loss": 1.4698, "step": 100 }, { "epoch": 0.010607265977194379, "eval_loss": 1.4758180379867554, "eval_runtime": 67.7252, "eval_samples_per_second": 1.89, "eval_steps_per_second": 0.472, "step": 100 }, { "epoch": 0.010713338636966323, "grad_norm": 0.6977412700653076, "learning_rate": 4.994861344672828e-07, "loss": 1.6276, "step": 101 }, { "epoch": 0.010819411296738265, "grad_norm": 0.8750130534172058, "learning_rate": 4.994809966578113e-07, "loss": 1.5425, "step": 102 }, { "epoch": 0.01092548395651021, "grad_norm": 1.6952922344207764, "learning_rate": 4.99475857857579e-07, "loss": 1.6159, "step": 103 }, { "epoch": 0.011031556616282153, "grad_norm": 0.7001510858535767, "learning_rate": 4.994707180662995e-07, "loss": 1.5937, "step": 104 }, { "epoch": 0.011137629276054098, "grad_norm": 0.7474836707115173, "learning_rate": 4.99465577283686e-07, "loss": 1.4775, "step": 105 }, { "epoch": 0.011243701935826042, "grad_norm": 1.114769458770752, "learning_rate": 4.994604355094518e-07, "loss": 1.4304, "step": 106 }, { "epoch": 0.011349774595597984, "grad_norm": 0.7222145199775696, "learning_rate": 4.994552927433097e-07, "loss": 1.2972, "step": 107 }, { "epoch": 0.011455847255369928, "grad_norm": 0.7787733674049377, "learning_rate": 4.994501489849728e-07, "loss": 1.8544, "step": 108 }, { "epoch": 0.011561919915141872, "grad_norm": 0.6402618288993835, "learning_rate": 4.994450042341541e-07, "loss": 1.5189, "step": 109 }, { "epoch": 0.011667992574913817, "grad_norm": 0.6818183064460754, "learning_rate": 4.99439858490566e-07, "loss": 1.3971, "step": 110 }, { "epoch": 0.011667992574913817, "eval_loss": 1.4671616554260254, "eval_runtime": 68.8225, "eval_samples_per_second": 1.86, "eval_steps_per_second": 0.465, "step": 110 }, { "epoch": 0.011774065234685759, "grad_norm": 0.8468677997589111, "learning_rate": 4.994347117539214e-07, "loss": 1.6674, "step": 111 }, { "epoch": 0.011880137894457703, "grad_norm": 0.9757254123687744, "learning_rate": 4.994295640239325e-07, "loss": 1.5847, "step": 112 }, { "epoch": 0.011986210554229647, "grad_norm": 0.9481499195098877, "learning_rate": 4.99424415300312e-07, "loss": 1.3539, "step": 113 }, { "epoch": 0.012092283214001591, "grad_norm": 0.6789958477020264, "learning_rate": 4.99419265582772e-07, "loss": 1.5099, "step": 114 }, { "epoch": 0.012198355873773535, "grad_norm": 0.6501567959785461, "learning_rate": 4.994141148710247e-07, "loss": 1.5429, "step": 115 }, { "epoch": 0.012304428533545478, "grad_norm": 1.262799859046936, "learning_rate": 4.994089631647824e-07, "loss": 1.1193, "step": 116 }, { "epoch": 0.012410501193317422, "grad_norm": 0.685874342918396, "learning_rate": 4.994038104637567e-07, "loss": 1.728, "step": 117 }, { "epoch": 0.012516573853089366, "grad_norm": 0.7375260591506958, "learning_rate": 4.993986567676594e-07, "loss": 1.6958, "step": 118 }, { "epoch": 0.01262264651286131, "grad_norm": 0.7215054631233215, "learning_rate": 4.993935020762025e-07, "loss": 1.3697, "step": 119 }, { "epoch": 0.012728719172633254, "grad_norm": 0.7148920297622681, "learning_rate": 4.993883463890975e-07, "loss": 1.6451, "step": 120 }, { "epoch": 0.012728719172633254, "eval_loss": 1.459830403327942, "eval_runtime": 68.6763, "eval_samples_per_second": 1.864, "eval_steps_per_second": 0.466, "step": 120 }, { "epoch": 0.012834791832405197, "grad_norm": 0.6565997004508972, "learning_rate": 4.993831897060559e-07, "loss": 1.4405, "step": 121 }, { "epoch": 0.012940864492177141, "grad_norm": 0.7842444181442261, "learning_rate": 4.993780320267891e-07, "loss": 1.659, "step": 122 }, { "epoch": 0.013046937151949085, "grad_norm": 0.7806555032730103, "learning_rate": 4.993728733510084e-07, "loss": 1.6056, "step": 123 }, { "epoch": 0.01315300981172103, "grad_norm": 0.8176814317703247, "learning_rate": 4.993677136784249e-07, "loss": 1.6902, "step": 124 }, { "epoch": 0.013259082471492973, "grad_norm": 0.7916125059127808, "learning_rate": 4.993625530087498e-07, "loss": 1.4324, "step": 125 }, { "epoch": 0.013365155131264916, "grad_norm": 0.6581283211708069, "learning_rate": 4.993573913416939e-07, "loss": 1.753, "step": 126 }, { "epoch": 0.01347122779103686, "grad_norm": 0.6454209685325623, "learning_rate": 4.99352228676968e-07, "loss": 1.5184, "step": 127 }, { "epoch": 0.013577300450808804, "grad_norm": 0.746411919593811, "learning_rate": 4.99347065014283e-07, "loss": 1.5564, "step": 128 }, { "epoch": 0.013683373110580748, "grad_norm": 0.710649847984314, "learning_rate": 4.993419003533493e-07, "loss": 1.46, "step": 129 }, { "epoch": 0.013789445770352692, "grad_norm": 0.7483745217323303, "learning_rate": 4.993367346938775e-07, "loss": 1.4491, "step": 130 }, { "epoch": 0.013789445770352692, "eval_loss": 1.4538627862930298, "eval_runtime": 68.3516, "eval_samples_per_second": 1.873, "eval_steps_per_second": 0.468, "step": 130 }, { "epoch": 0.013895518430124635, "grad_norm": 0.7695822715759277, "learning_rate": 4.993315680355781e-07, "loss": 1.5002, "step": 131 }, { "epoch": 0.014001591089896579, "grad_norm": 0.7457824945449829, "learning_rate": 4.993264003781611e-07, "loss": 1.4464, "step": 132 }, { "epoch": 0.014107663749668523, "grad_norm": 0.6966177821159363, "learning_rate": 4.99321231721337e-07, "loss": 1.5493, "step": 133 }, { "epoch": 0.014213736409440467, "grad_norm": 0.7250157594680786, "learning_rate": 4.993160620648156e-07, "loss": 1.6838, "step": 134 }, { "epoch": 0.014319809069212411, "grad_norm": 0.6885997653007507, "learning_rate": 4.993108914083069e-07, "loss": 1.3327, "step": 135 }, { "epoch": 0.014425881728984354, "grad_norm": 0.6850383877754211, "learning_rate": 4.993057197515208e-07, "loss": 1.373, "step": 136 }, { "epoch": 0.014531954388756298, "grad_norm": 0.8969720005989075, "learning_rate": 4.993005470941668e-07, "loss": 1.6318, "step": 137 }, { "epoch": 0.014638027048528242, "grad_norm": 0.6809049844741821, "learning_rate": 4.992953734359548e-07, "loss": 1.3728, "step": 138 }, { "epoch": 0.014744099708300186, "grad_norm": 0.6609801054000854, "learning_rate": 4.992901987765941e-07, "loss": 1.4302, "step": 139 }, { "epoch": 0.01485017236807213, "grad_norm": 0.6714500188827515, "learning_rate": 4.99285023115794e-07, "loss": 1.3048, "step": 140 }, { "epoch": 0.01485017236807213, "eval_loss": 1.4487165212631226, "eval_runtime": 68.3673, "eval_samples_per_second": 1.872, "eval_steps_per_second": 0.468, "step": 140 }, { "epoch": 0.014956245027844072, "grad_norm": 0.7616133689880371, "learning_rate": 4.992798464532639e-07, "loss": 1.6308, "step": 141 }, { "epoch": 0.015062317687616017, "grad_norm": 0.7023948431015015, "learning_rate": 4.99274668788713e-07, "loss": 1.4595, "step": 142 }, { "epoch": 0.01516839034738796, "grad_norm": 1.1847342252731323, "learning_rate": 4.992694901218502e-07, "loss": 1.892, "step": 143 }, { "epoch": 0.015274463007159905, "grad_norm": 0.6449259519577026, "learning_rate": 4.992643104523846e-07, "loss": 1.4526, "step": 144 }, { "epoch": 0.015380535666931849, "grad_norm": 0.8200846910476685, "learning_rate": 4.992591297800247e-07, "loss": 1.6277, "step": 145 }, { "epoch": 0.015486608326703791, "grad_norm": 0.627193808555603, "learning_rate": 4.992539481044796e-07, "loss": 1.2767, "step": 146 }, { "epoch": 0.015592680986475736, "grad_norm": 0.6754332780838013, "learning_rate": 4.992487654254575e-07, "loss": 1.7414, "step": 147 }, { "epoch": 0.01569875364624768, "grad_norm": 0.8303399682044983, "learning_rate": 4.992435817426671e-07, "loss": 1.5313, "step": 148 }, { "epoch": 0.015804826306019624, "grad_norm": 0.7653703093528748, "learning_rate": 4.992383970558168e-07, "loss": 1.6444, "step": 149 }, { "epoch": 0.015910898965791568, "grad_norm": 0.756195068359375, "learning_rate": 4.992332113646148e-07, "loss": 1.5063, "step": 150 }, { "epoch": 0.015910898965791568, "eval_loss": 1.4446443319320679, "eval_runtime": 67.8607, "eval_samples_per_second": 1.886, "eval_steps_per_second": 0.472, "step": 150 } ], "logging_steps": 1, "max_steps": 9427, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.418837751332864e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }