CodeLlama-13B-QML / trainer_state.json
qt-spyro-hf's picture
Upload 13 files
d544a68 verified
raw
history blame
98 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 20,
"global_step": 12178,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016424069473813874,
"grad_norm": 0.4667005240917206,
"learning_rate": 0.0002,
"loss": 1.9661,
"step": 20
},
{
"epoch": 0.003284813894762775,
"grad_norm": 0.5031771063804626,
"learning_rate": 0.0002,
"loss": 1.602,
"step": 40
},
{
"epoch": 0.004927220842144162,
"grad_norm": 0.4090685546398163,
"learning_rate": 0.0002,
"loss": 1.4703,
"step": 60
},
{
"epoch": 0.00656962778952555,
"grad_norm": 0.4099690020084381,
"learning_rate": 0.0002,
"loss": 1.3652,
"step": 80
},
{
"epoch": 0.008212034736906937,
"grad_norm": 0.4610142111778259,
"learning_rate": 0.0002,
"loss": 1.4386,
"step": 100
},
{
"epoch": 0.009854441684288324,
"grad_norm": 0.3908289968967438,
"learning_rate": 0.0002,
"loss": 1.3151,
"step": 120
},
{
"epoch": 0.011496848631669712,
"grad_norm": 0.4541659951210022,
"learning_rate": 0.0002,
"loss": 1.1233,
"step": 140
},
{
"epoch": 0.0131392555790511,
"grad_norm": 0.43324407935142517,
"learning_rate": 0.0002,
"loss": 1.1266,
"step": 160
},
{
"epoch": 0.014781662526432487,
"grad_norm": 0.3396519720554352,
"learning_rate": 0.0002,
"loss": 1.1004,
"step": 180
},
{
"epoch": 0.016424069473813873,
"grad_norm": 0.5125846266746521,
"learning_rate": 0.0002,
"loss": 1.1258,
"step": 200
},
{
"epoch": 0.01806647642119526,
"grad_norm": 0.4572688937187195,
"learning_rate": 0.0002,
"loss": 1.1796,
"step": 220
},
{
"epoch": 0.01970888336857665,
"grad_norm": 0.434186190366745,
"learning_rate": 0.0002,
"loss": 1.1016,
"step": 240
},
{
"epoch": 0.021351290315958036,
"grad_norm": 0.5205552577972412,
"learning_rate": 0.0002,
"loss": 1.0419,
"step": 260
},
{
"epoch": 0.022993697263339424,
"grad_norm": 0.3958785831928253,
"learning_rate": 0.0002,
"loss": 0.9515,
"step": 280
},
{
"epoch": 0.02463610421072081,
"grad_norm": 0.46327391266822815,
"learning_rate": 0.0002,
"loss": 1.0079,
"step": 300
},
{
"epoch": 0.0262785111581022,
"grad_norm": 0.39861008524894714,
"learning_rate": 0.0002,
"loss": 0.9755,
"step": 320
},
{
"epoch": 0.027920918105483587,
"grad_norm": 0.42074650526046753,
"learning_rate": 0.0002,
"loss": 0.9435,
"step": 340
},
{
"epoch": 0.029563325052864974,
"grad_norm": 0.41754183173179626,
"learning_rate": 0.0002,
"loss": 0.9376,
"step": 360
},
{
"epoch": 0.031205732000246362,
"grad_norm": 0.3933572769165039,
"learning_rate": 0.0002,
"loss": 0.9489,
"step": 380
},
{
"epoch": 0.032848138947627746,
"grad_norm": 0.4244033992290497,
"learning_rate": 0.0002,
"loss": 0.9759,
"step": 400
},
{
"epoch": 0.034490545895009134,
"grad_norm": 0.3638761639595032,
"learning_rate": 0.0002,
"loss": 0.9371,
"step": 420
},
{
"epoch": 0.03613295284239052,
"grad_norm": 0.4706399738788605,
"learning_rate": 0.0002,
"loss": 0.8464,
"step": 440
},
{
"epoch": 0.03777535978977191,
"grad_norm": 0.4349803328514099,
"learning_rate": 0.0002,
"loss": 0.8918,
"step": 460
},
{
"epoch": 0.0394177667371533,
"grad_norm": 0.3831111490726471,
"learning_rate": 0.0002,
"loss": 0.8366,
"step": 480
},
{
"epoch": 0.041060173684534684,
"grad_norm": 0.4122432470321655,
"learning_rate": 0.0002,
"loss": 0.8444,
"step": 500
},
{
"epoch": 0.04270258063191607,
"grad_norm": 0.3296256959438324,
"learning_rate": 0.0002,
"loss": 0.8301,
"step": 520
},
{
"epoch": 0.04434498757929746,
"grad_norm": 0.3447166979312897,
"learning_rate": 0.0002,
"loss": 0.857,
"step": 540
},
{
"epoch": 0.04598739452667885,
"grad_norm": 0.4408610761165619,
"learning_rate": 0.0002,
"loss": 0.8356,
"step": 560
},
{
"epoch": 0.047629801474060235,
"grad_norm": 0.4657248854637146,
"learning_rate": 0.0002,
"loss": 0.7525,
"step": 580
},
{
"epoch": 0.04927220842144162,
"grad_norm": 0.35138434171676636,
"learning_rate": 0.0002,
"loss": 0.7486,
"step": 600
},
{
"epoch": 0.05091461536882301,
"grad_norm": 0.4687822461128235,
"learning_rate": 0.0002,
"loss": 0.8169,
"step": 620
},
{
"epoch": 0.0525570223162044,
"grad_norm": 0.465108186006546,
"learning_rate": 0.0002,
"loss": 0.738,
"step": 640
},
{
"epoch": 0.054199429263585785,
"grad_norm": 0.3954925835132599,
"learning_rate": 0.0002,
"loss": 0.7627,
"step": 660
},
{
"epoch": 0.05584183621096717,
"grad_norm": 0.5010778307914734,
"learning_rate": 0.0002,
"loss": 0.7273,
"step": 680
},
{
"epoch": 0.05748424315834856,
"grad_norm": 0.6221648454666138,
"learning_rate": 0.0002,
"loss": 0.7506,
"step": 700
},
{
"epoch": 0.05912665010572995,
"grad_norm": 0.4075715243816376,
"learning_rate": 0.0002,
"loss": 0.7587,
"step": 720
},
{
"epoch": 0.060769057053111336,
"grad_norm": 0.4346787631511688,
"learning_rate": 0.0002,
"loss": 0.7627,
"step": 740
},
{
"epoch": 0.062411464000492724,
"grad_norm": 0.4146323800086975,
"learning_rate": 0.0002,
"loss": 0.6642,
"step": 760
},
{
"epoch": 0.06405387094787411,
"grad_norm": 0.4093219041824341,
"learning_rate": 0.0002,
"loss": 0.7148,
"step": 780
},
{
"epoch": 0.06569627789525549,
"grad_norm": 0.4016498327255249,
"learning_rate": 0.0002,
"loss": 0.6522,
"step": 800
},
{
"epoch": 0.06733868484263689,
"grad_norm": 0.436252236366272,
"learning_rate": 0.0002,
"loss": 0.6884,
"step": 820
},
{
"epoch": 0.06898109179001827,
"grad_norm": 0.4362093508243561,
"learning_rate": 0.0002,
"loss": 0.7185,
"step": 840
},
{
"epoch": 0.07062349873739966,
"grad_norm": 0.42092448472976685,
"learning_rate": 0.0002,
"loss": 0.6702,
"step": 860
},
{
"epoch": 0.07226590568478104,
"grad_norm": 0.4649953842163086,
"learning_rate": 0.0002,
"loss": 0.6753,
"step": 880
},
{
"epoch": 0.07390831263216244,
"grad_norm": 0.4321405589580536,
"learning_rate": 0.0002,
"loss": 0.6578,
"step": 900
},
{
"epoch": 0.07555071957954382,
"grad_norm": 0.5045340657234192,
"learning_rate": 0.0002,
"loss": 0.6993,
"step": 920
},
{
"epoch": 0.07719312652692521,
"grad_norm": 0.5063377022743225,
"learning_rate": 0.0002,
"loss": 0.6654,
"step": 940
},
{
"epoch": 0.0788355334743066,
"grad_norm": 0.41710513830184937,
"learning_rate": 0.0002,
"loss": 0.6264,
"step": 960
},
{
"epoch": 0.08047794042168799,
"grad_norm": 0.4204249083995819,
"learning_rate": 0.0002,
"loss": 0.6683,
"step": 980
},
{
"epoch": 0.08212034736906937,
"grad_norm": 0.44983726739883423,
"learning_rate": 0.0002,
"loss": 0.6592,
"step": 1000
},
{
"epoch": 0.08376275431645076,
"grad_norm": 0.5991094708442688,
"learning_rate": 0.0002,
"loss": 0.6197,
"step": 1020
},
{
"epoch": 0.08540516126383214,
"grad_norm": 0.3672972619533539,
"learning_rate": 0.0002,
"loss": 0.5656,
"step": 1040
},
{
"epoch": 0.08704756821121354,
"grad_norm": 0.503656804561615,
"learning_rate": 0.0002,
"loss": 0.6017,
"step": 1060
},
{
"epoch": 0.08868997515859492,
"grad_norm": 0.49204686284065247,
"learning_rate": 0.0002,
"loss": 0.6421,
"step": 1080
},
{
"epoch": 0.09033238210597631,
"grad_norm": 0.45617127418518066,
"learning_rate": 0.0002,
"loss": 0.6176,
"step": 1100
},
{
"epoch": 0.0919747890533577,
"grad_norm": 0.49607595801353455,
"learning_rate": 0.0002,
"loss": 0.5595,
"step": 1120
},
{
"epoch": 0.09361719600073909,
"grad_norm": 0.39171984791755676,
"learning_rate": 0.0002,
"loss": 0.5479,
"step": 1140
},
{
"epoch": 0.09525960294812047,
"grad_norm": 0.4964667558670044,
"learning_rate": 0.0002,
"loss": 0.5937,
"step": 1160
},
{
"epoch": 0.09690200989550186,
"grad_norm": 0.40392565727233887,
"learning_rate": 0.0002,
"loss": 0.5888,
"step": 1180
},
{
"epoch": 0.09854441684288325,
"grad_norm": 0.4721887409687042,
"learning_rate": 0.0002,
"loss": 0.5345,
"step": 1200
},
{
"epoch": 0.10018682379026464,
"grad_norm": 0.4130144417285919,
"learning_rate": 0.0002,
"loss": 0.599,
"step": 1220
},
{
"epoch": 0.10182923073764602,
"grad_norm": 0.4222985506057739,
"learning_rate": 0.0002,
"loss": 0.5762,
"step": 1240
},
{
"epoch": 0.1034716376850274,
"grad_norm": 0.47171750664711,
"learning_rate": 0.0002,
"loss": 0.5619,
"step": 1260
},
{
"epoch": 0.1051140446324088,
"grad_norm": 0.40906137228012085,
"learning_rate": 0.0002,
"loss": 0.5137,
"step": 1280
},
{
"epoch": 0.10675645157979018,
"grad_norm": 0.43774527311325073,
"learning_rate": 0.0002,
"loss": 0.5888,
"step": 1300
},
{
"epoch": 0.10839885852717157,
"grad_norm": 0.5423911213874817,
"learning_rate": 0.0002,
"loss": 0.5409,
"step": 1320
},
{
"epoch": 0.11004126547455295,
"grad_norm": 0.4405030906200409,
"learning_rate": 0.0002,
"loss": 0.5248,
"step": 1340
},
{
"epoch": 0.11168367242193435,
"grad_norm": 0.4299491345882416,
"learning_rate": 0.0002,
"loss": 0.5196,
"step": 1360
},
{
"epoch": 0.11332607936931573,
"grad_norm": 0.5445800423622131,
"learning_rate": 0.0002,
"loss": 0.5524,
"step": 1380
},
{
"epoch": 0.11496848631669712,
"grad_norm": 0.42257580161094666,
"learning_rate": 0.0002,
"loss": 0.5266,
"step": 1400
},
{
"epoch": 0.1166108932640785,
"grad_norm": 0.4614318907260895,
"learning_rate": 0.0002,
"loss": 0.5593,
"step": 1420
},
{
"epoch": 0.1182533002114599,
"grad_norm": 0.5021907687187195,
"learning_rate": 0.0002,
"loss": 0.5183,
"step": 1440
},
{
"epoch": 0.11989570715884128,
"grad_norm": 0.39399659633636475,
"learning_rate": 0.0002,
"loss": 0.516,
"step": 1460
},
{
"epoch": 0.12153811410622267,
"grad_norm": 0.5128427743911743,
"learning_rate": 0.0002,
"loss": 0.5067,
"step": 1480
},
{
"epoch": 0.12318052105360405,
"grad_norm": 0.41359153389930725,
"learning_rate": 0.0002,
"loss": 0.508,
"step": 1500
},
{
"epoch": 0.12482292800098545,
"grad_norm": 0.5723029375076294,
"learning_rate": 0.0002,
"loss": 0.4955,
"step": 1520
},
{
"epoch": 0.12646533494836684,
"grad_norm": 0.4619792699813843,
"learning_rate": 0.0002,
"loss": 0.5398,
"step": 1540
},
{
"epoch": 0.12810774189574822,
"grad_norm": 0.5200566649436951,
"learning_rate": 0.0002,
"loss": 0.5213,
"step": 1560
},
{
"epoch": 0.1297501488431296,
"grad_norm": 0.4156297445297241,
"learning_rate": 0.0002,
"loss": 0.4895,
"step": 1580
},
{
"epoch": 0.13139255579051098,
"grad_norm": 0.43649184703826904,
"learning_rate": 0.0002,
"loss": 0.4809,
"step": 1600
},
{
"epoch": 0.1330349627378924,
"grad_norm": 0.38926875591278076,
"learning_rate": 0.0002,
"loss": 0.4819,
"step": 1620
},
{
"epoch": 0.13467736968527377,
"grad_norm": 0.45897549390792847,
"learning_rate": 0.0002,
"loss": 0.4619,
"step": 1640
},
{
"epoch": 0.13631977663265515,
"grad_norm": 0.4487549364566803,
"learning_rate": 0.0002,
"loss": 0.4737,
"step": 1660
},
{
"epoch": 0.13796218358003653,
"grad_norm": 0.36948007345199585,
"learning_rate": 0.0002,
"loss": 0.4576,
"step": 1680
},
{
"epoch": 0.13960459052741794,
"grad_norm": 0.38834378123283386,
"learning_rate": 0.0002,
"loss": 0.4464,
"step": 1700
},
{
"epoch": 0.14124699747479932,
"grad_norm": 0.5436655879020691,
"learning_rate": 0.0002,
"loss": 0.4616,
"step": 1720
},
{
"epoch": 0.1428894044221807,
"grad_norm": 0.3576355278491974,
"learning_rate": 0.0002,
"loss": 0.4669,
"step": 1740
},
{
"epoch": 0.14453181136956209,
"grad_norm": 0.4736698269844055,
"learning_rate": 0.0002,
"loss": 0.4788,
"step": 1760
},
{
"epoch": 0.1461742183169435,
"grad_norm": 0.4074772596359253,
"learning_rate": 0.0002,
"loss": 0.4214,
"step": 1780
},
{
"epoch": 0.14781662526432487,
"grad_norm": 0.4454910457134247,
"learning_rate": 0.0002,
"loss": 0.4407,
"step": 1800
},
{
"epoch": 0.14945903221170626,
"grad_norm": 0.4039610028266907,
"learning_rate": 0.0002,
"loss": 0.4585,
"step": 1820
},
{
"epoch": 0.15110143915908764,
"grad_norm": 0.4431604743003845,
"learning_rate": 0.0002,
"loss": 0.4483,
"step": 1840
},
{
"epoch": 0.15274384610646902,
"grad_norm": 0.4190782606601715,
"learning_rate": 0.0002,
"loss": 0.4516,
"step": 1860
},
{
"epoch": 0.15438625305385043,
"grad_norm": 0.2951456606388092,
"learning_rate": 0.0002,
"loss": 0.4584,
"step": 1880
},
{
"epoch": 0.1560286600012318,
"grad_norm": 0.4400006830692291,
"learning_rate": 0.0002,
"loss": 0.4533,
"step": 1900
},
{
"epoch": 0.1576710669486132,
"grad_norm": 0.3839446008205414,
"learning_rate": 0.0002,
"loss": 0.4489,
"step": 1920
},
{
"epoch": 0.15931347389599457,
"grad_norm": 0.41484808921813965,
"learning_rate": 0.0002,
"loss": 0.422,
"step": 1940
},
{
"epoch": 0.16095588084337598,
"grad_norm": 0.5211725831031799,
"learning_rate": 0.0002,
"loss": 0.4379,
"step": 1960
},
{
"epoch": 0.16259828779075736,
"grad_norm": 0.3866327106952667,
"learning_rate": 0.0002,
"loss": 0.4279,
"step": 1980
},
{
"epoch": 0.16424069473813874,
"grad_norm": 0.3327186107635498,
"learning_rate": 0.0002,
"loss": 0.417,
"step": 2000
},
{
"epoch": 0.16588310168552012,
"grad_norm": 0.46427205204963684,
"learning_rate": 0.0002,
"loss": 0.4411,
"step": 2020
},
{
"epoch": 0.16752550863290153,
"grad_norm": 0.4826524257659912,
"learning_rate": 0.0002,
"loss": 0.4359,
"step": 2040
},
{
"epoch": 0.1691679155802829,
"grad_norm": 0.4641328454017639,
"learning_rate": 0.0002,
"loss": 0.4691,
"step": 2060
},
{
"epoch": 0.1708103225276643,
"grad_norm": 0.525749683380127,
"learning_rate": 0.0002,
"loss": 0.4297,
"step": 2080
},
{
"epoch": 0.17245272947504567,
"grad_norm": 0.45604804158210754,
"learning_rate": 0.0002,
"loss": 0.4411,
"step": 2100
},
{
"epoch": 0.17409513642242708,
"grad_norm": 0.3894326984882355,
"learning_rate": 0.0002,
"loss": 0.4098,
"step": 2120
},
{
"epoch": 0.17573754336980846,
"grad_norm": 0.34401944279670715,
"learning_rate": 0.0002,
"loss": 0.406,
"step": 2140
},
{
"epoch": 0.17737995031718984,
"grad_norm": 0.3576812148094177,
"learning_rate": 0.0002,
"loss": 0.4024,
"step": 2160
},
{
"epoch": 0.17902235726457122,
"grad_norm": 0.4276871979236603,
"learning_rate": 0.0002,
"loss": 0.4085,
"step": 2180
},
{
"epoch": 0.18066476421195263,
"grad_norm": 0.49007973074913025,
"learning_rate": 0.0002,
"loss": 0.4104,
"step": 2200
},
{
"epoch": 0.182307171159334,
"grad_norm": 0.4573257267475128,
"learning_rate": 0.0002,
"loss": 0.4041,
"step": 2220
},
{
"epoch": 0.1839495781067154,
"grad_norm": 0.4118468463420868,
"learning_rate": 0.0002,
"loss": 0.3984,
"step": 2240
},
{
"epoch": 0.18559198505409677,
"grad_norm": 0.357284277677536,
"learning_rate": 0.0002,
"loss": 0.4212,
"step": 2260
},
{
"epoch": 0.18723439200147818,
"grad_norm": 0.4252781867980957,
"learning_rate": 0.0002,
"loss": 0.3924,
"step": 2280
},
{
"epoch": 0.18887679894885956,
"grad_norm": 0.40546557307243347,
"learning_rate": 0.0002,
"loss": 0.398,
"step": 2300
},
{
"epoch": 0.19051920589624094,
"grad_norm": 0.4305673837661743,
"learning_rate": 0.0002,
"loss": 0.398,
"step": 2320
},
{
"epoch": 0.19216161284362232,
"grad_norm": 0.40348726511001587,
"learning_rate": 0.0002,
"loss": 0.4031,
"step": 2340
},
{
"epoch": 0.19380401979100373,
"grad_norm": 0.48159924149513245,
"learning_rate": 0.0002,
"loss": 0.3926,
"step": 2360
},
{
"epoch": 0.1954464267383851,
"grad_norm": 0.5939348936080933,
"learning_rate": 0.0002,
"loss": 0.3963,
"step": 2380
},
{
"epoch": 0.1970888336857665,
"grad_norm": 0.42593804001808167,
"learning_rate": 0.0002,
"loss": 0.3925,
"step": 2400
},
{
"epoch": 0.19873124063314787,
"grad_norm": 0.515277624130249,
"learning_rate": 0.0002,
"loss": 0.3753,
"step": 2420
},
{
"epoch": 0.20037364758052928,
"grad_norm": 0.43423864245414734,
"learning_rate": 0.0002,
"loss": 0.396,
"step": 2440
},
{
"epoch": 0.20201605452791066,
"grad_norm": 0.3857817053794861,
"learning_rate": 0.0002,
"loss": 0.3834,
"step": 2460
},
{
"epoch": 0.20365846147529204,
"grad_norm": 0.3945648670196533,
"learning_rate": 0.0002,
"loss": 0.3768,
"step": 2480
},
{
"epoch": 0.20530086842267342,
"grad_norm": 0.46411946415901184,
"learning_rate": 0.0002,
"loss": 0.3852,
"step": 2500
},
{
"epoch": 0.2069432753700548,
"grad_norm": 0.3779551684856415,
"learning_rate": 0.0002,
"loss": 0.3767,
"step": 2520
},
{
"epoch": 0.2085856823174362,
"grad_norm": 0.4743368625640869,
"learning_rate": 0.0002,
"loss": 0.4253,
"step": 2540
},
{
"epoch": 0.2102280892648176,
"grad_norm": 0.4278275668621063,
"learning_rate": 0.0002,
"loss": 0.3558,
"step": 2560
},
{
"epoch": 0.21187049621219897,
"grad_norm": 0.42412903904914856,
"learning_rate": 0.0002,
"loss": 0.3934,
"step": 2580
},
{
"epoch": 0.21351290315958035,
"grad_norm": 7.02437162399292,
"learning_rate": 0.0002,
"loss": 0.3972,
"step": 2600
},
{
"epoch": 0.21515531010696176,
"grad_norm": 0.46447402238845825,
"learning_rate": 0.0002,
"loss": 0.3742,
"step": 2620
},
{
"epoch": 0.21679771705434314,
"grad_norm": 0.4078330993652344,
"learning_rate": 0.0002,
"loss": 0.3954,
"step": 2640
},
{
"epoch": 0.21844012400172452,
"grad_norm": 0.39751455187797546,
"learning_rate": 0.0002,
"loss": 0.36,
"step": 2660
},
{
"epoch": 0.2200825309491059,
"grad_norm": 0.4075968265533447,
"learning_rate": 0.0002,
"loss": 0.3894,
"step": 2680
},
{
"epoch": 0.2217249378964873,
"grad_norm": 0.39630162715911865,
"learning_rate": 0.0002,
"loss": 0.3748,
"step": 2700
},
{
"epoch": 0.2233673448438687,
"grad_norm": 0.42885056138038635,
"learning_rate": 0.0002,
"loss": 0.3496,
"step": 2720
},
{
"epoch": 0.22500975179125007,
"grad_norm": 0.4635525941848755,
"learning_rate": 0.0002,
"loss": 0.3494,
"step": 2740
},
{
"epoch": 0.22665215873863145,
"grad_norm": 0.48458898067474365,
"learning_rate": 0.0002,
"loss": 0.387,
"step": 2760
},
{
"epoch": 0.22829456568601286,
"grad_norm": 0.49742501974105835,
"learning_rate": 0.0002,
"loss": 0.3717,
"step": 2780
},
{
"epoch": 0.22993697263339424,
"grad_norm": 0.4279645085334778,
"learning_rate": 0.0002,
"loss": 0.3537,
"step": 2800
},
{
"epoch": 0.23157937958077562,
"grad_norm": 0.5221889615058899,
"learning_rate": 0.0002,
"loss": 0.3676,
"step": 2820
},
{
"epoch": 0.233221786528157,
"grad_norm": 0.5390656590461731,
"learning_rate": 0.0002,
"loss": 0.3439,
"step": 2840
},
{
"epoch": 0.2348641934755384,
"grad_norm": 0.4269630014896393,
"learning_rate": 0.0002,
"loss": 0.3663,
"step": 2860
},
{
"epoch": 0.2365066004229198,
"grad_norm": 0.37411990761756897,
"learning_rate": 0.0002,
"loss": 0.3779,
"step": 2880
},
{
"epoch": 0.23814900737030117,
"grad_norm": 0.3186222016811371,
"learning_rate": 0.0002,
"loss": 0.3513,
"step": 2900
},
{
"epoch": 0.23979141431768256,
"grad_norm": 0.33270496129989624,
"learning_rate": 0.0002,
"loss": 0.3534,
"step": 2920
},
{
"epoch": 0.24143382126506396,
"grad_norm": 0.4496273100376129,
"learning_rate": 0.0002,
"loss": 0.3588,
"step": 2940
},
{
"epoch": 0.24307622821244534,
"grad_norm": 0.35411253571510315,
"learning_rate": 0.0002,
"loss": 0.3466,
"step": 2960
},
{
"epoch": 0.24471863515982673,
"grad_norm": 0.4333256185054779,
"learning_rate": 0.0002,
"loss": 0.3555,
"step": 2980
},
{
"epoch": 0.2463610421072081,
"grad_norm": 0.3264130651950836,
"learning_rate": 0.0002,
"loss": 0.3345,
"step": 3000
},
{
"epoch": 0.24800344905458951,
"grad_norm": 0.3925504684448242,
"learning_rate": 0.0002,
"loss": 0.3559,
"step": 3020
},
{
"epoch": 0.2496458560019709,
"grad_norm": 0.4186360836029053,
"learning_rate": 0.0002,
"loss": 0.3458,
"step": 3040
},
{
"epoch": 0.2512882629493523,
"grad_norm": 0.4656223952770233,
"learning_rate": 0.0002,
"loss": 0.349,
"step": 3060
},
{
"epoch": 0.2529306698967337,
"grad_norm": 0.4535064399242401,
"learning_rate": 0.0002,
"loss": 0.3474,
"step": 3080
},
{
"epoch": 0.25457307684411506,
"grad_norm": 0.37564146518707275,
"learning_rate": 0.0002,
"loss": 0.3454,
"step": 3100
},
{
"epoch": 0.25621548379149645,
"grad_norm": 0.36363497376441956,
"learning_rate": 0.0002,
"loss": 0.3515,
"step": 3120
},
{
"epoch": 0.2578578907388778,
"grad_norm": 0.380750447511673,
"learning_rate": 0.0002,
"loss": 0.3653,
"step": 3140
},
{
"epoch": 0.2595002976862592,
"grad_norm": 0.3188472092151642,
"learning_rate": 0.0002,
"loss": 0.3596,
"step": 3160
},
{
"epoch": 0.2611427046336406,
"grad_norm": 0.4478905200958252,
"learning_rate": 0.0002,
"loss": 0.3567,
"step": 3180
},
{
"epoch": 0.26278511158102197,
"grad_norm": 0.4925800859928131,
"learning_rate": 0.0002,
"loss": 0.3466,
"step": 3200
},
{
"epoch": 0.26442751852840335,
"grad_norm": 0.3702840209007263,
"learning_rate": 0.0002,
"loss": 0.3327,
"step": 3220
},
{
"epoch": 0.2660699254757848,
"grad_norm": 0.35024309158325195,
"learning_rate": 0.0002,
"loss": 0.3524,
"step": 3240
},
{
"epoch": 0.26771233242316617,
"grad_norm": 0.4079764783382416,
"learning_rate": 0.0002,
"loss": 0.338,
"step": 3260
},
{
"epoch": 0.26935473937054755,
"grad_norm": 0.4466266632080078,
"learning_rate": 0.0002,
"loss": 0.3465,
"step": 3280
},
{
"epoch": 0.2709971463179289,
"grad_norm": 0.4438311457633972,
"learning_rate": 0.0002,
"loss": 0.3396,
"step": 3300
},
{
"epoch": 0.2726395532653103,
"grad_norm": 0.37101468443870544,
"learning_rate": 0.0002,
"loss": 0.3392,
"step": 3320
},
{
"epoch": 0.2742819602126917,
"grad_norm": 0.41411712765693665,
"learning_rate": 0.0002,
"loss": 0.3341,
"step": 3340
},
{
"epoch": 0.27592436716007307,
"grad_norm": 0.47411611676216125,
"learning_rate": 0.0002,
"loss": 0.3355,
"step": 3360
},
{
"epoch": 0.27756677410745445,
"grad_norm": 0.4871801733970642,
"learning_rate": 0.0002,
"loss": 0.3627,
"step": 3380
},
{
"epoch": 0.2792091810548359,
"grad_norm": 0.47128844261169434,
"learning_rate": 0.0002,
"loss": 0.324,
"step": 3400
},
{
"epoch": 0.28085158800221727,
"grad_norm": 0.4556843042373657,
"learning_rate": 0.0002,
"loss": 0.3443,
"step": 3420
},
{
"epoch": 0.28249399494959865,
"grad_norm": 0.3775945007801056,
"learning_rate": 0.0002,
"loss": 0.3401,
"step": 3440
},
{
"epoch": 0.28413640189698003,
"grad_norm": 0.377316415309906,
"learning_rate": 0.0002,
"loss": 0.3478,
"step": 3460
},
{
"epoch": 0.2857788088443614,
"grad_norm": 0.336944580078125,
"learning_rate": 0.0002,
"loss": 0.3382,
"step": 3480
},
{
"epoch": 0.2874212157917428,
"grad_norm": 0.4296940863132477,
"learning_rate": 0.0002,
"loss": 0.3361,
"step": 3500
},
{
"epoch": 0.28906362273912417,
"grad_norm": 0.4638020396232605,
"learning_rate": 0.0002,
"loss": 0.3583,
"step": 3520
},
{
"epoch": 0.29070602968650555,
"grad_norm": 0.4074634313583374,
"learning_rate": 0.0002,
"loss": 0.3601,
"step": 3540
},
{
"epoch": 0.292348436633887,
"grad_norm": 0.3634164035320282,
"learning_rate": 0.0002,
"loss": 0.3216,
"step": 3560
},
{
"epoch": 0.29399084358126837,
"grad_norm": 0.43480202555656433,
"learning_rate": 0.0002,
"loss": 0.33,
"step": 3580
},
{
"epoch": 0.29563325052864975,
"grad_norm": 0.42778658866882324,
"learning_rate": 0.0002,
"loss": 0.3408,
"step": 3600
},
{
"epoch": 0.29727565747603113,
"grad_norm": 0.3778844177722931,
"learning_rate": 0.0002,
"loss": 0.3309,
"step": 3620
},
{
"epoch": 0.2989180644234125,
"grad_norm": 0.33491814136505127,
"learning_rate": 0.0002,
"loss": 0.3011,
"step": 3640
},
{
"epoch": 0.3005604713707939,
"grad_norm": 0.5079118609428406,
"learning_rate": 0.0002,
"loss": 0.3079,
"step": 3660
},
{
"epoch": 0.30220287831817527,
"grad_norm": 0.3751799166202545,
"learning_rate": 0.0002,
"loss": 0.3286,
"step": 3680
},
{
"epoch": 0.30384528526555665,
"grad_norm": 0.4447515904903412,
"learning_rate": 0.0002,
"loss": 0.2991,
"step": 3700
},
{
"epoch": 0.30548769221293803,
"grad_norm": 0.33741819858551025,
"learning_rate": 0.0002,
"loss": 0.3169,
"step": 3720
},
{
"epoch": 0.30713009916031947,
"grad_norm": 0.3624327480792999,
"learning_rate": 0.0002,
"loss": 0.3213,
"step": 3740
},
{
"epoch": 0.30877250610770085,
"grad_norm": 0.5299442410469055,
"learning_rate": 0.0002,
"loss": 0.3476,
"step": 3760
},
{
"epoch": 0.31041491305508223,
"grad_norm": 0.3178050220012665,
"learning_rate": 0.0002,
"loss": 0.329,
"step": 3780
},
{
"epoch": 0.3120573200024636,
"grad_norm": 0.3178127408027649,
"learning_rate": 0.0002,
"loss": 0.3046,
"step": 3800
},
{
"epoch": 0.313699726949845,
"grad_norm": 0.4366089403629303,
"learning_rate": 0.0002,
"loss": 0.3179,
"step": 3820
},
{
"epoch": 0.3153421338972264,
"grad_norm": 0.47534024715423584,
"learning_rate": 0.0002,
"loss": 0.3377,
"step": 3840
},
{
"epoch": 0.31698454084460775,
"grad_norm": 0.4247181713581085,
"learning_rate": 0.0002,
"loss": 0.311,
"step": 3860
},
{
"epoch": 0.31862694779198913,
"grad_norm": 0.5085952877998352,
"learning_rate": 0.0002,
"loss": 0.3197,
"step": 3880
},
{
"epoch": 0.32026935473937057,
"grad_norm": 0.3649958372116089,
"learning_rate": 0.0002,
"loss": 0.3243,
"step": 3900
},
{
"epoch": 0.32191176168675195,
"grad_norm": 0.43816304206848145,
"learning_rate": 0.0002,
"loss": 0.3232,
"step": 3920
},
{
"epoch": 0.32355416863413333,
"grad_norm": 0.32603034377098083,
"learning_rate": 0.0002,
"loss": 0.3155,
"step": 3940
},
{
"epoch": 0.3251965755815147,
"grad_norm": 0.4867421090602875,
"learning_rate": 0.0002,
"loss": 0.3102,
"step": 3960
},
{
"epoch": 0.3268389825288961,
"grad_norm": 0.3843926191329956,
"learning_rate": 0.0002,
"loss": 0.3035,
"step": 3980
},
{
"epoch": 0.3284813894762775,
"grad_norm": 0.49313676357269287,
"learning_rate": 0.0002,
"loss": 0.322,
"step": 4000
},
{
"epoch": 0.33012379642365886,
"grad_norm": 0.4102085530757904,
"learning_rate": 0.0002,
"loss": 0.3206,
"step": 4020
},
{
"epoch": 0.33176620337104024,
"grad_norm": 0.47901496291160583,
"learning_rate": 0.0002,
"loss": 0.3131,
"step": 4040
},
{
"epoch": 0.33340861031842167,
"grad_norm": 0.40674644708633423,
"learning_rate": 0.0002,
"loss": 0.3091,
"step": 4060
},
{
"epoch": 0.33505101726580305,
"grad_norm": 0.44038107991218567,
"learning_rate": 0.0002,
"loss": 0.3116,
"step": 4080
},
{
"epoch": 0.33669342421318443,
"grad_norm": 0.3919316828250885,
"learning_rate": 0.0002,
"loss": 0.3077,
"step": 4100
},
{
"epoch": 0.3383358311605658,
"grad_norm": 0.38622769713401794,
"learning_rate": 0.0002,
"loss": 0.302,
"step": 4120
},
{
"epoch": 0.3399782381079472,
"grad_norm": 0.4685916602611542,
"learning_rate": 0.0002,
"loss": 0.3234,
"step": 4140
},
{
"epoch": 0.3416206450553286,
"grad_norm": 0.3348797559738159,
"learning_rate": 0.0002,
"loss": 0.3205,
"step": 4160
},
{
"epoch": 0.34326305200270996,
"grad_norm": 0.4265504777431488,
"learning_rate": 0.0002,
"loss": 0.3101,
"step": 4180
},
{
"epoch": 0.34490545895009134,
"grad_norm": 0.4005930423736572,
"learning_rate": 0.0002,
"loss": 0.3096,
"step": 4200
},
{
"epoch": 0.3465478658974728,
"grad_norm": 0.4154227674007416,
"learning_rate": 0.0002,
"loss": 0.3188,
"step": 4220
},
{
"epoch": 0.34819027284485415,
"grad_norm": 0.30359068512916565,
"learning_rate": 0.0002,
"loss": 0.2966,
"step": 4240
},
{
"epoch": 0.34983267979223553,
"grad_norm": 0.35363709926605225,
"learning_rate": 0.0002,
"loss": 0.3189,
"step": 4260
},
{
"epoch": 0.3514750867396169,
"grad_norm": 0.43156126141548157,
"learning_rate": 0.0002,
"loss": 0.2951,
"step": 4280
},
{
"epoch": 0.3531174936869983,
"grad_norm": 0.4593096077442169,
"learning_rate": 0.0002,
"loss": 0.3048,
"step": 4300
},
{
"epoch": 0.3547599006343797,
"grad_norm": 0.49352073669433594,
"learning_rate": 0.0002,
"loss": 0.301,
"step": 4320
},
{
"epoch": 0.35640230758176106,
"grad_norm": 0.4053367078304291,
"learning_rate": 0.0002,
"loss": 0.311,
"step": 4340
},
{
"epoch": 0.35804471452914244,
"grad_norm": 0.3465437889099121,
"learning_rate": 0.0002,
"loss": 0.3186,
"step": 4360
},
{
"epoch": 0.3596871214765238,
"grad_norm": 0.4525587558746338,
"learning_rate": 0.0002,
"loss": 0.3126,
"step": 4380
},
{
"epoch": 0.36132952842390526,
"grad_norm": 0.4213342070579529,
"learning_rate": 0.0002,
"loss": 0.3041,
"step": 4400
},
{
"epoch": 0.36297193537128664,
"grad_norm": 0.37421244382858276,
"learning_rate": 0.0002,
"loss": 0.3295,
"step": 4420
},
{
"epoch": 0.364614342318668,
"grad_norm": 0.4033282697200775,
"learning_rate": 0.0002,
"loss": 0.3031,
"step": 4440
},
{
"epoch": 0.3662567492660494,
"grad_norm": 0.45873841643333435,
"learning_rate": 0.0002,
"loss": 0.2819,
"step": 4460
},
{
"epoch": 0.3678991562134308,
"grad_norm": 0.36195841431617737,
"learning_rate": 0.0002,
"loss": 0.2908,
"step": 4480
},
{
"epoch": 0.36954156316081216,
"grad_norm": 0.39707615971565247,
"learning_rate": 0.0002,
"loss": 0.3023,
"step": 4500
},
{
"epoch": 0.37118397010819354,
"grad_norm": 0.3999727666378021,
"learning_rate": 0.0002,
"loss": 0.31,
"step": 4520
},
{
"epoch": 0.3728263770555749,
"grad_norm": 0.36880913376808167,
"learning_rate": 0.0002,
"loss": 0.3017,
"step": 4540
},
{
"epoch": 0.37446878400295636,
"grad_norm": 0.36656180024147034,
"learning_rate": 0.0002,
"loss": 0.3129,
"step": 4560
},
{
"epoch": 0.37611119095033774,
"grad_norm": 0.4566299021244049,
"learning_rate": 0.0002,
"loss": 0.3039,
"step": 4580
},
{
"epoch": 0.3777535978977191,
"grad_norm": 0.3202304542064667,
"learning_rate": 0.0002,
"loss": 0.2827,
"step": 4600
},
{
"epoch": 0.3793960048451005,
"grad_norm": 0.4553089439868927,
"learning_rate": 0.0002,
"loss": 0.3401,
"step": 4620
},
{
"epoch": 0.3810384117924819,
"grad_norm": 0.40536269545555115,
"learning_rate": 0.0002,
"loss": 0.3038,
"step": 4640
},
{
"epoch": 0.38268081873986326,
"grad_norm": 0.36675453186035156,
"learning_rate": 0.0002,
"loss": 0.3198,
"step": 4660
},
{
"epoch": 0.38432322568724464,
"grad_norm": 0.41660359501838684,
"learning_rate": 0.0002,
"loss": 0.2904,
"step": 4680
},
{
"epoch": 0.385965632634626,
"grad_norm": 0.2889881134033203,
"learning_rate": 0.0002,
"loss": 0.3076,
"step": 4700
},
{
"epoch": 0.38760803958200746,
"grad_norm": 0.3077252507209778,
"learning_rate": 0.0002,
"loss": 0.3087,
"step": 4720
},
{
"epoch": 0.38925044652938884,
"grad_norm": 0.43053752183914185,
"learning_rate": 0.0002,
"loss": 0.2994,
"step": 4740
},
{
"epoch": 0.3908928534767702,
"grad_norm": 0.39978402853012085,
"learning_rate": 0.0002,
"loss": 0.2825,
"step": 4760
},
{
"epoch": 0.3925352604241516,
"grad_norm": 0.39721283316612244,
"learning_rate": 0.0002,
"loss": 0.3002,
"step": 4780
},
{
"epoch": 0.394177667371533,
"grad_norm": 0.4234716296195984,
"learning_rate": 0.0002,
"loss": 0.281,
"step": 4800
},
{
"epoch": 0.39582007431891436,
"grad_norm": 0.41390299797058105,
"learning_rate": 0.0002,
"loss": 0.3015,
"step": 4820
},
{
"epoch": 0.39746248126629574,
"grad_norm": 0.8412930369377136,
"learning_rate": 0.0002,
"loss": 0.3034,
"step": 4840
},
{
"epoch": 0.3991048882136771,
"grad_norm": 0.4165583848953247,
"learning_rate": 0.0002,
"loss": 0.2844,
"step": 4860
},
{
"epoch": 0.40074729516105856,
"grad_norm": 0.4212113618850708,
"learning_rate": 0.0002,
"loss": 0.2847,
"step": 4880
},
{
"epoch": 0.40238970210843994,
"grad_norm": 0.46880143880844116,
"learning_rate": 0.0002,
"loss": 0.2877,
"step": 4900
},
{
"epoch": 0.4040321090558213,
"grad_norm": 0.33470281958580017,
"learning_rate": 0.0002,
"loss": 0.3006,
"step": 4920
},
{
"epoch": 0.4056745160032027,
"grad_norm": 0.41939905285835266,
"learning_rate": 0.0002,
"loss": 0.3014,
"step": 4940
},
{
"epoch": 0.4073169229505841,
"grad_norm": 0.4031718671321869,
"learning_rate": 0.0002,
"loss": 0.2959,
"step": 4960
},
{
"epoch": 0.40895932989796546,
"grad_norm": 0.3611488938331604,
"learning_rate": 0.0002,
"loss": 0.3175,
"step": 4980
},
{
"epoch": 0.41060173684534684,
"grad_norm": 0.38445645570755005,
"learning_rate": 0.0002,
"loss": 0.2897,
"step": 5000
},
{
"epoch": 0.4122441437927282,
"grad_norm": 0.3903651833534241,
"learning_rate": 0.0002,
"loss": 0.2716,
"step": 5020
},
{
"epoch": 0.4138865507401096,
"grad_norm": 0.39842015504837036,
"learning_rate": 0.0002,
"loss": 0.2987,
"step": 5040
},
{
"epoch": 0.41552895768749104,
"grad_norm": 0.4211498200893402,
"learning_rate": 0.0002,
"loss": 0.3027,
"step": 5060
},
{
"epoch": 0.4171713646348724,
"grad_norm": 0.4767220914363861,
"learning_rate": 0.0002,
"loss": 0.2897,
"step": 5080
},
{
"epoch": 0.4188137715822538,
"grad_norm": 0.4871378242969513,
"learning_rate": 0.0002,
"loss": 0.2874,
"step": 5100
},
{
"epoch": 0.4204561785296352,
"grad_norm": 0.3960734009742737,
"learning_rate": 0.0002,
"loss": 0.2903,
"step": 5120
},
{
"epoch": 0.42209858547701656,
"grad_norm": 0.3350552022457123,
"learning_rate": 0.0002,
"loss": 0.2835,
"step": 5140
},
{
"epoch": 0.42374099242439794,
"grad_norm": 0.34975695610046387,
"learning_rate": 0.0002,
"loss": 0.3025,
"step": 5160
},
{
"epoch": 0.4253833993717793,
"grad_norm": 0.3886794149875641,
"learning_rate": 0.0002,
"loss": 0.289,
"step": 5180
},
{
"epoch": 0.4270258063191607,
"grad_norm": 0.4114588797092438,
"learning_rate": 0.0002,
"loss": 0.2802,
"step": 5200
},
{
"epoch": 0.42866821326654214,
"grad_norm": 0.4368172585964203,
"learning_rate": 0.0002,
"loss": 0.2918,
"step": 5220
},
{
"epoch": 0.4303106202139235,
"grad_norm": 0.2889314889907837,
"learning_rate": 0.0002,
"loss": 0.2854,
"step": 5240
},
{
"epoch": 0.4319530271613049,
"grad_norm": 0.3999134600162506,
"learning_rate": 0.0002,
"loss": 0.2955,
"step": 5260
},
{
"epoch": 0.4335954341086863,
"grad_norm": 0.32143938541412354,
"learning_rate": 0.0002,
"loss": 0.2836,
"step": 5280
},
{
"epoch": 0.43523784105606766,
"grad_norm": 0.4069638252258301,
"learning_rate": 0.0002,
"loss": 0.2854,
"step": 5300
},
{
"epoch": 0.43688024800344905,
"grad_norm": 0.46609416604042053,
"learning_rate": 0.0002,
"loss": 0.2777,
"step": 5320
},
{
"epoch": 0.4385226549508304,
"grad_norm": 0.35112160444259644,
"learning_rate": 0.0002,
"loss": 0.2896,
"step": 5340
},
{
"epoch": 0.4401650618982118,
"grad_norm": 0.4243420660495758,
"learning_rate": 0.0002,
"loss": 0.2743,
"step": 5360
},
{
"epoch": 0.44180746884559324,
"grad_norm": 0.45615971088409424,
"learning_rate": 0.0002,
"loss": 0.2699,
"step": 5380
},
{
"epoch": 0.4434498757929746,
"grad_norm": 0.4836295247077942,
"learning_rate": 0.0002,
"loss": 0.2932,
"step": 5400
},
{
"epoch": 0.445092282740356,
"grad_norm": 0.41774359345436096,
"learning_rate": 0.0002,
"loss": 0.2869,
"step": 5420
},
{
"epoch": 0.4467346896877374,
"grad_norm": 0.3904239535331726,
"learning_rate": 0.0002,
"loss": 0.2798,
"step": 5440
},
{
"epoch": 0.44837709663511877,
"grad_norm": 0.3867247700691223,
"learning_rate": 0.0002,
"loss": 0.2668,
"step": 5460
},
{
"epoch": 0.45001950358250015,
"grad_norm": 0.33975329995155334,
"learning_rate": 0.0002,
"loss": 0.2805,
"step": 5480
},
{
"epoch": 0.4516619105298815,
"grad_norm": 0.30403727293014526,
"learning_rate": 0.0002,
"loss": 0.2747,
"step": 5500
},
{
"epoch": 0.4533043174772629,
"grad_norm": 0.4227672219276428,
"learning_rate": 0.0002,
"loss": 0.2699,
"step": 5520
},
{
"epoch": 0.4549467244246443,
"grad_norm": 0.38823801279067993,
"learning_rate": 0.0002,
"loss": 0.256,
"step": 5540
},
{
"epoch": 0.4565891313720257,
"grad_norm": 0.3460341691970825,
"learning_rate": 0.0002,
"loss": 0.2768,
"step": 5560
},
{
"epoch": 0.4582315383194071,
"grad_norm": 0.40843436121940613,
"learning_rate": 0.0002,
"loss": 0.2829,
"step": 5580
},
{
"epoch": 0.4598739452667885,
"grad_norm": 0.411004900932312,
"learning_rate": 0.0002,
"loss": 0.2849,
"step": 5600
},
{
"epoch": 0.46151635221416987,
"grad_norm": 0.5354210138320923,
"learning_rate": 0.0002,
"loss": 0.298,
"step": 5620
},
{
"epoch": 0.46315875916155125,
"grad_norm": 0.3296845555305481,
"learning_rate": 0.0002,
"loss": 0.2571,
"step": 5640
},
{
"epoch": 0.46480116610893263,
"grad_norm": 0.404950350522995,
"learning_rate": 0.0002,
"loss": 0.2843,
"step": 5660
},
{
"epoch": 0.466443573056314,
"grad_norm": 0.3697005808353424,
"learning_rate": 0.0002,
"loss": 0.2655,
"step": 5680
},
{
"epoch": 0.4680859800036954,
"grad_norm": 0.3465549945831299,
"learning_rate": 0.0002,
"loss": 0.282,
"step": 5700
},
{
"epoch": 0.4697283869510768,
"grad_norm": 0.4802212119102478,
"learning_rate": 0.0002,
"loss": 0.2672,
"step": 5720
},
{
"epoch": 0.4713707938984582,
"grad_norm": 0.3909721076488495,
"learning_rate": 0.0002,
"loss": 0.2704,
"step": 5740
},
{
"epoch": 0.4730132008458396,
"grad_norm": 0.41303369402885437,
"learning_rate": 0.0002,
"loss": 0.2797,
"step": 5760
},
{
"epoch": 0.47465560779322097,
"grad_norm": 0.32934170961380005,
"learning_rate": 0.0002,
"loss": 0.2903,
"step": 5780
},
{
"epoch": 0.47629801474060235,
"grad_norm": 0.375072181224823,
"learning_rate": 0.0002,
"loss": 0.2752,
"step": 5800
},
{
"epoch": 0.47794042168798373,
"grad_norm": 0.35390418767929077,
"learning_rate": 0.0002,
"loss": 0.2755,
"step": 5820
},
{
"epoch": 0.4795828286353651,
"grad_norm": 0.3856378197669983,
"learning_rate": 0.0002,
"loss": 0.2699,
"step": 5840
},
{
"epoch": 0.4812252355827465,
"grad_norm": 0.2624310851097107,
"learning_rate": 0.0002,
"loss": 0.2654,
"step": 5860
},
{
"epoch": 0.4828676425301279,
"grad_norm": 0.43709930777549744,
"learning_rate": 0.0002,
"loss": 0.2768,
"step": 5880
},
{
"epoch": 0.4845100494775093,
"grad_norm": 0.3971209228038788,
"learning_rate": 0.0002,
"loss": 0.2728,
"step": 5900
},
{
"epoch": 0.4861524564248907,
"grad_norm": 0.3937450647354126,
"learning_rate": 0.0002,
"loss": 0.2836,
"step": 5920
},
{
"epoch": 0.48779486337227207,
"grad_norm": 0.3925333023071289,
"learning_rate": 0.0002,
"loss": 0.2653,
"step": 5940
},
{
"epoch": 0.48943727031965345,
"grad_norm": 0.3056396245956421,
"learning_rate": 0.0002,
"loss": 0.2593,
"step": 5960
},
{
"epoch": 0.49107967726703483,
"grad_norm": 0.349110871553421,
"learning_rate": 0.0002,
"loss": 0.2872,
"step": 5980
},
{
"epoch": 0.4927220842144162,
"grad_norm": 0.37678685784339905,
"learning_rate": 0.0002,
"loss": 0.2779,
"step": 6000
},
{
"epoch": 0.4943644911617976,
"grad_norm": 0.37364938855171204,
"learning_rate": 0.0002,
"loss": 0.2612,
"step": 6020
},
{
"epoch": 0.49600689810917903,
"grad_norm": 0.3885985016822815,
"learning_rate": 0.0002,
"loss": 0.2701,
"step": 6040
},
{
"epoch": 0.4976493050565604,
"grad_norm": 0.4726998507976532,
"learning_rate": 0.0002,
"loss": 0.258,
"step": 6060
},
{
"epoch": 0.4992917120039418,
"grad_norm": 0.3752720355987549,
"learning_rate": 0.0002,
"loss": 0.2873,
"step": 6080
},
{
"epoch": 0.5009341189513231,
"grad_norm": 0.5174003839492798,
"learning_rate": 0.0002,
"loss": 0.2677,
"step": 6100
},
{
"epoch": 0.5025765258987046,
"grad_norm": 0.39343810081481934,
"learning_rate": 0.0002,
"loss": 0.2498,
"step": 6120
},
{
"epoch": 0.504218932846086,
"grad_norm": 0.3367049992084503,
"learning_rate": 0.0002,
"loss": 0.2555,
"step": 6140
},
{
"epoch": 0.5058613397934674,
"grad_norm": 0.3384205400943756,
"learning_rate": 0.0002,
"loss": 0.2865,
"step": 6160
},
{
"epoch": 0.5075037467408487,
"grad_norm": 0.37642723321914673,
"learning_rate": 0.0002,
"loss": 0.2677,
"step": 6180
},
{
"epoch": 0.5091461536882301,
"grad_norm": 0.31989771127700806,
"learning_rate": 0.0002,
"loss": 0.2675,
"step": 6200
},
{
"epoch": 0.5107885606356115,
"grad_norm": 0.30809977650642395,
"learning_rate": 0.0002,
"loss": 0.2562,
"step": 6220
},
{
"epoch": 0.5124309675829929,
"grad_norm": 0.3463954031467438,
"learning_rate": 0.0002,
"loss": 0.2576,
"step": 6240
},
{
"epoch": 0.5140733745303743,
"grad_norm": 0.3789072036743164,
"learning_rate": 0.0002,
"loss": 0.2679,
"step": 6260
},
{
"epoch": 0.5157157814777557,
"grad_norm": 0.458978533744812,
"learning_rate": 0.0002,
"loss": 0.2596,
"step": 6280
},
{
"epoch": 0.517358188425137,
"grad_norm": 0.3515280783176422,
"learning_rate": 0.0002,
"loss": 0.2629,
"step": 6300
},
{
"epoch": 0.5190005953725184,
"grad_norm": 0.42611977458000183,
"learning_rate": 0.0002,
"loss": 0.2674,
"step": 6320
},
{
"epoch": 0.5206430023198998,
"grad_norm": 0.3865070641040802,
"learning_rate": 0.0002,
"loss": 0.2714,
"step": 6340
},
{
"epoch": 0.5222854092672812,
"grad_norm": 0.3559401333332062,
"learning_rate": 0.0002,
"loss": 0.2751,
"step": 6360
},
{
"epoch": 0.5239278162146626,
"grad_norm": 0.3181537389755249,
"learning_rate": 0.0002,
"loss": 0.2724,
"step": 6380
},
{
"epoch": 0.5255702231620439,
"grad_norm": 0.37673598527908325,
"learning_rate": 0.0002,
"loss": 0.2711,
"step": 6400
},
{
"epoch": 0.5272126301094253,
"grad_norm": 0.44122573733329773,
"learning_rate": 0.0002,
"loss": 0.2617,
"step": 6420
},
{
"epoch": 0.5288550370568067,
"grad_norm": 0.4779141843318939,
"learning_rate": 0.0002,
"loss": 0.2602,
"step": 6440
},
{
"epoch": 0.5304974440041882,
"grad_norm": 0.3975127339363098,
"learning_rate": 0.0002,
"loss": 0.2472,
"step": 6460
},
{
"epoch": 0.5321398509515696,
"grad_norm": 0.3808406591415405,
"learning_rate": 0.0002,
"loss": 0.2623,
"step": 6480
},
{
"epoch": 0.533782257898951,
"grad_norm": 0.340666264295578,
"learning_rate": 0.0002,
"loss": 0.2806,
"step": 6500
},
{
"epoch": 0.5354246648463323,
"grad_norm": 0.41233885288238525,
"learning_rate": 0.0002,
"loss": 0.2458,
"step": 6520
},
{
"epoch": 0.5370670717937137,
"grad_norm": 0.28576114773750305,
"learning_rate": 0.0002,
"loss": 0.2638,
"step": 6540
},
{
"epoch": 0.5387094787410951,
"grad_norm": 0.4704492688179016,
"learning_rate": 0.0002,
"loss": 0.2735,
"step": 6560
},
{
"epoch": 0.5403518856884765,
"grad_norm": 0.43339604139328003,
"learning_rate": 0.0002,
"loss": 0.2667,
"step": 6580
},
{
"epoch": 0.5419942926358579,
"grad_norm": 0.332878440618515,
"learning_rate": 0.0002,
"loss": 0.2513,
"step": 6600
},
{
"epoch": 0.5436366995832392,
"grad_norm": 0.34620800614356995,
"learning_rate": 0.0002,
"loss": 0.2768,
"step": 6620
},
{
"epoch": 0.5452791065306206,
"grad_norm": 0.46673691272735596,
"learning_rate": 0.0002,
"loss": 0.2597,
"step": 6640
},
{
"epoch": 0.546921513478002,
"grad_norm": 0.36888402700424194,
"learning_rate": 0.0002,
"loss": 0.2453,
"step": 6660
},
{
"epoch": 0.5485639204253834,
"grad_norm": 0.363007515668869,
"learning_rate": 0.0002,
"loss": 0.2545,
"step": 6680
},
{
"epoch": 0.5502063273727648,
"grad_norm": 0.3927077353000641,
"learning_rate": 0.0002,
"loss": 0.2597,
"step": 6700
},
{
"epoch": 0.5518487343201461,
"grad_norm": 0.36897674202919006,
"learning_rate": 0.0002,
"loss": 0.2571,
"step": 6720
},
{
"epoch": 0.5534911412675275,
"grad_norm": 0.3425733149051666,
"learning_rate": 0.0002,
"loss": 0.2624,
"step": 6740
},
{
"epoch": 0.5551335482149089,
"grad_norm": 0.3315962553024292,
"learning_rate": 0.0002,
"loss": 0.2656,
"step": 6760
},
{
"epoch": 0.5567759551622903,
"grad_norm": 0.4456098675727844,
"learning_rate": 0.0002,
"loss": 0.266,
"step": 6780
},
{
"epoch": 0.5584183621096718,
"grad_norm": 0.4146248996257782,
"learning_rate": 0.0002,
"loss": 0.2631,
"step": 6800
},
{
"epoch": 0.5600607690570532,
"grad_norm": 0.3591421842575073,
"learning_rate": 0.0002,
"loss": 0.2475,
"step": 6820
},
{
"epoch": 0.5617031760044345,
"grad_norm": 0.4540598690509796,
"learning_rate": 0.0002,
"loss": 0.2667,
"step": 6840
},
{
"epoch": 0.5633455829518159,
"grad_norm": 0.4394567906856537,
"learning_rate": 0.0002,
"loss": 0.2673,
"step": 6860
},
{
"epoch": 0.5649879898991973,
"grad_norm": 0.3273297846317291,
"learning_rate": 0.0002,
"loss": 0.2631,
"step": 6880
},
{
"epoch": 0.5666303968465787,
"grad_norm": 0.3828592896461487,
"learning_rate": 0.0002,
"loss": 0.2601,
"step": 6900
},
{
"epoch": 0.5682728037939601,
"grad_norm": 0.24124163389205933,
"learning_rate": 0.0002,
"loss": 0.2507,
"step": 6920
},
{
"epoch": 0.5699152107413414,
"grad_norm": 0.4403514564037323,
"learning_rate": 0.0002,
"loss": 0.2686,
"step": 6940
},
{
"epoch": 0.5715576176887228,
"grad_norm": 0.39177918434143066,
"learning_rate": 0.0002,
"loss": 0.255,
"step": 6960
},
{
"epoch": 0.5732000246361042,
"grad_norm": 0.41621333360671997,
"learning_rate": 0.0002,
"loss": 0.2472,
"step": 6980
},
{
"epoch": 0.5748424315834856,
"grad_norm": 0.4051215648651123,
"learning_rate": 0.0002,
"loss": 0.2692,
"step": 7000
},
{
"epoch": 0.576484838530867,
"grad_norm": 0.9351252317428589,
"learning_rate": 0.0002,
"loss": 0.2519,
"step": 7020
},
{
"epoch": 0.5781272454782483,
"grad_norm": 0.38004037737846375,
"learning_rate": 0.0002,
"loss": 0.2683,
"step": 7040
},
{
"epoch": 0.5797696524256297,
"grad_norm": 0.31271103024482727,
"learning_rate": 0.0002,
"loss": 0.2554,
"step": 7060
},
{
"epoch": 0.5814120593730111,
"grad_norm": 0.3766959607601166,
"learning_rate": 0.0002,
"loss": 0.2555,
"step": 7080
},
{
"epoch": 0.5830544663203925,
"grad_norm": 2.4575226306915283,
"learning_rate": 0.0002,
"loss": 0.2673,
"step": 7100
},
{
"epoch": 0.584696873267774,
"grad_norm": 0.3419061005115509,
"learning_rate": 0.0002,
"loss": 0.2484,
"step": 7120
},
{
"epoch": 0.5863392802151554,
"grad_norm": 0.3647725284099579,
"learning_rate": 0.0002,
"loss": 0.2614,
"step": 7140
},
{
"epoch": 0.5879816871625367,
"grad_norm": 0.39643993973731995,
"learning_rate": 0.0002,
"loss": 0.2583,
"step": 7160
},
{
"epoch": 0.5896240941099181,
"grad_norm": 0.37024736404418945,
"learning_rate": 0.0002,
"loss": 0.2605,
"step": 7180
},
{
"epoch": 0.5912665010572995,
"grad_norm": 0.4551810324192047,
"learning_rate": 0.0002,
"loss": 0.2512,
"step": 7200
},
{
"epoch": 0.5929089080046809,
"grad_norm": 0.2843814492225647,
"learning_rate": 0.0002,
"loss": 0.2504,
"step": 7220
},
{
"epoch": 0.5945513149520623,
"grad_norm": 0.3765452206134796,
"learning_rate": 0.0002,
"loss": 0.2557,
"step": 7240
},
{
"epoch": 0.5961937218994436,
"grad_norm": 0.4625066816806793,
"learning_rate": 0.0002,
"loss": 0.2433,
"step": 7260
},
{
"epoch": 0.597836128846825,
"grad_norm": 0.4870743453502655,
"learning_rate": 0.0002,
"loss": 0.2494,
"step": 7280
},
{
"epoch": 0.5994785357942064,
"grad_norm": 0.4229605197906494,
"learning_rate": 0.0002,
"loss": 0.2553,
"step": 7300
},
{
"epoch": 0.6011209427415878,
"grad_norm": 0.37593892216682434,
"learning_rate": 0.0002,
"loss": 0.2523,
"step": 7320
},
{
"epoch": 0.6027633496889692,
"grad_norm": 0.36149609088897705,
"learning_rate": 0.0002,
"loss": 0.2582,
"step": 7340
},
{
"epoch": 0.6044057566363505,
"grad_norm": 0.3866046071052551,
"learning_rate": 0.0002,
"loss": 0.2534,
"step": 7360
},
{
"epoch": 0.6060481635837319,
"grad_norm": 0.4623259902000427,
"learning_rate": 0.0002,
"loss": 0.2542,
"step": 7380
},
{
"epoch": 0.6076905705311133,
"grad_norm": 0.32349276542663574,
"learning_rate": 0.0002,
"loss": 0.2437,
"step": 7400
},
{
"epoch": 0.6093329774784947,
"grad_norm": 0.386561781167984,
"learning_rate": 0.0002,
"loss": 0.2494,
"step": 7420
},
{
"epoch": 0.6109753844258761,
"grad_norm": 0.36509180068969727,
"learning_rate": 0.0002,
"loss": 0.2559,
"step": 7440
},
{
"epoch": 0.6126177913732576,
"grad_norm": 0.3628571331501007,
"learning_rate": 0.0002,
"loss": 0.26,
"step": 7460
},
{
"epoch": 0.6142601983206389,
"grad_norm": 0.3218732476234436,
"learning_rate": 0.0002,
"loss": 0.2487,
"step": 7480
},
{
"epoch": 0.6159026052680203,
"grad_norm": 0.3551442623138428,
"learning_rate": 0.0002,
"loss": 0.231,
"step": 7500
},
{
"epoch": 0.6175450122154017,
"grad_norm": 0.40962496399879456,
"learning_rate": 0.0002,
"loss": 0.2486,
"step": 7520
},
{
"epoch": 0.6191874191627831,
"grad_norm": 0.48531442880630493,
"learning_rate": 0.0002,
"loss": 0.2547,
"step": 7540
},
{
"epoch": 0.6208298261101645,
"grad_norm": 0.387851357460022,
"learning_rate": 0.0002,
"loss": 0.2655,
"step": 7560
},
{
"epoch": 0.6224722330575458,
"grad_norm": 0.3165546953678131,
"learning_rate": 0.0002,
"loss": 0.2499,
"step": 7580
},
{
"epoch": 0.6241146400049272,
"grad_norm": 0.3393017649650574,
"learning_rate": 0.0002,
"loss": 0.2546,
"step": 7600
},
{
"epoch": 0.6257570469523086,
"grad_norm": 0.3975006639957428,
"learning_rate": 0.0002,
"loss": 0.255,
"step": 7620
},
{
"epoch": 0.62739945389969,
"grad_norm": 0.4458036720752716,
"learning_rate": 0.0002,
"loss": 0.2671,
"step": 7640
},
{
"epoch": 0.6290418608470714,
"grad_norm": 0.34977594017982483,
"learning_rate": 0.0002,
"loss": 0.2438,
"step": 7660
},
{
"epoch": 0.6306842677944527,
"grad_norm": 0.4126521646976471,
"learning_rate": 0.0002,
"loss": 0.2473,
"step": 7680
},
{
"epoch": 0.6323266747418341,
"grad_norm": 0.35712817311286926,
"learning_rate": 0.0002,
"loss": 0.2568,
"step": 7700
},
{
"epoch": 0.6339690816892155,
"grad_norm": 0.3464488983154297,
"learning_rate": 0.0002,
"loss": 0.26,
"step": 7720
},
{
"epoch": 0.6356114886365969,
"grad_norm": 0.40559422969818115,
"learning_rate": 0.0002,
"loss": 0.2531,
"step": 7740
},
{
"epoch": 0.6372538955839783,
"grad_norm": 0.3709222972393036,
"learning_rate": 0.0002,
"loss": 0.257,
"step": 7760
},
{
"epoch": 0.6388963025313598,
"grad_norm": 0.3671443462371826,
"learning_rate": 0.0002,
"loss": 0.243,
"step": 7780
},
{
"epoch": 0.6405387094787411,
"grad_norm": 0.39361605048179626,
"learning_rate": 0.0002,
"loss": 0.2569,
"step": 7800
},
{
"epoch": 0.6421811164261225,
"grad_norm": 0.41323602199554443,
"learning_rate": 0.0002,
"loss": 0.2465,
"step": 7820
},
{
"epoch": 0.6438235233735039,
"grad_norm": 0.4266330301761627,
"learning_rate": 0.0002,
"loss": 0.2495,
"step": 7840
},
{
"epoch": 0.6454659303208853,
"grad_norm": 0.3892604112625122,
"learning_rate": 0.0002,
"loss": 0.2505,
"step": 7860
},
{
"epoch": 0.6471083372682667,
"grad_norm": 0.43539443612098694,
"learning_rate": 0.0002,
"loss": 0.2643,
"step": 7880
},
{
"epoch": 0.648750744215648,
"grad_norm": 0.3637757897377014,
"learning_rate": 0.0002,
"loss": 0.2557,
"step": 7900
},
{
"epoch": 0.6503931511630294,
"grad_norm": 0.42761602997779846,
"learning_rate": 0.0002,
"loss": 0.2578,
"step": 7920
},
{
"epoch": 0.6520355581104108,
"grad_norm": 0.38917163014411926,
"learning_rate": 0.0002,
"loss": 0.2593,
"step": 7940
},
{
"epoch": 0.6536779650577922,
"grad_norm": 0.42814767360687256,
"learning_rate": 0.0002,
"loss": 0.2412,
"step": 7960
},
{
"epoch": 0.6553203720051736,
"grad_norm": 0.3543958365917206,
"learning_rate": 0.0002,
"loss": 0.2485,
"step": 7980
},
{
"epoch": 0.656962778952555,
"grad_norm": 0.3452099859714508,
"learning_rate": 0.0002,
"loss": 0.2519,
"step": 8000
},
{
"epoch": 0.6586051858999363,
"grad_norm": 0.38600897789001465,
"learning_rate": 0.0002,
"loss": 0.2443,
"step": 8020
},
{
"epoch": 0.6602475928473177,
"grad_norm": 0.35474061965942383,
"learning_rate": 0.0002,
"loss": 0.2435,
"step": 8040
},
{
"epoch": 0.6618899997946991,
"grad_norm": 0.48493891954421997,
"learning_rate": 0.0002,
"loss": 0.2564,
"step": 8060
},
{
"epoch": 0.6635324067420805,
"grad_norm": 0.40137720108032227,
"learning_rate": 0.0002,
"loss": 0.2592,
"step": 8080
},
{
"epoch": 0.6651748136894619,
"grad_norm": 0.38460877537727356,
"learning_rate": 0.0002,
"loss": 0.2387,
"step": 8100
},
{
"epoch": 0.6668172206368433,
"grad_norm": 0.3780753016471863,
"learning_rate": 0.0002,
"loss": 0.2517,
"step": 8120
},
{
"epoch": 0.6684596275842247,
"grad_norm": 0.30384665727615356,
"learning_rate": 0.0002,
"loss": 0.2442,
"step": 8140
},
{
"epoch": 0.6701020345316061,
"grad_norm": 0.34080567955970764,
"learning_rate": 0.0002,
"loss": 0.2443,
"step": 8160
},
{
"epoch": 0.6717444414789875,
"grad_norm": 0.3789510130882263,
"learning_rate": 0.0002,
"loss": 0.2462,
"step": 8180
},
{
"epoch": 0.6733868484263689,
"grad_norm": 0.3566538989543915,
"learning_rate": 0.0002,
"loss": 0.2418,
"step": 8200
},
{
"epoch": 0.6750292553737502,
"grad_norm": 0.3436945676803589,
"learning_rate": 0.0002,
"loss": 0.2353,
"step": 8220
},
{
"epoch": 0.6766716623211316,
"grad_norm": 0.35046547651290894,
"learning_rate": 0.0002,
"loss": 0.2521,
"step": 8240
},
{
"epoch": 0.678314069268513,
"grad_norm": 0.3671397566795349,
"learning_rate": 0.0002,
"loss": 0.2505,
"step": 8260
},
{
"epoch": 0.6799564762158944,
"grad_norm": 0.33368802070617676,
"learning_rate": 0.0002,
"loss": 0.2663,
"step": 8280
},
{
"epoch": 0.6815988831632758,
"grad_norm": 0.35810762643814087,
"learning_rate": 0.0002,
"loss": 0.2467,
"step": 8300
},
{
"epoch": 0.6832412901106572,
"grad_norm": 0.3913412094116211,
"learning_rate": 0.0002,
"loss": 0.2544,
"step": 8320
},
{
"epoch": 0.6848836970580385,
"grad_norm": 0.3313830494880676,
"learning_rate": 0.0002,
"loss": 0.2551,
"step": 8340
},
{
"epoch": 0.6865261040054199,
"grad_norm": 0.3506488502025604,
"learning_rate": 0.0002,
"loss": 0.2416,
"step": 8360
},
{
"epoch": 0.6881685109528013,
"grad_norm": 0.3841126561164856,
"learning_rate": 0.0002,
"loss": 0.2531,
"step": 8380
},
{
"epoch": 0.6898109179001827,
"grad_norm": 0.38030919432640076,
"learning_rate": 0.0002,
"loss": 0.2374,
"step": 8400
},
{
"epoch": 0.691453324847564,
"grad_norm": 0.3643128573894501,
"learning_rate": 0.0002,
"loss": 0.2616,
"step": 8420
},
{
"epoch": 0.6930957317949455,
"grad_norm": 0.37401241064071655,
"learning_rate": 0.0002,
"loss": 0.2424,
"step": 8440
},
{
"epoch": 0.6947381387423269,
"grad_norm": 0.42304474115371704,
"learning_rate": 0.0002,
"loss": 0.2491,
"step": 8460
},
{
"epoch": 0.6963805456897083,
"grad_norm": 0.3441920280456543,
"learning_rate": 0.0002,
"loss": 0.2429,
"step": 8480
},
{
"epoch": 0.6980229526370897,
"grad_norm": 0.33383867144584656,
"learning_rate": 0.0002,
"loss": 0.2361,
"step": 8500
},
{
"epoch": 0.6996653595844711,
"grad_norm": 0.42935657501220703,
"learning_rate": 0.0002,
"loss": 0.2598,
"step": 8520
},
{
"epoch": 0.7013077665318525,
"grad_norm": 0.5143205523490906,
"learning_rate": 0.0002,
"loss": 0.2348,
"step": 8540
},
{
"epoch": 0.7029501734792338,
"grad_norm": 0.37915435433387756,
"learning_rate": 0.0002,
"loss": 0.2277,
"step": 8560
},
{
"epoch": 0.7045925804266152,
"grad_norm": 0.3202255666255951,
"learning_rate": 0.0002,
"loss": 0.2474,
"step": 8580
},
{
"epoch": 0.7062349873739966,
"grad_norm": 0.3681676387786865,
"learning_rate": 0.0002,
"loss": 0.2417,
"step": 8600
},
{
"epoch": 0.707877394321378,
"grad_norm": 0.41214585304260254,
"learning_rate": 0.0002,
"loss": 0.2356,
"step": 8620
},
{
"epoch": 0.7095198012687594,
"grad_norm": 0.35259029269218445,
"learning_rate": 0.0002,
"loss": 0.2394,
"step": 8640
},
{
"epoch": 0.7111622082161407,
"grad_norm": 0.47768017649650574,
"learning_rate": 0.0002,
"loss": 0.248,
"step": 8660
},
{
"epoch": 0.7128046151635221,
"grad_norm": 0.3282839059829712,
"learning_rate": 0.0002,
"loss": 0.2336,
"step": 8680
},
{
"epoch": 0.7144470221109035,
"grad_norm": 0.441099613904953,
"learning_rate": 0.0002,
"loss": 0.2631,
"step": 8700
},
{
"epoch": 0.7160894290582849,
"grad_norm": 0.3486292362213135,
"learning_rate": 0.0002,
"loss": 0.2531,
"step": 8720
},
{
"epoch": 0.7177318360056663,
"grad_norm": 0.33037880063056946,
"learning_rate": 0.0002,
"loss": 0.2405,
"step": 8740
},
{
"epoch": 0.7193742429530476,
"grad_norm": 0.47114354372024536,
"learning_rate": 0.0002,
"loss": 0.2665,
"step": 8760
},
{
"epoch": 0.7210166499004291,
"grad_norm": 0.34797531366348267,
"learning_rate": 0.0002,
"loss": 0.2481,
"step": 8780
},
{
"epoch": 0.7226590568478105,
"grad_norm": 0.43183642625808716,
"learning_rate": 0.0002,
"loss": 0.242,
"step": 8800
},
{
"epoch": 0.7243014637951919,
"grad_norm": 0.4230342507362366,
"learning_rate": 0.0002,
"loss": 0.2363,
"step": 8820
},
{
"epoch": 0.7259438707425733,
"grad_norm": 0.40553364157676697,
"learning_rate": 0.0002,
"loss": 0.2422,
"step": 8840
},
{
"epoch": 0.7275862776899547,
"grad_norm": 0.34155145287513733,
"learning_rate": 0.0002,
"loss": 0.2422,
"step": 8860
},
{
"epoch": 0.729228684637336,
"grad_norm": 0.4095294773578644,
"learning_rate": 0.0002,
"loss": 0.2605,
"step": 8880
},
{
"epoch": 0.7308710915847174,
"grad_norm": 0.36541318893432617,
"learning_rate": 0.0002,
"loss": 0.2516,
"step": 8900
},
{
"epoch": 0.7325134985320988,
"grad_norm": 0.40149998664855957,
"learning_rate": 0.0002,
"loss": 0.2515,
"step": 8920
},
{
"epoch": 0.7341559054794802,
"grad_norm": 0.3220469653606415,
"learning_rate": 0.0002,
"loss": 0.2361,
"step": 8940
},
{
"epoch": 0.7357983124268616,
"grad_norm": 0.3153376579284668,
"learning_rate": 0.0002,
"loss": 0.2325,
"step": 8960
},
{
"epoch": 0.7374407193742429,
"grad_norm": 0.3046116530895233,
"learning_rate": 0.0002,
"loss": 0.2502,
"step": 8980
},
{
"epoch": 0.7390831263216243,
"grad_norm": 0.502663791179657,
"learning_rate": 0.0002,
"loss": 0.2471,
"step": 9000
},
{
"epoch": 0.7407255332690057,
"grad_norm": 0.35168886184692383,
"learning_rate": 0.0002,
"loss": 0.2309,
"step": 9020
},
{
"epoch": 0.7423679402163871,
"grad_norm": 0.43629148602485657,
"learning_rate": 0.0002,
"loss": 0.2423,
"step": 9040
},
{
"epoch": 0.7440103471637685,
"grad_norm": 0.35909175872802734,
"learning_rate": 0.0002,
"loss": 0.2453,
"step": 9060
},
{
"epoch": 0.7456527541111498,
"grad_norm": 0.3052688539028168,
"learning_rate": 0.0002,
"loss": 0.2413,
"step": 9080
},
{
"epoch": 0.7472951610585313,
"grad_norm": 0.2708439230918884,
"learning_rate": 0.0002,
"loss": 0.2237,
"step": 9100
},
{
"epoch": 0.7489375680059127,
"grad_norm": 0.3965560495853424,
"learning_rate": 0.0002,
"loss": 0.2423,
"step": 9120
},
{
"epoch": 0.7505799749532941,
"grad_norm": 0.3895662724971771,
"learning_rate": 0.0002,
"loss": 0.249,
"step": 9140
},
{
"epoch": 0.7522223819006755,
"grad_norm": 0.32124513387680054,
"learning_rate": 0.0002,
"loss": 0.2376,
"step": 9160
},
{
"epoch": 0.7538647888480569,
"grad_norm": 0.716029167175293,
"learning_rate": 0.0002,
"loss": 0.2529,
"step": 9180
},
{
"epoch": 0.7555071957954382,
"grad_norm": 0.3812948167324066,
"learning_rate": 0.0002,
"loss": 0.2269,
"step": 9200
},
{
"epoch": 0.7571496027428196,
"grad_norm": 0.37073054909706116,
"learning_rate": 0.0002,
"loss": 0.235,
"step": 9220
},
{
"epoch": 0.758792009690201,
"grad_norm": 0.4043092727661133,
"learning_rate": 0.0002,
"loss": 0.2345,
"step": 9240
},
{
"epoch": 0.7604344166375824,
"grad_norm": 0.3160434365272522,
"learning_rate": 0.0002,
"loss": 0.2412,
"step": 9260
},
{
"epoch": 0.7620768235849638,
"grad_norm": 0.35415521264076233,
"learning_rate": 0.0002,
"loss": 0.2358,
"step": 9280
},
{
"epoch": 0.7637192305323451,
"grad_norm": 0.41371211409568787,
"learning_rate": 0.0002,
"loss": 0.2317,
"step": 9300
},
{
"epoch": 0.7653616374797265,
"grad_norm": 0.4175126850605011,
"learning_rate": 0.0002,
"loss": 0.2547,
"step": 9320
},
{
"epoch": 0.7670040444271079,
"grad_norm": 0.39811649918556213,
"learning_rate": 0.0002,
"loss": 0.2462,
"step": 9340
},
{
"epoch": 0.7686464513744893,
"grad_norm": 0.33596447110176086,
"learning_rate": 0.0002,
"loss": 0.2368,
"step": 9360
},
{
"epoch": 0.7702888583218707,
"grad_norm": 0.36754104495048523,
"learning_rate": 0.0002,
"loss": 0.2484,
"step": 9380
},
{
"epoch": 0.771931265269252,
"grad_norm": 0.38244250416755676,
"learning_rate": 0.0002,
"loss": 0.2364,
"step": 9400
},
{
"epoch": 0.7735736722166334,
"grad_norm": 0.3366243839263916,
"learning_rate": 0.0002,
"loss": 0.2194,
"step": 9420
},
{
"epoch": 0.7752160791640149,
"grad_norm": 0.39877885580062866,
"learning_rate": 0.0002,
"loss": 0.2469,
"step": 9440
},
{
"epoch": 0.7768584861113963,
"grad_norm": 0.2690157890319824,
"learning_rate": 0.0002,
"loss": 0.2459,
"step": 9460
},
{
"epoch": 0.7785008930587777,
"grad_norm": 0.3678382337093353,
"learning_rate": 0.0002,
"loss": 0.2192,
"step": 9480
},
{
"epoch": 0.7801433000061591,
"grad_norm": 0.3121150732040405,
"learning_rate": 0.0002,
"loss": 0.2438,
"step": 9500
},
{
"epoch": 0.7817857069535404,
"grad_norm": 0.3517535626888275,
"learning_rate": 0.0002,
"loss": 0.2495,
"step": 9520
},
{
"epoch": 0.7834281139009218,
"grad_norm": 0.434817910194397,
"learning_rate": 0.0002,
"loss": 0.2532,
"step": 9540
},
{
"epoch": 0.7850705208483032,
"grad_norm": 0.35570958256721497,
"learning_rate": 0.0002,
"loss": 0.2467,
"step": 9560
},
{
"epoch": 0.7867129277956846,
"grad_norm": 0.4270517826080322,
"learning_rate": 0.0002,
"loss": 0.2337,
"step": 9580
},
{
"epoch": 0.788355334743066,
"grad_norm": 0.2827800214290619,
"learning_rate": 0.0002,
"loss": 0.2309,
"step": 9600
},
{
"epoch": 0.7899977416904473,
"grad_norm": 0.39158400893211365,
"learning_rate": 0.0002,
"loss": 0.2366,
"step": 9620
},
{
"epoch": 0.7916401486378287,
"grad_norm": 0.32538673281669617,
"learning_rate": 0.0002,
"loss": 0.2389,
"step": 9640
},
{
"epoch": 0.7932825555852101,
"grad_norm": 0.3370015323162079,
"learning_rate": 0.0002,
"loss": 0.2377,
"step": 9660
},
{
"epoch": 0.7949249625325915,
"grad_norm": 0.3779650032520294,
"learning_rate": 0.0002,
"loss": 0.2339,
"step": 9680
},
{
"epoch": 0.7965673694799729,
"grad_norm": 0.36034300923347473,
"learning_rate": 0.0002,
"loss": 0.2427,
"step": 9700
},
{
"epoch": 0.7982097764273542,
"grad_norm": 0.3154286742210388,
"learning_rate": 0.0002,
"loss": 0.2338,
"step": 9720
},
{
"epoch": 0.7998521833747356,
"grad_norm": 0.3282501697540283,
"learning_rate": 0.0002,
"loss": 0.2408,
"step": 9740
},
{
"epoch": 0.8014945903221171,
"grad_norm": 0.41291025280952454,
"learning_rate": 0.0002,
"loss": 0.2507,
"step": 9760
},
{
"epoch": 0.8031369972694985,
"grad_norm": 0.3961363136768341,
"learning_rate": 0.0002,
"loss": 0.2281,
"step": 9780
},
{
"epoch": 0.8047794042168799,
"grad_norm": 0.47485384345054626,
"learning_rate": 0.0002,
"loss": 0.2349,
"step": 9800
},
{
"epoch": 0.8064218111642613,
"grad_norm": 0.3284982740879059,
"learning_rate": 0.0002,
"loss": 0.2288,
"step": 9820
},
{
"epoch": 0.8080642181116426,
"grad_norm": 0.38867270946502686,
"learning_rate": 0.0002,
"loss": 0.2328,
"step": 9840
},
{
"epoch": 0.809706625059024,
"grad_norm": 0.44371268153190613,
"learning_rate": 0.0002,
"loss": 0.2416,
"step": 9860
},
{
"epoch": 0.8113490320064054,
"grad_norm": 0.2462434470653534,
"learning_rate": 0.0002,
"loss": 0.2391,
"step": 9880
},
{
"epoch": 0.8129914389537868,
"grad_norm": 0.31762421131134033,
"learning_rate": 0.0002,
"loss": 0.2467,
"step": 9900
},
{
"epoch": 0.8146338459011682,
"grad_norm": 0.40011724829673767,
"learning_rate": 0.0002,
"loss": 0.2351,
"step": 9920
},
{
"epoch": 0.8162762528485495,
"grad_norm": 0.2972090542316437,
"learning_rate": 0.0002,
"loss": 0.2469,
"step": 9940
},
{
"epoch": 0.8179186597959309,
"grad_norm": 0.4047238230705261,
"learning_rate": 0.0002,
"loss": 0.2257,
"step": 9960
},
{
"epoch": 0.8195610667433123,
"grad_norm": 0.36663326621055603,
"learning_rate": 0.0002,
"loss": 0.2302,
"step": 9980
},
{
"epoch": 0.8212034736906937,
"grad_norm": 0.49191904067993164,
"learning_rate": 0.0002,
"loss": 0.242,
"step": 10000
},
{
"epoch": 0.8228458806380751,
"grad_norm": 0.4621546268463135,
"learning_rate": 0.0002,
"loss": 0.2324,
"step": 10020
},
{
"epoch": 0.8244882875854564,
"grad_norm": 0.4055505394935608,
"learning_rate": 0.0002,
"loss": 0.2373,
"step": 10040
},
{
"epoch": 0.8261306945328378,
"grad_norm": 0.34892845153808594,
"learning_rate": 0.0002,
"loss": 0.23,
"step": 10060
},
{
"epoch": 0.8277731014802192,
"grad_norm": 0.33453091979026794,
"learning_rate": 0.0002,
"loss": 0.2348,
"step": 10080
},
{
"epoch": 0.8294155084276007,
"grad_norm": 0.3283565640449524,
"learning_rate": 0.0002,
"loss": 0.2314,
"step": 10100
},
{
"epoch": 0.8310579153749821,
"grad_norm": 0.35970717668533325,
"learning_rate": 0.0002,
"loss": 0.2336,
"step": 10120
},
{
"epoch": 0.8327003223223635,
"grad_norm": 0.3093232810497284,
"learning_rate": 0.0002,
"loss": 0.2363,
"step": 10140
},
{
"epoch": 0.8343427292697448,
"grad_norm": 0.4389066696166992,
"learning_rate": 0.0002,
"loss": 0.2422,
"step": 10160
},
{
"epoch": 0.8359851362171262,
"grad_norm": 0.44654580950737,
"learning_rate": 0.0002,
"loss": 0.232,
"step": 10180
},
{
"epoch": 0.8376275431645076,
"grad_norm": 0.2830391526222229,
"learning_rate": 0.0002,
"loss": 0.2476,
"step": 10200
},
{
"epoch": 0.839269950111889,
"grad_norm": 0.31547674536705017,
"learning_rate": 0.0002,
"loss": 0.231,
"step": 10220
},
{
"epoch": 0.8409123570592704,
"grad_norm": 0.45748040080070496,
"learning_rate": 0.0002,
"loss": 0.2372,
"step": 10240
},
{
"epoch": 0.8425547640066517,
"grad_norm": 0.34882062673568726,
"learning_rate": 0.0002,
"loss": 0.2376,
"step": 10260
},
{
"epoch": 0.8441971709540331,
"grad_norm": 0.3529532849788666,
"learning_rate": 0.0002,
"loss": 0.2323,
"step": 10280
},
{
"epoch": 0.8458395779014145,
"grad_norm": 0.33054473996162415,
"learning_rate": 0.0002,
"loss": 0.2376,
"step": 10300
},
{
"epoch": 0.8474819848487959,
"grad_norm": 0.3015061616897583,
"learning_rate": 0.0002,
"loss": 0.2243,
"step": 10320
},
{
"epoch": 0.8491243917961773,
"grad_norm": 0.3048664629459381,
"learning_rate": 0.0002,
"loss": 0.2318,
"step": 10340
},
{
"epoch": 0.8507667987435586,
"grad_norm": 0.31459841132164,
"learning_rate": 0.0002,
"loss": 0.2307,
"step": 10360
},
{
"epoch": 0.85240920569094,
"grad_norm": 0.39160168170928955,
"learning_rate": 0.0002,
"loss": 0.2407,
"step": 10380
},
{
"epoch": 0.8540516126383214,
"grad_norm": 0.30392590165138245,
"learning_rate": 0.0002,
"loss": 0.2206,
"step": 10400
},
{
"epoch": 0.8556940195857029,
"grad_norm": 0.3656589686870575,
"learning_rate": 0.0002,
"loss": 0.229,
"step": 10420
},
{
"epoch": 0.8573364265330843,
"grad_norm": 0.35856541991233826,
"learning_rate": 0.0002,
"loss": 0.2361,
"step": 10440
},
{
"epoch": 0.8589788334804657,
"grad_norm": 0.3591729402542114,
"learning_rate": 0.0002,
"loss": 0.2232,
"step": 10460
},
{
"epoch": 0.860621240427847,
"grad_norm": 0.36023178696632385,
"learning_rate": 0.0002,
"loss": 0.2495,
"step": 10480
},
{
"epoch": 0.8622636473752284,
"grad_norm": 0.38790059089660645,
"learning_rate": 0.0002,
"loss": 0.2288,
"step": 10500
},
{
"epoch": 0.8639060543226098,
"grad_norm": 0.39627397060394287,
"learning_rate": 0.0002,
"loss": 0.24,
"step": 10520
},
{
"epoch": 0.8655484612699912,
"grad_norm": 0.32167407870292664,
"learning_rate": 0.0002,
"loss": 0.2365,
"step": 10540
},
{
"epoch": 0.8671908682173726,
"grad_norm": 0.34265172481536865,
"learning_rate": 0.0002,
"loss": 0.2419,
"step": 10560
},
{
"epoch": 0.868833275164754,
"grad_norm": 0.3236486613750458,
"learning_rate": 0.0002,
"loss": 0.2326,
"step": 10580
},
{
"epoch": 0.8704756821121353,
"grad_norm": 0.3700607120990753,
"learning_rate": 0.0002,
"loss": 0.2361,
"step": 10600
},
{
"epoch": 0.8721180890595167,
"grad_norm": 0.33969688415527344,
"learning_rate": 0.0002,
"loss": 0.2236,
"step": 10620
},
{
"epoch": 0.8737604960068981,
"grad_norm": 0.2824096083641052,
"learning_rate": 0.0002,
"loss": 0.2415,
"step": 10640
},
{
"epoch": 0.8754029029542795,
"grad_norm": 0.3842727243900299,
"learning_rate": 0.0002,
"loss": 0.2223,
"step": 10660
},
{
"epoch": 0.8770453099016609,
"grad_norm": 0.36808887124061584,
"learning_rate": 0.0002,
"loss": 0.2253,
"step": 10680
},
{
"epoch": 0.8786877168490422,
"grad_norm": 0.4065176844596863,
"learning_rate": 0.0002,
"loss": 0.2274,
"step": 10700
},
{
"epoch": 0.8803301237964236,
"grad_norm": 0.3421749174594879,
"learning_rate": 0.0002,
"loss": 0.2309,
"step": 10720
},
{
"epoch": 0.881972530743805,
"grad_norm": 0.30610519647598267,
"learning_rate": 0.0002,
"loss": 0.2213,
"step": 10740
},
{
"epoch": 0.8836149376911865,
"grad_norm": 0.40341177582740784,
"learning_rate": 0.0002,
"loss": 0.229,
"step": 10760
},
{
"epoch": 0.8852573446385679,
"grad_norm": 0.43038755655288696,
"learning_rate": 0.0002,
"loss": 0.2312,
"step": 10780
},
{
"epoch": 0.8868997515859492,
"grad_norm": 0.26736319065093994,
"learning_rate": 0.0002,
"loss": 0.2375,
"step": 10800
},
{
"epoch": 0.8885421585333306,
"grad_norm": 0.34479281306266785,
"learning_rate": 0.0002,
"loss": 0.2342,
"step": 10820
},
{
"epoch": 0.890184565480712,
"grad_norm": 0.32857152819633484,
"learning_rate": 0.0002,
"loss": 0.2352,
"step": 10840
},
{
"epoch": 0.8918269724280934,
"grad_norm": 0.30919578671455383,
"learning_rate": 0.0002,
"loss": 0.2133,
"step": 10860
},
{
"epoch": 0.8934693793754748,
"grad_norm": 0.3049899637699127,
"learning_rate": 0.0002,
"loss": 0.2374,
"step": 10880
},
{
"epoch": 0.8951117863228562,
"grad_norm": 0.4088539779186249,
"learning_rate": 0.0002,
"loss": 0.2377,
"step": 10900
},
{
"epoch": 0.8967541932702375,
"grad_norm": 0.3318689167499542,
"learning_rate": 0.0002,
"loss": 0.2459,
"step": 10920
},
{
"epoch": 0.8983966002176189,
"grad_norm": 0.38051754236221313,
"learning_rate": 0.0002,
"loss": 0.2305,
"step": 10940
},
{
"epoch": 0.9000390071650003,
"grad_norm": 0.401080846786499,
"learning_rate": 0.0002,
"loss": 0.2297,
"step": 10960
},
{
"epoch": 0.9016814141123817,
"grad_norm": 0.30713602900505066,
"learning_rate": 0.0002,
"loss": 0.2254,
"step": 10980
},
{
"epoch": 0.903323821059763,
"grad_norm": 0.37888234853744507,
"learning_rate": 0.0002,
"loss": 0.2346,
"step": 11000
},
{
"epoch": 0.9049662280071444,
"grad_norm": 0.3106231689453125,
"learning_rate": 0.0002,
"loss": 0.2206,
"step": 11020
},
{
"epoch": 0.9066086349545258,
"grad_norm": 0.44297677278518677,
"learning_rate": 0.0002,
"loss": 0.2218,
"step": 11040
},
{
"epoch": 0.9082510419019072,
"grad_norm": 0.3375784456729889,
"learning_rate": 0.0002,
"loss": 0.2273,
"step": 11060
},
{
"epoch": 0.9098934488492886,
"grad_norm": 0.4860747158527374,
"learning_rate": 0.0002,
"loss": 0.2317,
"step": 11080
},
{
"epoch": 0.9115358557966701,
"grad_norm": 0.2880633771419525,
"learning_rate": 0.0002,
"loss": 0.2398,
"step": 11100
},
{
"epoch": 0.9131782627440514,
"grad_norm": 0.4085402190685272,
"learning_rate": 0.0002,
"loss": 0.234,
"step": 11120
},
{
"epoch": 0.9148206696914328,
"grad_norm": 0.38998520374298096,
"learning_rate": 0.0002,
"loss": 0.2402,
"step": 11140
},
{
"epoch": 0.9164630766388142,
"grad_norm": 0.40508535504341125,
"learning_rate": 0.0002,
"loss": 0.2136,
"step": 11160
},
{
"epoch": 0.9181054835861956,
"grad_norm": 0.3789615035057068,
"learning_rate": 0.0002,
"loss": 0.2267,
"step": 11180
},
{
"epoch": 0.919747890533577,
"grad_norm": 0.3882130980491638,
"learning_rate": 0.0002,
"loss": 0.2276,
"step": 11200
},
{
"epoch": 0.9213902974809584,
"grad_norm": 0.3001303970813751,
"learning_rate": 0.0002,
"loss": 0.2313,
"step": 11220
},
{
"epoch": 0.9230327044283397,
"grad_norm": 0.4514042139053345,
"learning_rate": 0.0002,
"loss": 0.2204,
"step": 11240
},
{
"epoch": 0.9246751113757211,
"grad_norm": 0.43372517824172974,
"learning_rate": 0.0002,
"loss": 0.2294,
"step": 11260
},
{
"epoch": 0.9263175183231025,
"grad_norm": 0.2934057414531708,
"learning_rate": 0.0002,
"loss": 0.2308,
"step": 11280
},
{
"epoch": 0.9279599252704839,
"grad_norm": 0.4067831337451935,
"learning_rate": 0.0002,
"loss": 0.2329,
"step": 11300
},
{
"epoch": 0.9296023322178653,
"grad_norm": 0.3299509584903717,
"learning_rate": 0.0002,
"loss": 0.2214,
"step": 11320
},
{
"epoch": 0.9312447391652466,
"grad_norm": 0.35204941034317017,
"learning_rate": 0.0002,
"loss": 0.239,
"step": 11340
},
{
"epoch": 0.932887146112628,
"grad_norm": 0.30878013372421265,
"learning_rate": 0.0002,
"loss": 0.2248,
"step": 11360
},
{
"epoch": 0.9345295530600094,
"grad_norm": 0.392170786857605,
"learning_rate": 0.0002,
"loss": 0.2274,
"step": 11380
},
{
"epoch": 0.9361719600073908,
"grad_norm": 0.4151529371738434,
"learning_rate": 0.0002,
"loss": 0.2186,
"step": 11400
},
{
"epoch": 0.9378143669547723,
"grad_norm": 0.3535741865634918,
"learning_rate": 0.0002,
"loss": 0.2285,
"step": 11420
},
{
"epoch": 0.9394567739021537,
"grad_norm": 0.3477960526943207,
"learning_rate": 0.0002,
"loss": 0.2313,
"step": 11440
},
{
"epoch": 0.941099180849535,
"grad_norm": 0.3621846139431,
"learning_rate": 0.0002,
"loss": 0.2317,
"step": 11460
},
{
"epoch": 0.9427415877969164,
"grad_norm": 0.3844580352306366,
"learning_rate": 0.0002,
"loss": 0.2345,
"step": 11480
},
{
"epoch": 0.9443839947442978,
"grad_norm": 0.3395872116088867,
"learning_rate": 0.0002,
"loss": 0.2233,
"step": 11500
},
{
"epoch": 0.9460264016916792,
"grad_norm": 0.4554111063480377,
"learning_rate": 0.0002,
"loss": 0.2324,
"step": 11520
},
{
"epoch": 0.9476688086390606,
"grad_norm": 0.34367838501930237,
"learning_rate": 0.0002,
"loss": 0.2157,
"step": 11540
},
{
"epoch": 0.9493112155864419,
"grad_norm": 0.2760342061519623,
"learning_rate": 0.0002,
"loss": 0.2278,
"step": 11560
},
{
"epoch": 0.9509536225338233,
"grad_norm": 0.4382875859737396,
"learning_rate": 0.0002,
"loss": 0.2361,
"step": 11580
},
{
"epoch": 0.9525960294812047,
"grad_norm": 0.3573220670223236,
"learning_rate": 0.0002,
"loss": 0.2241,
"step": 11600
},
{
"epoch": 0.9542384364285861,
"grad_norm": 0.3491596579551697,
"learning_rate": 0.0002,
"loss": 0.2258,
"step": 11620
},
{
"epoch": 0.9558808433759675,
"grad_norm": 0.42366743087768555,
"learning_rate": 0.0002,
"loss": 0.2406,
"step": 11640
},
{
"epoch": 0.9575232503233488,
"grad_norm": 0.3748779892921448,
"learning_rate": 0.0002,
"loss": 0.2305,
"step": 11660
},
{
"epoch": 0.9591656572707302,
"grad_norm": 0.40864527225494385,
"learning_rate": 0.0002,
"loss": 0.235,
"step": 11680
},
{
"epoch": 0.9608080642181116,
"grad_norm": 0.41164445877075195,
"learning_rate": 0.0002,
"loss": 0.2195,
"step": 11700
},
{
"epoch": 0.962450471165493,
"grad_norm": 0.46402692794799805,
"learning_rate": 0.0002,
"loss": 0.2266,
"step": 11720
},
{
"epoch": 0.9640928781128744,
"grad_norm": 0.32727622985839844,
"learning_rate": 0.0002,
"loss": 0.2324,
"step": 11740
},
{
"epoch": 0.9657352850602559,
"grad_norm": 0.4346349537372589,
"learning_rate": 0.0002,
"loss": 0.2257,
"step": 11760
},
{
"epoch": 0.9673776920076372,
"grad_norm": 0.3470235764980316,
"learning_rate": 0.0002,
"loss": 0.2333,
"step": 11780
},
{
"epoch": 0.9690200989550186,
"grad_norm": 0.48941469192504883,
"learning_rate": 0.0002,
"loss": 0.2336,
"step": 11800
},
{
"epoch": 0.9706625059024,
"grad_norm": 0.3959124982357025,
"learning_rate": 0.0002,
"loss": 0.2221,
"step": 11820
},
{
"epoch": 0.9723049128497814,
"grad_norm": 0.40877676010131836,
"learning_rate": 0.0002,
"loss": 0.232,
"step": 11840
},
{
"epoch": 0.9739473197971628,
"grad_norm": 0.4087940454483032,
"learning_rate": 0.0002,
"loss": 0.2195,
"step": 11860
},
{
"epoch": 0.9755897267445441,
"grad_norm": 0.3967040181159973,
"learning_rate": 0.0002,
"loss": 0.234,
"step": 11880
},
{
"epoch": 0.9772321336919255,
"grad_norm": 0.41639575362205505,
"learning_rate": 0.0002,
"loss": 0.221,
"step": 11900
},
{
"epoch": 0.9788745406393069,
"grad_norm": 0.304775595664978,
"learning_rate": 0.0002,
"loss": 0.2283,
"step": 11920
},
{
"epoch": 0.9805169475866883,
"grad_norm": 0.41931501030921936,
"learning_rate": 0.0002,
"loss": 0.2263,
"step": 11940
},
{
"epoch": 0.9821593545340697,
"grad_norm": 0.34010422229766846,
"learning_rate": 0.0002,
"loss": 0.222,
"step": 11960
},
{
"epoch": 0.983801761481451,
"grad_norm": 0.3099174499511719,
"learning_rate": 0.0002,
"loss": 0.2221,
"step": 11980
},
{
"epoch": 0.9854441684288324,
"grad_norm": 0.3627716600894928,
"learning_rate": 0.0002,
"loss": 0.2419,
"step": 12000
},
{
"epoch": 0.9870865753762138,
"grad_norm": 0.3797793388366699,
"learning_rate": 0.0002,
"loss": 0.2289,
"step": 12020
},
{
"epoch": 0.9887289823235952,
"grad_norm": 0.34914806485176086,
"learning_rate": 0.0002,
"loss": 0.2211,
"step": 12040
},
{
"epoch": 0.9903713892709766,
"grad_norm": 0.35985666513442993,
"learning_rate": 0.0002,
"loss": 0.2271,
"step": 12060
},
{
"epoch": 0.9920137962183581,
"grad_norm": 0.3159051835536957,
"learning_rate": 0.0002,
"loss": 0.2364,
"step": 12080
},
{
"epoch": 0.9936562031657394,
"grad_norm": 0.29203563928604126,
"learning_rate": 0.0002,
"loss": 0.2429,
"step": 12100
},
{
"epoch": 0.9952986101131208,
"grad_norm": 0.32187801599502563,
"learning_rate": 0.0002,
"loss": 0.2386,
"step": 12120
},
{
"epoch": 0.9969410170605022,
"grad_norm": 0.35564154386520386,
"learning_rate": 0.0002,
"loss": 0.2349,
"step": 12140
},
{
"epoch": 0.9985834240078836,
"grad_norm": 0.3589749336242676,
"learning_rate": 0.0002,
"loss": 0.2275,
"step": 12160
},
{
"epoch": 1.0,
"eval_loss": 0.28475141525268555,
"eval_runtime": 907.1315,
"eval_samples_per_second": 4.174,
"eval_steps_per_second": 0.523,
"step": 12178
}
],
"logging_steps": 20,
"max_steps": 16000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 77,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.843715322728153e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}