{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9994378864530633,
  "eval_steps": 500,
  "global_step": 2001,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.014989694584972832, "grad_norm": 3.2230756947704684, "learning_rate": 5e-06, "loss": 0.8065, "step": 10 },
    { "epoch": 0.029979389169945664, "grad_norm": 4.087000478453515, "learning_rate": 5e-06, "loss": 0.651, "step": 20 },
    { "epoch": 0.044969083754918496, "grad_norm": 2.503327595683403, "learning_rate": 5e-06, "loss": 0.6152, "step": 30 },
    { "epoch": 0.05995877833989133, "grad_norm": 2.6733564049479916, "learning_rate": 5e-06, "loss": 0.5859, "step": 40 },
    { "epoch": 0.07494847292486416, "grad_norm": 2.7328394257454245, "learning_rate": 5e-06, "loss": 0.5786, "step": 50 },
    { "epoch": 0.08993816750983699, "grad_norm": 3.1797521945570693, "learning_rate": 5e-06, "loss": 0.5685, "step": 60 },
    { "epoch": 0.10492786209480982, "grad_norm": 2.684949545099637, "learning_rate": 5e-06, "loss": 0.5586, "step": 70 },
    { "epoch": 0.11991755667978266, "grad_norm": 2.9219404511166824, "learning_rate": 5e-06, "loss": 0.5529, "step": 80 },
    { "epoch": 0.13490725126475547, "grad_norm": 2.3100236702034826, "learning_rate": 5e-06, "loss": 0.5461, "step": 90 },
    { "epoch": 0.14989694584972832, "grad_norm": 2.2049571056422397, "learning_rate": 5e-06, "loss": 0.5381, "step": 100 },
    { "epoch": 0.16488664043470114, "grad_norm": 1.7661938201026661, "learning_rate": 5e-06, "loss": 0.5359, "step": 110 },
    { "epoch": 0.17987633501967398, "grad_norm": 2.1859179504620174, "learning_rate": 5e-06, "loss": 0.5318, "step": 120 },
    { "epoch": 0.1948660296046468, "grad_norm": 1.6326069613621976, "learning_rate": 5e-06, "loss": 0.5279, "step": 130 },
    { "epoch": 0.20985572418961965, "grad_norm": 1.5360852906450997, "learning_rate": 5e-06, "loss": 0.5309, "step": 140 },
    { "epoch": 0.22484541877459246, "grad_norm": 1.7724258855816508, "learning_rate": 5e-06, "loss": 0.5207, "step": 150 },
    { "epoch": 0.2398351133595653, "grad_norm": 1.7886911455233951, "learning_rate": 5e-06, "loss": 0.5203, "step": 160 },
    { "epoch": 0.25482480794453816, "grad_norm": 1.436404813254963, "learning_rate": 5e-06, "loss": 0.5179, "step": 170 },
    { "epoch": 0.26981450252951095, "grad_norm": 1.892553234562171, "learning_rate": 5e-06, "loss": 0.5134, "step": 180 },
    { "epoch": 0.2848041971144838, "grad_norm": 1.6450239492265477, "learning_rate": 5e-06, "loss": 0.5124, "step": 190 },
    { "epoch": 0.29979389169945664, "grad_norm": 1.894402097483371, "learning_rate": 5e-06, "loss": 0.508, "step": 200 },
    { "epoch": 0.31478358628442943, "grad_norm": 1.6348153643564973, "learning_rate": 5e-06, "loss": 0.5032, "step": 210 },
    { "epoch": 0.3297732808694023, "grad_norm": 1.3715172854305366, "learning_rate": 5e-06, "loss": 0.5051, "step": 220 },
    { "epoch": 0.3447629754543751, "grad_norm": 1.742102338514997, "learning_rate": 5e-06, "loss": 0.5036, "step": 230 },
    { "epoch": 0.35975267003934797, "grad_norm": 1.7426571084822509, "learning_rate": 5e-06, "loss": 0.5041, "step": 240 },
    { "epoch": 0.37474236462432076, "grad_norm": 2.1505290746647465, "learning_rate": 5e-06, "loss": 0.5001, "step": 250 },
    { "epoch": 0.3897320592092936, "grad_norm": 1.793888447963483, "learning_rate": 5e-06, "loss": 0.5015, "step": 260 },
    { "epoch": 0.40472175379426645, "grad_norm": 1.8644855732640748, "learning_rate": 5e-06, "loss": 0.4958, "step": 270 },
    { "epoch": 0.4197114483792393, "grad_norm": 2.6049833173978825, "learning_rate": 5e-06, "loss": 0.4996, "step": 280 },
    { "epoch": 0.4347011429642121, "grad_norm": 1.4405086279628676, "learning_rate": 5e-06, "loss": 0.4929, "step": 290 },
    { "epoch": 0.44969083754918493, "grad_norm": 1.604860450889284, "learning_rate": 5e-06, "loss": 0.4897, "step": 300 },
    { "epoch": 0.4646805321341578, "grad_norm": 1.662135718621203, "learning_rate": 5e-06, "loss": 0.4927, "step": 310 },
    { "epoch": 0.4796702267191306, "grad_norm": 2.062302477917024, "learning_rate": 5e-06, "loss": 0.4892, "step": 320 },
    { "epoch": 0.4946599213041034, "grad_norm": 1.4281806778360036, "learning_rate": 5e-06, "loss": 0.4889, "step": 330 },
    { "epoch": 0.5096496158890763, "grad_norm": 1.4534145149693543, "learning_rate": 5e-06, "loss": 0.4872, "step": 340 },
    { "epoch": 0.5246393104740491, "grad_norm": 1.8740427050796966, "learning_rate": 5e-06, "loss": 0.4839, "step": 350 },
    { "epoch": 0.5396290050590219, "grad_norm": 1.5668501527936287, "learning_rate": 5e-06, "loss": 0.4886, "step": 360 },
    { "epoch": 0.5546186996439948, "grad_norm": 1.3759231439903845, "learning_rate": 5e-06, "loss": 0.4837, "step": 370 },
    { "epoch": 0.5696083942289676, "grad_norm": 1.498739905029643, "learning_rate": 5e-06, "loss": 0.4822, "step": 380 },
    { "epoch": 0.5845980888139404, "grad_norm": 1.5404538522347584, "learning_rate": 5e-06, "loss": 0.4798, "step": 390 },
    { "epoch": 0.5995877833989133, "grad_norm": 2.0684145258599544, "learning_rate": 5e-06, "loss": 0.4831, "step": 400 },
    { "epoch": 0.6145774779838861, "grad_norm": 1.7365319201533058, "learning_rate": 5e-06, "loss": 0.4803, "step": 410 },
    { "epoch": 0.6295671725688589, "grad_norm": 1.7707026206608905, "learning_rate": 5e-06, "loss": 0.4759, "step": 420 },
    { "epoch": 0.6445568671538318, "grad_norm": 1.4832433269572964, "learning_rate": 5e-06, "loss": 0.4759, "step": 430 },
    { "epoch": 0.6595465617388045, "grad_norm": 1.4849966253145612, "learning_rate": 5e-06, "loss": 0.4731, "step": 440 },
    { "epoch": 0.6745362563237775, "grad_norm": 2.1406203761167037, "learning_rate": 5e-06, "loss": 0.4748, "step": 450 },
    { "epoch": 0.6895259509087502, "grad_norm": 1.6432408076200942, "learning_rate": 5e-06, "loss": 0.4692, "step": 460 },
    { "epoch": 0.704515645493723, "grad_norm": 2.110147852343459, "learning_rate": 5e-06, "loss": 0.469, "step": 470 },
    { "epoch": 0.7195053400786959, "grad_norm": 2.67027429796622, "learning_rate": 5e-06, "loss": 0.4742, "step": 480 },
    { "epoch": 0.7344950346636687, "grad_norm": 2.1612548231072197, "learning_rate": 5e-06, "loss": 0.4734, "step": 490 },
    { "epoch": 0.7494847292486415, "grad_norm": 1.9020717675158103, "learning_rate": 5e-06, "loss": 0.4693, "step": 500 },
    { "epoch": 0.7644744238336144, "grad_norm": 1.8918141412998646, "learning_rate": 5e-06, "loss": 0.4734, "step": 510 },
    { "epoch": 0.7794641184185872, "grad_norm": 1.368955726552673, "learning_rate": 5e-06, "loss": 0.4663, "step": 520 },
    { "epoch": 0.7944538130035601, "grad_norm": 1.5459750086281097, "learning_rate": 5e-06, "loss": 0.4645, "step": 530 },
    { "epoch": 0.8094435075885329, "grad_norm": 1.5230854042747586, "learning_rate": 5e-06, "loss": 0.4647, "step": 540 },
    { "epoch": 0.8244332021735057, "grad_norm": 1.5090270259811516, "learning_rate": 5e-06, "loss": 0.4707, "step": 550 },
    { "epoch": 0.8394228967584786, "grad_norm": 1.5558244270343213, "learning_rate": 5e-06, "loss": 0.4674, "step": 560 },
    { "epoch": 0.8544125913434514, "grad_norm": 1.4739129842995866, "learning_rate": 5e-06, "loss": 0.4678, "step": 570 },
    { "epoch": 0.8694022859284242, "grad_norm": 1.7227702869131336, "learning_rate": 5e-06, "loss": 0.4614, "step": 580 },
    { "epoch": 0.8843919805133971, "grad_norm": 1.412974957600055, "learning_rate": 5e-06, "loss": 0.4634, "step": 590 },
    { "epoch": 0.8993816750983699, "grad_norm": 1.3755393868079429, "learning_rate": 5e-06, "loss": 0.4649, "step": 600 },
    { "epoch": 0.9143713696833426, "grad_norm": 1.3451129180593917, "learning_rate": 5e-06, "loss": 0.4638, "step": 610 },
    { "epoch": 0.9293610642683156, "grad_norm": 1.4949083165414878, "learning_rate": 5e-06, "loss": 0.4639, "step": 620 },
    { "epoch": 0.9443507588532883, "grad_norm": 1.3507178958809956, "learning_rate": 5e-06, "loss": 0.4656, "step": 630 },
    { "epoch": 0.9593404534382612, "grad_norm": 1.572459968859256, "learning_rate": 5e-06, "loss": 0.4592, "step": 640 },
    { "epoch": 0.974330148023234, "grad_norm": 1.2767150139708328, "learning_rate": 5e-06, "loss": 0.4632, "step": 650 },
    { "epoch": 0.9893198426082068, "grad_norm": 1.5510115795503987, "learning_rate": 5e-06, "loss": 0.4596, "step": 660 },
    { "epoch": 0.9998126288176878, "eval_loss": 0.05758751183748245, "eval_runtime": 454.8318, "eval_samples_per_second": 39.525, "eval_steps_per_second": 0.618, "step": 667 },
    { "epoch": 1.0043095371931796, "grad_norm": 2.7775612629774344, "learning_rate": 5e-06, "loss": 0.4348, "step": 670 },
    { "epoch": 1.0192992317781526, "grad_norm": 1.9710376561762528, "learning_rate": 5e-06, "loss": 0.364, "step": 680 },
    { "epoch": 1.0342889263631254, "grad_norm": 1.5909045555502788, "learning_rate": 5e-06, "loss": 0.3558, "step": 690 },
    { "epoch": 1.0492786209480982, "grad_norm": 1.3602748984083428, "learning_rate": 5e-06, "loss": 0.355, "step": 700 },
    { "epoch": 1.064268315533071, "grad_norm": 1.6434233988948692, "learning_rate": 5e-06, "loss": 0.3519, "step": 710 },
    { "epoch": 1.0792580101180438, "grad_norm": 1.4242278638735506, "learning_rate": 5e-06, "loss": 0.3545, "step": 720 },
    { "epoch": 1.0942477047030166, "grad_norm": 1.547462512141288, "learning_rate": 5e-06, "loss": 0.3583, "step": 730 },
    { "epoch": 1.1092373992879896, "grad_norm": 1.4513237085400739, "learning_rate": 5e-06, "loss": 0.3558, "step": 740 },
    { "epoch": 1.1242270938729624, "grad_norm": 2.1885817879178404, "learning_rate": 5e-06, "loss": 0.3563, "step": 750 },
    { "epoch": 1.1392167884579352, "grad_norm": 1.695688809842665, "learning_rate": 5e-06, "loss": 0.358, "step": 760 },
    { "epoch": 1.154206483042908, "grad_norm": 2.289531376884916, "learning_rate": 5e-06, "loss": 0.3574, "step": 770 },
    { "epoch": 1.1691961776278808, "grad_norm": 1.9357596883149255, "learning_rate": 5e-06, "loss": 0.3584, "step": 780 },
    { "epoch": 1.1841858722128538, "grad_norm": 2.1618585974064395, "learning_rate": 5e-06, "loss": 0.3564, "step": 790 },
    { "epoch": 1.1991755667978266, "grad_norm": 1.6373234010519413, "learning_rate": 5e-06, "loss": 0.3527, "step": 800 },
    { "epoch": 1.2141652613827993, "grad_norm": 1.8399099410223894, "learning_rate": 5e-06, "loss": 0.3566, "step": 810 },
    { "epoch": 1.2291549559677721, "grad_norm": 1.6753343512169294, "learning_rate": 5e-06, "loss": 0.3601, "step": 820 },
    { "epoch": 1.244144650552745, "grad_norm": 1.505241949293805, "learning_rate": 5e-06, "loss": 0.3583, "step": 830 },
    { "epoch": 1.259134345137718, "grad_norm": 1.5312665902757236, "learning_rate": 5e-06, "loss": 0.3542, "step": 840 },
    { "epoch": 1.2741240397226907, "grad_norm": 1.8665087434383938, "learning_rate": 5e-06, "loss": 0.3575, "step": 850 },
    { "epoch": 1.2891137343076635, "grad_norm": 1.7982799864263068, "learning_rate": 5e-06, "loss": 0.3586, "step": 860 },
    { "epoch": 1.3041034288926363, "grad_norm": 1.5371290759554668, "learning_rate": 5e-06, "loss": 0.3622, "step": 870 },
    { "epoch": 1.319093123477609, "grad_norm": 1.5167552093630703, "learning_rate": 5e-06, "loss": 0.3588, "step": 880 },
    { "epoch": 1.334082818062582, "grad_norm": 1.6591098979757748, "learning_rate": 5e-06, "loss": 0.3634, "step": 890 },
    { "epoch": 1.3490725126475547, "grad_norm": 1.5726838898036415, "learning_rate": 5e-06, "loss": 0.3539, "step": 900 },
    { "epoch": 1.3640622072325277, "grad_norm": 1.7507238369968354, "learning_rate": 5e-06, "loss": 0.3632, "step": 910 },
    { "epoch": 1.3790519018175005, "grad_norm": 1.475358404698048, "learning_rate": 5e-06, "loss": 0.3625, "step": 920 },
    { "epoch": 1.3940415964024733, "grad_norm": 1.5954088726182911, "learning_rate": 5e-06, "loss": 0.3595, "step": 930 },
    { "epoch": 1.409031290987446, "grad_norm": 1.520181904849792, "learning_rate": 5e-06, "loss": 0.3597, "step": 940 },
    { "epoch": 1.4240209855724189, "grad_norm": 1.6317070098760134, "learning_rate": 5e-06, "loss": 0.3611, "step": 950 },
    { "epoch": 1.4390106801573919, "grad_norm": 1.768300641616324, "learning_rate": 5e-06, "loss": 0.3603, "step": 960 },
    { "epoch": 1.4540003747423647, "grad_norm": 1.5408964888063073, "learning_rate": 5e-06, "loss": 0.3631, "step": 970 },
    { "epoch": 1.4689900693273374, "grad_norm": 1.452167086951644, "learning_rate": 5e-06, "loss": 0.3609, "step": 980 },
    { "epoch": 1.4839797639123102, "grad_norm": 1.6249338530074764, "learning_rate": 5e-06, "loss": 0.3625, "step": 990 },
    { "epoch": 1.498969458497283, "grad_norm": 1.7743633699281656, "learning_rate": 5e-06, "loss": 0.3618, "step": 1000 },
    { "epoch": 1.513959153082256, "grad_norm": 1.7741260951568605, "learning_rate": 5e-06, "loss": 0.3612, "step": 1010 },
    { "epoch": 1.5289488476672288, "grad_norm": 1.3987893388986192, "learning_rate": 5e-06, "loss": 0.3628, "step": 1020 },
    { "epoch": 1.5439385422522016, "grad_norm": 1.4893576161794635, "learning_rate": 5e-06, "loss": 0.3626, "step": 1030 },
    { "epoch": 1.5589282368371744, "grad_norm": 1.4112328226196433, "learning_rate": 5e-06, "loss": 0.3618, "step": 1040 },
    { "epoch": 1.5739179314221472, "grad_norm": 1.3858150270139942, "learning_rate": 5e-06, "loss": 0.3615, "step": 1050 },
    { "epoch": 1.5889076260071202, "grad_norm": 1.5535049280480435, "learning_rate": 5e-06, "loss": 0.3618, "step": 1060 },
    { "epoch": 1.6038973205920928, "grad_norm": 1.3757444255540903, "learning_rate": 5e-06, "loss": 0.365, "step": 1070 },
    { "epoch": 1.6188870151770658, "grad_norm": 1.5995972790776192, "learning_rate": 5e-06, "loss": 0.3632, "step": 1080 },
    { "epoch": 1.6338767097620386, "grad_norm": 1.7108799556122343, "learning_rate": 5e-06, "loss": 0.3643, "step": 1090 },
    { "epoch": 1.6488664043470114, "grad_norm": 1.686754492103127, "learning_rate": 5e-06, "loss": 0.3649, "step": 1100 },
    { "epoch": 1.6638560989319844, "grad_norm": 1.7693708666917307, "learning_rate": 5e-06, "loss": 0.3612, "step": 1110 },
    { "epoch": 1.678845793516957, "grad_norm": 1.520817212897819, "learning_rate": 5e-06, "loss": 0.3614, "step": 1120 },
    { "epoch": 1.69383548810193, "grad_norm": 1.9406706141560084, "learning_rate": 5e-06, "loss": 0.3601, "step": 1130 },
    { "epoch": 1.7088251826869028, "grad_norm": 1.488137404772404, "learning_rate": 5e-06, "loss": 0.3627, "step": 1140 },
    { "epoch": 1.7238148772718755, "grad_norm": 1.8331126631936938, "learning_rate": 5e-06, "loss": 0.3609, "step": 1150 },
    { "epoch": 1.7388045718568486, "grad_norm": 1.5561141952783655, "learning_rate": 5e-06, "loss": 0.3601, "step": 1160 },
    { "epoch": 1.7537942664418211, "grad_norm": 1.4134693187494294, "learning_rate": 5e-06, "loss": 0.3596, "step": 1170 },
    { "epoch": 1.7687839610267941, "grad_norm": 1.696827607327685, "learning_rate": 5e-06, "loss": 0.3614, "step": 1180 },
    { "epoch": 1.783773655611767, "grad_norm": 1.5042455149771978, "learning_rate": 5e-06, "loss": 0.3629, "step": 1190 },
    { "epoch": 1.7987633501967397, "grad_norm": 1.619841917897079, "learning_rate": 5e-06, "loss": 0.364, "step": 1200 },
    { "epoch": 1.8137530447817127, "grad_norm": 1.9678117854674795, "learning_rate": 5e-06, "loss": 0.3623, "step": 1210 },
    { "epoch": 1.8287427393666853, "grad_norm": 2.000250566254895, "learning_rate": 5e-06, "loss": 0.3629, "step": 1220 },
    { "epoch": 1.8437324339516583, "grad_norm": 1.7246823549895025, "learning_rate": 5e-06, "loss": 0.3639, "step": 1230 },
    { "epoch": 1.858722128536631, "grad_norm": 1.9341838486216245, "learning_rate": 5e-06, "loss": 0.3614, "step": 1240 },
    { "epoch": 1.873711823121604, "grad_norm": 1.6254030300504665, "learning_rate": 5e-06, "loss": 0.3628, "step": 1250 },
    { "epoch": 1.8887015177065767, "grad_norm": 1.5870549405395955, "learning_rate": 5e-06, "loss": 0.3624, "step": 1260 },
    { "epoch": 1.9036912122915495, "grad_norm": 1.5435102287374787, "learning_rate": 5e-06, "loss": 0.362, "step": 1270 },
    { "epoch": 1.9186809068765225, "grad_norm": 1.4063275660945347, "learning_rate": 5e-06, "loss": 0.363, "step": 1280 },
    { "epoch": 1.9336706014614953, "grad_norm": 1.5087142670105975, "learning_rate": 5e-06, "loss": 0.3587, "step": 1290 },
    { "epoch": 1.948660296046468, "grad_norm": 1.3583330142096965, "learning_rate": 5e-06, "loss": 0.3608, "step": 1300 },
    { "epoch": 1.9636499906314409, "grad_norm": 1.5780539376028968, "learning_rate": 5e-06, "loss": 0.3621, "step": 1310 },
    { "epoch": 1.9786396852164136, "grad_norm": 1.451758117798617, "learning_rate": 5e-06, "loss": 0.3632, "step": 1320 },
    { "epoch": 1.9936293798013867, "grad_norm": 1.428577561948826, "learning_rate": 5e-06, "loss": 0.3617, "step": 1330 },
    { "epoch": 1.9996252576353757, "eval_loss": 0.056735917925834656, "eval_runtime": 450.721, "eval_samples_per_second": 39.885, "eval_steps_per_second": 0.623, "step": 1334 },
    { "epoch": 2.0086190743863592, "grad_norm": 2.693760600411948, "learning_rate": 5e-06, "loss": 0.3009, "step": 1340 },
    { "epoch": 2.0236087689713322, "grad_norm": 1.8123948443076334, "learning_rate": 5e-06, "loss": 0.2445, "step": 1350 },
    { "epoch": 2.0385984635563053, "grad_norm": 1.8233844327944795, "learning_rate": 5e-06, "loss": 0.2404, "step": 1360 },
    { "epoch": 2.053588158141278, "grad_norm": 1.659782737096887, "learning_rate": 5e-06, "loss": 0.2396, "step": 1370 },
    { "epoch": 2.068577852726251, "grad_norm": 1.6369760989914484, "learning_rate": 5e-06, "loss": 0.2399, "step": 1380 },
    { "epoch": 2.0835675473112234, "grad_norm": 1.6159958358268898, "learning_rate": 5e-06, "loss": 0.2398, "step": 1390 },
    { "epoch": 2.0985572418961964, "grad_norm": 1.8629592527737424, "learning_rate": 5e-06, "loss": 0.2404, "step": 1400 },
    { "epoch": 2.1135469364811694, "grad_norm": 1.9402493917715093, "learning_rate": 5e-06, "loss": 0.2406, "step": 1410 },
    { "epoch": 2.128536631066142, "grad_norm": 1.9914438604583353, "learning_rate": 5e-06, "loss": 0.2445, "step": 1420 },
    { "epoch": 2.143526325651115, "grad_norm": 1.6486122695508032, "learning_rate": 5e-06, "loss": 0.2428, "step": 1430 },
    { "epoch": 2.1585160202360876, "grad_norm": 1.5254691785643584, "learning_rate": 5e-06, "loss": 0.2443, "step": 1440 },
    { "epoch": 2.1735057148210606, "grad_norm": 1.9133206318932665, "learning_rate": 5e-06, "loss": 0.2431, "step": 1450 },
    { "epoch": 2.188495409406033, "grad_norm": 1.6550777901047227, "learning_rate": 5e-06, "loss": 0.2448, "step": 1460 },
    { "epoch": 2.203485103991006, "grad_norm": 1.8001235821461439, "learning_rate": 5e-06, "loss": 0.247, "step": 1470 },
    { "epoch": 2.218474798575979, "grad_norm": 1.6442525813509412, "learning_rate": 5e-06, "loss": 0.2491, "step": 1480 },
    { "epoch": 2.2334644931609517, "grad_norm": 1.6551298330853466, "learning_rate": 5e-06, "loss": 0.2485, "step": 1490 },
    { "epoch": 2.2484541877459248, "grad_norm": 1.691060089285524, "learning_rate": 5e-06, "loss": 0.2463, "step": 1500 },
    { "epoch": 2.2634438823308973, "grad_norm": 1.680873308991847, "learning_rate": 5e-06, "loss": 0.2486, "step": 1510 },
    { "epoch": 2.2784335769158703, "grad_norm": 1.5264889834969293, "learning_rate": 5e-06, "loss": 0.2497, "step": 1520 },
    { "epoch": 2.2934232715008434, "grad_norm": 1.9490999202092631, "learning_rate": 5e-06, "loss": 0.2492, "step": 1530 },
    { "epoch": 2.308412966085816, "grad_norm": 1.5890774001111174, "learning_rate": 5e-06, "loss": 0.2503, "step": 1540 },
    { "epoch": 2.323402660670789, "grad_norm": 1.8381634501461115, "learning_rate": 5e-06, "loss": 0.2505, "step": 1550 },
    { "epoch": 2.3383923552557615, "grad_norm": 1.8112499776123423, "learning_rate": 5e-06, "loss": 0.2518, "step": 1560 },
    { "epoch": 2.3533820498407345, "grad_norm": 1.7701674873383388, "learning_rate": 5e-06, "loss": 0.2504, "step": 1570 },
    { "epoch": 2.3683717444257075, "grad_norm": 1.7787057235293418, "learning_rate": 5e-06, "loss": 0.2511, "step": 1580 },
    { "epoch": 2.38336143901068, "grad_norm": 1.6220489456862308, "learning_rate": 5e-06, "loss": 0.2517, "step": 1590 },
    { "epoch": 2.398351133595653, "grad_norm": 1.6092461385206882, "learning_rate": 5e-06, "loss": 0.2519, "step": 1600 },
    { "epoch": 2.4133408281806257, "grad_norm": 1.6892803378685273, "learning_rate": 5e-06, "loss": 0.2537, "step": 1610 },
    { "epoch": 2.4283305227655987, "grad_norm": 1.5514590659702057, "learning_rate": 5e-06, "loss": 0.2525, "step": 1620 },
    { "epoch": 2.4433202173505713, "grad_norm": 1.6463095287912044, "learning_rate": 5e-06, "loss": 0.2519, "step": 1630 },
    { "epoch": 2.4583099119355443, "grad_norm": 1.8642417904545223, "learning_rate": 5e-06, "loss": 0.2542, "step": 1640 },
    { "epoch": 2.4732996065205173, "grad_norm": 2.1833462931941483, "learning_rate": 5e-06, "loss": 0.2523, "step": 1650 },
    { "epoch": 2.48828930110549, "grad_norm": 1.7334334035624595, "learning_rate": 5e-06, "loss": 0.2525, "step": 1660 },
    { "epoch": 2.503278995690463, "grad_norm": 1.996879481861675, "learning_rate": 5e-06, "loss": 0.2512, "step": 1670 },
    { "epoch": 2.518268690275436, "grad_norm": 1.7646159599515048, "learning_rate": 5e-06, "loss": 0.2543, "step": 1680 },
    { "epoch": 2.5332583848604084, "grad_norm": 1.5280180383047155, "learning_rate": 5e-06, "loss": 0.2542, "step": 1690 },
    { "epoch": 2.5482480794453815, "grad_norm": 1.6854655209849605, "learning_rate": 5e-06, "loss": 0.2543, "step": 1700 },
    { "epoch": 2.563237774030354, "grad_norm": 1.614184312338993, "learning_rate": 5e-06, "loss": 0.2547, "step": 1710 },
    { "epoch": 2.578227468615327, "grad_norm": 1.7759088208026037, "learning_rate": 5e-06, "loss": 0.2577, "step": 1720 },
    { "epoch": 2.5932171632002996, "grad_norm": 1.7074767316934885, "learning_rate": 5e-06, "loss": 0.2585, "step": 1730 },
    { "epoch": 2.6082068577852726, "grad_norm": 1.666840211483153, "learning_rate": 5e-06, "loss": 0.2566, "step": 1740 },
    { "epoch": 2.6231965523702456, "grad_norm": 1.5612601726380533, "learning_rate": 5e-06, "loss": 0.2577, "step": 1750 },
    { "epoch": 2.638186246955218, "grad_norm": 1.6048678229351492, "learning_rate": 5e-06, "loss": 0.2584, "step": 1760 },
    { "epoch": 2.653175941540191, "grad_norm": 1.535012707234654, "learning_rate": 5e-06, "loss": 0.258, "step": 1770 },
    { "epoch": 2.668165636125164, "grad_norm": 1.5594137106457704, "learning_rate": 5e-06, "loss": 0.2596, "step": 1780 },
    { "epoch": 2.683155330710137, "grad_norm": 1.7412507218140363, "learning_rate": 5e-06, "loss": 0.259, "step": 1790 },
    { "epoch": 2.6981450252951094, "grad_norm": 1.8620927203026323, "learning_rate": 5e-06, "loss": 0.2613, "step": 1800 },
    { "epoch": 2.7131347198800824, "grad_norm": 1.5875808505192006, "learning_rate": 5e-06, "loss": 0.2585, "step": 1810 },
    { "epoch": 2.7281244144650554, "grad_norm": 1.6955850941699981, "learning_rate": 5e-06, "loss": 0.2597, "step": 1820 },
    { "epoch": 2.743114109050028, "grad_norm": 1.7569205126042602, "learning_rate": 5e-06, "loss": 0.2571, "step": 1830 },
    { "epoch": 2.758103803635001, "grad_norm": 1.519918624729941, "learning_rate": 5e-06, "loss": 0.263, "step": 1840 },
    { "epoch": 2.773093498219974, "grad_norm": 1.5838501646665883, "learning_rate": 5e-06, "loss": 0.2608, "step": 1850 },
    { "epoch": 2.7880831928049465, "grad_norm": 1.6657424039076156, "learning_rate": 5e-06, "loss": 0.2613, "step": 1860 },
    { "epoch": 2.8030728873899196, "grad_norm": 1.595659803150772, "learning_rate": 5e-06, "loss": 0.2611, "step": 1870 },
    { "epoch": 2.818062581974892, "grad_norm": 1.7789456591238404, "learning_rate": 5e-06, "loss": 0.2622, "step": 1880 },
    { "epoch": 2.833052276559865, "grad_norm": 1.6114699686236196, "learning_rate": 5e-06, "loss": 0.2615, "step": 1890 },
    { "epoch": 2.8480419711448377, "grad_norm": 1.6465619591478904, "learning_rate": 5e-06, "loss": 0.263, "step": 1900 },
    { "epoch": 2.8630316657298107, "grad_norm": 1.6736896325417563, "learning_rate": 5e-06, "loss": 0.2649, "step": 1910 },
    { "epoch": 2.8780213603147837, "grad_norm": 1.52817706604071, "learning_rate": 5e-06, "loss": 0.2611, "step": 1920 },
    { "epoch": 2.8930110548997563, "grad_norm": 1.7454962824468059, "learning_rate": 5e-06, "loss": 0.2626, "step": 1930 },
    { "epoch": 2.9080007494847293, "grad_norm": 1.6946473714595274, "learning_rate": 5e-06, "loss": 0.2627, "step": 1940 },
    { "epoch": 2.9229904440697023, "grad_norm": 1.6411356019714227, "learning_rate": 5e-06, "loss": 0.2628, "step": 1950 },
    { "epoch": 2.937980138654675, "grad_norm": 1.9705938078449712, "learning_rate": 5e-06, "loss": 0.2627, "step": 1960 },
    { "epoch": 2.952969833239648, "grad_norm": 1.9589063291434328, "learning_rate": 5e-06, "loss": 0.2646, "step": 1970 },
    { "epoch": 2.9679595278246205, "grad_norm": 1.6622226531628324, "learning_rate": 5e-06, "loss": 0.2637, "step": 1980 },
    { "epoch": 2.9829492224095935, "grad_norm": 1.7651652877550925, "learning_rate": 5e-06, "loss": 0.2637, "step": 1990 },
    { "epoch": 2.997938916994566, "grad_norm": 1.6491028582670904, "learning_rate": 5e-06, "loss": 0.2629, "step": 2000 },
    { "epoch": 2.9994378864530633, "eval_loss": 0.061618607491254807, "eval_runtime": 450.5684, "eval_samples_per_second": 39.898, "eval_steps_per_second": 0.624, "step": 2001 },
    { "epoch": 2.9994378864530633, "step": 2001, "total_flos": 3351540148469760.0, "train_loss": 0.37255630833932246, "train_runtime": 64694.9321, "train_samples_per_second": 15.838, "train_steps_per_second": 0.031 }
  ],
  "logging_steps": 10,
  "max_steps": 2001,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3351540148469760.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}