{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994683678894205,
  "eval_steps": 500,
  "global_step": 940,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01063264221158958,
      "grad_norm": 5.609270095825195,
      "learning_rate": 0.00019808510638297873,
      "loss": 2.4434,
      "step": 10
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 4.589075088500977,
      "learning_rate": 0.00019595744680851065,
      "loss": 1.6902,
      "step": 20
    },
    {
      "epoch": 0.03189792663476874,
      "grad_norm": 3.7465291023254395,
      "learning_rate": 0.00019382978723404257,
      "loss": 1.3148,
      "step": 30
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 3.543064594268799,
      "learning_rate": 0.00019170212765957448,
      "loss": 1.4302,
      "step": 40
    },
    {
      "epoch": 0.0531632110579479,
      "grad_norm": 2.68544340133667,
      "learning_rate": 0.0001895744680851064,
      "loss": 1.3222,
      "step": 50
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 2.752901792526245,
      "learning_rate": 0.00018744680851063832,
      "loss": 1.2792,
      "step": 60
    },
    {
      "epoch": 0.07442849548112707,
      "grad_norm": 2.7944841384887695,
      "learning_rate": 0.0001853191489361702,
      "loss": 1.3764,
      "step": 70
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 3.0340654850006104,
      "learning_rate": 0.00018319148936170215,
      "loss": 1.2255,
      "step": 80
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 2.5017054080963135,
      "learning_rate": 0.00018106382978723404,
      "loss": 1.1689,
      "step": 90
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 4.572251319885254,
      "learning_rate": 0.00017893617021276596,
      "loss": 1.1418,
      "step": 100
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 3.354853630065918,
      "learning_rate": 0.00017680851063829787,
      "loss": 1.3103,
      "step": 110
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 2.387272834777832,
      "learning_rate": 0.0001746808510638298,
      "loss": 1.2848,
      "step": 120
    },
    {
      "epoch": 0.13822434875066453,
      "grad_norm": 2.579465627670288,
      "learning_rate": 0.0001725531914893617,
      "loss": 1.2395,
      "step": 130
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 2.9512267112731934,
      "learning_rate": 0.00017042553191489362,
      "loss": 1.3716,
      "step": 140
    },
    {
      "epoch": 0.1594896331738437,
      "grad_norm": 2.6200809478759766,
      "learning_rate": 0.00016829787234042554,
      "loss": 1.2647,
      "step": 150
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 2.7764666080474854,
      "learning_rate": 0.00016617021276595746,
      "loss": 1.0862,
      "step": 160
    },
    {
      "epoch": 0.18075491759702286,
      "grad_norm": 2.454061269760132,
      "learning_rate": 0.00016404255319148937,
      "loss": 1.2213,
      "step": 170
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 2.483651876449585,
      "learning_rate": 0.0001619148936170213,
      "loss": 1.1895,
      "step": 180
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 3.5856575965881348,
      "learning_rate": 0.0001597872340425532,
      "loss": 1.2322,
      "step": 190
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 2.1436448097229004,
      "learning_rate": 0.00015765957446808512,
      "loss": 1.2782,
      "step": 200
    },
    {
      "epoch": 0.22328548644338117,
      "grad_norm": 2.569831609725952,
      "learning_rate": 0.00015553191489361701,
      "loss": 1.171,
      "step": 210
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 2.5455546379089355,
      "learning_rate": 0.00015340425531914896,
      "loss": 1.3055,
      "step": 220
    },
    {
      "epoch": 0.24455077086656035,
      "grad_norm": 1.7153586149215698,
      "learning_rate": 0.00015127659574468085,
      "loss": 1.0822,
      "step": 230
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 2.549631357192993,
      "learning_rate": 0.00014914893617021276,
      "loss": 1.3206,
      "step": 240
    },
    {
      "epoch": 0.2658160552897395,
      "grad_norm": 2.53717041015625,
      "learning_rate": 0.00014702127659574468,
      "loss": 1.1995,
      "step": 250
    },
    {
      "epoch": 0.27644869750132905,
      "grad_norm": 2.331685781478882,
      "learning_rate": 0.0001448936170212766,
      "loss": 1.2426,
      "step": 260
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 2.6866092681884766,
      "learning_rate": 0.00014276595744680851,
      "loss": 1.1314,
      "step": 270
    },
    {
      "epoch": 0.29771398192450826,
      "grad_norm": 2.107909679412842,
      "learning_rate": 0.00014063829787234043,
      "loss": 1.1542,
      "step": 280
    },
    {
      "epoch": 0.3083466241360978,
      "grad_norm": 1.8758138418197632,
      "learning_rate": 0.00013851063829787235,
      "loss": 1.1377,
      "step": 290
    },
    {
      "epoch": 0.3189792663476874,
      "grad_norm": 1.647929072380066,
      "learning_rate": 0.00013638297872340427,
      "loss": 1.0096,
      "step": 300
    },
    {
      "epoch": 0.32961190855927697,
      "grad_norm": 2.186124563217163,
      "learning_rate": 0.00013425531914893618,
      "loss": 1.1746,
      "step": 310
    },
    {
      "epoch": 0.34024455077086657,
      "grad_norm": 2.4536380767822266,
      "learning_rate": 0.0001321276595744681,
      "loss": 1.1833,
      "step": 320
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 1.8024215698242188,
      "learning_rate": 0.00013000000000000002,
      "loss": 0.9309,
      "step": 330
    },
    {
      "epoch": 0.3615098351940457,
      "grad_norm": 3.0355000495910645,
      "learning_rate": 0.0001278723404255319,
      "loss": 1.0863,
      "step": 340
    },
    {
      "epoch": 0.3721424774056353,
      "grad_norm": 1.9415550231933594,
      "learning_rate": 0.00012574468085106382,
      "loss": 1.0507,
      "step": 350
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 2.327995538711548,
      "learning_rate": 0.00012361702127659577,
      "loss": 1.2524,
      "step": 360
    },
    {
      "epoch": 0.3934077618288145,
      "grad_norm": 2.001037120819092,
      "learning_rate": 0.00012148936170212766,
      "loss": 1.0437,
      "step": 370
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 2.1419551372528076,
      "learning_rate": 0.00011936170212765959,
      "loss": 1.0968,
      "step": 380
    },
    {
      "epoch": 0.41467304625199364,
      "grad_norm": 2.3085482120513916,
      "learning_rate": 0.0001172340425531915,
      "loss": 1.1706,
      "step": 390
    },
    {
      "epoch": 0.4253056884635832,
      "grad_norm": 4.618401050567627,
      "learning_rate": 0.0001151063829787234,
      "loss": 1.0685,
      "step": 400
    },
    {
      "epoch": 0.4359383306751728,
      "grad_norm": 2.421363115310669,
      "learning_rate": 0.00011297872340425532,
      "loss": 1.0373,
      "step": 410
    },
    {
      "epoch": 0.44657097288676234,
      "grad_norm": 1.6373859643936157,
      "learning_rate": 0.00011085106382978725,
      "loss": 1.0669,
      "step": 420
    },
    {
      "epoch": 0.45720361509835195,
      "grad_norm": 2.3031554222106934,
      "learning_rate": 0.00010872340425531916,
      "loss": 1.1394,
      "step": 430
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 1.9488067626953125,
      "learning_rate": 0.00010659574468085107,
      "loss": 1.0347,
      "step": 440
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 1.8650946617126465,
      "learning_rate": 0.00010446808510638298,
      "loss": 1.1159,
      "step": 450
    },
    {
      "epoch": 0.4891015417331207,
      "grad_norm": 1.8462837934494019,
      "learning_rate": 0.0001023404255319149,
      "loss": 1.0389,
      "step": 460
    },
    {
      "epoch": 0.49973418394471025,
      "grad_norm": 2.5941386222839355,
      "learning_rate": 0.00010021276595744682,
      "loss": 1.1587,
      "step": 470
    },
    {
      "epoch": 0.5103668261562998,
      "grad_norm": 1.34873366355896,
      "learning_rate": 9.808510638297873e-05,
      "loss": 1.1095,
      "step": 480
    },
    {
      "epoch": 0.5209994683678895,
      "grad_norm": 2.2580478191375732,
      "learning_rate": 9.595744680851064e-05,
      "loss": 1.1268,
      "step": 490
    },
    {
      "epoch": 0.531632110579479,
      "grad_norm": 2.389127731323242,
      "learning_rate": 9.382978723404256e-05,
      "loss": 1.2718,
      "step": 500
    },
    {
      "epoch": 0.5422647527910686,
      "grad_norm": 2.1379384994506836,
      "learning_rate": 9.170212765957448e-05,
      "loss": 1.0394,
      "step": 510
    },
    {
      "epoch": 0.5528973950026581,
      "grad_norm": 2.5996925830841064,
      "learning_rate": 8.95744680851064e-05,
      "loss": 1.0508,
      "step": 520
    },
    {
      "epoch": 0.5635300372142478,
      "grad_norm": 2.143913984298706,
      "learning_rate": 8.74468085106383e-05,
      "loss": 1.0038,
      "step": 530
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 2.285888910293579,
      "learning_rate": 8.531914893617021e-05,
      "loss": 1.2064,
      "step": 540
    },
    {
      "epoch": 0.5847953216374269,
      "grad_norm": 2.3260293006896973,
      "learning_rate": 8.319148936170213e-05,
      "loss": 1.0499,
      "step": 550
    },
    {
      "epoch": 0.5954279638490165,
      "grad_norm": 2.3417248725891113,
      "learning_rate": 8.106382978723405e-05,
      "loss": 1.1371,
      "step": 560
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 2.194345474243164,
      "learning_rate": 7.893617021276596e-05,
      "loss": 1.0571,
      "step": 570
    },
    {
      "epoch": 0.6166932482721956,
      "grad_norm": 2.3759639263153076,
      "learning_rate": 7.680851063829788e-05,
      "loss": 0.9709,
      "step": 580
    },
    {
      "epoch": 0.6273258904837852,
      "grad_norm": 1.7851307392120361,
      "learning_rate": 7.46808510638298e-05,
      "loss": 1.0751,
      "step": 590
    },
    {
      "epoch": 0.6379585326953748,
      "grad_norm": 2.1073718070983887,
      "learning_rate": 7.25531914893617e-05,
      "loss": 1.0453,
      "step": 600
    },
    {
      "epoch": 0.6485911749069644,
      "grad_norm": 3.0715222358703613,
      "learning_rate": 7.042553191489362e-05,
      "loss": 1.019,
      "step": 610
    },
    {
      "epoch": 0.6592238171185539,
      "grad_norm": 2.7208268642425537,
      "learning_rate": 6.829787234042554e-05,
      "loss": 0.919,
      "step": 620
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 1.7897045612335205,
      "learning_rate": 6.617021276595745e-05,
      "loss": 0.9964,
      "step": 630
    },
    {
      "epoch": 0.6804891015417331,
      "grad_norm": 2.317929744720459,
      "learning_rate": 6.404255319148937e-05,
      "loss": 1.1598,
      "step": 640
    },
    {
      "epoch": 0.6911217437533227,
      "grad_norm": 1.826894760131836,
      "learning_rate": 6.191489361702127e-05,
      "loss": 1.117,
      "step": 650
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 2.0165112018585205,
      "learning_rate": 5.9787234042553196e-05,
      "loss": 1.0511,
      "step": 660
    },
    {
      "epoch": 0.7123870281765019,
      "grad_norm": 1.6636179685592651,
      "learning_rate": 5.7659574468085106e-05,
      "loss": 1.0488,
      "step": 670
    },
    {
      "epoch": 0.7230196703880915,
      "grad_norm": 2.3491950035095215,
      "learning_rate": 5.553191489361702e-05,
      "loss": 1.2297,
      "step": 680
    },
    {
      "epoch": 0.733652312599681,
      "grad_norm": 2.28796124458313,
      "learning_rate": 5.3404255319148946e-05,
      "loss": 1.1457,
      "step": 690
    },
    {
      "epoch": 0.7442849548112705,
      "grad_norm": 2.550320863723755,
      "learning_rate": 5.1276595744680856e-05,
      "loss": 1.069,
      "step": 700
    },
    {
      "epoch": 0.7549175970228602,
      "grad_norm": 1.5172102451324463,
      "learning_rate": 4.9148936170212766e-05,
      "loss": 0.849,
      "step": 710
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 1.7714675664901733,
      "learning_rate": 4.702127659574468e-05,
      "loss": 1.1457,
      "step": 720
    },
    {
      "epoch": 0.7761828814460393,
      "grad_norm": 1.587169885635376,
      "learning_rate": 4.489361702127659e-05,
      "loss": 0.9438,
      "step": 730
    },
    {
      "epoch": 0.786815523657629,
      "grad_norm": 2.464047908782959,
      "learning_rate": 4.276595744680851e-05,
      "loss": 1.0606,
      "step": 740
    },
    {
      "epoch": 0.7974481658692185,
      "grad_norm": 1.6491392850875854,
      "learning_rate": 4.063829787234043e-05,
      "loss": 1.0333,
      "step": 750
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 2.159282684326172,
      "learning_rate": 3.8510638297872344e-05,
      "loss": 0.9192,
      "step": 760
    },
    {
      "epoch": 0.8187134502923976,
      "grad_norm": 1.6473966836929321,
      "learning_rate": 3.638297872340426e-05,
      "loss": 1.0218,
      "step": 770
    },
    {
      "epoch": 0.8293460925039873,
      "grad_norm": 2.5140249729156494,
      "learning_rate": 3.425531914893617e-05,
      "loss": 1.1425,
      "step": 780
    },
    {
      "epoch": 0.8399787347155768,
      "grad_norm": 1.8191956281661987,
      "learning_rate": 3.212765957446809e-05,
      "loss": 1.0179,
      "step": 790
    },
    {
      "epoch": 0.8506113769271664,
      "grad_norm": 1.570918083190918,
      "learning_rate": 3e-05,
      "loss": 1.0624,
      "step": 800
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 2.4648308753967285,
      "learning_rate": 2.7872340425531918e-05,
      "loss": 1.0768,
      "step": 810
    },
    {
      "epoch": 0.8718766613503456,
      "grad_norm": 2.4284791946411133,
      "learning_rate": 2.574468085106383e-05,
      "loss": 1.0748,
      "step": 820
    },
    {
      "epoch": 0.8825093035619351,
      "grad_norm": 2.543541193008423,
      "learning_rate": 2.3617021276595748e-05,
      "loss": 1.051,
      "step": 830
    },
    {
      "epoch": 0.8931419457735247,
      "grad_norm": 2.0287232398986816,
      "learning_rate": 2.148936170212766e-05,
      "loss": 0.9987,
      "step": 840
    },
    {
      "epoch": 0.9037745879851143,
      "grad_norm": 2.2504048347473145,
      "learning_rate": 1.9361702127659575e-05,
      "loss": 0.9468,
      "step": 850
    },
    {
      "epoch": 0.9144072301967039,
      "grad_norm": 1.889223337173462,
      "learning_rate": 1.723404255319149e-05,
      "loss": 1.1471,
      "step": 860
    },
    {
      "epoch": 0.9250398724082934,
      "grad_norm": 2.414099931716919,
      "learning_rate": 1.5106382978723405e-05,
      "loss": 1.0941,
      "step": 870
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 1.7655644416809082,
      "learning_rate": 1.2978723404255318e-05,
      "loss": 1.0504,
      "step": 880
    },
    {
      "epoch": 0.9463051568314726,
      "grad_norm": 1.6641113758087158,
      "learning_rate": 1.0851063829787235e-05,
      "loss": 1.1144,
      "step": 890
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 2.2806735038757324,
      "learning_rate": 8.72340425531915e-06,
      "loss": 1.0884,
      "step": 900
    },
    {
      "epoch": 0.9675704412546517,
      "grad_norm": 2.1201162338256836,
      "learning_rate": 6.595744680851064e-06,
      "loss": 1.0405,
      "step": 910
    },
    {
      "epoch": 0.9782030834662414,
      "grad_norm": 1.651154637336731,
      "learning_rate": 4.468085106382979e-06,
      "loss": 0.9533,
      "step": 920
    },
    {
      "epoch": 0.988835725677831,
      "grad_norm": 2.6276893615722656,
      "learning_rate": 2.3404255319148935e-06,
      "loss": 1.0675,
      "step": 930
    },
    {
      "epoch": 0.9994683678894205,
      "grad_norm": 1.7685601711273193,
      "learning_rate": 2.1276595744680852e-07,
      "loss": 0.8962,
      "step": 940
    }
  ],
  "logging_steps": 10,
  "max_steps": 940,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2872735048949760.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
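The `log_history` array above is straightforward to consume programmatically. Below is a minimal Python sketch for plotting the training-loss curve and the (linearly decaying) learning-rate schedule from this file; it assumes the state is saved under the standard Trainer filename `trainer_state.json` and that `matplotlib` is installed — both are assumptions, not part of the file itself.

```python
import json

import matplotlib.pyplot as plt

# Load the state file written by transformers.Trainer alongside each checkpoint.
# Adjust the path if the file lives inside a checkpoint-XXX directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history holds one entry per `logging_steps` (here: every 10 steps).
# Filter on "loss" in case eval entries with different keys are interleaved.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

fig, loss_ax = plt.subplots()
loss_ax.plot(steps, losses, label="training loss")
loss_ax.set_xlabel("step")
loss_ax.set_ylabel("loss")

# Second y-axis for the learning-rate schedule, which decays
# linearly from 2e-4 toward zero over the 940 steps logged above.
lr_ax = loss_ax.twinx()
lr_ax.plot(steps, lrs, color="gray", alpha=0.6)
lr_ax.set_ylabel("learning rate")

fig.tight_layout()
plt.show()
```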