|
{ |
|
"best_metric": 0.0350230410695076, |
|
"best_model_checkpoint": "saves/psy-course/Llama3-OpenBioLLM-8B/train/fold3/checkpoint-1300", |
|
"epoch": 4.9976479443033215, |
|
"eval_steps": 50, |
|
"global_step": 3320, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015053156458744943, |
|
"grad_norm": 6.178123474121094, |
|
"learning_rate": 3.0120481927710846e-06, |
|
"loss": 1.3483, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030106312917489886, |
|
"grad_norm": 7.668521404266357, |
|
"learning_rate": 6.024096385542169e-06, |
|
"loss": 1.2949, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04515946937623483, |
|
"grad_norm": 4.06071138381958, |
|
"learning_rate": 9.036144578313253e-06, |
|
"loss": 1.1399, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06021262583497977, |
|
"grad_norm": 2.9999659061431885, |
|
"learning_rate": 1.2048192771084338e-05, |
|
"loss": 0.9064, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07526578229372471, |
|
"grad_norm": 1.4494060277938843, |
|
"learning_rate": 1.5060240963855424e-05, |
|
"loss": 0.4566, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07526578229372471, |
|
"eval_loss": 0.31506142020225525, |
|
"eval_runtime": 156.762, |
|
"eval_samples_per_second": 7.534, |
|
"eval_steps_per_second": 7.534, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09031893875246966, |
|
"grad_norm": 0.9608872532844543, |
|
"learning_rate": 1.8072289156626505e-05, |
|
"loss": 0.2427, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1053720952112146, |
|
"grad_norm": 2.3095591068267822, |
|
"learning_rate": 2.1084337349397593e-05, |
|
"loss": 0.2342, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12042525166995954, |
|
"grad_norm": 0.8325700163841248, |
|
"learning_rate": 2.4096385542168677e-05, |
|
"loss": 0.1697, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1354784081287045, |
|
"grad_norm": 1.9299920797348022, |
|
"learning_rate": 2.7108433734939758e-05, |
|
"loss": 0.1242, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.15053156458744943, |
|
"grad_norm": 1.137899398803711, |
|
"learning_rate": 3.012048192771085e-05, |
|
"loss": 0.1039, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15053156458744943, |
|
"eval_loss": 0.09335148334503174, |
|
"eval_runtime": 156.9195, |
|
"eval_samples_per_second": 7.526, |
|
"eval_steps_per_second": 7.526, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16558472104619437, |
|
"grad_norm": 2.3025782108306885, |
|
"learning_rate": 3.313253012048193e-05, |
|
"loss": 0.0837, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.18063787750493931, |
|
"grad_norm": 1.0972262620925903, |
|
"learning_rate": 3.614457831325301e-05, |
|
"loss": 0.1008, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19569103396368426, |
|
"grad_norm": 0.8031754493713379, |
|
"learning_rate": 3.91566265060241e-05, |
|
"loss": 0.0816, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2107441904224292, |
|
"grad_norm": 2.3960483074188232, |
|
"learning_rate": 4.2168674698795186e-05, |
|
"loss": 0.0828, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.22579734688117414, |
|
"grad_norm": 1.7464011907577515, |
|
"learning_rate": 4.5180722891566266e-05, |
|
"loss": 0.0879, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22579734688117414, |
|
"eval_loss": 0.07289445400238037, |
|
"eval_runtime": 156.9574, |
|
"eval_samples_per_second": 7.524, |
|
"eval_steps_per_second": 7.524, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2408505033399191, |
|
"grad_norm": 0.8709592223167419, |
|
"learning_rate": 4.8192771084337354e-05, |
|
"loss": 0.0821, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.25590365979866403, |
|
"grad_norm": 0.9007524847984314, |
|
"learning_rate": 5.120481927710844e-05, |
|
"loss": 0.0799, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.270956816257409, |
|
"grad_norm": 1.1023762226104736, |
|
"learning_rate": 5.4216867469879516e-05, |
|
"loss": 0.0729, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2860099727161539, |
|
"grad_norm": 1.223743200302124, |
|
"learning_rate": 5.72289156626506e-05, |
|
"loss": 0.0699, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.30106312917489886, |
|
"grad_norm": 1.030595064163208, |
|
"learning_rate": 6.02409638554217e-05, |
|
"loss": 0.0684, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.30106312917489886, |
|
"eval_loss": 0.06437338143587112, |
|
"eval_runtime": 156.9724, |
|
"eval_samples_per_second": 7.524, |
|
"eval_steps_per_second": 7.524, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3161162856336438, |
|
"grad_norm": 0.8338972926139832, |
|
"learning_rate": 6.325301204819278e-05, |
|
"loss": 0.0706, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.33116944209238874, |
|
"grad_norm": 1.3395456075668335, |
|
"learning_rate": 6.626506024096386e-05, |
|
"loss": 0.066, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3462225985511337, |
|
"grad_norm": 0.8639088869094849, |
|
"learning_rate": 6.927710843373494e-05, |
|
"loss": 0.0741, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.36127575500987863, |
|
"grad_norm": 0.9407922029495239, |
|
"learning_rate": 7.228915662650602e-05, |
|
"loss": 0.0612, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3763289114686236, |
|
"grad_norm": 1.3013204336166382, |
|
"learning_rate": 7.530120481927712e-05, |
|
"loss": 0.0696, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3763289114686236, |
|
"eval_loss": 0.060398537665605545, |
|
"eval_runtime": 156.4906, |
|
"eval_samples_per_second": 7.547, |
|
"eval_steps_per_second": 7.547, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3913820679273685, |
|
"grad_norm": 0.6307721138000488, |
|
"learning_rate": 7.83132530120482e-05, |
|
"loss": 0.0436, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.40643522438611346, |
|
"grad_norm": 0.4275800585746765, |
|
"learning_rate": 8.132530120481928e-05, |
|
"loss": 0.0543, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4214883808448584, |
|
"grad_norm": 0.3509841561317444, |
|
"learning_rate": 8.433734939759037e-05, |
|
"loss": 0.0613, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.43654153730360334, |
|
"grad_norm": 0.2633926272392273, |
|
"learning_rate": 8.734939759036145e-05, |
|
"loss": 0.0506, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4515946937623483, |
|
"grad_norm": 1.0434097051620483, |
|
"learning_rate": 9.036144578313253e-05, |
|
"loss": 0.064, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4515946937623483, |
|
"eval_loss": 0.053826671093702316, |
|
"eval_runtime": 156.4877, |
|
"eval_samples_per_second": 7.547, |
|
"eval_steps_per_second": 7.547, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.46664785022109323, |
|
"grad_norm": 0.2717237174510956, |
|
"learning_rate": 9.337349397590361e-05, |
|
"loss": 0.0469, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4817010066798382, |
|
"grad_norm": 0.8223183155059814, |
|
"learning_rate": 9.638554216867471e-05, |
|
"loss": 0.0784, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4967541631385831, |
|
"grad_norm": 0.9409707188606262, |
|
"learning_rate": 9.939759036144579e-05, |
|
"loss": 0.056, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5118073195973281, |
|
"grad_norm": 0.4314805865287781, |
|
"learning_rate": 9.999823129264712e-05, |
|
"loss": 0.0396, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.526860476056073, |
|
"grad_norm": 0.5084919333457947, |
|
"learning_rate": 9.999104613348688e-05, |
|
"loss": 0.048, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.526860476056073, |
|
"eval_loss": 0.055311419069767, |
|
"eval_runtime": 156.4635, |
|
"eval_samples_per_second": 7.548, |
|
"eval_steps_per_second": 7.548, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.541913632514818, |
|
"grad_norm": 0.37716764211654663, |
|
"learning_rate": 9.997833477197385e-05, |
|
"loss": 0.0501, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5569667889735629, |
|
"grad_norm": 0.7053455114364624, |
|
"learning_rate": 9.996009861327077e-05, |
|
"loss": 0.0474, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5720199454323078, |
|
"grad_norm": 0.5760583877563477, |
|
"learning_rate": 9.993633967327269e-05, |
|
"loss": 0.0538, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5870731018910528, |
|
"grad_norm": 0.6930764317512512, |
|
"learning_rate": 9.990706057838416e-05, |
|
"loss": 0.0417, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6021262583497977, |
|
"grad_norm": 0.5816176533699036, |
|
"learning_rate": 9.987226456522884e-05, |
|
"loss": 0.0568, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6021262583497977, |
|
"eval_loss": 0.05042395368218422, |
|
"eval_runtime": 156.5219, |
|
"eval_samples_per_second": 7.545, |
|
"eval_steps_per_second": 7.545, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6171794148085427, |
|
"grad_norm": 0.5168653726577759, |
|
"learning_rate": 9.983195548029173e-05, |
|
"loss": 0.0403, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6322325712672876, |
|
"grad_norm": 0.9114458560943604, |
|
"learning_rate": 9.9786137779494e-05, |
|
"loss": 0.0613, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6472857277260325, |
|
"grad_norm": 0.3248330354690552, |
|
"learning_rate": 9.973481652770038e-05, |
|
"loss": 0.0427, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6623388841847775, |
|
"grad_norm": 0.35154685378074646, |
|
"learning_rate": 9.967799739815925e-05, |
|
"loss": 0.0413, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6773920406435224, |
|
"grad_norm": 0.813867449760437, |
|
"learning_rate": 9.961568667187556e-05, |
|
"loss": 0.0548, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6773920406435224, |
|
"eval_loss": 0.046242497861385345, |
|
"eval_runtime": 156.7192, |
|
"eval_samples_per_second": 7.536, |
|
"eval_steps_per_second": 7.536, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6924451971022674, |
|
"grad_norm": 0.8682682514190674, |
|
"learning_rate": 9.954789123691642e-05, |
|
"loss": 0.0444, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7074983535610123, |
|
"grad_norm": 0.4860692322254181, |
|
"learning_rate": 9.947461858764978e-05, |
|
"loss": 0.0486, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7225515100197573, |
|
"grad_norm": 0.49391481280326843, |
|
"learning_rate": 9.939587682391586e-05, |
|
"loss": 0.0423, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7376046664785022, |
|
"grad_norm": 0.6448398232460022, |
|
"learning_rate": 9.931167465013182e-05, |
|
"loss": 0.0604, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7526578229372471, |
|
"grad_norm": 0.7871343493461609, |
|
"learning_rate": 9.922202137432955e-05, |
|
"loss": 0.0387, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7526578229372471, |
|
"eval_loss": 0.04570082202553749, |
|
"eval_runtime": 157.0432, |
|
"eval_samples_per_second": 7.52, |
|
"eval_steps_per_second": 7.52, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7677109793959921, |
|
"grad_norm": 0.16567155718803406, |
|
"learning_rate": 9.912692690712665e-05, |
|
"loss": 0.0443, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.782764135854737, |
|
"grad_norm": 0.49851304292678833, |
|
"learning_rate": 9.902640176063103e-05, |
|
"loss": 0.0482, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.797817292313482, |
|
"grad_norm": 0.27470889687538147, |
|
"learning_rate": 9.892045704727864e-05, |
|
"loss": 0.0529, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8128704487722269, |
|
"grad_norm": 0.4213567078113556, |
|
"learning_rate": 9.880910447860527e-05, |
|
"loss": 0.0636, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8279236052309719, |
|
"grad_norm": 0.42453011870384216, |
|
"learning_rate": 9.869235636395177e-05, |
|
"loss": 0.0454, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8279236052309719, |
|
"eval_loss": 0.043903350830078125, |
|
"eval_runtime": 157.299, |
|
"eval_samples_per_second": 7.508, |
|
"eval_steps_per_second": 7.508, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8429767616897168, |
|
"grad_norm": 0.7841992378234863, |
|
"learning_rate": 9.857022560910338e-05, |
|
"loss": 0.0639, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8580299181484617, |
|
"grad_norm": 0.6617533564567566, |
|
"learning_rate": 9.844272571486311e-05, |
|
"loss": 0.0414, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8730830746072067, |
|
"grad_norm": 0.23420529067516327, |
|
"learning_rate": 9.830987077555924e-05, |
|
"loss": 0.0569, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8881362310659516, |
|
"grad_norm": 0.5472472906112671, |
|
"learning_rate": 9.817167547748729e-05, |
|
"loss": 0.0629, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.9031893875246966, |
|
"grad_norm": 0.2491326779127121, |
|
"learning_rate": 9.802815509728662e-05, |
|
"loss": 0.0343, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9031893875246966, |
|
"eval_loss": 0.042122405022382736, |
|
"eval_runtime": 157.5417, |
|
"eval_samples_per_second": 7.496, |
|
"eval_steps_per_second": 7.496, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9182425439834415, |
|
"grad_norm": 0.2855018377304077, |
|
"learning_rate": 9.787932550025158e-05, |
|
"loss": 0.0373, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9332957004421865, |
|
"grad_norm": 0.36227959394454956, |
|
"learning_rate": 9.772520313857775e-05, |
|
"loss": 0.0413, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9483488569009314, |
|
"grad_norm": 0.425972044467926, |
|
"learning_rate": 9.756580504954334e-05, |
|
"loss": 0.0462, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9634020133596763, |
|
"grad_norm": 0.35800591111183167, |
|
"learning_rate": 9.740114885362562e-05, |
|
"loss": 0.047, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9784551698184213, |
|
"grad_norm": 0.28469201922416687, |
|
"learning_rate": 9.723125275255325e-05, |
|
"loss": 0.0363, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9784551698184213, |
|
"eval_loss": 0.0401780903339386, |
|
"eval_runtime": 157.68, |
|
"eval_samples_per_second": 7.49, |
|
"eval_steps_per_second": 7.49, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9935083262771662, |
|
"grad_norm": 0.2893456220626831, |
|
"learning_rate": 9.705613552729415e-05, |
|
"loss": 0.0306, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0085614827359113, |
|
"grad_norm": 0.29287856817245483, |
|
"learning_rate": 9.68758165359794e-05, |
|
"loss": 0.0412, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0236146391946561, |
|
"grad_norm": 0.30157166719436646, |
|
"learning_rate": 9.669031571176322e-05, |
|
"loss": 0.0404, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0386677956534012, |
|
"grad_norm": 0.29122719168663025, |
|
"learning_rate": 9.64996535606196e-05, |
|
"loss": 0.0377, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.053720952112146, |
|
"grad_norm": 0.1511625498533249, |
|
"learning_rate": 9.630385115907545e-05, |
|
"loss": 0.0272, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.053720952112146, |
|
"eval_loss": 0.04149024188518524, |
|
"eval_runtime": 157.9474, |
|
"eval_samples_per_second": 7.477, |
|
"eval_steps_per_second": 7.477, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.068774108570891, |
|
"grad_norm": 0.19845999777317047, |
|
"learning_rate": 9.610293015188067e-05, |
|
"loss": 0.0345, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.083827265029636, |
|
"grad_norm": 0.3642478287220001, |
|
"learning_rate": 9.589691274961556e-05, |
|
"loss": 0.0271, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.098880421488381, |
|
"grad_norm": 0.3590589761734009, |
|
"learning_rate": 9.568582172623544e-05, |
|
"loss": 0.0437, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1139335779471258, |
|
"grad_norm": 0.5430022478103638, |
|
"learning_rate": 9.546968041655326e-05, |
|
"loss": 0.0355, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1289867344058706, |
|
"grad_norm": 0.18729464709758759, |
|
"learning_rate": 9.524851271366001e-05, |
|
"loss": 0.0324, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1289867344058706, |
|
"eval_loss": 0.03844917565584183, |
|
"eval_runtime": 158.1863, |
|
"eval_samples_per_second": 7.466, |
|
"eval_steps_per_second": 7.466, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1440398908646157, |
|
"grad_norm": 0.26876088976860046, |
|
"learning_rate": 9.502234306628355e-05, |
|
"loss": 0.0288, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1590930473233607, |
|
"grad_norm": 0.3723165690898895, |
|
"learning_rate": 9.47911964760858e-05, |
|
"loss": 0.0327, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1741462037821055, |
|
"grad_norm": 0.4213339686393738, |
|
"learning_rate": 9.455509849489915e-05, |
|
"loss": 0.0284, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1891993602408506, |
|
"grad_norm": 0.3251422047615051, |
|
"learning_rate": 9.431407522190175e-05, |
|
"loss": 0.0361, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2042525166995954, |
|
"grad_norm": 0.23341499269008636, |
|
"learning_rate": 9.406815330073244e-05, |
|
"loss": 0.0394, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2042525166995954, |
|
"eval_loss": 0.042881306260824203, |
|
"eval_runtime": 158.3508, |
|
"eval_samples_per_second": 7.458, |
|
"eval_steps_per_second": 7.458, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2193056731583405, |
|
"grad_norm": 0.24621306359767914, |
|
"learning_rate": 9.381735991654546e-05, |
|
"loss": 0.0394, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2343588296170853, |
|
"grad_norm": 0.4738430380821228, |
|
"learning_rate": 9.356172279300528e-05, |
|
"loss": 0.038, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2494119860758304, |
|
"grad_norm": 0.16725154221057892, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0351, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2644651425345752, |
|
"grad_norm": 0.1837790161371231, |
|
"learning_rate": 9.303603089662716e-05, |
|
"loss": 0.0379, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2795182989933203, |
|
"grad_norm": 0.2514703869819641, |
|
"learning_rate": 9.276603423579164e-05, |
|
"loss": 0.0297, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.2795182989933203, |
|
"eval_loss": 0.04107122868299484, |
|
"eval_runtime": 158.521, |
|
"eval_samples_per_second": 7.45, |
|
"eval_steps_per_second": 7.45, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.294571455452065, |
|
"grad_norm": 0.3693402111530304, |
|
"learning_rate": 9.249131005318387e-05, |
|
"loss": 0.0429, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.30962461191081, |
|
"grad_norm": 0.16316485404968262, |
|
"learning_rate": 9.221188871787075e-05, |
|
"loss": 0.0349, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.324677768369555, |
|
"grad_norm": 0.2155531495809555, |
|
"learning_rate": 9.192780111816047e-05, |
|
"loss": 0.037, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3397309248283, |
|
"grad_norm": 0.2901955544948578, |
|
"learning_rate": 9.163907865818806e-05, |
|
"loss": 0.0327, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3547840812870449, |
|
"grad_norm": 0.5211817622184753, |
|
"learning_rate": 9.134575325444376e-05, |
|
"loss": 0.0423, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3547840812870449, |
|
"eval_loss": 0.03963349014520645, |
|
"eval_runtime": 158.651, |
|
"eval_samples_per_second": 7.444, |
|
"eval_steps_per_second": 7.444, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3698372377457897, |
|
"grad_norm": 0.27670544385910034, |
|
"learning_rate": 9.104785733224496e-05, |
|
"loss": 0.0292, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.3848903942045347, |
|
"grad_norm": 0.45413723587989807, |
|
"learning_rate": 9.07454238221517e-05, |
|
"loss": 0.0358, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.3999435506632798, |
|
"grad_norm": 0.2523304522037506, |
|
"learning_rate": 9.043848615632642e-05, |
|
"loss": 0.0294, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4149967071220246, |
|
"grad_norm": 0.2198895364999771, |
|
"learning_rate": 9.012707826483823e-05, |
|
"loss": 0.0375, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4300498635807695, |
|
"grad_norm": 0.3737075626850128, |
|
"learning_rate": 8.98112345719122e-05, |
|
"loss": 0.0324, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4300498635807695, |
|
"eval_loss": 0.03718731552362442, |
|
"eval_runtime": 158.7997, |
|
"eval_samples_per_second": 7.437, |
|
"eval_steps_per_second": 7.437, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4451030200395145, |
|
"grad_norm": 0.6216887831687927, |
|
"learning_rate": 8.949098999212391e-05, |
|
"loss": 0.0385, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4601561764982596, |
|
"grad_norm": 0.2569785416126251, |
|
"learning_rate": 8.916637992653991e-05, |
|
"loss": 0.0321, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4752093329570044, |
|
"grad_norm": 0.5239596366882324, |
|
"learning_rate": 8.883744025880428e-05, |
|
"loss": 0.0299, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.4902624894157492, |
|
"grad_norm": 0.4125153422355652, |
|
"learning_rate": 8.850420735117202e-05, |
|
"loss": 0.0304, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5053156458744943, |
|
"grad_norm": 0.23545460402965546, |
|
"learning_rate": 8.816671804048933e-05, |
|
"loss": 0.0311, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5053156458744943, |
|
"eval_loss": 0.03950938582420349, |
|
"eval_runtime": 158.844, |
|
"eval_samples_per_second": 7.435, |
|
"eval_steps_per_second": 7.435, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.5203688023332393, |
|
"grad_norm": 0.2283906191587448, |
|
"learning_rate": 8.782500963412156e-05, |
|
"loss": 0.0358, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.5354219587919842, |
|
"grad_norm": 0.3116263449192047, |
|
"learning_rate": 8.747911990582912e-05, |
|
"loss": 0.029, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.550475115250729, |
|
"grad_norm": 0.36042335629463196, |
|
"learning_rate": 8.712908709159183e-05, |
|
"loss": 0.0288, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.565528271709474, |
|
"grad_norm": 0.5503274202346802, |
|
"learning_rate": 8.677494988538211e-05, |
|
"loss": 0.0422, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.5805814281682191, |
|
"grad_norm": 0.30002403259277344, |
|
"learning_rate": 8.641674743488769e-05, |
|
"loss": 0.0309, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.5805814281682191, |
|
"eval_loss": 0.04065566882491112, |
|
"eval_runtime": 159.2321, |
|
"eval_samples_per_second": 7.417, |
|
"eval_steps_per_second": 7.417, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.595634584626964, |
|
"grad_norm": 0.34152716398239136, |
|
"learning_rate": 8.605451933718397e-05, |
|
"loss": 0.0377, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.6106877410857088, |
|
"grad_norm": 0.3016291558742523, |
|
"learning_rate": 8.568830563435694e-05, |
|
"loss": 0.0387, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.6257408975444538, |
|
"grad_norm": 0.5669217109680176, |
|
"learning_rate": 8.531814680907664e-05, |
|
"loss": 0.0254, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6407940540031989, |
|
"grad_norm": 0.15337613224983215, |
|
"learning_rate": 8.494408378012209e-05, |
|
"loss": 0.0284, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6558472104619437, |
|
"grad_norm": 0.14289391040802002, |
|
"learning_rate": 8.456615789785804e-05, |
|
"loss": 0.0233, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6558472104619437, |
|
"eval_loss": 0.03772934526205063, |
|
"eval_runtime": 159.1219, |
|
"eval_samples_per_second": 7.422, |
|
"eval_steps_per_second": 7.422, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6709003669206886, |
|
"grad_norm": 0.24611565470695496, |
|
"learning_rate": 8.418441093966385e-05, |
|
"loss": 0.0445, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.6859535233794336, |
|
"grad_norm": 0.21417029201984406, |
|
"learning_rate": 8.379888510531535e-05, |
|
"loss": 0.029, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.7010066798381787, |
|
"grad_norm": 0.3725775182247162, |
|
"learning_rate": 8.340962301231981e-05, |
|
"loss": 0.0224, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.7160598362969235, |
|
"grad_norm": 0.16964785754680634, |
|
"learning_rate": 8.301666769120488e-05, |
|
"loss": 0.0311, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7311129927556683, |
|
"grad_norm": 0.19215409457683563, |
|
"learning_rate": 8.262006258076187e-05, |
|
"loss": 0.0455, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7311129927556683, |
|
"eval_loss": 0.035410501062870026, |
|
"eval_runtime": 159.3978, |
|
"eval_samples_per_second": 7.409, |
|
"eval_steps_per_second": 7.409, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7461661492144134, |
|
"grad_norm": 0.1331547200679779, |
|
"learning_rate": 8.221985152324385e-05, |
|
"loss": 0.0307, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7612193056731584, |
|
"grad_norm": 0.22927172482013702, |
|
"learning_rate": 8.18160787595191e-05, |
|
"loss": 0.0413, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.7762724621319033, |
|
"grad_norm": 0.3783109784126282, |
|
"learning_rate": 8.14087889241806e-05, |
|
"loss": 0.0364, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.791325618590648, |
|
"grad_norm": 0.211699977517128, |
|
"learning_rate": 8.099802704061195e-05, |
|
"loss": 0.0384, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.8063787750493931, |
|
"grad_norm": 0.44848278164863586, |
|
"learning_rate": 8.058383851601027e-05, |
|
"loss": 0.0329, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8063787750493931, |
|
"eval_loss": 0.03639229014515877, |
|
"eval_runtime": 159.4908, |
|
"eval_samples_per_second": 7.405, |
|
"eval_steps_per_second": 7.405, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8214319315081382, |
|
"grad_norm": 0.5605763792991638, |
|
"learning_rate": 8.01662691363668e-05, |
|
"loss": 0.0289, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.836485087966883, |
|
"grad_norm": 0.3262452483177185, |
|
"learning_rate": 7.974536506140547e-05, |
|
"loss": 0.0367, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8515382444256279, |
|
"grad_norm": 0.2924893796443939, |
|
"learning_rate": 7.932117281948021e-05, |
|
"loss": 0.0328, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.866591400884373, |
|
"grad_norm": 0.277554452419281, |
|
"learning_rate": 7.889373930243164e-05, |
|
"loss": 0.0343, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.881644557343118, |
|
"grad_norm": 0.23780488967895508, |
|
"learning_rate": 7.846311176040331e-05, |
|
"loss": 0.0352, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.881644557343118, |
|
"eval_loss": 0.035072676837444305, |
|
"eval_runtime": 159.7659, |
|
"eval_samples_per_second": 7.392, |
|
"eval_steps_per_second": 7.392, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.8966977138018628, |
|
"grad_norm": 0.17802149057388306, |
|
"learning_rate": 7.802933779661859e-05, |
|
"loss": 0.0374, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.9117508702606076, |
|
"grad_norm": 0.19435422122478485, |
|
"learning_rate": 7.759246536211844e-05, |
|
"loss": 0.0307, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.9268040267193527, |
|
"grad_norm": 0.1490616649389267, |
|
"learning_rate": 7.715254275046062e-05, |
|
"loss": 0.0326, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9418571831780977, |
|
"grad_norm": 0.22022828459739685, |
|
"learning_rate": 7.670961859238124e-05, |
|
"loss": 0.0297, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9569103396368426, |
|
"grad_norm": 0.43419596552848816, |
|
"learning_rate": 7.626374185041886e-05, |
|
"loss": 0.029, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.9569103396368426, |
|
"eval_loss": 0.0350230410695076, |
|
"eval_runtime": 159.6805, |
|
"eval_samples_per_second": 7.396, |
|
"eval_steps_per_second": 7.396, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.9719634960955874, |
|
"grad_norm": 0.24266742169857025, |
|
"learning_rate": 7.581496181350203e-05, |
|
"loss": 0.0263, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.9870166525543325, |
|
"grad_norm": 0.17458631098270416, |
|
"learning_rate": 7.536332809150067e-05, |
|
"loss": 0.0281, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.0020698090130775, |
|
"grad_norm": 0.27361252903938293, |
|
"learning_rate": 7.490889060974201e-05, |
|
"loss": 0.0346, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.0171229654718226, |
|
"grad_norm": 0.27446067333221436, |
|
"learning_rate": 7.445169960349167e-05, |
|
"loss": 0.017, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.032176121930567, |
|
"grad_norm": 0.1635950803756714, |
|
"learning_rate": 7.399180561240044e-05, |
|
"loss": 0.0187, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.032176121930567, |
|
"eval_loss": 0.03626255691051483, |
|
"eval_runtime": 159.7835, |
|
"eval_samples_per_second": 7.391, |
|
"eval_steps_per_second": 7.391, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.0472292783893122, |
|
"grad_norm": 0.10786867886781693, |
|
"learning_rate": 7.352925947491746e-05, |
|
"loss": 0.0119, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.0622824348480573, |
|
"grad_norm": 0.13401247560977936, |
|
"learning_rate": 7.306411232267029e-05, |
|
"loss": 0.0175, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.0773355913068023, |
|
"grad_norm": 0.18004673719406128, |
|
"learning_rate": 7.259641557481269e-05, |
|
"loss": 0.0172, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.092388747765547, |
|
"grad_norm": 0.31352919340133667, |
|
"learning_rate": 7.212622093234049e-05, |
|
"loss": 0.0146, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.107441904224292, |
|
"grad_norm": 0.5032555460929871, |
|
"learning_rate": 7.165358037237643e-05, |
|
"loss": 0.025, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.107441904224292, |
|
"eval_loss": 0.038006313145160675, |
|
"eval_runtime": 159.9044, |
|
"eval_samples_per_second": 7.386, |
|
"eval_steps_per_second": 7.386, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.122495060683037, |
|
"grad_norm": 0.28312698006629944, |
|
"learning_rate": 7.117854614242434e-05, |
|
"loss": 0.0249, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.137548217141782, |
|
"grad_norm": 0.470378577709198, |
|
"learning_rate": 7.070117075459352e-05, |
|
"loss": 0.022, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.1526013736005267, |
|
"grad_norm": 0.1857473999261856, |
|
"learning_rate": 7.022150697979384e-05, |
|
"loss": 0.0244, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.167654530059272, |
|
"grad_norm": 0.3747231066226959, |
|
"learning_rate": 6.973960784190237e-05, |
|
"loss": 0.0283, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.182707686518017, |
|
"grad_norm": 0.2834364175796509, |
|
"learning_rate": 6.925552661190166e-05, |
|
"loss": 0.0209, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.182707686518017, |
|
"eval_loss": 0.03765711560845375, |
|
"eval_runtime": 160.0216, |
|
"eval_samples_per_second": 7.38, |
|
"eval_steps_per_second": 7.38, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.197760842976762, |
|
"grad_norm": 0.2789125144481659, |
|
"learning_rate": 6.876931680199121e-05, |
|
"loss": 0.0255, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.2128139994355065, |
|
"grad_norm": 0.16488228738307953, |
|
"learning_rate": 6.828103215967186e-05, |
|
"loss": 0.0202, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.2278671558942515, |
|
"grad_norm": 0.2753039300441742, |
|
"learning_rate": 6.779072666180446e-05, |
|
"loss": 0.0266, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.2429203123529966, |
|
"grad_norm": 0.08052641153335571, |
|
"learning_rate": 6.729845450864294e-05, |
|
"loss": 0.0166, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.257973468811741, |
|
"grad_norm": 0.18475492298603058, |
|
"learning_rate": 6.680427011784292e-05, |
|
"loss": 0.0224, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.257973468811741, |
|
"eval_loss": 0.0406508594751358, |
|
"eval_runtime": 160.1992, |
|
"eval_samples_per_second": 7.372, |
|
"eval_steps_per_second": 7.372, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.2730266252704863, |
|
"grad_norm": 0.4654618799686432, |
|
"learning_rate": 6.630822811844604e-05, |
|
"loss": 0.0187, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.2880797817292313, |
|
"grad_norm": 0.1831727921962738, |
|
"learning_rate": 6.58103833448412e-05, |
|
"loss": 0.0234, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.3031329381879764, |
|
"grad_norm": 0.3095850944519043, |
|
"learning_rate": 6.531079083070288e-05, |
|
"loss": 0.0241, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.3181860946467214, |
|
"grad_norm": 0.11199437081813812, |
|
"learning_rate": 6.480950580290752e-05, |
|
"loss": 0.0174, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.333239251105466, |
|
"grad_norm": 0.30752846598625183, |
|
"learning_rate": 6.430658367542843e-05, |
|
"loss": 0.0261, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.333239251105466, |
|
"eval_loss": 0.039291996508836746, |
|
"eval_runtime": 160.0709, |
|
"eval_samples_per_second": 7.378, |
|
"eval_steps_per_second": 7.378, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.348292407564211, |
|
"grad_norm": 0.1692783236503601, |
|
"learning_rate": 6.380208004321036e-05, |
|
"loss": 0.0239, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.363345564022956, |
|
"grad_norm": 0.24642743170261383, |
|
"learning_rate": 6.32960506760236e-05, |
|
"loss": 0.0182, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.378398720481701, |
|
"grad_norm": 0.2417881041765213, |
|
"learning_rate": 6.278855151229901e-05, |
|
"loss": 0.0169, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.393451876940446, |
|
"grad_norm": 0.1943085491657257, |
|
"learning_rate": 6.227963865294444e-05, |
|
"loss": 0.0126, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.408505033399191, |
|
"grad_norm": 0.29183781147003174, |
|
"learning_rate": 6.176936835514312e-05, |
|
"loss": 0.0156, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.408505033399191, |
|
"eval_loss": 0.038704484701156616, |
|
"eval_runtime": 160.2281, |
|
"eval_samples_per_second": 7.371, |
|
"eval_steps_per_second": 7.371, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.423558189857936, |
|
"grad_norm": 0.3246520459651947, |
|
"learning_rate": 6.125779702613471e-05, |
|
"loss": 0.0188, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.438611346316681, |
|
"grad_norm": 0.21692079305648804, |
|
"learning_rate": 6.074498121697983e-05, |
|
"loss": 0.0217, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.4536645027754256, |
|
"grad_norm": 0.29945269227027893, |
|
"learning_rate": 6.023097761630879e-05, |
|
"loss": 0.0261, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.4687176592341706, |
|
"grad_norm": 0.12020477652549744, |
|
"learning_rate": 5.971584304405489e-05, |
|
"loss": 0.016, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.4837708156929157, |
|
"grad_norm": 0.18562984466552734, |
|
"learning_rate": 5.919963444517338e-05, |
|
"loss": 0.0173, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.4837708156929157, |
|
"eval_loss": 0.038775425404310226, |
|
"eval_runtime": 160.3667, |
|
"eval_samples_per_second": 7.364, |
|
"eval_steps_per_second": 7.364, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.4988239721516607, |
|
"grad_norm": 0.45023879408836365, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0204, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.5138771286104054, |
|
"grad_norm": 0.40980616211891174, |
|
"learning_rate": 5.816422353467562e-05, |
|
"loss": 0.0163, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.5289302850691504, |
|
"grad_norm": 0.12173057347536087, |
|
"learning_rate": 5.7645135681360496e-05, |
|
"loss": 0.0207, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.5439834415278955, |
|
"grad_norm": 0.36546701192855835, |
|
"learning_rate": 5.7125202705367234e-05, |
|
"loss": 0.0216, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.5590365979866405, |
|
"grad_norm": 0.13067616522312164, |
|
"learning_rate": 5.660448208208513e-05, |
|
"loss": 0.0228, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.5590365979866405, |
|
"eval_loss": 0.03666573390364647, |
|
"eval_runtime": 160.2566, |
|
"eval_samples_per_second": 7.369, |
|
"eval_steps_per_second": 7.369, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.574089754445385, |
|
"grad_norm": 0.12506772577762604, |
|
"learning_rate": 5.608303137397294e-05, |
|
"loss": 0.0169, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.58914291090413, |
|
"grad_norm": 0.27075260877609253, |
|
"learning_rate": 5.5560908224195886e-05, |
|
"loss": 0.0218, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.6041960673628752, |
|
"grad_norm": 0.2712065279483795, |
|
"learning_rate": 5.503817035025342e-05, |
|
"loss": 0.0224, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.61924922382162, |
|
"grad_norm": 0.5131589770317078, |
|
"learning_rate": 5.4514875537598985e-05, |
|
"loss": 0.019, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.634302380280365, |
|
"grad_norm": 0.26789724826812744, |
|
"learning_rate": 5.399108163325217e-05, |
|
"loss": 0.0328, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.634302380280365, |
|
"eval_loss": 0.037083230912685394, |
|
"eval_runtime": 160.4016, |
|
"eval_samples_per_second": 7.363, |
|
"eval_steps_per_second": 7.363, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.64935553673911, |
|
"grad_norm": 0.20713073015213013, |
|
"learning_rate": 5.346684653940408e-05, |
|
"loss": 0.0188, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.664408693197855, |
|
"grad_norm": 0.16466517746448517, |
|
"learning_rate": 5.294222820701661e-05, |
|
"loss": 0.0203, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.6794618496566, |
|
"grad_norm": 0.3724817633628845, |
|
"learning_rate": 5.24172846294163e-05, |
|
"loss": 0.0162, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.6945150061153447, |
|
"grad_norm": 0.23705260455608368, |
|
"learning_rate": 5.1892073835883524e-05, |
|
"loss": 0.018, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.7095681625740897, |
|
"grad_norm": 0.8817890882492065, |
|
"learning_rate": 5.136665388523778e-05, |
|
"loss": 0.0264, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.7095681625740897, |
|
"eval_loss": 0.03928447514772415, |
|
"eval_runtime": 160.3293, |
|
"eval_samples_per_second": 7.366, |
|
"eval_steps_per_second": 7.366, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.7246213190328348, |
|
"grad_norm": 0.11891370266675949, |
|
"learning_rate": 5.0841082859419585e-05, |
|
"loss": 0.0268, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.7396744754915794, |
|
"grad_norm": 0.352401465177536, |
|
"learning_rate": 5.031541885706987e-05, |
|
"loss": 0.0255, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.7547276319503244, |
|
"grad_norm": 0.2503037750720978, |
|
"learning_rate": 4.9789719987107545e-05, |
|
"loss": 0.0214, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.7697807884090695, |
|
"grad_norm": 0.34862828254699707, |
|
"learning_rate": 4.926404436230596e-05, |
|
"loss": 0.0207, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.7848339448678145, |
|
"grad_norm": 0.2952202260494232, |
|
"learning_rate": 4.8738450092868785e-05, |
|
"loss": 0.0235, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.7848339448678145, |
|
"eval_loss": 0.036392927169799805, |
|
"eval_runtime": 160.407, |
|
"eval_samples_per_second": 7.363, |
|
"eval_steps_per_second": 7.363, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.7998871013265596, |
|
"grad_norm": 0.13503505289554596, |
|
"learning_rate": 4.8212995280006426e-05, |
|
"loss": 0.0189, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.814940257785304, |
|
"grad_norm": 0.2581332325935364, |
|
"learning_rate": 4.76877380095132e-05, |
|
"loss": 0.0201, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.8299934142440493, |
|
"grad_norm": 0.30006110668182373, |
|
"learning_rate": 4.7162736345346303e-05, |
|
"loss": 0.0154, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.8450465707027943, |
|
"grad_norm": 0.5561713576316833, |
|
"learning_rate": 4.663804832320726e-05, |
|
"loss": 0.0191, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.860099727161539, |
|
"grad_norm": 0.32717275619506836, |
|
"learning_rate": 4.6113731944126406e-05, |
|
"loss": 0.0204, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.860099727161539, |
|
"eval_loss": 0.038577429950237274, |
|
"eval_runtime": 160.6685, |
|
"eval_samples_per_second": 7.351, |
|
"eval_steps_per_second": 7.351, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.875152883620284, |
|
"grad_norm": 0.30127692222595215, |
|
"learning_rate": 4.558984516805118e-05, |
|
"loss": 0.0193, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.890206040079029, |
|
"grad_norm": 0.31373462080955505, |
|
"learning_rate": 4.5066445907439104e-05, |
|
"loss": 0.0143, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.905259196537774, |
|
"grad_norm": 0.47283080220222473, |
|
"learning_rate": 4.454359202085582e-05, |
|
"loss": 0.0242, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.920312352996519, |
|
"grad_norm": 0.47461745142936707, |
|
"learning_rate": 4.402134130657925e-05, |
|
"loss": 0.0187, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.9353655094552638, |
|
"grad_norm": 0.3949275016784668, |
|
"learning_rate": 4.349975149621039e-05, |
|
"loss": 0.0207, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.9353655094552638, |
|
"eval_loss": 0.03719981759786606, |
|
"eval_runtime": 160.6509, |
|
"eval_samples_per_second": 7.351, |
|
"eval_steps_per_second": 7.351, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.950418665914009, |
|
"grad_norm": 0.23788753151893616, |
|
"learning_rate": 4.297888024829126e-05, |
|
"loss": 0.0159, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.965471822372754, |
|
"grad_norm": 0.2292858064174652, |
|
"learning_rate": 4.2458785141931314e-05, |
|
"loss": 0.0198, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.9805249788314985, |
|
"grad_norm": 0.12435556948184967, |
|
"learning_rate": 4.1939523670442316e-05, |
|
"loss": 0.0161, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.9955781352902435, |
|
"grad_norm": 0.25317656993865967, |
|
"learning_rate": 4.14211532349828e-05, |
|
"loss": 0.0166, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.0106312917489886, |
|
"grad_norm": 0.03211146593093872, |
|
"learning_rate": 4.090373113821281e-05, |
|
"loss": 0.01, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0106312917489886, |
|
"eval_loss": 0.03901160508394241, |
|
"eval_runtime": 160.8018, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 7.344, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0256844482077336, |
|
"grad_norm": 0.23511217534542084, |
|
"learning_rate": 4.0387314577959315e-05, |
|
"loss": 0.0093, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.0407376046664787, |
|
"grad_norm": 0.06058523431420326, |
|
"learning_rate": 3.987196064089346e-05, |
|
"loss": 0.0063, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.0557907611252233, |
|
"grad_norm": 0.2121521681547165, |
|
"learning_rate": 3.935772629621995e-05, |
|
"loss": 0.0047, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.0708439175839684, |
|
"grad_norm": 0.11164995282888412, |
|
"learning_rate": 3.8844668389379396e-05, |
|
"loss": 0.0084, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.0858970740427134, |
|
"grad_norm": 0.3772049844264984, |
|
"learning_rate": 3.833284363576447e-05, |
|
"loss": 0.0103, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.0858970740427134, |
|
"eval_loss": 0.043147701770067215, |
|
"eval_runtime": 160.8171, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 7.344, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.1009502305014585, |
|
"grad_norm": 0.551115870475769, |
|
"learning_rate": 3.7822308614450406e-05, |
|
"loss": 0.0063, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.116003386960203, |
|
"grad_norm": 0.13338187336921692, |
|
"learning_rate": 3.7313119761940375e-05, |
|
"loss": 0.0136, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.131056543418948, |
|
"grad_norm": 0.2778257429599762, |
|
"learning_rate": 3.680533336592694e-05, |
|
"loss": 0.0074, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.146109699877693, |
|
"grad_norm": 0.10951809585094452, |
|
"learning_rate": 3.62990055590697e-05, |
|
"loss": 0.0101, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.1611628563364382, |
|
"grad_norm": 0.08265303820371628, |
|
"learning_rate": 3.579419231279023e-05, |
|
"loss": 0.0096, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.1611628563364382, |
|
"eval_loss": 0.041790470480918884, |
|
"eval_runtime": 160.7872, |
|
"eval_samples_per_second": 7.345, |
|
"eval_steps_per_second": 7.345, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.176216012795183, |
|
"grad_norm": 0.032768603414297104, |
|
"learning_rate": 3.529094943108475e-05, |
|
"loss": 0.0072, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.191269169253928, |
|
"grad_norm": 0.22910161316394806, |
|
"learning_rate": 3.478933254435534e-05, |
|
"loss": 0.0124, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.206322325712673, |
|
"grad_norm": 0.22300395369529724, |
|
"learning_rate": 3.4289397103260346e-05, |
|
"loss": 0.0116, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.221375482171418, |
|
"grad_norm": 0.23817072808742523, |
|
"learning_rate": 3.3791198372584664e-05, |
|
"loss": 0.0106, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.2364286386301626, |
|
"grad_norm": 0.12127862125635147, |
|
"learning_rate": 3.329479142513051e-05, |
|
"loss": 0.012, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.2364286386301626, |
|
"eval_loss": 0.043117210268974304, |
|
"eval_runtime": 160.8539, |
|
"eval_samples_per_second": 7.342, |
|
"eval_steps_per_second": 7.342, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.2514817950889077, |
|
"grad_norm": 0.28934839367866516, |
|
"learning_rate": 3.280023113562957e-05, |
|
"loss": 0.0115, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.2665349515476527, |
|
"grad_norm": 0.2645263671875, |
|
"learning_rate": 3.230757217467677e-05, |
|
"loss": 0.0163, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.2815881080063978, |
|
"grad_norm": 0.29075613617897034, |
|
"learning_rate": 3.1816869002686936e-05, |
|
"loss": 0.0105, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.2966412644651424, |
|
"grad_norm": 0.05602017417550087, |
|
"learning_rate": 3.1328175863874464e-05, |
|
"loss": 0.0067, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.3116944209238874, |
|
"grad_norm": 0.293961763381958, |
|
"learning_rate": 3.084154678025692e-05, |
|
"loss": 0.0098, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.3116944209238874, |
|
"eval_loss": 0.04433353245258331, |
|
"eval_runtime": 160.9207, |
|
"eval_samples_per_second": 7.339, |
|
"eval_steps_per_second": 7.339, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.3267475773826325, |
|
"grad_norm": 0.5569509863853455, |
|
"learning_rate": 3.035703554568331e-05, |
|
"loss": 0.0101, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.3418007338413775, |
|
"grad_norm": 0.23910270631313324, |
|
"learning_rate": 2.9874695719887464e-05, |
|
"loss": 0.0078, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.356853890300122, |
|
"grad_norm": 0.3301239013671875, |
|
"learning_rate": 2.9394580622567312e-05, |
|
"loss": 0.0049, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.371907046758867, |
|
"grad_norm": 0.25911498069763184, |
|
"learning_rate": 2.8916743327490803e-05, |
|
"loss": 0.0093, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.3869602032176123, |
|
"grad_norm": 0.22444899380207062, |
|
"learning_rate": 2.8441236656628828e-05, |
|
"loss": 0.0104, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.3869602032176123, |
|
"eval_loss": 0.04660218954086304, |
|
"eval_runtime": 161.0077, |
|
"eval_samples_per_second": 7.335, |
|
"eval_steps_per_second": 7.335, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.4020133596763573, |
|
"grad_norm": 0.18989907205104828, |
|
"learning_rate": 2.79681131743161e-05, |
|
"loss": 0.0038, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.417066516135102, |
|
"grad_norm": 0.3380717933177948, |
|
"learning_rate": 2.7497425181440607e-05, |
|
"loss": 0.0083, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.432119672593847, |
|
"grad_norm": 0.3728026747703552, |
|
"learning_rate": 2.702922470966187e-05, |
|
"loss": 0.0076, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.447172829052592, |
|
"grad_norm": 0.47184136509895325, |
|
"learning_rate": 2.6563563515659306e-05, |
|
"loss": 0.0094, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.462225985511337, |
|
"grad_norm": 0.5333015322685242, |
|
"learning_rate": 2.6100493075410848e-05, |
|
"loss": 0.0061, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.462225985511337, |
|
"eval_loss": 0.04809270426630974, |
|
"eval_runtime": 160.9513, |
|
"eval_samples_per_second": 7.338, |
|
"eval_steps_per_second": 7.338, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.4772791419700817, |
|
"grad_norm": 0.4247993230819702, |
|
"learning_rate": 2.5640064578502497e-05, |
|
"loss": 0.0135, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.4923322984288268, |
|
"grad_norm": 0.4623792767524719, |
|
"learning_rate": 2.5182328922469723e-05, |
|
"loss": 0.011, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.507385454887572, |
|
"grad_norm": 0.37935352325439453, |
|
"learning_rate": 2.4727336707170973e-05, |
|
"loss": 0.0118, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.522438611346317, |
|
"grad_norm": 0.13567650318145752, |
|
"learning_rate": 2.427513822919424e-05, |
|
"loss": 0.0097, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.5374917678050615, |
|
"grad_norm": 0.3444633483886719, |
|
"learning_rate": 2.3825783476297087e-05, |
|
"loss": 0.0051, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.5374917678050615, |
|
"eval_loss": 0.04625854268670082, |
|
"eval_runtime": 160.9561, |
|
"eval_samples_per_second": 7.337, |
|
"eval_steps_per_second": 7.337, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.5525449242638065, |
|
"grad_norm": 0.2048613429069519, |
|
"learning_rate": 2.337932212188073e-05, |
|
"loss": 0.0113, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.5675980807225516, |
|
"grad_norm": 0.30786871910095215, |
|
"learning_rate": 2.2935803519499e-05, |
|
"loss": 0.0086, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.582651237181296, |
|
"grad_norm": 0.3959846794605255, |
|
"learning_rate": 2.2495276697402662e-05, |
|
"loss": 0.0079, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.5977043936400412, |
|
"grad_norm": 0.08823379129171371, |
|
"learning_rate": 2.2057790353119535e-05, |
|
"loss": 0.0077, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.6127575500987863, |
|
"grad_norm": 0.09143965691328049, |
|
"learning_rate": 2.1623392848071354e-05, |
|
"loss": 0.0085, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.6127575500987863, |
|
"eval_loss": 0.045845236629247665, |
|
"eval_runtime": 161.0434, |
|
"eval_samples_per_second": 7.333, |
|
"eval_steps_per_second": 7.333, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.6278107065575314, |
|
"grad_norm": 0.031368013471364975, |
|
"learning_rate": 2.1192132202227677e-05, |
|
"loss": 0.0082, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.6428638630162764, |
|
"grad_norm": 0.35049495100975037, |
|
"learning_rate": 2.0764056088797645e-05, |
|
"loss": 0.0067, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.657917019475021, |
|
"grad_norm": 0.40344491600990295, |
|
"learning_rate": 2.0339211828959904e-05, |
|
"loss": 0.0057, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.672970175933766, |
|
"grad_norm": 0.22775594890117645, |
|
"learning_rate": 1.9917646386631577e-05, |
|
"loss": 0.0064, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.688023332392511, |
|
"grad_norm": 0.025965895503759384, |
|
"learning_rate": 1.949940636327671e-05, |
|
"loss": 0.0093, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.688023332392511, |
|
"eval_loss": 0.04670306667685509, |
|
"eval_runtime": 161.0288, |
|
"eval_samples_per_second": 7.334, |
|
"eval_steps_per_second": 7.334, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.7030764888512557, |
|
"grad_norm": 0.36296582221984863, |
|
"learning_rate": 1.9084537992754792e-05, |
|
"loss": 0.0064, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.718129645310001, |
|
"grad_norm": 0.05511193722486496, |
|
"learning_rate": 1.8673087136209803e-05, |
|
"loss": 0.0056, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.733182801768746, |
|
"grad_norm": 0.03270444646477699, |
|
"learning_rate": 1.8265099277000614e-05, |
|
"loss": 0.006, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.748235958227491, |
|
"grad_norm": 0.2531210780143738, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0081, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.763289114686236, |
|
"grad_norm": 1.2437455654144287, |
|
"learning_rate": 1.7459692564974316e-05, |
|
"loss": 0.0103, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.763289114686236, |
|
"eval_loss": 0.04770328849554062, |
|
"eval_runtime": 161.0027, |
|
"eval_samples_per_second": 7.335, |
|
"eval_steps_per_second": 7.335, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.7783422711449806, |
|
"grad_norm": 0.21093598008155823, |
|
"learning_rate": 1.7062362744910322e-05, |
|
"loss": 0.0053, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.7933954276037256, |
|
"grad_norm": 0.33043161034584045, |
|
"learning_rate": 1.6668673977846254e-05, |
|
"loss": 0.0071, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.8084485840624707, |
|
"grad_norm": 0.1768001765012741, |
|
"learning_rate": 1.6278669783651395e-05, |
|
"loss": 0.0053, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.8235017405212153, |
|
"grad_norm": 0.09979119896888733, |
|
"learning_rate": 1.589239327488812e-05, |
|
"loss": 0.0065, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.8385548969799603, |
|
"grad_norm": 0.06995889544487, |
|
"learning_rate": 1.5509887152046137e-05, |
|
"loss": 0.0066, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.8385548969799603, |
|
"eval_loss": 0.048698194324970245, |
|
"eval_runtime": 161.1062, |
|
"eval_samples_per_second": 7.331, |
|
"eval_steps_per_second": 7.331, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.8536080534387054, |
|
"grad_norm": 0.30948641896247864, |
|
"learning_rate": 1.5131193698822232e-05, |
|
"loss": 0.0085, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.8686612098974504, |
|
"grad_norm": 0.21722477674484253, |
|
"learning_rate": 1.4756354777446001e-05, |
|
"loss": 0.007, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.8837143663561955, |
|
"grad_norm": 0.4803875684738159, |
|
"learning_rate": 1.4385411824052342e-05, |
|
"loss": 0.0069, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.89876752281494, |
|
"grad_norm": 0.31911638379096985, |
|
"learning_rate": 1.4018405844100812e-05, |
|
"loss": 0.0084, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.913820679273685, |
|
"grad_norm": 0.5358151197433472, |
|
"learning_rate": 1.3655377407842812e-05, |
|
"loss": 0.0101, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.913820679273685, |
|
"eval_loss": 0.04765944555401802, |
|
"eval_runtime": 161.2114, |
|
"eval_samples_per_second": 7.326, |
|
"eval_steps_per_second": 7.326, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.92887383573243, |
|
"grad_norm": 0.04455338418483734, |
|
"learning_rate": 1.3296366645836822e-05, |
|
"loss": 0.0067, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.943926992191175, |
|
"grad_norm": 0.34614667296409607, |
|
"learning_rate": 1.2941413244512113e-05, |
|
"loss": 0.0088, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.95898014864992, |
|
"grad_norm": 0.2596482038497925, |
|
"learning_rate": 1.2590556441781725e-05, |
|
"loss": 0.0078, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.974033305108665, |
|
"grad_norm": 0.4142569601535797, |
|
"learning_rate": 1.2243835022705003e-05, |
|
"loss": 0.0102, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.98908646156741, |
|
"grad_norm": 0.302739679813385, |
|
"learning_rate": 1.1901287315199977e-05, |
|
"loss": 0.0104, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.98908646156741, |
|
"eval_loss": 0.04554565250873566, |
|
"eval_runtime": 161.125, |
|
"eval_samples_per_second": 7.33, |
|
"eval_steps_per_second": 7.33, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.004139618026155, |
|
"grad_norm": 0.37217289209365845, |
|
"learning_rate": 1.1562951185806676e-05, |
|
"loss": 0.0066, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.0191927744849, |
|
"grad_norm": 0.054262690246105194, |
|
"learning_rate": 1.1228864035501069e-05, |
|
"loss": 0.0038, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.034245930943645, |
|
"grad_norm": 0.3357163965702057, |
|
"learning_rate": 1.0899062795560573e-05, |
|
"loss": 0.0033, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.04929908740239, |
|
"grad_norm": 0.25357311964035034, |
|
"learning_rate": 1.0573583923481711e-05, |
|
"loss": 0.0047, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.064352243861134, |
|
"grad_norm": 0.34490883350372314, |
|
"learning_rate": 1.0252463398949792e-05, |
|
"loss": 0.0073, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.064352243861134, |
|
"eval_loss": 0.046002693474292755, |
|
"eval_runtime": 161.1762, |
|
"eval_samples_per_second": 7.327, |
|
"eval_steps_per_second": 7.327, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.07940540031988, |
|
"grad_norm": 0.12939685583114624, |
|
"learning_rate": 9.935736719861622e-06, |
|
"loss": 0.0033, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.0944585567786245, |
|
"grad_norm": 0.0437207855284214, |
|
"learning_rate": 9.62343889840151e-06, |
|
"loss": 0.0043, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.109511713237369, |
|
"grad_norm": 0.10753034800291061, |
|
"learning_rate": 9.315604457170768e-06, |
|
"loss": 0.0028, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.124564869696115, |
|
"grad_norm": 0.04752466827630997, |
|
"learning_rate": 9.012267425371513e-06, |
|
"loss": 0.0024, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.139618026154859, |
|
"grad_norm": 0.11159101128578186, |
|
"learning_rate": 8.71346133504498e-06, |
|
"loss": 0.003, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.139618026154859, |
|
"eval_loss": 0.04775209724903107, |
|
"eval_runtime": 161.2317, |
|
"eval_samples_per_second": 7.325, |
|
"eval_steps_per_second": 7.325, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.154671182613605, |
|
"grad_norm": 0.06603502482175827, |
|
"learning_rate": 8.419219217364654e-06, |
|
"loss": 0.0023, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.169724339072349, |
|
"grad_norm": 0.08250313252210617, |
|
"learning_rate": 8.129573598984997e-06, |
|
"loss": 0.0031, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.184777495531094, |
|
"grad_norm": 0.33163315057754517, |
|
"learning_rate": 7.844556498445788e-06, |
|
"loss": 0.0023, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.199830651989839, |
|
"grad_norm": 0.32914066314697266, |
|
"learning_rate": 7.564199422632579e-06, |
|
"loss": 0.0053, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.214883808448584, |
|
"grad_norm": 0.027555787935853004, |
|
"learning_rate": 7.288533363293959e-06, |
|
"loss": 0.0025, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.214883808448584, |
|
"eval_loss": 0.04977192357182503, |
|
"eval_runtime": 161.302, |
|
"eval_samples_per_second": 7.322, |
|
"eval_steps_per_second": 7.322, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.229936964907329, |
|
"grad_norm": 0.4597860872745514, |
|
"learning_rate": 7.017588793615498e-06, |
|
"loss": 0.0045, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.244990121366074, |
|
"grad_norm": 0.0674595907330513, |
|
"learning_rate": 6.751395664851135e-06, |
|
"loss": 0.0037, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.260043277824819, |
|
"grad_norm": 0.09473542869091034, |
|
"learning_rate": 6.489983403012312e-06, |
|
"loss": 0.0021, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.275096434283564, |
|
"grad_norm": 0.4772437810897827, |
|
"learning_rate": 6.233380905615049e-06, |
|
"loss": 0.003, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.290149590742309, |
|
"grad_norm": 0.14856751263141632, |
|
"learning_rate": 5.981616538485496e-06, |
|
"loss": 0.0051, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.290149590742309, |
|
"eval_loss": 0.0505627803504467, |
|
"eval_runtime": 161.4919, |
|
"eval_samples_per_second": 7.313, |
|
"eval_steps_per_second": 7.313, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.3052027472010534, |
|
"grad_norm": 0.07821979373693466, |
|
"learning_rate": 5.73471813262435e-06, |
|
"loss": 0.0021, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.320255903659799, |
|
"grad_norm": 0.07666800916194916, |
|
"learning_rate": 5.4927129811301715e-06, |
|
"loss": 0.0019, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.335309060118544, |
|
"grad_norm": 0.06206662952899933, |
|
"learning_rate": 5.255627836182453e-06, |
|
"loss": 0.0021, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.350362216577288, |
|
"grad_norm": 0.17196908593177795, |
|
"learning_rate": 5.0234889060842176e-06, |
|
"loss": 0.0042, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 4.365415373036034, |
|
"grad_norm": 0.05123843625187874, |
|
"learning_rate": 4.796321852364877e-06, |
|
"loss": 0.0078, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.365415373036034, |
|
"eval_loss": 0.051590368151664734, |
|
"eval_runtime": 161.4863, |
|
"eval_samples_per_second": 7.313, |
|
"eval_steps_per_second": 7.313, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.380468529494778, |
|
"grad_norm": 0.09933144599199295, |
|
"learning_rate": 4.5741517869435706e-06, |
|
"loss": 0.0016, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 4.395521685953524, |
|
"grad_norm": 0.1274539679288864, |
|
"learning_rate": 4.357003269353105e-06, |
|
"loss": 0.003, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.410574842412268, |
|
"grad_norm": 0.0829009860754013, |
|
"learning_rate": 4.144900304025101e-06, |
|
"loss": 0.0039, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 4.425627998871013, |
|
"grad_norm": 0.0254693403840065, |
|
"learning_rate": 3.937866337636459e-06, |
|
"loss": 0.0042, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.4406811553297585, |
|
"grad_norm": 0.22674445807933807, |
|
"learning_rate": 3.7359242565174423e-06, |
|
"loss": 0.0019, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.4406811553297585, |
|
"eval_loss": 0.05175633728504181, |
|
"eval_runtime": 161.6011, |
|
"eval_samples_per_second": 7.308, |
|
"eval_steps_per_second": 7.308, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.455734311788503, |
|
"grad_norm": 0.04479767754673958, |
|
"learning_rate": 3.539096384121743e-06, |
|
"loss": 0.0052, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.470787468247248, |
|
"grad_norm": 0.050800859928131104, |
|
"learning_rate": 3.34740447855878e-06, |
|
"loss": 0.0021, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 4.485840624705993, |
|
"grad_norm": 0.3891144096851349, |
|
"learning_rate": 3.160869730188465e-06, |
|
"loss": 0.0038, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.500893781164738, |
|
"grad_norm": 0.05956409499049187, |
|
"learning_rate": 2.9795127592787186e-06, |
|
"loss": 0.0038, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 4.515946937623482, |
|
"grad_norm": 0.3809262812137604, |
|
"learning_rate": 2.803353613726056e-06, |
|
"loss": 0.0029, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.515946937623482, |
|
"eval_loss": 0.05206470191478729, |
|
"eval_runtime": 161.5497, |
|
"eval_samples_per_second": 7.31, |
|
"eval_steps_per_second": 7.31, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.531000094082228, |
|
"grad_norm": 0.08371811360120773, |
|
"learning_rate": 2.6324117668393877e-06, |
|
"loss": 0.0039, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 4.5460532505409725, |
|
"grad_norm": 0.2737991511821747, |
|
"learning_rate": 2.466706115187406e-06, |
|
"loss": 0.002, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.561106406999718, |
|
"grad_norm": 0.08303394168615341, |
|
"learning_rate": 2.3062549765096364e-06, |
|
"loss": 0.002, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 4.576159563458463, |
|
"grad_norm": 0.29287809133529663, |
|
"learning_rate": 2.1510760876915505e-06, |
|
"loss": 0.0065, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 4.591212719917207, |
|
"grad_norm": 0.22874212265014648, |
|
"learning_rate": 2.0011866028038617e-06, |
|
"loss": 0.0048, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.591212719917207, |
|
"eval_loss": 0.052139151841402054, |
|
"eval_runtime": 161.5292, |
|
"eval_samples_per_second": 7.311, |
|
"eval_steps_per_second": 7.311, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 4.606265876375953, |
|
"grad_norm": 0.13931158185005188, |
|
"learning_rate": 1.8566030912062549e-06, |
|
"loss": 0.0022, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 4.621319032834697, |
|
"grad_norm": 0.13953697681427002, |
|
"learning_rate": 1.717341535715733e-06, |
|
"loss": 0.0026, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 4.636372189293443, |
|
"grad_norm": 0.10214455425739288, |
|
"learning_rate": 1.5834173308397982e-06, |
|
"loss": 0.0029, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 4.6514253457521875, |
|
"grad_norm": 0.14687424898147583, |
|
"learning_rate": 1.4548452810747403e-06, |
|
"loss": 0.0039, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 4.666478502210932, |
|
"grad_norm": 0.17296817898750305, |
|
"learning_rate": 1.33163959926903e-06, |
|
"loss": 0.0033, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.666478502210932, |
|
"eval_loss": 0.05249761790037155, |
|
"eval_runtime": 161.6303, |
|
"eval_samples_per_second": 7.307, |
|
"eval_steps_per_second": 7.307, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 4.681531658669678, |
|
"grad_norm": 0.11973539739847183, |
|
"learning_rate": 1.2138139050522023e-06, |
|
"loss": 0.0041, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 4.696584815128422, |
|
"grad_norm": 0.16553695499897003, |
|
"learning_rate": 1.101381223329301e-06, |
|
"loss": 0.0028, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 4.711637971587168, |
|
"grad_norm": 0.036532752215862274, |
|
"learning_rate": 9.943539828410342e-07, |
|
"loss": 0.0027, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 4.726691128045912, |
|
"grad_norm": 0.08354666829109192, |
|
"learning_rate": 8.927440147898702e-07, |
|
"loss": 0.0015, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 4.741744284504657, |
|
"grad_norm": 0.14718425273895264, |
|
"learning_rate": 7.96562551532154e-07, |
|
"loss": 0.002, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.741744284504657, |
|
"eval_loss": 0.05251290276646614, |
|
"eval_runtime": 161.6656, |
|
"eval_samples_per_second": 7.305, |
|
"eval_steps_per_second": 7.305, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 4.756797440963402, |
|
"grad_norm": 0.019017163664102554, |
|
"learning_rate": 7.05820225336451e-07, |
|
"loss": 0.0022, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 4.771850597422147, |
|
"grad_norm": 0.06493131816387177, |
|
"learning_rate": 6.20527067208232e-07, |
|
"loss": 0.002, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 4.786903753880892, |
|
"grad_norm": 0.13360609114170074, |
|
"learning_rate": 5.406925057809653e-07, |
|
"loss": 0.0026, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 4.801956910339637, |
|
"grad_norm": 0.26471206545829773, |
|
"learning_rate": 4.6632536627386756e-07, |
|
"loss": 0.002, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 4.817010066798382, |
|
"grad_norm": 0.1764497309923172, |
|
"learning_rate": 3.974338695163393e-07, |
|
"loss": 0.0017, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.817010066798382, |
|
"eval_loss": 0.05277745798230171, |
|
"eval_runtime": 161.6536, |
|
"eval_samples_per_second": 7.306, |
|
"eval_steps_per_second": 7.306, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 4.832063223257126, |
|
"grad_norm": 0.055088795721530914, |
|
"learning_rate": 3.3402563103916984e-07, |
|
"loss": 0.001, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 4.847116379715872, |
|
"grad_norm": 0.059630122035741806, |
|
"learning_rate": 2.7610766023271615e-07, |
|
"loss": 0.0013, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 4.8621695361746164, |
|
"grad_norm": 0.031690843403339386, |
|
"learning_rate": 2.2368635957205618e-07, |
|
"loss": 0.0029, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 4.877222692633362, |
|
"grad_norm": 0.09654844552278519, |
|
"learning_rate": 1.7676752390920482e-07, |
|
"loss": 0.0038, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 4.8922758490921066, |
|
"grad_norm": 0.6978961825370789, |
|
"learning_rate": 1.3535633983257078e-07, |
|
"loss": 0.003, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.8922758490921066, |
|
"eval_loss": 0.05285593494772911, |
|
"eval_runtime": 161.6775, |
|
"eval_samples_per_second": 7.305, |
|
"eval_steps_per_second": 7.305, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 4.907329005550851, |
|
"grad_norm": 0.05225907266139984, |
|
"learning_rate": 9.945738509358205e-08, |
|
"loss": 0.0028, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 4.922382162009597, |
|
"grad_norm": 0.04852062091231346, |
|
"learning_rate": 6.907462810065158e-08, |
|
"loss": 0.0015, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 4.937435318468341, |
|
"grad_norm": 0.24554955959320068, |
|
"learning_rate": 4.4211427480500554e-08, |
|
"loss": 0.0056, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 4.952488474927087, |
|
"grad_norm": 0.01633327081799507, |
|
"learning_rate": 2.4870531706872034e-08, |
|
"loss": 0.0038, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 4.967541631385831, |
|
"grad_norm": 0.20463958382606506, |
|
"learning_rate": 1.105407879670728e-08, |
|
"loss": 0.0021, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.967541631385831, |
|
"eval_loss": 0.05255585163831711, |
|
"eval_runtime": 161.6589, |
|
"eval_samples_per_second": 7.306, |
|
"eval_steps_per_second": 7.306, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 4.982594787844576, |
|
"grad_norm": 0.07524765282869339, |
|
"learning_rate": 2.763596073807051e-09, |
|
"loss": 0.0022, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 4.9976479443033215, |
|
"grad_norm": 0.0663004145026207, |
|
"learning_rate": 0.0, |
|
"loss": 0.002, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 4.9976479443033215, |
|
"step": 3320, |
|
"total_flos": 8.339771893214085e+17, |
|
"train_loss": 0.04091864959056298, |
|
"train_runtime": 36302.3812, |
|
"train_samples_per_second": 1.464, |
|
"train_steps_per_second": 0.091 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.339771893214085e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|