|
{ |
|
"best_metric": 3648.62841796875, |
|
"best_model_checkpoint": "./ckpts/tiny_llama_v1.1/int1-g128/checkpoint-400", |
|
"epoch": 4.0, |
|
"eval_steps": 4, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0, |
|
"loss": 11380.3086, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 2e-05, |
|
"loss": 12868.3359, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 2e-05, |
|
"loss": 12883.2969, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 2e-05, |
|
"loss": 11909.3701, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 10881.7998046875, |
|
"eval_runtime": 94.8632, |
|
"eval_samples_per_second": 16.824, |
|
"eval_steps_per_second": 1.054, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2e-05, |
|
"loss": 10834.709, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2e-05, |
|
"loss": 11078.376, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 2e-05, |
|
"loss": 8636.8652, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 2e-05, |
|
"loss": 9161.7168, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 7404.39404296875, |
|
"eval_runtime": 95.0792, |
|
"eval_samples_per_second": 16.786, |
|
"eval_steps_per_second": 1.052, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 2e-05, |
|
"loss": 8061.96, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 2e-05, |
|
"loss": 7025.2109, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 2e-05, |
|
"loss": 10119.207, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 2e-05, |
|
"loss": 8901.1523, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 7435.04150390625, |
|
"eval_runtime": 95.107, |
|
"eval_samples_per_second": 16.781, |
|
"eval_steps_per_second": 1.051, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 2e-05, |
|
"loss": 6522.3311, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 2e-05, |
|
"loss": 7874.7578, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 2e-05, |
|
"loss": 7117.9897, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 2e-05, |
|
"loss": 7384.1929, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 7038.021484375, |
|
"eval_runtime": 95.1143, |
|
"eval_samples_per_second": 16.78, |
|
"eval_steps_per_second": 1.051, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 2e-05, |
|
"loss": 6837.7974, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 2e-05, |
|
"loss": 6401.292, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 2e-05, |
|
"loss": 7396.1997, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 2e-05, |
|
"loss": 5846.4893, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 6782.6787109375, |
|
"eval_runtime": 95.1226, |
|
"eval_samples_per_second": 16.778, |
|
"eval_steps_per_second": 1.051, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 2e-05, |
|
"loss": 7062.3677, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 2e-05, |
|
"loss": 6488.6855, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 2e-05, |
|
"loss": 7019.1787, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 2e-05, |
|
"loss": 6871.1865, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 6575.58154296875, |
|
"eval_runtime": 95.0095, |
|
"eval_samples_per_second": 16.798, |
|
"eval_steps_per_second": 1.053, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 2e-05, |
|
"loss": 6340.9995, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 2e-05, |
|
"loss": 6536.9585, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 2e-05, |
|
"loss": 6144.7646, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 2e-05, |
|
"loss": 6152.7886, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 6434.06640625, |
|
"eval_runtime": 95.1456, |
|
"eval_samples_per_second": 16.774, |
|
"eval_steps_per_second": 1.051, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 2e-05, |
|
"loss": 5927.832, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 2e-05, |
|
"loss": 6375.9375, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 2e-05, |
|
"loss": 6714.1953, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 2e-05, |
|
"loss": 5591.8081, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 6206.52197265625, |
|
"eval_runtime": 95.0989, |
|
"eval_samples_per_second": 16.783, |
|
"eval_steps_per_second": 1.052, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 2e-05, |
|
"loss": 6417.4902, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 2e-05, |
|
"loss": 5807.3662, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 2e-05, |
|
"loss": 6891.5049, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 2e-05, |
|
"loss": 6353.3838, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 6035.97705078125, |
|
"eval_runtime": 95.1008, |
|
"eval_samples_per_second": 16.782, |
|
"eval_steps_per_second": 1.052, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 2e-05, |
|
"loss": 5999.5938, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 2e-05, |
|
"loss": 5384.0732, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 2e-05, |
|
"loss": 6043.8564, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 2e-05, |
|
"loss": 5397.1992, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 5899.00927734375, |
|
"eval_runtime": 95.1465, |
|
"eval_samples_per_second": 16.774, |
|
"eval_steps_per_second": 1.051, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 2e-05, |
|
"loss": 5770.0815, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 2e-05, |
|
"loss": 5922.8789, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 2e-05, |
|
"loss": 5941.6562, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 2e-05, |
|
"loss": 6474.5508, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 5830.09716796875, |
|
"eval_runtime": 94.9508, |
|
"eval_samples_per_second": 16.809, |
|
"eval_steps_per_second": 1.053, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 2e-05, |
|
"loss": 6641.1704, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 2e-05, |
|
"loss": 5292.1631, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 2e-05, |
|
"loss": 5402.1309, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 2e-05, |
|
"loss": 5108.5684, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 5756.92724609375, |
|
"eval_runtime": 95.3749, |
|
"eval_samples_per_second": 16.734, |
|
"eval_steps_per_second": 1.048, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 2e-05, |
|
"loss": 5287.3154, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 2e-05, |
|
"loss": 5745.0361, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 2e-05, |
|
"loss": 5594.8262, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 2e-05, |
|
"loss": 6582.165, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 5648.44091796875, |
|
"eval_runtime": 95.5253, |
|
"eval_samples_per_second": 16.708, |
|
"eval_steps_per_second": 1.047, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 2e-05, |
|
"loss": 6087.4844, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 2e-05, |
|
"loss": 6035.2637, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 2e-05, |
|
"loss": 6145.3789, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 2e-05, |
|
"loss": 5178.7305, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 5449.46240234375, |
|
"eval_runtime": 95.1764, |
|
"eval_samples_per_second": 16.769, |
|
"eval_steps_per_second": 1.051, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 2e-05, |
|
"loss": 5329.77, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"learning_rate": 2e-05, |
|
"loss": 6133.9678, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 2e-05, |
|
"loss": 5170.7642, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 2e-05, |
|
"loss": 4754.7891, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 5294.06494140625, |
|
"eval_runtime": 95.2108, |
|
"eval_samples_per_second": 16.763, |
|
"eval_steps_per_second": 1.05, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 2e-05, |
|
"loss": 5026.2295, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 2e-05, |
|
"loss": 5056.1416, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"learning_rate": 2e-05, |
|
"loss": 4999.1313, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 2e-05, |
|
"loss": 5597.2129, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 5080.79638671875, |
|
"eval_runtime": 95.0431, |
|
"eval_samples_per_second": 16.792, |
|
"eval_steps_per_second": 1.052, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 2e-05, |
|
"loss": 5225.0186, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 2e-05, |
|
"loss": 4696.772, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"learning_rate": 2e-05, |
|
"loss": 4756.3345, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 2e-05, |
|
"loss": 5449.8564, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 4948.94287109375, |
|
"eval_runtime": 95.1267, |
|
"eval_samples_per_second": 16.778, |
|
"eval_steps_per_second": 1.051, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 2e-05, |
|
"loss": 5115.0347, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 2e-05, |
|
"loss": 4621.6206, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 2e-05, |
|
"loss": 4672.1475, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"learning_rate": 2e-05, |
|
"loss": 5242.2466, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 4875.96826171875, |
|
"eval_runtime": 95.1339, |
|
"eval_samples_per_second": 16.776, |
|
"eval_steps_per_second": 1.051, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 2e-05, |
|
"loss": 4792.4268, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 2e-05, |
|
"loss": 4925.5439, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 2e-05, |
|
"loss": 4519.2134, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 2e-05, |
|
"loss": 4790.4316, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 4870.0810546875, |
|
"eval_runtime": 95.1754, |
|
"eval_samples_per_second": 16.769, |
|
"eval_steps_per_second": 1.051, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 2e-05, |
|
"loss": 5035.2295, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 2e-05, |
|
"loss": 4832.3154, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 2e-05, |
|
"loss": 5150.7158, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 2e-05, |
|
"loss": 4470.4863, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 4762.04150390625, |
|
"eval_runtime": 95.1952, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 1.05, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 2e-05, |
|
"loss": 4649.1685, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 2e-05, |
|
"loss": 4672.1406, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 2e-05, |
|
"loss": 5002.249, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 2e-05, |
|
"loss": 4852.25, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 4709.67431640625, |
|
"eval_runtime": 95.0691, |
|
"eval_samples_per_second": 16.788, |
|
"eval_steps_per_second": 1.052, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 2e-05, |
|
"loss": 4892.4893, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 2e-05, |
|
"loss": 4282.9248, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 2e-05, |
|
"loss": 4886.5312, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 2e-05, |
|
"loss": 4792.9912, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 4597.8984375, |
|
"eval_runtime": 95.2285, |
|
"eval_samples_per_second": 16.76, |
|
"eval_steps_per_second": 1.05, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 2e-05, |
|
"loss": 3879.5151, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 2e-05, |
|
"loss": 4130.7891, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 2e-05, |
|
"loss": 4897.5684, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 2e-05, |
|
"loss": 5063.8096, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 4595.23046875, |
|
"eval_runtime": 95.2271, |
|
"eval_samples_per_second": 16.76, |
|
"eval_steps_per_second": 1.05, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 2e-05, |
|
"loss": 4932.4224, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 2e-05, |
|
"loss": 4803.2412, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"learning_rate": 2e-05, |
|
"loss": 4688.5557, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 2e-05, |
|
"loss": 4638.0762, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 4560.80810546875, |
|
"eval_runtime": 95.1641, |
|
"eval_samples_per_second": 16.771, |
|
"eval_steps_per_second": 1.051, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 2e-05, |
|
"loss": 4238.0225, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 2e-05, |
|
"loss": 4853.1382, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 2e-05, |
|
"loss": 5206.4658, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 2e-05, |
|
"loss": 3976.3425, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 4543.9541015625, |
|
"eval_runtime": 95.1889, |
|
"eval_samples_per_second": 16.767, |
|
"eval_steps_per_second": 1.051, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 2e-05, |
|
"loss": 4047.6965, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 2e-05, |
|
"loss": 4423.4912, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 2e-05, |
|
"loss": 4422.5103, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 2e-05, |
|
"loss": 4676.5659, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 4510.7314453125, |
|
"eval_runtime": 95.0706, |
|
"eval_samples_per_second": 16.788, |
|
"eval_steps_per_second": 1.052, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"learning_rate": 2e-05, |
|
"loss": 4369.8477, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 2e-05, |
|
"loss": 4672.6108, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 2e-05, |
|
"loss": 4738.3755, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 2e-05, |
|
"loss": 4361.5522, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"eval_loss": 4447.572265625, |
|
"eval_runtime": 95.1237, |
|
"eval_samples_per_second": 16.778, |
|
"eval_steps_per_second": 1.051, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 2e-05, |
|
"loss": 4602.5703, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 2e-05, |
|
"loss": 4380.708, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 2e-05, |
|
"loss": 4037.3726, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 2e-05, |
|
"loss": 4673.7178, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 4398.06640625, |
|
"eval_runtime": 95.1992, |
|
"eval_samples_per_second": 16.765, |
|
"eval_steps_per_second": 1.05, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 2e-05, |
|
"loss": 4349.0542, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"learning_rate": 2e-05, |
|
"loss": 4754.3525, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 2e-05, |
|
"loss": 3983.9561, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"learning_rate": 2e-05, |
|
"loss": 3387.5273, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 4363.39501953125, |
|
"eval_runtime": 95.226, |
|
"eval_samples_per_second": 16.76, |
|
"eval_steps_per_second": 1.05, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 2e-05, |
|
"loss": 4539.5605, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 2e-05, |
|
"loss": 4573.0249, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"learning_rate": 2e-05, |
|
"loss": 4173.1982, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 2e-05, |
|
"loss": 4301.8296, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 4314.70166015625, |
|
"eval_runtime": 95.1567, |
|
"eval_samples_per_second": 16.772, |
|
"eval_steps_per_second": 1.051, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"learning_rate": 2e-05, |
|
"loss": 4462.0898, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 2e-05, |
|
"loss": 4151.0615, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"learning_rate": 2e-05, |
|
"loss": 4575.5889, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 2e-05, |
|
"loss": 4630.2876, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 4325.21875, |
|
"eval_runtime": 95.0575, |
|
"eval_samples_per_second": 16.79, |
|
"eval_steps_per_second": 1.052, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 2e-05, |
|
"loss": 4619.9248, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 2e-05, |
|
"loss": 3764.6904, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"learning_rate": 2e-05, |
|
"loss": 4107.4839, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"learning_rate": 2e-05, |
|
"loss": 4439.1548, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 4270.97998046875, |
|
"eval_runtime": 95.2025, |
|
"eval_samples_per_second": 16.764, |
|
"eval_steps_per_second": 1.05, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 2e-05, |
|
"loss": 4332.3652, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 2e-05, |
|
"loss": 4009.8579, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 2e-05, |
|
"loss": 4263.3091, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"learning_rate": 2e-05, |
|
"loss": 4529.6118, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 4247.3193359375, |
|
"eval_runtime": 95.1903, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 1.051, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"learning_rate": 2e-05, |
|
"loss": 4782.1045, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 2e-05, |
|
"loss": 4572.3145, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 2e-05, |
|
"loss": 3709.7561, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"learning_rate": 2e-05, |
|
"loss": 5060.5757, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 4267.9052734375, |
|
"eval_runtime": 95.2344, |
|
"eval_samples_per_second": 16.759, |
|
"eval_steps_per_second": 1.05, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"learning_rate": 2e-05, |
|
"loss": 4048.5073, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"learning_rate": 2e-05, |
|
"loss": 4337.002, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 2e-05, |
|
"loss": 4471.6353, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 2e-05, |
|
"loss": 4257.4619, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 4241.09912109375, |
|
"eval_runtime": 95.1639, |
|
"eval_samples_per_second": 16.771, |
|
"eval_steps_per_second": 1.051, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"learning_rate": 2e-05, |
|
"loss": 4113.8926, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"learning_rate": 2e-05, |
|
"loss": 4255.1045, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 2e-05, |
|
"loss": 4150.0469, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"learning_rate": 2e-05, |
|
"loss": 4069.4043, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 4201.818359375, |
|
"eval_runtime": 95.2146, |
|
"eval_samples_per_second": 16.762, |
|
"eval_steps_per_second": 1.05, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 2e-05, |
|
"loss": 4251.7646, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"learning_rate": 2e-05, |
|
"loss": 4371.7056, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 2e-05, |
|
"loss": 4043.783, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 2e-05, |
|
"loss": 3856.8018, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 4195.00439453125, |
|
"eval_runtime": 95.3553, |
|
"eval_samples_per_second": 16.737, |
|
"eval_steps_per_second": 1.049, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"learning_rate": 2e-05, |
|
"loss": 4359.2188, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"learning_rate": 2e-05, |
|
"loss": 4015.7729, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 2e-05, |
|
"loss": 4574.147, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"learning_rate": 2e-05, |
|
"loss": 4308.5566, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 4192.68798828125, |
|
"eval_runtime": 95.3592, |
|
"eval_samples_per_second": 16.737, |
|
"eval_steps_per_second": 1.049, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 2e-05, |
|
"loss": 3850.4014, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"learning_rate": 2e-05, |
|
"loss": 4326.3857, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"learning_rate": 2e-05, |
|
"loss": 4223.0674, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 2e-05, |
|
"loss": 4567.7056, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 4181.10009765625, |
|
"eval_runtime": 95.2118, |
|
"eval_samples_per_second": 16.763, |
|
"eval_steps_per_second": 1.05, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"learning_rate": 2e-05, |
|
"loss": 4342.7461, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"learning_rate": 2e-05, |
|
"loss": 3950.2617, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 2e-05, |
|
"loss": 3843.9983, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 2e-05, |
|
"loss": 4143.4326, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 4187.84912109375, |
|
"eval_runtime": 95.4267, |
|
"eval_samples_per_second": 16.725, |
|
"eval_steps_per_second": 1.048, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 2e-05, |
|
"loss": 5007.4238, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"learning_rate": 2e-05, |
|
"loss": 4008.7954, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"learning_rate": 2e-05, |
|
"loss": 3739.9316, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"learning_rate": 2e-05, |
|
"loss": 3865.8225, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 4126.61669921875, |
|
"eval_runtime": 95.1828, |
|
"eval_samples_per_second": 16.768, |
|
"eval_steps_per_second": 1.051, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"learning_rate": 2e-05, |
|
"loss": 4461.7637, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"learning_rate": 2e-05, |
|
"loss": 3879.4729, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 2e-05, |
|
"loss": 3731.812, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 2e-05, |
|
"loss": 4175.4346, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 4113.404296875, |
|
"eval_runtime": 95.5754, |
|
"eval_samples_per_second": 16.699, |
|
"eval_steps_per_second": 1.046, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"learning_rate": 2e-05, |
|
"loss": 3942.5459, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 2e-05, |
|
"loss": 4637.8037, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"learning_rate": 2e-05, |
|
"loss": 4628.354, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 2e-05, |
|
"loss": 4403.8843, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"eval_loss": 4094.84375, |
|
"eval_runtime": 95.3998, |
|
"eval_samples_per_second": 16.73, |
|
"eval_steps_per_second": 1.048, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"learning_rate": 2e-05, |
|
"loss": 3811.9785, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 2e-05, |
|
"loss": 4257.5371, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"learning_rate": 2e-05, |
|
"loss": 3668.6951, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"learning_rate": 2e-05, |
|
"loss": 3921.7227, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 4081.22021484375, |
|
"eval_runtime": 95.4773, |
|
"eval_samples_per_second": 16.716, |
|
"eval_steps_per_second": 1.047, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 2e-05, |
|
"loss": 3529.4546, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 2e-05, |
|
"loss": 4019.564, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"learning_rate": 2e-05, |
|
"loss": 4207.9526, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 2e-05, |
|
"loss": 4506.7544, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 4089.59033203125, |
|
"eval_runtime": 95.5353, |
|
"eval_samples_per_second": 16.706, |
|
"eval_steps_per_second": 1.047, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 2e-05, |
|
"loss": 3739.2524, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 2e-05, |
|
"loss": 4080.3894, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 2e-05, |
|
"loss": 3937.1353, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"learning_rate": 2e-05, |
|
"loss": 4788.46, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_loss": 4071.869140625, |
|
"eval_runtime": 95.2152, |
|
"eval_samples_per_second": 16.762, |
|
"eval_steps_per_second": 1.05, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"learning_rate": 2e-05, |
|
"loss": 3903.95, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"learning_rate": 2e-05, |
|
"loss": 4017.4819, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 2e-05, |
|
"loss": 3605.0239, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 2e-05, |
|
"loss": 4021.0627, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 4055.228271484375, |
|
"eval_runtime": 95.2891, |
|
"eval_samples_per_second": 16.749, |
|
"eval_steps_per_second": 1.049, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"learning_rate": 2e-05, |
|
"loss": 3843.1729, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"learning_rate": 2e-05, |
|
"loss": 3957.4761, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"learning_rate": 2e-05, |
|
"loss": 4018.5742, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"learning_rate": 2e-05, |
|
"loss": 3802.7734, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 4049.432373046875, |
|
"eval_runtime": 95.3586, |
|
"eval_samples_per_second": 16.737, |
|
"eval_steps_per_second": 1.049, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"learning_rate": 2e-05, |
|
"loss": 4562.4453, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 2e-05, |
|
"loss": 3944.2036, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 2e-05, |
|
"loss": 4023.9956, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"learning_rate": 2e-05, |
|
"loss": 4351.9863, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 4015.07373046875, |
|
"eval_runtime": 95.1955, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 1.05, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"learning_rate": 2e-05, |
|
"loss": 3598.9106, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"learning_rate": 2e-05, |
|
"loss": 3748.6475, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 2e-05, |
|
"loss": 3714.1748, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 2e-05, |
|
"loss": 4423.7349, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 4013.801025390625, |
|
"eval_runtime": 95.1615, |
|
"eval_samples_per_second": 16.771, |
|
"eval_steps_per_second": 1.051, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"learning_rate": 2e-05, |
|
"loss": 3955.9373, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"learning_rate": 2e-05, |
|
"loss": 4049.354, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"learning_rate": 2e-05, |
|
"loss": 4312.8247, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 2e-05, |
|
"loss": 3894.739, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"eval_loss": 4020.21240234375, |
|
"eval_runtime": 94.9862, |
|
"eval_samples_per_second": 16.802, |
|
"eval_steps_per_second": 1.053, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 2e-05, |
|
"loss": 3237.6282, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"learning_rate": 2e-05, |
|
"loss": 4196.6562, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"learning_rate": 2e-05, |
|
"loss": 4185.1821, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"learning_rate": 2e-05, |
|
"loss": 3637.6782, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 4004.71826171875, |
|
"eval_runtime": 95.0911, |
|
"eval_samples_per_second": 16.784, |
|
"eval_steps_per_second": 1.052, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 2e-05, |
|
"loss": 3988.9346, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 2e-05, |
|
"loss": 3972.2412, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"learning_rate": 2e-05, |
|
"loss": 4046.356, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"learning_rate": 2e-05, |
|
"loss": 4103.7446, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"eval_loss": 3999.879150390625, |
|
"eval_runtime": 95.151, |
|
"eval_samples_per_second": 16.773, |
|
"eval_steps_per_second": 1.051, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"learning_rate": 2e-05, |
|
"loss": 4567.3848, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"learning_rate": 2e-05, |
|
"loss": 4096.2393, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 2e-05, |
|
"loss": 3677.2112, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"learning_rate": 2e-05, |
|
"loss": 3869.1951, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 3975.71240234375, |
|
"eval_runtime": 95.223, |
|
"eval_samples_per_second": 16.761, |
|
"eval_steps_per_second": 1.05, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"learning_rate": 2e-05, |
|
"loss": 4281.8516, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 2e-05, |
|
"loss": 4052.2227, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"learning_rate": 2e-05, |
|
"loss": 4156.7637, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 2e-05, |
|
"loss": 3435.3076, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 3955.1552734375, |
|
"eval_runtime": 95.1337, |
|
"eval_samples_per_second": 16.776, |
|
"eval_steps_per_second": 1.051, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 2e-05, |
|
"loss": 3932.6658, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"learning_rate": 2e-05, |
|
"loss": 3382.2915, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 2e-05, |
|
"loss": 3378.9287, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 2e-05, |
|
"loss": 3811.2612, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_loss": 3958.28759765625, |
|
"eval_runtime": 94.998, |
|
"eval_samples_per_second": 16.8, |
|
"eval_steps_per_second": 1.053, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"learning_rate": 2e-05, |
|
"loss": 4169.3755, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 2e-05, |
|
"loss": 4260.3926, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"learning_rate": 2e-05, |
|
"loss": 4086.002, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"learning_rate": 2e-05, |
|
"loss": 4149.6416, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"eval_loss": 3951.014404296875, |
|
"eval_runtime": 95.1617, |
|
"eval_samples_per_second": 16.771, |
|
"eval_steps_per_second": 1.051, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"learning_rate": 2e-05, |
|
"loss": 3471.564, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"learning_rate": 2e-05, |
|
"loss": 3824.6528, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 2e-05, |
|
"loss": 3360.2578, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"learning_rate": 2e-05, |
|
"loss": 3621.5127, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_loss": 3957.65380859375, |
|
"eval_runtime": 95.1799, |
|
"eval_samples_per_second": 16.768, |
|
"eval_steps_per_second": 1.051, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"learning_rate": 2e-05, |
|
"loss": 4050.9973, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 2e-05, |
|
"loss": 4221.9043, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 2e-05, |
|
"loss": 3668.5938, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"learning_rate": 2e-05, |
|
"loss": 3374.7612, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_loss": 3935.7607421875, |
|
"eval_runtime": 95.1897, |
|
"eval_samples_per_second": 16.767, |
|
"eval_steps_per_second": 1.051, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 2e-05, |
|
"loss": 3811.7754, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"learning_rate": 2e-05, |
|
"loss": 3923.9185, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"learning_rate": 2e-05, |
|
"loss": 3072.0181, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 2e-05, |
|
"loss": 4219.8125, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 3909.99560546875, |
|
"eval_runtime": 95.2009, |
|
"eval_samples_per_second": 16.765, |
|
"eval_steps_per_second": 1.05, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"learning_rate": 2e-05, |
|
"loss": 4093.2798, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 2e-05, |
|
"loss": 4144.9243, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 2e-05, |
|
"loss": 3701.4731, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"learning_rate": 2e-05, |
|
"loss": 3713.2314, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"eval_loss": 3911.078857421875, |
|
"eval_runtime": 95.0383, |
|
"eval_samples_per_second": 16.793, |
|
"eval_steps_per_second": 1.052, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"learning_rate": 2e-05, |
|
"loss": 3543.9497, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"learning_rate": 2e-05, |
|
"loss": 3692.1785, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 2e-05, |
|
"loss": 4061.7036, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"learning_rate": 2e-05, |
|
"loss": 3513.0017, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"eval_loss": 3910.442626953125, |
|
"eval_runtime": 95.2387, |
|
"eval_samples_per_second": 16.758, |
|
"eval_steps_per_second": 1.05, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"learning_rate": 2e-05, |
|
"loss": 4166.7217, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"learning_rate": 2e-05, |
|
"loss": 3680.3262, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"learning_rate": 2e-05, |
|
"loss": 3960.1064, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 2e-05, |
|
"loss": 3358.9592, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"eval_loss": 3899.7275390625, |
|
"eval_runtime": 95.1727, |
|
"eval_samples_per_second": 16.77, |
|
"eval_steps_per_second": 1.051, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 2e-05, |
|
"loss": 3664.8364, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"learning_rate": 2e-05, |
|
"loss": 3645.0352, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"learning_rate": 2e-05, |
|
"loss": 4320.4873, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"learning_rate": 2e-05, |
|
"loss": 4029.3306, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_loss": 3905.114013671875, |
|
"eval_runtime": 95.1752, |
|
"eval_samples_per_second": 16.769, |
|
"eval_steps_per_second": 1.051, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"learning_rate": 2e-05, |
|
"loss": 4058.9121, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 2e-05, |
|
"loss": 4163.0127, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"learning_rate": 2e-05, |
|
"loss": 3698.998, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 2e-05, |
|
"loss": 3724.9954, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 3902.5595703125, |
|
"eval_runtime": 95.1954, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 1.05, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"learning_rate": 2e-05, |
|
"loss": 4149.5771, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"learning_rate": 2e-05, |
|
"loss": 3952.4429, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"learning_rate": 2e-05, |
|
"loss": 3482.0449, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 2e-05, |
|
"loss": 3825.3076, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"eval_loss": 3882.601806640625, |
|
"eval_runtime": 95.0573, |
|
"eval_samples_per_second": 16.79, |
|
"eval_steps_per_second": 1.052, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"learning_rate": 2e-05, |
|
"loss": 3698.0227, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"learning_rate": 2e-05, |
|
"loss": 3797.3899, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"learning_rate": 2e-05, |
|
"loss": 3853.0374, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"learning_rate": 2e-05, |
|
"loss": 3591.9854, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"eval_loss": 3878.234375, |
|
"eval_runtime": 95.1839, |
|
"eval_samples_per_second": 16.768, |
|
"eval_steps_per_second": 1.051, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 2e-05, |
|
"loss": 3889.3989, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 2e-05, |
|
"loss": 3946.7024, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 2e-05, |
|
"loss": 4180.001, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"learning_rate": 2e-05, |
|
"loss": 3440.269, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"eval_loss": 3873.271728515625, |
|
"eval_runtime": 95.2614, |
|
"eval_samples_per_second": 16.754, |
|
"eval_steps_per_second": 1.05, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"learning_rate": 2e-05, |
|
"loss": 3619.1611, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 2e-05, |
|
"loss": 3825.2026, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"learning_rate": 2e-05, |
|
"loss": 4002.6731, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"learning_rate": 2e-05, |
|
"loss": 3595.2097, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_loss": 3864.229248046875, |
|
"eval_runtime": 95.4953, |
|
"eval_samples_per_second": 16.713, |
|
"eval_steps_per_second": 1.047, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"learning_rate": 2e-05, |
|
"loss": 3813.6006, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"learning_rate": 2e-05, |
|
"loss": 4163.6484, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"learning_rate": 2e-05, |
|
"loss": 3158.1484, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 2e-05, |
|
"loss": 3456.5925, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 3866.61767578125, |
|
"eval_runtime": 95.7468, |
|
"eval_samples_per_second": 16.669, |
|
"eval_steps_per_second": 1.044, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"learning_rate": 2e-05, |
|
"loss": 3924.3809, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"learning_rate": 2e-05, |
|
"loss": 4106.1128, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"learning_rate": 2e-05, |
|
"loss": 3968.1797, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"learning_rate": 2e-05, |
|
"loss": 3913.6394, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"eval_loss": 3853.93505859375, |
|
"eval_runtime": 95.3329, |
|
"eval_samples_per_second": 16.741, |
|
"eval_steps_per_second": 1.049, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 2e-05, |
|
"loss": 3787.5688, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"learning_rate": 2e-05, |
|
"loss": 4051.9841, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 2e-05, |
|
"loss": 3468.6104, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"learning_rate": 2e-05, |
|
"loss": 4153.1147, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_loss": 3836.56201171875, |
|
"eval_runtime": 95.4516, |
|
"eval_samples_per_second": 16.721, |
|
"eval_steps_per_second": 1.048, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"learning_rate": 2e-05, |
|
"loss": 3588.4766, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 2e-05, |
|
"loss": 3939.7871, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"learning_rate": 2e-05, |
|
"loss": 4142.5205, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 2e-05, |
|
"loss": 3433.5652, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_loss": 3846.0400390625, |
|
"eval_runtime": 95.5865, |
|
"eval_samples_per_second": 16.697, |
|
"eval_steps_per_second": 1.046, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"learning_rate": 2e-05, |
|
"loss": 3704.4355, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"learning_rate": 2e-05, |
|
"loss": 3561.1382, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"learning_rate": 2e-05, |
|
"loss": 4084.75, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 2e-05, |
|
"loss": 3592.4402, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"eval_loss": 3824.31298828125, |
|
"eval_runtime": 95.6271, |
|
"eval_samples_per_second": 16.69, |
|
"eval_steps_per_second": 1.046, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"learning_rate": 2e-05, |
|
"loss": 3653.906, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"learning_rate": 2e-05, |
|
"loss": 3920.9485, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 2e-05, |
|
"loss": 4010.104, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 2e-05, |
|
"loss": 4472.293, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 3811.89013671875, |
|
"eval_runtime": 95.218, |
|
"eval_samples_per_second": 16.762, |
|
"eval_steps_per_second": 1.05, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 2e-05, |
|
"loss": 4144.3301, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"learning_rate": 2e-05, |
|
"loss": 3661.8413, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"learning_rate": 2e-05, |
|
"loss": 4158.1958, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"learning_rate": 2e-05, |
|
"loss": 3705.3564, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"eval_loss": 3801.527587890625, |
|
"eval_runtime": 95.4031, |
|
"eval_samples_per_second": 16.729, |
|
"eval_steps_per_second": 1.048, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"learning_rate": 2e-05, |
|
"loss": 3678.627, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 2e-05, |
|
"loss": 3692.2642, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 2e-05, |
|
"loss": 4010.1907, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"learning_rate": 2e-05, |
|
"loss": 3680.8633, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_loss": 3794.080322265625, |
|
"eval_runtime": 95.2615, |
|
"eval_samples_per_second": 16.754, |
|
"eval_steps_per_second": 1.05, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"learning_rate": 2e-05, |
|
"loss": 4088.3789, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"learning_rate": 2e-05, |
|
"loss": 3263.083, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"learning_rate": 2e-05, |
|
"loss": 3412.7646, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 2e-05, |
|
"loss": 2886.2148, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"eval_loss": 3781.141357421875, |
|
"eval_runtime": 95.4625, |
|
"eval_samples_per_second": 16.719, |
|
"eval_steps_per_second": 1.048, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"learning_rate": 2e-05, |
|
"loss": 3421.999, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"learning_rate": 2e-05, |
|
"loss": 4785.0142, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"learning_rate": 2e-05, |
|
"loss": 3859.052, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"learning_rate": 2e-05, |
|
"loss": 3919.5405, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"eval_loss": 3814.05224609375, |
|
"eval_runtime": 95.377, |
|
"eval_samples_per_second": 16.734, |
|
"eval_steps_per_second": 1.048, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 2e-05, |
|
"loss": 3466.0466, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"learning_rate": 2e-05, |
|
"loss": 3382.6812, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"learning_rate": 2e-05, |
|
"loss": 3554.6763, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 2e-05, |
|
"loss": 3869.5581, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 3784.0302734375, |
|
"eval_runtime": 95.3698, |
|
"eval_samples_per_second": 16.735, |
|
"eval_steps_per_second": 1.049, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"learning_rate": 2e-05, |
|
"loss": 3639.7705, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 2e-05, |
|
"loss": 3887.6313, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 2e-05, |
|
"loss": 3908.9287, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"learning_rate": 2e-05, |
|
"loss": 3672.9302, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"eval_loss": 3771.347412109375, |
|
"eval_runtime": 95.2036, |
|
"eval_samples_per_second": 16.764, |
|
"eval_steps_per_second": 1.05, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"learning_rate": 2e-05, |
|
"loss": 3436.8274, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"learning_rate": 2e-05, |
|
"loss": 3314.8804, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 2e-05, |
|
"loss": 4389.9253, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 2e-05, |
|
"loss": 4018.4453, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"eval_loss": 3771.130859375, |
|
"eval_runtime": 95.3816, |
|
"eval_samples_per_second": 16.733, |
|
"eval_steps_per_second": 1.048, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"learning_rate": 2e-05, |
|
"loss": 3510.4697, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"learning_rate": 2e-05, |
|
"loss": 3604.1636, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"learning_rate": 2e-05, |
|
"loss": 4791.7847, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"learning_rate": 2e-05, |
|
"loss": 3840.9712, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"eval_loss": 3769.62744140625, |
|
"eval_runtime": 95.364, |
|
"eval_samples_per_second": 16.736, |
|
"eval_steps_per_second": 1.049, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 2e-05, |
|
"loss": 3833.2412, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"learning_rate": 2e-05, |
|
"loss": 3851.5979, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 2e-05, |
|
"loss": 3107.7024, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"learning_rate": 2e-05, |
|
"loss": 3598.7141, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_loss": 3741.15283203125, |
|
"eval_runtime": 95.3181, |
|
"eval_samples_per_second": 16.744, |
|
"eval_steps_per_second": 1.049, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"learning_rate": 2e-05, |
|
"loss": 3801.637, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"learning_rate": 2e-05, |
|
"loss": 3975.8054, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 2e-05, |
|
"loss": 3865.6533, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 2e-05, |
|
"loss": 4142.3828, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"eval_loss": 3735.14892578125, |
|
"eval_runtime": 95.3278, |
|
"eval_samples_per_second": 16.742, |
|
"eval_steps_per_second": 1.049, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"learning_rate": 2e-05, |
|
"loss": 3423.9048, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"learning_rate": 2e-05, |
|
"loss": 3500.3408, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"learning_rate": 2e-05, |
|
"loss": 3935.4355, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 2e-05, |
|
"loss": 3490.2856, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"eval_loss": 3755.257568359375, |
|
"eval_runtime": 95.3238, |
|
"eval_samples_per_second": 16.743, |
|
"eval_steps_per_second": 1.049, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"learning_rate": 2e-05, |
|
"loss": 3251.2615, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 2e-05, |
|
"loss": 3406.0591, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"learning_rate": 2e-05, |
|
"loss": 3306.6914, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 2e-05, |
|
"loss": 3858.6367, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_loss": 3728.05615234375, |
|
"eval_runtime": 95.4779, |
|
"eval_samples_per_second": 16.716, |
|
"eval_steps_per_second": 1.047, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"learning_rate": 2e-05, |
|
"loss": 2887.1479, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 2e-05, |
|
"loss": 3831.7244, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"learning_rate": 2e-05, |
|
"loss": 3361.1741, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 2e-05, |
|
"loss": 3100.0488, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"eval_loss": 3719.01953125, |
|
"eval_runtime": 95.2657, |
|
"eval_samples_per_second": 16.753, |
|
"eval_steps_per_second": 1.05, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"learning_rate": 2e-05, |
|
"loss": 3521.7913, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"learning_rate": 2e-05, |
|
"loss": 3286.1931, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 2e-05, |
|
"loss": 4122.73, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"learning_rate": 2e-05, |
|
"loss": 3818.4766, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"eval_loss": 3691.170166015625, |
|
"eval_runtime": 95.4436, |
|
"eval_samples_per_second": 16.722, |
|
"eval_steps_per_second": 1.048, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"learning_rate": 2e-05, |
|
"loss": 3285.2466, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"learning_rate": 2e-05, |
|
"loss": 3424.1902, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"learning_rate": 2e-05, |
|
"loss": 3263.9805, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 2e-05, |
|
"loss": 3721.5125, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 3691.932861328125, |
|
"eval_runtime": 95.5042, |
|
"eval_samples_per_second": 16.711, |
|
"eval_steps_per_second": 1.047, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 2e-05, |
|
"loss": 3110.5813, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"learning_rate": 2e-05, |
|
"loss": 3151.187, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"learning_rate": 2e-05, |
|
"loss": 3511.4319, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 2e-05, |
|
"loss": 3510.0305, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"eval_loss": 3845.365234375, |
|
"eval_runtime": 95.2618, |
|
"eval_samples_per_second": 16.754, |
|
"eval_steps_per_second": 1.05, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"learning_rate": 2e-05, |
|
"loss": 3422.686, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 2e-05, |
|
"loss": 4240.2275, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"learning_rate": 2e-05, |
|
"loss": 3544.7195, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"learning_rate": 2e-05, |
|
"loss": 3897.0737, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"eval_loss": 3735.1435546875, |
|
"eval_runtime": 95.385, |
|
"eval_samples_per_second": 16.732, |
|
"eval_steps_per_second": 1.048, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"learning_rate": 2e-05, |
|
"loss": 3670.8647, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"learning_rate": 2e-05, |
|
"loss": 3732.1475, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 2e-05, |
|
"loss": 3360.2307, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"learning_rate": 2e-05, |
|
"loss": 3394.595, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"eval_loss": 3689.341552734375, |
|
"eval_runtime": 95.4346, |
|
"eval_samples_per_second": 16.724, |
|
"eval_steps_per_second": 1.048, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"learning_rate": 2e-05, |
|
"loss": 3888.7461, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"learning_rate": 2e-05, |
|
"loss": 3887.3716, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"learning_rate": 2e-05, |
|
"loss": 3540.4429, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 2e-05, |
|
"loss": 3279.0503, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"eval_loss": 3688.84619140625, |
|
"eval_runtime": 95.5526, |
|
"eval_samples_per_second": 16.703, |
|
"eval_steps_per_second": 1.047, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"learning_rate": 2e-05, |
|
"loss": 3422.4468, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"learning_rate": 2e-05, |
|
"loss": 3986.1982, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"learning_rate": 2e-05, |
|
"loss": 3540.3237, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"learning_rate": 2e-05, |
|
"loss": 3527.5225, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"eval_loss": 3675.298828125, |
|
"eval_runtime": 95.6115, |
|
"eval_samples_per_second": 16.693, |
|
"eval_steps_per_second": 1.046, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"learning_rate": 2e-05, |
|
"loss": 3525.8005, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 2e-05, |
|
"loss": 3136.1785, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"learning_rate": 2e-05, |
|
"loss": 3337.8206, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"learning_rate": 2e-05, |
|
"loss": 3592.8411, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"eval_loss": 3678.1904296875, |
|
"eval_runtime": 95.1358, |
|
"eval_samples_per_second": 16.776, |
|
"eval_steps_per_second": 1.051, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"learning_rate": 2e-05, |
|
"loss": 3581.0991, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"learning_rate": 2e-05, |
|
"loss": 4133.2891, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 2e-05, |
|
"loss": 3552.9802, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"learning_rate": 2e-05, |
|
"loss": 3522.3882, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"eval_loss": 3674.487548828125, |
|
"eval_runtime": 95.2781, |
|
"eval_samples_per_second": 16.751, |
|
"eval_steps_per_second": 1.05, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"learning_rate": 2e-05, |
|
"loss": 3406.5796, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 2e-05, |
|
"loss": 3628.0713, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"learning_rate": 2e-05, |
|
"loss": 3814.6108, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 2e-05, |
|
"loss": 3952.0635, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_loss": 3655.67919921875, |
|
"eval_runtime": 95.4463, |
|
"eval_samples_per_second": 16.721, |
|
"eval_steps_per_second": 1.048, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 2e-05, |
|
"loss": 3041.1628, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"learning_rate": 2e-05, |
|
"loss": 3882.897, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"learning_rate": 2e-05, |
|
"loss": 3482.4553, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"learning_rate": 2e-05, |
|
"loss": 3240.0728, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"eval_loss": 3666.441650390625, |
|
"eval_runtime": 95.3661, |
|
"eval_samples_per_second": 16.736, |
|
"eval_steps_per_second": 1.049, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"learning_rate": 2e-05, |
|
"loss": 3923.905, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 2e-05, |
|
"loss": 3575.6948, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"learning_rate": 2e-05, |
|
"loss": 2891.4922, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 2e-05, |
|
"loss": 3905.6792, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 3648.62841796875, |
|
"eval_runtime": 95.4232, |
|
"eval_samples_per_second": 16.725, |
|
"eval_steps_per_second": 1.048, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 400, |
|
"total_flos": 1.624099600371548e+17, |
|
"train_loss": 4450.289553833008, |
|
"train_runtime": 14295.8762, |
|
"train_samples_per_second": 1.787, |
|
"train_steps_per_second": 0.028 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 20, |
|
"total_flos": 1.624099600371548e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|