diff --git "a/checkpoint-28500/trainer_state.json" "b/checkpoint-28500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-28500/trainer_state.json" @@ -0,0 +1,19984 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9769954476479514, + "eval_steps": 500, + "global_step": 28500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006936917407327119, + "grad_norm": 12224.0, + "learning_rate": 4.998434782608696e-06, + "loss": 10.7132, + "step": 10 + }, + { + "epoch": 0.0013873834814654238, + "grad_norm": 19712.0, + "learning_rate": 4.9966956521739135e-06, + "loss": 1.4256, + "step": 20 + }, + { + "epoch": 0.0020810752221981357, + "grad_norm": 21504.0, + "learning_rate": 4.994956521739131e-06, + "loss": 0.8714, + "step": 30 + }, + { + "epoch": 0.0027747669629308476, + "grad_norm": 2624.0, + "learning_rate": 4.993217391304348e-06, + "loss": 0.7117, + "step": 40 + }, + { + "epoch": 0.0034684587036635595, + "grad_norm": 23.875, + "learning_rate": 4.991478260869566e-06, + "loss": 0.3634, + "step": 50 + }, + { + "epoch": 0.004162150444396271, + "grad_norm": 31.375, + "learning_rate": 4.989739130434783e-06, + "loss": 0.7818, + "step": 60 + }, + { + "epoch": 0.004855842185128983, + "grad_norm": 8096.0, + "learning_rate": 4.988e-06, + "loss": 0.5493, + "step": 70 + }, + { + "epoch": 0.005549533925861695, + "grad_norm": 27.875, + "learning_rate": 4.986260869565218e-06, + "loss": 0.457, + "step": 80 + }, + { + "epoch": 0.006243225666594407, + "grad_norm": 37632.0, + "learning_rate": 4.984521739130435e-06, + "loss": 0.4123, + "step": 90 + }, + { + "epoch": 0.006936917407327119, + "grad_norm": 61.5, + "learning_rate": 4.9827826086956525e-06, + "loss": 0.3742, + "step": 100 + }, + { + "epoch": 0.007630609148059831, + "grad_norm": 34.25, + "learning_rate": 4.98104347826087e-06, + "loss": 0.2794, + "step": 110 + }, + { + "epoch": 0.008324300888792543, + "grad_norm": 61.5, + "learning_rate": 4.979304347826087e-06, + "loss": 0.2913, + "step": 120 + }, + { + "epoch": 0.009017992629525256, + "grad_norm": 8.5, + "learning_rate": 4.977565217391305e-06, + "loss": 0.3476, + "step": 130 + }, + { + "epoch": 0.009711684370257967, + "grad_norm": 76800.0, + "learning_rate": 4.975826086956522e-06, + "loss": 1.5003, + "step": 140 + }, + { + "epoch": 0.01040537611099068, + "grad_norm": 120.5, + "learning_rate": 4.97408695652174e-06, + "loss": 0.3051, + "step": 150 + }, + { + "epoch": 0.01109906785172339, + "grad_norm": 2.28125, + "learning_rate": 4.972347826086957e-06, + "loss": 0.2632, + "step": 160 + }, + { + "epoch": 0.011792759592456103, + "grad_norm": 12.6875, + "learning_rate": 4.970608695652174e-06, + "loss": 0.2847, + "step": 170 + }, + { + "epoch": 0.012486451333188814, + "grad_norm": 20.75, + "learning_rate": 4.9688695652173914e-06, + "loss": 0.3436, + "step": 180 + }, + { + "epoch": 0.013180143073921527, + "grad_norm": 234.0, + "learning_rate": 4.96713043478261e-06, + "loss": 0.2752, + "step": 190 + }, + { + "epoch": 0.013873834814654238, + "grad_norm": 10.25, + "learning_rate": 4.965391304347826e-06, + "loss": 0.3504, + "step": 200 + }, + { + "epoch": 0.01456752655538695, + "grad_norm": 8.0625, + "learning_rate": 4.9636521739130436e-06, + "loss": 0.2447, + "step": 210 + }, + { + "epoch": 0.015261218296119662, + "grad_norm": 3.640625, + "learning_rate": 4.961913043478262e-06, + "loss": 0.2864, + "step": 220 + }, + { + "epoch": 0.015954910036852375, + "grad_norm": 7.6875, + "learning_rate": 4.960173913043478e-06, + "loss": 0.2455, + "step": 230 + }, + { + "epoch": 0.016648601777585086, + "grad_norm": 10.25, + "learning_rate": 4.958434782608696e-06, + "loss": 0.2847, + "step": 240 + }, + { + "epoch": 0.017342293518317797, + "grad_norm": 6.75, + "learning_rate": 4.956695652173914e-06, + "loss": 0.2797, + "step": 250 + }, + { + "epoch": 0.01803598525905051, + "grad_norm": 9.4375, + "learning_rate": 4.954956521739131e-06, + "loss": 0.5388, + "step": 260 + }, + { + "epoch": 0.018729676999783222, + "grad_norm": 125.0, + "learning_rate": 4.953217391304348e-06, + "loss": 0.2508, + "step": 270 + }, + { + "epoch": 0.019423368740515933, + "grad_norm": 2.515625, + "learning_rate": 4.951478260869565e-06, + "loss": 0.263, + "step": 280 + }, + { + "epoch": 0.020117060481248644, + "grad_norm": 1.578125, + "learning_rate": 4.949739130434783e-06, + "loss": 0.2583, + "step": 290 + }, + { + "epoch": 0.02081075222198136, + "grad_norm": 4.53125, + "learning_rate": 4.948000000000001e-06, + "loss": 0.2684, + "step": 300 + }, + { + "epoch": 0.02150444396271407, + "grad_norm": 19.625, + "learning_rate": 4.946260869565217e-06, + "loss": 0.2318, + "step": 310 + }, + { + "epoch": 0.02219813570344678, + "grad_norm": 15.8125, + "learning_rate": 4.9445217391304355e-06, + "loss": 0.2509, + "step": 320 + }, + { + "epoch": 0.022891827444179492, + "grad_norm": 4.5625, + "learning_rate": 4.942782608695653e-06, + "loss": 0.2428, + "step": 330 + }, + { + "epoch": 0.023585519184912206, + "grad_norm": 20.75, + "learning_rate": 4.94104347826087e-06, + "loss": 0.2789, + "step": 340 + }, + { + "epoch": 0.024279210925644917, + "grad_norm": 1.59375, + "learning_rate": 4.939304347826087e-06, + "loss": 0.2624, + "step": 350 + }, + { + "epoch": 0.02497290266637763, + "grad_norm": 1.6640625, + "learning_rate": 4.937565217391305e-06, + "loss": 0.2712, + "step": 360 + }, + { + "epoch": 0.02566659440711034, + "grad_norm": 2.0625, + "learning_rate": 4.935826086956522e-06, + "loss": 0.2973, + "step": 370 + }, + { + "epoch": 0.026360286147843054, + "grad_norm": 2.4375, + "learning_rate": 4.93408695652174e-06, + "loss": 0.2769, + "step": 380 + }, + { + "epoch": 0.027053977888575765, + "grad_norm": 1.3203125, + "learning_rate": 4.932347826086957e-06, + "loss": 0.2675, + "step": 390 + }, + { + "epoch": 0.027747669629308476, + "grad_norm": 4.21875, + "learning_rate": 4.9306086956521744e-06, + "loss": 0.2663, + "step": 400 + }, + { + "epoch": 0.028441361370041187, + "grad_norm": 17.75, + "learning_rate": 4.928869565217392e-06, + "loss": 0.3639, + "step": 410 + }, + { + "epoch": 0.0291350531107739, + "grad_norm": 5.03125, + "learning_rate": 4.927130434782609e-06, + "loss": 0.3279, + "step": 420 + }, + { + "epoch": 0.029828744851506613, + "grad_norm": 2.21875, + "learning_rate": 4.9253913043478266e-06, + "loss": 0.2773, + "step": 430 + }, + { + "epoch": 0.030522436592239324, + "grad_norm": 6.625, + "learning_rate": 4.923652173913044e-06, + "loss": 0.2445, + "step": 440 + }, + { + "epoch": 0.031216128332972035, + "grad_norm": 3.84375, + "learning_rate": 4.921913043478261e-06, + "loss": 0.2256, + "step": 450 + }, + { + "epoch": 0.03190982007370475, + "grad_norm": 3.15625, + "learning_rate": 4.920173913043479e-06, + "loss": 0.2288, + "step": 460 + }, + { + "epoch": 0.03260351181443746, + "grad_norm": 52.0, + "learning_rate": 4.918434782608696e-06, + "loss": 0.2662, + "step": 470 + }, + { + "epoch": 0.03329720355517017, + "grad_norm": 13.4375, + "learning_rate": 4.916695652173913e-06, + "loss": 0.2476, + "step": 480 + }, + { + "epoch": 0.033990895295902886, + "grad_norm": 49.0, + "learning_rate": 4.914956521739131e-06, + "loss": 0.2328, + "step": 490 + }, + { + "epoch": 0.03468458703663559, + "grad_norm": 1.5546875, + "learning_rate": 4.913217391304348e-06, + "loss": 0.2463, + "step": 500 + }, + { + "epoch": 0.03537827877736831, + "grad_norm": 10.5625, + "learning_rate": 4.9114782608695655e-06, + "loss": 0.2657, + "step": 510 + }, + { + "epoch": 0.03607197051810102, + "grad_norm": 2.1875, + "learning_rate": 4.909739130434783e-06, + "loss": 0.268, + "step": 520 + }, + { + "epoch": 0.03676566225883373, + "grad_norm": 1.9921875, + "learning_rate": 4.908e-06, + "loss": 0.3324, + "step": 530 + }, + { + "epoch": 0.037459353999566444, + "grad_norm": 1.5703125, + "learning_rate": 4.906260869565218e-06, + "loss": 0.2086, + "step": 540 + }, + { + "epoch": 0.03815304574029915, + "grad_norm": 2.453125, + "learning_rate": 4.904521739130435e-06, + "loss": 0.2581, + "step": 550 + }, + { + "epoch": 0.038846737481031866, + "grad_norm": 1.4609375, + "learning_rate": 4.902782608695652e-06, + "loss": 0.2908, + "step": 560 + }, + { + "epoch": 0.03954042922176458, + "grad_norm": 4.28125, + "learning_rate": 4.90104347826087e-06, + "loss": 0.2192, + "step": 570 + }, + { + "epoch": 0.04023412096249729, + "grad_norm": 3.859375, + "learning_rate": 4.899304347826087e-06, + "loss": 0.2916, + "step": 580 + }, + { + "epoch": 0.04092781270323, + "grad_norm": 1.390625, + "learning_rate": 4.8975652173913045e-06, + "loss": 0.2388, + "step": 590 + }, + { + "epoch": 0.04162150444396272, + "grad_norm": 1.5859375, + "learning_rate": 4.895826086956522e-06, + "loss": 0.2509, + "step": 600 + }, + { + "epoch": 0.042315196184695425, + "grad_norm": 1.3515625, + "learning_rate": 4.89408695652174e-06, + "loss": 0.2736, + "step": 610 + }, + { + "epoch": 0.04300888792542814, + "grad_norm": 5.09375, + "learning_rate": 4.892347826086957e-06, + "loss": 0.329, + "step": 620 + }, + { + "epoch": 0.04370257966616085, + "grad_norm": 1.15625, + "learning_rate": 4.890608695652174e-06, + "loss": 0.2188, + "step": 630 + }, + { + "epoch": 0.04439627140689356, + "grad_norm": 1.640625, + "learning_rate": 4.888869565217391e-06, + "loss": 0.2942, + "step": 640 + }, + { + "epoch": 0.045089963147626276, + "grad_norm": 1.046875, + "learning_rate": 4.8871304347826096e-06, + "loss": 0.2324, + "step": 650 + }, + { + "epoch": 0.045783654888358984, + "grad_norm": 6.40625, + "learning_rate": 4.885391304347826e-06, + "loss": 0.2601, + "step": 660 + }, + { + "epoch": 0.0464773466290917, + "grad_norm": 1.859375, + "learning_rate": 4.8836521739130435e-06, + "loss": 0.2292, + "step": 670 + }, + { + "epoch": 0.04717103836982441, + "grad_norm": 1.84375, + "learning_rate": 4.881913043478262e-06, + "loss": 0.2719, + "step": 680 + }, + { + "epoch": 0.04786473011055712, + "grad_norm": 2.125, + "learning_rate": 4.880173913043479e-06, + "loss": 0.2875, + "step": 690 + }, + { + "epoch": 0.048558421851289835, + "grad_norm": 2.265625, + "learning_rate": 4.878434782608696e-06, + "loss": 0.2648, + "step": 700 + }, + { + "epoch": 0.04925211359202254, + "grad_norm": 1.625, + "learning_rate": 4.876695652173914e-06, + "loss": 0.2901, + "step": 710 + }, + { + "epoch": 0.04994580533275526, + "grad_norm": 1.2421875, + "learning_rate": 4.874956521739131e-06, + "loss": 0.2532, + "step": 720 + }, + { + "epoch": 0.05063949707348797, + "grad_norm": 1.953125, + "learning_rate": 4.8732173913043485e-06, + "loss": 0.2736, + "step": 730 + }, + { + "epoch": 0.05133318881422068, + "grad_norm": 2.390625, + "learning_rate": 4.871478260869565e-06, + "loss": 0.2531, + "step": 740 + }, + { + "epoch": 0.05202688055495339, + "grad_norm": 1.875, + "learning_rate": 4.869739130434783e-06, + "loss": 0.262, + "step": 750 + }, + { + "epoch": 0.05272057229568611, + "grad_norm": 2.171875, + "learning_rate": 4.868000000000001e-06, + "loss": 0.2242, + "step": 760 + }, + { + "epoch": 0.053414264036418815, + "grad_norm": 2.546875, + "learning_rate": 4.866260869565218e-06, + "loss": 0.2873, + "step": 770 + }, + { + "epoch": 0.05410795577715153, + "grad_norm": 1.4765625, + "learning_rate": 4.864521739130435e-06, + "loss": 0.2867, + "step": 780 + }, + { + "epoch": 0.05480164751788424, + "grad_norm": 1.40625, + "learning_rate": 4.862782608695653e-06, + "loss": 0.2815, + "step": 790 + }, + { + "epoch": 0.05549533925861695, + "grad_norm": 1.7265625, + "learning_rate": 4.86104347826087e-06, + "loss": 0.2943, + "step": 800 + }, + { + "epoch": 0.056189030999349666, + "grad_norm": 1.4375, + "learning_rate": 4.8593043478260875e-06, + "loss": 0.2723, + "step": 810 + }, + { + "epoch": 0.056882722740082374, + "grad_norm": 1.25, + "learning_rate": 4.857565217391305e-06, + "loss": 0.2441, + "step": 820 + }, + { + "epoch": 0.05757641448081509, + "grad_norm": 1.234375, + "learning_rate": 4.855826086956522e-06, + "loss": 0.239, + "step": 830 + }, + { + "epoch": 0.0582701062215478, + "grad_norm": 1.2421875, + "learning_rate": 4.85408695652174e-06, + "loss": 0.3108, + "step": 840 + }, + { + "epoch": 0.05896379796228051, + "grad_norm": 1.3515625, + "learning_rate": 4.852347826086957e-06, + "loss": 0.2425, + "step": 850 + }, + { + "epoch": 0.059657489703013225, + "grad_norm": 1.390625, + "learning_rate": 4.850608695652174e-06, + "loss": 0.3201, + "step": 860 + }, + { + "epoch": 0.06035118144374593, + "grad_norm": 1.859375, + "learning_rate": 4.848869565217392e-06, + "loss": 0.2886, + "step": 870 + }, + { + "epoch": 0.06104487318447865, + "grad_norm": 1.15625, + "learning_rate": 4.847130434782609e-06, + "loss": 0.2269, + "step": 880 + }, + { + "epoch": 0.06173856492521136, + "grad_norm": 1.21875, + "learning_rate": 4.8453913043478265e-06, + "loss": 0.247, + "step": 890 + }, + { + "epoch": 0.06243225666594407, + "grad_norm": 1.484375, + "learning_rate": 4.843652173913044e-06, + "loss": 0.2773, + "step": 900 + }, + { + "epoch": 0.06312594840667678, + "grad_norm": 1.1171875, + "learning_rate": 4.841913043478261e-06, + "loss": 0.2464, + "step": 910 + }, + { + "epoch": 0.0638196401474095, + "grad_norm": 1.703125, + "learning_rate": 4.840173913043479e-06, + "loss": 0.2407, + "step": 920 + }, + { + "epoch": 0.0645133318881422, + "grad_norm": 1.125, + "learning_rate": 4.838434782608696e-06, + "loss": 0.2772, + "step": 930 + }, + { + "epoch": 0.06520702362887491, + "grad_norm": 1.015625, + "learning_rate": 4.836695652173913e-06, + "loss": 0.252, + "step": 940 + }, + { + "epoch": 0.06590071536960763, + "grad_norm": 1.5703125, + "learning_rate": 4.834956521739131e-06, + "loss": 0.2426, + "step": 950 + }, + { + "epoch": 0.06659440711034034, + "grad_norm": 1.421875, + "learning_rate": 4.833217391304348e-06, + "loss": 0.2722, + "step": 960 + }, + { + "epoch": 0.06728809885107305, + "grad_norm": 1.7890625, + "learning_rate": 4.8314782608695655e-06, + "loss": 0.2661, + "step": 970 + }, + { + "epoch": 0.06798179059180577, + "grad_norm": 1.6015625, + "learning_rate": 4.829739130434783e-06, + "loss": 0.2329, + "step": 980 + }, + { + "epoch": 0.06867548233253848, + "grad_norm": 1.578125, + "learning_rate": 4.828e-06, + "loss": 0.2414, + "step": 990 + }, + { + "epoch": 0.06936917407327119, + "grad_norm": 1.9296875, + "learning_rate": 4.826260869565218e-06, + "loss": 0.264, + "step": 1000 + }, + { + "epoch": 0.07006286581400391, + "grad_norm": 1.5546875, + "learning_rate": 4.824521739130435e-06, + "loss": 0.2493, + "step": 1010 + }, + { + "epoch": 0.07075655755473662, + "grad_norm": 1.3984375, + "learning_rate": 4.822782608695652e-06, + "loss": 0.3118, + "step": 1020 + }, + { + "epoch": 0.07145024929546932, + "grad_norm": 1.8828125, + "learning_rate": 4.82104347826087e-06, + "loss": 0.2739, + "step": 1030 + }, + { + "epoch": 0.07214394103620204, + "grad_norm": 1.0234375, + "learning_rate": 4.819304347826088e-06, + "loss": 0.2743, + "step": 1040 + }, + { + "epoch": 0.07283763277693475, + "grad_norm": 1.1796875, + "learning_rate": 4.817565217391304e-06, + "loss": 0.2299, + "step": 1050 + }, + { + "epoch": 0.07353132451766746, + "grad_norm": 1.15625, + "learning_rate": 4.815826086956522e-06, + "loss": 0.2203, + "step": 1060 + }, + { + "epoch": 0.07422501625840017, + "grad_norm": 0.74609375, + "learning_rate": 4.81408695652174e-06, + "loss": 0.2155, + "step": 1070 + }, + { + "epoch": 0.07491870799913289, + "grad_norm": 1.421875, + "learning_rate": 4.812347826086957e-06, + "loss": 0.248, + "step": 1080 + }, + { + "epoch": 0.0756123997398656, + "grad_norm": 1.1640625, + "learning_rate": 4.810608695652174e-06, + "loss": 0.2366, + "step": 1090 + }, + { + "epoch": 0.0763060914805983, + "grad_norm": 1.0234375, + "learning_rate": 4.808869565217391e-06, + "loss": 0.2251, + "step": 1100 + }, + { + "epoch": 0.07699978322133103, + "grad_norm": 1.5859375, + "learning_rate": 4.8071304347826095e-06, + "loss": 0.2265, + "step": 1110 + }, + { + "epoch": 0.07769347496206373, + "grad_norm": 1.5546875, + "learning_rate": 4.805391304347827e-06, + "loss": 0.2876, + "step": 1120 + }, + { + "epoch": 0.07838716670279644, + "grad_norm": 1.3359375, + "learning_rate": 4.803652173913043e-06, + "loss": 0.2942, + "step": 1130 + }, + { + "epoch": 0.07908085844352916, + "grad_norm": 0.8984375, + "learning_rate": 4.801913043478262e-06, + "loss": 0.2531, + "step": 1140 + }, + { + "epoch": 0.07977455018426187, + "grad_norm": 2.265625, + "learning_rate": 4.800173913043479e-06, + "loss": 0.2473, + "step": 1150 + }, + { + "epoch": 0.08046824192499458, + "grad_norm": 1.2265625, + "learning_rate": 4.7984347826086955e-06, + "loss": 0.2106, + "step": 1160 + }, + { + "epoch": 0.0811619336657273, + "grad_norm": 1.0, + "learning_rate": 4.796695652173914e-06, + "loss": 0.2111, + "step": 1170 + }, + { + "epoch": 0.08185562540646, + "grad_norm": 1.734375, + "learning_rate": 4.794956521739131e-06, + "loss": 0.262, + "step": 1180 + }, + { + "epoch": 0.08254931714719271, + "grad_norm": 1.125, + "learning_rate": 4.7932173913043485e-06, + "loss": 0.2402, + "step": 1190 + }, + { + "epoch": 0.08324300888792543, + "grad_norm": 1.2421875, + "learning_rate": 4.791478260869565e-06, + "loss": 0.2325, + "step": 1200 + }, + { + "epoch": 0.08393670062865814, + "grad_norm": 1.296875, + "learning_rate": 4.789739130434783e-06, + "loss": 0.2355, + "step": 1210 + }, + { + "epoch": 0.08463039236939085, + "grad_norm": 1.234375, + "learning_rate": 4.7880000000000006e-06, + "loss": 0.2561, + "step": 1220 + }, + { + "epoch": 0.08532408411012356, + "grad_norm": 1.7890625, + "learning_rate": 4.786260869565218e-06, + "loss": 0.2351, + "step": 1230 + }, + { + "epoch": 0.08601777585085628, + "grad_norm": 1.2421875, + "learning_rate": 4.784521739130435e-06, + "loss": 0.2571, + "step": 1240 + }, + { + "epoch": 0.08671146759158899, + "grad_norm": 2.640625, + "learning_rate": 4.782782608695653e-06, + "loss": 0.2187, + "step": 1250 + }, + { + "epoch": 0.0874051593323217, + "grad_norm": 1.1640625, + "learning_rate": 4.78104347826087e-06, + "loss": 0.313, + "step": 1260 + }, + { + "epoch": 0.08809885107305442, + "grad_norm": 1.9140625, + "learning_rate": 4.7793043478260874e-06, + "loss": 0.3083, + "step": 1270 + }, + { + "epoch": 0.08879254281378712, + "grad_norm": 1.5859375, + "learning_rate": 4.777565217391305e-06, + "loss": 0.2748, + "step": 1280 + }, + { + "epoch": 0.08948623455451983, + "grad_norm": 1.640625, + "learning_rate": 4.775826086956522e-06, + "loss": 0.2393, + "step": 1290 + }, + { + "epoch": 0.09017992629525255, + "grad_norm": 1.3359375, + "learning_rate": 4.7740869565217395e-06, + "loss": 0.2355, + "step": 1300 + }, + { + "epoch": 0.09087361803598526, + "grad_norm": 1.1328125, + "learning_rate": 4.772347826086957e-06, + "loss": 0.2335, + "step": 1310 + }, + { + "epoch": 0.09156730977671797, + "grad_norm": 1.5859375, + "learning_rate": 4.770608695652174e-06, + "loss": 0.2321, + "step": 1320 + }, + { + "epoch": 0.09226100151745069, + "grad_norm": 1.4453125, + "learning_rate": 4.768869565217392e-06, + "loss": 0.3314, + "step": 1330 + }, + { + "epoch": 0.0929546932581834, + "grad_norm": 0.99609375, + "learning_rate": 4.767130434782609e-06, + "loss": 0.2046, + "step": 1340 + }, + { + "epoch": 0.0936483849989161, + "grad_norm": 1.328125, + "learning_rate": 4.765391304347826e-06, + "loss": 0.2461, + "step": 1350 + }, + { + "epoch": 0.09434207673964883, + "grad_norm": 1.140625, + "learning_rate": 4.763652173913044e-06, + "loss": 0.2129, + "step": 1360 + }, + { + "epoch": 0.09503576848038153, + "grad_norm": 1.2578125, + "learning_rate": 4.761913043478261e-06, + "loss": 0.2483, + "step": 1370 + }, + { + "epoch": 0.09572946022111424, + "grad_norm": 1.859375, + "learning_rate": 4.7601739130434785e-06, + "loss": 0.2285, + "step": 1380 + }, + { + "epoch": 0.09642315196184695, + "grad_norm": 1.0390625, + "learning_rate": 4.758434782608696e-06, + "loss": 0.2333, + "step": 1390 + }, + { + "epoch": 0.09711684370257967, + "grad_norm": 1.4375, + "learning_rate": 4.756695652173913e-06, + "loss": 0.2183, + "step": 1400 + }, + { + "epoch": 0.09781053544331238, + "grad_norm": 1.3828125, + "learning_rate": 4.754956521739131e-06, + "loss": 0.2621, + "step": 1410 + }, + { + "epoch": 0.09850422718404508, + "grad_norm": 1.5859375, + "learning_rate": 4.753217391304348e-06, + "loss": 0.2572, + "step": 1420 + }, + { + "epoch": 0.0991979189247778, + "grad_norm": 1.265625, + "learning_rate": 4.751478260869566e-06, + "loss": 0.2177, + "step": 1430 + }, + { + "epoch": 0.09989161066551051, + "grad_norm": 1.5703125, + "learning_rate": 4.749739130434783e-06, + "loss": 0.2542, + "step": 1440 + }, + { + "epoch": 0.10058530240624322, + "grad_norm": 1.4609375, + "learning_rate": 4.748e-06, + "loss": 0.2459, + "step": 1450 + }, + { + "epoch": 0.10127899414697594, + "grad_norm": 1.640625, + "learning_rate": 4.746260869565218e-06, + "loss": 0.2786, + "step": 1460 + }, + { + "epoch": 0.10197268588770865, + "grad_norm": 1.8359375, + "learning_rate": 4.744521739130435e-06, + "loss": 0.2875, + "step": 1470 + }, + { + "epoch": 0.10266637762844136, + "grad_norm": 1.9765625, + "learning_rate": 4.742782608695652e-06, + "loss": 0.2957, + "step": 1480 + }, + { + "epoch": 0.10336006936917408, + "grad_norm": 1.59375, + "learning_rate": 4.74104347826087e-06, + "loss": 0.2543, + "step": 1490 + }, + { + "epoch": 0.10405376110990679, + "grad_norm": 1.828125, + "learning_rate": 4.739304347826088e-06, + "loss": 0.2631, + "step": 1500 + }, + { + "epoch": 0.1047474528506395, + "grad_norm": 1.5703125, + "learning_rate": 4.737565217391304e-06, + "loss": 0.2462, + "step": 1510 + }, + { + "epoch": 0.10544114459137222, + "grad_norm": 1.203125, + "learning_rate": 4.735826086956522e-06, + "loss": 0.2237, + "step": 1520 + }, + { + "epoch": 0.10613483633210492, + "grad_norm": 1.1875, + "learning_rate": 4.73408695652174e-06, + "loss": 0.2206, + "step": 1530 + }, + { + "epoch": 0.10682852807283763, + "grad_norm": 1.59375, + "learning_rate": 4.732347826086957e-06, + "loss": 0.2533, + "step": 1540 + }, + { + "epoch": 0.10752221981357034, + "grad_norm": 1.203125, + "learning_rate": 4.730608695652174e-06, + "loss": 0.2136, + "step": 1550 + }, + { + "epoch": 0.10821591155430306, + "grad_norm": 2.015625, + "learning_rate": 4.728869565217391e-06, + "loss": 0.3101, + "step": 1560 + }, + { + "epoch": 0.10890960329503577, + "grad_norm": 2.4375, + "learning_rate": 4.727130434782609e-06, + "loss": 0.2574, + "step": 1570 + }, + { + "epoch": 0.10960329503576847, + "grad_norm": 1.25, + "learning_rate": 4.725391304347827e-06, + "loss": 0.2142, + "step": 1580 + }, + { + "epoch": 0.1102969867765012, + "grad_norm": 2.21875, + "learning_rate": 4.723652173913043e-06, + "loss": 0.1908, + "step": 1590 + }, + { + "epoch": 0.1109906785172339, + "grad_norm": 1.703125, + "learning_rate": 4.7219130434782615e-06, + "loss": 0.2437, + "step": 1600 + }, + { + "epoch": 0.11168437025796661, + "grad_norm": 1.25, + "learning_rate": 4.720173913043479e-06, + "loss": 0.2262, + "step": 1610 + }, + { + "epoch": 0.11237806199869933, + "grad_norm": 0.9921875, + "learning_rate": 4.718434782608696e-06, + "loss": 0.258, + "step": 1620 + }, + { + "epoch": 0.11307175373943204, + "grad_norm": 1.609375, + "learning_rate": 4.716695652173914e-06, + "loss": 0.2992, + "step": 1630 + }, + { + "epoch": 0.11376544548016475, + "grad_norm": 1.5703125, + "learning_rate": 4.714956521739131e-06, + "loss": 0.247, + "step": 1640 + }, + { + "epoch": 0.11445913722089747, + "grad_norm": 1.1484375, + "learning_rate": 4.713217391304348e-06, + "loss": 0.262, + "step": 1650 + }, + { + "epoch": 0.11515282896163018, + "grad_norm": 1.2890625, + "learning_rate": 4.711478260869566e-06, + "loss": 0.2604, + "step": 1660 + }, + { + "epoch": 0.11584652070236288, + "grad_norm": 1.0390625, + "learning_rate": 4.709739130434783e-06, + "loss": 0.2101, + "step": 1670 + }, + { + "epoch": 0.1165402124430956, + "grad_norm": 1.8671875, + "learning_rate": 4.7080000000000005e-06, + "loss": 0.2641, + "step": 1680 + }, + { + "epoch": 0.11723390418382831, + "grad_norm": 1.3984375, + "learning_rate": 4.706260869565218e-06, + "loss": 0.2134, + "step": 1690 + }, + { + "epoch": 0.11792759592456102, + "grad_norm": 1.0546875, + "learning_rate": 4.704521739130435e-06, + "loss": 0.2305, + "step": 1700 + }, + { + "epoch": 0.11862128766529373, + "grad_norm": 1.2265625, + "learning_rate": 4.702782608695653e-06, + "loss": 0.2542, + "step": 1710 + }, + { + "epoch": 0.11931497940602645, + "grad_norm": 1.328125, + "learning_rate": 4.70104347826087e-06, + "loss": 0.2722, + "step": 1720 + }, + { + "epoch": 0.12000867114675916, + "grad_norm": 1.015625, + "learning_rate": 4.699304347826087e-06, + "loss": 0.2172, + "step": 1730 + }, + { + "epoch": 0.12070236288749187, + "grad_norm": 1.1953125, + "learning_rate": 4.697565217391305e-06, + "loss": 0.3591, + "step": 1740 + }, + { + "epoch": 0.12139605462822459, + "grad_norm": 1.3359375, + "learning_rate": 4.695826086956522e-06, + "loss": 0.2487, + "step": 1750 + }, + { + "epoch": 0.1220897463689573, + "grad_norm": 1.234375, + "learning_rate": 4.6940869565217395e-06, + "loss": 0.2619, + "step": 1760 + }, + { + "epoch": 0.12278343810969, + "grad_norm": 1.0078125, + "learning_rate": 4.692347826086957e-06, + "loss": 0.2614, + "step": 1770 + }, + { + "epoch": 0.12347712985042272, + "grad_norm": 1.0625, + "learning_rate": 4.690608695652174e-06, + "loss": 0.2354, + "step": 1780 + }, + { + "epoch": 0.12417082159115543, + "grad_norm": 1.109375, + "learning_rate": 4.688869565217392e-06, + "loss": 0.2302, + "step": 1790 + }, + { + "epoch": 0.12486451333188814, + "grad_norm": 1.203125, + "learning_rate": 4.687130434782609e-06, + "loss": 0.2436, + "step": 1800 + }, + { + "epoch": 0.12555820507262086, + "grad_norm": 1.9140625, + "learning_rate": 4.685391304347826e-06, + "loss": 0.2219, + "step": 1810 + }, + { + "epoch": 0.12625189681335355, + "grad_norm": 1.3125, + "learning_rate": 4.683652173913044e-06, + "loss": 0.237, + "step": 1820 + }, + { + "epoch": 0.12694558855408627, + "grad_norm": 1.1640625, + "learning_rate": 4.681913043478261e-06, + "loss": 0.2562, + "step": 1830 + }, + { + "epoch": 0.127639280294819, + "grad_norm": 1.2734375, + "learning_rate": 4.6801739130434784e-06, + "loss": 0.2683, + "step": 1840 + }, + { + "epoch": 0.1283329720355517, + "grad_norm": 1.421875, + "learning_rate": 4.678434782608696e-06, + "loss": 0.2653, + "step": 1850 + }, + { + "epoch": 0.1290266637762844, + "grad_norm": 1.2421875, + "learning_rate": 4.676695652173913e-06, + "loss": 0.2423, + "step": 1860 + }, + { + "epoch": 0.12972035551701713, + "grad_norm": 0.98828125, + "learning_rate": 4.6749565217391305e-06, + "loss": 0.2275, + "step": 1870 + }, + { + "epoch": 0.13041404725774983, + "grad_norm": 1.046875, + "learning_rate": 4.673217391304348e-06, + "loss": 0.229, + "step": 1880 + }, + { + "epoch": 0.13110773899848255, + "grad_norm": 1.453125, + "learning_rate": 4.671478260869566e-06, + "loss": 0.2491, + "step": 1890 + }, + { + "epoch": 0.13180143073921527, + "grad_norm": 1.3359375, + "learning_rate": 4.669739130434783e-06, + "loss": 0.2106, + "step": 1900 + }, + { + "epoch": 0.13249512247994796, + "grad_norm": 1.7265625, + "learning_rate": 4.668e-06, + "loss": 0.3183, + "step": 1910 + }, + { + "epoch": 0.13318881422068068, + "grad_norm": 1.5625, + "learning_rate": 4.666260869565218e-06, + "loss": 0.2252, + "step": 1920 + }, + { + "epoch": 0.1338825059614134, + "grad_norm": 1.078125, + "learning_rate": 4.664521739130436e-06, + "loss": 0.2986, + "step": 1930 + }, + { + "epoch": 0.1345761977021461, + "grad_norm": 1.6328125, + "learning_rate": 4.662782608695652e-06, + "loss": 0.2311, + "step": 1940 + }, + { + "epoch": 0.13526988944287882, + "grad_norm": 2.3125, + "learning_rate": 4.6610434782608695e-06, + "loss": 0.2383, + "step": 1950 + }, + { + "epoch": 0.13596358118361154, + "grad_norm": 1.4609375, + "learning_rate": 4.659304347826088e-06, + "loss": 0.2228, + "step": 1960 + }, + { + "epoch": 0.13665727292434424, + "grad_norm": 1.6484375, + "learning_rate": 4.657565217391305e-06, + "loss": 0.2704, + "step": 1970 + }, + { + "epoch": 0.13735096466507696, + "grad_norm": 1.265625, + "learning_rate": 4.655826086956522e-06, + "loss": 0.2858, + "step": 1980 + }, + { + "epoch": 0.13804465640580968, + "grad_norm": 1.046875, + "learning_rate": 4.65408695652174e-06, + "loss": 0.2813, + "step": 1990 + }, + { + "epoch": 0.13873834814654237, + "grad_norm": 1.25, + "learning_rate": 4.652347826086957e-06, + "loss": 0.2458, + "step": 2000 + }, + { + "epoch": 0.1394320398872751, + "grad_norm": 1.1171875, + "learning_rate": 4.650608695652175e-06, + "loss": 0.2186, + "step": 2010 + }, + { + "epoch": 0.14012573162800782, + "grad_norm": 2.546875, + "learning_rate": 4.648869565217391e-06, + "loss": 0.2789, + "step": 2020 + }, + { + "epoch": 0.1408194233687405, + "grad_norm": 1.59375, + "learning_rate": 4.647130434782609e-06, + "loss": 0.2564, + "step": 2030 + }, + { + "epoch": 0.14151311510947323, + "grad_norm": 1.28125, + "learning_rate": 4.645391304347827e-06, + "loss": 0.2318, + "step": 2040 + }, + { + "epoch": 0.14220680685020595, + "grad_norm": 1.703125, + "learning_rate": 4.643652173913044e-06, + "loss": 0.2513, + "step": 2050 + }, + { + "epoch": 0.14290049859093865, + "grad_norm": 0.89453125, + "learning_rate": 4.6419130434782614e-06, + "loss": 0.2211, + "step": 2060 + }, + { + "epoch": 0.14359419033167137, + "grad_norm": 1.1328125, + "learning_rate": 4.640173913043479e-06, + "loss": 0.2939, + "step": 2070 + }, + { + "epoch": 0.1442878820724041, + "grad_norm": 1.296875, + "learning_rate": 4.638434782608696e-06, + "loss": 0.2321, + "step": 2080 + }, + { + "epoch": 0.14498157381313678, + "grad_norm": 1.484375, + "learning_rate": 4.6366956521739136e-06, + "loss": 0.226, + "step": 2090 + }, + { + "epoch": 0.1456752655538695, + "grad_norm": 1.0625, + "learning_rate": 4.634956521739131e-06, + "loss": 0.2262, + "step": 2100 + }, + { + "epoch": 0.1463689572946022, + "grad_norm": 1.328125, + "learning_rate": 4.633217391304348e-06, + "loss": 0.2198, + "step": 2110 + }, + { + "epoch": 0.14706264903533492, + "grad_norm": 1.421875, + "learning_rate": 4.631478260869566e-06, + "loss": 0.2517, + "step": 2120 + }, + { + "epoch": 0.14775634077606764, + "grad_norm": 1.125, + "learning_rate": 4.629739130434783e-06, + "loss": 0.2736, + "step": 2130 + }, + { + "epoch": 0.14845003251680033, + "grad_norm": 1.4609375, + "learning_rate": 4.628e-06, + "loss": 0.2483, + "step": 2140 + }, + { + "epoch": 0.14914372425753306, + "grad_norm": 1.1328125, + "learning_rate": 4.626260869565218e-06, + "loss": 0.2138, + "step": 2150 + }, + { + "epoch": 0.14983741599826578, + "grad_norm": 1.578125, + "learning_rate": 4.624521739130435e-06, + "loss": 0.2202, + "step": 2160 + }, + { + "epoch": 0.15053110773899847, + "grad_norm": 1.359375, + "learning_rate": 4.6227826086956525e-06, + "loss": 0.2468, + "step": 2170 + }, + { + "epoch": 0.1512247994797312, + "grad_norm": 1.171875, + "learning_rate": 4.62104347826087e-06, + "loss": 0.2218, + "step": 2180 + }, + { + "epoch": 0.1519184912204639, + "grad_norm": 1.6328125, + "learning_rate": 4.619304347826087e-06, + "loss": 0.2741, + "step": 2190 + }, + { + "epoch": 0.1526121829611966, + "grad_norm": 1.078125, + "learning_rate": 4.617565217391305e-06, + "loss": 0.2553, + "step": 2200 + }, + { + "epoch": 0.15330587470192933, + "grad_norm": 1.2265625, + "learning_rate": 4.615826086956522e-06, + "loss": 0.234, + "step": 2210 + }, + { + "epoch": 0.15399956644266205, + "grad_norm": 1.4453125, + "learning_rate": 4.614086956521739e-06, + "loss": 0.2321, + "step": 2220 + }, + { + "epoch": 0.15469325818339474, + "grad_norm": 1.2265625, + "learning_rate": 4.612347826086957e-06, + "loss": 0.2222, + "step": 2230 + }, + { + "epoch": 0.15538694992412747, + "grad_norm": 1.40625, + "learning_rate": 4.610608695652174e-06, + "loss": 0.2726, + "step": 2240 + }, + { + "epoch": 0.1560806416648602, + "grad_norm": 1.0078125, + "learning_rate": 4.6088695652173915e-06, + "loss": 0.2418, + "step": 2250 + }, + { + "epoch": 0.15677433340559288, + "grad_norm": 1.3828125, + "learning_rate": 4.607130434782609e-06, + "loss": 0.2499, + "step": 2260 + }, + { + "epoch": 0.1574680251463256, + "grad_norm": 0.984375, + "learning_rate": 4.605391304347826e-06, + "loss": 0.2741, + "step": 2270 + }, + { + "epoch": 0.15816171688705832, + "grad_norm": 1.3515625, + "learning_rate": 4.6036521739130445e-06, + "loss": 0.2061, + "step": 2280 + }, + { + "epoch": 0.15885540862779102, + "grad_norm": 1.3046875, + "learning_rate": 4.601913043478261e-06, + "loss": 0.2696, + "step": 2290 + }, + { + "epoch": 0.15954910036852374, + "grad_norm": 1.359375, + "learning_rate": 4.600173913043478e-06, + "loss": 0.2767, + "step": 2300 + }, + { + "epoch": 0.16024279210925646, + "grad_norm": 1.3515625, + "learning_rate": 4.598434782608696e-06, + "loss": 0.2485, + "step": 2310 + }, + { + "epoch": 0.16093648384998915, + "grad_norm": 1.2578125, + "learning_rate": 4.596695652173914e-06, + "loss": 0.2485, + "step": 2320 + }, + { + "epoch": 0.16163017559072188, + "grad_norm": 1.8046875, + "learning_rate": 4.5949565217391305e-06, + "loss": 0.2586, + "step": 2330 + }, + { + "epoch": 0.1623238673314546, + "grad_norm": 1.0390625, + "learning_rate": 4.593217391304348e-06, + "loss": 0.1902, + "step": 2340 + }, + { + "epoch": 0.1630175590721873, + "grad_norm": 1.7734375, + "learning_rate": 4.591478260869566e-06, + "loss": 0.3126, + "step": 2350 + }, + { + "epoch": 0.16371125081292, + "grad_norm": 1.8828125, + "learning_rate": 4.5897391304347834e-06, + "loss": 0.285, + "step": 2360 + }, + { + "epoch": 0.16440494255365273, + "grad_norm": 1.1484375, + "learning_rate": 4.588e-06, + "loss": 0.2054, + "step": 2370 + }, + { + "epoch": 0.16509863429438543, + "grad_norm": 2.921875, + "learning_rate": 4.586260869565218e-06, + "loss": 0.2095, + "step": 2380 + }, + { + "epoch": 0.16579232603511815, + "grad_norm": 1.046875, + "learning_rate": 4.5845217391304355e-06, + "loss": 0.234, + "step": 2390 + }, + { + "epoch": 0.16648601777585087, + "grad_norm": 1.4765625, + "learning_rate": 4.582782608695652e-06, + "loss": 0.2567, + "step": 2400 + }, + { + "epoch": 0.16717970951658356, + "grad_norm": 0.94140625, + "learning_rate": 4.5810434782608694e-06, + "loss": 0.2976, + "step": 2410 + }, + { + "epoch": 0.16787340125731628, + "grad_norm": 1.3203125, + "learning_rate": 4.579304347826088e-06, + "loss": 0.2411, + "step": 2420 + }, + { + "epoch": 0.16856709299804898, + "grad_norm": 1.125, + "learning_rate": 4.577565217391305e-06, + "loss": 0.2256, + "step": 2430 + }, + { + "epoch": 0.1692607847387817, + "grad_norm": 1.1796875, + "learning_rate": 4.5758260869565215e-06, + "loss": 0.2514, + "step": 2440 + }, + { + "epoch": 0.16995447647951442, + "grad_norm": 1.0625, + "learning_rate": 4.57408695652174e-06, + "loss": 0.2754, + "step": 2450 + }, + { + "epoch": 0.17064816822024712, + "grad_norm": 1.2890625, + "learning_rate": 4.572347826086957e-06, + "loss": 0.2486, + "step": 2460 + }, + { + "epoch": 0.17134185996097984, + "grad_norm": 1.1015625, + "learning_rate": 4.5706086956521745e-06, + "loss": 0.2643, + "step": 2470 + }, + { + "epoch": 0.17203555170171256, + "grad_norm": 1.4453125, + "learning_rate": 4.568869565217391e-06, + "loss": 0.2401, + "step": 2480 + }, + { + "epoch": 0.17272924344244525, + "grad_norm": 1.3671875, + "learning_rate": 4.567130434782609e-06, + "loss": 0.2471, + "step": 2490 + }, + { + "epoch": 0.17342293518317797, + "grad_norm": 1.6015625, + "learning_rate": 4.565391304347827e-06, + "loss": 0.2813, + "step": 2500 + }, + { + "epoch": 0.1741166269239107, + "grad_norm": 2.109375, + "learning_rate": 4.563652173913044e-06, + "loss": 0.2199, + "step": 2510 + }, + { + "epoch": 0.1748103186646434, + "grad_norm": 1.0859375, + "learning_rate": 4.561913043478261e-06, + "loss": 0.2346, + "step": 2520 + }, + { + "epoch": 0.1755040104053761, + "grad_norm": 0.90234375, + "learning_rate": 4.560173913043479e-06, + "loss": 0.3302, + "step": 2530 + }, + { + "epoch": 0.17619770214610883, + "grad_norm": 1.234375, + "learning_rate": 4.558434782608696e-06, + "loss": 0.2683, + "step": 2540 + }, + { + "epoch": 0.17689139388684152, + "grad_norm": 2.34375, + "learning_rate": 4.5566956521739135e-06, + "loss": 0.2297, + "step": 2550 + }, + { + "epoch": 0.17758508562757425, + "grad_norm": 1.3203125, + "learning_rate": 4.554956521739131e-06, + "loss": 0.2285, + "step": 2560 + }, + { + "epoch": 0.17827877736830697, + "grad_norm": 1.1796875, + "learning_rate": 4.553217391304348e-06, + "loss": 0.2914, + "step": 2570 + }, + { + "epoch": 0.17897246910903966, + "grad_norm": 1.3359375, + "learning_rate": 4.551478260869566e-06, + "loss": 0.2983, + "step": 2580 + }, + { + "epoch": 0.17966616084977238, + "grad_norm": 1.1484375, + "learning_rate": 4.549739130434783e-06, + "loss": 0.2334, + "step": 2590 + }, + { + "epoch": 0.1803598525905051, + "grad_norm": 1.4609375, + "learning_rate": 4.548e-06, + "loss": 0.2318, + "step": 2600 + }, + { + "epoch": 0.1810535443312378, + "grad_norm": 1.4609375, + "learning_rate": 4.546260869565218e-06, + "loss": 0.2391, + "step": 2610 + }, + { + "epoch": 0.18174723607197052, + "grad_norm": 0.80859375, + "learning_rate": 4.544521739130435e-06, + "loss": 0.2697, + "step": 2620 + }, + { + "epoch": 0.18244092781270324, + "grad_norm": 1.0703125, + "learning_rate": 4.5427826086956524e-06, + "loss": 0.263, + "step": 2630 + }, + { + "epoch": 0.18313461955343593, + "grad_norm": 1.6015625, + "learning_rate": 4.54104347826087e-06, + "loss": 0.2569, + "step": 2640 + }, + { + "epoch": 0.18382831129416866, + "grad_norm": 1.8203125, + "learning_rate": 4.539304347826087e-06, + "loss": 0.2997, + "step": 2650 + }, + { + "epoch": 0.18452200303490138, + "grad_norm": 1.4765625, + "learning_rate": 4.5375652173913046e-06, + "loss": 0.2352, + "step": 2660 + }, + { + "epoch": 0.18521569477563407, + "grad_norm": 2.0625, + "learning_rate": 4.535826086956523e-06, + "loss": 0.2896, + "step": 2670 + }, + { + "epoch": 0.1859093865163668, + "grad_norm": 1.921875, + "learning_rate": 4.534086956521739e-06, + "loss": 0.2807, + "step": 2680 + }, + { + "epoch": 0.18660307825709951, + "grad_norm": 1.2578125, + "learning_rate": 4.532347826086957e-06, + "loss": 0.2258, + "step": 2690 + }, + { + "epoch": 0.1872967699978322, + "grad_norm": 0.91015625, + "learning_rate": 4.530608695652174e-06, + "loss": 0.2467, + "step": 2700 + }, + { + "epoch": 0.18799046173856493, + "grad_norm": 1.1796875, + "learning_rate": 4.528869565217391e-06, + "loss": 0.2375, + "step": 2710 + }, + { + "epoch": 0.18868415347929765, + "grad_norm": 1.0859375, + "learning_rate": 4.527130434782609e-06, + "loss": 0.2648, + "step": 2720 + }, + { + "epoch": 0.18937784522003034, + "grad_norm": 1.1640625, + "learning_rate": 4.525391304347826e-06, + "loss": 0.241, + "step": 2730 + }, + { + "epoch": 0.19007153696076307, + "grad_norm": 1.59375, + "learning_rate": 4.523652173913044e-06, + "loss": 0.2694, + "step": 2740 + }, + { + "epoch": 0.19076522870149576, + "grad_norm": 1.0390625, + "learning_rate": 4.521913043478261e-06, + "loss": 0.213, + "step": 2750 + }, + { + "epoch": 0.19145892044222848, + "grad_norm": 1.0546875, + "learning_rate": 4.520173913043478e-06, + "loss": 0.2127, + "step": 2760 + }, + { + "epoch": 0.1921526121829612, + "grad_norm": 1.4765625, + "learning_rate": 4.518434782608696e-06, + "loss": 0.26, + "step": 2770 + }, + { + "epoch": 0.1928463039236939, + "grad_norm": 1.28125, + "learning_rate": 4.516695652173914e-06, + "loss": 0.2297, + "step": 2780 + }, + { + "epoch": 0.19353999566442662, + "grad_norm": 1.40625, + "learning_rate": 4.51495652173913e-06, + "loss": 0.2087, + "step": 2790 + }, + { + "epoch": 0.19423368740515934, + "grad_norm": 1.1328125, + "learning_rate": 4.513217391304348e-06, + "loss": 0.2179, + "step": 2800 + }, + { + "epoch": 0.19492737914589203, + "grad_norm": 1.5625, + "learning_rate": 4.511478260869566e-06, + "loss": 0.2698, + "step": 2810 + }, + { + "epoch": 0.19562107088662475, + "grad_norm": 1.0625, + "learning_rate": 4.509739130434783e-06, + "loss": 0.2297, + "step": 2820 + }, + { + "epoch": 0.19631476262735748, + "grad_norm": 1.578125, + "learning_rate": 4.508e-06, + "loss": 0.2154, + "step": 2830 + }, + { + "epoch": 0.19700845436809017, + "grad_norm": 1.1171875, + "learning_rate": 4.506260869565218e-06, + "loss": 0.236, + "step": 2840 + }, + { + "epoch": 0.1977021461088229, + "grad_norm": 1.109375, + "learning_rate": 4.5045217391304355e-06, + "loss": 0.281, + "step": 2850 + }, + { + "epoch": 0.1983958378495556, + "grad_norm": 1.1171875, + "learning_rate": 4.502782608695653e-06, + "loss": 0.2373, + "step": 2860 + }, + { + "epoch": 0.1990895295902883, + "grad_norm": 1.375, + "learning_rate": 4.501043478260869e-06, + "loss": 0.2899, + "step": 2870 + }, + { + "epoch": 0.19978322133102103, + "grad_norm": 1.421875, + "learning_rate": 4.4993043478260876e-06, + "loss": 0.2459, + "step": 2880 + }, + { + "epoch": 0.20047691307175375, + "grad_norm": 1.1015625, + "learning_rate": 4.497565217391305e-06, + "loss": 0.2292, + "step": 2890 + }, + { + "epoch": 0.20117060481248644, + "grad_norm": 1.0546875, + "learning_rate": 4.495826086956522e-06, + "loss": 0.2465, + "step": 2900 + }, + { + "epoch": 0.20186429655321916, + "grad_norm": 1.3671875, + "learning_rate": 4.49408695652174e-06, + "loss": 0.2323, + "step": 2910 + }, + { + "epoch": 0.20255798829395188, + "grad_norm": 2.234375, + "learning_rate": 4.492347826086957e-06, + "loss": 0.3579, + "step": 2920 + }, + { + "epoch": 0.20325168003468458, + "grad_norm": 1.28125, + "learning_rate": 4.4906086956521744e-06, + "loss": 0.3302, + "step": 2930 + }, + { + "epoch": 0.2039453717754173, + "grad_norm": 1.4375, + "learning_rate": 4.488869565217392e-06, + "loss": 0.2271, + "step": 2940 + }, + { + "epoch": 0.20463906351615002, + "grad_norm": 1.1796875, + "learning_rate": 4.487130434782609e-06, + "loss": 0.2119, + "step": 2950 + }, + { + "epoch": 0.20533275525688272, + "grad_norm": 1.140625, + "learning_rate": 4.4853913043478265e-06, + "loss": 0.2446, + "step": 2960 + }, + { + "epoch": 0.20602644699761544, + "grad_norm": 0.9375, + "learning_rate": 4.483652173913044e-06, + "loss": 0.2289, + "step": 2970 + }, + { + "epoch": 0.20672013873834816, + "grad_norm": 1.1484375, + "learning_rate": 4.481913043478261e-06, + "loss": 0.2335, + "step": 2980 + }, + { + "epoch": 0.20741383047908085, + "grad_norm": 0.9609375, + "learning_rate": 4.480173913043479e-06, + "loss": 0.2208, + "step": 2990 + }, + { + "epoch": 0.20810752221981357, + "grad_norm": 1.5859375, + "learning_rate": 4.478434782608696e-06, + "loss": 0.2224, + "step": 3000 + }, + { + "epoch": 0.2088012139605463, + "grad_norm": 1.234375, + "learning_rate": 4.476695652173913e-06, + "loss": 0.2543, + "step": 3010 + }, + { + "epoch": 0.209494905701279, + "grad_norm": 1.171875, + "learning_rate": 4.474956521739131e-06, + "loss": 0.2274, + "step": 3020 + }, + { + "epoch": 0.2101885974420117, + "grad_norm": 1.3046875, + "learning_rate": 4.473217391304348e-06, + "loss": 0.2999, + "step": 3030 + }, + { + "epoch": 0.21088228918274443, + "grad_norm": 1.1640625, + "learning_rate": 4.4714782608695655e-06, + "loss": 0.2384, + "step": 3040 + }, + { + "epoch": 0.21157598092347712, + "grad_norm": 1.3984375, + "learning_rate": 4.469739130434783e-06, + "loss": 0.2618, + "step": 3050 + }, + { + "epoch": 0.21226967266420985, + "grad_norm": 1.453125, + "learning_rate": 4.468e-06, + "loss": 0.2545, + "step": 3060 + }, + { + "epoch": 0.21296336440494254, + "grad_norm": 1.1171875, + "learning_rate": 4.466260869565218e-06, + "loss": 0.2794, + "step": 3070 + }, + { + "epoch": 0.21365705614567526, + "grad_norm": 1.0234375, + "learning_rate": 4.464521739130435e-06, + "loss": 0.2537, + "step": 3080 + }, + { + "epoch": 0.21435074788640798, + "grad_norm": 1.4921875, + "learning_rate": 4.462782608695652e-06, + "loss": 0.2936, + "step": 3090 + }, + { + "epoch": 0.21504443962714068, + "grad_norm": 1.2890625, + "learning_rate": 4.46104347826087e-06, + "loss": 0.2446, + "step": 3100 + }, + { + "epoch": 0.2157381313678734, + "grad_norm": 1.171875, + "learning_rate": 4.459304347826087e-06, + "loss": 0.2443, + "step": 3110 + }, + { + "epoch": 0.21643182310860612, + "grad_norm": 1.2578125, + "learning_rate": 4.4575652173913045e-06, + "loss": 0.2338, + "step": 3120 + }, + { + "epoch": 0.2171255148493388, + "grad_norm": 1.0, + "learning_rate": 4.455826086956523e-06, + "loss": 0.2197, + "step": 3130 + }, + { + "epoch": 0.21781920659007153, + "grad_norm": 0.96875, + "learning_rate": 4.454086956521739e-06, + "loss": 0.2222, + "step": 3140 + }, + { + "epoch": 0.21851289833080426, + "grad_norm": 1.15625, + "learning_rate": 4.452347826086957e-06, + "loss": 0.2494, + "step": 3150 + }, + { + "epoch": 0.21920659007153695, + "grad_norm": 1.2890625, + "learning_rate": 4.450608695652174e-06, + "loss": 0.2529, + "step": 3160 + }, + { + "epoch": 0.21990028181226967, + "grad_norm": 0.9921875, + "learning_rate": 4.448869565217392e-06, + "loss": 0.185, + "step": 3170 + }, + { + "epoch": 0.2205939735530024, + "grad_norm": 1.6015625, + "learning_rate": 4.447130434782609e-06, + "loss": 0.2576, + "step": 3180 + }, + { + "epoch": 0.2212876652937351, + "grad_norm": 1.84375, + "learning_rate": 4.445391304347826e-06, + "loss": 0.2218, + "step": 3190 + }, + { + "epoch": 0.2219813570344678, + "grad_norm": 1.546875, + "learning_rate": 4.443652173913044e-06, + "loss": 0.2442, + "step": 3200 + }, + { + "epoch": 0.22267504877520053, + "grad_norm": 1.0859375, + "learning_rate": 4.441913043478262e-06, + "loss": 0.269, + "step": 3210 + }, + { + "epoch": 0.22336874051593322, + "grad_norm": 1.2578125, + "learning_rate": 4.440173913043478e-06, + "loss": 0.239, + "step": 3220 + }, + { + "epoch": 0.22406243225666594, + "grad_norm": 0.94140625, + "learning_rate": 4.4384347826086956e-06, + "loss": 0.2284, + "step": 3230 + }, + { + "epoch": 0.22475612399739867, + "grad_norm": 1.2265625, + "learning_rate": 4.436695652173914e-06, + "loss": 0.2405, + "step": 3240 + }, + { + "epoch": 0.22544981573813136, + "grad_norm": 1.5703125, + "learning_rate": 4.434956521739131e-06, + "loss": 0.2391, + "step": 3250 + }, + { + "epoch": 0.22614350747886408, + "grad_norm": 1.171875, + "learning_rate": 4.433217391304348e-06, + "loss": 0.2244, + "step": 3260 + }, + { + "epoch": 0.2268371992195968, + "grad_norm": 1.5, + "learning_rate": 4.431478260869566e-06, + "loss": 0.2313, + "step": 3270 + }, + { + "epoch": 0.2275308909603295, + "grad_norm": 1.71875, + "learning_rate": 4.429739130434783e-06, + "loss": 0.2855, + "step": 3280 + }, + { + "epoch": 0.22822458270106222, + "grad_norm": 1.4921875, + "learning_rate": 4.428000000000001e-06, + "loss": 0.2532, + "step": 3290 + }, + { + "epoch": 0.22891827444179494, + "grad_norm": 1.2890625, + "learning_rate": 4.426260869565218e-06, + "loss": 0.2226, + "step": 3300 + }, + { + "epoch": 0.22961196618252763, + "grad_norm": 1.3203125, + "learning_rate": 4.424521739130435e-06, + "loss": 0.2389, + "step": 3310 + }, + { + "epoch": 0.23030565792326035, + "grad_norm": 1.1484375, + "learning_rate": 4.422782608695653e-06, + "loss": 0.2067, + "step": 3320 + }, + { + "epoch": 0.23099934966399308, + "grad_norm": 1.2890625, + "learning_rate": 4.421043478260869e-06, + "loss": 0.1951, + "step": 3330 + }, + { + "epoch": 0.23169304140472577, + "grad_norm": 1.2265625, + "learning_rate": 4.4193043478260875e-06, + "loss": 0.2054, + "step": 3340 + }, + { + "epoch": 0.2323867331454585, + "grad_norm": 1.125, + "learning_rate": 4.417565217391305e-06, + "loss": 0.2557, + "step": 3350 + }, + { + "epoch": 0.2330804248861912, + "grad_norm": 1.203125, + "learning_rate": 4.415826086956522e-06, + "loss": 0.229, + "step": 3360 + }, + { + "epoch": 0.2337741166269239, + "grad_norm": 1.4921875, + "learning_rate": 4.41408695652174e-06, + "loss": 0.2895, + "step": 3370 + }, + { + "epoch": 0.23446780836765663, + "grad_norm": 1.0859375, + "learning_rate": 4.412347826086957e-06, + "loss": 0.299, + "step": 3380 + }, + { + "epoch": 0.23516150010838932, + "grad_norm": 1.1015625, + "learning_rate": 4.410608695652174e-06, + "loss": 0.2132, + "step": 3390 + }, + { + "epoch": 0.23585519184912204, + "grad_norm": 1.078125, + "learning_rate": 4.408869565217392e-06, + "loss": 0.2573, + "step": 3400 + }, + { + "epoch": 0.23654888358985476, + "grad_norm": 1.109375, + "learning_rate": 4.407130434782609e-06, + "loss": 0.2225, + "step": 3410 + }, + { + "epoch": 0.23724257533058746, + "grad_norm": 1.2109375, + "learning_rate": 4.4053913043478265e-06, + "loss": 0.2154, + "step": 3420 + }, + { + "epoch": 0.23793626707132018, + "grad_norm": 0.91015625, + "learning_rate": 4.403652173913044e-06, + "loss": 0.3088, + "step": 3430 + }, + { + "epoch": 0.2386299588120529, + "grad_norm": 1.9375, + "learning_rate": 4.401913043478261e-06, + "loss": 0.2877, + "step": 3440 + }, + { + "epoch": 0.2393236505527856, + "grad_norm": 1.984375, + "learning_rate": 4.4001739130434786e-06, + "loss": 0.2145, + "step": 3450 + }, + { + "epoch": 0.24001734229351832, + "grad_norm": 0.8515625, + "learning_rate": 4.398434782608696e-06, + "loss": 0.2338, + "step": 3460 + }, + { + "epoch": 0.24071103403425104, + "grad_norm": 1.328125, + "learning_rate": 4.396695652173913e-06, + "loss": 0.2488, + "step": 3470 + }, + { + "epoch": 0.24140472577498373, + "grad_norm": 1.0234375, + "learning_rate": 4.394956521739131e-06, + "loss": 0.2444, + "step": 3480 + }, + { + "epoch": 0.24209841751571645, + "grad_norm": 1.234375, + "learning_rate": 4.393217391304348e-06, + "loss": 0.2326, + "step": 3490 + }, + { + "epoch": 0.24279210925644917, + "grad_norm": 0.99609375, + "learning_rate": 4.3914782608695654e-06, + "loss": 0.2431, + "step": 3500 + }, + { + "epoch": 0.24348580099718187, + "grad_norm": 1.4375, + "learning_rate": 4.389739130434783e-06, + "loss": 0.2479, + "step": 3510 + }, + { + "epoch": 0.2441794927379146, + "grad_norm": 1.46875, + "learning_rate": 4.388e-06, + "loss": 0.2517, + "step": 3520 + }, + { + "epoch": 0.2448731844786473, + "grad_norm": 1.1640625, + "learning_rate": 4.3862608695652175e-06, + "loss": 0.2808, + "step": 3530 + }, + { + "epoch": 0.24556687621938, + "grad_norm": 1.1796875, + "learning_rate": 4.384521739130435e-06, + "loss": 0.2571, + "step": 3540 + }, + { + "epoch": 0.24626056796011273, + "grad_norm": 1.9921875, + "learning_rate": 4.382782608695652e-06, + "loss": 0.302, + "step": 3550 + }, + { + "epoch": 0.24695425970084545, + "grad_norm": 1.28125, + "learning_rate": 4.3810434782608705e-06, + "loss": 0.2128, + "step": 3560 + }, + { + "epoch": 0.24764795144157814, + "grad_norm": 0.96875, + "learning_rate": 4.379304347826087e-06, + "loss": 0.2285, + "step": 3570 + }, + { + "epoch": 0.24834164318231086, + "grad_norm": 1.3671875, + "learning_rate": 4.377565217391304e-06, + "loss": 0.2401, + "step": 3580 + }, + { + "epoch": 0.24903533492304358, + "grad_norm": 1.109375, + "learning_rate": 4.375826086956523e-06, + "loss": 0.2543, + "step": 3590 + }, + { + "epoch": 0.24972902666377628, + "grad_norm": 1.2109375, + "learning_rate": 4.37408695652174e-06, + "loss": 0.2598, + "step": 3600 + }, + { + "epoch": 0.25042271840450897, + "grad_norm": 1.34375, + "learning_rate": 4.3723478260869565e-06, + "loss": 0.3157, + "step": 3610 + }, + { + "epoch": 0.2511164101452417, + "grad_norm": 1.3671875, + "learning_rate": 4.370608695652174e-06, + "loss": 0.1943, + "step": 3620 + }, + { + "epoch": 0.2518101018859744, + "grad_norm": 1.1953125, + "learning_rate": 4.368869565217392e-06, + "loss": 0.2737, + "step": 3630 + }, + { + "epoch": 0.2525037936267071, + "grad_norm": 0.9375, + "learning_rate": 4.367130434782609e-06, + "loss": 0.2737, + "step": 3640 + }, + { + "epoch": 0.25319748536743986, + "grad_norm": 1.078125, + "learning_rate": 4.365391304347826e-06, + "loss": 0.2254, + "step": 3650 + }, + { + "epoch": 0.25389117710817255, + "grad_norm": 1.0859375, + "learning_rate": 4.363652173913044e-06, + "loss": 0.2407, + "step": 3660 + }, + { + "epoch": 0.25458486884890524, + "grad_norm": 1.453125, + "learning_rate": 4.361913043478262e-06, + "loss": 0.2414, + "step": 3670 + }, + { + "epoch": 0.255278560589638, + "grad_norm": 0.97265625, + "learning_rate": 4.360173913043478e-06, + "loss": 0.2899, + "step": 3680 + }, + { + "epoch": 0.2559722523303707, + "grad_norm": 1.125, + "learning_rate": 4.3584347826086955e-06, + "loss": 0.2452, + "step": 3690 + }, + { + "epoch": 0.2566659440711034, + "grad_norm": 1.2109375, + "learning_rate": 4.356695652173914e-06, + "loss": 0.2251, + "step": 3700 + }, + { + "epoch": 0.25735963581183613, + "grad_norm": 1.421875, + "learning_rate": 4.354956521739131e-06, + "loss": 0.2506, + "step": 3710 + }, + { + "epoch": 0.2580533275525688, + "grad_norm": 1.3125, + "learning_rate": 4.353217391304348e-06, + "loss": 0.2592, + "step": 3720 + }, + { + "epoch": 0.2587470192933015, + "grad_norm": 1.3984375, + "learning_rate": 4.351478260869566e-06, + "loss": 0.2523, + "step": 3730 + }, + { + "epoch": 0.25944071103403427, + "grad_norm": 1.3515625, + "learning_rate": 4.349739130434783e-06, + "loss": 0.2409, + "step": 3740 + }, + { + "epoch": 0.26013440277476696, + "grad_norm": 1.3359375, + "learning_rate": 4.3480000000000006e-06, + "loss": 0.253, + "step": 3750 + }, + { + "epoch": 0.26082809451549965, + "grad_norm": 1.8515625, + "learning_rate": 4.346260869565218e-06, + "loss": 0.2862, + "step": 3760 + }, + { + "epoch": 0.2615217862562324, + "grad_norm": 1.359375, + "learning_rate": 4.344521739130435e-06, + "loss": 0.2504, + "step": 3770 + }, + { + "epoch": 0.2622154779969651, + "grad_norm": 0.984375, + "learning_rate": 4.342782608695653e-06, + "loss": 0.2537, + "step": 3780 + }, + { + "epoch": 0.2629091697376978, + "grad_norm": 1.0078125, + "learning_rate": 4.34104347826087e-06, + "loss": 0.2099, + "step": 3790 + }, + { + "epoch": 0.26360286147843054, + "grad_norm": 1.0625, + "learning_rate": 4.339304347826087e-06, + "loss": 0.2753, + "step": 3800 + }, + { + "epoch": 0.26429655321916323, + "grad_norm": 1.125, + "learning_rate": 4.337565217391305e-06, + "loss": 0.2202, + "step": 3810 + }, + { + "epoch": 0.2649902449598959, + "grad_norm": 1.1484375, + "learning_rate": 4.335826086956522e-06, + "loss": 0.2382, + "step": 3820 + }, + { + "epoch": 0.2656839367006287, + "grad_norm": 1.09375, + "learning_rate": 4.3340869565217395e-06, + "loss": 0.2092, + "step": 3830 + }, + { + "epoch": 0.26637762844136137, + "grad_norm": 1.1328125, + "learning_rate": 4.332347826086957e-06, + "loss": 0.3343, + "step": 3840 + }, + { + "epoch": 0.26707132018209406, + "grad_norm": 1.265625, + "learning_rate": 4.330608695652174e-06, + "loss": 0.2324, + "step": 3850 + }, + { + "epoch": 0.2677650119228268, + "grad_norm": 1.1875, + "learning_rate": 4.328869565217392e-06, + "loss": 0.2864, + "step": 3860 + }, + { + "epoch": 0.2684587036635595, + "grad_norm": 1.46875, + "learning_rate": 4.327130434782609e-06, + "loss": 0.2883, + "step": 3870 + }, + { + "epoch": 0.2691523954042922, + "grad_norm": 1.046875, + "learning_rate": 4.325391304347826e-06, + "loss": 0.2376, + "step": 3880 + }, + { + "epoch": 0.26984608714502495, + "grad_norm": 1.5, + "learning_rate": 4.323652173913044e-06, + "loss": 0.2556, + "step": 3890 + }, + { + "epoch": 0.27053977888575764, + "grad_norm": 1.3359375, + "learning_rate": 4.321913043478261e-06, + "loss": 0.3276, + "step": 3900 + }, + { + "epoch": 0.27123347062649034, + "grad_norm": 1.375, + "learning_rate": 4.3201739130434785e-06, + "loss": 0.2472, + "step": 3910 + }, + { + "epoch": 0.2719271623672231, + "grad_norm": 1.1640625, + "learning_rate": 4.318434782608696e-06, + "loss": 0.2264, + "step": 3920 + }, + { + "epoch": 0.2726208541079558, + "grad_norm": 1.3515625, + "learning_rate": 4.316695652173913e-06, + "loss": 0.2494, + "step": 3930 + }, + { + "epoch": 0.2733145458486885, + "grad_norm": 1.59375, + "learning_rate": 4.314956521739131e-06, + "loss": 0.2339, + "step": 3940 + }, + { + "epoch": 0.2740082375894212, + "grad_norm": 0.89453125, + "learning_rate": 4.313217391304348e-06, + "loss": 0.2365, + "step": 3950 + }, + { + "epoch": 0.2747019293301539, + "grad_norm": 1.359375, + "learning_rate": 4.311478260869565e-06, + "loss": 0.2584, + "step": 3960 + }, + { + "epoch": 0.2753956210708866, + "grad_norm": 1.28125, + "learning_rate": 4.309739130434783e-06, + "loss": 0.2279, + "step": 3970 + }, + { + "epoch": 0.27608931281161936, + "grad_norm": 1.3984375, + "learning_rate": 4.308000000000001e-06, + "loss": 0.2542, + "step": 3980 + }, + { + "epoch": 0.27678300455235205, + "grad_norm": 1.15625, + "learning_rate": 4.3062608695652175e-06, + "loss": 0.2315, + "step": 3990 + }, + { + "epoch": 0.27747669629308475, + "grad_norm": 1.5, + "learning_rate": 4.304521739130435e-06, + "loss": 0.2557, + "step": 4000 + }, + { + "epoch": 0.2781703880338175, + "grad_norm": 0.90625, + "learning_rate": 4.302782608695652e-06, + "loss": 0.2569, + "step": 4010 + }, + { + "epoch": 0.2788640797745502, + "grad_norm": 1.1875, + "learning_rate": 4.30104347826087e-06, + "loss": 0.2823, + "step": 4020 + }, + { + "epoch": 0.2795577715152829, + "grad_norm": 0.9921875, + "learning_rate": 4.299304347826087e-06, + "loss": 0.2826, + "step": 4030 + }, + { + "epoch": 0.28025146325601563, + "grad_norm": 1.234375, + "learning_rate": 4.297565217391304e-06, + "loss": 0.2421, + "step": 4040 + }, + { + "epoch": 0.2809451549967483, + "grad_norm": 1.375, + "learning_rate": 4.2958260869565225e-06, + "loss": 0.236, + "step": 4050 + }, + { + "epoch": 0.281638846737481, + "grad_norm": 1.0625, + "learning_rate": 4.29408695652174e-06, + "loss": 0.2304, + "step": 4060 + }, + { + "epoch": 0.28233253847821377, + "grad_norm": 1.2578125, + "learning_rate": 4.2923478260869564e-06, + "loss": 0.2403, + "step": 4070 + }, + { + "epoch": 0.28302623021894646, + "grad_norm": 2.03125, + "learning_rate": 4.290608695652174e-06, + "loss": 0.3514, + "step": 4080 + }, + { + "epoch": 0.28371992195967916, + "grad_norm": 1.328125, + "learning_rate": 4.288869565217392e-06, + "loss": 0.2904, + "step": 4090 + }, + { + "epoch": 0.2844136137004119, + "grad_norm": 1.140625, + "learning_rate": 4.287130434782609e-06, + "loss": 0.2514, + "step": 4100 + }, + { + "epoch": 0.2851073054411446, + "grad_norm": 1.1953125, + "learning_rate": 4.285391304347826e-06, + "loss": 0.2677, + "step": 4110 + }, + { + "epoch": 0.2858009971818773, + "grad_norm": 1.46875, + "learning_rate": 4.283652173913044e-06, + "loss": 0.2288, + "step": 4120 + }, + { + "epoch": 0.28649468892261004, + "grad_norm": 1.328125, + "learning_rate": 4.2819130434782615e-06, + "loss": 0.2551, + "step": 4130 + }, + { + "epoch": 0.28718838066334273, + "grad_norm": 0.98828125, + "learning_rate": 4.280173913043479e-06, + "loss": 0.2538, + "step": 4140 + }, + { + "epoch": 0.28788207240407543, + "grad_norm": 1.3671875, + "learning_rate": 4.278434782608696e-06, + "loss": 0.2679, + "step": 4150 + }, + { + "epoch": 0.2885757641448082, + "grad_norm": 1.296875, + "learning_rate": 4.276695652173914e-06, + "loss": 0.2682, + "step": 4160 + }, + { + "epoch": 0.28926945588554087, + "grad_norm": 1.8046875, + "learning_rate": 4.274956521739131e-06, + "loss": 0.2981, + "step": 4170 + }, + { + "epoch": 0.28996314762627357, + "grad_norm": 2.015625, + "learning_rate": 4.273217391304348e-06, + "loss": 0.2792, + "step": 4180 + }, + { + "epoch": 0.29065683936700626, + "grad_norm": 1.234375, + "learning_rate": 4.271478260869566e-06, + "loss": 0.243, + "step": 4190 + }, + { + "epoch": 0.291350531107739, + "grad_norm": 1.078125, + "learning_rate": 4.269739130434783e-06, + "loss": 0.217, + "step": 4200 + }, + { + "epoch": 0.2920442228484717, + "grad_norm": 1.28125, + "learning_rate": 4.2680000000000005e-06, + "loss": 0.2287, + "step": 4210 + }, + { + "epoch": 0.2927379145892044, + "grad_norm": 1.265625, + "learning_rate": 4.266260869565218e-06, + "loss": 0.2609, + "step": 4220 + }, + { + "epoch": 0.29343160632993714, + "grad_norm": 1.4453125, + "learning_rate": 4.264521739130435e-06, + "loss": 0.2283, + "step": 4230 + }, + { + "epoch": 0.29412529807066984, + "grad_norm": 1.0078125, + "learning_rate": 4.262782608695653e-06, + "loss": 0.2076, + "step": 4240 + }, + { + "epoch": 0.29481898981140253, + "grad_norm": 1.6953125, + "learning_rate": 4.26104347826087e-06, + "loss": 0.2701, + "step": 4250 + }, + { + "epoch": 0.2955126815521353, + "grad_norm": 1.34375, + "learning_rate": 4.259304347826087e-06, + "loss": 0.2239, + "step": 4260 + }, + { + "epoch": 0.296206373292868, + "grad_norm": 1.015625, + "learning_rate": 4.257565217391305e-06, + "loss": 0.2263, + "step": 4270 + }, + { + "epoch": 0.29690006503360067, + "grad_norm": 1.0234375, + "learning_rate": 4.255826086956522e-06, + "loss": 0.2316, + "step": 4280 + }, + { + "epoch": 0.2975937567743334, + "grad_norm": 1.4375, + "learning_rate": 4.2540869565217394e-06, + "loss": 0.2467, + "step": 4290 + }, + { + "epoch": 0.2982874485150661, + "grad_norm": 1.359375, + "learning_rate": 4.252347826086957e-06, + "loss": 0.2514, + "step": 4300 + }, + { + "epoch": 0.2989811402557988, + "grad_norm": 1.1796875, + "learning_rate": 4.250608695652174e-06, + "loss": 0.257, + "step": 4310 + }, + { + "epoch": 0.29967483199653155, + "grad_norm": 1.453125, + "learning_rate": 4.2488695652173916e-06, + "loss": 0.2062, + "step": 4320 + }, + { + "epoch": 0.30036852373726425, + "grad_norm": 1.140625, + "learning_rate": 4.247130434782609e-06, + "loss": 0.197, + "step": 4330 + }, + { + "epoch": 0.30106221547799694, + "grad_norm": 0.9609375, + "learning_rate": 4.245391304347826e-06, + "loss": 0.2319, + "step": 4340 + }, + { + "epoch": 0.3017559072187297, + "grad_norm": 1.171875, + "learning_rate": 4.243652173913044e-06, + "loss": 0.2404, + "step": 4350 + }, + { + "epoch": 0.3024495989594624, + "grad_norm": 1.09375, + "learning_rate": 4.241913043478261e-06, + "loss": 0.2156, + "step": 4360 + }, + { + "epoch": 0.3031432907001951, + "grad_norm": 1.3046875, + "learning_rate": 4.240173913043478e-06, + "loss": 0.2299, + "step": 4370 + }, + { + "epoch": 0.3038369824409278, + "grad_norm": 1.375, + "learning_rate": 4.238434782608696e-06, + "loss": 0.2404, + "step": 4380 + }, + { + "epoch": 0.3045306741816605, + "grad_norm": 1.4140625, + "learning_rate": 4.236695652173913e-06, + "loss": 0.1988, + "step": 4390 + }, + { + "epoch": 0.3052243659223932, + "grad_norm": 0.87890625, + "learning_rate": 4.2349565217391305e-06, + "loss": 0.2215, + "step": 4400 + }, + { + "epoch": 0.30591805766312596, + "grad_norm": 1.0390625, + "learning_rate": 4.233217391304349e-06, + "loss": 0.1986, + "step": 4410 + }, + { + "epoch": 0.30661174940385866, + "grad_norm": 1.296875, + "learning_rate": 4.231478260869565e-06, + "loss": 0.2359, + "step": 4420 + }, + { + "epoch": 0.30730544114459135, + "grad_norm": 1.2109375, + "learning_rate": 4.229739130434783e-06, + "loss": 0.2575, + "step": 4430 + }, + { + "epoch": 0.3079991328853241, + "grad_norm": 1.2734375, + "learning_rate": 4.228000000000001e-06, + "loss": 0.2429, + "step": 4440 + }, + { + "epoch": 0.3086928246260568, + "grad_norm": 1.28125, + "learning_rate": 4.226260869565218e-06, + "loss": 0.2587, + "step": 4450 + }, + { + "epoch": 0.3093865163667895, + "grad_norm": 1.6171875, + "learning_rate": 4.224521739130435e-06, + "loss": 0.348, + "step": 4460 + }, + { + "epoch": 0.31008020810752224, + "grad_norm": 1.2265625, + "learning_rate": 4.222782608695652e-06, + "loss": 0.2771, + "step": 4470 + }, + { + "epoch": 0.31077389984825493, + "grad_norm": 1.1015625, + "learning_rate": 4.22104347826087e-06, + "loss": 0.2308, + "step": 4480 + }, + { + "epoch": 0.3114675915889876, + "grad_norm": 1.015625, + "learning_rate": 4.219304347826088e-06, + "loss": 0.2672, + "step": 4490 + }, + { + "epoch": 0.3121612833297204, + "grad_norm": 1.359375, + "learning_rate": 4.217565217391304e-06, + "loss": 0.2603, + "step": 4500 + }, + { + "epoch": 0.31285497507045307, + "grad_norm": 0.8671875, + "learning_rate": 4.2158260869565225e-06, + "loss": 0.2212, + "step": 4510 + }, + { + "epoch": 0.31354866681118576, + "grad_norm": 1.1875, + "learning_rate": 4.21408695652174e-06, + "loss": 0.2073, + "step": 4520 + }, + { + "epoch": 0.3142423585519185, + "grad_norm": 1.546875, + "learning_rate": 4.212347826086957e-06, + "loss": 0.2602, + "step": 4530 + }, + { + "epoch": 0.3149360502926512, + "grad_norm": 1.140625, + "learning_rate": 4.210608695652174e-06, + "loss": 0.2279, + "step": 4540 + }, + { + "epoch": 0.3156297420333839, + "grad_norm": 1.40625, + "learning_rate": 4.208869565217392e-06, + "loss": 0.2433, + "step": 4550 + }, + { + "epoch": 0.31632343377411665, + "grad_norm": 1.171875, + "learning_rate": 4.207130434782609e-06, + "loss": 0.3271, + "step": 4560 + }, + { + "epoch": 0.31701712551484934, + "grad_norm": 1.046875, + "learning_rate": 4.205391304347826e-06, + "loss": 0.3058, + "step": 4570 + }, + { + "epoch": 0.31771081725558203, + "grad_norm": 1.921875, + "learning_rate": 4.203652173913044e-06, + "loss": 0.2724, + "step": 4580 + }, + { + "epoch": 0.3184045089963148, + "grad_norm": 1.09375, + "learning_rate": 4.201913043478261e-06, + "loss": 0.2538, + "step": 4590 + }, + { + "epoch": 0.3190982007370475, + "grad_norm": 1.328125, + "learning_rate": 4.200173913043479e-06, + "loss": 0.2394, + "step": 4600 + }, + { + "epoch": 0.31979189247778017, + "grad_norm": 1.265625, + "learning_rate": 4.198434782608696e-06, + "loss": 0.2433, + "step": 4610 + }, + { + "epoch": 0.3204855842185129, + "grad_norm": 1.2890625, + "learning_rate": 4.1966956521739135e-06, + "loss": 0.2457, + "step": 4620 + }, + { + "epoch": 0.3211792759592456, + "grad_norm": 1.0390625, + "learning_rate": 4.194956521739131e-06, + "loss": 0.2291, + "step": 4630 + }, + { + "epoch": 0.3218729676999783, + "grad_norm": 1.40625, + "learning_rate": 4.193217391304348e-06, + "loss": 0.3308, + "step": 4640 + }, + { + "epoch": 0.32256665944071106, + "grad_norm": 1.234375, + "learning_rate": 4.191478260869566e-06, + "loss": 0.2558, + "step": 4650 + }, + { + "epoch": 0.32326035118144375, + "grad_norm": 1.2265625, + "learning_rate": 4.189739130434783e-06, + "loss": 0.308, + "step": 4660 + }, + { + "epoch": 0.32395404292217644, + "grad_norm": 1.578125, + "learning_rate": 4.188e-06, + "loss": 0.2524, + "step": 4670 + }, + { + "epoch": 0.3246477346629092, + "grad_norm": 1.125, + "learning_rate": 4.186260869565218e-06, + "loss": 0.2373, + "step": 4680 + }, + { + "epoch": 0.3253414264036419, + "grad_norm": 1.4609375, + "learning_rate": 4.184521739130435e-06, + "loss": 0.2355, + "step": 4690 + }, + { + "epoch": 0.3260351181443746, + "grad_norm": 1.0234375, + "learning_rate": 4.1827826086956525e-06, + "loss": 0.2094, + "step": 4700 + }, + { + "epoch": 0.32672880988510733, + "grad_norm": 1.1328125, + "learning_rate": 4.18104347826087e-06, + "loss": 0.2346, + "step": 4710 + }, + { + "epoch": 0.32742250162584, + "grad_norm": 1.109375, + "learning_rate": 4.179304347826087e-06, + "loss": 0.2531, + "step": 4720 + }, + { + "epoch": 0.3281161933665727, + "grad_norm": 1.6015625, + "learning_rate": 4.177565217391305e-06, + "loss": 0.2837, + "step": 4730 + }, + { + "epoch": 0.32880988510730547, + "grad_norm": 1.1484375, + "learning_rate": 4.175826086956522e-06, + "loss": 0.2844, + "step": 4740 + }, + { + "epoch": 0.32950357684803816, + "grad_norm": 1.25, + "learning_rate": 4.174086956521739e-06, + "loss": 0.2293, + "step": 4750 + }, + { + "epoch": 0.33019726858877085, + "grad_norm": 1.234375, + "learning_rate": 4.172347826086957e-06, + "loss": 0.2447, + "step": 4760 + }, + { + "epoch": 0.3308909603295036, + "grad_norm": 1.4453125, + "learning_rate": 4.170608695652174e-06, + "loss": 0.2422, + "step": 4770 + }, + { + "epoch": 0.3315846520702363, + "grad_norm": 1.2578125, + "learning_rate": 4.1688695652173915e-06, + "loss": 0.2318, + "step": 4780 + }, + { + "epoch": 0.332278343810969, + "grad_norm": 1.7734375, + "learning_rate": 4.167130434782609e-06, + "loss": 0.2632, + "step": 4790 + }, + { + "epoch": 0.33297203555170174, + "grad_norm": 0.91796875, + "learning_rate": 4.165391304347827e-06, + "loss": 0.2217, + "step": 4800 + }, + { + "epoch": 0.33366572729243443, + "grad_norm": 1.34375, + "learning_rate": 4.163652173913044e-06, + "loss": 0.2919, + "step": 4810 + }, + { + "epoch": 0.3343594190331671, + "grad_norm": 1.109375, + "learning_rate": 4.161913043478261e-06, + "loss": 0.2504, + "step": 4820 + }, + { + "epoch": 0.3350531107738999, + "grad_norm": 0.890625, + "learning_rate": 4.160173913043478e-06, + "loss": 0.2633, + "step": 4830 + }, + { + "epoch": 0.33574680251463257, + "grad_norm": 1.03125, + "learning_rate": 4.1584347826086965e-06, + "loss": 0.2394, + "step": 4840 + }, + { + "epoch": 0.33644049425536526, + "grad_norm": 1.1484375, + "learning_rate": 4.156695652173913e-06, + "loss": 0.2634, + "step": 4850 + }, + { + "epoch": 0.33713418599609796, + "grad_norm": 1.265625, + "learning_rate": 4.1549565217391304e-06, + "loss": 0.2994, + "step": 4860 + }, + { + "epoch": 0.3378278777368307, + "grad_norm": 1.0, + "learning_rate": 4.153217391304349e-06, + "loss": 0.2641, + "step": 4870 + }, + { + "epoch": 0.3385215694775634, + "grad_norm": 1.1484375, + "learning_rate": 4.151478260869565e-06, + "loss": 0.233, + "step": 4880 + }, + { + "epoch": 0.3392152612182961, + "grad_norm": 1.0703125, + "learning_rate": 4.1497391304347826e-06, + "loss": 0.2218, + "step": 4890 + }, + { + "epoch": 0.33990895295902884, + "grad_norm": 1.5546875, + "learning_rate": 4.148000000000001e-06, + "loss": 0.2219, + "step": 4900 + }, + { + "epoch": 0.34060264469976154, + "grad_norm": 1.1015625, + "learning_rate": 4.146260869565218e-06, + "loss": 0.2372, + "step": 4910 + }, + { + "epoch": 0.34129633644049423, + "grad_norm": 1.3828125, + "learning_rate": 4.144521739130435e-06, + "loss": 0.2405, + "step": 4920 + }, + { + "epoch": 0.341990028181227, + "grad_norm": 1.1640625, + "learning_rate": 4.142782608695652e-06, + "loss": 0.2585, + "step": 4930 + }, + { + "epoch": 0.3426837199219597, + "grad_norm": 1.09375, + "learning_rate": 4.14104347826087e-06, + "loss": 0.2515, + "step": 4940 + }, + { + "epoch": 0.34337741166269237, + "grad_norm": 1.09375, + "learning_rate": 4.139304347826088e-06, + "loss": 0.3092, + "step": 4950 + }, + { + "epoch": 0.3440711034034251, + "grad_norm": 1.28125, + "learning_rate": 4.137565217391304e-06, + "loss": 0.237, + "step": 4960 + }, + { + "epoch": 0.3447647951441578, + "grad_norm": 1.359375, + "learning_rate": 4.135826086956522e-06, + "loss": 0.2279, + "step": 4970 + }, + { + "epoch": 0.3454584868848905, + "grad_norm": 1.3203125, + "learning_rate": 4.13408695652174e-06, + "loss": 0.2481, + "step": 4980 + }, + { + "epoch": 0.34615217862562325, + "grad_norm": 1.046875, + "learning_rate": 4.132347826086957e-06, + "loss": 0.2375, + "step": 4990 + }, + { + "epoch": 0.34684587036635595, + "grad_norm": 1.3125, + "learning_rate": 4.130608695652174e-06, + "loss": 0.2408, + "step": 5000 + }, + { + "epoch": 0.34753956210708864, + "grad_norm": 1.203125, + "learning_rate": 4.128869565217392e-06, + "loss": 0.2326, + "step": 5010 + }, + { + "epoch": 0.3482332538478214, + "grad_norm": 1.0234375, + "learning_rate": 4.127130434782609e-06, + "loss": 0.2321, + "step": 5020 + }, + { + "epoch": 0.3489269455885541, + "grad_norm": 1.078125, + "learning_rate": 4.125391304347827e-06, + "loss": 0.2579, + "step": 5030 + }, + { + "epoch": 0.3496206373292868, + "grad_norm": 1.0390625, + "learning_rate": 4.123652173913044e-06, + "loss": 0.2407, + "step": 5040 + }, + { + "epoch": 0.3503143290700195, + "grad_norm": 1.53125, + "learning_rate": 4.121913043478261e-06, + "loss": 0.2519, + "step": 5050 + }, + { + "epoch": 0.3510080208107522, + "grad_norm": 1.1953125, + "learning_rate": 4.120173913043479e-06, + "loss": 0.2598, + "step": 5060 + }, + { + "epoch": 0.3517017125514849, + "grad_norm": 1.203125, + "learning_rate": 4.118434782608696e-06, + "loss": 0.2603, + "step": 5070 + }, + { + "epoch": 0.35239540429221766, + "grad_norm": 1.296875, + "learning_rate": 4.1166956521739135e-06, + "loss": 0.2179, + "step": 5080 + }, + { + "epoch": 0.35308909603295036, + "grad_norm": 0.98828125, + "learning_rate": 4.114956521739131e-06, + "loss": 0.2368, + "step": 5090 + }, + { + "epoch": 0.35378278777368305, + "grad_norm": 1.1015625, + "learning_rate": 4.113217391304348e-06, + "loss": 0.2639, + "step": 5100 + }, + { + "epoch": 0.3544764795144158, + "grad_norm": 1.03125, + "learning_rate": 4.1114782608695656e-06, + "loss": 0.224, + "step": 5110 + }, + { + "epoch": 0.3551701712551485, + "grad_norm": 1.2578125, + "learning_rate": 4.109739130434783e-06, + "loss": 0.2546, + "step": 5120 + }, + { + "epoch": 0.3558638629958812, + "grad_norm": 1.3671875, + "learning_rate": 4.108e-06, + "loss": 0.2738, + "step": 5130 + }, + { + "epoch": 0.35655755473661394, + "grad_norm": 1.125, + "learning_rate": 4.106260869565218e-06, + "loss": 0.2176, + "step": 5140 + }, + { + "epoch": 0.35725124647734663, + "grad_norm": 1.125, + "learning_rate": 4.104521739130435e-06, + "loss": 0.2345, + "step": 5150 + }, + { + "epoch": 0.3579449382180793, + "grad_norm": 1.25, + "learning_rate": 4.102782608695652e-06, + "loss": 0.2391, + "step": 5160 + }, + { + "epoch": 0.35863862995881207, + "grad_norm": 1.328125, + "learning_rate": 4.10104347826087e-06, + "loss": 0.2476, + "step": 5170 + }, + { + "epoch": 0.35933232169954477, + "grad_norm": 1.390625, + "learning_rate": 4.099304347826087e-06, + "loss": 0.2305, + "step": 5180 + }, + { + "epoch": 0.36002601344027746, + "grad_norm": 1.3984375, + "learning_rate": 4.0975652173913045e-06, + "loss": 0.2072, + "step": 5190 + }, + { + "epoch": 0.3607197051810102, + "grad_norm": 1.3828125, + "learning_rate": 4.095826086956522e-06, + "loss": 0.2452, + "step": 5200 + }, + { + "epoch": 0.3614133969217429, + "grad_norm": 1.453125, + "learning_rate": 4.094086956521739e-06, + "loss": 0.2279, + "step": 5210 + }, + { + "epoch": 0.3621070886624756, + "grad_norm": 0.8984375, + "learning_rate": 4.092347826086957e-06, + "loss": 0.2563, + "step": 5220 + }, + { + "epoch": 0.36280078040320834, + "grad_norm": 1.1953125, + "learning_rate": 4.090608695652174e-06, + "loss": 0.2403, + "step": 5230 + }, + { + "epoch": 0.36349447214394104, + "grad_norm": 1.4375, + "learning_rate": 4.088869565217391e-06, + "loss": 0.2423, + "step": 5240 + }, + { + "epoch": 0.36418816388467373, + "grad_norm": 1.0234375, + "learning_rate": 4.087130434782609e-06, + "loss": 0.2361, + "step": 5250 + }, + { + "epoch": 0.3648818556254065, + "grad_norm": 1.125, + "learning_rate": 4.085391304347827e-06, + "loss": 0.2578, + "step": 5260 + }, + { + "epoch": 0.3655755473661392, + "grad_norm": 1.5703125, + "learning_rate": 4.0836521739130435e-06, + "loss": 0.2747, + "step": 5270 + }, + { + "epoch": 0.36626923910687187, + "grad_norm": 0.91015625, + "learning_rate": 4.081913043478261e-06, + "loss": 0.2621, + "step": 5280 + }, + { + "epoch": 0.3669629308476046, + "grad_norm": 1.2265625, + "learning_rate": 4.080173913043478e-06, + "loss": 0.2146, + "step": 5290 + }, + { + "epoch": 0.3676566225883373, + "grad_norm": 1.2734375, + "learning_rate": 4.0784347826086965e-06, + "loss": 0.2239, + "step": 5300 + }, + { + "epoch": 0.36835031432907, + "grad_norm": 1.0078125, + "learning_rate": 4.076695652173913e-06, + "loss": 0.2409, + "step": 5310 + }, + { + "epoch": 0.36904400606980275, + "grad_norm": 1.5078125, + "learning_rate": 4.07495652173913e-06, + "loss": 0.2346, + "step": 5320 + }, + { + "epoch": 0.36973769781053545, + "grad_norm": 1.1796875, + "learning_rate": 4.073217391304349e-06, + "loss": 0.2461, + "step": 5330 + }, + { + "epoch": 0.37043138955126814, + "grad_norm": 1.328125, + "learning_rate": 4.071478260869566e-06, + "loss": 0.3231, + "step": 5340 + }, + { + "epoch": 0.3711250812920009, + "grad_norm": 1.0546875, + "learning_rate": 4.0697391304347825e-06, + "loss": 0.2363, + "step": 5350 + }, + { + "epoch": 0.3718187730327336, + "grad_norm": 1.21875, + "learning_rate": 4.068000000000001e-06, + "loss": 0.2485, + "step": 5360 + }, + { + "epoch": 0.3725124647734663, + "grad_norm": 1.3046875, + "learning_rate": 4.066260869565218e-06, + "loss": 0.2416, + "step": 5370 + }, + { + "epoch": 0.37320615651419903, + "grad_norm": 1.34375, + "learning_rate": 4.0645217391304354e-06, + "loss": 0.2117, + "step": 5380 + }, + { + "epoch": 0.3738998482549317, + "grad_norm": 1.1640625, + "learning_rate": 4.062782608695652e-06, + "loss": 0.2054, + "step": 5390 + }, + { + "epoch": 0.3745935399956644, + "grad_norm": 1.078125, + "learning_rate": 4.06104347826087e-06, + "loss": 0.216, + "step": 5400 + }, + { + "epoch": 0.37528723173639716, + "grad_norm": 1.0390625, + "learning_rate": 4.0593043478260875e-06, + "loss": 0.251, + "step": 5410 + }, + { + "epoch": 0.37598092347712986, + "grad_norm": 0.92578125, + "learning_rate": 4.057565217391305e-06, + "loss": 0.2377, + "step": 5420 + }, + { + "epoch": 0.37667461521786255, + "grad_norm": 1.2890625, + "learning_rate": 4.055826086956522e-06, + "loss": 0.2643, + "step": 5430 + }, + { + "epoch": 0.3773683069585953, + "grad_norm": 2.03125, + "learning_rate": 4.05408695652174e-06, + "loss": 0.3051, + "step": 5440 + }, + { + "epoch": 0.378061998699328, + "grad_norm": 0.96875, + "learning_rate": 4.052347826086957e-06, + "loss": 0.2471, + "step": 5450 + }, + { + "epoch": 0.3787556904400607, + "grad_norm": 1.015625, + "learning_rate": 4.050608695652174e-06, + "loss": 0.2804, + "step": 5460 + }, + { + "epoch": 0.37944938218079344, + "grad_norm": 1.078125, + "learning_rate": 4.048869565217392e-06, + "loss": 0.2892, + "step": 5470 + }, + { + "epoch": 0.38014307392152613, + "grad_norm": 1.015625, + "learning_rate": 4.047130434782609e-06, + "loss": 0.2323, + "step": 5480 + }, + { + "epoch": 0.3808367656622588, + "grad_norm": 1.0859375, + "learning_rate": 4.0453913043478265e-06, + "loss": 0.2362, + "step": 5490 + }, + { + "epoch": 0.3815304574029915, + "grad_norm": 0.96875, + "learning_rate": 4.043652173913044e-06, + "loss": 0.2262, + "step": 5500 + }, + { + "epoch": 0.38222414914372427, + "grad_norm": 1.1015625, + "learning_rate": 4.041913043478261e-06, + "loss": 0.2667, + "step": 5510 + }, + { + "epoch": 0.38291784088445696, + "grad_norm": 1.03125, + "learning_rate": 4.040173913043479e-06, + "loss": 0.2642, + "step": 5520 + }, + { + "epoch": 0.38361153262518966, + "grad_norm": 1.4140625, + "learning_rate": 4.038434782608696e-06, + "loss": 0.2579, + "step": 5530 + }, + { + "epoch": 0.3843052243659224, + "grad_norm": 0.921875, + "learning_rate": 4.036695652173913e-06, + "loss": 0.2491, + "step": 5540 + }, + { + "epoch": 0.3849989161066551, + "grad_norm": 1.7890625, + "learning_rate": 4.034956521739131e-06, + "loss": 0.2521, + "step": 5550 + }, + { + "epoch": 0.3856926078473878, + "grad_norm": 1.9453125, + "learning_rate": 4.033217391304348e-06, + "loss": 0.3052, + "step": 5560 + }, + { + "epoch": 0.38638629958812054, + "grad_norm": 1.1171875, + "learning_rate": 4.0314782608695655e-06, + "loss": 0.2403, + "step": 5570 + }, + { + "epoch": 0.38707999132885323, + "grad_norm": 1.1328125, + "learning_rate": 4.029739130434783e-06, + "loss": 0.2278, + "step": 5580 + }, + { + "epoch": 0.38777368306958593, + "grad_norm": 1.15625, + "learning_rate": 4.028e-06, + "loss": 0.2863, + "step": 5590 + }, + { + "epoch": 0.3884673748103187, + "grad_norm": 1.46875, + "learning_rate": 4.026260869565218e-06, + "loss": 0.2414, + "step": 5600 + }, + { + "epoch": 0.38916106655105137, + "grad_norm": 1.4375, + "learning_rate": 4.024521739130435e-06, + "loss": 0.2932, + "step": 5610 + }, + { + "epoch": 0.38985475829178406, + "grad_norm": 1.7421875, + "learning_rate": 4.022782608695652e-06, + "loss": 0.2212, + "step": 5620 + }, + { + "epoch": 0.3905484500325168, + "grad_norm": 1.28125, + "learning_rate": 4.02104347826087e-06, + "loss": 0.2225, + "step": 5630 + }, + { + "epoch": 0.3912421417732495, + "grad_norm": 0.9296875, + "learning_rate": 4.019304347826087e-06, + "loss": 0.2289, + "step": 5640 + }, + { + "epoch": 0.3919358335139822, + "grad_norm": 1.25, + "learning_rate": 4.017565217391305e-06, + "loss": 0.3125, + "step": 5650 + }, + { + "epoch": 0.39262952525471495, + "grad_norm": 1.7578125, + "learning_rate": 4.015826086956522e-06, + "loss": 0.278, + "step": 5660 + }, + { + "epoch": 0.39332321699544764, + "grad_norm": 1.1875, + "learning_rate": 4.014086956521739e-06, + "loss": 0.2555, + "step": 5670 + }, + { + "epoch": 0.39401690873618034, + "grad_norm": 1.1171875, + "learning_rate": 4.0123478260869566e-06, + "loss": 0.2313, + "step": 5680 + }, + { + "epoch": 0.3947106004769131, + "grad_norm": 1.1953125, + "learning_rate": 4.010608695652175e-06, + "loss": 0.2275, + "step": 5690 + }, + { + "epoch": 0.3954042922176458, + "grad_norm": 1.0390625, + "learning_rate": 4.008869565217391e-06, + "loss": 0.219, + "step": 5700 + }, + { + "epoch": 0.3960979839583785, + "grad_norm": 1.21875, + "learning_rate": 4.007130434782609e-06, + "loss": 0.2823, + "step": 5710 + }, + { + "epoch": 0.3967916756991112, + "grad_norm": 1.4296875, + "learning_rate": 4.005391304347827e-06, + "loss": 0.2273, + "step": 5720 + }, + { + "epoch": 0.3974853674398439, + "grad_norm": 1.0234375, + "learning_rate": 4.003652173913044e-06, + "loss": 0.2356, + "step": 5730 + }, + { + "epoch": 0.3981790591805766, + "grad_norm": 1.0859375, + "learning_rate": 4.001913043478261e-06, + "loss": 0.2267, + "step": 5740 + }, + { + "epoch": 0.39887275092130936, + "grad_norm": 1.203125, + "learning_rate": 4.000173913043478e-06, + "loss": 0.2469, + "step": 5750 + }, + { + "epoch": 0.39956644266204205, + "grad_norm": 1.3515625, + "learning_rate": 3.998434782608696e-06, + "loss": 0.2549, + "step": 5760 + }, + { + "epoch": 0.40026013440277475, + "grad_norm": 0.7578125, + "learning_rate": 3.996695652173914e-06, + "loss": 0.23, + "step": 5770 + }, + { + "epoch": 0.4009538261435075, + "grad_norm": 1.2734375, + "learning_rate": 3.99495652173913e-06, + "loss": 0.2393, + "step": 5780 + }, + { + "epoch": 0.4016475178842402, + "grad_norm": 1.328125, + "learning_rate": 3.9932173913043485e-06, + "loss": 0.3191, + "step": 5790 + }, + { + "epoch": 0.4023412096249729, + "grad_norm": 1.09375, + "learning_rate": 3.991478260869566e-06, + "loss": 0.2143, + "step": 5800 + }, + { + "epoch": 0.40303490136570563, + "grad_norm": 1.4296875, + "learning_rate": 3.989739130434782e-06, + "loss": 0.2801, + "step": 5810 + }, + { + "epoch": 0.4037285931064383, + "grad_norm": 1.4140625, + "learning_rate": 3.988000000000001e-06, + "loss": 0.2921, + "step": 5820 + }, + { + "epoch": 0.404422284847171, + "grad_norm": 1.2265625, + "learning_rate": 3.986260869565218e-06, + "loss": 0.3027, + "step": 5830 + }, + { + "epoch": 0.40511597658790377, + "grad_norm": 1.53125, + "learning_rate": 3.984521739130435e-06, + "loss": 0.2357, + "step": 5840 + }, + { + "epoch": 0.40580966832863646, + "grad_norm": 0.99609375, + "learning_rate": 3.982782608695652e-06, + "loss": 0.2426, + "step": 5850 + }, + { + "epoch": 0.40650336006936916, + "grad_norm": 1.3515625, + "learning_rate": 3.98104347826087e-06, + "loss": 0.2232, + "step": 5860 + }, + { + "epoch": 0.4071970518101019, + "grad_norm": 1.296875, + "learning_rate": 3.9793043478260875e-06, + "loss": 0.2597, + "step": 5870 + }, + { + "epoch": 0.4078907435508346, + "grad_norm": 1.03125, + "learning_rate": 3.977565217391305e-06, + "loss": 0.2848, + "step": 5880 + }, + { + "epoch": 0.4085844352915673, + "grad_norm": 1.3203125, + "learning_rate": 3.975826086956522e-06, + "loss": 0.2033, + "step": 5890 + }, + { + "epoch": 0.40927812703230004, + "grad_norm": 1.2578125, + "learning_rate": 3.97408695652174e-06, + "loss": 0.2851, + "step": 5900 + }, + { + "epoch": 0.40997181877303274, + "grad_norm": 1.078125, + "learning_rate": 3.972347826086957e-06, + "loss": 0.2773, + "step": 5910 + }, + { + "epoch": 0.41066551051376543, + "grad_norm": 1.375, + "learning_rate": 3.970608695652174e-06, + "loss": 0.3003, + "step": 5920 + }, + { + "epoch": 0.4113592022544982, + "grad_norm": 1.328125, + "learning_rate": 3.968869565217392e-06, + "loss": 0.2621, + "step": 5930 + }, + { + "epoch": 0.4120528939952309, + "grad_norm": 1.2890625, + "learning_rate": 3.967130434782609e-06, + "loss": 0.2287, + "step": 5940 + }, + { + "epoch": 0.41274658573596357, + "grad_norm": 1.1640625, + "learning_rate": 3.9653913043478264e-06, + "loss": 0.2912, + "step": 5950 + }, + { + "epoch": 0.4134402774766963, + "grad_norm": 1.1171875, + "learning_rate": 3.963652173913044e-06, + "loss": 0.1962, + "step": 5960 + }, + { + "epoch": 0.414133969217429, + "grad_norm": 0.88671875, + "learning_rate": 3.961913043478261e-06, + "loss": 0.2444, + "step": 5970 + }, + { + "epoch": 0.4148276609581617, + "grad_norm": 1.1328125, + "learning_rate": 3.9601739130434785e-06, + "loss": 0.2362, + "step": 5980 + }, + { + "epoch": 0.41552135269889445, + "grad_norm": 1.15625, + "learning_rate": 3.958434782608696e-06, + "loss": 0.2421, + "step": 5990 + }, + { + "epoch": 0.41621504443962715, + "grad_norm": 1.0390625, + "learning_rate": 3.956695652173913e-06, + "loss": 0.2565, + "step": 6000 + }, + { + "epoch": 0.41690873618035984, + "grad_norm": 1.53125, + "learning_rate": 3.954956521739131e-06, + "loss": 0.2551, + "step": 6010 + }, + { + "epoch": 0.4176024279210926, + "grad_norm": 1.234375, + "learning_rate": 3.953217391304348e-06, + "loss": 0.2542, + "step": 6020 + }, + { + "epoch": 0.4182961196618253, + "grad_norm": 1.2421875, + "learning_rate": 3.951478260869565e-06, + "loss": 0.2514, + "step": 6030 + }, + { + "epoch": 0.418989811402558, + "grad_norm": 1.6875, + "learning_rate": 3.949739130434783e-06, + "loss": 0.2156, + "step": 6040 + }, + { + "epoch": 0.4196835031432907, + "grad_norm": 0.96484375, + "learning_rate": 3.948e-06, + "loss": 0.2041, + "step": 6050 + }, + { + "epoch": 0.4203771948840234, + "grad_norm": 1.5859375, + "learning_rate": 3.9462608695652175e-06, + "loss": 0.245, + "step": 6060 + }, + { + "epoch": 0.4210708866247561, + "grad_norm": 1.125, + "learning_rate": 3.944521739130435e-06, + "loss": 0.2536, + "step": 6070 + }, + { + "epoch": 0.42176457836548886, + "grad_norm": 1.28125, + "learning_rate": 3.942782608695653e-06, + "loss": 0.2117, + "step": 6080 + }, + { + "epoch": 0.42245827010622156, + "grad_norm": 1.359375, + "learning_rate": 3.94104347826087e-06, + "loss": 0.2369, + "step": 6090 + }, + { + "epoch": 0.42315196184695425, + "grad_norm": 1.484375, + "learning_rate": 3.939304347826087e-06, + "loss": 0.2449, + "step": 6100 + }, + { + "epoch": 0.423845653587687, + "grad_norm": 1.4921875, + "learning_rate": 3.937565217391305e-06, + "loss": 0.3172, + "step": 6110 + }, + { + "epoch": 0.4245393453284197, + "grad_norm": 0.984375, + "learning_rate": 3.935826086956522e-06, + "loss": 0.2349, + "step": 6120 + }, + { + "epoch": 0.4252330370691524, + "grad_norm": 1.09375, + "learning_rate": 3.934086956521739e-06, + "loss": 0.2547, + "step": 6130 + }, + { + "epoch": 0.4259267288098851, + "grad_norm": 1.125, + "learning_rate": 3.9323478260869565e-06, + "loss": 0.2688, + "step": 6140 + }, + { + "epoch": 0.42662042055061783, + "grad_norm": 1.46875, + "learning_rate": 3.930608695652175e-06, + "loss": 0.3595, + "step": 6150 + }, + { + "epoch": 0.4273141122913505, + "grad_norm": 1.125, + "learning_rate": 3.928869565217391e-06, + "loss": 0.2317, + "step": 6160 + }, + { + "epoch": 0.4280078040320832, + "grad_norm": 1.6328125, + "learning_rate": 3.927130434782609e-06, + "loss": 0.2359, + "step": 6170 + }, + { + "epoch": 0.42870149577281597, + "grad_norm": 1.125, + "learning_rate": 3.925391304347827e-06, + "loss": 0.2333, + "step": 6180 + }, + { + "epoch": 0.42939518751354866, + "grad_norm": 1.1328125, + "learning_rate": 3.923652173913044e-06, + "loss": 0.2054, + "step": 6190 + }, + { + "epoch": 0.43008887925428135, + "grad_norm": 1.40625, + "learning_rate": 3.921913043478261e-06, + "loss": 0.2491, + "step": 6200 + }, + { + "epoch": 0.4307825709950141, + "grad_norm": 1.328125, + "learning_rate": 3.920173913043478e-06, + "loss": 0.2249, + "step": 6210 + }, + { + "epoch": 0.4314762627357468, + "grad_norm": 1.078125, + "learning_rate": 3.918434782608696e-06, + "loss": 0.2962, + "step": 6220 + }, + { + "epoch": 0.4321699544764795, + "grad_norm": 1.59375, + "learning_rate": 3.916695652173914e-06, + "loss": 0.3596, + "step": 6230 + }, + { + "epoch": 0.43286364621721224, + "grad_norm": 1.28125, + "learning_rate": 3.91495652173913e-06, + "loss": 0.2359, + "step": 6240 + }, + { + "epoch": 0.43355733795794493, + "grad_norm": 1.4140625, + "learning_rate": 3.913217391304348e-06, + "loss": 0.3381, + "step": 6250 + }, + { + "epoch": 0.4342510296986776, + "grad_norm": 1.171875, + "learning_rate": 3.911478260869566e-06, + "loss": 0.24, + "step": 6260 + }, + { + "epoch": 0.4349447214394104, + "grad_norm": 1.21875, + "learning_rate": 3.909739130434783e-06, + "loss": 0.2566, + "step": 6270 + }, + { + "epoch": 0.43563841318014307, + "grad_norm": 1.359375, + "learning_rate": 3.9080000000000005e-06, + "loss": 0.2599, + "step": 6280 + }, + { + "epoch": 0.43633210492087576, + "grad_norm": 1.234375, + "learning_rate": 3.906260869565218e-06, + "loss": 0.2257, + "step": 6290 + }, + { + "epoch": 0.4370257966616085, + "grad_norm": 1.078125, + "learning_rate": 3.904521739130435e-06, + "loss": 0.229, + "step": 6300 + }, + { + "epoch": 0.4377194884023412, + "grad_norm": 1.2109375, + "learning_rate": 3.902782608695653e-06, + "loss": 0.2464, + "step": 6310 + }, + { + "epoch": 0.4384131801430739, + "grad_norm": 1.34375, + "learning_rate": 3.90104347826087e-06, + "loss": 0.2392, + "step": 6320 + }, + { + "epoch": 0.43910687188380665, + "grad_norm": 1.6015625, + "learning_rate": 3.899304347826087e-06, + "loss": 0.2661, + "step": 6330 + }, + { + "epoch": 0.43980056362453934, + "grad_norm": 0.94140625, + "learning_rate": 3.897565217391305e-06, + "loss": 0.2971, + "step": 6340 + }, + { + "epoch": 0.44049425536527204, + "grad_norm": 1.2421875, + "learning_rate": 3.895826086956522e-06, + "loss": 0.2578, + "step": 6350 + }, + { + "epoch": 0.4411879471060048, + "grad_norm": 1.125, + "learning_rate": 3.8940869565217395e-06, + "loss": 0.2107, + "step": 6360 + }, + { + "epoch": 0.4418816388467375, + "grad_norm": 1.1640625, + "learning_rate": 3.892347826086957e-06, + "loss": 0.213, + "step": 6370 + }, + { + "epoch": 0.4425753305874702, + "grad_norm": 1.4375, + "learning_rate": 3.890608695652174e-06, + "loss": 0.3, + "step": 6380 + }, + { + "epoch": 0.4432690223282029, + "grad_norm": 1.21875, + "learning_rate": 3.888869565217392e-06, + "loss": 0.2388, + "step": 6390 + }, + { + "epoch": 0.4439627140689356, + "grad_norm": 1.0859375, + "learning_rate": 3.887130434782609e-06, + "loss": 0.2557, + "step": 6400 + }, + { + "epoch": 0.4446564058096683, + "grad_norm": 0.71484375, + "learning_rate": 3.885391304347826e-06, + "loss": 0.2638, + "step": 6410 + }, + { + "epoch": 0.44535009755040106, + "grad_norm": 1.5234375, + "learning_rate": 3.883652173913044e-06, + "loss": 0.2884, + "step": 6420 + }, + { + "epoch": 0.44604378929113375, + "grad_norm": 1.2265625, + "learning_rate": 3.881913043478261e-06, + "loss": 0.2356, + "step": 6430 + }, + { + "epoch": 0.44673748103186645, + "grad_norm": 1.65625, + "learning_rate": 3.8801739130434785e-06, + "loss": 0.2654, + "step": 6440 + }, + { + "epoch": 0.4474311727725992, + "grad_norm": 1.0390625, + "learning_rate": 3.878434782608696e-06, + "loss": 0.2445, + "step": 6450 + }, + { + "epoch": 0.4481248645133319, + "grad_norm": 1.1796875, + "learning_rate": 3.876695652173913e-06, + "loss": 0.2363, + "step": 6460 + }, + { + "epoch": 0.4488185562540646, + "grad_norm": 1.234375, + "learning_rate": 3.874956521739131e-06, + "loss": 0.2402, + "step": 6470 + }, + { + "epoch": 0.44951224799479733, + "grad_norm": 1.0546875, + "learning_rate": 3.873217391304348e-06, + "loss": 0.2362, + "step": 6480 + }, + { + "epoch": 0.45020593973553, + "grad_norm": 1.2890625, + "learning_rate": 3.871478260869565e-06, + "loss": 0.3399, + "step": 6490 + }, + { + "epoch": 0.4508996314762627, + "grad_norm": 1.2421875, + "learning_rate": 3.869739130434783e-06, + "loss": 0.2556, + "step": 6500 + }, + { + "epoch": 0.45159332321699547, + "grad_norm": 1.0390625, + "learning_rate": 3.868e-06, + "loss": 0.2391, + "step": 6510 + }, + { + "epoch": 0.45228701495772816, + "grad_norm": 1.1875, + "learning_rate": 3.8662608695652174e-06, + "loss": 0.315, + "step": 6520 + }, + { + "epoch": 0.45298070669846086, + "grad_norm": 1.5546875, + "learning_rate": 3.864521739130435e-06, + "loss": 0.2522, + "step": 6530 + }, + { + "epoch": 0.4536743984391936, + "grad_norm": 1.2109375, + "learning_rate": 3.862782608695653e-06, + "loss": 0.2416, + "step": 6540 + }, + { + "epoch": 0.4543680901799263, + "grad_norm": 1.328125, + "learning_rate": 3.8610434782608696e-06, + "loss": 0.2644, + "step": 6550 + }, + { + "epoch": 0.455061781920659, + "grad_norm": 1.140625, + "learning_rate": 3.859304347826087e-06, + "loss": 0.2374, + "step": 6560 + }, + { + "epoch": 0.45575547366139174, + "grad_norm": 1.375, + "learning_rate": 3.857565217391305e-06, + "loss": 0.2737, + "step": 6570 + }, + { + "epoch": 0.45644916540212443, + "grad_norm": 1.015625, + "learning_rate": 3.8558260869565225e-06, + "loss": 0.2772, + "step": 6580 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 1.2109375, + "learning_rate": 3.854086956521739e-06, + "loss": 0.2583, + "step": 6590 + }, + { + "epoch": 0.4578365488835899, + "grad_norm": 1.546875, + "learning_rate": 3.852347826086956e-06, + "loss": 0.2847, + "step": 6600 + }, + { + "epoch": 0.45853024062432257, + "grad_norm": 1.1875, + "learning_rate": 3.850608695652175e-06, + "loss": 0.213, + "step": 6610 + }, + { + "epoch": 0.45922393236505527, + "grad_norm": 0.98046875, + "learning_rate": 3.848869565217392e-06, + "loss": 0.2157, + "step": 6620 + }, + { + "epoch": 0.459917624105788, + "grad_norm": 1.375, + "learning_rate": 3.8471304347826085e-06, + "loss": 0.2696, + "step": 6630 + }, + { + "epoch": 0.4606113158465207, + "grad_norm": 1.046875, + "learning_rate": 3.845391304347827e-06, + "loss": 0.2729, + "step": 6640 + }, + { + "epoch": 0.4613050075872534, + "grad_norm": 1.1796875, + "learning_rate": 3.843652173913044e-06, + "loss": 0.2258, + "step": 6650 + }, + { + "epoch": 0.46199869932798615, + "grad_norm": 1.125, + "learning_rate": 3.8419130434782615e-06, + "loss": 0.2469, + "step": 6660 + }, + { + "epoch": 0.46269239106871884, + "grad_norm": 0.94140625, + "learning_rate": 3.840173913043478e-06, + "loss": 0.2038, + "step": 6670 + }, + { + "epoch": 0.46338608280945154, + "grad_norm": 1.1171875, + "learning_rate": 3.838434782608696e-06, + "loss": 0.315, + "step": 6680 + }, + { + "epoch": 0.4640797745501843, + "grad_norm": 1.453125, + "learning_rate": 3.836695652173914e-06, + "loss": 0.2534, + "step": 6690 + }, + { + "epoch": 0.464773466290917, + "grad_norm": 1.265625, + "learning_rate": 3.834956521739131e-06, + "loss": 0.2372, + "step": 6700 + }, + { + "epoch": 0.4654671580316497, + "grad_norm": 1.234375, + "learning_rate": 3.833217391304348e-06, + "loss": 0.246, + "step": 6710 + }, + { + "epoch": 0.4661608497723824, + "grad_norm": 1.1171875, + "learning_rate": 3.831478260869566e-06, + "loss": 0.2586, + "step": 6720 + }, + { + "epoch": 0.4668545415131151, + "grad_norm": 1.25, + "learning_rate": 3.829739130434783e-06, + "loss": 0.2605, + "step": 6730 + }, + { + "epoch": 0.4675482332538478, + "grad_norm": 1.5625, + "learning_rate": 3.8280000000000004e-06, + "loss": 0.2853, + "step": 6740 + }, + { + "epoch": 0.46824192499458056, + "grad_norm": 1.375, + "learning_rate": 3.826260869565218e-06, + "loss": 0.2116, + "step": 6750 + }, + { + "epoch": 0.46893561673531325, + "grad_norm": 1.09375, + "learning_rate": 3.824521739130435e-06, + "loss": 0.3067, + "step": 6760 + }, + { + "epoch": 0.46962930847604595, + "grad_norm": 1.1953125, + "learning_rate": 3.8227826086956526e-06, + "loss": 0.2128, + "step": 6770 + }, + { + "epoch": 0.47032300021677864, + "grad_norm": 0.85546875, + "learning_rate": 3.82104347826087e-06, + "loss": 0.2067, + "step": 6780 + }, + { + "epoch": 0.4710166919575114, + "grad_norm": 1.296875, + "learning_rate": 3.819304347826087e-06, + "loss": 0.2094, + "step": 6790 + }, + { + "epoch": 0.4717103836982441, + "grad_norm": 1.4140625, + "learning_rate": 3.817565217391305e-06, + "loss": 0.2455, + "step": 6800 + }, + { + "epoch": 0.4724040754389768, + "grad_norm": 1.1796875, + "learning_rate": 3.815826086956522e-06, + "loss": 0.2471, + "step": 6810 + }, + { + "epoch": 0.4730977671797095, + "grad_norm": 1.5546875, + "learning_rate": 3.8140869565217394e-06, + "loss": 0.2839, + "step": 6820 + }, + { + "epoch": 0.4737914589204422, + "grad_norm": 1.5703125, + "learning_rate": 3.812347826086957e-06, + "loss": 0.2218, + "step": 6830 + }, + { + "epoch": 0.4744851506611749, + "grad_norm": 1.1875, + "learning_rate": 3.810608695652174e-06, + "loss": 0.2793, + "step": 6840 + }, + { + "epoch": 0.47517884240190766, + "grad_norm": 1.9453125, + "learning_rate": 3.808869565217392e-06, + "loss": 0.3007, + "step": 6850 + }, + { + "epoch": 0.47587253414264036, + "grad_norm": 0.99609375, + "learning_rate": 3.807130434782609e-06, + "loss": 0.1971, + "step": 6860 + }, + { + "epoch": 0.47656622588337305, + "grad_norm": 0.89453125, + "learning_rate": 3.8053913043478263e-06, + "loss": 0.2195, + "step": 6870 + }, + { + "epoch": 0.4772599176241058, + "grad_norm": 1.140625, + "learning_rate": 3.803652173913044e-06, + "loss": 0.2397, + "step": 6880 + }, + { + "epoch": 0.4779536093648385, + "grad_norm": 1.5625, + "learning_rate": 3.8019130434782614e-06, + "loss": 0.2547, + "step": 6890 + }, + { + "epoch": 0.4786473011055712, + "grad_norm": 1.1015625, + "learning_rate": 3.8001739130434784e-06, + "loss": 0.2687, + "step": 6900 + }, + { + "epoch": 0.47934099284630394, + "grad_norm": 1.46875, + "learning_rate": 3.7984347826086958e-06, + "loss": 0.2853, + "step": 6910 + }, + { + "epoch": 0.48003468458703663, + "grad_norm": 1.140625, + "learning_rate": 3.7966956521739136e-06, + "loss": 0.2279, + "step": 6920 + }, + { + "epoch": 0.4807283763277693, + "grad_norm": 1.5078125, + "learning_rate": 3.794956521739131e-06, + "loss": 0.2167, + "step": 6930 + }, + { + "epoch": 0.4814220680685021, + "grad_norm": 1.1875, + "learning_rate": 3.793217391304348e-06, + "loss": 0.3339, + "step": 6940 + }, + { + "epoch": 0.48211575980923477, + "grad_norm": 1.46875, + "learning_rate": 3.7914782608695657e-06, + "loss": 0.2604, + "step": 6950 + }, + { + "epoch": 0.48280945154996746, + "grad_norm": 1.0078125, + "learning_rate": 3.789739130434783e-06, + "loss": 0.2045, + "step": 6960 + }, + { + "epoch": 0.4835031432907002, + "grad_norm": 1.40625, + "learning_rate": 3.7880000000000004e-06, + "loss": 0.2629, + "step": 6970 + }, + { + "epoch": 0.4841968350314329, + "grad_norm": 1.1484375, + "learning_rate": 3.7862608695652174e-06, + "loss": 0.2602, + "step": 6980 + }, + { + "epoch": 0.4848905267721656, + "grad_norm": 1.6171875, + "learning_rate": 3.784521739130435e-06, + "loss": 0.3032, + "step": 6990 + }, + { + "epoch": 0.48558421851289835, + "grad_norm": 0.9453125, + "learning_rate": 3.7827826086956525e-06, + "loss": 0.238, + "step": 7000 + }, + { + "epoch": 0.48627791025363104, + "grad_norm": 1.3203125, + "learning_rate": 3.7810434782608703e-06, + "loss": 0.2422, + "step": 7010 + }, + { + "epoch": 0.48697160199436373, + "grad_norm": 1.0234375, + "learning_rate": 3.7793043478260873e-06, + "loss": 0.2349, + "step": 7020 + }, + { + "epoch": 0.4876652937350965, + "grad_norm": 1.3125, + "learning_rate": 3.7775652173913046e-06, + "loss": 0.23, + "step": 7030 + }, + { + "epoch": 0.4883589854758292, + "grad_norm": 1.03125, + "learning_rate": 3.775826086956522e-06, + "loss": 0.2634, + "step": 7040 + }, + { + "epoch": 0.48905267721656187, + "grad_norm": 1.453125, + "learning_rate": 3.7740869565217394e-06, + "loss": 0.2912, + "step": 7050 + }, + { + "epoch": 0.4897463689572946, + "grad_norm": 1.0234375, + "learning_rate": 3.7723478260869567e-06, + "loss": 0.232, + "step": 7060 + }, + { + "epoch": 0.4904400606980273, + "grad_norm": 1.0390625, + "learning_rate": 3.770608695652174e-06, + "loss": 0.2252, + "step": 7070 + }, + { + "epoch": 0.49113375243876, + "grad_norm": 0.8125, + "learning_rate": 3.768869565217392e-06, + "loss": 0.2311, + "step": 7080 + }, + { + "epoch": 0.49182744417949276, + "grad_norm": 1.234375, + "learning_rate": 3.767130434782609e-06, + "loss": 0.2493, + "step": 7090 + }, + { + "epoch": 0.49252113592022545, + "grad_norm": 1.3671875, + "learning_rate": 3.7653913043478262e-06, + "loss": 0.2665, + "step": 7100 + }, + { + "epoch": 0.49321482766095814, + "grad_norm": 1.328125, + "learning_rate": 3.763652173913044e-06, + "loss": 0.2158, + "step": 7110 + }, + { + "epoch": 0.4939085194016909, + "grad_norm": 1.1328125, + "learning_rate": 3.7619130434782614e-06, + "loss": 0.2555, + "step": 7120 + }, + { + "epoch": 0.4946022111424236, + "grad_norm": 0.9296875, + "learning_rate": 3.7601739130434783e-06, + "loss": 0.2437, + "step": 7130 + }, + { + "epoch": 0.4952959028831563, + "grad_norm": 1.15625, + "learning_rate": 3.7584347826086957e-06, + "loss": 0.2703, + "step": 7140 + }, + { + "epoch": 0.49598959462388903, + "grad_norm": 1.203125, + "learning_rate": 3.7566956521739135e-06, + "loss": 0.2314, + "step": 7150 + }, + { + "epoch": 0.4966832863646217, + "grad_norm": 1.265625, + "learning_rate": 3.754956521739131e-06, + "loss": 0.2534, + "step": 7160 + }, + { + "epoch": 0.4973769781053544, + "grad_norm": 1.2578125, + "learning_rate": 3.753217391304348e-06, + "loss": 0.2763, + "step": 7170 + }, + { + "epoch": 0.49807066984608717, + "grad_norm": 1.578125, + "learning_rate": 3.7514782608695656e-06, + "loss": 0.2734, + "step": 7180 + }, + { + "epoch": 0.49876436158681986, + "grad_norm": 1.1640625, + "learning_rate": 3.749739130434783e-06, + "loss": 0.2158, + "step": 7190 + }, + { + "epoch": 0.49945805332755255, + "grad_norm": 1.4765625, + "learning_rate": 3.7480000000000004e-06, + "loss": 0.2429, + "step": 7200 + }, + { + "epoch": 0.5001517450682853, + "grad_norm": 1.2421875, + "learning_rate": 3.7462608695652173e-06, + "loss": 0.2961, + "step": 7210 + }, + { + "epoch": 0.5008454368090179, + "grad_norm": 1.4375, + "learning_rate": 3.744521739130435e-06, + "loss": 0.2368, + "step": 7220 + }, + { + "epoch": 0.5015391285497507, + "grad_norm": 1.5625, + "learning_rate": 3.7427826086956525e-06, + "loss": 0.2605, + "step": 7230 + }, + { + "epoch": 0.5022328202904834, + "grad_norm": 0.83984375, + "learning_rate": 3.7410434782608703e-06, + "loss": 0.2194, + "step": 7240 + }, + { + "epoch": 0.5029265120312161, + "grad_norm": 1.1953125, + "learning_rate": 3.7393043478260872e-06, + "loss": 0.2153, + "step": 7250 + }, + { + "epoch": 0.5036202037719488, + "grad_norm": 1.671875, + "learning_rate": 3.7375652173913046e-06, + "loss": 0.2329, + "step": 7260 + }, + { + "epoch": 0.5043138955126816, + "grad_norm": 1.8203125, + "learning_rate": 3.735826086956522e-06, + "loss": 0.2821, + "step": 7270 + }, + { + "epoch": 0.5050075872534142, + "grad_norm": 1.3984375, + "learning_rate": 3.7340869565217398e-06, + "loss": 0.263, + "step": 7280 + }, + { + "epoch": 0.505701278994147, + "grad_norm": 1.3046875, + "learning_rate": 3.7323478260869567e-06, + "loss": 0.2428, + "step": 7290 + }, + { + "epoch": 0.5063949707348797, + "grad_norm": 1.015625, + "learning_rate": 3.730608695652174e-06, + "loss": 0.2353, + "step": 7300 + }, + { + "epoch": 0.5070886624756124, + "grad_norm": 1.0234375, + "learning_rate": 3.728869565217392e-06, + "loss": 0.2616, + "step": 7310 + }, + { + "epoch": 0.5077823542163451, + "grad_norm": 1.3203125, + "learning_rate": 3.7271304347826092e-06, + "loss": 0.2314, + "step": 7320 + }, + { + "epoch": 0.5084760459570778, + "grad_norm": 1.171875, + "learning_rate": 3.725391304347826e-06, + "loss": 0.2484, + "step": 7330 + }, + { + "epoch": 0.5091697376978105, + "grad_norm": 1.1796875, + "learning_rate": 3.723652173913044e-06, + "loss": 0.2182, + "step": 7340 + }, + { + "epoch": 0.5098634294385432, + "grad_norm": 1.296875, + "learning_rate": 3.7219130434782614e-06, + "loss": 0.1906, + "step": 7350 + }, + { + "epoch": 0.510557121179276, + "grad_norm": 1.2421875, + "learning_rate": 3.7201739130434783e-06, + "loss": 0.2971, + "step": 7360 + }, + { + "epoch": 0.5112508129200086, + "grad_norm": 1.359375, + "learning_rate": 3.7184347826086957e-06, + "loss": 0.2281, + "step": 7370 + }, + { + "epoch": 0.5119445046607414, + "grad_norm": 1.4375, + "learning_rate": 3.7166956521739135e-06, + "loss": 0.2597, + "step": 7380 + }, + { + "epoch": 0.5126381964014741, + "grad_norm": 1.3125, + "learning_rate": 3.714956521739131e-06, + "loss": 0.2685, + "step": 7390 + }, + { + "epoch": 0.5133318881422068, + "grad_norm": 0.98046875, + "learning_rate": 3.713217391304348e-06, + "loss": 0.2763, + "step": 7400 + }, + { + "epoch": 0.5140255798829395, + "grad_norm": 1.3203125, + "learning_rate": 3.7114782608695656e-06, + "loss": 0.2088, + "step": 7410 + }, + { + "epoch": 0.5147192716236723, + "grad_norm": 4.9375, + "learning_rate": 3.709739130434783e-06, + "loss": 0.2388, + "step": 7420 + }, + { + "epoch": 0.5154129633644049, + "grad_norm": 1.3046875, + "learning_rate": 3.7080000000000003e-06, + "loss": 0.2594, + "step": 7430 + }, + { + "epoch": 0.5161066551051376, + "grad_norm": 1.7265625, + "learning_rate": 3.7062608695652173e-06, + "loss": 0.2595, + "step": 7440 + }, + { + "epoch": 0.5168003468458704, + "grad_norm": 1.2578125, + "learning_rate": 3.704521739130435e-06, + "loss": 0.2404, + "step": 7450 + }, + { + "epoch": 0.517494038586603, + "grad_norm": 1.25, + "learning_rate": 3.7027826086956524e-06, + "loss": 0.2635, + "step": 7460 + }, + { + "epoch": 0.5181877303273358, + "grad_norm": 0.875, + "learning_rate": 3.7010434782608702e-06, + "loss": 0.245, + "step": 7470 + }, + { + "epoch": 0.5188814220680685, + "grad_norm": 1.171875, + "learning_rate": 3.699304347826087e-06, + "loss": 0.2676, + "step": 7480 + }, + { + "epoch": 0.5195751138088012, + "grad_norm": 1.21875, + "learning_rate": 3.6975652173913046e-06, + "loss": 0.2699, + "step": 7490 + }, + { + "epoch": 0.5202688055495339, + "grad_norm": 1.234375, + "learning_rate": 3.695826086956522e-06, + "loss": 0.244, + "step": 7500 + }, + { + "epoch": 0.5209624972902667, + "grad_norm": 1.390625, + "learning_rate": 3.6940869565217397e-06, + "loss": 0.2438, + "step": 7510 + }, + { + "epoch": 0.5216561890309993, + "grad_norm": 1.2890625, + "learning_rate": 3.6923478260869567e-06, + "loss": 0.2516, + "step": 7520 + }, + { + "epoch": 0.5223498807717321, + "grad_norm": 1.0625, + "learning_rate": 3.690608695652174e-06, + "loss": 0.2458, + "step": 7530 + }, + { + "epoch": 0.5230435725124648, + "grad_norm": 1.375, + "learning_rate": 3.688869565217392e-06, + "loss": 0.2696, + "step": 7540 + }, + { + "epoch": 0.5237372642531974, + "grad_norm": 1.453125, + "learning_rate": 3.687130434782609e-06, + "loss": 0.2365, + "step": 7550 + }, + { + "epoch": 0.5244309559939302, + "grad_norm": 1.609375, + "learning_rate": 3.685391304347826e-06, + "loss": 0.2247, + "step": 7560 + }, + { + "epoch": 0.5251246477346629, + "grad_norm": 1.390625, + "learning_rate": 3.683652173913044e-06, + "loss": 0.2388, + "step": 7570 + }, + { + "epoch": 0.5258183394753956, + "grad_norm": 1.171875, + "learning_rate": 3.6819130434782613e-06, + "loss": 0.2709, + "step": 7580 + }, + { + "epoch": 0.5265120312161283, + "grad_norm": 1.2578125, + "learning_rate": 3.6801739130434787e-06, + "loss": 0.3069, + "step": 7590 + }, + { + "epoch": 0.5272057229568611, + "grad_norm": 1.046875, + "learning_rate": 3.6784347826086956e-06, + "loss": 0.2254, + "step": 7600 + }, + { + "epoch": 0.5278994146975937, + "grad_norm": 1.0390625, + "learning_rate": 3.6766956521739134e-06, + "loss": 0.2332, + "step": 7610 + }, + { + "epoch": 0.5285931064383265, + "grad_norm": 1.3359375, + "learning_rate": 3.674956521739131e-06, + "loss": 0.2917, + "step": 7620 + }, + { + "epoch": 0.5292867981790592, + "grad_norm": 1.203125, + "learning_rate": 3.6732173913043486e-06, + "loss": 0.2575, + "step": 7630 + }, + { + "epoch": 0.5299804899197919, + "grad_norm": 0.96875, + "learning_rate": 3.6714782608695655e-06, + "loss": 0.2636, + "step": 7640 + }, + { + "epoch": 0.5306741816605246, + "grad_norm": 1.0390625, + "learning_rate": 3.669739130434783e-06, + "loss": 0.2782, + "step": 7650 + }, + { + "epoch": 0.5313678734012574, + "grad_norm": 1.4296875, + "learning_rate": 3.6680000000000003e-06, + "loss": 0.2155, + "step": 7660 + }, + { + "epoch": 0.53206156514199, + "grad_norm": 1.96875, + "learning_rate": 3.6662608695652172e-06, + "loss": 0.2726, + "step": 7670 + }, + { + "epoch": 0.5327552568827227, + "grad_norm": 1.453125, + "learning_rate": 3.664521739130435e-06, + "loss": 0.2456, + "step": 7680 + }, + { + "epoch": 0.5334489486234555, + "grad_norm": 1.328125, + "learning_rate": 3.6627826086956524e-06, + "loss": 0.2691, + "step": 7690 + }, + { + "epoch": 0.5341426403641881, + "grad_norm": 1.421875, + "learning_rate": 3.66104347826087e-06, + "loss": 0.2618, + "step": 7700 + }, + { + "epoch": 0.5348363321049209, + "grad_norm": 1.3359375, + "learning_rate": 3.659304347826087e-06, + "loss": 0.2684, + "step": 7710 + }, + { + "epoch": 0.5355300238456536, + "grad_norm": 0.8828125, + "learning_rate": 3.6575652173913045e-06, + "loss": 0.2165, + "step": 7720 + }, + { + "epoch": 0.5362237155863863, + "grad_norm": 0.79296875, + "learning_rate": 3.655826086956522e-06, + "loss": 0.2917, + "step": 7730 + }, + { + "epoch": 0.536917407327119, + "grad_norm": 1.203125, + "learning_rate": 3.6540869565217397e-06, + "loss": 0.2629, + "step": 7740 + }, + { + "epoch": 0.5376110990678518, + "grad_norm": 1.046875, + "learning_rate": 3.6523478260869566e-06, + "loss": 0.2612, + "step": 7750 + }, + { + "epoch": 0.5383047908085844, + "grad_norm": 1.1953125, + "learning_rate": 3.650608695652174e-06, + "loss": 0.3481, + "step": 7760 + }, + { + "epoch": 0.5389984825493171, + "grad_norm": 1.5859375, + "learning_rate": 3.648869565217392e-06, + "loss": 0.2939, + "step": 7770 + }, + { + "epoch": 0.5396921742900499, + "grad_norm": 1.1015625, + "learning_rate": 3.647130434782609e-06, + "loss": 0.232, + "step": 7780 + }, + { + "epoch": 0.5403858660307825, + "grad_norm": 1.3359375, + "learning_rate": 3.645391304347826e-06, + "loss": 0.2843, + "step": 7790 + }, + { + "epoch": 0.5410795577715153, + "grad_norm": 1.0, + "learning_rate": 3.643652173913044e-06, + "loss": 0.2311, + "step": 7800 + }, + { + "epoch": 0.541773249512248, + "grad_norm": 1.078125, + "learning_rate": 3.6419130434782613e-06, + "loss": 0.2587, + "step": 7810 + }, + { + "epoch": 0.5424669412529807, + "grad_norm": 0.95703125, + "learning_rate": 3.6401739130434786e-06, + "loss": 0.2894, + "step": 7820 + }, + { + "epoch": 0.5431606329937134, + "grad_norm": 1.0078125, + "learning_rate": 3.6384347826086956e-06, + "loss": 0.2337, + "step": 7830 + }, + { + "epoch": 0.5438543247344462, + "grad_norm": 0.9921875, + "learning_rate": 3.6366956521739134e-06, + "loss": 0.2619, + "step": 7840 + }, + { + "epoch": 0.5445480164751788, + "grad_norm": 1.15625, + "learning_rate": 3.6349565217391308e-06, + "loss": 0.3071, + "step": 7850 + }, + { + "epoch": 0.5452417082159116, + "grad_norm": 0.87109375, + "learning_rate": 3.6332173913043486e-06, + "loss": 0.2436, + "step": 7860 + }, + { + "epoch": 0.5459353999566443, + "grad_norm": 0.98046875, + "learning_rate": 3.6314782608695655e-06, + "loss": 0.316, + "step": 7870 + }, + { + "epoch": 0.546629091697377, + "grad_norm": 1.6328125, + "learning_rate": 3.629739130434783e-06, + "loss": 0.2135, + "step": 7880 + }, + { + "epoch": 0.5473227834381097, + "grad_norm": 1.3671875, + "learning_rate": 3.6280000000000002e-06, + "loss": 0.2578, + "step": 7890 + }, + { + "epoch": 0.5480164751788424, + "grad_norm": 1.2578125, + "learning_rate": 3.626260869565218e-06, + "loss": 0.2587, + "step": 7900 + }, + { + "epoch": 0.5487101669195751, + "grad_norm": 1.3828125, + "learning_rate": 3.624521739130435e-06, + "loss": 0.2425, + "step": 7910 + }, + { + "epoch": 0.5494038586603078, + "grad_norm": 1.421875, + "learning_rate": 3.6227826086956524e-06, + "loss": 0.2497, + "step": 7920 + }, + { + "epoch": 0.5500975504010406, + "grad_norm": 1.0546875, + "learning_rate": 3.62104347826087e-06, + "loss": 0.1969, + "step": 7930 + }, + { + "epoch": 0.5507912421417732, + "grad_norm": 1.25, + "learning_rate": 3.6193043478260875e-06, + "loss": 0.2794, + "step": 7940 + }, + { + "epoch": 0.551484933882506, + "grad_norm": 1.4921875, + "learning_rate": 3.6175652173913045e-06, + "loss": 0.2452, + "step": 7950 + }, + { + "epoch": 0.5521786256232387, + "grad_norm": 1.1640625, + "learning_rate": 3.615826086956522e-06, + "loss": 0.2181, + "step": 7960 + }, + { + "epoch": 0.5528723173639714, + "grad_norm": 1.2265625, + "learning_rate": 3.6140869565217396e-06, + "loss": 0.2407, + "step": 7970 + }, + { + "epoch": 0.5535660091047041, + "grad_norm": 1.0703125, + "learning_rate": 3.6123478260869566e-06, + "loss": 0.2556, + "step": 7980 + }, + { + "epoch": 0.5542597008454369, + "grad_norm": 1.234375, + "learning_rate": 3.610608695652174e-06, + "loss": 0.2783, + "step": 7990 + }, + { + "epoch": 0.5549533925861695, + "grad_norm": 1.59375, + "learning_rate": 3.6088695652173918e-06, + "loss": 0.2702, + "step": 8000 + }, + { + "epoch": 0.5556470843269022, + "grad_norm": 1.359375, + "learning_rate": 3.607130434782609e-06, + "loss": 0.2606, + "step": 8010 + }, + { + "epoch": 0.556340776067635, + "grad_norm": 1.21875, + "learning_rate": 3.605391304347826e-06, + "loss": 0.2103, + "step": 8020 + }, + { + "epoch": 0.5570344678083676, + "grad_norm": 1.34375, + "learning_rate": 3.603652173913044e-06, + "loss": 0.2451, + "step": 8030 + }, + { + "epoch": 0.5577281595491004, + "grad_norm": 1.2734375, + "learning_rate": 3.6019130434782612e-06, + "loss": 0.2489, + "step": 8040 + }, + { + "epoch": 0.5584218512898331, + "grad_norm": 1.2578125, + "learning_rate": 3.6001739130434786e-06, + "loss": 0.2254, + "step": 8050 + }, + { + "epoch": 0.5591155430305658, + "grad_norm": 1.5625, + "learning_rate": 3.5984347826086956e-06, + "loss": 0.2404, + "step": 8060 + }, + { + "epoch": 0.5598092347712985, + "grad_norm": 1.3125, + "learning_rate": 3.5966956521739134e-06, + "loss": 0.3151, + "step": 8070 + }, + { + "epoch": 0.5605029265120313, + "grad_norm": 1.4296875, + "learning_rate": 3.5949565217391307e-06, + "loss": 0.2554, + "step": 8080 + }, + { + "epoch": 0.5611966182527639, + "grad_norm": 1.234375, + "learning_rate": 3.5932173913043485e-06, + "loss": 0.25, + "step": 8090 + }, + { + "epoch": 0.5618903099934967, + "grad_norm": 1.390625, + "learning_rate": 3.5914782608695655e-06, + "loss": 0.2968, + "step": 8100 + }, + { + "epoch": 0.5625840017342294, + "grad_norm": 1.2109375, + "learning_rate": 3.589739130434783e-06, + "loss": 0.2523, + "step": 8110 + }, + { + "epoch": 0.563277693474962, + "grad_norm": 1.2421875, + "learning_rate": 3.588e-06, + "loss": 0.2254, + "step": 8120 + }, + { + "epoch": 0.5639713852156948, + "grad_norm": 1.578125, + "learning_rate": 3.586260869565218e-06, + "loss": 0.272, + "step": 8130 + }, + { + "epoch": 0.5646650769564275, + "grad_norm": 1.6953125, + "learning_rate": 3.584521739130435e-06, + "loss": 0.3524, + "step": 8140 + }, + { + "epoch": 0.5653587686971602, + "grad_norm": 1.1640625, + "learning_rate": 3.5827826086956523e-06, + "loss": 0.2291, + "step": 8150 + }, + { + "epoch": 0.5660524604378929, + "grad_norm": 1.34375, + "learning_rate": 3.58104347826087e-06, + "loss": 0.27, + "step": 8160 + }, + { + "epoch": 0.5667461521786257, + "grad_norm": 1.1640625, + "learning_rate": 3.5793043478260875e-06, + "loss": 0.324, + "step": 8170 + }, + { + "epoch": 0.5674398439193583, + "grad_norm": 1.015625, + "learning_rate": 3.5775652173913044e-06, + "loss": 0.3121, + "step": 8180 + }, + { + "epoch": 0.5681335356600911, + "grad_norm": 1.0078125, + "learning_rate": 3.575826086956522e-06, + "loss": 0.2035, + "step": 8190 + }, + { + "epoch": 0.5688272274008238, + "grad_norm": 1.1015625, + "learning_rate": 3.5740869565217396e-06, + "loss": 0.2779, + "step": 8200 + }, + { + "epoch": 0.5695209191415564, + "grad_norm": 1.46875, + "learning_rate": 3.572347826086957e-06, + "loss": 0.2545, + "step": 8210 + }, + { + "epoch": 0.5702146108822892, + "grad_norm": 1.2734375, + "learning_rate": 3.570608695652174e-06, + "loss": 0.3188, + "step": 8220 + }, + { + "epoch": 0.570908302623022, + "grad_norm": 1.1875, + "learning_rate": 3.5688695652173917e-06, + "loss": 0.2356, + "step": 8230 + }, + { + "epoch": 0.5716019943637546, + "grad_norm": 1.1484375, + "learning_rate": 3.567130434782609e-06, + "loss": 0.2316, + "step": 8240 + }, + { + "epoch": 0.5722956861044873, + "grad_norm": 1.5390625, + "learning_rate": 3.5653913043478265e-06, + "loss": 0.2724, + "step": 8250 + }, + { + "epoch": 0.5729893778452201, + "grad_norm": 1.34375, + "learning_rate": 3.563652173913044e-06, + "loss": 0.2535, + "step": 8260 + }, + { + "epoch": 0.5736830695859527, + "grad_norm": 1.125, + "learning_rate": 3.561913043478261e-06, + "loss": 0.2413, + "step": 8270 + }, + { + "epoch": 0.5743767613266855, + "grad_norm": 0.98046875, + "learning_rate": 3.5601739130434786e-06, + "loss": 0.2488, + "step": 8280 + }, + { + "epoch": 0.5750704530674182, + "grad_norm": 1.140625, + "learning_rate": 3.5584347826086955e-06, + "loss": 0.2248, + "step": 8290 + }, + { + "epoch": 0.5757641448081509, + "grad_norm": 1.1640625, + "learning_rate": 3.5566956521739133e-06, + "loss": 0.2339, + "step": 8300 + }, + { + "epoch": 0.5764578365488836, + "grad_norm": 2.25, + "learning_rate": 3.5549565217391307e-06, + "loss": 0.3776, + "step": 8310 + }, + { + "epoch": 0.5771515282896164, + "grad_norm": 1.2109375, + "learning_rate": 3.5532173913043485e-06, + "loss": 0.2102, + "step": 8320 + }, + { + "epoch": 0.577845220030349, + "grad_norm": 0.9921875, + "learning_rate": 3.5514782608695654e-06, + "loss": 0.2692, + "step": 8330 + }, + { + "epoch": 0.5785389117710817, + "grad_norm": 1.6796875, + "learning_rate": 3.549739130434783e-06, + "loss": 0.3611, + "step": 8340 + }, + { + "epoch": 0.5792326035118145, + "grad_norm": 1.15625, + "learning_rate": 3.548e-06, + "loss": 0.227, + "step": 8350 + }, + { + "epoch": 0.5799262952525471, + "grad_norm": 1.015625, + "learning_rate": 3.546260869565218e-06, + "loss": 0.242, + "step": 8360 + }, + { + "epoch": 0.5806199869932799, + "grad_norm": 1.09375, + "learning_rate": 3.544521739130435e-06, + "loss": 0.3021, + "step": 8370 + }, + { + "epoch": 0.5813136787340125, + "grad_norm": 1.2265625, + "learning_rate": 3.5427826086956523e-06, + "loss": 0.2542, + "step": 8380 + }, + { + "epoch": 0.5820073704747453, + "grad_norm": 1.2734375, + "learning_rate": 3.54104347826087e-06, + "loss": 0.24, + "step": 8390 + }, + { + "epoch": 0.582701062215478, + "grad_norm": 1.046875, + "learning_rate": 3.5393043478260874e-06, + "loss": 0.2421, + "step": 8400 + }, + { + "epoch": 0.5833947539562107, + "grad_norm": 0.984375, + "learning_rate": 3.5375652173913044e-06, + "loss": 0.212, + "step": 8410 + }, + { + "epoch": 0.5840884456969434, + "grad_norm": 1.0390625, + "learning_rate": 3.5358260869565218e-06, + "loss": 0.2281, + "step": 8420 + }, + { + "epoch": 0.5847821374376762, + "grad_norm": 1.40625, + "learning_rate": 3.5340869565217396e-06, + "loss": 0.2296, + "step": 8430 + }, + { + "epoch": 0.5854758291784088, + "grad_norm": 1.640625, + "learning_rate": 3.532347826086957e-06, + "loss": 0.2546, + "step": 8440 + }, + { + "epoch": 0.5861695209191415, + "grad_norm": 1.7109375, + "learning_rate": 3.530608695652174e-06, + "loss": 0.2691, + "step": 8450 + }, + { + "epoch": 0.5868632126598743, + "grad_norm": 1.5078125, + "learning_rate": 3.5288695652173917e-06, + "loss": 0.2716, + "step": 8460 + }, + { + "epoch": 0.5875569044006069, + "grad_norm": 1.3046875, + "learning_rate": 3.527130434782609e-06, + "loss": 0.3, + "step": 8470 + }, + { + "epoch": 0.5882505961413397, + "grad_norm": 0.98046875, + "learning_rate": 3.5253913043478264e-06, + "loss": 0.2811, + "step": 8480 + }, + { + "epoch": 0.5889442878820724, + "grad_norm": 1.265625, + "learning_rate": 3.5236521739130438e-06, + "loss": 0.2327, + "step": 8490 + }, + { + "epoch": 0.5896379796228051, + "grad_norm": 1.03125, + "learning_rate": 3.521913043478261e-06, + "loss": 0.2393, + "step": 8500 + }, + { + "epoch": 0.5903316713635378, + "grad_norm": 1.03125, + "learning_rate": 3.5201739130434785e-06, + "loss": 0.2633, + "step": 8510 + }, + { + "epoch": 0.5910253631042706, + "grad_norm": 1.1015625, + "learning_rate": 3.5184347826086963e-06, + "loss": 0.2459, + "step": 8520 + }, + { + "epoch": 0.5917190548450032, + "grad_norm": 1.1328125, + "learning_rate": 3.5166956521739133e-06, + "loss": 0.2063, + "step": 8530 + }, + { + "epoch": 0.592412746585736, + "grad_norm": 0.88671875, + "learning_rate": 3.5149565217391306e-06, + "loss": 0.2193, + "step": 8540 + }, + { + "epoch": 0.5931064383264687, + "grad_norm": 1.3828125, + "learning_rate": 3.5132173913043484e-06, + "loss": 0.3143, + "step": 8550 + }, + { + "epoch": 0.5938001300672013, + "grad_norm": 1.2421875, + "learning_rate": 3.511478260869566e-06, + "loss": 0.2205, + "step": 8560 + }, + { + "epoch": 0.5944938218079341, + "grad_norm": 1.2734375, + "learning_rate": 3.5097391304347828e-06, + "loss": 0.2392, + "step": 8570 + }, + { + "epoch": 0.5951875135486668, + "grad_norm": 1.421875, + "learning_rate": 3.508e-06, + "loss": 0.252, + "step": 8580 + }, + { + "epoch": 0.5958812052893995, + "grad_norm": 1.4296875, + "learning_rate": 3.506260869565218e-06, + "loss": 0.318, + "step": 8590 + }, + { + "epoch": 0.5965748970301322, + "grad_norm": 1.078125, + "learning_rate": 3.504521739130435e-06, + "loss": 0.2799, + "step": 8600 + }, + { + "epoch": 0.597268588770865, + "grad_norm": 1.25, + "learning_rate": 3.5027826086956522e-06, + "loss": 0.2301, + "step": 8610 + }, + { + "epoch": 0.5979622805115976, + "grad_norm": 1.28125, + "learning_rate": 3.50104347826087e-06, + "loss": 0.2516, + "step": 8620 + }, + { + "epoch": 0.5986559722523304, + "grad_norm": 1.375, + "learning_rate": 3.4993043478260874e-06, + "loss": 0.2338, + "step": 8630 + }, + { + "epoch": 0.5993496639930631, + "grad_norm": 1.09375, + "learning_rate": 3.4975652173913044e-06, + "loss": 0.2511, + "step": 8640 + }, + { + "epoch": 0.6000433557337957, + "grad_norm": 1.15625, + "learning_rate": 3.4958260869565217e-06, + "loss": 0.2401, + "step": 8650 + }, + { + "epoch": 0.6007370474745285, + "grad_norm": 1.09375, + "learning_rate": 3.4940869565217395e-06, + "loss": 0.2961, + "step": 8660 + }, + { + "epoch": 0.6014307392152612, + "grad_norm": 1.1953125, + "learning_rate": 3.492347826086957e-06, + "loss": 0.2639, + "step": 8670 + }, + { + "epoch": 0.6021244309559939, + "grad_norm": 1.2578125, + "learning_rate": 3.490608695652174e-06, + "loss": 0.2283, + "step": 8680 + }, + { + "epoch": 0.6028181226967266, + "grad_norm": 1.0703125, + "learning_rate": 3.4888695652173916e-06, + "loss": 0.2404, + "step": 8690 + }, + { + "epoch": 0.6035118144374594, + "grad_norm": 1.375, + "learning_rate": 3.487130434782609e-06, + "loss": 0.2559, + "step": 8700 + }, + { + "epoch": 0.604205506178192, + "grad_norm": 1.0390625, + "learning_rate": 3.4853913043478264e-06, + "loss": 0.24, + "step": 8710 + }, + { + "epoch": 0.6048991979189248, + "grad_norm": 1.078125, + "learning_rate": 3.4836521739130437e-06, + "loss": 0.2297, + "step": 8720 + }, + { + "epoch": 0.6055928896596575, + "grad_norm": 1.296875, + "learning_rate": 3.481913043478261e-06, + "loss": 0.2517, + "step": 8730 + }, + { + "epoch": 0.6062865814003902, + "grad_norm": 1.1875, + "learning_rate": 3.4801739130434785e-06, + "loss": 0.2096, + "step": 8740 + }, + { + "epoch": 0.6069802731411229, + "grad_norm": 1.4609375, + "learning_rate": 3.4784347826086963e-06, + "loss": 0.2116, + "step": 8750 + }, + { + "epoch": 0.6076739648818557, + "grad_norm": 0.79296875, + "learning_rate": 3.4766956521739132e-06, + "loss": 0.1944, + "step": 8760 + }, + { + "epoch": 0.6083676566225883, + "grad_norm": 1.3125, + "learning_rate": 3.4749565217391306e-06, + "loss": 0.2687, + "step": 8770 + }, + { + "epoch": 0.609061348363321, + "grad_norm": 1.2421875, + "learning_rate": 3.4732173913043484e-06, + "loss": 0.283, + "step": 8780 + }, + { + "epoch": 0.6097550401040538, + "grad_norm": 1.3359375, + "learning_rate": 3.4714782608695658e-06, + "loss": 0.2059, + "step": 8790 + }, + { + "epoch": 0.6104487318447864, + "grad_norm": 0.9765625, + "learning_rate": 3.4697391304347827e-06, + "loss": 0.3626, + "step": 8800 + }, + { + "epoch": 0.6111424235855192, + "grad_norm": 1.2578125, + "learning_rate": 3.468e-06, + "loss": 0.2557, + "step": 8810 + }, + { + "epoch": 0.6118361153262519, + "grad_norm": 1.3203125, + "learning_rate": 3.466260869565218e-06, + "loss": 0.2391, + "step": 8820 + }, + { + "epoch": 0.6125298070669846, + "grad_norm": 1.265625, + "learning_rate": 3.4645217391304353e-06, + "loss": 0.296, + "step": 8830 + }, + { + "epoch": 0.6132234988077173, + "grad_norm": 1.046875, + "learning_rate": 3.462782608695652e-06, + "loss": 0.2482, + "step": 8840 + }, + { + "epoch": 0.6139171905484501, + "grad_norm": 1.359375, + "learning_rate": 3.46104347826087e-06, + "loss": 0.2761, + "step": 8850 + }, + { + "epoch": 0.6146108822891827, + "grad_norm": 1.2890625, + "learning_rate": 3.4593043478260874e-06, + "loss": 0.2493, + "step": 8860 + }, + { + "epoch": 0.6153045740299155, + "grad_norm": 1.1328125, + "learning_rate": 3.4575652173913047e-06, + "loss": 0.2301, + "step": 8870 + }, + { + "epoch": 0.6159982657706482, + "grad_norm": 0.99609375, + "learning_rate": 3.4558260869565217e-06, + "loss": 0.2608, + "step": 8880 + }, + { + "epoch": 0.6166919575113808, + "grad_norm": 0.92578125, + "learning_rate": 3.4540869565217395e-06, + "loss": 0.2481, + "step": 8890 + }, + { + "epoch": 0.6173856492521136, + "grad_norm": 1.4140625, + "learning_rate": 3.452347826086957e-06, + "loss": 0.2292, + "step": 8900 + }, + { + "epoch": 0.6180793409928463, + "grad_norm": 1.1015625, + "learning_rate": 3.450608695652174e-06, + "loss": 0.2091, + "step": 8910 + }, + { + "epoch": 0.618773032733579, + "grad_norm": 1.2421875, + "learning_rate": 3.4488695652173916e-06, + "loss": 0.2997, + "step": 8920 + }, + { + "epoch": 0.6194667244743117, + "grad_norm": 1.453125, + "learning_rate": 3.447130434782609e-06, + "loss": 0.3502, + "step": 8930 + }, + { + "epoch": 0.6201604162150445, + "grad_norm": 1.7578125, + "learning_rate": 3.4453913043478263e-06, + "loss": 0.313, + "step": 8940 + }, + { + "epoch": 0.6208541079557771, + "grad_norm": 1.2109375, + "learning_rate": 3.4436521739130437e-06, + "loss": 0.2215, + "step": 8950 + }, + { + "epoch": 0.6215477996965099, + "grad_norm": 1.3203125, + "learning_rate": 3.441913043478261e-06, + "loss": 0.2398, + "step": 8960 + }, + { + "epoch": 0.6222414914372426, + "grad_norm": 1.046875, + "learning_rate": 3.4401739130434784e-06, + "loss": 0.2002, + "step": 8970 + }, + { + "epoch": 0.6229351831779752, + "grad_norm": 1.484375, + "learning_rate": 3.4384347826086962e-06, + "loss": 0.2549, + "step": 8980 + }, + { + "epoch": 0.623628874918708, + "grad_norm": 1.1328125, + "learning_rate": 3.436695652173913e-06, + "loss": 0.2009, + "step": 8990 + }, + { + "epoch": 0.6243225666594407, + "grad_norm": 0.9375, + "learning_rate": 3.4349565217391306e-06, + "loss": 0.22, + "step": 9000 + }, + { + "epoch": 0.6250162584001734, + "grad_norm": 1.1875, + "learning_rate": 3.4332173913043484e-06, + "loss": 0.2436, + "step": 9010 + }, + { + "epoch": 0.6257099501409061, + "grad_norm": 0.96484375, + "learning_rate": 3.4314782608695657e-06, + "loss": 0.2411, + "step": 9020 + }, + { + "epoch": 0.6264036418816389, + "grad_norm": 1.15625, + "learning_rate": 3.4297391304347827e-06, + "loss": 0.2155, + "step": 9030 + }, + { + "epoch": 0.6270973336223715, + "grad_norm": 1.765625, + "learning_rate": 3.428e-06, + "loss": 0.26, + "step": 9040 + }, + { + "epoch": 0.6277910253631043, + "grad_norm": 1.296875, + "learning_rate": 3.426260869565218e-06, + "loss": 0.2602, + "step": 9050 + }, + { + "epoch": 0.628484717103837, + "grad_norm": 1.0859375, + "learning_rate": 3.424521739130435e-06, + "loss": 0.2459, + "step": 9060 + }, + { + "epoch": 0.6291784088445697, + "grad_norm": 1.515625, + "learning_rate": 3.422782608695652e-06, + "loss": 0.3279, + "step": 9070 + }, + { + "epoch": 0.6298721005853024, + "grad_norm": 1.09375, + "learning_rate": 3.42104347826087e-06, + "loss": 0.2208, + "step": 9080 + }, + { + "epoch": 0.6305657923260352, + "grad_norm": 1.046875, + "learning_rate": 3.4193043478260873e-06, + "loss": 0.2717, + "step": 9090 + }, + { + "epoch": 0.6312594840667678, + "grad_norm": 1.0546875, + "learning_rate": 3.4175652173913047e-06, + "loss": 0.2052, + "step": 9100 + }, + { + "epoch": 0.6319531758075005, + "grad_norm": 1.1171875, + "learning_rate": 3.4158260869565216e-06, + "loss": 0.2791, + "step": 9110 + }, + { + "epoch": 0.6326468675482333, + "grad_norm": 1.109375, + "learning_rate": 3.4140869565217394e-06, + "loss": 0.2904, + "step": 9120 + }, + { + "epoch": 0.6333405592889659, + "grad_norm": 1.046875, + "learning_rate": 3.412347826086957e-06, + "loss": 0.2522, + "step": 9130 + }, + { + "epoch": 0.6340342510296987, + "grad_norm": 1.4140625, + "learning_rate": 3.4106086956521746e-06, + "loss": 0.2051, + "step": 9140 + }, + { + "epoch": 0.6347279427704314, + "grad_norm": 1.515625, + "learning_rate": 3.4088695652173915e-06, + "loss": 0.2321, + "step": 9150 + }, + { + "epoch": 0.6354216345111641, + "grad_norm": 1.203125, + "learning_rate": 3.407130434782609e-06, + "loss": 0.2734, + "step": 9160 + }, + { + "epoch": 0.6361153262518968, + "grad_norm": 1.1796875, + "learning_rate": 3.4053913043478263e-06, + "loss": 0.2302, + "step": 9170 + }, + { + "epoch": 0.6368090179926296, + "grad_norm": 1.40625, + "learning_rate": 3.403652173913044e-06, + "loss": 0.2541, + "step": 9180 + }, + { + "epoch": 0.6375027097333622, + "grad_norm": 1.5, + "learning_rate": 3.401913043478261e-06, + "loss": 0.2151, + "step": 9190 + }, + { + "epoch": 0.638196401474095, + "grad_norm": 1.2421875, + "learning_rate": 3.4001739130434784e-06, + "loss": 0.2795, + "step": 9200 + }, + { + "epoch": 0.6388900932148277, + "grad_norm": 1.2734375, + "learning_rate": 3.398434782608696e-06, + "loss": 0.2498, + "step": 9210 + }, + { + "epoch": 0.6395837849555603, + "grad_norm": 2.0625, + "learning_rate": 3.396695652173913e-06, + "loss": 0.2836, + "step": 9220 + }, + { + "epoch": 0.6402774766962931, + "grad_norm": 1.125, + "learning_rate": 3.3949565217391305e-06, + "loss": 0.2346, + "step": 9230 + }, + { + "epoch": 0.6409711684370258, + "grad_norm": 1.03125, + "learning_rate": 3.3932173913043483e-06, + "loss": 0.2447, + "step": 9240 + }, + { + "epoch": 0.6416648601777585, + "grad_norm": 1.2109375, + "learning_rate": 3.3914782608695657e-06, + "loss": 0.2775, + "step": 9250 + }, + { + "epoch": 0.6423585519184912, + "grad_norm": 1.4296875, + "learning_rate": 3.3897391304347826e-06, + "loss": 0.2225, + "step": 9260 + }, + { + "epoch": 0.643052243659224, + "grad_norm": 1.4609375, + "learning_rate": 3.388e-06, + "loss": 0.2327, + "step": 9270 + }, + { + "epoch": 0.6437459353999566, + "grad_norm": 1.078125, + "learning_rate": 3.386260869565218e-06, + "loss": 0.2091, + "step": 9280 + }, + { + "epoch": 0.6444396271406894, + "grad_norm": 1.484375, + "learning_rate": 3.384521739130435e-06, + "loss": 0.244, + "step": 9290 + }, + { + "epoch": 0.6451333188814221, + "grad_norm": 1.25, + "learning_rate": 3.382782608695652e-06, + "loss": 0.2169, + "step": 9300 + }, + { + "epoch": 0.6458270106221548, + "grad_norm": 1.4296875, + "learning_rate": 3.38104347826087e-06, + "loss": 0.3279, + "step": 9310 + }, + { + "epoch": 0.6465207023628875, + "grad_norm": 1.046875, + "learning_rate": 3.3793043478260873e-06, + "loss": 0.2176, + "step": 9320 + }, + { + "epoch": 0.6472143941036202, + "grad_norm": 1.96875, + "learning_rate": 3.3775652173913047e-06, + "loss": 0.315, + "step": 9330 + }, + { + "epoch": 0.6479080858443529, + "grad_norm": 1.2421875, + "learning_rate": 3.3758260869565216e-06, + "loss": 0.2778, + "step": 9340 + }, + { + "epoch": 0.6486017775850856, + "grad_norm": 0.92578125, + "learning_rate": 3.3740869565217394e-06, + "loss": 0.2681, + "step": 9350 + }, + { + "epoch": 0.6492954693258184, + "grad_norm": 1.078125, + "learning_rate": 3.3723478260869568e-06, + "loss": 0.2078, + "step": 9360 + }, + { + "epoch": 0.649989161066551, + "grad_norm": 1.8984375, + "learning_rate": 3.3706086956521746e-06, + "loss": 0.3201, + "step": 9370 + }, + { + "epoch": 0.6506828528072838, + "grad_norm": 1.078125, + "learning_rate": 3.3688695652173915e-06, + "loss": 0.2226, + "step": 9380 + }, + { + "epoch": 0.6513765445480165, + "grad_norm": 1.171875, + "learning_rate": 3.367130434782609e-06, + "loss": 0.2847, + "step": 9390 + }, + { + "epoch": 0.6520702362887492, + "grad_norm": 0.99609375, + "learning_rate": 3.3653913043478263e-06, + "loss": 0.2418, + "step": 9400 + }, + { + "epoch": 0.6527639280294819, + "grad_norm": 2.078125, + "learning_rate": 3.363652173913044e-06, + "loss": 0.2898, + "step": 9410 + }, + { + "epoch": 0.6534576197702147, + "grad_norm": 1.109375, + "learning_rate": 3.361913043478261e-06, + "loss": 0.2522, + "step": 9420 + }, + { + "epoch": 0.6541513115109473, + "grad_norm": 1.203125, + "learning_rate": 3.3601739130434784e-06, + "loss": 0.2297, + "step": 9430 + }, + { + "epoch": 0.65484500325168, + "grad_norm": 1.1484375, + "learning_rate": 3.358434782608696e-06, + "loss": 0.2395, + "step": 9440 + }, + { + "epoch": 0.6555386949924128, + "grad_norm": 1.4453125, + "learning_rate": 3.3566956521739135e-06, + "loss": 0.2372, + "step": 9450 + }, + { + "epoch": 0.6562323867331454, + "grad_norm": 1.265625, + "learning_rate": 3.3549565217391305e-06, + "loss": 0.2412, + "step": 9460 + }, + { + "epoch": 0.6569260784738782, + "grad_norm": 1.28125, + "learning_rate": 3.3532173913043483e-06, + "loss": 0.2586, + "step": 9470 + }, + { + "epoch": 0.6576197702146109, + "grad_norm": 1.5703125, + "learning_rate": 3.3514782608695656e-06, + "loss": 0.269, + "step": 9480 + }, + { + "epoch": 0.6583134619553436, + "grad_norm": 1.1796875, + "learning_rate": 3.349739130434783e-06, + "loss": 0.2298, + "step": 9490 + }, + { + "epoch": 0.6590071536960763, + "grad_norm": 0.9609375, + "learning_rate": 3.348e-06, + "loss": 0.2737, + "step": 9500 + }, + { + "epoch": 0.6597008454368091, + "grad_norm": 1.4453125, + "learning_rate": 3.3462608695652178e-06, + "loss": 0.2729, + "step": 9510 + }, + { + "epoch": 0.6603945371775417, + "grad_norm": 1.0625, + "learning_rate": 3.344521739130435e-06, + "loss": 0.2285, + "step": 9520 + }, + { + "epoch": 0.6610882289182745, + "grad_norm": 1.0390625, + "learning_rate": 3.342782608695652e-06, + "loss": 0.2581, + "step": 9530 + }, + { + "epoch": 0.6617819206590072, + "grad_norm": 1.21875, + "learning_rate": 3.34104347826087e-06, + "loss": 0.3278, + "step": 9540 + }, + { + "epoch": 0.6624756123997398, + "grad_norm": 0.9765625, + "learning_rate": 3.3393043478260872e-06, + "loss": 0.2526, + "step": 9550 + }, + { + "epoch": 0.6631693041404726, + "grad_norm": 1.265625, + "learning_rate": 3.3375652173913046e-06, + "loss": 0.2401, + "step": 9560 + }, + { + "epoch": 0.6638629958812053, + "grad_norm": 0.9765625, + "learning_rate": 3.3358260869565216e-06, + "loss": 0.2359, + "step": 9570 + }, + { + "epoch": 0.664556687621938, + "grad_norm": 1.2265625, + "learning_rate": 3.3340869565217394e-06, + "loss": 0.2697, + "step": 9580 + }, + { + "epoch": 0.6652503793626707, + "grad_norm": 1.3203125, + "learning_rate": 3.3323478260869567e-06, + "loss": 0.2191, + "step": 9590 + }, + { + "epoch": 0.6659440711034035, + "grad_norm": 1.0859375, + "learning_rate": 3.3306086956521745e-06, + "loss": 0.2142, + "step": 9600 + }, + { + "epoch": 0.6666377628441361, + "grad_norm": 1.2578125, + "learning_rate": 3.3288695652173915e-06, + "loss": 0.2708, + "step": 9610 + }, + { + "epoch": 0.6673314545848689, + "grad_norm": 1.265625, + "learning_rate": 3.327130434782609e-06, + "loss": 0.2285, + "step": 9620 + }, + { + "epoch": 0.6680251463256016, + "grad_norm": 1.4609375, + "learning_rate": 3.325391304347826e-06, + "loss": 0.2216, + "step": 9630 + }, + { + "epoch": 0.6687188380663343, + "grad_norm": 1.1796875, + "learning_rate": 3.323652173913044e-06, + "loss": 0.2208, + "step": 9640 + }, + { + "epoch": 0.669412529807067, + "grad_norm": 0.86328125, + "learning_rate": 3.321913043478261e-06, + "loss": 0.1785, + "step": 9650 + }, + { + "epoch": 0.6701062215477998, + "grad_norm": 1.0390625, + "learning_rate": 3.3201739130434783e-06, + "loss": 0.2453, + "step": 9660 + }, + { + "epoch": 0.6707999132885324, + "grad_norm": 1.09375, + "learning_rate": 3.318434782608696e-06, + "loss": 0.2761, + "step": 9670 + }, + { + "epoch": 0.6714936050292651, + "grad_norm": 1.1015625, + "learning_rate": 3.3166956521739135e-06, + "loss": 0.2701, + "step": 9680 + }, + { + "epoch": 0.6721872967699978, + "grad_norm": 1.5546875, + "learning_rate": 3.3149565217391304e-06, + "loss": 0.258, + "step": 9690 + }, + { + "epoch": 0.6728809885107305, + "grad_norm": 1.21875, + "learning_rate": 3.3132173913043482e-06, + "loss": 0.2648, + "step": 9700 + }, + { + "epoch": 0.6735746802514633, + "grad_norm": 1.3125, + "learning_rate": 3.3114782608695656e-06, + "loss": 0.2885, + "step": 9710 + }, + { + "epoch": 0.6742683719921959, + "grad_norm": 1.234375, + "learning_rate": 3.309739130434783e-06, + "loss": 0.264, + "step": 9720 + }, + { + "epoch": 0.6749620637329287, + "grad_norm": 1.09375, + "learning_rate": 3.308e-06, + "loss": 0.2387, + "step": 9730 + }, + { + "epoch": 0.6756557554736614, + "grad_norm": 1.375, + "learning_rate": 3.3062608695652177e-06, + "loss": 0.3259, + "step": 9740 + }, + { + "epoch": 0.676349447214394, + "grad_norm": 1.421875, + "learning_rate": 3.304521739130435e-06, + "loss": 0.251, + "step": 9750 + }, + { + "epoch": 0.6770431389551268, + "grad_norm": 1.0859375, + "learning_rate": 3.302782608695653e-06, + "loss": 0.2134, + "step": 9760 + }, + { + "epoch": 0.6777368306958595, + "grad_norm": 1.1171875, + "learning_rate": 3.30104347826087e-06, + "loss": 0.2788, + "step": 9770 + }, + { + "epoch": 0.6784305224365922, + "grad_norm": 1.265625, + "learning_rate": 3.299304347826087e-06, + "loss": 0.2509, + "step": 9780 + }, + { + "epoch": 0.6791242141773249, + "grad_norm": 1.3359375, + "learning_rate": 3.2975652173913046e-06, + "loss": 0.2613, + "step": 9790 + }, + { + "epoch": 0.6798179059180577, + "grad_norm": 1.1328125, + "learning_rate": 3.2958260869565224e-06, + "loss": 0.2278, + "step": 9800 + }, + { + "epoch": 0.6805115976587903, + "grad_norm": 1.0859375, + "learning_rate": 3.2940869565217393e-06, + "loss": 0.2223, + "step": 9810 + }, + { + "epoch": 0.6812052893995231, + "grad_norm": 1.484375, + "learning_rate": 3.2923478260869567e-06, + "loss": 0.2233, + "step": 9820 + }, + { + "epoch": 0.6818989811402558, + "grad_norm": 1.234375, + "learning_rate": 3.2906086956521745e-06, + "loss": 0.2087, + "step": 9830 + }, + { + "epoch": 0.6825926728809885, + "grad_norm": 1.125, + "learning_rate": 3.2888695652173914e-06, + "loss": 0.2244, + "step": 9840 + }, + { + "epoch": 0.6832863646217212, + "grad_norm": 1.03125, + "learning_rate": 3.287130434782609e-06, + "loss": 0.2528, + "step": 9850 + }, + { + "epoch": 0.683980056362454, + "grad_norm": 1.3203125, + "learning_rate": 3.2853913043478266e-06, + "loss": 0.2369, + "step": 9860 + }, + { + "epoch": 0.6846737481031866, + "grad_norm": 1.1875, + "learning_rate": 3.283652173913044e-06, + "loss": 0.2412, + "step": 9870 + }, + { + "epoch": 0.6853674398439193, + "grad_norm": 1.9296875, + "learning_rate": 3.281913043478261e-06, + "loss": 0.2932, + "step": 9880 + }, + { + "epoch": 0.6860611315846521, + "grad_norm": 1.4296875, + "learning_rate": 3.2801739130434783e-06, + "loss": 0.2391, + "step": 9890 + }, + { + "epoch": 0.6867548233253847, + "grad_norm": 1.015625, + "learning_rate": 3.278434782608696e-06, + "loss": 0.2181, + "step": 9900 + }, + { + "epoch": 0.6874485150661175, + "grad_norm": 0.890625, + "learning_rate": 3.2766956521739134e-06, + "loss": 0.2253, + "step": 9910 + }, + { + "epoch": 0.6881422068068502, + "grad_norm": 1.46875, + "learning_rate": 3.2749565217391304e-06, + "loss": 0.2034, + "step": 9920 + }, + { + "epoch": 0.6888358985475829, + "grad_norm": 1.265625, + "learning_rate": 3.273217391304348e-06, + "loss": 0.2552, + "step": 9930 + }, + { + "epoch": 0.6895295902883156, + "grad_norm": 1.6171875, + "learning_rate": 3.2714782608695656e-06, + "loss": 0.3157, + "step": 9940 + }, + { + "epoch": 0.6902232820290484, + "grad_norm": 0.96875, + "learning_rate": 3.269739130434783e-06, + "loss": 0.218, + "step": 9950 + }, + { + "epoch": 0.690916973769781, + "grad_norm": 1.2265625, + "learning_rate": 3.268e-06, + "loss": 0.2232, + "step": 9960 + }, + { + "epoch": 0.6916106655105138, + "grad_norm": 1.296875, + "learning_rate": 3.2662608695652177e-06, + "loss": 0.2971, + "step": 9970 + }, + { + "epoch": 0.6923043572512465, + "grad_norm": 1.1953125, + "learning_rate": 3.264521739130435e-06, + "loss": 0.2534, + "step": 9980 + }, + { + "epoch": 0.6929980489919791, + "grad_norm": 1.1328125, + "learning_rate": 3.262782608695653e-06, + "loss": 0.2167, + "step": 9990 + }, + { + "epoch": 0.6936917407327119, + "grad_norm": 0.98828125, + "learning_rate": 3.26104347826087e-06, + "loss": 0.2506, + "step": 10000 + }, + { + "epoch": 0.6943854324734446, + "grad_norm": 1.1484375, + "learning_rate": 3.259304347826087e-06, + "loss": 0.2381, + "step": 10010 + }, + { + "epoch": 0.6950791242141773, + "grad_norm": 1.3515625, + "learning_rate": 3.2575652173913045e-06, + "loss": 0.2228, + "step": 10020 + }, + { + "epoch": 0.69577281595491, + "grad_norm": 1.0078125, + "learning_rate": 3.2558260869565223e-06, + "loss": 0.299, + "step": 10030 + }, + { + "epoch": 0.6964665076956428, + "grad_norm": 1.125, + "learning_rate": 3.2540869565217393e-06, + "loss": 0.2691, + "step": 10040 + }, + { + "epoch": 0.6971601994363754, + "grad_norm": 1.0625, + "learning_rate": 3.2523478260869566e-06, + "loss": 0.2412, + "step": 10050 + }, + { + "epoch": 0.6978538911771082, + "grad_norm": 1.453125, + "learning_rate": 3.2506086956521744e-06, + "loss": 0.2672, + "step": 10060 + }, + { + "epoch": 0.6985475829178409, + "grad_norm": 1.2734375, + "learning_rate": 3.248869565217392e-06, + "loss": 0.2148, + "step": 10070 + }, + { + "epoch": 0.6992412746585736, + "grad_norm": 1.28125, + "learning_rate": 3.2471304347826088e-06, + "loss": 0.246, + "step": 10080 + }, + { + "epoch": 0.6999349663993063, + "grad_norm": 0.8984375, + "learning_rate": 3.2453913043478266e-06, + "loss": 0.2934, + "step": 10090 + }, + { + "epoch": 0.700628658140039, + "grad_norm": 1.1875, + "learning_rate": 3.243652173913044e-06, + "loss": 0.2409, + "step": 10100 + }, + { + "epoch": 0.7013223498807717, + "grad_norm": 1.046875, + "learning_rate": 3.241913043478261e-06, + "loss": 0.2713, + "step": 10110 + }, + { + "epoch": 0.7020160416215044, + "grad_norm": 1.15625, + "learning_rate": 3.2401739130434782e-06, + "loss": 0.2528, + "step": 10120 + }, + { + "epoch": 0.7027097333622372, + "grad_norm": 1.21875, + "learning_rate": 3.238434782608696e-06, + "loss": 0.2619, + "step": 10130 + }, + { + "epoch": 0.7034034251029698, + "grad_norm": 1.1875, + "learning_rate": 3.2366956521739134e-06, + "loss": 0.2569, + "step": 10140 + }, + { + "epoch": 0.7040971168437026, + "grad_norm": 1.2265625, + "learning_rate": 3.2349565217391304e-06, + "loss": 0.2033, + "step": 10150 + }, + { + "epoch": 0.7047908085844353, + "grad_norm": 1.4765625, + "learning_rate": 3.233217391304348e-06, + "loss": 0.2447, + "step": 10160 + }, + { + "epoch": 0.705484500325168, + "grad_norm": 0.83203125, + "learning_rate": 3.2314782608695655e-06, + "loss": 0.2063, + "step": 10170 + }, + { + "epoch": 0.7061781920659007, + "grad_norm": 1.234375, + "learning_rate": 3.229739130434783e-06, + "loss": 0.235, + "step": 10180 + }, + { + "epoch": 0.7068718838066335, + "grad_norm": 1.375, + "learning_rate": 3.228e-06, + "loss": 0.2407, + "step": 10190 + }, + { + "epoch": 0.7075655755473661, + "grad_norm": 1.2421875, + "learning_rate": 3.2262608695652176e-06, + "loss": 0.2973, + "step": 10200 + }, + { + "epoch": 0.7082592672880988, + "grad_norm": 1.34375, + "learning_rate": 3.224521739130435e-06, + "loss": 0.2291, + "step": 10210 + }, + { + "epoch": 0.7089529590288316, + "grad_norm": 1.1328125, + "learning_rate": 3.222782608695653e-06, + "loss": 0.2552, + "step": 10220 + }, + { + "epoch": 0.7096466507695642, + "grad_norm": 1.1953125, + "learning_rate": 3.2210434782608697e-06, + "loss": 0.2343, + "step": 10230 + }, + { + "epoch": 0.710340342510297, + "grad_norm": 1.265625, + "learning_rate": 3.219304347826087e-06, + "loss": 0.2158, + "step": 10240 + }, + { + "epoch": 0.7110340342510297, + "grad_norm": 1.0703125, + "learning_rate": 3.2175652173913045e-06, + "loss": 0.2512, + "step": 10250 + }, + { + "epoch": 0.7117277259917624, + "grad_norm": 0.890625, + "learning_rate": 3.2158260869565223e-06, + "loss": 0.2346, + "step": 10260 + }, + { + "epoch": 0.7124214177324951, + "grad_norm": 1.0078125, + "learning_rate": 3.2140869565217392e-06, + "loss": 0.2133, + "step": 10270 + }, + { + "epoch": 0.7131151094732279, + "grad_norm": 1.15625, + "learning_rate": 3.2123478260869566e-06, + "loss": 0.2386, + "step": 10280 + }, + { + "epoch": 0.7138088012139605, + "grad_norm": 1.109375, + "learning_rate": 3.2106086956521744e-06, + "loss": 0.1876, + "step": 10290 + }, + { + "epoch": 0.7145024929546933, + "grad_norm": 0.953125, + "learning_rate": 3.2088695652173918e-06, + "loss": 0.2198, + "step": 10300 + }, + { + "epoch": 0.715196184695426, + "grad_norm": 0.9453125, + "learning_rate": 3.2071304347826087e-06, + "loss": 0.218, + "step": 10310 + }, + { + "epoch": 0.7158898764361586, + "grad_norm": 0.97265625, + "learning_rate": 3.2053913043478265e-06, + "loss": 0.2749, + "step": 10320 + }, + { + "epoch": 0.7165835681768914, + "grad_norm": 0.99609375, + "learning_rate": 3.203652173913044e-06, + "loss": 0.2199, + "step": 10330 + }, + { + "epoch": 0.7172772599176241, + "grad_norm": 0.9921875, + "learning_rate": 3.2019130434782613e-06, + "loss": 0.2523, + "step": 10340 + }, + { + "epoch": 0.7179709516583568, + "grad_norm": 0.98828125, + "learning_rate": 3.200173913043478e-06, + "loss": 0.2313, + "step": 10350 + }, + { + "epoch": 0.7186646433990895, + "grad_norm": 1.25, + "learning_rate": 3.198434782608696e-06, + "loss": 0.3035, + "step": 10360 + }, + { + "epoch": 0.7193583351398223, + "grad_norm": 1.6015625, + "learning_rate": 3.1966956521739134e-06, + "loss": 0.3116, + "step": 10370 + }, + { + "epoch": 0.7200520268805549, + "grad_norm": 1.21875, + "learning_rate": 3.194956521739131e-06, + "loss": 0.2449, + "step": 10380 + }, + { + "epoch": 0.7207457186212877, + "grad_norm": 1.796875, + "learning_rate": 3.193217391304348e-06, + "loss": 0.3198, + "step": 10390 + }, + { + "epoch": 0.7214394103620204, + "grad_norm": 1.25, + "learning_rate": 3.1914782608695655e-06, + "loss": 0.2539, + "step": 10400 + }, + { + "epoch": 0.722133102102753, + "grad_norm": 1.125, + "learning_rate": 3.189739130434783e-06, + "loss": 0.3254, + "step": 10410 + }, + { + "epoch": 0.7228267938434858, + "grad_norm": 1.375, + "learning_rate": 3.188e-06, + "loss": 0.2163, + "step": 10420 + }, + { + "epoch": 0.7235204855842186, + "grad_norm": 1.2109375, + "learning_rate": 3.1862608695652176e-06, + "loss": 0.253, + "step": 10430 + }, + { + "epoch": 0.7242141773249512, + "grad_norm": 1.0234375, + "learning_rate": 3.184521739130435e-06, + "loss": 0.2237, + "step": 10440 + }, + { + "epoch": 0.7249078690656839, + "grad_norm": 1.234375, + "learning_rate": 3.1827826086956528e-06, + "loss": 0.2336, + "step": 10450 + }, + { + "epoch": 0.7256015608064167, + "grad_norm": 1.046875, + "learning_rate": 3.1810434782608697e-06, + "loss": 0.2501, + "step": 10460 + }, + { + "epoch": 0.7262952525471493, + "grad_norm": 1.3515625, + "learning_rate": 3.179304347826087e-06, + "loss": 0.2205, + "step": 10470 + }, + { + "epoch": 0.7269889442878821, + "grad_norm": 1.515625, + "learning_rate": 3.1775652173913045e-06, + "loss": 0.2509, + "step": 10480 + }, + { + "epoch": 0.7276826360286148, + "grad_norm": 1.109375, + "learning_rate": 3.1758260869565222e-06, + "loss": 0.2539, + "step": 10490 + }, + { + "epoch": 0.7283763277693475, + "grad_norm": 1.0703125, + "learning_rate": 3.174086956521739e-06, + "loss": 0.2217, + "step": 10500 + }, + { + "epoch": 0.7290700195100802, + "grad_norm": 1.4375, + "learning_rate": 3.1723478260869566e-06, + "loss": 0.26, + "step": 10510 + }, + { + "epoch": 0.729763711250813, + "grad_norm": 1.0703125, + "learning_rate": 3.1706086956521744e-06, + "loss": 0.2377, + "step": 10520 + }, + { + "epoch": 0.7304574029915456, + "grad_norm": 1.265625, + "learning_rate": 3.1688695652173917e-06, + "loss": 0.2349, + "step": 10530 + }, + { + "epoch": 0.7311510947322784, + "grad_norm": 0.953125, + "learning_rate": 3.1671304347826087e-06, + "loss": 0.2118, + "step": 10540 + }, + { + "epoch": 0.7318447864730111, + "grad_norm": 1.328125, + "learning_rate": 3.1653913043478265e-06, + "loss": 0.2557, + "step": 10550 + }, + { + "epoch": 0.7325384782137437, + "grad_norm": 1.125, + "learning_rate": 3.163652173913044e-06, + "loss": 0.272, + "step": 10560 + }, + { + "epoch": 0.7332321699544765, + "grad_norm": 1.078125, + "learning_rate": 3.1619130434782612e-06, + "loss": 0.2569, + "step": 10570 + }, + { + "epoch": 0.7339258616952092, + "grad_norm": 0.77734375, + "learning_rate": 3.160173913043478e-06, + "loss": 0.2273, + "step": 10580 + }, + { + "epoch": 0.7346195534359419, + "grad_norm": 1.109375, + "learning_rate": 3.158434782608696e-06, + "loss": 0.2184, + "step": 10590 + }, + { + "epoch": 0.7353132451766746, + "grad_norm": 1.15625, + "learning_rate": 3.1566956521739133e-06, + "loss": 0.2226, + "step": 10600 + }, + { + "epoch": 0.7360069369174074, + "grad_norm": 0.8984375, + "learning_rate": 3.154956521739131e-06, + "loss": 0.2256, + "step": 10610 + }, + { + "epoch": 0.73670062865814, + "grad_norm": 1.578125, + "learning_rate": 3.153217391304348e-06, + "loss": 0.2162, + "step": 10620 + }, + { + "epoch": 0.7373943203988728, + "grad_norm": 1.15625, + "learning_rate": 3.1514782608695654e-06, + "loss": 0.2247, + "step": 10630 + }, + { + "epoch": 0.7380880121396055, + "grad_norm": 1.6015625, + "learning_rate": 3.149739130434783e-06, + "loss": 0.2441, + "step": 10640 + }, + { + "epoch": 0.7387817038803381, + "grad_norm": 1.265625, + "learning_rate": 3.1480000000000006e-06, + "loss": 0.2476, + "step": 10650 + }, + { + "epoch": 0.7394753956210709, + "grad_norm": 1.1953125, + "learning_rate": 3.1462608695652176e-06, + "loss": 0.301, + "step": 10660 + }, + { + "epoch": 0.7401690873618036, + "grad_norm": 1.296875, + "learning_rate": 3.144521739130435e-06, + "loss": 0.267, + "step": 10670 + }, + { + "epoch": 0.7408627791025363, + "grad_norm": 1.1328125, + "learning_rate": 3.1427826086956527e-06, + "loss": 0.2188, + "step": 10680 + }, + { + "epoch": 0.741556470843269, + "grad_norm": 1.1875, + "learning_rate": 3.14104347826087e-06, + "loss": 0.2561, + "step": 10690 + }, + { + "epoch": 0.7422501625840018, + "grad_norm": 1.171875, + "learning_rate": 3.139304347826087e-06, + "loss": 0.2115, + "step": 10700 + }, + { + "epoch": 0.7429438543247344, + "grad_norm": 1.40625, + "learning_rate": 3.1375652173913044e-06, + "loss": 0.2606, + "step": 10710 + }, + { + "epoch": 0.7436375460654672, + "grad_norm": 1.0625, + "learning_rate": 3.135826086956522e-06, + "loss": 0.3279, + "step": 10720 + }, + { + "epoch": 0.7443312378061999, + "grad_norm": 1.59375, + "learning_rate": 3.134086956521739e-06, + "loss": 0.2229, + "step": 10730 + }, + { + "epoch": 0.7450249295469326, + "grad_norm": 1.546875, + "learning_rate": 3.1323478260869565e-06, + "loss": 0.3244, + "step": 10740 + }, + { + "epoch": 0.7457186212876653, + "grad_norm": 1.28125, + "learning_rate": 3.1306086956521743e-06, + "loss": 0.2381, + "step": 10750 + }, + { + "epoch": 0.7464123130283981, + "grad_norm": 0.9765625, + "learning_rate": 3.1288695652173917e-06, + "loss": 0.1984, + "step": 10760 + }, + { + "epoch": 0.7471060047691307, + "grad_norm": 1.140625, + "learning_rate": 3.1271304347826086e-06, + "loss": 0.2426, + "step": 10770 + }, + { + "epoch": 0.7477996965098634, + "grad_norm": 1.2734375, + "learning_rate": 3.1253913043478264e-06, + "loss": 0.2656, + "step": 10780 + }, + { + "epoch": 0.7484933882505962, + "grad_norm": 1.359375, + "learning_rate": 3.123652173913044e-06, + "loss": 0.2148, + "step": 10790 + }, + { + "epoch": 0.7491870799913288, + "grad_norm": 1.28125, + "learning_rate": 3.121913043478261e-06, + "loss": 0.2359, + "step": 10800 + }, + { + "epoch": 0.7498807717320616, + "grad_norm": 1.25, + "learning_rate": 3.120173913043478e-06, + "loss": 0.2798, + "step": 10810 + }, + { + "epoch": 0.7505744634727943, + "grad_norm": 1.203125, + "learning_rate": 3.118434782608696e-06, + "loss": 0.2446, + "step": 10820 + }, + { + "epoch": 0.751268155213527, + "grad_norm": 1.140625, + "learning_rate": 3.1166956521739133e-06, + "loss": 0.2492, + "step": 10830 + }, + { + "epoch": 0.7519618469542597, + "grad_norm": 1.0234375, + "learning_rate": 3.114956521739131e-06, + "loss": 0.2174, + "step": 10840 + }, + { + "epoch": 0.7526555386949925, + "grad_norm": 1.0234375, + "learning_rate": 3.113217391304348e-06, + "loss": 0.2919, + "step": 10850 + }, + { + "epoch": 0.7533492304357251, + "grad_norm": 0.97265625, + "learning_rate": 3.1114782608695654e-06, + "loss": 0.2294, + "step": 10860 + }, + { + "epoch": 0.7540429221764579, + "grad_norm": 1.171875, + "learning_rate": 3.1097391304347828e-06, + "loss": 0.2915, + "step": 10870 + }, + { + "epoch": 0.7547366139171906, + "grad_norm": 1.5234375, + "learning_rate": 3.1080000000000006e-06, + "loss": 0.2996, + "step": 10880 + }, + { + "epoch": 0.7554303056579232, + "grad_norm": 1.140625, + "learning_rate": 3.1062608695652175e-06, + "loss": 0.3066, + "step": 10890 + }, + { + "epoch": 0.756123997398656, + "grad_norm": 1.3671875, + "learning_rate": 3.104521739130435e-06, + "loss": 0.2634, + "step": 10900 + }, + { + "epoch": 0.7568176891393887, + "grad_norm": 1.265625, + "learning_rate": 3.1027826086956527e-06, + "loss": 0.2178, + "step": 10910 + }, + { + "epoch": 0.7575113808801214, + "grad_norm": 0.91015625, + "learning_rate": 3.10104347826087e-06, + "loss": 0.2464, + "step": 10920 + }, + { + "epoch": 0.7582050726208541, + "grad_norm": 1.375, + "learning_rate": 3.099304347826087e-06, + "loss": 0.2308, + "step": 10930 + }, + { + "epoch": 0.7588987643615869, + "grad_norm": 1.1015625, + "learning_rate": 3.0975652173913044e-06, + "loss": 0.2179, + "step": 10940 + }, + { + "epoch": 0.7595924561023195, + "grad_norm": 1.203125, + "learning_rate": 3.095826086956522e-06, + "loss": 0.2317, + "step": 10950 + }, + { + "epoch": 0.7602861478430523, + "grad_norm": 0.86328125, + "learning_rate": 3.0940869565217395e-06, + "loss": 0.2443, + "step": 10960 + }, + { + "epoch": 0.7609798395837849, + "grad_norm": 1.3515625, + "learning_rate": 3.0923478260869565e-06, + "loss": 0.3217, + "step": 10970 + }, + { + "epoch": 0.7616735313245176, + "grad_norm": 1.3359375, + "learning_rate": 3.0906086956521743e-06, + "loss": 0.2126, + "step": 10980 + }, + { + "epoch": 0.7623672230652504, + "grad_norm": 1.1953125, + "learning_rate": 3.0888695652173916e-06, + "loss": 0.2326, + "step": 10990 + }, + { + "epoch": 0.763060914805983, + "grad_norm": 1.296875, + "learning_rate": 3.087130434782609e-06, + "loss": 0.2266, + "step": 11000 + }, + { + "epoch": 0.7637546065467158, + "grad_norm": 1.078125, + "learning_rate": 3.0853913043478264e-06, + "loss": 0.2807, + "step": 11010 + }, + { + "epoch": 0.7644482982874485, + "grad_norm": 1.2890625, + "learning_rate": 3.0836521739130438e-06, + "loss": 0.3183, + "step": 11020 + }, + { + "epoch": 0.7651419900281812, + "grad_norm": 1.2890625, + "learning_rate": 3.081913043478261e-06, + "loss": 0.2456, + "step": 11030 + }, + { + "epoch": 0.7658356817689139, + "grad_norm": 1.203125, + "learning_rate": 3.080173913043478e-06, + "loss": 0.2157, + "step": 11040 + }, + { + "epoch": 0.7665293735096467, + "grad_norm": 1.171875, + "learning_rate": 3.078434782608696e-06, + "loss": 0.2612, + "step": 11050 + }, + { + "epoch": 0.7672230652503793, + "grad_norm": 1.1953125, + "learning_rate": 3.0766956521739132e-06, + "loss": 0.253, + "step": 11060 + }, + { + "epoch": 0.7679167569911121, + "grad_norm": 0.90625, + "learning_rate": 3.074956521739131e-06, + "loss": 0.225, + "step": 11070 + }, + { + "epoch": 0.7686104487318448, + "grad_norm": 1.703125, + "learning_rate": 3.073217391304348e-06, + "loss": 0.2247, + "step": 11080 + }, + { + "epoch": 0.7693041404725774, + "grad_norm": 1.2109375, + "learning_rate": 3.0714782608695654e-06, + "loss": 0.2493, + "step": 11090 + }, + { + "epoch": 0.7699978322133102, + "grad_norm": 1.390625, + "learning_rate": 3.0697391304347827e-06, + "loss": 0.3239, + "step": 11100 + }, + { + "epoch": 0.770691523954043, + "grad_norm": 1.1015625, + "learning_rate": 3.0680000000000005e-06, + "loss": 0.2053, + "step": 11110 + }, + { + "epoch": 0.7713852156947756, + "grad_norm": 1.2578125, + "learning_rate": 3.0662608695652175e-06, + "loss": 0.2254, + "step": 11120 + }, + { + "epoch": 0.7720789074355083, + "grad_norm": 1.140625, + "learning_rate": 3.064521739130435e-06, + "loss": 0.2715, + "step": 11130 + }, + { + "epoch": 0.7727725991762411, + "grad_norm": 1.234375, + "learning_rate": 3.0627826086956526e-06, + "loss": 0.2294, + "step": 11140 + }, + { + "epoch": 0.7734662909169737, + "grad_norm": 1.390625, + "learning_rate": 3.06104347826087e-06, + "loss": 0.2423, + "step": 11150 + }, + { + "epoch": 0.7741599826577065, + "grad_norm": 1.453125, + "learning_rate": 3.059304347826087e-06, + "loss": 0.2477, + "step": 11160 + }, + { + "epoch": 0.7748536743984392, + "grad_norm": 1.2578125, + "learning_rate": 3.0575652173913043e-06, + "loss": 0.2103, + "step": 11170 + }, + { + "epoch": 0.7755473661391719, + "grad_norm": 1.484375, + "learning_rate": 3.055826086956522e-06, + "loss": 0.2501, + "step": 11180 + }, + { + "epoch": 0.7762410578799046, + "grad_norm": 1.0859375, + "learning_rate": 3.0540869565217395e-06, + "loss": 0.2741, + "step": 11190 + }, + { + "epoch": 0.7769347496206374, + "grad_norm": 1.4453125, + "learning_rate": 3.0523478260869564e-06, + "loss": 0.2483, + "step": 11200 + }, + { + "epoch": 0.77762844136137, + "grad_norm": 1.53125, + "learning_rate": 3.0506086956521742e-06, + "loss": 0.2672, + "step": 11210 + }, + { + "epoch": 0.7783221331021027, + "grad_norm": 1.21875, + "learning_rate": 3.0488695652173916e-06, + "loss": 0.225, + "step": 11220 + }, + { + "epoch": 0.7790158248428355, + "grad_norm": 1.21875, + "learning_rate": 3.047130434782609e-06, + "loss": 0.2435, + "step": 11230 + }, + { + "epoch": 0.7797095165835681, + "grad_norm": 1.1484375, + "learning_rate": 3.0453913043478264e-06, + "loss": 0.251, + "step": 11240 + }, + { + "epoch": 0.7804032083243009, + "grad_norm": 1.1015625, + "learning_rate": 3.0436521739130437e-06, + "loss": 0.2056, + "step": 11250 + }, + { + "epoch": 0.7810969000650336, + "grad_norm": 1.4921875, + "learning_rate": 3.041913043478261e-06, + "loss": 0.2781, + "step": 11260 + }, + { + "epoch": 0.7817905918057663, + "grad_norm": 1.6171875, + "learning_rate": 3.040173913043479e-06, + "loss": 0.2197, + "step": 11270 + }, + { + "epoch": 0.782484283546499, + "grad_norm": 1.171875, + "learning_rate": 3.038434782608696e-06, + "loss": 0.2486, + "step": 11280 + }, + { + "epoch": 0.7831779752872318, + "grad_norm": 1.0, + "learning_rate": 3.036695652173913e-06, + "loss": 0.2142, + "step": 11290 + }, + { + "epoch": 0.7838716670279644, + "grad_norm": 1.5, + "learning_rate": 3.034956521739131e-06, + "loss": 0.2634, + "step": 11300 + }, + { + "epoch": 0.7845653587686972, + "grad_norm": 1.3984375, + "learning_rate": 3.0332173913043484e-06, + "loss": 0.2851, + "step": 11310 + }, + { + "epoch": 0.7852590505094299, + "grad_norm": 1.0859375, + "learning_rate": 3.0314782608695653e-06, + "loss": 0.2177, + "step": 11320 + }, + { + "epoch": 0.7859527422501625, + "grad_norm": 1.15625, + "learning_rate": 3.0297391304347827e-06, + "loss": 0.2772, + "step": 11330 + }, + { + "epoch": 0.7866464339908953, + "grad_norm": 1.40625, + "learning_rate": 3.0280000000000005e-06, + "loss": 0.2819, + "step": 11340 + }, + { + "epoch": 0.787340125731628, + "grad_norm": 0.87890625, + "learning_rate": 3.0262608695652174e-06, + "loss": 0.2356, + "step": 11350 + }, + { + "epoch": 0.7880338174723607, + "grad_norm": 0.859375, + "learning_rate": 3.024521739130435e-06, + "loss": 0.2149, + "step": 11360 + }, + { + "epoch": 0.7887275092130934, + "grad_norm": 1.265625, + "learning_rate": 3.0227826086956526e-06, + "loss": 0.2252, + "step": 11370 + }, + { + "epoch": 0.7894212009538262, + "grad_norm": 1.8515625, + "learning_rate": 3.02104347826087e-06, + "loss": 0.2917, + "step": 11380 + }, + { + "epoch": 0.7901148926945588, + "grad_norm": 1.4296875, + "learning_rate": 3.019304347826087e-06, + "loss": 0.257, + "step": 11390 + }, + { + "epoch": 0.7908085844352916, + "grad_norm": 1.296875, + "learning_rate": 3.0175652173913043e-06, + "loss": 0.2266, + "step": 11400 + }, + { + "epoch": 0.7915022761760243, + "grad_norm": 1.0546875, + "learning_rate": 3.015826086956522e-06, + "loss": 0.2351, + "step": 11410 + }, + { + "epoch": 0.792195967916757, + "grad_norm": 1.46875, + "learning_rate": 3.0140869565217395e-06, + "loss": 0.2555, + "step": 11420 + }, + { + "epoch": 0.7928896596574897, + "grad_norm": 1.078125, + "learning_rate": 3.0123478260869564e-06, + "loss": 0.2709, + "step": 11430 + }, + { + "epoch": 0.7935833513982224, + "grad_norm": 1.3359375, + "learning_rate": 3.010608695652174e-06, + "loss": 0.2159, + "step": 11440 + }, + { + "epoch": 0.7942770431389551, + "grad_norm": 1.4375, + "learning_rate": 3.0088695652173916e-06, + "loss": 0.3059, + "step": 11450 + }, + { + "epoch": 0.7949707348796878, + "grad_norm": 1.125, + "learning_rate": 3.007130434782609e-06, + "loss": 0.2167, + "step": 11460 + }, + { + "epoch": 0.7956644266204206, + "grad_norm": 0.84765625, + "learning_rate": 3.0053913043478263e-06, + "loss": 0.2494, + "step": 11470 + }, + { + "epoch": 0.7963581183611532, + "grad_norm": 1.2421875, + "learning_rate": 3.0036521739130437e-06, + "loss": 0.2852, + "step": 11480 + }, + { + "epoch": 0.797051810101886, + "grad_norm": 1.171875, + "learning_rate": 3.001913043478261e-06, + "loss": 0.2671, + "step": 11490 + }, + { + "epoch": 0.7977455018426187, + "grad_norm": 1.265625, + "learning_rate": 3.000173913043479e-06, + "loss": 0.2252, + "step": 11500 + }, + { + "epoch": 0.7984391935833514, + "grad_norm": 1.171875, + "learning_rate": 2.998434782608696e-06, + "loss": 0.2481, + "step": 11510 + }, + { + "epoch": 0.7991328853240841, + "grad_norm": 1.2578125, + "learning_rate": 2.996695652173913e-06, + "loss": 0.2194, + "step": 11520 + }, + { + "epoch": 0.7998265770648169, + "grad_norm": 1.125, + "learning_rate": 2.994956521739131e-06, + "loss": 0.2264, + "step": 11530 + }, + { + "epoch": 0.8005202688055495, + "grad_norm": 1.3828125, + "learning_rate": 2.9932173913043483e-06, + "loss": 0.2368, + "step": 11540 + }, + { + "epoch": 0.8012139605462822, + "grad_norm": 1.109375, + "learning_rate": 2.9914782608695653e-06, + "loss": 0.2505, + "step": 11550 + }, + { + "epoch": 0.801907652287015, + "grad_norm": 1.0546875, + "learning_rate": 2.9897391304347827e-06, + "loss": 0.2175, + "step": 11560 + }, + { + "epoch": 0.8026013440277476, + "grad_norm": 1.0703125, + "learning_rate": 2.9880000000000004e-06, + "loss": 0.2256, + "step": 11570 + }, + { + "epoch": 0.8032950357684804, + "grad_norm": 1.140625, + "learning_rate": 2.986260869565218e-06, + "loss": 0.2488, + "step": 11580 + }, + { + "epoch": 0.8039887275092131, + "grad_norm": 1.3828125, + "learning_rate": 2.9845217391304348e-06, + "loss": 0.2574, + "step": 11590 + }, + { + "epoch": 0.8046824192499458, + "grad_norm": 1.0234375, + "learning_rate": 2.9827826086956526e-06, + "loss": 0.2263, + "step": 11600 + }, + { + "epoch": 0.8053761109906785, + "grad_norm": 1.3203125, + "learning_rate": 2.98104347826087e-06, + "loss": 0.2127, + "step": 11610 + }, + { + "epoch": 0.8060698027314113, + "grad_norm": 1.2109375, + "learning_rate": 2.9793043478260873e-06, + "loss": 0.2328, + "step": 11620 + }, + { + "epoch": 0.8067634944721439, + "grad_norm": 1.25, + "learning_rate": 2.9775652173913042e-06, + "loss": 0.279, + "step": 11630 + }, + { + "epoch": 0.8074571862128767, + "grad_norm": 1.265625, + "learning_rate": 2.975826086956522e-06, + "loss": 0.1984, + "step": 11640 + }, + { + "epoch": 0.8081508779536094, + "grad_norm": 1.203125, + "learning_rate": 2.9740869565217394e-06, + "loss": 0.2232, + "step": 11650 + }, + { + "epoch": 0.808844569694342, + "grad_norm": 1.3359375, + "learning_rate": 2.9723478260869564e-06, + "loss": 0.2565, + "step": 11660 + }, + { + "epoch": 0.8095382614350748, + "grad_norm": 1.25, + "learning_rate": 2.970608695652174e-06, + "loss": 0.2786, + "step": 11670 + }, + { + "epoch": 0.8102319531758075, + "grad_norm": 1.3828125, + "learning_rate": 2.9688695652173915e-06, + "loss": 0.326, + "step": 11680 + }, + { + "epoch": 0.8109256449165402, + "grad_norm": 1.171875, + "learning_rate": 2.967130434782609e-06, + "loss": 0.3024, + "step": 11690 + }, + { + "epoch": 0.8116193366572729, + "grad_norm": 1.1328125, + "learning_rate": 2.9653913043478263e-06, + "loss": 0.2609, + "step": 11700 + }, + { + "epoch": 0.8123130283980057, + "grad_norm": 1.2109375, + "learning_rate": 2.9636521739130436e-06, + "loss": 0.2344, + "step": 11710 + }, + { + "epoch": 0.8130067201387383, + "grad_norm": 1.4921875, + "learning_rate": 2.961913043478261e-06, + "loss": 0.2705, + "step": 11720 + }, + { + "epoch": 0.8137004118794711, + "grad_norm": 1.390625, + "learning_rate": 2.960173913043479e-06, + "loss": 0.2354, + "step": 11730 + }, + { + "epoch": 0.8143941036202038, + "grad_norm": 0.94140625, + "learning_rate": 2.9584347826086958e-06, + "loss": 0.2062, + "step": 11740 + }, + { + "epoch": 0.8150877953609365, + "grad_norm": 1.3515625, + "learning_rate": 2.956695652173913e-06, + "loss": 0.2292, + "step": 11750 + }, + { + "epoch": 0.8157814871016692, + "grad_norm": 1.296875, + "learning_rate": 2.954956521739131e-06, + "loss": 0.2944, + "step": 11760 + }, + { + "epoch": 0.816475178842402, + "grad_norm": 1.2421875, + "learning_rate": 2.9532173913043483e-06, + "loss": 0.2349, + "step": 11770 + }, + { + "epoch": 0.8171688705831346, + "grad_norm": 1.0859375, + "learning_rate": 2.9514782608695652e-06, + "loss": 0.2075, + "step": 11780 + }, + { + "epoch": 0.8178625623238673, + "grad_norm": 1.171875, + "learning_rate": 2.9497391304347826e-06, + "loss": 0.2725, + "step": 11790 + }, + { + "epoch": 0.8185562540646001, + "grad_norm": 0.92578125, + "learning_rate": 2.9480000000000004e-06, + "loss": 0.2386, + "step": 11800 + }, + { + "epoch": 0.8192499458053327, + "grad_norm": 1.3515625, + "learning_rate": 2.9462608695652178e-06, + "loss": 0.2515, + "step": 11810 + }, + { + "epoch": 0.8199436375460655, + "grad_norm": 1.296875, + "learning_rate": 2.9445217391304347e-06, + "loss": 0.2432, + "step": 11820 + }, + { + "epoch": 0.8206373292867982, + "grad_norm": 1.5703125, + "learning_rate": 2.9427826086956525e-06, + "loss": 0.2537, + "step": 11830 + }, + { + "epoch": 0.8213310210275309, + "grad_norm": 1.234375, + "learning_rate": 2.94104347826087e-06, + "loss": 0.2072, + "step": 11840 + }, + { + "epoch": 0.8220247127682636, + "grad_norm": 1.0703125, + "learning_rate": 2.9393043478260873e-06, + "loss": 0.215, + "step": 11850 + }, + { + "epoch": 0.8227184045089964, + "grad_norm": 1.171875, + "learning_rate": 2.937565217391304e-06, + "loss": 0.2244, + "step": 11860 + }, + { + "epoch": 0.823412096249729, + "grad_norm": 1.1328125, + "learning_rate": 2.935826086956522e-06, + "loss": 0.2119, + "step": 11870 + }, + { + "epoch": 0.8241057879904617, + "grad_norm": 1.625, + "learning_rate": 2.9340869565217394e-06, + "loss": 0.2654, + "step": 11880 + }, + { + "epoch": 0.8247994797311945, + "grad_norm": 2.21875, + "learning_rate": 2.932347826086957e-06, + "loss": 0.2935, + "step": 11890 + }, + { + "epoch": 0.8254931714719271, + "grad_norm": 1.171875, + "learning_rate": 2.930608695652174e-06, + "loss": 0.3218, + "step": 11900 + }, + { + "epoch": 0.8261868632126599, + "grad_norm": 1.3046875, + "learning_rate": 2.9288695652173915e-06, + "loss": 0.2326, + "step": 11910 + }, + { + "epoch": 0.8268805549533926, + "grad_norm": 1.3203125, + "learning_rate": 2.927130434782609e-06, + "loss": 0.265, + "step": 11920 + }, + { + "epoch": 0.8275742466941253, + "grad_norm": 1.3515625, + "learning_rate": 2.9253913043478267e-06, + "loss": 0.2347, + "step": 11930 + }, + { + "epoch": 0.828267938434858, + "grad_norm": 1.0625, + "learning_rate": 2.9236521739130436e-06, + "loss": 0.2534, + "step": 11940 + }, + { + "epoch": 0.8289616301755908, + "grad_norm": 1.09375, + "learning_rate": 2.921913043478261e-06, + "loss": 0.2471, + "step": 11950 + }, + { + "epoch": 0.8296553219163234, + "grad_norm": 1.46875, + "learning_rate": 2.9201739130434788e-06, + "loss": 0.2258, + "step": 11960 + }, + { + "epoch": 0.8303490136570562, + "grad_norm": 1.0078125, + "learning_rate": 2.9184347826086957e-06, + "loss": 0.2336, + "step": 11970 + }, + { + "epoch": 0.8310427053977889, + "grad_norm": 0.9296875, + "learning_rate": 2.916695652173913e-06, + "loss": 0.1999, + "step": 11980 + }, + { + "epoch": 0.8317363971385215, + "grad_norm": 1.3828125, + "learning_rate": 2.914956521739131e-06, + "loss": 0.2423, + "step": 11990 + }, + { + "epoch": 0.8324300888792543, + "grad_norm": 0.98046875, + "learning_rate": 2.9132173913043483e-06, + "loss": 0.3134, + "step": 12000 + }, + { + "epoch": 0.833123780619987, + "grad_norm": 1.2109375, + "learning_rate": 2.911478260869565e-06, + "loss": 0.2421, + "step": 12010 + }, + { + "epoch": 0.8338174723607197, + "grad_norm": 1.2578125, + "learning_rate": 2.9097391304347826e-06, + "loss": 0.2408, + "step": 12020 + }, + { + "epoch": 0.8345111641014524, + "grad_norm": 1.09375, + "learning_rate": 2.9080000000000004e-06, + "loss": 0.3132, + "step": 12030 + }, + { + "epoch": 0.8352048558421852, + "grad_norm": 1.4921875, + "learning_rate": 2.9062608695652177e-06, + "loss": 0.3039, + "step": 12040 + }, + { + "epoch": 0.8358985475829178, + "grad_norm": 1.3046875, + "learning_rate": 2.9045217391304347e-06, + "loss": 0.2368, + "step": 12050 + }, + { + "epoch": 0.8365922393236506, + "grad_norm": 1.1015625, + "learning_rate": 2.9027826086956525e-06, + "loss": 0.2429, + "step": 12060 + }, + { + "epoch": 0.8372859310643833, + "grad_norm": 1.34375, + "learning_rate": 2.90104347826087e-06, + "loss": 0.2199, + "step": 12070 + }, + { + "epoch": 0.837979622805116, + "grad_norm": 0.9609375, + "learning_rate": 2.8993043478260872e-06, + "loss": 0.2216, + "step": 12080 + }, + { + "epoch": 0.8386733145458487, + "grad_norm": 1.1171875, + "learning_rate": 2.897565217391304e-06, + "loss": 0.2114, + "step": 12090 + }, + { + "epoch": 0.8393670062865815, + "grad_norm": 1.0859375, + "learning_rate": 2.895826086956522e-06, + "loss": 0.2314, + "step": 12100 + }, + { + "epoch": 0.8400606980273141, + "grad_norm": 1.2421875, + "learning_rate": 2.8940869565217393e-06, + "loss": 0.2591, + "step": 12110 + }, + { + "epoch": 0.8407543897680468, + "grad_norm": 1.1953125, + "learning_rate": 2.892347826086957e-06, + "loss": 0.2981, + "step": 12120 + }, + { + "epoch": 0.8414480815087796, + "grad_norm": 1.2734375, + "learning_rate": 2.890608695652174e-06, + "loss": 0.2615, + "step": 12130 + }, + { + "epoch": 0.8421417732495122, + "grad_norm": 1.1328125, + "learning_rate": 2.8888695652173914e-06, + "loss": 0.2214, + "step": 12140 + }, + { + "epoch": 0.842835464990245, + "grad_norm": 1.46875, + "learning_rate": 2.887130434782609e-06, + "loss": 0.2462, + "step": 12150 + }, + { + "epoch": 0.8435291567309777, + "grad_norm": 1.2109375, + "learning_rate": 2.8853913043478266e-06, + "loss": 0.2383, + "step": 12160 + }, + { + "epoch": 0.8442228484717104, + "grad_norm": 1.1484375, + "learning_rate": 2.8836521739130436e-06, + "loss": 0.2856, + "step": 12170 + }, + { + "epoch": 0.8449165402124431, + "grad_norm": 1.4375, + "learning_rate": 2.881913043478261e-06, + "loss": 0.2847, + "step": 12180 + }, + { + "epoch": 0.8456102319531759, + "grad_norm": 1.15625, + "learning_rate": 2.8801739130434787e-06, + "loss": 0.2508, + "step": 12190 + }, + { + "epoch": 0.8463039236939085, + "grad_norm": 1.34375, + "learning_rate": 2.878434782608696e-06, + "loss": 0.2317, + "step": 12200 + }, + { + "epoch": 0.8469976154346412, + "grad_norm": 1.3359375, + "learning_rate": 2.876695652173913e-06, + "loss": 0.3621, + "step": 12210 + }, + { + "epoch": 0.847691307175374, + "grad_norm": 1.109375, + "learning_rate": 2.874956521739131e-06, + "loss": 0.2664, + "step": 12220 + }, + { + "epoch": 0.8483849989161066, + "grad_norm": 0.86328125, + "learning_rate": 2.873217391304348e-06, + "loss": 0.2206, + "step": 12230 + }, + { + "epoch": 0.8490786906568394, + "grad_norm": 1.6484375, + "learning_rate": 2.8714782608695656e-06, + "loss": 0.3451, + "step": 12240 + }, + { + "epoch": 0.849772382397572, + "grad_norm": 1.5390625, + "learning_rate": 2.8697391304347825e-06, + "loss": 0.2597, + "step": 12250 + }, + { + "epoch": 0.8504660741383048, + "grad_norm": 1.1328125, + "learning_rate": 2.8680000000000003e-06, + "loss": 0.2391, + "step": 12260 + }, + { + "epoch": 0.8511597658790375, + "grad_norm": 0.92578125, + "learning_rate": 2.8662608695652177e-06, + "loss": 0.2241, + "step": 12270 + }, + { + "epoch": 0.8518534576197702, + "grad_norm": 1.1796875, + "learning_rate": 2.8645217391304346e-06, + "loss": 0.2928, + "step": 12280 + }, + { + "epoch": 0.8525471493605029, + "grad_norm": 1.1640625, + "learning_rate": 2.8627826086956524e-06, + "loss": 0.2468, + "step": 12290 + }, + { + "epoch": 0.8532408411012357, + "grad_norm": 1.0546875, + "learning_rate": 2.86104347826087e-06, + "loss": 0.2234, + "step": 12300 + }, + { + "epoch": 0.8539345328419683, + "grad_norm": 1.2421875, + "learning_rate": 2.859304347826087e-06, + "loss": 0.2329, + "step": 12310 + }, + { + "epoch": 0.854628224582701, + "grad_norm": 1.390625, + "learning_rate": 2.857565217391304e-06, + "loss": 0.2267, + "step": 12320 + }, + { + "epoch": 0.8553219163234338, + "grad_norm": 1.3203125, + "learning_rate": 2.855826086956522e-06, + "loss": 0.2083, + "step": 12330 + }, + { + "epoch": 0.8560156080641664, + "grad_norm": 1.0546875, + "learning_rate": 2.8540869565217393e-06, + "loss": 0.2738, + "step": 12340 + }, + { + "epoch": 0.8567092998048992, + "grad_norm": 1.421875, + "learning_rate": 2.852347826086957e-06, + "loss": 0.2128, + "step": 12350 + }, + { + "epoch": 0.8574029915456319, + "grad_norm": 1.515625, + "learning_rate": 2.850608695652174e-06, + "loss": 0.2226, + "step": 12360 + }, + { + "epoch": 0.8580966832863646, + "grad_norm": 1.40625, + "learning_rate": 2.8488695652173914e-06, + "loss": 0.2691, + "step": 12370 + }, + { + "epoch": 0.8587903750270973, + "grad_norm": 1.3359375, + "learning_rate": 2.8471304347826088e-06, + "loss": 0.2323, + "step": 12380 + }, + { + "epoch": 0.8594840667678301, + "grad_norm": 0.9921875, + "learning_rate": 2.8453913043478266e-06, + "loss": 0.228, + "step": 12390 + }, + { + "epoch": 0.8601777585085627, + "grad_norm": 0.96484375, + "learning_rate": 2.8436521739130435e-06, + "loss": 0.2741, + "step": 12400 + }, + { + "epoch": 0.8608714502492955, + "grad_norm": 1.3046875, + "learning_rate": 2.841913043478261e-06, + "loss": 0.2483, + "step": 12410 + }, + { + "epoch": 0.8615651419900282, + "grad_norm": 1.3515625, + "learning_rate": 2.8401739130434787e-06, + "loss": 0.2552, + "step": 12420 + }, + { + "epoch": 0.8622588337307608, + "grad_norm": 1.3828125, + "learning_rate": 2.838434782608696e-06, + "loss": 0.2283, + "step": 12430 + }, + { + "epoch": 0.8629525254714936, + "grad_norm": 1.2109375, + "learning_rate": 2.836695652173913e-06, + "loss": 0.226, + "step": 12440 + }, + { + "epoch": 0.8636462172122263, + "grad_norm": 1.3828125, + "learning_rate": 2.834956521739131e-06, + "loss": 0.2357, + "step": 12450 + }, + { + "epoch": 0.864339908952959, + "grad_norm": 1.1484375, + "learning_rate": 2.833217391304348e-06, + "loss": 0.2832, + "step": 12460 + }, + { + "epoch": 0.8650336006936917, + "grad_norm": 0.9375, + "learning_rate": 2.8314782608695655e-06, + "loss": 0.2512, + "step": 12470 + }, + { + "epoch": 0.8657272924344245, + "grad_norm": 1.1015625, + "learning_rate": 2.8297391304347825e-06, + "loss": 0.1749, + "step": 12480 + }, + { + "epoch": 0.8664209841751571, + "grad_norm": 1.578125, + "learning_rate": 2.8280000000000003e-06, + "loss": 0.3021, + "step": 12490 + }, + { + "epoch": 0.8671146759158899, + "grad_norm": 1.015625, + "learning_rate": 2.8262608695652177e-06, + "loss": 0.257, + "step": 12500 + }, + { + "epoch": 0.8678083676566226, + "grad_norm": 1.1328125, + "learning_rate": 2.8245217391304354e-06, + "loss": 0.229, + "step": 12510 + }, + { + "epoch": 0.8685020593973553, + "grad_norm": 1.0234375, + "learning_rate": 2.8227826086956524e-06, + "loss": 0.2163, + "step": 12520 + }, + { + "epoch": 0.869195751138088, + "grad_norm": 1.4609375, + "learning_rate": 2.8210434782608698e-06, + "loss": 0.264, + "step": 12530 + }, + { + "epoch": 0.8698894428788208, + "grad_norm": 1.1171875, + "learning_rate": 2.819304347826087e-06, + "loss": 0.2647, + "step": 12540 + }, + { + "epoch": 0.8705831346195534, + "grad_norm": 1.25, + "learning_rate": 2.817565217391305e-06, + "loss": 0.2262, + "step": 12550 + }, + { + "epoch": 0.8712768263602861, + "grad_norm": 1.2578125, + "learning_rate": 2.815826086956522e-06, + "loss": 0.2163, + "step": 12560 + }, + { + "epoch": 0.8719705181010189, + "grad_norm": 1.3515625, + "learning_rate": 2.8140869565217393e-06, + "loss": 0.2299, + "step": 12570 + }, + { + "epoch": 0.8726642098417515, + "grad_norm": 1.0703125, + "learning_rate": 2.812347826086957e-06, + "loss": 0.2756, + "step": 12580 + }, + { + "epoch": 0.8733579015824843, + "grad_norm": 1.3125, + "learning_rate": 2.810608695652174e-06, + "loss": 0.2429, + "step": 12590 + }, + { + "epoch": 0.874051593323217, + "grad_norm": 1.328125, + "learning_rate": 2.8088695652173914e-06, + "loss": 0.2404, + "step": 12600 + }, + { + "epoch": 0.8747452850639497, + "grad_norm": 1.1640625, + "learning_rate": 2.8071304347826087e-06, + "loss": 0.2412, + "step": 12610 + }, + { + "epoch": 0.8754389768046824, + "grad_norm": 1.1875, + "learning_rate": 2.8053913043478265e-06, + "loss": 0.2682, + "step": 12620 + }, + { + "epoch": 0.8761326685454152, + "grad_norm": 1.203125, + "learning_rate": 2.8036521739130435e-06, + "loss": 0.2255, + "step": 12630 + }, + { + "epoch": 0.8768263602861478, + "grad_norm": 1.1875, + "learning_rate": 2.801913043478261e-06, + "loss": 0.2766, + "step": 12640 + }, + { + "epoch": 0.8775200520268805, + "grad_norm": 1.1796875, + "learning_rate": 2.8001739130434786e-06, + "loss": 0.2353, + "step": 12650 + }, + { + "epoch": 0.8782137437676133, + "grad_norm": 0.8671875, + "learning_rate": 2.798434782608696e-06, + "loss": 0.1858, + "step": 12660 + }, + { + "epoch": 0.8789074355083459, + "grad_norm": 1.2890625, + "learning_rate": 2.796695652173913e-06, + "loss": 0.214, + "step": 12670 + }, + { + "epoch": 0.8796011272490787, + "grad_norm": 1.40625, + "learning_rate": 2.7949565217391308e-06, + "loss": 0.2371, + "step": 12680 + }, + { + "epoch": 0.8802948189898114, + "grad_norm": 1.2734375, + "learning_rate": 2.793217391304348e-06, + "loss": 0.23, + "step": 12690 + }, + { + "epoch": 0.8809885107305441, + "grad_norm": 1.28125, + "learning_rate": 2.7914782608695655e-06, + "loss": 0.2453, + "step": 12700 + }, + { + "epoch": 0.8816822024712768, + "grad_norm": 1.265625, + "learning_rate": 2.7897391304347824e-06, + "loss": 0.2131, + "step": 12710 + }, + { + "epoch": 0.8823758942120096, + "grad_norm": 1.390625, + "learning_rate": 2.7880000000000002e-06, + "loss": 0.242, + "step": 12720 + }, + { + "epoch": 0.8830695859527422, + "grad_norm": 0.953125, + "learning_rate": 2.7862608695652176e-06, + "loss": 0.3201, + "step": 12730 + }, + { + "epoch": 0.883763277693475, + "grad_norm": 0.94140625, + "learning_rate": 2.7845217391304354e-06, + "loss": 0.2541, + "step": 12740 + }, + { + "epoch": 0.8844569694342077, + "grad_norm": 1.0859375, + "learning_rate": 2.7827826086956524e-06, + "loss": 0.2767, + "step": 12750 + }, + { + "epoch": 0.8851506611749403, + "grad_norm": 1.28125, + "learning_rate": 2.7810434782608697e-06, + "loss": 0.2321, + "step": 12760 + }, + { + "epoch": 0.8858443529156731, + "grad_norm": 1.203125, + "learning_rate": 2.779304347826087e-06, + "loss": 0.2789, + "step": 12770 + }, + { + "epoch": 0.8865380446564058, + "grad_norm": 1.5546875, + "learning_rate": 2.777565217391305e-06, + "loss": 0.2188, + "step": 12780 + }, + { + "epoch": 0.8872317363971385, + "grad_norm": 1.21875, + "learning_rate": 2.775826086956522e-06, + "loss": 0.2593, + "step": 12790 + }, + { + "epoch": 0.8879254281378712, + "grad_norm": 1.015625, + "learning_rate": 2.774086956521739e-06, + "loss": 0.215, + "step": 12800 + }, + { + "epoch": 0.888619119878604, + "grad_norm": 1.3203125, + "learning_rate": 2.772347826086957e-06, + "loss": 0.2497, + "step": 12810 + }, + { + "epoch": 0.8893128116193366, + "grad_norm": 0.984375, + "learning_rate": 2.7706086956521744e-06, + "loss": 0.2003, + "step": 12820 + }, + { + "epoch": 0.8900065033600694, + "grad_norm": 1.171875, + "learning_rate": 2.7688695652173913e-06, + "loss": 0.2473, + "step": 12830 + }, + { + "epoch": 0.8907001951008021, + "grad_norm": 0.90234375, + "learning_rate": 2.7671304347826087e-06, + "loss": 0.2628, + "step": 12840 + }, + { + "epoch": 0.8913938868415348, + "grad_norm": 1.0546875, + "learning_rate": 2.7653913043478265e-06, + "loss": 0.2674, + "step": 12850 + }, + { + "epoch": 0.8920875785822675, + "grad_norm": 1.3828125, + "learning_rate": 2.763652173913044e-06, + "loss": 0.2468, + "step": 12860 + }, + { + "epoch": 0.8927812703230003, + "grad_norm": 1.109375, + "learning_rate": 2.761913043478261e-06, + "loss": 0.2161, + "step": 12870 + }, + { + "epoch": 0.8934749620637329, + "grad_norm": 1.1484375, + "learning_rate": 2.7601739130434786e-06, + "loss": 0.2284, + "step": 12880 + }, + { + "epoch": 0.8941686538044656, + "grad_norm": 1.1640625, + "learning_rate": 2.758434782608696e-06, + "loss": 0.2432, + "step": 12890 + }, + { + "epoch": 0.8948623455451984, + "grad_norm": 1.3359375, + "learning_rate": 2.756695652173913e-06, + "loss": 0.2964, + "step": 12900 + }, + { + "epoch": 0.895556037285931, + "grad_norm": 1.375, + "learning_rate": 2.7549565217391307e-06, + "loss": 0.2177, + "step": 12910 + }, + { + "epoch": 0.8962497290266638, + "grad_norm": 1.484375, + "learning_rate": 2.753217391304348e-06, + "loss": 0.2864, + "step": 12920 + }, + { + "epoch": 0.8969434207673965, + "grad_norm": 1.078125, + "learning_rate": 2.7514782608695655e-06, + "loss": 0.265, + "step": 12930 + }, + { + "epoch": 0.8976371125081292, + "grad_norm": 1.53125, + "learning_rate": 2.7497391304347824e-06, + "loss": 0.3763, + "step": 12940 + }, + { + "epoch": 0.8983308042488619, + "grad_norm": 1.1640625, + "learning_rate": 2.748e-06, + "loss": 0.2525, + "step": 12950 + }, + { + "epoch": 0.8990244959895947, + "grad_norm": 1.3984375, + "learning_rate": 2.7462608695652176e-06, + "loss": 0.2358, + "step": 12960 + }, + { + "epoch": 0.8997181877303273, + "grad_norm": 1.40625, + "learning_rate": 2.7445217391304354e-06, + "loss": 0.2315, + "step": 12970 + }, + { + "epoch": 0.90041187947106, + "grad_norm": 1.328125, + "learning_rate": 2.7427826086956523e-06, + "loss": 0.2688, + "step": 12980 + }, + { + "epoch": 0.9011055712117928, + "grad_norm": 1.4296875, + "learning_rate": 2.7410434782608697e-06, + "loss": 0.2491, + "step": 12990 + }, + { + "epoch": 0.9017992629525254, + "grad_norm": 1.3203125, + "learning_rate": 2.739304347826087e-06, + "loss": 0.2402, + "step": 13000 + }, + { + "epoch": 0.9024929546932582, + "grad_norm": 1.15625, + "learning_rate": 2.737565217391305e-06, + "loss": 0.258, + "step": 13010 + }, + { + "epoch": 0.9031866464339909, + "grad_norm": 1.109375, + "learning_rate": 2.735826086956522e-06, + "loss": 0.263, + "step": 13020 + }, + { + "epoch": 0.9038803381747236, + "grad_norm": 1.203125, + "learning_rate": 2.734086956521739e-06, + "loss": 0.2312, + "step": 13030 + }, + { + "epoch": 0.9045740299154563, + "grad_norm": 1.1875, + "learning_rate": 2.732347826086957e-06, + "loss": 0.2835, + "step": 13040 + }, + { + "epoch": 0.9052677216561891, + "grad_norm": 1.1796875, + "learning_rate": 2.7306086956521743e-06, + "loss": 0.2328, + "step": 13050 + }, + { + "epoch": 0.9059614133969217, + "grad_norm": 1.2734375, + "learning_rate": 2.7288695652173913e-06, + "loss": 0.2367, + "step": 13060 + }, + { + "epoch": 0.9066551051376545, + "grad_norm": 1.6796875, + "learning_rate": 2.7271304347826087e-06, + "loss": 0.2629, + "step": 13070 + }, + { + "epoch": 0.9073487968783872, + "grad_norm": 1.1171875, + "learning_rate": 2.7253913043478264e-06, + "loss": 0.2833, + "step": 13080 + }, + { + "epoch": 0.9080424886191198, + "grad_norm": 1.078125, + "learning_rate": 2.723652173913044e-06, + "loss": 0.2077, + "step": 13090 + }, + { + "epoch": 0.9087361803598526, + "grad_norm": 1.2734375, + "learning_rate": 2.7219130434782608e-06, + "loss": 0.213, + "step": 13100 + }, + { + "epoch": 0.9094298721005853, + "grad_norm": 1.3515625, + "learning_rate": 2.7201739130434786e-06, + "loss": 0.2784, + "step": 13110 + }, + { + "epoch": 0.910123563841318, + "grad_norm": 1.3203125, + "learning_rate": 2.718434782608696e-06, + "loss": 0.2505, + "step": 13120 + }, + { + "epoch": 0.9108172555820507, + "grad_norm": 1.171875, + "learning_rate": 2.7166956521739133e-06, + "loss": 0.2348, + "step": 13130 + }, + { + "epoch": 0.9115109473227835, + "grad_norm": 1.828125, + "learning_rate": 2.7149565217391307e-06, + "loss": 0.2753, + "step": 13140 + }, + { + "epoch": 0.9122046390635161, + "grad_norm": 1.5625, + "learning_rate": 2.713217391304348e-06, + "loss": 0.263, + "step": 13150 + }, + { + "epoch": 0.9128983308042489, + "grad_norm": 1.1875, + "learning_rate": 2.7114782608695654e-06, + "loss": 0.2445, + "step": 13160 + }, + { + "epoch": 0.9135920225449816, + "grad_norm": 1.5078125, + "learning_rate": 2.7097391304347832e-06, + "loss": 0.2877, + "step": 13170 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 1.0859375, + "learning_rate": 2.708e-06, + "loss": 0.2414, + "step": 13180 + }, + { + "epoch": 0.914979406026447, + "grad_norm": 1.171875, + "learning_rate": 2.7062608695652175e-06, + "loss": 0.2208, + "step": 13190 + }, + { + "epoch": 0.9156730977671798, + "grad_norm": 1.21875, + "learning_rate": 2.7045217391304353e-06, + "loss": 0.2381, + "step": 13200 + }, + { + "epoch": 0.9163667895079124, + "grad_norm": 1.5078125, + "learning_rate": 2.7027826086956523e-06, + "loss": 0.2036, + "step": 13210 + }, + { + "epoch": 0.9170604812486451, + "grad_norm": 1.2109375, + "learning_rate": 2.7010434782608696e-06, + "loss": 0.2259, + "step": 13220 + }, + { + "epoch": 0.9177541729893779, + "grad_norm": 1.0078125, + "learning_rate": 2.699304347826087e-06, + "loss": 0.2317, + "step": 13230 + }, + { + "epoch": 0.9184478647301105, + "grad_norm": 1.359375, + "learning_rate": 2.697565217391305e-06, + "loss": 0.2817, + "step": 13240 + }, + { + "epoch": 0.9191415564708433, + "grad_norm": 1.46875, + "learning_rate": 2.6958260869565218e-06, + "loss": 0.2527, + "step": 13250 + }, + { + "epoch": 0.919835248211576, + "grad_norm": 1.15625, + "learning_rate": 2.694086956521739e-06, + "loss": 0.2398, + "step": 13260 + }, + { + "epoch": 0.9205289399523087, + "grad_norm": 1.203125, + "learning_rate": 2.692347826086957e-06, + "loss": 0.2632, + "step": 13270 + }, + { + "epoch": 0.9212226316930414, + "grad_norm": 1.2109375, + "learning_rate": 2.6906086956521743e-06, + "loss": 0.218, + "step": 13280 + }, + { + "epoch": 0.9219163234337742, + "grad_norm": 1.296875, + "learning_rate": 2.6888695652173912e-06, + "loss": 0.2906, + "step": 13290 + }, + { + "epoch": 0.9226100151745068, + "grad_norm": 1.21875, + "learning_rate": 2.6871304347826086e-06, + "loss": 0.233, + "step": 13300 + }, + { + "epoch": 0.9233037069152396, + "grad_norm": 1.1875, + "learning_rate": 2.6853913043478264e-06, + "loss": 0.2417, + "step": 13310 + }, + { + "epoch": 0.9239973986559723, + "grad_norm": 1.3671875, + "learning_rate": 2.6836521739130438e-06, + "loss": 0.3028, + "step": 13320 + }, + { + "epoch": 0.9246910903967049, + "grad_norm": 1.2578125, + "learning_rate": 2.6819130434782607e-06, + "loss": 0.2826, + "step": 13330 + }, + { + "epoch": 0.9253847821374377, + "grad_norm": 1.3515625, + "learning_rate": 2.6801739130434785e-06, + "loss": 0.238, + "step": 13340 + }, + { + "epoch": 0.9260784738781704, + "grad_norm": 1.0234375, + "learning_rate": 2.678434782608696e-06, + "loss": 0.2322, + "step": 13350 + }, + { + "epoch": 0.9267721656189031, + "grad_norm": 1.3515625, + "learning_rate": 2.6766956521739133e-06, + "loss": 0.2983, + "step": 13360 + }, + { + "epoch": 0.9274658573596358, + "grad_norm": 0.9609375, + "learning_rate": 2.6749565217391306e-06, + "loss": 0.2692, + "step": 13370 + }, + { + "epoch": 0.9281595491003686, + "grad_norm": 1.28125, + "learning_rate": 2.673217391304348e-06, + "loss": 0.2064, + "step": 13380 + }, + { + "epoch": 0.9288532408411012, + "grad_norm": 1.265625, + "learning_rate": 2.6714782608695654e-06, + "loss": 0.2225, + "step": 13390 + }, + { + "epoch": 0.929546932581834, + "grad_norm": 1.375, + "learning_rate": 2.669739130434783e-06, + "loss": 0.2397, + "step": 13400 + }, + { + "epoch": 0.9302406243225667, + "grad_norm": 1.28125, + "learning_rate": 2.668e-06, + "loss": 0.238, + "step": 13410 + }, + { + "epoch": 0.9309343160632993, + "grad_norm": 1.8828125, + "learning_rate": 2.6662608695652175e-06, + "loss": 0.2753, + "step": 13420 + }, + { + "epoch": 0.9316280078040321, + "grad_norm": 0.8671875, + "learning_rate": 2.6645217391304353e-06, + "loss": 0.225, + "step": 13430 + }, + { + "epoch": 0.9323216995447648, + "grad_norm": 1.2421875, + "learning_rate": 2.6627826086956527e-06, + "loss": 0.2449, + "step": 13440 + }, + { + "epoch": 0.9330153912854975, + "grad_norm": 0.98046875, + "learning_rate": 2.6610434782608696e-06, + "loss": 0.2416, + "step": 13450 + }, + { + "epoch": 0.9337090830262302, + "grad_norm": 1.0703125, + "learning_rate": 2.659304347826087e-06, + "loss": 0.217, + "step": 13460 + }, + { + "epoch": 0.934402774766963, + "grad_norm": 1.5078125, + "learning_rate": 2.6575652173913048e-06, + "loss": 0.2213, + "step": 13470 + }, + { + "epoch": 0.9350964665076956, + "grad_norm": 1.0390625, + "learning_rate": 2.655826086956522e-06, + "loss": 0.3161, + "step": 13480 + }, + { + "epoch": 0.9357901582484284, + "grad_norm": 1.5234375, + "learning_rate": 2.654086956521739e-06, + "loss": 0.2487, + "step": 13490 + }, + { + "epoch": 0.9364838499891611, + "grad_norm": 1.6796875, + "learning_rate": 2.652347826086957e-06, + "loss": 0.3213, + "step": 13500 + }, + { + "epoch": 0.9371775417298938, + "grad_norm": 1.1875, + "learning_rate": 2.6506086956521743e-06, + "loss": 0.2486, + "step": 13510 + }, + { + "epoch": 0.9378712334706265, + "grad_norm": 1.5078125, + "learning_rate": 2.648869565217391e-06, + "loss": 0.2348, + "step": 13520 + }, + { + "epoch": 0.9385649252113591, + "grad_norm": 1.0859375, + "learning_rate": 2.6471304347826086e-06, + "loss": 0.2137, + "step": 13530 + }, + { + "epoch": 0.9392586169520919, + "grad_norm": 1.1640625, + "learning_rate": 2.6453913043478264e-06, + "loss": 0.2431, + "step": 13540 + }, + { + "epoch": 0.9399523086928246, + "grad_norm": 0.82421875, + "learning_rate": 2.6436521739130437e-06, + "loss": 0.2795, + "step": 13550 + }, + { + "epoch": 0.9406460004335573, + "grad_norm": 1.109375, + "learning_rate": 2.6419130434782607e-06, + "loss": 0.2674, + "step": 13560 + }, + { + "epoch": 0.94133969217429, + "grad_norm": 1.0859375, + "learning_rate": 2.6401739130434785e-06, + "loss": 0.2346, + "step": 13570 + }, + { + "epoch": 0.9420333839150228, + "grad_norm": 1.3125, + "learning_rate": 2.638434782608696e-06, + "loss": 0.2416, + "step": 13580 + }, + { + "epoch": 0.9427270756557554, + "grad_norm": 1.703125, + "learning_rate": 2.6366956521739132e-06, + "loss": 0.2999, + "step": 13590 + }, + { + "epoch": 0.9434207673964882, + "grad_norm": 1.1015625, + "learning_rate": 2.6349565217391306e-06, + "loss": 0.1903, + "step": 13600 + }, + { + "epoch": 0.9441144591372209, + "grad_norm": 0.96875, + "learning_rate": 2.633217391304348e-06, + "loss": 0.2281, + "step": 13610 + }, + { + "epoch": 0.9448081508779536, + "grad_norm": 1.203125, + "learning_rate": 2.6314782608695653e-06, + "loss": 0.2115, + "step": 13620 + }, + { + "epoch": 0.9455018426186863, + "grad_norm": 1.2578125, + "learning_rate": 2.629739130434783e-06, + "loss": 0.2522, + "step": 13630 + }, + { + "epoch": 0.946195534359419, + "grad_norm": 1.1796875, + "learning_rate": 2.628e-06, + "loss": 0.2353, + "step": 13640 + }, + { + "epoch": 0.9468892261001517, + "grad_norm": 1.3671875, + "learning_rate": 2.6262608695652175e-06, + "loss": 0.2585, + "step": 13650 + }, + { + "epoch": 0.9475829178408844, + "grad_norm": 1.0546875, + "learning_rate": 2.6245217391304352e-06, + "loss": 0.2067, + "step": 13660 + }, + { + "epoch": 0.9482766095816172, + "grad_norm": 1.4609375, + "learning_rate": 2.6227826086956526e-06, + "loss": 0.2414, + "step": 13670 + }, + { + "epoch": 0.9489703013223498, + "grad_norm": 0.9921875, + "learning_rate": 2.6210434782608696e-06, + "loss": 0.2992, + "step": 13680 + }, + { + "epoch": 0.9496639930630826, + "grad_norm": 1.578125, + "learning_rate": 2.619304347826087e-06, + "loss": 0.2543, + "step": 13690 + }, + { + "epoch": 0.9503576848038153, + "grad_norm": 1.3046875, + "learning_rate": 2.6175652173913047e-06, + "loss": 0.2452, + "step": 13700 + }, + { + "epoch": 0.951051376544548, + "grad_norm": 1.2421875, + "learning_rate": 2.615826086956522e-06, + "loss": 0.2564, + "step": 13710 + }, + { + "epoch": 0.9517450682852807, + "grad_norm": 1.2109375, + "learning_rate": 2.614086956521739e-06, + "loss": 0.2446, + "step": 13720 + }, + { + "epoch": 0.9524387600260135, + "grad_norm": 1.3515625, + "learning_rate": 2.612347826086957e-06, + "loss": 0.2343, + "step": 13730 + }, + { + "epoch": 0.9531324517667461, + "grad_norm": 1.0546875, + "learning_rate": 2.6106086956521742e-06, + "loss": 0.2255, + "step": 13740 + }, + { + "epoch": 0.9538261435074789, + "grad_norm": 1.6328125, + "learning_rate": 2.6088695652173916e-06, + "loss": 0.2345, + "step": 13750 + }, + { + "epoch": 0.9545198352482116, + "grad_norm": 1.4921875, + "learning_rate": 2.6071304347826085e-06, + "loss": 0.2453, + "step": 13760 + }, + { + "epoch": 0.9552135269889442, + "grad_norm": 1.2421875, + "learning_rate": 2.6053913043478263e-06, + "loss": 0.2377, + "step": 13770 + }, + { + "epoch": 0.955907218729677, + "grad_norm": 0.88671875, + "learning_rate": 2.6036521739130437e-06, + "loss": 0.2412, + "step": 13780 + }, + { + "epoch": 0.9566009104704097, + "grad_norm": 1.265625, + "learning_rate": 2.6019130434782615e-06, + "loss": 0.3032, + "step": 13790 + }, + { + "epoch": 0.9572946022111424, + "grad_norm": 1.5703125, + "learning_rate": 2.6001739130434784e-06, + "loss": 0.2317, + "step": 13800 + }, + { + "epoch": 0.9579882939518751, + "grad_norm": 1.125, + "learning_rate": 2.598434782608696e-06, + "loss": 0.2275, + "step": 13810 + }, + { + "epoch": 0.9586819856926079, + "grad_norm": 1.25, + "learning_rate": 2.596695652173913e-06, + "loss": 0.2155, + "step": 13820 + }, + { + "epoch": 0.9593756774333405, + "grad_norm": 1.2734375, + "learning_rate": 2.5949565217391306e-06, + "loss": 0.2261, + "step": 13830 + }, + { + "epoch": 0.9600693691740733, + "grad_norm": 1.3203125, + "learning_rate": 2.593217391304348e-06, + "loss": 0.2183, + "step": 13840 + }, + { + "epoch": 0.960763060914806, + "grad_norm": 1.328125, + "learning_rate": 2.5914782608695653e-06, + "loss": 0.2203, + "step": 13850 + }, + { + "epoch": 0.9614567526555386, + "grad_norm": 1.2421875, + "learning_rate": 2.589739130434783e-06, + "loss": 0.2219, + "step": 13860 + }, + { + "epoch": 0.9621504443962714, + "grad_norm": 1.0546875, + "learning_rate": 2.588e-06, + "loss": 0.25, + "step": 13870 + }, + { + "epoch": 0.9628441361370041, + "grad_norm": 0.921875, + "learning_rate": 2.5862608695652174e-06, + "loss": 0.2036, + "step": 13880 + }, + { + "epoch": 0.9635378278777368, + "grad_norm": 1.015625, + "learning_rate": 2.584521739130435e-06, + "loss": 0.2385, + "step": 13890 + }, + { + "epoch": 0.9642315196184695, + "grad_norm": 1.2421875, + "learning_rate": 2.5827826086956526e-06, + "loss": 0.2036, + "step": 13900 + }, + { + "epoch": 0.9649252113592023, + "grad_norm": 1.3203125, + "learning_rate": 2.5810434782608695e-06, + "loss": 0.2155, + "step": 13910 + }, + { + "epoch": 0.9656189030999349, + "grad_norm": 0.93359375, + "learning_rate": 2.579304347826087e-06, + "loss": 0.238, + "step": 13920 + }, + { + "epoch": 0.9663125948406677, + "grad_norm": 1.1640625, + "learning_rate": 2.5775652173913047e-06, + "loss": 0.2178, + "step": 13930 + }, + { + "epoch": 0.9670062865814004, + "grad_norm": 1.15625, + "learning_rate": 2.575826086956522e-06, + "loss": 0.2891, + "step": 13940 + }, + { + "epoch": 0.9676999783221331, + "grad_norm": 1.140625, + "learning_rate": 2.574086956521739e-06, + "loss": 0.2354, + "step": 13950 + }, + { + "epoch": 0.9683936700628658, + "grad_norm": 1.40625, + "learning_rate": 2.572347826086957e-06, + "loss": 0.2305, + "step": 13960 + }, + { + "epoch": 0.9690873618035986, + "grad_norm": 1.0625, + "learning_rate": 2.570608695652174e-06, + "loss": 0.2175, + "step": 13970 + }, + { + "epoch": 0.9697810535443312, + "grad_norm": 1.1484375, + "learning_rate": 2.5688695652173915e-06, + "loss": 0.232, + "step": 13980 + }, + { + "epoch": 0.9704747452850639, + "grad_norm": 1.03125, + "learning_rate": 2.5671304347826085e-06, + "loss": 0.2387, + "step": 13990 + }, + { + "epoch": 0.9711684370257967, + "grad_norm": 0.8671875, + "learning_rate": 2.5653913043478263e-06, + "loss": 0.3122, + "step": 14000 + }, + { + "epoch": 0.9718621287665293, + "grad_norm": 1.1171875, + "learning_rate": 2.5636521739130437e-06, + "loss": 0.2158, + "step": 14010 + }, + { + "epoch": 0.9725558205072621, + "grad_norm": 0.75390625, + "learning_rate": 2.5619130434782615e-06, + "loss": 0.222, + "step": 14020 + }, + { + "epoch": 0.9732495122479948, + "grad_norm": 1.125, + "learning_rate": 2.5601739130434784e-06, + "loss": 0.2692, + "step": 14030 + }, + { + "epoch": 0.9739432039887275, + "grad_norm": 1.4453125, + "learning_rate": 2.5584347826086958e-06, + "loss": 0.2721, + "step": 14040 + }, + { + "epoch": 0.9746368957294602, + "grad_norm": 1.5234375, + "learning_rate": 2.5566956521739136e-06, + "loss": 0.2335, + "step": 14050 + }, + { + "epoch": 0.975330587470193, + "grad_norm": 1.0703125, + "learning_rate": 2.554956521739131e-06, + "loss": 0.2132, + "step": 14060 + }, + { + "epoch": 0.9760242792109256, + "grad_norm": 1.25, + "learning_rate": 2.553217391304348e-06, + "loss": 0.2424, + "step": 14070 + }, + { + "epoch": 0.9767179709516584, + "grad_norm": 1.140625, + "learning_rate": 2.5514782608695653e-06, + "loss": 0.2583, + "step": 14080 + }, + { + "epoch": 0.9774116626923911, + "grad_norm": 1.21875, + "learning_rate": 2.549739130434783e-06, + "loss": 0.2165, + "step": 14090 + }, + { + "epoch": 0.9781053544331237, + "grad_norm": 1.2734375, + "learning_rate": 2.5480000000000004e-06, + "loss": 0.2442, + "step": 14100 + }, + { + "epoch": 0.9787990461738565, + "grad_norm": 1.21875, + "learning_rate": 2.5462608695652174e-06, + "loss": 0.3018, + "step": 14110 + }, + { + "epoch": 0.9794927379145892, + "grad_norm": 1.1640625, + "learning_rate": 2.544521739130435e-06, + "loss": 0.2094, + "step": 14120 + }, + { + "epoch": 0.9801864296553219, + "grad_norm": 1.0859375, + "learning_rate": 2.5427826086956525e-06, + "loss": 0.2546, + "step": 14130 + }, + { + "epoch": 0.9808801213960546, + "grad_norm": 1.3515625, + "learning_rate": 2.5410434782608695e-06, + "loss": 0.2204, + "step": 14140 + }, + { + "epoch": 0.9815738131367874, + "grad_norm": 1.3515625, + "learning_rate": 2.539304347826087e-06, + "loss": 0.176, + "step": 14150 + }, + { + "epoch": 0.98226750487752, + "grad_norm": 1.015625, + "learning_rate": 2.5375652173913046e-06, + "loss": 0.2255, + "step": 14160 + }, + { + "epoch": 0.9829611966182528, + "grad_norm": 0.9765625, + "learning_rate": 2.535826086956522e-06, + "loss": 0.228, + "step": 14170 + }, + { + "epoch": 0.9836548883589855, + "grad_norm": 1.421875, + "learning_rate": 2.534086956521739e-06, + "loss": 0.2245, + "step": 14180 + }, + { + "epoch": 0.9843485800997182, + "grad_norm": 1.0703125, + "learning_rate": 2.5323478260869568e-06, + "loss": 0.2602, + "step": 14190 + }, + { + "epoch": 0.9850422718404509, + "grad_norm": 1.453125, + "learning_rate": 2.530608695652174e-06, + "loss": 0.3022, + "step": 14200 + }, + { + "epoch": 0.9857359635811836, + "grad_norm": 1.2421875, + "learning_rate": 2.5288695652173915e-06, + "loss": 0.2025, + "step": 14210 + }, + { + "epoch": 0.9864296553219163, + "grad_norm": 1.140625, + "learning_rate": 2.527130434782609e-06, + "loss": 0.2147, + "step": 14220 + }, + { + "epoch": 0.987123347062649, + "grad_norm": 1.28125, + "learning_rate": 2.5253913043478262e-06, + "loss": 0.2519, + "step": 14230 + }, + { + "epoch": 0.9878170388033818, + "grad_norm": 1.15625, + "learning_rate": 2.5236521739130436e-06, + "loss": 0.2693, + "step": 14240 + }, + { + "epoch": 0.9885107305441144, + "grad_norm": 0.90625, + "learning_rate": 2.5219130434782614e-06, + "loss": 0.246, + "step": 14250 + }, + { + "epoch": 0.9892044222848472, + "grad_norm": 1.125, + "learning_rate": 2.5201739130434784e-06, + "loss": 0.235, + "step": 14260 + }, + { + "epoch": 0.9898981140255799, + "grad_norm": 1.1640625, + "learning_rate": 2.5184347826086957e-06, + "loss": 0.2056, + "step": 14270 + }, + { + "epoch": 0.9905918057663126, + "grad_norm": 1.2578125, + "learning_rate": 2.5166956521739135e-06, + "loss": 0.2063, + "step": 14280 + }, + { + "epoch": 0.9912854975070453, + "grad_norm": 1.484375, + "learning_rate": 2.514956521739131e-06, + "loss": 0.2341, + "step": 14290 + }, + { + "epoch": 0.9919791892477781, + "grad_norm": 1.3671875, + "learning_rate": 2.513217391304348e-06, + "loss": 0.21, + "step": 14300 + }, + { + "epoch": 0.9926728809885107, + "grad_norm": 1.4140625, + "learning_rate": 2.5114782608695652e-06, + "loss": 0.2236, + "step": 14310 + }, + { + "epoch": 0.9933665727292434, + "grad_norm": 1.2109375, + "learning_rate": 2.509739130434783e-06, + "loss": 0.2257, + "step": 14320 + }, + { + "epoch": 0.9940602644699762, + "grad_norm": 1.4453125, + "learning_rate": 2.5080000000000004e-06, + "loss": 0.2403, + "step": 14330 + }, + { + "epoch": 0.9947539562107088, + "grad_norm": 1.0390625, + "learning_rate": 2.5062608695652173e-06, + "loss": 0.2051, + "step": 14340 + }, + { + "epoch": 0.9954476479514416, + "grad_norm": 0.94921875, + "learning_rate": 2.504521739130435e-06, + "loss": 0.2264, + "step": 14350 + }, + { + "epoch": 0.9961413396921743, + "grad_norm": 1.1953125, + "learning_rate": 2.5027826086956525e-06, + "loss": 0.2702, + "step": 14360 + }, + { + "epoch": 0.996835031432907, + "grad_norm": 0.92578125, + "learning_rate": 2.50104347826087e-06, + "loss": 0.2241, + "step": 14370 + }, + { + "epoch": 0.9975287231736397, + "grad_norm": 1.0625, + "learning_rate": 2.4993043478260872e-06, + "loss": 0.2191, + "step": 14380 + }, + { + "epoch": 0.9982224149143725, + "grad_norm": 1.5546875, + "learning_rate": 2.4975652173913046e-06, + "loss": 0.3005, + "step": 14390 + }, + { + "epoch": 0.9989161066551051, + "grad_norm": 1.2578125, + "learning_rate": 2.495826086956522e-06, + "loss": 0.3011, + "step": 14400 + }, + { + "epoch": 0.9996097983958379, + "grad_norm": 1.7109375, + "learning_rate": 2.4940869565217394e-06, + "loss": 0.2558, + "step": 14410 + }, + { + "epoch": 1.0002774766962932, + "grad_norm": 1.484375, + "learning_rate": 2.4923478260869567e-06, + "loss": 0.2414, + "step": 14420 + }, + { + "epoch": 1.0009711684370257, + "grad_norm": 1.03125, + "learning_rate": 2.490608695652174e-06, + "loss": 0.2269, + "step": 14430 + }, + { + "epoch": 1.0016648601777585, + "grad_norm": 1.203125, + "learning_rate": 2.4888695652173915e-06, + "loss": 0.2052, + "step": 14440 + }, + { + "epoch": 1.0023585519184912, + "grad_norm": 1.09375, + "learning_rate": 2.487130434782609e-06, + "loss": 0.2343, + "step": 14450 + }, + { + "epoch": 1.003052243659224, + "grad_norm": 1.046875, + "learning_rate": 2.485391304347826e-06, + "loss": 0.2279, + "step": 14460 + }, + { + "epoch": 1.0037459353999567, + "grad_norm": 0.94921875, + "learning_rate": 2.4836521739130436e-06, + "loss": 0.2713, + "step": 14470 + }, + { + "epoch": 1.0044396271406895, + "grad_norm": 1.2109375, + "learning_rate": 2.481913043478261e-06, + "loss": 0.2522, + "step": 14480 + }, + { + "epoch": 1.005133318881422, + "grad_norm": 1.3046875, + "learning_rate": 2.4801739130434783e-06, + "loss": 0.2336, + "step": 14490 + }, + { + "epoch": 1.0058270106221547, + "grad_norm": 1.140625, + "learning_rate": 2.4784347826086957e-06, + "loss": 0.228, + "step": 14500 + }, + { + "epoch": 1.0065207023628875, + "grad_norm": 1.1640625, + "learning_rate": 2.4766956521739135e-06, + "loss": 0.2082, + "step": 14510 + }, + { + "epoch": 1.0072143941036202, + "grad_norm": 1.25, + "learning_rate": 2.4749565217391304e-06, + "loss": 0.2847, + "step": 14520 + }, + { + "epoch": 1.007908085844353, + "grad_norm": 0.9453125, + "learning_rate": 2.4732173913043482e-06, + "loss": 0.2132, + "step": 14530 + }, + { + "epoch": 1.0086017775850857, + "grad_norm": 1.3671875, + "learning_rate": 2.471478260869565e-06, + "loss": 0.2656, + "step": 14540 + }, + { + "epoch": 1.0092954693258183, + "grad_norm": 1.3046875, + "learning_rate": 2.469739130434783e-06, + "loss": 0.2238, + "step": 14550 + }, + { + "epoch": 1.009989161066551, + "grad_norm": 1.6640625, + "learning_rate": 2.468e-06, + "loss": 0.3057, + "step": 14560 + }, + { + "epoch": 1.0106828528072838, + "grad_norm": 1.1484375, + "learning_rate": 2.4662608695652177e-06, + "loss": 0.2365, + "step": 14570 + }, + { + "epoch": 1.0113765445480165, + "grad_norm": 0.98046875, + "learning_rate": 2.464521739130435e-06, + "loss": 0.2215, + "step": 14580 + }, + { + "epoch": 1.0120702362887493, + "grad_norm": 1.4921875, + "learning_rate": 2.4627826086956525e-06, + "loss": 0.2266, + "step": 14590 + }, + { + "epoch": 1.012763928029482, + "grad_norm": 2.265625, + "learning_rate": 2.46104347826087e-06, + "loss": 0.2978, + "step": 14600 + }, + { + "epoch": 1.0134576197702145, + "grad_norm": 1.328125, + "learning_rate": 2.459304347826087e-06, + "loss": 0.2397, + "step": 14610 + }, + { + "epoch": 1.0141513115109473, + "grad_norm": 1.2109375, + "learning_rate": 2.4575652173913046e-06, + "loss": 0.3155, + "step": 14620 + }, + { + "epoch": 1.01484500325168, + "grad_norm": 1.3828125, + "learning_rate": 2.455826086956522e-06, + "loss": 0.2873, + "step": 14630 + }, + { + "epoch": 1.0155386949924128, + "grad_norm": 1.7734375, + "learning_rate": 2.4540869565217393e-06, + "loss": 0.2612, + "step": 14640 + }, + { + "epoch": 1.0162323867331455, + "grad_norm": 1.1171875, + "learning_rate": 2.4523478260869567e-06, + "loss": 0.2231, + "step": 14650 + }, + { + "epoch": 1.0169260784738783, + "grad_norm": 1.5234375, + "learning_rate": 2.450608695652174e-06, + "loss": 0.3073, + "step": 14660 + }, + { + "epoch": 1.0176197702146108, + "grad_norm": 1.1875, + "learning_rate": 2.4488695652173914e-06, + "loss": 0.3001, + "step": 14670 + }, + { + "epoch": 1.0183134619553436, + "grad_norm": 1.2109375, + "learning_rate": 2.447130434782609e-06, + "loss": 0.2108, + "step": 14680 + }, + { + "epoch": 1.0190071536960763, + "grad_norm": 1.1953125, + "learning_rate": 2.4453913043478266e-06, + "loss": 0.2234, + "step": 14690 + }, + { + "epoch": 1.019700845436809, + "grad_norm": 1.25, + "learning_rate": 2.4436521739130435e-06, + "loss": 0.235, + "step": 14700 + }, + { + "epoch": 1.0203945371775418, + "grad_norm": 1.1171875, + "learning_rate": 2.4419130434782613e-06, + "loss": 0.2167, + "step": 14710 + }, + { + "epoch": 1.0210882289182743, + "grad_norm": 0.97265625, + "learning_rate": 2.4401739130434783e-06, + "loss": 0.2383, + "step": 14720 + }, + { + "epoch": 1.021781920659007, + "grad_norm": 1.375, + "learning_rate": 2.438434782608696e-06, + "loss": 0.2186, + "step": 14730 + }, + { + "epoch": 1.0224756123997398, + "grad_norm": 1.1015625, + "learning_rate": 2.4366956521739134e-06, + "loss": 0.2027, + "step": 14740 + }, + { + "epoch": 1.0231693041404726, + "grad_norm": 1.078125, + "learning_rate": 2.4349565217391304e-06, + "loss": 0.2397, + "step": 14750 + }, + { + "epoch": 1.0238629958812053, + "grad_norm": 0.9921875, + "learning_rate": 2.433217391304348e-06, + "loss": 0.2239, + "step": 14760 + }, + { + "epoch": 1.024556687621938, + "grad_norm": 1.21875, + "learning_rate": 2.431478260869565e-06, + "loss": 0.2252, + "step": 14770 + }, + { + "epoch": 1.0252503793626706, + "grad_norm": 1.296875, + "learning_rate": 2.429739130434783e-06, + "loss": 0.2448, + "step": 14780 + }, + { + "epoch": 1.0259440711034034, + "grad_norm": 1.375, + "learning_rate": 2.428e-06, + "loss": 0.2628, + "step": 14790 + }, + { + "epoch": 1.026637762844136, + "grad_norm": 1.5703125, + "learning_rate": 2.4262608695652177e-06, + "loss": 0.2583, + "step": 14800 + }, + { + "epoch": 1.0273314545848689, + "grad_norm": 1.171875, + "learning_rate": 2.424521739130435e-06, + "loss": 0.2329, + "step": 14810 + }, + { + "epoch": 1.0280251463256016, + "grad_norm": 1.171875, + "learning_rate": 2.4227826086956524e-06, + "loss": 0.2318, + "step": 14820 + }, + { + "epoch": 1.0287188380663344, + "grad_norm": 1.46875, + "learning_rate": 2.4210434782608698e-06, + "loss": 0.2351, + "step": 14830 + }, + { + "epoch": 1.0294125298070669, + "grad_norm": 1.078125, + "learning_rate": 2.419304347826087e-06, + "loss": 0.2533, + "step": 14840 + }, + { + "epoch": 1.0301062215477996, + "grad_norm": 1.6328125, + "learning_rate": 2.4175652173913045e-06, + "loss": 0.2254, + "step": 14850 + }, + { + "epoch": 1.0307999132885324, + "grad_norm": 0.90234375, + "learning_rate": 2.415826086956522e-06, + "loss": 0.2149, + "step": 14860 + }, + { + "epoch": 1.0314936050292651, + "grad_norm": 1.4453125, + "learning_rate": 2.4140869565217393e-06, + "loss": 0.2698, + "step": 14870 + }, + { + "epoch": 1.0321872967699979, + "grad_norm": 1.40625, + "learning_rate": 2.4123478260869566e-06, + "loss": 0.1998, + "step": 14880 + }, + { + "epoch": 1.0328809885107306, + "grad_norm": 1.1484375, + "learning_rate": 2.410608695652174e-06, + "loss": 0.241, + "step": 14890 + }, + { + "epoch": 1.0335746802514632, + "grad_norm": 1.0859375, + "learning_rate": 2.4088695652173914e-06, + "loss": 0.2182, + "step": 14900 + }, + { + "epoch": 1.034268371992196, + "grad_norm": 1.3359375, + "learning_rate": 2.4071304347826088e-06, + "loss": 0.2561, + "step": 14910 + }, + { + "epoch": 1.0349620637329286, + "grad_norm": 1.0859375, + "learning_rate": 2.4053913043478265e-06, + "loss": 0.2321, + "step": 14920 + }, + { + "epoch": 1.0356557554736614, + "grad_norm": 1.3828125, + "learning_rate": 2.4036521739130435e-06, + "loss": 0.2393, + "step": 14930 + }, + { + "epoch": 1.0363494472143941, + "grad_norm": 0.8984375, + "learning_rate": 2.4019130434782613e-06, + "loss": 0.23, + "step": 14940 + }, + { + "epoch": 1.037043138955127, + "grad_norm": 1.125, + "learning_rate": 2.4001739130434782e-06, + "loss": 0.2397, + "step": 14950 + }, + { + "epoch": 1.0377368306958594, + "grad_norm": 1.078125, + "learning_rate": 2.398434782608696e-06, + "loss": 0.3271, + "step": 14960 + }, + { + "epoch": 1.0384305224365922, + "grad_norm": 1.1953125, + "learning_rate": 2.3966956521739134e-06, + "loss": 0.2324, + "step": 14970 + }, + { + "epoch": 1.039124214177325, + "grad_norm": 1.265625, + "learning_rate": 2.3949565217391308e-06, + "loss": 0.2342, + "step": 14980 + }, + { + "epoch": 1.0398179059180577, + "grad_norm": 1.0625, + "learning_rate": 2.393217391304348e-06, + "loss": 0.2481, + "step": 14990 + }, + { + "epoch": 1.0405115976587904, + "grad_norm": 1.1875, + "learning_rate": 2.3914782608695655e-06, + "loss": 0.2157, + "step": 15000 + }, + { + "epoch": 1.0412052893995232, + "grad_norm": 1.515625, + "learning_rate": 2.389739130434783e-06, + "loss": 0.2339, + "step": 15010 + }, + { + "epoch": 1.0418989811402557, + "grad_norm": 1.1484375, + "learning_rate": 2.3880000000000003e-06, + "loss": 0.2366, + "step": 15020 + }, + { + "epoch": 1.0425926728809884, + "grad_norm": 1.0859375, + "learning_rate": 2.3862608695652176e-06, + "loss": 0.227, + "step": 15030 + }, + { + "epoch": 1.0432863646217212, + "grad_norm": 1.1796875, + "learning_rate": 2.384521739130435e-06, + "loss": 0.2355, + "step": 15040 + }, + { + "epoch": 1.043980056362454, + "grad_norm": 1.25, + "learning_rate": 2.3827826086956524e-06, + "loss": 0.2874, + "step": 15050 + }, + { + "epoch": 1.0446737481031867, + "grad_norm": 1.0703125, + "learning_rate": 2.3810434782608697e-06, + "loss": 0.235, + "step": 15060 + }, + { + "epoch": 1.0453674398439194, + "grad_norm": 1.0546875, + "learning_rate": 2.379304347826087e-06, + "loss": 0.2298, + "step": 15070 + }, + { + "epoch": 1.046061131584652, + "grad_norm": 1.328125, + "learning_rate": 2.3775652173913045e-06, + "loss": 0.2549, + "step": 15080 + }, + { + "epoch": 1.0467548233253847, + "grad_norm": 1.0859375, + "learning_rate": 2.375826086956522e-06, + "loss": 0.1974, + "step": 15090 + }, + { + "epoch": 1.0474485150661175, + "grad_norm": 1.359375, + "learning_rate": 2.3740869565217392e-06, + "loss": 0.2338, + "step": 15100 + }, + { + "epoch": 1.0481422068068502, + "grad_norm": 1.640625, + "learning_rate": 2.3723478260869566e-06, + "loss": 0.2567, + "step": 15110 + }, + { + "epoch": 1.048835898547583, + "grad_norm": 1.453125, + "learning_rate": 2.370608695652174e-06, + "loss": 0.2979, + "step": 15120 + }, + { + "epoch": 1.0495295902883157, + "grad_norm": 1.328125, + "learning_rate": 2.3688695652173913e-06, + "loss": 0.2831, + "step": 15130 + }, + { + "epoch": 1.0502232820290482, + "grad_norm": 1.15625, + "learning_rate": 2.3671304347826087e-06, + "loss": 0.2648, + "step": 15140 + }, + { + "epoch": 1.050916973769781, + "grad_norm": 1.1015625, + "learning_rate": 2.3653913043478265e-06, + "loss": 0.2652, + "step": 15150 + }, + { + "epoch": 1.0516106655105137, + "grad_norm": 0.89453125, + "learning_rate": 2.3636521739130435e-06, + "loss": 0.1974, + "step": 15160 + }, + { + "epoch": 1.0523043572512465, + "grad_norm": 1.0, + "learning_rate": 2.3619130434782613e-06, + "loss": 0.2288, + "step": 15170 + }, + { + "epoch": 1.0529980489919792, + "grad_norm": 1.0546875, + "learning_rate": 2.360173913043478e-06, + "loss": 0.2257, + "step": 15180 + }, + { + "epoch": 1.053691740732712, + "grad_norm": 1.0078125, + "learning_rate": 2.358434782608696e-06, + "loss": 0.2343, + "step": 15190 + }, + { + "epoch": 1.0543854324734445, + "grad_norm": 1.78125, + "learning_rate": 2.3566956521739134e-06, + "loss": 0.2327, + "step": 15200 + }, + { + "epoch": 1.0550791242141773, + "grad_norm": 1.1875, + "learning_rate": 2.3549565217391307e-06, + "loss": 0.2529, + "step": 15210 + }, + { + "epoch": 1.05577281595491, + "grad_norm": 1.09375, + "learning_rate": 2.353217391304348e-06, + "loss": 0.2498, + "step": 15220 + }, + { + "epoch": 1.0564665076956428, + "grad_norm": 1.046875, + "learning_rate": 2.3514782608695655e-06, + "loss": 0.2349, + "step": 15230 + }, + { + "epoch": 1.0571601994363755, + "grad_norm": 0.9765625, + "learning_rate": 2.349739130434783e-06, + "loss": 0.2656, + "step": 15240 + }, + { + "epoch": 1.0578538911771083, + "grad_norm": 1.25, + "learning_rate": 2.3480000000000002e-06, + "loss": 0.2531, + "step": 15250 + }, + { + "epoch": 1.0585475829178408, + "grad_norm": 1.6171875, + "learning_rate": 2.3462608695652176e-06, + "loss": 0.2521, + "step": 15260 + }, + { + "epoch": 1.0592412746585735, + "grad_norm": 1.375, + "learning_rate": 2.344521739130435e-06, + "loss": 0.266, + "step": 15270 + }, + { + "epoch": 1.0599349663993063, + "grad_norm": 1.171875, + "learning_rate": 2.3427826086956523e-06, + "loss": 0.2564, + "step": 15280 + }, + { + "epoch": 1.060628658140039, + "grad_norm": 1.1171875, + "learning_rate": 2.3410434782608697e-06, + "loss": 0.2466, + "step": 15290 + }, + { + "epoch": 1.0613223498807718, + "grad_norm": 1.125, + "learning_rate": 2.339304347826087e-06, + "loss": 0.2282, + "step": 15300 + }, + { + "epoch": 1.0620160416215045, + "grad_norm": 1.1796875, + "learning_rate": 2.3375652173913044e-06, + "loss": 0.3059, + "step": 15310 + }, + { + "epoch": 1.062709733362237, + "grad_norm": 1.265625, + "learning_rate": 2.335826086956522e-06, + "loss": 0.3653, + "step": 15320 + }, + { + "epoch": 1.0634034251029698, + "grad_norm": 1.265625, + "learning_rate": 2.3340869565217396e-06, + "loss": 0.228, + "step": 15330 + }, + { + "epoch": 1.0640971168437026, + "grad_norm": 1.0390625, + "learning_rate": 2.3323478260869566e-06, + "loss": 0.2498, + "step": 15340 + }, + { + "epoch": 1.0647908085844353, + "grad_norm": 1.0078125, + "learning_rate": 2.3306086956521744e-06, + "loss": 0.2772, + "step": 15350 + }, + { + "epoch": 1.065484500325168, + "grad_norm": 1.1015625, + "learning_rate": 2.3288695652173913e-06, + "loss": 0.2754, + "step": 15360 + }, + { + "epoch": 1.0661781920659008, + "grad_norm": 1.140625, + "learning_rate": 2.3271304347826087e-06, + "loss": 0.2358, + "step": 15370 + }, + { + "epoch": 1.0668718838066333, + "grad_norm": 1.265625, + "learning_rate": 2.3253913043478265e-06, + "loss": 0.3299, + "step": 15380 + }, + { + "epoch": 1.067565575547366, + "grad_norm": 1.1171875, + "learning_rate": 2.3236521739130434e-06, + "loss": 0.2327, + "step": 15390 + }, + { + "epoch": 1.0682592672880988, + "grad_norm": 1.1328125, + "learning_rate": 2.321913043478261e-06, + "loss": 0.2344, + "step": 15400 + }, + { + "epoch": 1.0689529590288316, + "grad_norm": 1.3203125, + "learning_rate": 2.320173913043478e-06, + "loss": 0.2527, + "step": 15410 + }, + { + "epoch": 1.0696466507695643, + "grad_norm": 1.625, + "learning_rate": 2.318434782608696e-06, + "loss": 0.2592, + "step": 15420 + }, + { + "epoch": 1.070340342510297, + "grad_norm": 1.3203125, + "learning_rate": 2.3166956521739133e-06, + "loss": 0.271, + "step": 15430 + }, + { + "epoch": 1.0710340342510296, + "grad_norm": 1.2109375, + "learning_rate": 2.3149565217391307e-06, + "loss": 0.2486, + "step": 15440 + }, + { + "epoch": 1.0717277259917624, + "grad_norm": 1.09375, + "learning_rate": 2.313217391304348e-06, + "loss": 0.2046, + "step": 15450 + }, + { + "epoch": 1.072421417732495, + "grad_norm": 1.3984375, + "learning_rate": 2.3114782608695654e-06, + "loss": 0.2947, + "step": 15460 + }, + { + "epoch": 1.0731151094732279, + "grad_norm": 1.078125, + "learning_rate": 2.309739130434783e-06, + "loss": 0.2328, + "step": 15470 + }, + { + "epoch": 1.0738088012139606, + "grad_norm": 1.2734375, + "learning_rate": 2.308e-06, + "loss": 0.2694, + "step": 15480 + }, + { + "epoch": 1.0745024929546934, + "grad_norm": 1.171875, + "learning_rate": 2.3062608695652176e-06, + "loss": 0.2282, + "step": 15490 + }, + { + "epoch": 1.0751961846954259, + "grad_norm": 1.15625, + "learning_rate": 2.304521739130435e-06, + "loss": 0.2505, + "step": 15500 + }, + { + "epoch": 1.0758898764361586, + "grad_norm": 1.34375, + "learning_rate": 2.3027826086956523e-06, + "loss": 0.219, + "step": 15510 + }, + { + "epoch": 1.0765835681768914, + "grad_norm": 1.2578125, + "learning_rate": 2.3010434782608697e-06, + "loss": 0.2595, + "step": 15520 + }, + { + "epoch": 1.0772772599176241, + "grad_norm": 0.9609375, + "learning_rate": 2.299304347826087e-06, + "loss": 0.2773, + "step": 15530 + }, + { + "epoch": 1.0779709516583569, + "grad_norm": 1.0546875, + "learning_rate": 2.2975652173913044e-06, + "loss": 0.2845, + "step": 15540 + }, + { + "epoch": 1.0786646433990896, + "grad_norm": 1.3203125, + "learning_rate": 2.2958260869565218e-06, + "loss": 0.2097, + "step": 15550 + }, + { + "epoch": 1.0793583351398222, + "grad_norm": 1.390625, + "learning_rate": 2.2940869565217396e-06, + "loss": 0.2575, + "step": 15560 + }, + { + "epoch": 1.080052026880555, + "grad_norm": 1.421875, + "learning_rate": 2.2923478260869565e-06, + "loss": 0.2349, + "step": 15570 + }, + { + "epoch": 1.0807457186212877, + "grad_norm": 1.203125, + "learning_rate": 2.2906086956521743e-06, + "loss": 0.2144, + "step": 15580 + }, + { + "epoch": 1.0814394103620204, + "grad_norm": 1.5234375, + "learning_rate": 2.2888695652173913e-06, + "loss": 0.2695, + "step": 15590 + }, + { + "epoch": 1.0821331021027532, + "grad_norm": 1.171875, + "learning_rate": 2.287130434782609e-06, + "loss": 0.2336, + "step": 15600 + }, + { + "epoch": 1.082826793843486, + "grad_norm": 1.046875, + "learning_rate": 2.2853913043478264e-06, + "loss": 0.2254, + "step": 15610 + }, + { + "epoch": 1.0835204855842184, + "grad_norm": 1.0546875, + "learning_rate": 2.283652173913044e-06, + "loss": 0.1932, + "step": 15620 + }, + { + "epoch": 1.0842141773249512, + "grad_norm": 1.40625, + "learning_rate": 2.281913043478261e-06, + "loss": 0.2222, + "step": 15630 + }, + { + "epoch": 1.084907869065684, + "grad_norm": 1.1640625, + "learning_rate": 2.2801739130434785e-06, + "loss": 0.197, + "step": 15640 + }, + { + "epoch": 1.0856015608064167, + "grad_norm": 1.203125, + "learning_rate": 2.278434782608696e-06, + "loss": 0.2951, + "step": 15650 + }, + { + "epoch": 1.0862952525471494, + "grad_norm": 1.1953125, + "learning_rate": 2.2766956521739133e-06, + "loss": 0.2323, + "step": 15660 + }, + { + "epoch": 1.0869889442878822, + "grad_norm": 1.2109375, + "learning_rate": 2.2749565217391307e-06, + "loss": 0.2331, + "step": 15670 + }, + { + "epoch": 1.0876826360286147, + "grad_norm": 1.0234375, + "learning_rate": 2.273217391304348e-06, + "loss": 0.2251, + "step": 15680 + }, + { + "epoch": 1.0883763277693475, + "grad_norm": 1.2265625, + "learning_rate": 2.2714782608695654e-06, + "loss": 0.2431, + "step": 15690 + }, + { + "epoch": 1.0890700195100802, + "grad_norm": 1.2109375, + "learning_rate": 2.2697391304347828e-06, + "loss": 0.2459, + "step": 15700 + }, + { + "epoch": 1.089763711250813, + "grad_norm": 1.046875, + "learning_rate": 2.268e-06, + "loss": 0.1969, + "step": 15710 + }, + { + "epoch": 1.0904574029915457, + "grad_norm": 1.3125, + "learning_rate": 2.2662608695652175e-06, + "loss": 0.2647, + "step": 15720 + }, + { + "epoch": 1.0911510947322784, + "grad_norm": 1.0546875, + "learning_rate": 2.264521739130435e-06, + "loss": 0.2552, + "step": 15730 + }, + { + "epoch": 1.091844786473011, + "grad_norm": 1.0625, + "learning_rate": 2.2627826086956523e-06, + "loss": 0.2258, + "step": 15740 + }, + { + "epoch": 1.0925384782137437, + "grad_norm": 1.203125, + "learning_rate": 2.2610434782608696e-06, + "loss": 0.268, + "step": 15750 + }, + { + "epoch": 1.0932321699544765, + "grad_norm": 1.09375, + "learning_rate": 2.259304347826087e-06, + "loss": 0.2232, + "step": 15760 + }, + { + "epoch": 1.0939258616952092, + "grad_norm": 1.625, + "learning_rate": 2.2575652173913044e-06, + "loss": 0.2888, + "step": 15770 + }, + { + "epoch": 1.094619553435942, + "grad_norm": 1.03125, + "learning_rate": 2.2558260869565217e-06, + "loss": 0.2236, + "step": 15780 + }, + { + "epoch": 1.0953132451766745, + "grad_norm": 1.3515625, + "learning_rate": 2.2540869565217395e-06, + "loss": 0.2405, + "step": 15790 + }, + { + "epoch": 1.0960069369174072, + "grad_norm": 1.484375, + "learning_rate": 2.2523478260869565e-06, + "loss": 0.2135, + "step": 15800 + }, + { + "epoch": 1.09670062865814, + "grad_norm": 1.171875, + "learning_rate": 2.2506086956521743e-06, + "loss": 0.2673, + "step": 15810 + }, + { + "epoch": 1.0973943203988727, + "grad_norm": 1.0, + "learning_rate": 2.2488695652173912e-06, + "loss": 0.2108, + "step": 15820 + }, + { + "epoch": 1.0980880121396055, + "grad_norm": 1.3046875, + "learning_rate": 2.247130434782609e-06, + "loss": 0.2335, + "step": 15830 + }, + { + "epoch": 1.0987817038803382, + "grad_norm": 1.3828125, + "learning_rate": 2.2453913043478264e-06, + "loss": 0.2494, + "step": 15840 + }, + { + "epoch": 1.099475395621071, + "grad_norm": 1.8125, + "learning_rate": 2.2436521739130438e-06, + "loss": 0.371, + "step": 15850 + }, + { + "epoch": 1.1001690873618035, + "grad_norm": 1.1171875, + "learning_rate": 2.241913043478261e-06, + "loss": 0.3074, + "step": 15860 + }, + { + "epoch": 1.1008627791025363, + "grad_norm": 1.15625, + "learning_rate": 2.2401739130434785e-06, + "loss": 0.2505, + "step": 15870 + }, + { + "epoch": 1.101556470843269, + "grad_norm": 1.0859375, + "learning_rate": 2.238434782608696e-06, + "loss": 0.2206, + "step": 15880 + }, + { + "epoch": 1.1022501625840018, + "grad_norm": 1.1015625, + "learning_rate": 2.2366956521739132e-06, + "loss": 0.28, + "step": 15890 + }, + { + "epoch": 1.1029438543247345, + "grad_norm": 1.1484375, + "learning_rate": 2.2349565217391306e-06, + "loss": 0.2172, + "step": 15900 + }, + { + "epoch": 1.103637546065467, + "grad_norm": 1.140625, + "learning_rate": 2.233217391304348e-06, + "loss": 0.2868, + "step": 15910 + }, + { + "epoch": 1.1043312378061998, + "grad_norm": 0.91796875, + "learning_rate": 2.2314782608695654e-06, + "loss": 0.2454, + "step": 15920 + }, + { + "epoch": 1.1050249295469325, + "grad_norm": 1.4609375, + "learning_rate": 2.2297391304347827e-06, + "loss": 0.2951, + "step": 15930 + }, + { + "epoch": 1.1057186212876653, + "grad_norm": 1.1640625, + "learning_rate": 2.228e-06, + "loss": 0.2183, + "step": 15940 + }, + { + "epoch": 1.106412313028398, + "grad_norm": 1.0546875, + "learning_rate": 2.226260869565218e-06, + "loss": 0.2182, + "step": 15950 + }, + { + "epoch": 1.1071060047691308, + "grad_norm": 1.1875, + "learning_rate": 2.224521739130435e-06, + "loss": 0.1977, + "step": 15960 + }, + { + "epoch": 1.1077996965098635, + "grad_norm": 1.4140625, + "learning_rate": 2.2227826086956526e-06, + "loss": 0.2098, + "step": 15970 + }, + { + "epoch": 1.108493388250596, + "grad_norm": 0.9375, + "learning_rate": 2.2210434782608696e-06, + "loss": 0.2466, + "step": 15980 + }, + { + "epoch": 1.1091870799913288, + "grad_norm": 1.0390625, + "learning_rate": 2.219304347826087e-06, + "loss": 0.2097, + "step": 15990 + }, + { + "epoch": 1.1098807717320616, + "grad_norm": 1.125, + "learning_rate": 2.2175652173913043e-06, + "loss": 0.2058, + "step": 16000 + }, + { + "epoch": 1.1105744634727943, + "grad_norm": 1.703125, + "learning_rate": 2.2158260869565217e-06, + "loss": 0.2597, + "step": 16010 + }, + { + "epoch": 1.111268155213527, + "grad_norm": 1.28125, + "learning_rate": 2.2140869565217395e-06, + "loss": 0.2305, + "step": 16020 + }, + { + "epoch": 1.1119618469542596, + "grad_norm": 1.2265625, + "learning_rate": 2.2123478260869564e-06, + "loss": 0.2129, + "step": 16030 + }, + { + "epoch": 1.1126555386949923, + "grad_norm": 0.984375, + "learning_rate": 2.2106086956521742e-06, + "loss": 0.2537, + "step": 16040 + }, + { + "epoch": 1.113349230435725, + "grad_norm": 1.0703125, + "learning_rate": 2.208869565217391e-06, + "loss": 0.2468, + "step": 16050 + }, + { + "epoch": 1.1140429221764578, + "grad_norm": 1.078125, + "learning_rate": 2.207130434782609e-06, + "loss": 0.2024, + "step": 16060 + }, + { + "epoch": 1.1147366139171906, + "grad_norm": 1.4375, + "learning_rate": 2.2053913043478263e-06, + "loss": 0.2803, + "step": 16070 + }, + { + "epoch": 1.1154303056579233, + "grad_norm": 1.2421875, + "learning_rate": 2.2036521739130437e-06, + "loss": 0.2135, + "step": 16080 + }, + { + "epoch": 1.116123997398656, + "grad_norm": 1.0078125, + "learning_rate": 2.201913043478261e-06, + "loss": 0.2603, + "step": 16090 + }, + { + "epoch": 1.1168176891393886, + "grad_norm": 1.0703125, + "learning_rate": 2.2001739130434785e-06, + "loss": 0.2369, + "step": 16100 + }, + { + "epoch": 1.1175113808801214, + "grad_norm": 1.1953125, + "learning_rate": 2.198434782608696e-06, + "loss": 0.2354, + "step": 16110 + }, + { + "epoch": 1.1182050726208541, + "grad_norm": 1.109375, + "learning_rate": 2.196695652173913e-06, + "loss": 0.1965, + "step": 16120 + }, + { + "epoch": 1.1188987643615869, + "grad_norm": 1.0546875, + "learning_rate": 2.1949565217391306e-06, + "loss": 0.2309, + "step": 16130 + }, + { + "epoch": 1.1195924561023196, + "grad_norm": 1.140625, + "learning_rate": 2.193217391304348e-06, + "loss": 0.244, + "step": 16140 + }, + { + "epoch": 1.1202861478430521, + "grad_norm": 1.3984375, + "learning_rate": 2.1914782608695653e-06, + "loss": 0.2881, + "step": 16150 + }, + { + "epoch": 1.1209798395837849, + "grad_norm": 1.109375, + "learning_rate": 2.1897391304347827e-06, + "loss": 0.2484, + "step": 16160 + }, + { + "epoch": 1.1216735313245176, + "grad_norm": 1.0703125, + "learning_rate": 2.188e-06, + "loss": 0.2577, + "step": 16170 + }, + { + "epoch": 1.1223672230652504, + "grad_norm": 1.25, + "learning_rate": 2.186260869565218e-06, + "loss": 0.2607, + "step": 16180 + }, + { + "epoch": 1.1230609148059831, + "grad_norm": 1.7109375, + "learning_rate": 2.184521739130435e-06, + "loss": 0.2481, + "step": 16190 + }, + { + "epoch": 1.1237546065467159, + "grad_norm": 1.0234375, + "learning_rate": 2.1827826086956526e-06, + "loss": 0.2204, + "step": 16200 + }, + { + "epoch": 1.1244482982874484, + "grad_norm": 1.328125, + "learning_rate": 2.1810434782608695e-06, + "loss": 0.2596, + "step": 16210 + }, + { + "epoch": 1.1251419900281812, + "grad_norm": 1.109375, + "learning_rate": 2.1793043478260873e-06, + "loss": 0.2827, + "step": 16220 + }, + { + "epoch": 1.125835681768914, + "grad_norm": 1.109375, + "learning_rate": 2.1775652173913047e-06, + "loss": 0.2533, + "step": 16230 + }, + { + "epoch": 1.1265293735096467, + "grad_norm": 0.953125, + "learning_rate": 2.175826086956522e-06, + "loss": 0.2578, + "step": 16240 + }, + { + "epoch": 1.1272230652503794, + "grad_norm": 1.125, + "learning_rate": 2.1740869565217395e-06, + "loss": 0.2142, + "step": 16250 + }, + { + "epoch": 1.1279167569911122, + "grad_norm": 1.1953125, + "learning_rate": 2.172347826086957e-06, + "loss": 0.2088, + "step": 16260 + }, + { + "epoch": 1.1286104487318447, + "grad_norm": 1.109375, + "learning_rate": 2.170608695652174e-06, + "loss": 0.2214, + "step": 16270 + }, + { + "epoch": 1.1293041404725774, + "grad_norm": 1.0703125, + "learning_rate": 2.1688695652173916e-06, + "loss": 0.2313, + "step": 16280 + }, + { + "epoch": 1.1299978322133102, + "grad_norm": 1.0859375, + "learning_rate": 2.167130434782609e-06, + "loss": 0.2488, + "step": 16290 + }, + { + "epoch": 1.130691523954043, + "grad_norm": 1.1171875, + "learning_rate": 2.1653913043478263e-06, + "loss": 0.2474, + "step": 16300 + }, + { + "epoch": 1.1313852156947757, + "grad_norm": 1.2421875, + "learning_rate": 2.1636521739130437e-06, + "loss": 0.2476, + "step": 16310 + }, + { + "epoch": 1.1320789074355084, + "grad_norm": 1.03125, + "learning_rate": 2.161913043478261e-06, + "loss": 0.2353, + "step": 16320 + }, + { + "epoch": 1.1327725991762412, + "grad_norm": 0.796875, + "learning_rate": 2.1601739130434784e-06, + "loss": 0.2156, + "step": 16330 + }, + { + "epoch": 1.1334662909169737, + "grad_norm": 1.1640625, + "learning_rate": 2.158434782608696e-06, + "loss": 0.2233, + "step": 16340 + }, + { + "epoch": 1.1341599826577065, + "grad_norm": 1.375, + "learning_rate": 2.156695652173913e-06, + "loss": 0.2461, + "step": 16350 + }, + { + "epoch": 1.1348536743984392, + "grad_norm": 0.96875, + "learning_rate": 2.1549565217391305e-06, + "loss": 0.2388, + "step": 16360 + }, + { + "epoch": 1.135547366139172, + "grad_norm": 1.140625, + "learning_rate": 2.153217391304348e-06, + "loss": 0.1843, + "step": 16370 + }, + { + "epoch": 1.1362410578799047, + "grad_norm": 1.328125, + "learning_rate": 2.1514782608695653e-06, + "loss": 0.2327, + "step": 16380 + }, + { + "epoch": 1.1369347496206372, + "grad_norm": 1.2421875, + "learning_rate": 2.1497391304347826e-06, + "loss": 0.364, + "step": 16390 + }, + { + "epoch": 1.13762844136137, + "grad_norm": 1.078125, + "learning_rate": 2.148e-06, + "loss": 0.2377, + "step": 16400 + }, + { + "epoch": 1.1383221331021027, + "grad_norm": 1.2734375, + "learning_rate": 2.146260869565218e-06, + "loss": 0.2195, + "step": 16410 + }, + { + "epoch": 1.1390158248428355, + "grad_norm": 1.2265625, + "learning_rate": 2.1445217391304348e-06, + "loss": 0.2238, + "step": 16420 + }, + { + "epoch": 1.1397095165835682, + "grad_norm": 1.1875, + "learning_rate": 2.1427826086956526e-06, + "loss": 0.2505, + "step": 16430 + }, + { + "epoch": 1.140403208324301, + "grad_norm": 1.0625, + "learning_rate": 2.1410434782608695e-06, + "loss": 0.2871, + "step": 16440 + }, + { + "epoch": 1.1410969000650335, + "grad_norm": 0.98828125, + "learning_rate": 2.1393043478260873e-06, + "loss": 0.248, + "step": 16450 + }, + { + "epoch": 1.1417905918057663, + "grad_norm": 0.98828125, + "learning_rate": 2.1375652173913047e-06, + "loss": 0.2396, + "step": 16460 + }, + { + "epoch": 1.142484283546499, + "grad_norm": 1.171875, + "learning_rate": 2.135826086956522e-06, + "loss": 0.225, + "step": 16470 + }, + { + "epoch": 1.1431779752872318, + "grad_norm": 1.0390625, + "learning_rate": 2.1340869565217394e-06, + "loss": 0.2223, + "step": 16480 + }, + { + "epoch": 1.1438716670279645, + "grad_norm": 1.2109375, + "learning_rate": 2.1323478260869568e-06, + "loss": 0.2289, + "step": 16490 + }, + { + "epoch": 1.1445653587686972, + "grad_norm": 1.2734375, + "learning_rate": 2.130608695652174e-06, + "loss": 0.2196, + "step": 16500 + }, + { + "epoch": 1.1452590505094298, + "grad_norm": 1.375, + "learning_rate": 2.1288695652173915e-06, + "loss": 0.3, + "step": 16510 + }, + { + "epoch": 1.1459527422501625, + "grad_norm": 1.078125, + "learning_rate": 2.127130434782609e-06, + "loss": 0.2349, + "step": 16520 + }, + { + "epoch": 1.1466464339908953, + "grad_norm": 1.2578125, + "learning_rate": 2.1253913043478263e-06, + "loss": 0.2661, + "step": 16530 + }, + { + "epoch": 1.147340125731628, + "grad_norm": 1.0546875, + "learning_rate": 2.1236521739130436e-06, + "loss": 0.2279, + "step": 16540 + }, + { + "epoch": 1.1480338174723608, + "grad_norm": 0.9375, + "learning_rate": 2.121913043478261e-06, + "loss": 0.1994, + "step": 16550 + }, + { + "epoch": 1.1487275092130935, + "grad_norm": 1.1328125, + "learning_rate": 2.1201739130434784e-06, + "loss": 0.2183, + "step": 16560 + }, + { + "epoch": 1.149421200953826, + "grad_norm": 1.2109375, + "learning_rate": 2.1184347826086957e-06, + "loss": 0.213, + "step": 16570 + }, + { + "epoch": 1.1501148926945588, + "grad_norm": 1.140625, + "learning_rate": 2.116695652173913e-06, + "loss": 0.2149, + "step": 16580 + }, + { + "epoch": 1.1508085844352915, + "grad_norm": 1.171875, + "learning_rate": 2.114956521739131e-06, + "loss": 0.2344, + "step": 16590 + }, + { + "epoch": 1.1515022761760243, + "grad_norm": 1.1640625, + "learning_rate": 2.113217391304348e-06, + "loss": 0.307, + "step": 16600 + }, + { + "epoch": 1.152195967916757, + "grad_norm": 1.125, + "learning_rate": 2.1114782608695652e-06, + "loss": 0.2201, + "step": 16610 + }, + { + "epoch": 1.1528896596574898, + "grad_norm": 1.234375, + "learning_rate": 2.1097391304347826e-06, + "loss": 0.2571, + "step": 16620 + }, + { + "epoch": 1.1535833513982223, + "grad_norm": 1.4140625, + "learning_rate": 2.108e-06, + "loss": 0.2089, + "step": 16630 + }, + { + "epoch": 1.154277043138955, + "grad_norm": 1.25, + "learning_rate": 2.1062608695652178e-06, + "loss": 0.3009, + "step": 16640 + }, + { + "epoch": 1.1549707348796878, + "grad_norm": 1.15625, + "learning_rate": 2.1045217391304347e-06, + "loss": 0.2993, + "step": 16650 + }, + { + "epoch": 1.1556644266204206, + "grad_norm": 1.03125, + "learning_rate": 2.1027826086956525e-06, + "loss": 0.2074, + "step": 16660 + }, + { + "epoch": 1.1563581183611533, + "grad_norm": 1.2578125, + "learning_rate": 2.1010434782608695e-06, + "loss": 0.2703, + "step": 16670 + }, + { + "epoch": 1.157051810101886, + "grad_norm": 0.91796875, + "learning_rate": 2.0993043478260873e-06, + "loss": 0.2232, + "step": 16680 + }, + { + "epoch": 1.1577455018426186, + "grad_norm": 1.1484375, + "learning_rate": 2.0975652173913046e-06, + "loss": 0.2249, + "step": 16690 + }, + { + "epoch": 1.1584391935833513, + "grad_norm": 1.3984375, + "learning_rate": 2.095826086956522e-06, + "loss": 0.2514, + "step": 16700 + }, + { + "epoch": 1.159132885324084, + "grad_norm": 1.6171875, + "learning_rate": 2.0940869565217394e-06, + "loss": 0.2142, + "step": 16710 + }, + { + "epoch": 1.1598265770648168, + "grad_norm": 1.1640625, + "learning_rate": 2.0923478260869567e-06, + "loss": 0.2381, + "step": 16720 + }, + { + "epoch": 1.1605202688055496, + "grad_norm": 1.359375, + "learning_rate": 2.090608695652174e-06, + "loss": 0.2305, + "step": 16730 + }, + { + "epoch": 1.1612139605462823, + "grad_norm": 1.203125, + "learning_rate": 2.0888695652173915e-06, + "loss": 0.2416, + "step": 16740 + }, + { + "epoch": 1.1619076522870149, + "grad_norm": 1.5625, + "learning_rate": 2.087130434782609e-06, + "loss": 0.3016, + "step": 16750 + }, + { + "epoch": 1.1626013440277476, + "grad_norm": 1.9765625, + "learning_rate": 2.0853913043478262e-06, + "loss": 0.3109, + "step": 16760 + }, + { + "epoch": 1.1632950357684804, + "grad_norm": 1.2734375, + "learning_rate": 2.0836521739130436e-06, + "loss": 0.2107, + "step": 16770 + }, + { + "epoch": 1.1639887275092131, + "grad_norm": 1.2421875, + "learning_rate": 2.081913043478261e-06, + "loss": 0.2386, + "step": 16780 + }, + { + "epoch": 1.1646824192499459, + "grad_norm": 1.1640625, + "learning_rate": 2.0801739130434783e-06, + "loss": 0.2904, + "step": 16790 + }, + { + "epoch": 1.1653761109906786, + "grad_norm": 1.078125, + "learning_rate": 2.0784347826086957e-06, + "loss": 0.2027, + "step": 16800 + }, + { + "epoch": 1.1660698027314111, + "grad_norm": 1.3046875, + "learning_rate": 2.076695652173913e-06, + "loss": 0.2178, + "step": 16810 + }, + { + "epoch": 1.166763494472144, + "grad_norm": 1.3828125, + "learning_rate": 2.074956521739131e-06, + "loss": 0.2418, + "step": 16820 + }, + { + "epoch": 1.1674571862128766, + "grad_norm": 1.4453125, + "learning_rate": 2.073217391304348e-06, + "loss": 0.2702, + "step": 16830 + }, + { + "epoch": 1.1681508779536094, + "grad_norm": 1.484375, + "learning_rate": 2.0714782608695656e-06, + "loss": 0.2903, + "step": 16840 + }, + { + "epoch": 1.1688445696943421, + "grad_norm": 1.7109375, + "learning_rate": 2.0697391304347826e-06, + "loss": 0.2985, + "step": 16850 + }, + { + "epoch": 1.1695382614350747, + "grad_norm": 1.0625, + "learning_rate": 2.0680000000000004e-06, + "loss": 0.2587, + "step": 16860 + }, + { + "epoch": 1.1702319531758074, + "grad_norm": 1.1171875, + "learning_rate": 2.0662608695652177e-06, + "loss": 0.2319, + "step": 16870 + }, + { + "epoch": 1.1709256449165402, + "grad_norm": 1.1796875, + "learning_rate": 2.064521739130435e-06, + "loss": 0.2573, + "step": 16880 + }, + { + "epoch": 1.171619336657273, + "grad_norm": 1.3515625, + "learning_rate": 2.0627826086956525e-06, + "loss": 0.2489, + "step": 16890 + }, + { + "epoch": 1.1723130283980057, + "grad_norm": 1.3828125, + "learning_rate": 2.06104347826087e-06, + "loss": 0.2214, + "step": 16900 + }, + { + "epoch": 1.1730067201387384, + "grad_norm": 1.15625, + "learning_rate": 2.0593043478260872e-06, + "loss": 0.2204, + "step": 16910 + }, + { + "epoch": 1.1737004118794712, + "grad_norm": 1.234375, + "learning_rate": 2.0575652173913046e-06, + "loss": 0.2512, + "step": 16920 + }, + { + "epoch": 1.1743941036202037, + "grad_norm": 1.3359375, + "learning_rate": 2.055826086956522e-06, + "loss": 0.2447, + "step": 16930 + }, + { + "epoch": 1.1750877953609364, + "grad_norm": 1.296875, + "learning_rate": 2.0540869565217393e-06, + "loss": 0.2006, + "step": 16940 + }, + { + "epoch": 1.1757814871016692, + "grad_norm": 1.09375, + "learning_rate": 2.0523478260869567e-06, + "loss": 0.2405, + "step": 16950 + }, + { + "epoch": 1.176475178842402, + "grad_norm": 1.5703125, + "learning_rate": 2.050608695652174e-06, + "loss": 0.2739, + "step": 16960 + }, + { + "epoch": 1.1771688705831347, + "grad_norm": 1.1171875, + "learning_rate": 2.0488695652173914e-06, + "loss": 0.2256, + "step": 16970 + }, + { + "epoch": 1.1778625623238672, + "grad_norm": 1.1640625, + "learning_rate": 2.047130434782609e-06, + "loss": 0.2264, + "step": 16980 + }, + { + "epoch": 1.1785562540646, + "grad_norm": 1.59375, + "learning_rate": 2.045391304347826e-06, + "loss": 0.2409, + "step": 16990 + }, + { + "epoch": 1.1792499458053327, + "grad_norm": 1.265625, + "learning_rate": 2.0436521739130436e-06, + "loss": 0.2711, + "step": 17000 + }, + { + "epoch": 1.1799436375460655, + "grad_norm": 1.1328125, + "learning_rate": 2.041913043478261e-06, + "loss": 0.2456, + "step": 17010 + }, + { + "epoch": 1.1806373292867982, + "grad_norm": 1.171875, + "learning_rate": 2.0401739130434783e-06, + "loss": 0.244, + "step": 17020 + }, + { + "epoch": 1.181331021027531, + "grad_norm": 1.5390625, + "learning_rate": 2.0384347826086957e-06, + "loss": 0.2305, + "step": 17030 + }, + { + "epoch": 1.1820247127682637, + "grad_norm": 0.91796875, + "learning_rate": 2.036695652173913e-06, + "loss": 0.2297, + "step": 17040 + }, + { + "epoch": 1.1827184045089962, + "grad_norm": 1.09375, + "learning_rate": 2.034956521739131e-06, + "loss": 0.2263, + "step": 17050 + }, + { + "epoch": 1.183412096249729, + "grad_norm": 1.6328125, + "learning_rate": 2.0332173913043478e-06, + "loss": 0.2718, + "step": 17060 + }, + { + "epoch": 1.1841057879904617, + "grad_norm": 1.3359375, + "learning_rate": 2.0314782608695656e-06, + "loss": 0.2617, + "step": 17070 + }, + { + "epoch": 1.1847994797311945, + "grad_norm": 1.3515625, + "learning_rate": 2.0297391304347825e-06, + "loss": 0.2891, + "step": 17080 + }, + { + "epoch": 1.1854931714719272, + "grad_norm": 1.25, + "learning_rate": 2.0280000000000003e-06, + "loss": 0.2944, + "step": 17090 + }, + { + "epoch": 1.1861868632126598, + "grad_norm": 1.140625, + "learning_rate": 2.0262608695652177e-06, + "loss": 0.2248, + "step": 17100 + }, + { + "epoch": 1.1868805549533925, + "grad_norm": 1.078125, + "learning_rate": 2.024521739130435e-06, + "loss": 0.2692, + "step": 17110 + }, + { + "epoch": 1.1875742466941253, + "grad_norm": 1.328125, + "learning_rate": 2.0227826086956524e-06, + "loss": 0.2349, + "step": 17120 + }, + { + "epoch": 1.188267938434858, + "grad_norm": 1.234375, + "learning_rate": 2.02104347826087e-06, + "loss": 0.2191, + "step": 17130 + }, + { + "epoch": 1.1889616301755908, + "grad_norm": 1.0078125, + "learning_rate": 2.019304347826087e-06, + "loss": 0.2398, + "step": 17140 + }, + { + "epoch": 1.1896553219163235, + "grad_norm": 1.2578125, + "learning_rate": 2.0175652173913045e-06, + "loss": 0.2761, + "step": 17150 + }, + { + "epoch": 1.1903490136570563, + "grad_norm": 1.171875, + "learning_rate": 2.015826086956522e-06, + "loss": 0.2036, + "step": 17160 + }, + { + "epoch": 1.1910427053977888, + "grad_norm": 1.609375, + "learning_rate": 2.0140869565217393e-06, + "loss": 0.2639, + "step": 17170 + }, + { + "epoch": 1.1917363971385215, + "grad_norm": 1.25, + "learning_rate": 2.0123478260869567e-06, + "loss": 0.195, + "step": 17180 + }, + { + "epoch": 1.1924300888792543, + "grad_norm": 1.3125, + "learning_rate": 2.010608695652174e-06, + "loss": 0.2145, + "step": 17190 + }, + { + "epoch": 1.193123780619987, + "grad_norm": 1.84375, + "learning_rate": 2.0088695652173914e-06, + "loss": 0.3457, + "step": 17200 + }, + { + "epoch": 1.1938174723607198, + "grad_norm": 1.28125, + "learning_rate": 2.007130434782609e-06, + "loss": 0.2548, + "step": 17210 + }, + { + "epoch": 1.1945111641014523, + "grad_norm": 1.0234375, + "learning_rate": 2.005391304347826e-06, + "loss": 0.24, + "step": 17220 + }, + { + "epoch": 1.195204855842185, + "grad_norm": 1.1640625, + "learning_rate": 2.0036521739130435e-06, + "loss": 0.2872, + "step": 17230 + }, + { + "epoch": 1.1958985475829178, + "grad_norm": 1.296875, + "learning_rate": 2.001913043478261e-06, + "loss": 0.231, + "step": 17240 + }, + { + "epoch": 1.1965922393236506, + "grad_norm": 1.3515625, + "learning_rate": 2.0001739130434783e-06, + "loss": 0.2923, + "step": 17250 + }, + { + "epoch": 1.1972859310643833, + "grad_norm": 1.2109375, + "learning_rate": 1.9984347826086956e-06, + "loss": 0.2832, + "step": 17260 + }, + { + "epoch": 1.197979622805116, + "grad_norm": 1.4921875, + "learning_rate": 1.996695652173913e-06, + "loss": 0.2875, + "step": 17270 + }, + { + "epoch": 1.1986733145458488, + "grad_norm": 1.0390625, + "learning_rate": 1.994956521739131e-06, + "loss": 0.2527, + "step": 17280 + }, + { + "epoch": 1.1993670062865813, + "grad_norm": 1.265625, + "learning_rate": 1.9932173913043477e-06, + "loss": 0.2425, + "step": 17290 + }, + { + "epoch": 1.200060698027314, + "grad_norm": 1.265625, + "learning_rate": 1.9914782608695655e-06, + "loss": 0.2546, + "step": 17300 + }, + { + "epoch": 1.2007543897680468, + "grad_norm": 1.4140625, + "learning_rate": 1.9897391304347825e-06, + "loss": 0.255, + "step": 17310 + }, + { + "epoch": 1.2014480815087796, + "grad_norm": 0.9609375, + "learning_rate": 1.9880000000000003e-06, + "loss": 0.2478, + "step": 17320 + }, + { + "epoch": 1.2021417732495123, + "grad_norm": 1.0390625, + "learning_rate": 1.9862608695652176e-06, + "loss": 0.2083, + "step": 17330 + }, + { + "epoch": 1.2028354649902449, + "grad_norm": 1.5078125, + "learning_rate": 1.984521739130435e-06, + "loss": 0.2753, + "step": 17340 + }, + { + "epoch": 1.2035291567309776, + "grad_norm": 1.2265625, + "learning_rate": 1.9827826086956524e-06, + "loss": 0.3128, + "step": 17350 + }, + { + "epoch": 1.2042228484717103, + "grad_norm": 1.0, + "learning_rate": 1.9810434782608698e-06, + "loss": 0.2478, + "step": 17360 + }, + { + "epoch": 1.204916540212443, + "grad_norm": 1.15625, + "learning_rate": 1.979304347826087e-06, + "loss": 0.328, + "step": 17370 + }, + { + "epoch": 1.2056102319531758, + "grad_norm": 1.1875, + "learning_rate": 1.9775652173913045e-06, + "loss": 0.2781, + "step": 17380 + }, + { + "epoch": 1.2063039236939086, + "grad_norm": 1.203125, + "learning_rate": 1.975826086956522e-06, + "loss": 0.3133, + "step": 17390 + }, + { + "epoch": 1.2069976154346413, + "grad_norm": 1.140625, + "learning_rate": 1.9740869565217392e-06, + "loss": 0.212, + "step": 17400 + }, + { + "epoch": 1.2076913071753739, + "grad_norm": 1.3515625, + "learning_rate": 1.9723478260869566e-06, + "loss": 0.2839, + "step": 17410 + }, + { + "epoch": 1.2083849989161066, + "grad_norm": 1.34375, + "learning_rate": 1.970608695652174e-06, + "loss": 0.2442, + "step": 17420 + }, + { + "epoch": 1.2090786906568394, + "grad_norm": 1.28125, + "learning_rate": 1.9688695652173914e-06, + "loss": 0.2751, + "step": 17430 + }, + { + "epoch": 1.2097723823975721, + "grad_norm": 1.265625, + "learning_rate": 1.967130434782609e-06, + "loss": 0.2529, + "step": 17440 + }, + { + "epoch": 1.2104660741383049, + "grad_norm": 1.3125, + "learning_rate": 1.965391304347826e-06, + "loss": 0.2791, + "step": 17450 + }, + { + "epoch": 1.2111597658790374, + "grad_norm": 1.2109375, + "learning_rate": 1.963652173913044e-06, + "loss": 0.2276, + "step": 17460 + }, + { + "epoch": 1.2118534576197701, + "grad_norm": 1.7890625, + "learning_rate": 1.961913043478261e-06, + "loss": 0.2515, + "step": 17470 + }, + { + "epoch": 1.212547149360503, + "grad_norm": 1.140625, + "learning_rate": 1.9601739130434786e-06, + "loss": 0.2579, + "step": 17480 + }, + { + "epoch": 1.2132408411012356, + "grad_norm": 0.98046875, + "learning_rate": 1.9584347826086956e-06, + "loss": 0.2222, + "step": 17490 + }, + { + "epoch": 1.2139345328419684, + "grad_norm": 0.98046875, + "learning_rate": 1.9566956521739134e-06, + "loss": 0.2072, + "step": 17500 + }, + { + "epoch": 1.2146282245827011, + "grad_norm": 1.203125, + "learning_rate": 1.9549565217391308e-06, + "loss": 0.2278, + "step": 17510 + }, + { + "epoch": 1.215321916323434, + "grad_norm": 1.0390625, + "learning_rate": 1.953217391304348e-06, + "loss": 0.2338, + "step": 17520 + }, + { + "epoch": 1.2160156080641664, + "grad_norm": 1.109375, + "learning_rate": 1.9514782608695655e-06, + "loss": 0.235, + "step": 17530 + }, + { + "epoch": 1.2167092998048992, + "grad_norm": 1.40625, + "learning_rate": 1.9497391304347824e-06, + "loss": 0.2458, + "step": 17540 + }, + { + "epoch": 1.217402991545632, + "grad_norm": 1.21875, + "learning_rate": 1.9480000000000002e-06, + "loss": 0.2322, + "step": 17550 + }, + { + "epoch": 1.2180966832863647, + "grad_norm": 0.8671875, + "learning_rate": 1.9462608695652176e-06, + "loss": 0.2301, + "step": 17560 + }, + { + "epoch": 1.2187903750270974, + "grad_norm": 1.015625, + "learning_rate": 1.944521739130435e-06, + "loss": 0.2707, + "step": 17570 + }, + { + "epoch": 1.21948406676783, + "grad_norm": 1.3046875, + "learning_rate": 1.9427826086956524e-06, + "loss": 0.2158, + "step": 17580 + }, + { + "epoch": 1.2201777585085627, + "grad_norm": 1.28125, + "learning_rate": 1.9410434782608697e-06, + "loss": 0.2404, + "step": 17590 + }, + { + "epoch": 1.2208714502492954, + "grad_norm": 1.1484375, + "learning_rate": 1.939304347826087e-06, + "loss": 0.2562, + "step": 17600 + }, + { + "epoch": 1.2215651419900282, + "grad_norm": 1.2109375, + "learning_rate": 1.9375652173913045e-06, + "loss": 0.2154, + "step": 17610 + }, + { + "epoch": 1.222258833730761, + "grad_norm": 1.2890625, + "learning_rate": 1.935826086956522e-06, + "loss": 0.2309, + "step": 17620 + }, + { + "epoch": 1.2229525254714937, + "grad_norm": 1.296875, + "learning_rate": 1.934086956521739e-06, + "loss": 0.229, + "step": 17630 + }, + { + "epoch": 1.2236462172122264, + "grad_norm": 1.453125, + "learning_rate": 1.9323478260869566e-06, + "loss": 0.2428, + "step": 17640 + }, + { + "epoch": 1.224339908952959, + "grad_norm": 1.1171875, + "learning_rate": 1.930608695652174e-06, + "loss": 0.2227, + "step": 17650 + }, + { + "epoch": 1.2250336006936917, + "grad_norm": 1.59375, + "learning_rate": 1.9288695652173913e-06, + "loss": 0.3012, + "step": 17660 + }, + { + "epoch": 1.2257272924344245, + "grad_norm": 0.9765625, + "learning_rate": 1.927130434782609e-06, + "loss": 0.2796, + "step": 17670 + }, + { + "epoch": 1.2264209841751572, + "grad_norm": 0.953125, + "learning_rate": 1.925391304347826e-06, + "loss": 0.2393, + "step": 17680 + }, + { + "epoch": 1.22711467591589, + "grad_norm": 1.3984375, + "learning_rate": 1.923652173913044e-06, + "loss": 0.2685, + "step": 17690 + }, + { + "epoch": 1.2278083676566225, + "grad_norm": 1.15625, + "learning_rate": 1.921913043478261e-06, + "loss": 0.2598, + "step": 17700 + }, + { + "epoch": 1.2285020593973552, + "grad_norm": 1.0625, + "learning_rate": 1.9201739130434786e-06, + "loss": 0.237, + "step": 17710 + }, + { + "epoch": 1.229195751138088, + "grad_norm": 0.9765625, + "learning_rate": 1.9184347826086955e-06, + "loss": 0.2421, + "step": 17720 + }, + { + "epoch": 1.2298894428788207, + "grad_norm": 1.1484375, + "learning_rate": 1.9166956521739133e-06, + "loss": 0.2501, + "step": 17730 + }, + { + "epoch": 1.2305831346195535, + "grad_norm": 1.1640625, + "learning_rate": 1.9149565217391307e-06, + "loss": 0.21, + "step": 17740 + }, + { + "epoch": 1.2312768263602862, + "grad_norm": 1.1328125, + "learning_rate": 1.913217391304348e-06, + "loss": 0.2558, + "step": 17750 + }, + { + "epoch": 1.2319705181010188, + "grad_norm": 1.203125, + "learning_rate": 1.9114782608695655e-06, + "loss": 0.2314, + "step": 17760 + }, + { + "epoch": 1.2326642098417515, + "grad_norm": 1.6171875, + "learning_rate": 1.909739130434783e-06, + "loss": 0.2358, + "step": 17770 + }, + { + "epoch": 1.2333579015824843, + "grad_norm": 1.0234375, + "learning_rate": 1.908e-06, + "loss": 0.2434, + "step": 17780 + }, + { + "epoch": 1.234051593323217, + "grad_norm": 1.3515625, + "learning_rate": 1.9062608695652176e-06, + "loss": 0.2677, + "step": 17790 + }, + { + "epoch": 1.2347452850639498, + "grad_norm": 1.21875, + "learning_rate": 1.904521739130435e-06, + "loss": 0.2215, + "step": 17800 + }, + { + "epoch": 1.2354389768046825, + "grad_norm": 1.0, + "learning_rate": 1.9027826086956525e-06, + "loss": 0.2329, + "step": 17810 + }, + { + "epoch": 1.236132668545415, + "grad_norm": 1.203125, + "learning_rate": 1.9010434782608697e-06, + "loss": 0.2311, + "step": 17820 + }, + { + "epoch": 1.2368263602861478, + "grad_norm": 1.0390625, + "learning_rate": 1.8993043478260873e-06, + "loss": 0.2237, + "step": 17830 + }, + { + "epoch": 1.2375200520268805, + "grad_norm": 1.3984375, + "learning_rate": 1.8975652173913044e-06, + "loss": 0.254, + "step": 17840 + }, + { + "epoch": 1.2382137437676133, + "grad_norm": 1.171875, + "learning_rate": 1.8958260869565218e-06, + "loss": 0.2176, + "step": 17850 + }, + { + "epoch": 1.238907435508346, + "grad_norm": 1.3671875, + "learning_rate": 1.8940869565217394e-06, + "loss": 0.2342, + "step": 17860 + }, + { + "epoch": 1.2396011272490788, + "grad_norm": 1.0703125, + "learning_rate": 1.8923478260869565e-06, + "loss": 0.2417, + "step": 17870 + }, + { + "epoch": 1.2402948189898113, + "grad_norm": 1.2109375, + "learning_rate": 1.8906086956521741e-06, + "loss": 0.2707, + "step": 17880 + }, + { + "epoch": 1.240988510730544, + "grad_norm": 1.484375, + "learning_rate": 1.8888695652173913e-06, + "loss": 0.2278, + "step": 17890 + }, + { + "epoch": 1.2416822024712768, + "grad_norm": 1.203125, + "learning_rate": 1.8871304347826089e-06, + "loss": 0.2455, + "step": 17900 + }, + { + "epoch": 1.2423758942120096, + "grad_norm": 1.34375, + "learning_rate": 1.8853913043478262e-06, + "loss": 0.2575, + "step": 17910 + }, + { + "epoch": 1.2430695859527423, + "grad_norm": 1.5, + "learning_rate": 1.8836521739130436e-06, + "loss": 0.2959, + "step": 17920 + }, + { + "epoch": 1.243763277693475, + "grad_norm": 1.234375, + "learning_rate": 1.881913043478261e-06, + "loss": 0.2936, + "step": 17930 + }, + { + "epoch": 1.2444569694342076, + "grad_norm": 1.2265625, + "learning_rate": 1.8801739130434786e-06, + "loss": 0.3073, + "step": 17940 + }, + { + "epoch": 1.2451506611749403, + "grad_norm": 1.3125, + "learning_rate": 1.8784347826086957e-06, + "loss": 0.2216, + "step": 17950 + }, + { + "epoch": 1.245844352915673, + "grad_norm": 1.328125, + "learning_rate": 1.8766956521739133e-06, + "loss": 0.2443, + "step": 17960 + }, + { + "epoch": 1.2465380446564058, + "grad_norm": 1.15625, + "learning_rate": 1.8749565217391305e-06, + "loss": 0.2522, + "step": 17970 + }, + { + "epoch": 1.2472317363971386, + "grad_norm": 1.0546875, + "learning_rate": 1.873217391304348e-06, + "loss": 0.2404, + "step": 17980 + }, + { + "epoch": 1.2479254281378713, + "grad_norm": 1.3046875, + "learning_rate": 1.8714782608695652e-06, + "loss": 0.2511, + "step": 17990 + }, + { + "epoch": 1.2486191198786039, + "grad_norm": 0.95703125, + "learning_rate": 1.8697391304347828e-06, + "loss": 0.3144, + "step": 18000 + }, + { + "epoch": 1.2493128116193366, + "grad_norm": 1.0703125, + "learning_rate": 1.8680000000000002e-06, + "loss": 0.2274, + "step": 18010 + }, + { + "epoch": 1.2500065033600694, + "grad_norm": 1.2421875, + "learning_rate": 1.8662608695652175e-06, + "loss": 0.3148, + "step": 18020 + }, + { + "epoch": 1.250700195100802, + "grad_norm": 1.0546875, + "learning_rate": 1.864521739130435e-06, + "loss": 0.2123, + "step": 18030 + }, + { + "epoch": 1.2513938868415349, + "grad_norm": 1.25, + "learning_rate": 1.8627826086956525e-06, + "loss": 0.2383, + "step": 18040 + }, + { + "epoch": 1.2520875785822674, + "grad_norm": 1.140625, + "learning_rate": 1.8610434782608696e-06, + "loss": 0.2308, + "step": 18050 + }, + { + "epoch": 1.2527812703230001, + "grad_norm": 1.1328125, + "learning_rate": 1.8593043478260872e-06, + "loss": 0.2131, + "step": 18060 + }, + { + "epoch": 1.2534749620637329, + "grad_norm": 1.1640625, + "learning_rate": 1.8575652173913044e-06, + "loss": 0.264, + "step": 18070 + }, + { + "epoch": 1.2541686538044656, + "grad_norm": 1.1953125, + "learning_rate": 1.855826086956522e-06, + "loss": 0.2561, + "step": 18080 + }, + { + "epoch": 1.2548623455451984, + "grad_norm": 1.5078125, + "learning_rate": 1.8540869565217393e-06, + "loss": 0.3174, + "step": 18090 + }, + { + "epoch": 1.2555560372859311, + "grad_norm": 1.1015625, + "learning_rate": 1.8523478260869567e-06, + "loss": 0.2361, + "step": 18100 + }, + { + "epoch": 1.2562497290266639, + "grad_norm": 1.4765625, + "learning_rate": 1.850608695652174e-06, + "loss": 0.2928, + "step": 18110 + }, + { + "epoch": 1.2569434207673966, + "grad_norm": 1.21875, + "learning_rate": 1.8488695652173917e-06, + "loss": 0.1963, + "step": 18120 + }, + { + "epoch": 1.2576371125081292, + "grad_norm": 1.25, + "learning_rate": 1.8471304347826088e-06, + "loss": 0.261, + "step": 18130 + }, + { + "epoch": 1.258330804248862, + "grad_norm": 0.95703125, + "learning_rate": 1.8453913043478264e-06, + "loss": 0.2202, + "step": 18140 + }, + { + "epoch": 1.2590244959895946, + "grad_norm": 1.21875, + "learning_rate": 1.8436521739130436e-06, + "loss": 0.3148, + "step": 18150 + }, + { + "epoch": 1.2597181877303274, + "grad_norm": 1.1484375, + "learning_rate": 1.841913043478261e-06, + "loss": 0.2427, + "step": 18160 + }, + { + "epoch": 1.26041187947106, + "grad_norm": 1.125, + "learning_rate": 1.8401739130434785e-06, + "loss": 0.2313, + "step": 18170 + }, + { + "epoch": 1.2611055712117927, + "grad_norm": 1.40625, + "learning_rate": 1.8384347826086957e-06, + "loss": 0.2316, + "step": 18180 + }, + { + "epoch": 1.2617992629525254, + "grad_norm": 1.3203125, + "learning_rate": 1.8366956521739133e-06, + "loss": 0.2298, + "step": 18190 + }, + { + "epoch": 1.2624929546932582, + "grad_norm": 1.0234375, + "learning_rate": 1.8349565217391304e-06, + "loss": 0.2165, + "step": 18200 + }, + { + "epoch": 1.263186646433991, + "grad_norm": 1.296875, + "learning_rate": 1.833217391304348e-06, + "loss": 0.2387, + "step": 18210 + }, + { + "epoch": 1.2638803381747237, + "grad_norm": 1.0703125, + "learning_rate": 1.8314782608695652e-06, + "loss": 0.2883, + "step": 18220 + }, + { + "epoch": 1.2645740299154564, + "grad_norm": 1.6015625, + "learning_rate": 1.8297391304347827e-06, + "loss": 0.2552, + "step": 18230 + }, + { + "epoch": 1.265267721656189, + "grad_norm": 1.2734375, + "learning_rate": 1.8280000000000001e-06, + "loss": 0.2563, + "step": 18240 + }, + { + "epoch": 1.2659614133969217, + "grad_norm": 1.0859375, + "learning_rate": 1.8262608695652175e-06, + "loss": 0.2093, + "step": 18250 + }, + { + "epoch": 1.2666551051376544, + "grad_norm": 1.2265625, + "learning_rate": 1.8245217391304349e-06, + "loss": 0.241, + "step": 18260 + }, + { + "epoch": 1.2673487968783872, + "grad_norm": 1.4140625, + "learning_rate": 1.8227826086956524e-06, + "loss": 0.2887, + "step": 18270 + }, + { + "epoch": 1.26804248861912, + "grad_norm": 1.203125, + "learning_rate": 1.8210434782608696e-06, + "loss": 0.2639, + "step": 18280 + }, + { + "epoch": 1.2687361803598525, + "grad_norm": 1.1640625, + "learning_rate": 1.8193043478260872e-06, + "loss": 0.2598, + "step": 18290 + }, + { + "epoch": 1.2694298721005852, + "grad_norm": 1.1640625, + "learning_rate": 1.8175652173913043e-06, + "loss": 0.248, + "step": 18300 + }, + { + "epoch": 1.270123563841318, + "grad_norm": 1.296875, + "learning_rate": 1.815826086956522e-06, + "loss": 0.1977, + "step": 18310 + }, + { + "epoch": 1.2708172555820507, + "grad_norm": 1.5859375, + "learning_rate": 1.8140869565217393e-06, + "loss": 0.2536, + "step": 18320 + }, + { + "epoch": 1.2715109473227835, + "grad_norm": 1.1484375, + "learning_rate": 1.8123478260869567e-06, + "loss": 0.2557, + "step": 18330 + }, + { + "epoch": 1.2722046390635162, + "grad_norm": 1.3828125, + "learning_rate": 1.810608695652174e-06, + "loss": 0.3069, + "step": 18340 + }, + { + "epoch": 1.272898330804249, + "grad_norm": 1.0234375, + "learning_rate": 1.8088695652173916e-06, + "loss": 0.2315, + "step": 18350 + }, + { + "epoch": 1.2735920225449815, + "grad_norm": 1.2890625, + "learning_rate": 1.8071304347826088e-06, + "loss": 0.202, + "step": 18360 + }, + { + "epoch": 1.2742857142857142, + "grad_norm": 1.1796875, + "learning_rate": 1.8053913043478264e-06, + "loss": 0.304, + "step": 18370 + }, + { + "epoch": 1.274979406026447, + "grad_norm": 1.59375, + "learning_rate": 1.8036521739130435e-06, + "loss": 0.259, + "step": 18380 + }, + { + "epoch": 1.2756730977671797, + "grad_norm": 1.2578125, + "learning_rate": 1.8019130434782611e-06, + "loss": 0.2263, + "step": 18390 + }, + { + "epoch": 1.2763667895079125, + "grad_norm": 1.1328125, + "learning_rate": 1.8001739130434785e-06, + "loss": 0.2381, + "step": 18400 + }, + { + "epoch": 1.277060481248645, + "grad_norm": 1.5, + "learning_rate": 1.7984347826086958e-06, + "loss": 0.2154, + "step": 18410 + }, + { + "epoch": 1.2777541729893778, + "grad_norm": 1.28125, + "learning_rate": 1.7966956521739132e-06, + "loss": 0.2191, + "step": 18420 + }, + { + "epoch": 1.2784478647301105, + "grad_norm": 1.0, + "learning_rate": 1.7949565217391308e-06, + "loss": 0.2516, + "step": 18430 + }, + { + "epoch": 1.2791415564708433, + "grad_norm": 1.2109375, + "learning_rate": 1.793217391304348e-06, + "loss": 0.2271, + "step": 18440 + }, + { + "epoch": 1.279835248211576, + "grad_norm": 1.1015625, + "learning_rate": 1.7914782608695655e-06, + "loss": 0.2308, + "step": 18450 + }, + { + "epoch": 1.2805289399523088, + "grad_norm": 1.25, + "learning_rate": 1.7897391304347827e-06, + "loss": 0.3075, + "step": 18460 + }, + { + "epoch": 1.2812226316930415, + "grad_norm": 1.4609375, + "learning_rate": 1.788e-06, + "loss": 0.2446, + "step": 18470 + }, + { + "epoch": 1.281916323433774, + "grad_norm": 1.1953125, + "learning_rate": 1.7862608695652174e-06, + "loss": 0.2359, + "step": 18480 + }, + { + "epoch": 1.2826100151745068, + "grad_norm": 2.171875, + "learning_rate": 1.7845217391304348e-06, + "loss": 0.3021, + "step": 18490 + }, + { + "epoch": 1.2833037069152395, + "grad_norm": 1.4609375, + "learning_rate": 1.7827826086956524e-06, + "loss": 0.2255, + "step": 18500 + }, + { + "epoch": 1.2839973986559723, + "grad_norm": 1.2578125, + "learning_rate": 1.7810434782608696e-06, + "loss": 0.2562, + "step": 18510 + }, + { + "epoch": 1.284691090396705, + "grad_norm": 1.3515625, + "learning_rate": 1.7793043478260871e-06, + "loss": 0.2513, + "step": 18520 + }, + { + "epoch": 1.2853847821374376, + "grad_norm": 1.4140625, + "learning_rate": 1.7775652173913043e-06, + "loss": 0.2296, + "step": 18530 + }, + { + "epoch": 1.2860784738781703, + "grad_norm": 1.3046875, + "learning_rate": 1.7758260869565219e-06, + "loss": 0.2612, + "step": 18540 + }, + { + "epoch": 1.286772165618903, + "grad_norm": 1.1171875, + "learning_rate": 1.7740869565217393e-06, + "loss": 0.2088, + "step": 18550 + }, + { + "epoch": 1.2874658573596358, + "grad_norm": 1.046875, + "learning_rate": 1.7723478260869566e-06, + "loss": 0.2278, + "step": 18560 + }, + { + "epoch": 1.2881595491003686, + "grad_norm": 1.015625, + "learning_rate": 1.770608695652174e-06, + "loss": 0.2695, + "step": 18570 + }, + { + "epoch": 1.2888532408411013, + "grad_norm": 0.921875, + "learning_rate": 1.7688695652173916e-06, + "loss": 0.237, + "step": 18580 + }, + { + "epoch": 1.289546932581834, + "grad_norm": 1.53125, + "learning_rate": 1.7671304347826087e-06, + "loss": 0.3094, + "step": 18590 + }, + { + "epoch": 1.2902406243225666, + "grad_norm": 1.03125, + "learning_rate": 1.7653913043478263e-06, + "loss": 0.2231, + "step": 18600 + }, + { + "epoch": 1.2909343160632993, + "grad_norm": 1.1328125, + "learning_rate": 1.7636521739130435e-06, + "loss": 0.2858, + "step": 18610 + }, + { + "epoch": 1.291628007804032, + "grad_norm": 1.1484375, + "learning_rate": 1.761913043478261e-06, + "loss": 0.2832, + "step": 18620 + }, + { + "epoch": 1.2923216995447648, + "grad_norm": 1.3359375, + "learning_rate": 1.7601739130434784e-06, + "loss": 0.2447, + "step": 18630 + }, + { + "epoch": 1.2930153912854976, + "grad_norm": 1.1484375, + "learning_rate": 1.7584347826086958e-06, + "loss": 0.2911, + "step": 18640 + }, + { + "epoch": 1.2937090830262301, + "grad_norm": 1.078125, + "learning_rate": 1.7566956521739132e-06, + "loss": 0.261, + "step": 18650 + }, + { + "epoch": 1.2944027747669629, + "grad_norm": 1.5, + "learning_rate": 1.7549565217391308e-06, + "loss": 0.2486, + "step": 18660 + }, + { + "epoch": 1.2950964665076956, + "grad_norm": 0.9765625, + "learning_rate": 1.753217391304348e-06, + "loss": 0.236, + "step": 18670 + }, + { + "epoch": 1.2957901582484284, + "grad_norm": 1.390625, + "learning_rate": 1.7514782608695655e-06, + "loss": 0.2216, + "step": 18680 + }, + { + "epoch": 1.296483849989161, + "grad_norm": 1.125, + "learning_rate": 1.7497391304347827e-06, + "loss": 0.2348, + "step": 18690 + }, + { + "epoch": 1.2971775417298939, + "grad_norm": 1.1328125, + "learning_rate": 1.7480000000000002e-06, + "loss": 0.2821, + "step": 18700 + }, + { + "epoch": 1.2978712334706266, + "grad_norm": 1.0703125, + "learning_rate": 1.7462608695652174e-06, + "loss": 0.2281, + "step": 18710 + }, + { + "epoch": 1.2985649252113591, + "grad_norm": 1.21875, + "learning_rate": 1.744521739130435e-06, + "loss": 0.3438, + "step": 18720 + }, + { + "epoch": 1.2992586169520919, + "grad_norm": 1.8515625, + "learning_rate": 1.7427826086956524e-06, + "loss": 0.307, + "step": 18730 + }, + { + "epoch": 1.2999523086928246, + "grad_norm": 1.3125, + "learning_rate": 1.7410434782608697e-06, + "loss": 0.2792, + "step": 18740 + }, + { + "epoch": 1.3006460004335574, + "grad_norm": 1.328125, + "learning_rate": 1.739304347826087e-06, + "loss": 0.246, + "step": 18750 + }, + { + "epoch": 1.3013396921742901, + "grad_norm": 1.203125, + "learning_rate": 1.7375652173913047e-06, + "loss": 0.2229, + "step": 18760 + }, + { + "epoch": 1.3020333839150227, + "grad_norm": 1.4921875, + "learning_rate": 1.7358260869565218e-06, + "loss": 0.2921, + "step": 18770 + }, + { + "epoch": 1.3027270756557554, + "grad_norm": 0.9921875, + "learning_rate": 1.7340869565217392e-06, + "loss": 0.2882, + "step": 18780 + }, + { + "epoch": 1.3034207673964882, + "grad_norm": 1.2265625, + "learning_rate": 1.7323478260869566e-06, + "loss": 0.2465, + "step": 18790 + }, + { + "epoch": 1.304114459137221, + "grad_norm": 1.21875, + "learning_rate": 1.730608695652174e-06, + "loss": 0.2151, + "step": 18800 + }, + { + "epoch": 1.3048081508779537, + "grad_norm": 1.1640625, + "learning_rate": 1.7288695652173915e-06, + "loss": 0.2568, + "step": 18810 + }, + { + "epoch": 1.3055018426186864, + "grad_norm": 1.5703125, + "learning_rate": 1.7271304347826087e-06, + "loss": 0.2507, + "step": 18820 + }, + { + "epoch": 1.3061955343594192, + "grad_norm": 1.21875, + "learning_rate": 1.7253913043478263e-06, + "loss": 0.2791, + "step": 18830 + }, + { + "epoch": 1.3068892261001517, + "grad_norm": 1.5390625, + "learning_rate": 1.7236521739130434e-06, + "loss": 0.2841, + "step": 18840 + }, + { + "epoch": 1.3075829178408844, + "grad_norm": 0.9609375, + "learning_rate": 1.721913043478261e-06, + "loss": 0.2459, + "step": 18850 + }, + { + "epoch": 1.3082766095816172, + "grad_norm": 1.078125, + "learning_rate": 1.7201739130434784e-06, + "loss": 0.2317, + "step": 18860 + }, + { + "epoch": 1.30897030132235, + "grad_norm": 1.328125, + "learning_rate": 1.7184347826086958e-06, + "loss": 0.2455, + "step": 18870 + }, + { + "epoch": 1.3096639930630825, + "grad_norm": 1.6953125, + "learning_rate": 1.7166956521739131e-06, + "loss": 0.2476, + "step": 18880 + }, + { + "epoch": 1.3103576848038152, + "grad_norm": 1.484375, + "learning_rate": 1.7149565217391307e-06, + "loss": 0.2529, + "step": 18890 + }, + { + "epoch": 1.311051376544548, + "grad_norm": 1.2734375, + "learning_rate": 1.7132173913043479e-06, + "loss": 0.2066, + "step": 18900 + }, + { + "epoch": 1.3117450682852807, + "grad_norm": 1.2109375, + "learning_rate": 1.7114782608695655e-06, + "loss": 0.2273, + "step": 18910 + }, + { + "epoch": 1.3124387600260135, + "grad_norm": 1.1171875, + "learning_rate": 1.7097391304347826e-06, + "loss": 0.1843, + "step": 18920 + }, + { + "epoch": 1.3131324517667462, + "grad_norm": 1.21875, + "learning_rate": 1.7080000000000002e-06, + "loss": 0.2144, + "step": 18930 + }, + { + "epoch": 1.313826143507479, + "grad_norm": 1.1875, + "learning_rate": 1.7062608695652174e-06, + "loss": 0.2223, + "step": 18940 + }, + { + "epoch": 1.3145198352482117, + "grad_norm": 1.296875, + "learning_rate": 1.704521739130435e-06, + "loss": 0.2149, + "step": 18950 + }, + { + "epoch": 1.3152135269889442, + "grad_norm": 1.28125, + "learning_rate": 1.7027826086956523e-06, + "loss": 0.2343, + "step": 18960 + }, + { + "epoch": 1.315907218729677, + "grad_norm": 1.59375, + "learning_rate": 1.7010434782608697e-06, + "loss": 0.2878, + "step": 18970 + }, + { + "epoch": 1.3166009104704097, + "grad_norm": 1.484375, + "learning_rate": 1.699304347826087e-06, + "loss": 0.2503, + "step": 18980 + }, + { + "epoch": 1.3172946022111425, + "grad_norm": 1.328125, + "learning_rate": 1.6975652173913046e-06, + "loss": 0.2683, + "step": 18990 + }, + { + "epoch": 1.317988293951875, + "grad_norm": 0.9453125, + "learning_rate": 1.6958260869565218e-06, + "loss": 0.226, + "step": 19000 + }, + { + "epoch": 1.3186819856926077, + "grad_norm": 1.3203125, + "learning_rate": 1.6940869565217394e-06, + "loss": 0.2354, + "step": 19010 + }, + { + "epoch": 1.3193756774333405, + "grad_norm": 1.3359375, + "learning_rate": 1.6923478260869565e-06, + "loss": 0.2512, + "step": 19020 + }, + { + "epoch": 1.3200693691740732, + "grad_norm": 1.1171875, + "learning_rate": 1.6906086956521741e-06, + "loss": 0.215, + "step": 19030 + }, + { + "epoch": 1.320763060914806, + "grad_norm": 1.0703125, + "learning_rate": 1.6888695652173915e-06, + "loss": 0.2324, + "step": 19040 + }, + { + "epoch": 1.3214567526555387, + "grad_norm": 1.0390625, + "learning_rate": 1.6871304347826089e-06, + "loss": 0.2898, + "step": 19050 + }, + { + "epoch": 1.3221504443962715, + "grad_norm": 1.4453125, + "learning_rate": 1.6853913043478262e-06, + "loss": 0.2415, + "step": 19060 + }, + { + "epoch": 1.3228441361370042, + "grad_norm": 1.46875, + "learning_rate": 1.6836521739130438e-06, + "loss": 0.2365, + "step": 19070 + }, + { + "epoch": 1.3235378278777368, + "grad_norm": 1.2421875, + "learning_rate": 1.681913043478261e-06, + "loss": 0.2251, + "step": 19080 + }, + { + "epoch": 1.3242315196184695, + "grad_norm": 0.9921875, + "learning_rate": 1.6801739130434784e-06, + "loss": 0.2137, + "step": 19090 + }, + { + "epoch": 1.3249252113592023, + "grad_norm": 1.3203125, + "learning_rate": 1.6784347826086957e-06, + "loss": 0.2239, + "step": 19100 + }, + { + "epoch": 1.325618903099935, + "grad_norm": 1.4609375, + "learning_rate": 1.676695652173913e-06, + "loss": 0.2748, + "step": 19110 + }, + { + "epoch": 1.3263125948406675, + "grad_norm": 1.21875, + "learning_rate": 1.6749565217391307e-06, + "loss": 0.2748, + "step": 19120 + }, + { + "epoch": 1.3270062865814003, + "grad_norm": 1.0078125, + "learning_rate": 1.6732173913043478e-06, + "loss": 0.2845, + "step": 19130 + }, + { + "epoch": 1.327699978322133, + "grad_norm": 1.265625, + "learning_rate": 1.6714782608695654e-06, + "loss": 0.2227, + "step": 19140 + }, + { + "epoch": 1.3283936700628658, + "grad_norm": 1.140625, + "learning_rate": 1.6697391304347826e-06, + "loss": 0.2701, + "step": 19150 + }, + { + "epoch": 1.3290873618035985, + "grad_norm": 1.1796875, + "learning_rate": 1.6680000000000002e-06, + "loss": 0.3026, + "step": 19160 + }, + { + "epoch": 1.3297810535443313, + "grad_norm": 1.34375, + "learning_rate": 1.6662608695652175e-06, + "loss": 0.2401, + "step": 19170 + }, + { + "epoch": 1.330474745285064, + "grad_norm": 1.28125, + "learning_rate": 1.664521739130435e-06, + "loss": 0.2907, + "step": 19180 + }, + { + "epoch": 1.3311684370257968, + "grad_norm": 1.09375, + "learning_rate": 1.6627826086956523e-06, + "loss": 0.2404, + "step": 19190 + }, + { + "epoch": 1.3318621287665293, + "grad_norm": 1.3359375, + "learning_rate": 1.6610434782608699e-06, + "loss": 0.2548, + "step": 19200 + }, + { + "epoch": 1.332555820507262, + "grad_norm": 1.2109375, + "learning_rate": 1.659304347826087e-06, + "loss": 0.3054, + "step": 19210 + }, + { + "epoch": 1.3332495122479948, + "grad_norm": 1.0859375, + "learning_rate": 1.6575652173913046e-06, + "loss": 0.2174, + "step": 19220 + }, + { + "epoch": 1.3339432039887276, + "grad_norm": 1.2734375, + "learning_rate": 1.6558260869565218e-06, + "loss": 0.2712, + "step": 19230 + }, + { + "epoch": 1.33463689572946, + "grad_norm": 1.5234375, + "learning_rate": 1.6540869565217393e-06, + "loss": 0.2191, + "step": 19240 + }, + { + "epoch": 1.3353305874701928, + "grad_norm": 1.0859375, + "learning_rate": 1.6523478260869565e-06, + "loss": 0.2889, + "step": 19250 + }, + { + "epoch": 1.3360242792109256, + "grad_norm": 1.21875, + "learning_rate": 1.650608695652174e-06, + "loss": 0.2662, + "step": 19260 + }, + { + "epoch": 1.3367179709516583, + "grad_norm": 1.1328125, + "learning_rate": 1.6488695652173915e-06, + "loss": 0.2417, + "step": 19270 + }, + { + "epoch": 1.337411662692391, + "grad_norm": 1.0703125, + "learning_rate": 1.6471304347826088e-06, + "loss": 0.2668, + "step": 19280 + }, + { + "epoch": 1.3381053544331238, + "grad_norm": 1.0234375, + "learning_rate": 1.6453913043478262e-06, + "loss": 0.2028, + "step": 19290 + }, + { + "epoch": 1.3387990461738566, + "grad_norm": 1.3203125, + "learning_rate": 1.6436521739130438e-06, + "loss": 0.2831, + "step": 19300 + }, + { + "epoch": 1.3394927379145893, + "grad_norm": 1.0625, + "learning_rate": 1.641913043478261e-06, + "loss": 0.2186, + "step": 19310 + }, + { + "epoch": 1.3401864296553219, + "grad_norm": 1.359375, + "learning_rate": 1.6401739130434785e-06, + "loss": 0.2243, + "step": 19320 + }, + { + "epoch": 1.3408801213960546, + "grad_norm": 1.09375, + "learning_rate": 1.6384347826086957e-06, + "loss": 0.2012, + "step": 19330 + }, + { + "epoch": 1.3415738131367874, + "grad_norm": 1.3046875, + "learning_rate": 1.6366956521739133e-06, + "loss": 0.2255, + "step": 19340 + }, + { + "epoch": 1.3422675048775201, + "grad_norm": 1.046875, + "learning_rate": 1.6349565217391306e-06, + "loss": 0.2735, + "step": 19350 + }, + { + "epoch": 1.3429611966182526, + "grad_norm": 1.90625, + "learning_rate": 1.633217391304348e-06, + "loss": 0.3033, + "step": 19360 + }, + { + "epoch": 1.3436548883589854, + "grad_norm": 1.3046875, + "learning_rate": 1.6314782608695654e-06, + "loss": 0.2539, + "step": 19370 + }, + { + "epoch": 1.3443485800997181, + "grad_norm": 1.1171875, + "learning_rate": 1.629739130434783e-06, + "loss": 0.2277, + "step": 19380 + }, + { + "epoch": 1.3450422718404509, + "grad_norm": 1.09375, + "learning_rate": 1.6280000000000001e-06, + "loss": 0.2215, + "step": 19390 + }, + { + "epoch": 1.3457359635811836, + "grad_norm": 1.03125, + "learning_rate": 1.6262608695652175e-06, + "loss": 0.2031, + "step": 19400 + }, + { + "epoch": 1.3464296553219164, + "grad_norm": 1.2734375, + "learning_rate": 1.6245217391304349e-06, + "loss": 0.2154, + "step": 19410 + }, + { + "epoch": 1.3471233470626491, + "grad_norm": 1.359375, + "learning_rate": 1.6227826086956522e-06, + "loss": 0.2311, + "step": 19420 + }, + { + "epoch": 1.3478170388033819, + "grad_norm": 1.0546875, + "learning_rate": 1.6210434782608698e-06, + "loss": 0.2294, + "step": 19430 + }, + { + "epoch": 1.3485107305441144, + "grad_norm": 1.2421875, + "learning_rate": 1.619304347826087e-06, + "loss": 0.3163, + "step": 19440 + }, + { + "epoch": 1.3492044222848472, + "grad_norm": 1.0078125, + "learning_rate": 1.6175652173913046e-06, + "loss": 0.1935, + "step": 19450 + }, + { + "epoch": 1.34989811402558, + "grad_norm": 1.453125, + "learning_rate": 1.6158260869565217e-06, + "loss": 0.2372, + "step": 19460 + }, + { + "epoch": 1.3505918057663127, + "grad_norm": 1.1484375, + "learning_rate": 1.6140869565217393e-06, + "loss": 0.3196, + "step": 19470 + }, + { + "epoch": 1.3512854975070452, + "grad_norm": 1.21875, + "learning_rate": 1.6123478260869565e-06, + "loss": 0.2137, + "step": 19480 + }, + { + "epoch": 1.351979189247778, + "grad_norm": 1.0078125, + "learning_rate": 1.610608695652174e-06, + "loss": 0.2553, + "step": 19490 + }, + { + "epoch": 1.3526728809885107, + "grad_norm": 1.0859375, + "learning_rate": 1.6088695652173914e-06, + "loss": 0.2622, + "step": 19500 + }, + { + "epoch": 1.3533665727292434, + "grad_norm": 0.98046875, + "learning_rate": 1.6071304347826088e-06, + "loss": 0.2121, + "step": 19510 + }, + { + "epoch": 1.3540602644699762, + "grad_norm": 1.953125, + "learning_rate": 1.6053913043478262e-06, + "loss": 0.2425, + "step": 19520 + }, + { + "epoch": 1.354753956210709, + "grad_norm": 1.0625, + "learning_rate": 1.6036521739130437e-06, + "loss": 0.2537, + "step": 19530 + }, + { + "epoch": 1.3554476479514417, + "grad_norm": 1.7890625, + "learning_rate": 1.601913043478261e-06, + "loss": 0.3255, + "step": 19540 + }, + { + "epoch": 1.3561413396921742, + "grad_norm": 1.46875, + "learning_rate": 1.6001739130434785e-06, + "loss": 0.2601, + "step": 19550 + }, + { + "epoch": 1.356835031432907, + "grad_norm": 1.296875, + "learning_rate": 1.5984347826086956e-06, + "loss": 0.2258, + "step": 19560 + }, + { + "epoch": 1.3575287231736397, + "grad_norm": 1.3203125, + "learning_rate": 1.5966956521739132e-06, + "loss": 0.2909, + "step": 19570 + }, + { + "epoch": 1.3582224149143725, + "grad_norm": 1.1328125, + "learning_rate": 1.5949565217391306e-06, + "loss": 0.254, + "step": 19580 + }, + { + "epoch": 1.3589161066551052, + "grad_norm": 0.94140625, + "learning_rate": 1.593217391304348e-06, + "loss": 0.2798, + "step": 19590 + }, + { + "epoch": 1.3596097983958377, + "grad_norm": 1.2890625, + "learning_rate": 1.5914782608695653e-06, + "loss": 0.2097, + "step": 19600 + }, + { + "epoch": 1.3603034901365705, + "grad_norm": 1.171875, + "learning_rate": 1.589739130434783e-06, + "loss": 0.3109, + "step": 19610 + }, + { + "epoch": 1.3609971818773032, + "grad_norm": 0.94140625, + "learning_rate": 1.588e-06, + "loss": 0.3035, + "step": 19620 + }, + { + "epoch": 1.361690873618036, + "grad_norm": 1.3359375, + "learning_rate": 1.5862608695652177e-06, + "loss": 0.3036, + "step": 19630 + }, + { + "epoch": 1.3623845653587687, + "grad_norm": 1.203125, + "learning_rate": 1.5845217391304348e-06, + "loss": 0.2218, + "step": 19640 + }, + { + "epoch": 1.3630782570995015, + "grad_norm": 1.2890625, + "learning_rate": 1.5827826086956524e-06, + "loss": 0.2462, + "step": 19650 + }, + { + "epoch": 1.3637719488402342, + "grad_norm": 0.984375, + "learning_rate": 1.5810434782608698e-06, + "loss": 0.2062, + "step": 19660 + }, + { + "epoch": 1.3644656405809668, + "grad_norm": 1.1875, + "learning_rate": 1.5793043478260872e-06, + "loss": 0.2235, + "step": 19670 + }, + { + "epoch": 1.3651593323216995, + "grad_norm": 1.5703125, + "learning_rate": 1.5775652173913045e-06, + "loss": 0.2577, + "step": 19680 + }, + { + "epoch": 1.3658530240624323, + "grad_norm": 1.0625, + "learning_rate": 1.5758260869565221e-06, + "loss": 0.2769, + "step": 19690 + }, + { + "epoch": 1.366546715803165, + "grad_norm": 1.1171875, + "learning_rate": 1.5740869565217393e-06, + "loss": 0.222, + "step": 19700 + }, + { + "epoch": 1.3672404075438978, + "grad_norm": 1.1640625, + "learning_rate": 1.5723478260869564e-06, + "loss": 0.2199, + "step": 19710 + }, + { + "epoch": 1.3679340992846303, + "grad_norm": 1.1015625, + "learning_rate": 1.570608695652174e-06, + "loss": 0.2524, + "step": 19720 + }, + { + "epoch": 1.368627791025363, + "grad_norm": 1.0390625, + "learning_rate": 1.5688695652173914e-06, + "loss": 0.2201, + "step": 19730 + }, + { + "epoch": 1.3693214827660958, + "grad_norm": 1.0078125, + "learning_rate": 1.5671304347826088e-06, + "loss": 0.3256, + "step": 19740 + }, + { + "epoch": 1.3700151745068285, + "grad_norm": 1.546875, + "learning_rate": 1.5653913043478261e-06, + "loss": 0.2929, + "step": 19750 + }, + { + "epoch": 1.3707088662475613, + "grad_norm": 1.2421875, + "learning_rate": 1.5636521739130437e-06, + "loss": 0.2536, + "step": 19760 + }, + { + "epoch": 1.371402557988294, + "grad_norm": 1.3828125, + "learning_rate": 1.5619130434782609e-06, + "loss": 0.2349, + "step": 19770 + }, + { + "epoch": 1.3720962497290268, + "grad_norm": 1.203125, + "learning_rate": 1.5601739130434784e-06, + "loss": 0.2195, + "step": 19780 + }, + { + "epoch": 1.3727899414697593, + "grad_norm": 1.171875, + "learning_rate": 1.5584347826086956e-06, + "loss": 0.278, + "step": 19790 + }, + { + "epoch": 1.373483633210492, + "grad_norm": 1.6953125, + "learning_rate": 1.5566956521739132e-06, + "loss": 0.2423, + "step": 19800 + }, + { + "epoch": 1.3741773249512248, + "grad_norm": 1.140625, + "learning_rate": 1.5549565217391306e-06, + "loss": 0.2768, + "step": 19810 + }, + { + "epoch": 1.3748710166919575, + "grad_norm": 1.0078125, + "learning_rate": 1.553217391304348e-06, + "loss": 0.2441, + "step": 19820 + }, + { + "epoch": 1.3755647084326903, + "grad_norm": 1.6015625, + "learning_rate": 1.5514782608695653e-06, + "loss": 0.2551, + "step": 19830 + }, + { + "epoch": 1.3762584001734228, + "grad_norm": 1.90625, + "learning_rate": 1.5497391304347829e-06, + "loss": 0.2861, + "step": 19840 + }, + { + "epoch": 1.3769520919141556, + "grad_norm": 1.5, + "learning_rate": 1.548e-06, + "loss": 0.2289, + "step": 19850 + }, + { + "epoch": 1.3776457836548883, + "grad_norm": 1.3828125, + "learning_rate": 1.5462608695652176e-06, + "loss": 0.2622, + "step": 19860 + }, + { + "epoch": 1.378339475395621, + "grad_norm": 1.046875, + "learning_rate": 1.5445217391304348e-06, + "loss": 0.3027, + "step": 19870 + }, + { + "epoch": 1.3790331671363538, + "grad_norm": 1.2421875, + "learning_rate": 1.5427826086956524e-06, + "loss": 0.2128, + "step": 19880 + }, + { + "epoch": 1.3797268588770866, + "grad_norm": 1.1015625, + "learning_rate": 1.5410434782608697e-06, + "loss": 0.2341, + "step": 19890 + }, + { + "epoch": 1.3804205506178193, + "grad_norm": 1.078125, + "learning_rate": 1.5393043478260871e-06, + "loss": 0.2186, + "step": 19900 + }, + { + "epoch": 1.3811142423585518, + "grad_norm": 1.21875, + "learning_rate": 1.5375652173913045e-06, + "loss": 0.2346, + "step": 19910 + }, + { + "epoch": 1.3818079340992846, + "grad_norm": 1.15625, + "learning_rate": 1.535826086956522e-06, + "loss": 0.2231, + "step": 19920 + }, + { + "epoch": 1.3825016258400173, + "grad_norm": 1.1796875, + "learning_rate": 1.5340869565217392e-06, + "loss": 0.2264, + "step": 19930 + }, + { + "epoch": 1.38319531758075, + "grad_norm": 1.1640625, + "learning_rate": 1.5323478260869568e-06, + "loss": 0.2376, + "step": 19940 + }, + { + "epoch": 1.3838890093214828, + "grad_norm": 1.34375, + "learning_rate": 1.530608695652174e-06, + "loss": 0.2588, + "step": 19950 + }, + { + "epoch": 1.3845827010622154, + "grad_norm": 1.2578125, + "learning_rate": 1.5288695652173916e-06, + "loss": 0.2747, + "step": 19960 + }, + { + "epoch": 1.3852763928029481, + "grad_norm": 1.421875, + "learning_rate": 1.5271304347826087e-06, + "loss": 0.2317, + "step": 19970 + }, + { + "epoch": 1.3859700845436809, + "grad_norm": 1.296875, + "learning_rate": 1.5253913043478263e-06, + "loss": 0.2596, + "step": 19980 + }, + { + "epoch": 1.3866637762844136, + "grad_norm": 1.328125, + "learning_rate": 1.5236521739130437e-06, + "loss": 0.2658, + "step": 19990 + }, + { + "epoch": 1.3873574680251464, + "grad_norm": 1.140625, + "learning_rate": 1.521913043478261e-06, + "loss": 0.2576, + "step": 20000 + }, + { + "epoch": 1.3880511597658791, + "grad_norm": 1.5390625, + "learning_rate": 1.5201739130434784e-06, + "loss": 0.237, + "step": 20010 + }, + { + "epoch": 1.3887448515066119, + "grad_norm": 1.1953125, + "learning_rate": 1.5184347826086956e-06, + "loss": 0.2249, + "step": 20020 + }, + { + "epoch": 1.3894385432473444, + "grad_norm": 1.3359375, + "learning_rate": 1.5166956521739131e-06, + "loss": 0.2541, + "step": 20030 + }, + { + "epoch": 1.3901322349880771, + "grad_norm": 0.87109375, + "learning_rate": 1.5149565217391305e-06, + "loss": 0.2328, + "step": 20040 + }, + { + "epoch": 1.39082592672881, + "grad_norm": 0.8515625, + "learning_rate": 1.5132173913043479e-06, + "loss": 0.2299, + "step": 20050 + }, + { + "epoch": 1.3915196184695426, + "grad_norm": 1.21875, + "learning_rate": 1.5114782608695653e-06, + "loss": 0.2728, + "step": 20060 + }, + { + "epoch": 1.3922133102102754, + "grad_norm": 0.94140625, + "learning_rate": 1.5097391304347828e-06, + "loss": 0.2113, + "step": 20070 + }, + { + "epoch": 1.392907001951008, + "grad_norm": 1.1953125, + "learning_rate": 1.508e-06, + "loss": 0.2075, + "step": 20080 + }, + { + "epoch": 1.3936006936917407, + "grad_norm": 1.1484375, + "learning_rate": 1.5062608695652176e-06, + "loss": 0.2242, + "step": 20090 + }, + { + "epoch": 1.3942943854324734, + "grad_norm": 0.90625, + "learning_rate": 1.5045217391304347e-06, + "loss": 0.2404, + "step": 20100 + }, + { + "epoch": 1.3949880771732062, + "grad_norm": 1.25, + "learning_rate": 1.5027826086956523e-06, + "loss": 0.2921, + "step": 20110 + }, + { + "epoch": 1.395681768913939, + "grad_norm": 1.3359375, + "learning_rate": 1.5010434782608697e-06, + "loss": 0.2469, + "step": 20120 + }, + { + "epoch": 1.3963754606546717, + "grad_norm": 1.53125, + "learning_rate": 1.499304347826087e-06, + "loss": 0.2326, + "step": 20130 + }, + { + "epoch": 1.3970691523954044, + "grad_norm": 1.140625, + "learning_rate": 1.4975652173913044e-06, + "loss": 0.2378, + "step": 20140 + }, + { + "epoch": 1.397762844136137, + "grad_norm": 1.34375, + "learning_rate": 1.495826086956522e-06, + "loss": 0.2893, + "step": 20150 + }, + { + "epoch": 1.3984565358768697, + "grad_norm": 0.7890625, + "learning_rate": 1.4940869565217392e-06, + "loss": 0.2289, + "step": 20160 + }, + { + "epoch": 1.3991502276176024, + "grad_norm": 1.3046875, + "learning_rate": 1.4923478260869568e-06, + "loss": 0.2284, + "step": 20170 + }, + { + "epoch": 1.3998439193583352, + "grad_norm": 0.953125, + "learning_rate": 1.490608695652174e-06, + "loss": 0.2175, + "step": 20180 + }, + { + "epoch": 1.4005376110990677, + "grad_norm": 1.2421875, + "learning_rate": 1.4888695652173915e-06, + "loss": 0.213, + "step": 20190 + }, + { + "epoch": 1.4012313028398005, + "grad_norm": 1.6484375, + "learning_rate": 1.4871304347826087e-06, + "loss": 0.2365, + "step": 20200 + }, + { + "epoch": 1.4019249945805332, + "grad_norm": 1.515625, + "learning_rate": 1.4853913043478263e-06, + "loss": 0.238, + "step": 20210 + }, + { + "epoch": 1.402618686321266, + "grad_norm": 1.4609375, + "learning_rate": 1.4836521739130436e-06, + "loss": 0.227, + "step": 20220 + }, + { + "epoch": 1.4033123780619987, + "grad_norm": 1.0546875, + "learning_rate": 1.481913043478261e-06, + "loss": 0.2311, + "step": 20230 + }, + { + "epoch": 1.4040060698027315, + "grad_norm": 1.0390625, + "learning_rate": 1.4801739130434784e-06, + "loss": 0.2447, + "step": 20240 + }, + { + "epoch": 1.4046997615434642, + "grad_norm": 1.4140625, + "learning_rate": 1.478434782608696e-06, + "loss": 0.2528, + "step": 20250 + }, + { + "epoch": 1.405393453284197, + "grad_norm": 1.2734375, + "learning_rate": 1.4766956521739131e-06, + "loss": 0.2466, + "step": 20260 + }, + { + "epoch": 1.4060871450249295, + "grad_norm": 1.515625, + "learning_rate": 1.4749565217391307e-06, + "loss": 0.2668, + "step": 20270 + }, + { + "epoch": 1.4067808367656622, + "grad_norm": 1.25, + "learning_rate": 1.4732173913043478e-06, + "loss": 0.2382, + "step": 20280 + }, + { + "epoch": 1.407474528506395, + "grad_norm": 1.2265625, + "learning_rate": 1.4714782608695654e-06, + "loss": 0.2393, + "step": 20290 + }, + { + "epoch": 1.4081682202471277, + "grad_norm": 0.828125, + "learning_rate": 1.4697391304347828e-06, + "loss": 0.2099, + "step": 20300 + }, + { + "epoch": 1.4088619119878603, + "grad_norm": 1.1328125, + "learning_rate": 1.4680000000000002e-06, + "loss": 0.2382, + "step": 20310 + }, + { + "epoch": 1.409555603728593, + "grad_norm": 1.5703125, + "learning_rate": 1.4662608695652175e-06, + "loss": 0.3246, + "step": 20320 + }, + { + "epoch": 1.4102492954693258, + "grad_norm": 1.0703125, + "learning_rate": 1.4645217391304347e-06, + "loss": 0.3255, + "step": 20330 + }, + { + "epoch": 1.4109429872100585, + "grad_norm": 1.0234375, + "learning_rate": 1.4627826086956523e-06, + "loss": 0.1941, + "step": 20340 + }, + { + "epoch": 1.4116366789507913, + "grad_norm": 1.0859375, + "learning_rate": 1.4610434782608697e-06, + "loss": 0.2243, + "step": 20350 + }, + { + "epoch": 1.412330370691524, + "grad_norm": 1.140625, + "learning_rate": 1.459304347826087e-06, + "loss": 0.219, + "step": 20360 + }, + { + "epoch": 1.4130240624322568, + "grad_norm": 1.203125, + "learning_rate": 1.4575652173913044e-06, + "loss": 0.225, + "step": 20370 + }, + { + "epoch": 1.4137177541729895, + "grad_norm": 1.140625, + "learning_rate": 1.455826086956522e-06, + "loss": 0.2449, + "step": 20380 + }, + { + "epoch": 1.414411445913722, + "grad_norm": 0.984375, + "learning_rate": 1.4540869565217391e-06, + "loss": 0.2189, + "step": 20390 + }, + { + "epoch": 1.4151051376544548, + "grad_norm": 1.390625, + "learning_rate": 1.4523478260869567e-06, + "loss": 0.2627, + "step": 20400 + }, + { + "epoch": 1.4157988293951875, + "grad_norm": 1.046875, + "learning_rate": 1.4506086956521739e-06, + "loss": 0.2063, + "step": 20410 + }, + { + "epoch": 1.4164925211359203, + "grad_norm": 1.0078125, + "learning_rate": 1.4488695652173915e-06, + "loss": 0.213, + "step": 20420 + }, + { + "epoch": 1.4171862128766528, + "grad_norm": 1.3671875, + "learning_rate": 1.4471304347826086e-06, + "loss": 0.2139, + "step": 20430 + }, + { + "epoch": 1.4178799046173856, + "grad_norm": 1.2265625, + "learning_rate": 1.4453913043478262e-06, + "loss": 0.2276, + "step": 20440 + }, + { + "epoch": 1.4185735963581183, + "grad_norm": 1.234375, + "learning_rate": 1.4436521739130436e-06, + "loss": 0.2084, + "step": 20450 + }, + { + "epoch": 1.419267288098851, + "grad_norm": 0.9765625, + "learning_rate": 1.441913043478261e-06, + "loss": 0.2437, + "step": 20460 + }, + { + "epoch": 1.4199609798395838, + "grad_norm": 1.15625, + "learning_rate": 1.4401739130434783e-06, + "loss": 0.2394, + "step": 20470 + }, + { + "epoch": 1.4206546715803166, + "grad_norm": 1.4765625, + "learning_rate": 1.438434782608696e-06, + "loss": 0.251, + "step": 20480 + }, + { + "epoch": 1.4213483633210493, + "grad_norm": 1.3125, + "learning_rate": 1.436695652173913e-06, + "loss": 0.288, + "step": 20490 + }, + { + "epoch": 1.422042055061782, + "grad_norm": 1.0390625, + "learning_rate": 1.4349565217391306e-06, + "loss": 0.2525, + "step": 20500 + }, + { + "epoch": 1.4227357468025146, + "grad_norm": 1.2578125, + "learning_rate": 1.4332173913043478e-06, + "loss": 0.2392, + "step": 20510 + }, + { + "epoch": 1.4234294385432473, + "grad_norm": 0.96484375, + "learning_rate": 1.4314782608695654e-06, + "loss": 0.2057, + "step": 20520 + }, + { + "epoch": 1.42412313028398, + "grad_norm": 1.5, + "learning_rate": 1.4297391304347828e-06, + "loss": 0.2368, + "step": 20530 + }, + { + "epoch": 1.4248168220247128, + "grad_norm": 1.265625, + "learning_rate": 1.4280000000000001e-06, + "loss": 0.2351, + "step": 20540 + }, + { + "epoch": 1.4255105137654454, + "grad_norm": 1.234375, + "learning_rate": 1.4262608695652175e-06, + "loss": 0.2066, + "step": 20550 + }, + { + "epoch": 1.426204205506178, + "grad_norm": 1.6328125, + "learning_rate": 1.424521739130435e-06, + "loss": 0.2405, + "step": 20560 + }, + { + "epoch": 1.4268978972469109, + "grad_norm": 1.328125, + "learning_rate": 1.4227826086956522e-06, + "loss": 0.2749, + "step": 20570 + }, + { + "epoch": 1.4275915889876436, + "grad_norm": 1.5078125, + "learning_rate": 1.4210434782608698e-06, + "loss": 0.2632, + "step": 20580 + }, + { + "epoch": 1.4282852807283763, + "grad_norm": 1.2109375, + "learning_rate": 1.419304347826087e-06, + "loss": 0.3155, + "step": 20590 + }, + { + "epoch": 1.428978972469109, + "grad_norm": 1.40625, + "learning_rate": 1.4175652173913046e-06, + "loss": 0.2016, + "step": 20600 + }, + { + "epoch": 1.4296726642098418, + "grad_norm": 1.1484375, + "learning_rate": 1.415826086956522e-06, + "loss": 0.2906, + "step": 20610 + }, + { + "epoch": 1.4303663559505746, + "grad_norm": 1.7890625, + "learning_rate": 1.4140869565217393e-06, + "loss": 0.2672, + "step": 20620 + }, + { + "epoch": 1.4310600476913071, + "grad_norm": 1.609375, + "learning_rate": 1.4123478260869567e-06, + "loss": 0.2527, + "step": 20630 + }, + { + "epoch": 1.4317537394320399, + "grad_norm": 1.1796875, + "learning_rate": 1.4106086956521738e-06, + "loss": 0.3494, + "step": 20640 + }, + { + "epoch": 1.4324474311727726, + "grad_norm": 0.8984375, + "learning_rate": 1.4088695652173914e-06, + "loss": 0.2156, + "step": 20650 + }, + { + "epoch": 1.4331411229135054, + "grad_norm": 1.2421875, + "learning_rate": 1.4071304347826086e-06, + "loss": 0.2444, + "step": 20660 + }, + { + "epoch": 1.433834814654238, + "grad_norm": 0.8828125, + "learning_rate": 1.4053913043478262e-06, + "loss": 0.2617, + "step": 20670 + }, + { + "epoch": 1.4345285063949706, + "grad_norm": 1.15625, + "learning_rate": 1.4036521739130435e-06, + "loss": 0.2533, + "step": 20680 + }, + { + "epoch": 1.4352221981357034, + "grad_norm": 1.1328125, + "learning_rate": 1.401913043478261e-06, + "loss": 0.2659, + "step": 20690 + }, + { + "epoch": 1.4359158898764361, + "grad_norm": 1.484375, + "learning_rate": 1.4001739130434783e-06, + "loss": 0.2591, + "step": 20700 + }, + { + "epoch": 1.436609581617169, + "grad_norm": 1.1953125, + "learning_rate": 1.3984347826086959e-06, + "loss": 0.2581, + "step": 20710 + }, + { + "epoch": 1.4373032733579016, + "grad_norm": 0.96484375, + "learning_rate": 1.396695652173913e-06, + "loss": 0.273, + "step": 20720 + }, + { + "epoch": 1.4379969650986344, + "grad_norm": 1.125, + "learning_rate": 1.3949565217391306e-06, + "loss": 0.2176, + "step": 20730 + }, + { + "epoch": 1.4386906568393671, + "grad_norm": 1.3359375, + "learning_rate": 1.3932173913043478e-06, + "loss": 0.2122, + "step": 20740 + }, + { + "epoch": 1.4393843485800997, + "grad_norm": 1.4140625, + "learning_rate": 1.3914782608695654e-06, + "loss": 0.2135, + "step": 20750 + }, + { + "epoch": 1.4400780403208324, + "grad_norm": 0.75, + "learning_rate": 1.3897391304347827e-06, + "loss": 0.2196, + "step": 20760 + }, + { + "epoch": 1.4407717320615652, + "grad_norm": 0.82421875, + "learning_rate": 1.388e-06, + "loss": 0.2397, + "step": 20770 + }, + { + "epoch": 1.441465423802298, + "grad_norm": 1.2578125, + "learning_rate": 1.3862608695652175e-06, + "loss": 0.2441, + "step": 20780 + }, + { + "epoch": 1.4421591155430304, + "grad_norm": 1.1484375, + "learning_rate": 1.384521739130435e-06, + "loss": 0.2584, + "step": 20790 + }, + { + "epoch": 1.4428528072837632, + "grad_norm": 1.15625, + "learning_rate": 1.3827826086956522e-06, + "loss": 0.2123, + "step": 20800 + }, + { + "epoch": 1.443546499024496, + "grad_norm": 0.96484375, + "learning_rate": 1.3810434782608698e-06, + "loss": 0.2157, + "step": 20810 + }, + { + "epoch": 1.4442401907652287, + "grad_norm": 1.4609375, + "learning_rate": 1.379304347826087e-06, + "loss": 0.2407, + "step": 20820 + }, + { + "epoch": 1.4449338825059614, + "grad_norm": 1.3203125, + "learning_rate": 1.3775652173913045e-06, + "loss": 0.2347, + "step": 20830 + }, + { + "epoch": 1.4456275742466942, + "grad_norm": 1.234375, + "learning_rate": 1.375826086956522e-06, + "loss": 0.2429, + "step": 20840 + }, + { + "epoch": 1.446321265987427, + "grad_norm": 1.1328125, + "learning_rate": 1.3740869565217393e-06, + "loss": 0.2441, + "step": 20850 + }, + { + "epoch": 1.4470149577281595, + "grad_norm": 1.421875, + "learning_rate": 1.3723478260869566e-06, + "loss": 0.2429, + "step": 20860 + }, + { + "epoch": 1.4477086494688922, + "grad_norm": 1.0859375, + "learning_rate": 1.3706086956521742e-06, + "loss": 0.2133, + "step": 20870 + }, + { + "epoch": 1.448402341209625, + "grad_norm": 1.0546875, + "learning_rate": 1.3688695652173914e-06, + "loss": 0.232, + "step": 20880 + }, + { + "epoch": 1.4490960329503577, + "grad_norm": 1.4375, + "learning_rate": 1.367130434782609e-06, + "loss": 0.2202, + "step": 20890 + }, + { + "epoch": 1.4497897246910905, + "grad_norm": 1.109375, + "learning_rate": 1.3653913043478261e-06, + "loss": 0.2187, + "step": 20900 + }, + { + "epoch": 1.450483416431823, + "grad_norm": 1.0078125, + "learning_rate": 1.3636521739130437e-06, + "loss": 0.2306, + "step": 20910 + }, + { + "epoch": 1.4511771081725557, + "grad_norm": 1.40625, + "learning_rate": 1.3619130434782609e-06, + "loss": 0.2472, + "step": 20920 + }, + { + "epoch": 1.4518707999132885, + "grad_norm": 1.78125, + "learning_rate": 1.3601739130434782e-06, + "loss": 0.2569, + "step": 20930 + }, + { + "epoch": 1.4525644916540212, + "grad_norm": 0.85546875, + "learning_rate": 1.3584347826086958e-06, + "loss": 0.2277, + "step": 20940 + }, + { + "epoch": 1.453258183394754, + "grad_norm": 1.140625, + "learning_rate": 1.356695652173913e-06, + "loss": 0.2238, + "step": 20950 + }, + { + "epoch": 1.4539518751354867, + "grad_norm": 1.375, + "learning_rate": 1.3549565217391306e-06, + "loss": 0.2185, + "step": 20960 + }, + { + "epoch": 1.4546455668762195, + "grad_norm": 1.2734375, + "learning_rate": 1.3532173913043477e-06, + "loss": 0.2834, + "step": 20970 + }, + { + "epoch": 1.455339258616952, + "grad_norm": 1.25, + "learning_rate": 1.3514782608695653e-06, + "loss": 0.2911, + "step": 20980 + }, + { + "epoch": 1.4560329503576848, + "grad_norm": 1.4453125, + "learning_rate": 1.3497391304347827e-06, + "loss": 0.2226, + "step": 20990 + }, + { + "epoch": 1.4567266420984175, + "grad_norm": 1.0546875, + "learning_rate": 1.348e-06, + "loss": 0.2248, + "step": 21000 + }, + { + "epoch": 1.4574203338391503, + "grad_norm": 1.1171875, + "learning_rate": 1.3462608695652174e-06, + "loss": 0.2437, + "step": 21010 + }, + { + "epoch": 1.458114025579883, + "grad_norm": 1.3125, + "learning_rate": 1.344521739130435e-06, + "loss": 0.2421, + "step": 21020 + }, + { + "epoch": 1.4588077173206155, + "grad_norm": 1.328125, + "learning_rate": 1.3427826086956522e-06, + "loss": 0.2161, + "step": 21030 + }, + { + "epoch": 1.4595014090613483, + "grad_norm": 1.34375, + "learning_rate": 1.3410434782608697e-06, + "loss": 0.245, + "step": 21040 + }, + { + "epoch": 1.460195100802081, + "grad_norm": 1.4140625, + "learning_rate": 1.339304347826087e-06, + "loss": 0.2432, + "step": 21050 + }, + { + "epoch": 1.4608887925428138, + "grad_norm": 1.5625, + "learning_rate": 1.3375652173913045e-06, + "loss": 0.2379, + "step": 21060 + }, + { + "epoch": 1.4615824842835465, + "grad_norm": 1.5859375, + "learning_rate": 1.3358260869565219e-06, + "loss": 0.2513, + "step": 21070 + }, + { + "epoch": 1.4622761760242793, + "grad_norm": 1.203125, + "learning_rate": 1.3340869565217392e-06, + "loss": 0.3091, + "step": 21080 + }, + { + "epoch": 1.462969867765012, + "grad_norm": 1.4296875, + "learning_rate": 1.3323478260869566e-06, + "loss": 0.2542, + "step": 21090 + }, + { + "epoch": 1.4636635595057446, + "grad_norm": 1.296875, + "learning_rate": 1.3306086956521742e-06, + "loss": 0.2138, + "step": 21100 + }, + { + "epoch": 1.4643572512464773, + "grad_norm": 1.1484375, + "learning_rate": 1.3288695652173913e-06, + "loss": 0.252, + "step": 21110 + }, + { + "epoch": 1.46505094298721, + "grad_norm": 1.25, + "learning_rate": 1.327130434782609e-06, + "loss": 0.2159, + "step": 21120 + }, + { + "epoch": 1.4657446347279428, + "grad_norm": 1.3359375, + "learning_rate": 1.325391304347826e-06, + "loss": 0.2535, + "step": 21130 + }, + { + "epoch": 1.4664383264686756, + "grad_norm": 1.6328125, + "learning_rate": 1.3236521739130437e-06, + "loss": 0.3247, + "step": 21140 + }, + { + "epoch": 1.467132018209408, + "grad_norm": 1.1328125, + "learning_rate": 1.3219130434782608e-06, + "loss": 0.2415, + "step": 21150 + }, + { + "epoch": 1.4678257099501408, + "grad_norm": 1.125, + "learning_rate": 1.3201739130434784e-06, + "loss": 0.2295, + "step": 21160 + }, + { + "epoch": 1.4685194016908736, + "grad_norm": 1.3828125, + "learning_rate": 1.3184347826086958e-06, + "loss": 0.2348, + "step": 21170 + }, + { + "epoch": 1.4692130934316063, + "grad_norm": 1.1796875, + "learning_rate": 1.3166956521739134e-06, + "loss": 0.2122, + "step": 21180 + }, + { + "epoch": 1.469906785172339, + "grad_norm": 1.078125, + "learning_rate": 1.3149565217391305e-06, + "loss": 0.2565, + "step": 21190 + }, + { + "epoch": 1.4706004769130718, + "grad_norm": 1.390625, + "learning_rate": 1.3132173913043481e-06, + "loss": 0.2724, + "step": 21200 + }, + { + "epoch": 1.4712941686538046, + "grad_norm": 1.140625, + "learning_rate": 1.3114782608695653e-06, + "loss": 0.2413, + "step": 21210 + }, + { + "epoch": 1.471987860394537, + "grad_norm": 1.7890625, + "learning_rate": 1.3097391304347829e-06, + "loss": 0.235, + "step": 21220 + }, + { + "epoch": 1.4726815521352699, + "grad_norm": 0.92578125, + "learning_rate": 1.308e-06, + "loss": 0.2664, + "step": 21230 + }, + { + "epoch": 1.4733752438760026, + "grad_norm": 1.2578125, + "learning_rate": 1.3062608695652174e-06, + "loss": 0.2151, + "step": 21240 + }, + { + "epoch": 1.4740689356167354, + "grad_norm": 1.34375, + "learning_rate": 1.304521739130435e-06, + "loss": 0.2598, + "step": 21250 + }, + { + "epoch": 1.474762627357468, + "grad_norm": 1.7265625, + "learning_rate": 1.3027826086956521e-06, + "loss": 0.2444, + "step": 21260 + }, + { + "epoch": 1.4754563190982006, + "grad_norm": 1.671875, + "learning_rate": 1.3010434782608697e-06, + "loss": 0.3088, + "step": 21270 + }, + { + "epoch": 1.4761500108389334, + "grad_norm": 1.1484375, + "learning_rate": 1.2993043478260869e-06, + "loss": 0.2525, + "step": 21280 + }, + { + "epoch": 1.4768437025796661, + "grad_norm": 1.2109375, + "learning_rate": 1.2975652173913045e-06, + "loss": 0.2769, + "step": 21290 + }, + { + "epoch": 1.4775373943203989, + "grad_norm": 1.125, + "learning_rate": 1.2958260869565218e-06, + "loss": 0.2298, + "step": 21300 + }, + { + "epoch": 1.4782310860611316, + "grad_norm": 2.3125, + "learning_rate": 1.2940869565217392e-06, + "loss": 0.3014, + "step": 21310 + }, + { + "epoch": 1.4789247778018644, + "grad_norm": 2.015625, + "learning_rate": 1.2923478260869566e-06, + "loss": 0.2391, + "step": 21320 + }, + { + "epoch": 1.4796184695425971, + "grad_norm": 1.1640625, + "learning_rate": 1.2906086956521741e-06, + "loss": 0.2179, + "step": 21330 + }, + { + "epoch": 1.4803121612833297, + "grad_norm": 1.25, + "learning_rate": 1.2888695652173913e-06, + "loss": 0.2062, + "step": 21340 + }, + { + "epoch": 1.4810058530240624, + "grad_norm": 1.453125, + "learning_rate": 1.2871304347826089e-06, + "loss": 0.2396, + "step": 21350 + }, + { + "epoch": 1.4816995447647952, + "grad_norm": 1.09375, + "learning_rate": 1.285391304347826e-06, + "loss": 0.2306, + "step": 21360 + }, + { + "epoch": 1.482393236505528, + "grad_norm": 1.0390625, + "learning_rate": 1.2836521739130436e-06, + "loss": 0.3046, + "step": 21370 + }, + { + "epoch": 1.4830869282462606, + "grad_norm": 1.1796875, + "learning_rate": 1.281913043478261e-06, + "loss": 0.2427, + "step": 21380 + }, + { + "epoch": 1.4837806199869932, + "grad_norm": 1.96875, + "learning_rate": 1.2801739130434784e-06, + "loss": 0.2876, + "step": 21390 + }, + { + "epoch": 1.484474311727726, + "grad_norm": 1.25, + "learning_rate": 1.2784347826086957e-06, + "loss": 0.2684, + "step": 21400 + }, + { + "epoch": 1.4851680034684587, + "grad_norm": 1.34375, + "learning_rate": 1.2766956521739133e-06, + "loss": 0.2441, + "step": 21410 + }, + { + "epoch": 1.4858616952091914, + "grad_norm": 1.15625, + "learning_rate": 1.2749565217391305e-06, + "loss": 0.2629, + "step": 21420 + }, + { + "epoch": 1.4865553869499242, + "grad_norm": 1.265625, + "learning_rate": 1.273217391304348e-06, + "loss": 0.3055, + "step": 21430 + }, + { + "epoch": 1.487249078690657, + "grad_norm": 1.7265625, + "learning_rate": 1.2714782608695652e-06, + "loss": 0.2489, + "step": 21440 + }, + { + "epoch": 1.4879427704313897, + "grad_norm": 1.0703125, + "learning_rate": 1.2697391304347828e-06, + "loss": 0.2639, + "step": 21450 + }, + { + "epoch": 1.4886364621721222, + "grad_norm": 1.2578125, + "learning_rate": 1.268e-06, + "loss": 0.2326, + "step": 21460 + }, + { + "epoch": 1.489330153912855, + "grad_norm": 1.2109375, + "learning_rate": 1.2662608695652176e-06, + "loss": 0.2768, + "step": 21470 + }, + { + "epoch": 1.4900238456535877, + "grad_norm": 1.3671875, + "learning_rate": 1.264521739130435e-06, + "loss": 0.2324, + "step": 21480 + }, + { + "epoch": 1.4907175373943204, + "grad_norm": 1.1953125, + "learning_rate": 1.2627826086956523e-06, + "loss": 0.2632, + "step": 21490 + }, + { + "epoch": 1.491411229135053, + "grad_norm": 1.34375, + "learning_rate": 1.2610434782608697e-06, + "loss": 0.1857, + "step": 21500 + }, + { + "epoch": 1.4921049208757857, + "grad_norm": 1.0078125, + "learning_rate": 1.2593043478260873e-06, + "loss": 0.2186, + "step": 21510 + }, + { + "epoch": 1.4927986126165185, + "grad_norm": 1.234375, + "learning_rate": 1.2575652173913044e-06, + "loss": 0.2557, + "step": 21520 + }, + { + "epoch": 1.4934923043572512, + "grad_norm": 1.234375, + "learning_rate": 1.255826086956522e-06, + "loss": 0.2264, + "step": 21530 + }, + { + "epoch": 1.494185996097984, + "grad_norm": 0.99609375, + "learning_rate": 1.2540869565217392e-06, + "loss": 0.2318, + "step": 21540 + }, + { + "epoch": 1.4948796878387167, + "grad_norm": 1.21875, + "learning_rate": 1.2523478260869565e-06, + "loss": 0.2262, + "step": 21550 + }, + { + "epoch": 1.4955733795794495, + "grad_norm": 1.3515625, + "learning_rate": 1.2506086956521741e-06, + "loss": 0.2534, + "step": 21560 + }, + { + "epoch": 1.4962670713201822, + "grad_norm": 1.328125, + "learning_rate": 1.2488695652173915e-06, + "loss": 0.2823, + "step": 21570 + }, + { + "epoch": 1.4969607630609147, + "grad_norm": 1.3671875, + "learning_rate": 1.2471304347826088e-06, + "loss": 0.2572, + "step": 21580 + }, + { + "epoch": 1.4976544548016475, + "grad_norm": 1.109375, + "learning_rate": 1.2453913043478262e-06, + "loss": 0.2499, + "step": 21590 + }, + { + "epoch": 1.4983481465423802, + "grad_norm": 0.96484375, + "learning_rate": 1.2436521739130436e-06, + "loss": 0.2307, + "step": 21600 + }, + { + "epoch": 1.499041838283113, + "grad_norm": 1.21875, + "learning_rate": 1.241913043478261e-06, + "loss": 0.2116, + "step": 21610 + }, + { + "epoch": 1.4997355300238455, + "grad_norm": 1.2109375, + "learning_rate": 1.2401739130434783e-06, + "loss": 0.2305, + "step": 21620 + }, + { + "epoch": 1.5004292217645783, + "grad_norm": 1.3203125, + "learning_rate": 1.2384347826086957e-06, + "loss": 0.3057, + "step": 21630 + }, + { + "epoch": 1.501122913505311, + "grad_norm": 1.3203125, + "learning_rate": 1.2366956521739133e-06, + "loss": 0.24, + "step": 21640 + }, + { + "epoch": 1.5018166052460438, + "grad_norm": 1.40625, + "learning_rate": 1.2349565217391307e-06, + "loss": 0.2924, + "step": 21650 + }, + { + "epoch": 1.5025102969867765, + "grad_norm": 1.125, + "learning_rate": 1.233217391304348e-06, + "loss": 0.2801, + "step": 21660 + }, + { + "epoch": 1.5032039887275093, + "grad_norm": 1.203125, + "learning_rate": 1.2314782608695654e-06, + "loss": 0.2273, + "step": 21670 + }, + { + "epoch": 1.503897680468242, + "grad_norm": 1.1640625, + "learning_rate": 1.2297391304347828e-06, + "loss": 0.2464, + "step": 21680 + }, + { + "epoch": 1.5045913722089748, + "grad_norm": 1.1953125, + "learning_rate": 1.2280000000000001e-06, + "loss": 0.284, + "step": 21690 + }, + { + "epoch": 1.5052850639497075, + "grad_norm": 1.140625, + "learning_rate": 1.2262608695652175e-06, + "loss": 0.2401, + "step": 21700 + }, + { + "epoch": 1.50597875569044, + "grad_norm": 1.1328125, + "learning_rate": 1.2245217391304349e-06, + "loss": 0.2709, + "step": 21710 + }, + { + "epoch": 1.5066724474311728, + "grad_norm": 1.1328125, + "learning_rate": 1.2227826086956523e-06, + "loss": 0.2462, + "step": 21720 + }, + { + "epoch": 1.5073661391719055, + "grad_norm": 1.28125, + "learning_rate": 1.2210434782608696e-06, + "loss": 0.2562, + "step": 21730 + }, + { + "epoch": 1.508059830912638, + "grad_norm": 0.9765625, + "learning_rate": 1.219304347826087e-06, + "loss": 0.2227, + "step": 21740 + }, + { + "epoch": 1.5087535226533708, + "grad_norm": 1.265625, + "learning_rate": 1.2175652173913044e-06, + "loss": 0.2564, + "step": 21750 + }, + { + "epoch": 1.5094472143941036, + "grad_norm": 0.921875, + "learning_rate": 1.2158260869565217e-06, + "loss": 0.2372, + "step": 21760 + }, + { + "epoch": 1.5101409061348363, + "grad_norm": 0.99609375, + "learning_rate": 1.2140869565217391e-06, + "loss": 0.2139, + "step": 21770 + }, + { + "epoch": 1.510834597875569, + "grad_norm": 1.3359375, + "learning_rate": 1.2123478260869565e-06, + "loss": 0.2311, + "step": 21780 + }, + { + "epoch": 1.5115282896163018, + "grad_norm": 1.1796875, + "learning_rate": 1.210608695652174e-06, + "loss": 0.2667, + "step": 21790 + }, + { + "epoch": 1.5122219813570346, + "grad_norm": 1.1171875, + "learning_rate": 1.2088695652173914e-06, + "loss": 0.2117, + "step": 21800 + }, + { + "epoch": 1.5129156730977673, + "grad_norm": 1.03125, + "learning_rate": 1.2071304347826088e-06, + "loss": 0.2567, + "step": 21810 + }, + { + "epoch": 1.5136093648384998, + "grad_norm": 1.3671875, + "learning_rate": 1.2053913043478262e-06, + "loss": 0.2249, + "step": 21820 + }, + { + "epoch": 1.5143030565792326, + "grad_norm": 1.125, + "learning_rate": 1.2036521739130436e-06, + "loss": 0.2845, + "step": 21830 + }, + { + "epoch": 1.5149967483199653, + "grad_norm": 1.359375, + "learning_rate": 1.201913043478261e-06, + "loss": 0.2194, + "step": 21840 + }, + { + "epoch": 1.5156904400606979, + "grad_norm": 1.2734375, + "learning_rate": 1.2001739130434783e-06, + "loss": 0.2333, + "step": 21850 + }, + { + "epoch": 1.5163841318014306, + "grad_norm": 1.234375, + "learning_rate": 1.1984347826086957e-06, + "loss": 0.2, + "step": 21860 + }, + { + "epoch": 1.5170778235421634, + "grad_norm": 1.1328125, + "learning_rate": 1.1966956521739132e-06, + "loss": 0.2649, + "step": 21870 + }, + { + "epoch": 1.517771515282896, + "grad_norm": 1.6171875, + "learning_rate": 1.1949565217391306e-06, + "loss": 0.2606, + "step": 21880 + }, + { + "epoch": 1.5184652070236289, + "grad_norm": 1.109375, + "learning_rate": 1.193217391304348e-06, + "loss": 0.2429, + "step": 21890 + }, + { + "epoch": 1.5191588987643616, + "grad_norm": 1.359375, + "learning_rate": 1.1914782608695654e-06, + "loss": 0.2327, + "step": 21900 + }, + { + "epoch": 1.5198525905050944, + "grad_norm": 1.0, + "learning_rate": 1.1897391304347827e-06, + "loss": 0.2417, + "step": 21910 + }, + { + "epoch": 1.520546282245827, + "grad_norm": 1.046875, + "learning_rate": 1.188e-06, + "loss": 0.2538, + "step": 21920 + }, + { + "epoch": 1.5212399739865599, + "grad_norm": 1.078125, + "learning_rate": 1.1862608695652175e-06, + "loss": 0.2746, + "step": 21930 + }, + { + "epoch": 1.5219336657272924, + "grad_norm": 1.3359375, + "learning_rate": 1.1845217391304348e-06, + "loss": 0.2102, + "step": 21940 + }, + { + "epoch": 1.5226273574680251, + "grad_norm": 1.2421875, + "learning_rate": 1.1827826086956522e-06, + "loss": 0.2367, + "step": 21950 + }, + { + "epoch": 1.5233210492087579, + "grad_norm": 0.85546875, + "learning_rate": 1.1810434782608698e-06, + "loss": 0.2139, + "step": 21960 + }, + { + "epoch": 1.5240147409494904, + "grad_norm": 1.15625, + "learning_rate": 1.1793043478260872e-06, + "loss": 0.3149, + "step": 21970 + }, + { + "epoch": 1.5247084326902232, + "grad_norm": 1.1796875, + "learning_rate": 1.1775652173913045e-06, + "loss": 0.2062, + "step": 21980 + }, + { + "epoch": 1.525402124430956, + "grad_norm": 1.46875, + "learning_rate": 1.175826086956522e-06, + "loss": 0.2454, + "step": 21990 + }, + { + "epoch": 1.5260958161716887, + "grad_norm": 1.0078125, + "learning_rate": 1.1740869565217393e-06, + "loss": 0.1977, + "step": 22000 + }, + { + "epoch": 1.5267895079124214, + "grad_norm": 1.5546875, + "learning_rate": 1.1723478260869567e-06, + "loss": 0.26, + "step": 22010 + }, + { + "epoch": 1.5274831996531542, + "grad_norm": 1.015625, + "learning_rate": 1.170608695652174e-06, + "loss": 0.2235, + "step": 22020 + }, + { + "epoch": 1.528176891393887, + "grad_norm": 1.265625, + "learning_rate": 1.1688695652173914e-06, + "loss": 0.2297, + "step": 22030 + }, + { + "epoch": 1.5288705831346197, + "grad_norm": 1.25, + "learning_rate": 1.1671304347826088e-06, + "loss": 0.2571, + "step": 22040 + }, + { + "epoch": 1.5295642748753524, + "grad_norm": 1.4453125, + "learning_rate": 1.1653913043478261e-06, + "loss": 0.2845, + "step": 22050 + }, + { + "epoch": 1.530257966616085, + "grad_norm": 1.453125, + "learning_rate": 1.1636521739130435e-06, + "loss": 0.2432, + "step": 22060 + }, + { + "epoch": 1.5309516583568177, + "grad_norm": 1.96875, + "learning_rate": 1.1619130434782609e-06, + "loss": 0.2807, + "step": 22070 + }, + { + "epoch": 1.5316453500975504, + "grad_norm": 1.09375, + "learning_rate": 1.1601739130434783e-06, + "loss": 0.222, + "step": 22080 + }, + { + "epoch": 1.532339041838283, + "grad_norm": 1.1875, + "learning_rate": 1.1584347826086956e-06, + "loss": 0.2292, + "step": 22090 + }, + { + "epoch": 1.5330327335790157, + "grad_norm": 1.8203125, + "learning_rate": 1.1566956521739132e-06, + "loss": 0.2663, + "step": 22100 + }, + { + "epoch": 1.5337264253197485, + "grad_norm": 1.3359375, + "learning_rate": 1.1549565217391306e-06, + "loss": 0.2528, + "step": 22110 + }, + { + "epoch": 1.5344201170604812, + "grad_norm": 1.484375, + "learning_rate": 1.153217391304348e-06, + "loss": 0.2552, + "step": 22120 + }, + { + "epoch": 1.535113808801214, + "grad_norm": 1.0859375, + "learning_rate": 1.1514782608695653e-06, + "loss": 0.1995, + "step": 22130 + }, + { + "epoch": 1.5358075005419467, + "grad_norm": 1.453125, + "learning_rate": 1.1497391304347827e-06, + "loss": 0.2302, + "step": 22140 + }, + { + "epoch": 1.5365011922826795, + "grad_norm": 1.1640625, + "learning_rate": 1.148e-06, + "loss": 0.2027, + "step": 22150 + }, + { + "epoch": 1.5371948840234122, + "grad_norm": 1.1875, + "learning_rate": 1.1462608695652174e-06, + "loss": 0.234, + "step": 22160 + }, + { + "epoch": 1.537888575764145, + "grad_norm": 1.3359375, + "learning_rate": 1.1445217391304348e-06, + "loss": 0.2325, + "step": 22170 + }, + { + "epoch": 1.5385822675048775, + "grad_norm": 1.703125, + "learning_rate": 1.1427826086956522e-06, + "loss": 0.2567, + "step": 22180 + }, + { + "epoch": 1.5392759592456102, + "grad_norm": 1.21875, + "learning_rate": 1.1410434782608698e-06, + "loss": 0.2226, + "step": 22190 + }, + { + "epoch": 1.539969650986343, + "grad_norm": 1.1015625, + "learning_rate": 1.1393043478260871e-06, + "loss": 0.2206, + "step": 22200 + }, + { + "epoch": 1.5406633427270755, + "grad_norm": 1.0234375, + "learning_rate": 1.1375652173913045e-06, + "loss": 0.2342, + "step": 22210 + }, + { + "epoch": 1.5413570344678083, + "grad_norm": 1.1875, + "learning_rate": 1.1358260869565219e-06, + "loss": 0.2695, + "step": 22220 + }, + { + "epoch": 1.542050726208541, + "grad_norm": 1.3671875, + "learning_rate": 1.1340869565217392e-06, + "loss": 0.2812, + "step": 22230 + }, + { + "epoch": 1.5427444179492737, + "grad_norm": 0.95703125, + "learning_rate": 1.1323478260869566e-06, + "loss": 0.2701, + "step": 22240 + }, + { + "epoch": 1.5434381096900065, + "grad_norm": 1.3046875, + "learning_rate": 1.130608695652174e-06, + "loss": 0.2258, + "step": 22250 + }, + { + "epoch": 1.5441318014307392, + "grad_norm": 0.96484375, + "learning_rate": 1.1288695652173914e-06, + "loss": 0.2496, + "step": 22260 + }, + { + "epoch": 1.544825493171472, + "grad_norm": 1.1796875, + "learning_rate": 1.127130434782609e-06, + "loss": 0.2058, + "step": 22270 + }, + { + "epoch": 1.5455191849122047, + "grad_norm": 1.265625, + "learning_rate": 1.1253913043478263e-06, + "loss": 0.2076, + "step": 22280 + }, + { + "epoch": 1.5462128766529375, + "grad_norm": 0.8984375, + "learning_rate": 1.1236521739130437e-06, + "loss": 0.2568, + "step": 22290 + }, + { + "epoch": 1.54690656839367, + "grad_norm": 1.2578125, + "learning_rate": 1.121913043478261e-06, + "loss": 0.2804, + "step": 22300 + }, + { + "epoch": 1.5476002601344028, + "grad_norm": 1.109375, + "learning_rate": 1.1201739130434784e-06, + "loss": 0.2401, + "step": 22310 + }, + { + "epoch": 1.5482939518751355, + "grad_norm": 0.953125, + "learning_rate": 1.1184347826086958e-06, + "loss": 0.2465, + "step": 22320 + }, + { + "epoch": 1.548987643615868, + "grad_norm": 1.171875, + "learning_rate": 1.1166956521739132e-06, + "loss": 0.3361, + "step": 22330 + }, + { + "epoch": 1.5496813353566008, + "grad_norm": 1.3359375, + "learning_rate": 1.1149565217391305e-06, + "loss": 0.2217, + "step": 22340 + }, + { + "epoch": 1.5503750270973335, + "grad_norm": 1.1875, + "learning_rate": 1.113217391304348e-06, + "loss": 0.2402, + "step": 22350 + }, + { + "epoch": 1.5510687188380663, + "grad_norm": 1.09375, + "learning_rate": 1.1114782608695653e-06, + "loss": 0.2065, + "step": 22360 + }, + { + "epoch": 1.551762410578799, + "grad_norm": 0.921875, + "learning_rate": 1.1097391304347827e-06, + "loss": 0.237, + "step": 22370 + }, + { + "epoch": 1.5524561023195318, + "grad_norm": 1.03125, + "learning_rate": 1.108e-06, + "loss": 0.3074, + "step": 22380 + }, + { + "epoch": 1.5531497940602645, + "grad_norm": 1.1796875, + "learning_rate": 1.1062608695652174e-06, + "loss": 0.2363, + "step": 22390 + }, + { + "epoch": 1.5538434858009973, + "grad_norm": 1.328125, + "learning_rate": 1.1045217391304348e-06, + "loss": 0.3468, + "step": 22400 + }, + { + "epoch": 1.55453717754173, + "grad_norm": 1.578125, + "learning_rate": 1.1027826086956521e-06, + "loss": 0.2699, + "step": 22410 + }, + { + "epoch": 1.5552308692824626, + "grad_norm": 1.375, + "learning_rate": 1.1010434782608697e-06, + "loss": 0.2687, + "step": 22420 + }, + { + "epoch": 1.5559245610231953, + "grad_norm": 0.9375, + "learning_rate": 1.099304347826087e-06, + "loss": 0.2097, + "step": 22430 + }, + { + "epoch": 1.556618252763928, + "grad_norm": 1.109375, + "learning_rate": 1.0975652173913045e-06, + "loss": 0.2544, + "step": 22440 + }, + { + "epoch": 1.5573119445046606, + "grad_norm": 1.1015625, + "learning_rate": 1.0958260869565218e-06, + "loss": 0.229, + "step": 22450 + }, + { + "epoch": 1.5580056362453933, + "grad_norm": 1.15625, + "learning_rate": 1.0940869565217392e-06, + "loss": 0.3094, + "step": 22460 + }, + { + "epoch": 1.558699327986126, + "grad_norm": 0.9296875, + "learning_rate": 1.0923478260869566e-06, + "loss": 0.1948, + "step": 22470 + }, + { + "epoch": 1.5593930197268588, + "grad_norm": 1.4375, + "learning_rate": 1.090608695652174e-06, + "loss": 0.2203, + "step": 22480 + }, + { + "epoch": 1.5600867114675916, + "grad_norm": 1.1328125, + "learning_rate": 1.0888695652173913e-06, + "loss": 0.2523, + "step": 22490 + }, + { + "epoch": 1.5607804032083243, + "grad_norm": 1.203125, + "learning_rate": 1.087130434782609e-06, + "loss": 0.2415, + "step": 22500 + }, + { + "epoch": 1.561474094949057, + "grad_norm": 1.15625, + "learning_rate": 1.0853913043478263e-06, + "loss": 0.268, + "step": 22510 + }, + { + "epoch": 1.5621677866897898, + "grad_norm": 1.4453125, + "learning_rate": 1.0836521739130436e-06, + "loss": 0.2474, + "step": 22520 + }, + { + "epoch": 1.5628614784305226, + "grad_norm": 1.15625, + "learning_rate": 1.081913043478261e-06, + "loss": 0.3104, + "step": 22530 + }, + { + "epoch": 1.5635551701712551, + "grad_norm": 1.0703125, + "learning_rate": 1.0801739130434784e-06, + "loss": 0.2487, + "step": 22540 + }, + { + "epoch": 1.5642488619119879, + "grad_norm": 1.21875, + "learning_rate": 1.0784347826086958e-06, + "loss": 0.2224, + "step": 22550 + }, + { + "epoch": 1.5649425536527206, + "grad_norm": 1.046875, + "learning_rate": 1.0766956521739131e-06, + "loss": 0.2211, + "step": 22560 + }, + { + "epoch": 1.5656362453934531, + "grad_norm": 0.99609375, + "learning_rate": 1.0749565217391305e-06, + "loss": 0.2382, + "step": 22570 + }, + { + "epoch": 1.566329937134186, + "grad_norm": 1.5078125, + "learning_rate": 1.0732173913043479e-06, + "loss": 0.2285, + "step": 22580 + }, + { + "epoch": 1.5670236288749186, + "grad_norm": 1.5078125, + "learning_rate": 1.0714782608695655e-06, + "loss": 0.3084, + "step": 22590 + }, + { + "epoch": 1.5677173206156514, + "grad_norm": 1.140625, + "learning_rate": 1.0697391304347828e-06, + "loss": 0.2316, + "step": 22600 + }, + { + "epoch": 1.5684110123563841, + "grad_norm": 1.203125, + "learning_rate": 1.0680000000000002e-06, + "loss": 0.2359, + "step": 22610 + }, + { + "epoch": 1.5691047040971169, + "grad_norm": 1.0390625, + "learning_rate": 1.0662608695652176e-06, + "loss": 0.3312, + "step": 22620 + }, + { + "epoch": 1.5697983958378496, + "grad_norm": 1.2734375, + "learning_rate": 1.064521739130435e-06, + "loss": 0.2094, + "step": 22630 + }, + { + "epoch": 1.5704920875785824, + "grad_norm": 1.2265625, + "learning_rate": 1.062782608695652e-06, + "loss": 0.2005, + "step": 22640 + }, + { + "epoch": 1.5711857793193151, + "grad_norm": 1.3671875, + "learning_rate": 1.0610434782608697e-06, + "loss": 0.2151, + "step": 22650 + }, + { + "epoch": 1.5718794710600477, + "grad_norm": 1.28125, + "learning_rate": 1.059304347826087e-06, + "loss": 0.226, + "step": 22660 + }, + { + "epoch": 1.5725731628007804, + "grad_norm": 1.09375, + "learning_rate": 1.0575652173913044e-06, + "loss": 0.2764, + "step": 22670 + }, + { + "epoch": 1.5732668545415132, + "grad_norm": 1.1484375, + "learning_rate": 1.0558260869565218e-06, + "loss": 0.2149, + "step": 22680 + }, + { + "epoch": 1.5739605462822457, + "grad_norm": 0.8046875, + "learning_rate": 1.0540869565217392e-06, + "loss": 0.2572, + "step": 22690 + }, + { + "epoch": 1.5746542380229784, + "grad_norm": 1.359375, + "learning_rate": 1.0523478260869565e-06, + "loss": 0.2572, + "step": 22700 + }, + { + "epoch": 1.5753479297637112, + "grad_norm": 1.0703125, + "learning_rate": 1.050608695652174e-06, + "loss": 0.241, + "step": 22710 + }, + { + "epoch": 1.576041621504444, + "grad_norm": 1.078125, + "learning_rate": 1.0488695652173913e-06, + "loss": 0.2464, + "step": 22720 + }, + { + "epoch": 1.5767353132451767, + "grad_norm": 1.5, + "learning_rate": 1.0471304347826089e-06, + "loss": 0.2494, + "step": 22730 + }, + { + "epoch": 1.5774290049859094, + "grad_norm": 1.1640625, + "learning_rate": 1.0453913043478262e-06, + "loss": 0.314, + "step": 22740 + }, + { + "epoch": 1.5781226967266422, + "grad_norm": 1.203125, + "learning_rate": 1.0436521739130436e-06, + "loss": 0.231, + "step": 22750 + }, + { + "epoch": 1.578816388467375, + "grad_norm": 1.4453125, + "learning_rate": 1.041913043478261e-06, + "loss": 0.3283, + "step": 22760 + }, + { + "epoch": 1.5795100802081077, + "grad_norm": 1.0859375, + "learning_rate": 1.0401739130434783e-06, + "loss": 0.3071, + "step": 22770 + }, + { + "epoch": 1.5802037719488402, + "grad_norm": 1.125, + "learning_rate": 1.0384347826086957e-06, + "loss": 0.2743, + "step": 22780 + }, + { + "epoch": 1.580897463689573, + "grad_norm": 1.2890625, + "learning_rate": 1.036695652173913e-06, + "loss": 0.253, + "step": 22790 + }, + { + "epoch": 1.5815911554303057, + "grad_norm": 1.046875, + "learning_rate": 1.0349565217391305e-06, + "loss": 0.2143, + "step": 22800 + }, + { + "epoch": 1.5822848471710382, + "grad_norm": 1.140625, + "learning_rate": 1.0332173913043478e-06, + "loss": 0.2481, + "step": 22810 + }, + { + "epoch": 1.582978538911771, + "grad_norm": 1.375, + "learning_rate": 1.0314782608695654e-06, + "loss": 0.2572, + "step": 22820 + }, + { + "epoch": 1.5836722306525037, + "grad_norm": 1.171875, + "learning_rate": 1.0297391304347828e-06, + "loss": 0.2987, + "step": 22830 + }, + { + "epoch": 1.5843659223932365, + "grad_norm": 1.375, + "learning_rate": 1.0280000000000002e-06, + "loss": 0.2331, + "step": 22840 + }, + { + "epoch": 1.5850596141339692, + "grad_norm": 0.95703125, + "learning_rate": 1.0262608695652175e-06, + "loss": 0.2076, + "step": 22850 + }, + { + "epoch": 1.585753305874702, + "grad_norm": 1.125, + "learning_rate": 1.024521739130435e-06, + "loss": 0.2365, + "step": 22860 + }, + { + "epoch": 1.5864469976154347, + "grad_norm": 1.8828125, + "learning_rate": 1.0227826086956523e-06, + "loss": 0.2808, + "step": 22870 + }, + { + "epoch": 1.5871406893561675, + "grad_norm": 1.2265625, + "learning_rate": 1.0210434782608696e-06, + "loss": 0.2484, + "step": 22880 + }, + { + "epoch": 1.5878343810969002, + "grad_norm": 1.625, + "learning_rate": 1.019304347826087e-06, + "loss": 0.2585, + "step": 22890 + }, + { + "epoch": 1.5885280728376328, + "grad_norm": 1.1015625, + "learning_rate": 1.0175652173913044e-06, + "loss": 0.2316, + "step": 22900 + }, + { + "epoch": 1.5892217645783655, + "grad_norm": 1.6015625, + "learning_rate": 1.015826086956522e-06, + "loss": 0.2097, + "step": 22910 + }, + { + "epoch": 1.5899154563190983, + "grad_norm": 1.3203125, + "learning_rate": 1.0140869565217393e-06, + "loss": 0.275, + "step": 22920 + }, + { + "epoch": 1.5906091480598308, + "grad_norm": 1.359375, + "learning_rate": 1.0123478260869567e-06, + "loss": 0.2504, + "step": 22930 + }, + { + "epoch": 1.5913028398005635, + "grad_norm": 1.1953125, + "learning_rate": 1.0106086956521739e-06, + "loss": 0.2771, + "step": 22940 + }, + { + "epoch": 1.5919965315412963, + "grad_norm": 1.375, + "learning_rate": 1.0088695652173912e-06, + "loss": 0.2555, + "step": 22950 + }, + { + "epoch": 1.592690223282029, + "grad_norm": 1.2421875, + "learning_rate": 1.0071304347826088e-06, + "loss": 0.2447, + "step": 22960 + }, + { + "epoch": 1.5933839150227618, + "grad_norm": 1.015625, + "learning_rate": 1.0053913043478262e-06, + "loss": 0.2536, + "step": 22970 + }, + { + "epoch": 1.5940776067634945, + "grad_norm": 1.1328125, + "learning_rate": 1.0036521739130436e-06, + "loss": 0.2336, + "step": 22980 + }, + { + "epoch": 1.5947712985042273, + "grad_norm": 1.4765625, + "learning_rate": 1.001913043478261e-06, + "loss": 0.2328, + "step": 22990 + }, + { + "epoch": 1.59546499024496, + "grad_norm": 1.703125, + "learning_rate": 1.0001739130434783e-06, + "loss": 0.2339, + "step": 23000 + }, + { + "epoch": 1.5961586819856928, + "grad_norm": 1.4609375, + "learning_rate": 9.984347826086957e-07, + "loss": 0.2749, + "step": 23010 + }, + { + "epoch": 1.5968523737264253, + "grad_norm": 1.390625, + "learning_rate": 9.96695652173913e-07, + "loss": 0.2484, + "step": 23020 + }, + { + "epoch": 1.597546065467158, + "grad_norm": 1.2734375, + "learning_rate": 9.949565217391304e-07, + "loss": 0.2099, + "step": 23030 + }, + { + "epoch": 1.5982397572078908, + "grad_norm": 2.0, + "learning_rate": 9.932173913043478e-07, + "loss": 0.2897, + "step": 23040 + }, + { + "epoch": 1.5989334489486233, + "grad_norm": 1.296875, + "learning_rate": 9.914782608695654e-07, + "loss": 0.2628, + "step": 23050 + }, + { + "epoch": 1.599627140689356, + "grad_norm": 1.15625, + "learning_rate": 9.897391304347827e-07, + "loss": 0.1823, + "step": 23060 + }, + { + "epoch": 1.6003208324300888, + "grad_norm": 0.796875, + "learning_rate": 9.880000000000001e-07, + "loss": 0.2547, + "step": 23070 + }, + { + "epoch": 1.6010145241708216, + "grad_norm": 1.1328125, + "learning_rate": 9.862608695652175e-07, + "loss": 0.2082, + "step": 23080 + }, + { + "epoch": 1.6017082159115543, + "grad_norm": 0.78515625, + "learning_rate": 9.845217391304349e-07, + "loss": 0.2021, + "step": 23090 + }, + { + "epoch": 1.602401907652287, + "grad_norm": 1.0859375, + "learning_rate": 9.827826086956522e-07, + "loss": 0.2168, + "step": 23100 + }, + { + "epoch": 1.6030955993930198, + "grad_norm": 1.3984375, + "learning_rate": 9.810434782608696e-07, + "loss": 0.3124, + "step": 23110 + }, + { + "epoch": 1.6037892911337526, + "grad_norm": 1.234375, + "learning_rate": 9.79304347826087e-07, + "loss": 0.2166, + "step": 23120 + }, + { + "epoch": 1.604482982874485, + "grad_norm": 0.9609375, + "learning_rate": 9.775652173913043e-07, + "loss": 0.2531, + "step": 23130 + }, + { + "epoch": 1.6051766746152178, + "grad_norm": 1.109375, + "learning_rate": 9.75826086956522e-07, + "loss": 0.3279, + "step": 23140 + }, + { + "epoch": 1.6058703663559506, + "grad_norm": 1.453125, + "learning_rate": 9.740869565217393e-07, + "loss": 0.242, + "step": 23150 + }, + { + "epoch": 1.6065640580966831, + "grad_norm": 1.078125, + "learning_rate": 9.723478260869567e-07, + "loss": 0.1916, + "step": 23160 + }, + { + "epoch": 1.6072577498374159, + "grad_norm": 1.53125, + "learning_rate": 9.70608695652174e-07, + "loss": 0.3144, + "step": 23170 + }, + { + "epoch": 1.6079514415781486, + "grad_norm": 1.15625, + "learning_rate": 9.688695652173914e-07, + "loss": 0.251, + "step": 23180 + }, + { + "epoch": 1.6086451333188814, + "grad_norm": 1.359375, + "learning_rate": 9.671304347826088e-07, + "loss": 0.2514, + "step": 23190 + }, + { + "epoch": 1.6093388250596141, + "grad_norm": 1.8984375, + "learning_rate": 9.653913043478261e-07, + "loss": 0.3102, + "step": 23200 + }, + { + "epoch": 1.6100325168003469, + "grad_norm": 1.2421875, + "learning_rate": 9.636521739130435e-07, + "loss": 0.2325, + "step": 23210 + }, + { + "epoch": 1.6107262085410796, + "grad_norm": 0.984375, + "learning_rate": 9.61913043478261e-07, + "loss": 0.2195, + "step": 23220 + }, + { + "epoch": 1.6114199002818124, + "grad_norm": 1.28125, + "learning_rate": 9.601739130434785e-07, + "loss": 0.2705, + "step": 23230 + }, + { + "epoch": 1.6121135920225451, + "grad_norm": 1.0234375, + "learning_rate": 9.584347826086958e-07, + "loss": 0.2394, + "step": 23240 + }, + { + "epoch": 1.6128072837632776, + "grad_norm": 1.2578125, + "learning_rate": 9.56695652173913e-07, + "loss": 0.2567, + "step": 23250 + }, + { + "epoch": 1.6135009755040104, + "grad_norm": 0.9453125, + "learning_rate": 9.549565217391304e-07, + "loss": 0.2116, + "step": 23260 + }, + { + "epoch": 1.6141946672447431, + "grad_norm": 1.125, + "learning_rate": 9.532173913043479e-07, + "loss": 0.2329, + "step": 23270 + }, + { + "epoch": 1.6148883589854757, + "grad_norm": 1.4765625, + "learning_rate": 9.514782608695652e-07, + "loss": 0.2868, + "step": 23280 + }, + { + "epoch": 1.6155820507262084, + "grad_norm": 1.1796875, + "learning_rate": 9.497391304347826e-07, + "loss": 0.289, + "step": 23290 + }, + { + "epoch": 1.6162757424669412, + "grad_norm": 1.171875, + "learning_rate": 9.480000000000001e-07, + "loss": 0.2032, + "step": 23300 + }, + { + "epoch": 1.616969434207674, + "grad_norm": 1.2890625, + "learning_rate": 9.462608695652174e-07, + "loss": 0.257, + "step": 23310 + }, + { + "epoch": 1.6176631259484067, + "grad_norm": 1.3046875, + "learning_rate": 9.445217391304348e-07, + "loss": 0.2119, + "step": 23320 + }, + { + "epoch": 1.6183568176891394, + "grad_norm": 1.296875, + "learning_rate": 9.427826086956522e-07, + "loss": 0.247, + "step": 23330 + }, + { + "epoch": 1.6190505094298722, + "grad_norm": 1.296875, + "learning_rate": 9.410434782608697e-07, + "loss": 0.2446, + "step": 23340 + }, + { + "epoch": 1.619744201170605, + "grad_norm": 1.5625, + "learning_rate": 9.39304347826087e-07, + "loss": 0.2619, + "step": 23350 + }, + { + "epoch": 1.6204378929113377, + "grad_norm": 1.390625, + "learning_rate": 9.375652173913044e-07, + "loss": 0.257, + "step": 23360 + }, + { + "epoch": 1.6211315846520702, + "grad_norm": 1.296875, + "learning_rate": 9.358260869565218e-07, + "loss": 0.2905, + "step": 23370 + }, + { + "epoch": 1.621825276392803, + "grad_norm": 1.0703125, + "learning_rate": 9.340869565217391e-07, + "loss": 0.2398, + "step": 23380 + }, + { + "epoch": 1.6225189681335357, + "grad_norm": 1.2421875, + "learning_rate": 9.323478260869566e-07, + "loss": 0.2423, + "step": 23390 + }, + { + "epoch": 1.6232126598742682, + "grad_norm": 1.3046875, + "learning_rate": 9.30608695652174e-07, + "loss": 0.2582, + "step": 23400 + }, + { + "epoch": 1.623906351615001, + "grad_norm": 1.3359375, + "learning_rate": 9.288695652173914e-07, + "loss": 0.2746, + "step": 23410 + }, + { + "epoch": 1.6246000433557337, + "grad_norm": 1.125, + "learning_rate": 9.271304347826087e-07, + "loss": 0.2465, + "step": 23420 + }, + { + "epoch": 1.6252937350964665, + "grad_norm": 1.296875, + "learning_rate": 9.253913043478262e-07, + "loss": 0.223, + "step": 23430 + }, + { + "epoch": 1.6259874268371992, + "grad_norm": 1.203125, + "learning_rate": 9.236521739130436e-07, + "loss": 0.2014, + "step": 23440 + }, + { + "epoch": 1.626681118577932, + "grad_norm": 1.21875, + "learning_rate": 9.21913043478261e-07, + "loss": 0.2196, + "step": 23450 + }, + { + "epoch": 1.6273748103186647, + "grad_norm": 0.9765625, + "learning_rate": 9.201739130434783e-07, + "loss": 0.2466, + "step": 23460 + }, + { + "epoch": 1.6280685020593975, + "grad_norm": 1.03125, + "learning_rate": 9.184347826086958e-07, + "loss": 0.2713, + "step": 23470 + }, + { + "epoch": 1.6287621938001302, + "grad_norm": 0.7734375, + "learning_rate": 9.166956521739132e-07, + "loss": 0.2448, + "step": 23480 + }, + { + "epoch": 1.6294558855408627, + "grad_norm": 1.8203125, + "learning_rate": 9.149565217391305e-07, + "loss": 0.2153, + "step": 23490 + }, + { + "epoch": 1.6301495772815955, + "grad_norm": 1.1484375, + "learning_rate": 9.132173913043479e-07, + "loss": 0.2345, + "step": 23500 + }, + { + "epoch": 1.6308432690223282, + "grad_norm": 1.484375, + "learning_rate": 9.114782608695653e-07, + "loss": 0.2398, + "step": 23510 + }, + { + "epoch": 1.6315369607630608, + "grad_norm": 1.546875, + "learning_rate": 9.097391304347828e-07, + "loss": 0.2641, + "step": 23520 + }, + { + "epoch": 1.6322306525037935, + "grad_norm": 1.265625, + "learning_rate": 9.080000000000001e-07, + "loss": 0.216, + "step": 23530 + }, + { + "epoch": 1.6329243442445263, + "grad_norm": 1.109375, + "learning_rate": 9.062608695652175e-07, + "loss": 0.2306, + "step": 23540 + }, + { + "epoch": 1.633618035985259, + "grad_norm": 1.2109375, + "learning_rate": 9.045217391304349e-07, + "loss": 0.233, + "step": 23550 + }, + { + "epoch": 1.6343117277259918, + "grad_norm": 1.1953125, + "learning_rate": 9.027826086956521e-07, + "loss": 0.2194, + "step": 23560 + }, + { + "epoch": 1.6350054194667245, + "grad_norm": 0.96484375, + "learning_rate": 9.010434782608696e-07, + "loss": 0.1982, + "step": 23570 + }, + { + "epoch": 1.6356991112074573, + "grad_norm": 1.40625, + "learning_rate": 8.99304347826087e-07, + "loss": 0.2591, + "step": 23580 + }, + { + "epoch": 1.63639280294819, + "grad_norm": 1.3046875, + "learning_rate": 8.975652173913044e-07, + "loss": 0.2702, + "step": 23590 + }, + { + "epoch": 1.6370864946889228, + "grad_norm": 1.5625, + "learning_rate": 8.958260869565217e-07, + "loss": 0.2622, + "step": 23600 + }, + { + "epoch": 1.6377801864296553, + "grad_norm": 0.9453125, + "learning_rate": 8.940869565217391e-07, + "loss": 0.2386, + "step": 23610 + }, + { + "epoch": 1.638473878170388, + "grad_norm": 1.0234375, + "learning_rate": 8.923478260869566e-07, + "loss": 0.2167, + "step": 23620 + }, + { + "epoch": 1.6391675699111208, + "grad_norm": 1.109375, + "learning_rate": 8.90608695652174e-07, + "loss": 0.2375, + "step": 23630 + }, + { + "epoch": 1.6398612616518533, + "grad_norm": 1.21875, + "learning_rate": 8.888695652173913e-07, + "loss": 0.2261, + "step": 23640 + }, + { + "epoch": 1.640554953392586, + "grad_norm": 1.1953125, + "learning_rate": 8.871304347826087e-07, + "loss": 0.2658, + "step": 23650 + }, + { + "epoch": 1.6412486451333188, + "grad_norm": 1.0625, + "learning_rate": 8.853913043478262e-07, + "loss": 0.2705, + "step": 23660 + }, + { + "epoch": 1.6419423368740516, + "grad_norm": 1.0078125, + "learning_rate": 8.836521739130435e-07, + "loss": 0.2295, + "step": 23670 + }, + { + "epoch": 1.6426360286147843, + "grad_norm": 0.99609375, + "learning_rate": 8.819130434782609e-07, + "loss": 0.2608, + "step": 23680 + }, + { + "epoch": 1.643329720355517, + "grad_norm": 0.984375, + "learning_rate": 8.801739130434783e-07, + "loss": 0.2073, + "step": 23690 + }, + { + "epoch": 1.6440234120962498, + "grad_norm": 1.25, + "learning_rate": 8.784347826086958e-07, + "loss": 0.2729, + "step": 23700 + }, + { + "epoch": 1.6447171038369826, + "grad_norm": 1.09375, + "learning_rate": 8.766956521739131e-07, + "loss": 0.2614, + "step": 23710 + }, + { + "epoch": 1.6454107955777153, + "grad_norm": 1.09375, + "learning_rate": 8.749565217391305e-07, + "loss": 0.2278, + "step": 23720 + }, + { + "epoch": 1.6461044873184478, + "grad_norm": 1.2109375, + "learning_rate": 8.732173913043479e-07, + "loss": 0.2365, + "step": 23730 + }, + { + "epoch": 1.6467981790591806, + "grad_norm": 1.3125, + "learning_rate": 8.714782608695654e-07, + "loss": 0.2512, + "step": 23740 + }, + { + "epoch": 1.6474918707999133, + "grad_norm": 1.46875, + "learning_rate": 8.697391304347827e-07, + "loss": 0.3113, + "step": 23750 + }, + { + "epoch": 1.6481855625406459, + "grad_norm": 1.375, + "learning_rate": 8.680000000000001e-07, + "loss": 0.2599, + "step": 23760 + }, + { + "epoch": 1.6488792542813786, + "grad_norm": 0.9921875, + "learning_rate": 8.662608695652175e-07, + "loss": 0.2318, + "step": 23770 + }, + { + "epoch": 1.6495729460221114, + "grad_norm": 0.96484375, + "learning_rate": 8.645217391304348e-07, + "loss": 0.2484, + "step": 23780 + }, + { + "epoch": 1.650266637762844, + "grad_norm": 1.609375, + "learning_rate": 8.627826086956523e-07, + "loss": 0.2319, + "step": 23790 + }, + { + "epoch": 1.6509603295035769, + "grad_norm": 1.1953125, + "learning_rate": 8.610434782608697e-07, + "loss": 0.2098, + "step": 23800 + }, + { + "epoch": 1.6516540212443096, + "grad_norm": 1.0078125, + "learning_rate": 8.593043478260871e-07, + "loss": 0.2516, + "step": 23810 + }, + { + "epoch": 1.6523477129850423, + "grad_norm": 0.9765625, + "learning_rate": 8.575652173913044e-07, + "loss": 0.205, + "step": 23820 + }, + { + "epoch": 1.653041404725775, + "grad_norm": 1.4375, + "learning_rate": 8.558260869565219e-07, + "loss": 0.2468, + "step": 23830 + }, + { + "epoch": 1.6537350964665078, + "grad_norm": 1.375, + "learning_rate": 8.540869565217393e-07, + "loss": 0.2423, + "step": 23840 + }, + { + "epoch": 1.6544287882072404, + "grad_norm": 1.171875, + "learning_rate": 8.523478260869566e-07, + "loss": 0.2577, + "step": 23850 + }, + { + "epoch": 1.6551224799479731, + "grad_norm": 1.296875, + "learning_rate": 8.50608695652174e-07, + "loss": 0.2681, + "step": 23860 + }, + { + "epoch": 1.6558161716887059, + "grad_norm": 1.234375, + "learning_rate": 8.488695652173913e-07, + "loss": 0.2362, + "step": 23870 + }, + { + "epoch": 1.6565098634294384, + "grad_norm": 1.3203125, + "learning_rate": 8.471304347826087e-07, + "loss": 0.2296, + "step": 23880 + }, + { + "epoch": 1.6572035551701711, + "grad_norm": 1.265625, + "learning_rate": 8.453913043478261e-07, + "loss": 0.2451, + "step": 23890 + }, + { + "epoch": 1.657897246910904, + "grad_norm": 1.140625, + "learning_rate": 8.436521739130435e-07, + "loss": 0.2586, + "step": 23900 + }, + { + "epoch": 1.6585909386516366, + "grad_norm": 1.3203125, + "learning_rate": 8.419130434782609e-07, + "loss": 0.1875, + "step": 23910 + }, + { + "epoch": 1.6592846303923694, + "grad_norm": 1.2109375, + "learning_rate": 8.401739130434782e-07, + "loss": 0.2252, + "step": 23920 + }, + { + "epoch": 1.6599783221331021, + "grad_norm": 1.4375, + "learning_rate": 8.384347826086957e-07, + "loss": 0.2808, + "step": 23930 + }, + { + "epoch": 1.660672013873835, + "grad_norm": 1.3125, + "learning_rate": 8.366956521739131e-07, + "loss": 0.2576, + "step": 23940 + }, + { + "epoch": 1.6613657056145676, + "grad_norm": 1.1484375, + "learning_rate": 8.349565217391305e-07, + "loss": 0.2447, + "step": 23950 + }, + { + "epoch": 1.6620593973553004, + "grad_norm": 1.4921875, + "learning_rate": 8.332173913043478e-07, + "loss": 0.2325, + "step": 23960 + }, + { + "epoch": 1.662753089096033, + "grad_norm": 1.265625, + "learning_rate": 8.314782608695653e-07, + "loss": 0.1994, + "step": 23970 + }, + { + "epoch": 1.6634467808367657, + "grad_norm": 1.109375, + "learning_rate": 8.297391304347827e-07, + "loss": 0.2345, + "step": 23980 + }, + { + "epoch": 1.6641404725774984, + "grad_norm": 1.4453125, + "learning_rate": 8.280000000000001e-07, + "loss": 0.3115, + "step": 23990 + }, + { + "epoch": 1.664834164318231, + "grad_norm": 1.09375, + "learning_rate": 8.262608695652174e-07, + "loss": 0.2616, + "step": 24000 + }, + { + "epoch": 1.6655278560589637, + "grad_norm": 1.390625, + "learning_rate": 8.245217391304348e-07, + "loss": 0.2968, + "step": 24010 + }, + { + "epoch": 1.6662215477996964, + "grad_norm": 1.28125, + "learning_rate": 8.227826086956523e-07, + "loss": 0.2652, + "step": 24020 + }, + { + "epoch": 1.6669152395404292, + "grad_norm": 1.0625, + "learning_rate": 8.210434782608696e-07, + "loss": 0.275, + "step": 24030 + }, + { + "epoch": 1.667608931281162, + "grad_norm": 1.1171875, + "learning_rate": 8.19304347826087e-07, + "loss": 0.202, + "step": 24040 + }, + { + "epoch": 1.6683026230218947, + "grad_norm": 1.1796875, + "learning_rate": 8.175652173913044e-07, + "loss": 0.2373, + "step": 24050 + }, + { + "epoch": 1.6689963147626274, + "grad_norm": 1.21875, + "learning_rate": 8.158260869565219e-07, + "loss": 0.2368, + "step": 24060 + }, + { + "epoch": 1.6696900065033602, + "grad_norm": 1.546875, + "learning_rate": 8.140869565217392e-07, + "loss": 0.2361, + "step": 24070 + }, + { + "epoch": 1.670383698244093, + "grad_norm": 1.5078125, + "learning_rate": 8.123478260869566e-07, + "loss": 0.2362, + "step": 24080 + }, + { + "epoch": 1.6710773899848255, + "grad_norm": 1.28125, + "learning_rate": 8.10608695652174e-07, + "loss": 0.2434, + "step": 24090 + }, + { + "epoch": 1.6717710817255582, + "grad_norm": 1.2578125, + "learning_rate": 8.088695652173915e-07, + "loss": 0.2348, + "step": 24100 + }, + { + "epoch": 1.672464773466291, + "grad_norm": 1.0625, + "learning_rate": 8.071304347826088e-07, + "loss": 0.2239, + "step": 24110 + }, + { + "epoch": 1.6731584652070235, + "grad_norm": 1.3359375, + "learning_rate": 8.053913043478262e-07, + "loss": 0.2476, + "step": 24120 + }, + { + "epoch": 1.6738521569477562, + "grad_norm": 1.5234375, + "learning_rate": 8.036521739130436e-07, + "loss": 0.3045, + "step": 24130 + }, + { + "epoch": 1.674545848688489, + "grad_norm": 1.03125, + "learning_rate": 8.019130434782609e-07, + "loss": 0.2166, + "step": 24140 + }, + { + "epoch": 1.6752395404292217, + "grad_norm": 1.25, + "learning_rate": 8.001739130434784e-07, + "loss": 0.2401, + "step": 24150 + }, + { + "epoch": 1.6759332321699545, + "grad_norm": 1.0, + "learning_rate": 7.984347826086958e-07, + "loss": 0.2398, + "step": 24160 + }, + { + "epoch": 1.6766269239106872, + "grad_norm": 1.21875, + "learning_rate": 7.966956521739132e-07, + "loss": 0.2428, + "step": 24170 + }, + { + "epoch": 1.67732061565142, + "grad_norm": 1.0859375, + "learning_rate": 7.949565217391304e-07, + "loss": 0.2544, + "step": 24180 + }, + { + "epoch": 1.6780143073921527, + "grad_norm": 0.9765625, + "learning_rate": 7.932173913043478e-07, + "loss": 0.1938, + "step": 24190 + }, + { + "epoch": 1.6787079991328855, + "grad_norm": 1.2265625, + "learning_rate": 7.914782608695653e-07, + "loss": 0.2606, + "step": 24200 + }, + { + "epoch": 1.679401690873618, + "grad_norm": 1.125, + "learning_rate": 7.897391304347826e-07, + "loss": 0.2166, + "step": 24210 + }, + { + "epoch": 1.6800953826143508, + "grad_norm": 1.3125, + "learning_rate": 7.88e-07, + "loss": 0.2149, + "step": 24220 + }, + { + "epoch": 1.6807890743550835, + "grad_norm": 0.875, + "learning_rate": 7.862608695652174e-07, + "loss": 0.2509, + "step": 24230 + }, + { + "epoch": 1.681482766095816, + "grad_norm": 1.2734375, + "learning_rate": 7.845217391304348e-07, + "loss": 0.255, + "step": 24240 + }, + { + "epoch": 1.6821764578365488, + "grad_norm": 0.84375, + "learning_rate": 7.827826086956522e-07, + "loss": 0.2084, + "step": 24250 + }, + { + "epoch": 1.6828701495772815, + "grad_norm": 1.3671875, + "learning_rate": 7.810434782608696e-07, + "loss": 0.2378, + "step": 24260 + }, + { + "epoch": 1.6835638413180143, + "grad_norm": 1.2265625, + "learning_rate": 7.79304347826087e-07, + "loss": 0.2314, + "step": 24270 + }, + { + "epoch": 1.684257533058747, + "grad_norm": 1.3359375, + "learning_rate": 7.775652173913043e-07, + "loss": 0.2701, + "step": 24280 + }, + { + "epoch": 1.6849512247994798, + "grad_norm": 1.171875, + "learning_rate": 7.758260869565218e-07, + "loss": 0.2682, + "step": 24290 + }, + { + "epoch": 1.6856449165402125, + "grad_norm": 1.40625, + "learning_rate": 7.740869565217392e-07, + "loss": 0.235, + "step": 24300 + }, + { + "epoch": 1.6863386082809453, + "grad_norm": 1.4609375, + "learning_rate": 7.723478260869566e-07, + "loss": 0.2136, + "step": 24310 + }, + { + "epoch": 1.687032300021678, + "grad_norm": 1.4140625, + "learning_rate": 7.706086956521739e-07, + "loss": 0.2159, + "step": 24320 + }, + { + "epoch": 1.6877259917624106, + "grad_norm": 1.25, + "learning_rate": 7.688695652173914e-07, + "loss": 0.1848, + "step": 24330 + }, + { + "epoch": 1.6884196835031433, + "grad_norm": 1.3359375, + "learning_rate": 7.671304347826088e-07, + "loss": 0.2464, + "step": 24340 + }, + { + "epoch": 1.6891133752438758, + "grad_norm": 1.2109375, + "learning_rate": 7.653913043478262e-07, + "loss": 0.2249, + "step": 24350 + }, + { + "epoch": 1.6898070669846086, + "grad_norm": 1.3046875, + "learning_rate": 7.636521739130435e-07, + "loss": 0.2389, + "step": 24360 + }, + { + "epoch": 1.6905007587253413, + "grad_norm": 1.125, + "learning_rate": 7.619130434782609e-07, + "loss": 0.2296, + "step": 24370 + }, + { + "epoch": 1.691194450466074, + "grad_norm": 1.3984375, + "learning_rate": 7.601739130434784e-07, + "loss": 0.2403, + "step": 24380 + }, + { + "epoch": 1.6918881422068068, + "grad_norm": 1.515625, + "learning_rate": 7.584347826086957e-07, + "loss": 0.3061, + "step": 24390 + }, + { + "epoch": 1.6925818339475396, + "grad_norm": 1.4140625, + "learning_rate": 7.566956521739131e-07, + "loss": 0.225, + "step": 24400 + }, + { + "epoch": 1.6932755256882723, + "grad_norm": 1.21875, + "learning_rate": 7.549565217391305e-07, + "loss": 0.234, + "step": 24410 + }, + { + "epoch": 1.693969217429005, + "grad_norm": 1.046875, + "learning_rate": 7.53217391304348e-07, + "loss": 0.2418, + "step": 24420 + }, + { + "epoch": 1.6946629091697378, + "grad_norm": 1.375, + "learning_rate": 7.514782608695653e-07, + "loss": 0.3734, + "step": 24430 + }, + { + "epoch": 1.6953566009104704, + "grad_norm": 1.234375, + "learning_rate": 7.497391304347827e-07, + "loss": 0.2375, + "step": 24440 + }, + { + "epoch": 1.696050292651203, + "grad_norm": 1.3046875, + "learning_rate": 7.480000000000001e-07, + "loss": 0.2323, + "step": 24450 + }, + { + "epoch": 1.6967439843919359, + "grad_norm": 1.125, + "learning_rate": 7.462608695652176e-07, + "loss": 0.3004, + "step": 24460 + }, + { + "epoch": 1.6974376761326684, + "grad_norm": 1.28125, + "learning_rate": 7.445217391304349e-07, + "loss": 0.2572, + "step": 24470 + }, + { + "epoch": 1.6981313678734011, + "grad_norm": 1.4375, + "learning_rate": 7.427826086956523e-07, + "loss": 0.2329, + "step": 24480 + }, + { + "epoch": 1.6988250596141339, + "grad_norm": 1.125, + "learning_rate": 7.410434782608696e-07, + "loss": 0.2483, + "step": 24490 + }, + { + "epoch": 1.6995187513548666, + "grad_norm": 1.578125, + "learning_rate": 7.393043478260869e-07, + "loss": 0.205, + "step": 24500 + }, + { + "epoch": 1.7002124430955994, + "grad_norm": 1.25, + "learning_rate": 7.375652173913043e-07, + "loss": 0.2956, + "step": 24510 + }, + { + "epoch": 1.7009061348363321, + "grad_norm": 1.15625, + "learning_rate": 7.358260869565218e-07, + "loss": 0.2632, + "step": 24520 + }, + { + "epoch": 1.7015998265770649, + "grad_norm": 1.234375, + "learning_rate": 7.340869565217392e-07, + "loss": 0.244, + "step": 24530 + }, + { + "epoch": 1.7022935183177976, + "grad_norm": 1.1875, + "learning_rate": 7.323478260869565e-07, + "loss": 0.2039, + "step": 24540 + }, + { + "epoch": 1.7029872100585304, + "grad_norm": 1.734375, + "learning_rate": 7.306086956521739e-07, + "loss": 0.3392, + "step": 24550 + }, + { + "epoch": 1.703680901799263, + "grad_norm": 1.4140625, + "learning_rate": 7.288695652173914e-07, + "loss": 0.2325, + "step": 24560 + }, + { + "epoch": 1.7043745935399957, + "grad_norm": 1.171875, + "learning_rate": 7.271304347826087e-07, + "loss": 0.219, + "step": 24570 + }, + { + "epoch": 1.7050682852807284, + "grad_norm": 1.1875, + "learning_rate": 7.253913043478261e-07, + "loss": 0.229, + "step": 24580 + }, + { + "epoch": 1.705761977021461, + "grad_norm": 1.1328125, + "learning_rate": 7.236521739130435e-07, + "loss": 0.232, + "step": 24590 + }, + { + "epoch": 1.7064556687621937, + "grad_norm": 1.015625, + "learning_rate": 7.219130434782609e-07, + "loss": 0.2213, + "step": 24600 + }, + { + "epoch": 1.7071493605029264, + "grad_norm": 1.125, + "learning_rate": 7.201739130434783e-07, + "loss": 0.2818, + "step": 24610 + }, + { + "epoch": 1.7078430522436592, + "grad_norm": 1.375, + "learning_rate": 7.184347826086957e-07, + "loss": 0.2612, + "step": 24620 + }, + { + "epoch": 1.708536743984392, + "grad_norm": 1.609375, + "learning_rate": 7.166956521739131e-07, + "loss": 0.2474, + "step": 24630 + }, + { + "epoch": 1.7092304357251247, + "grad_norm": 1.484375, + "learning_rate": 7.149565217391304e-07, + "loss": 0.2656, + "step": 24640 + }, + { + "epoch": 1.7099241274658574, + "grad_norm": 1.1328125, + "learning_rate": 7.132173913043479e-07, + "loss": 0.2126, + "step": 24650 + }, + { + "epoch": 1.7106178192065902, + "grad_norm": 1.3203125, + "learning_rate": 7.114782608695653e-07, + "loss": 0.2265, + "step": 24660 + }, + { + "epoch": 1.711311510947323, + "grad_norm": 1.4609375, + "learning_rate": 7.097391304347827e-07, + "loss": 0.2468, + "step": 24670 + }, + { + "epoch": 1.7120052026880554, + "grad_norm": 1.1015625, + "learning_rate": 7.08e-07, + "loss": 0.2395, + "step": 24680 + }, + { + "epoch": 1.7126988944287882, + "grad_norm": 1.21875, + "learning_rate": 7.062608695652175e-07, + "loss": 0.2194, + "step": 24690 + }, + { + "epoch": 1.713392586169521, + "grad_norm": 1.265625, + "learning_rate": 7.045217391304349e-07, + "loss": 0.2355, + "step": 24700 + }, + { + "epoch": 1.7140862779102535, + "grad_norm": 1.125, + "learning_rate": 7.027826086956523e-07, + "loss": 0.2146, + "step": 24710 + }, + { + "epoch": 1.7147799696509862, + "grad_norm": 1.3125, + "learning_rate": 7.010434782608696e-07, + "loss": 0.2325, + "step": 24720 + }, + { + "epoch": 1.715473661391719, + "grad_norm": 1.046875, + "learning_rate": 6.99304347826087e-07, + "loss": 0.252, + "step": 24730 + }, + { + "epoch": 1.7161673531324517, + "grad_norm": 0.9921875, + "learning_rate": 6.975652173913045e-07, + "loss": 0.2504, + "step": 24740 + }, + { + "epoch": 1.7168610448731845, + "grad_norm": 0.97265625, + "learning_rate": 6.958260869565218e-07, + "loss": 0.2483, + "step": 24750 + }, + { + "epoch": 1.7175547366139172, + "grad_norm": 1.765625, + "learning_rate": 6.940869565217392e-07, + "loss": 0.2785, + "step": 24760 + }, + { + "epoch": 1.71824842835465, + "grad_norm": 1.828125, + "learning_rate": 6.923478260869566e-07, + "loss": 0.3625, + "step": 24770 + }, + { + "epoch": 1.7189421200953827, + "grad_norm": 1.21875, + "learning_rate": 6.906086956521741e-07, + "loss": 0.2435, + "step": 24780 + }, + { + "epoch": 1.7196358118361155, + "grad_norm": 1.5390625, + "learning_rate": 6.888695652173914e-07, + "loss": 0.2389, + "step": 24790 + }, + { + "epoch": 1.720329503576848, + "grad_norm": 1.25, + "learning_rate": 6.871304347826087e-07, + "loss": 0.2638, + "step": 24800 + }, + { + "epoch": 1.7210231953175807, + "grad_norm": 1.109375, + "learning_rate": 6.853913043478261e-07, + "loss": 0.2871, + "step": 24810 + }, + { + "epoch": 1.7217168870583135, + "grad_norm": 1.015625, + "learning_rate": 6.836521739130434e-07, + "loss": 0.2482, + "step": 24820 + }, + { + "epoch": 1.722410578799046, + "grad_norm": 1.6484375, + "learning_rate": 6.819130434782609e-07, + "loss": 0.2159, + "step": 24830 + }, + { + "epoch": 1.7231042705397788, + "grad_norm": 1.2421875, + "learning_rate": 6.801739130434783e-07, + "loss": 0.2426, + "step": 24840 + }, + { + "epoch": 1.7237979622805115, + "grad_norm": 1.6484375, + "learning_rate": 6.784347826086957e-07, + "loss": 0.2381, + "step": 24850 + }, + { + "epoch": 1.7244916540212443, + "grad_norm": 1.046875, + "learning_rate": 6.76695652173913e-07, + "loss": 0.2482, + "step": 24860 + }, + { + "epoch": 1.725185345761977, + "grad_norm": 1.09375, + "learning_rate": 6.749565217391304e-07, + "loss": 0.2337, + "step": 24870 + }, + { + "epoch": 1.7258790375027098, + "grad_norm": 0.7578125, + "learning_rate": 6.732173913043479e-07, + "loss": 0.2348, + "step": 24880 + }, + { + "epoch": 1.7265727292434425, + "grad_norm": 1.578125, + "learning_rate": 6.714782608695653e-07, + "loss": 0.271, + "step": 24890 + }, + { + "epoch": 1.7272664209841753, + "grad_norm": 0.96875, + "learning_rate": 6.697391304347826e-07, + "loss": 0.2612, + "step": 24900 + }, + { + "epoch": 1.727960112724908, + "grad_norm": 1.1796875, + "learning_rate": 6.68e-07, + "loss": 0.2523, + "step": 24910 + }, + { + "epoch": 1.7286538044656405, + "grad_norm": 1.3828125, + "learning_rate": 6.662608695652175e-07, + "loss": 0.2531, + "step": 24920 + }, + { + "epoch": 1.7293474962063733, + "grad_norm": 1.171875, + "learning_rate": 6.645217391304348e-07, + "loss": 0.226, + "step": 24930 + }, + { + "epoch": 1.730041187947106, + "grad_norm": 1.15625, + "learning_rate": 6.627826086956522e-07, + "loss": 0.2511, + "step": 24940 + }, + { + "epoch": 1.7307348796878386, + "grad_norm": 1.1328125, + "learning_rate": 6.610434782608696e-07, + "loss": 0.2648, + "step": 24950 + }, + { + "epoch": 1.7314285714285713, + "grad_norm": 1.4296875, + "learning_rate": 6.593043478260871e-07, + "loss": 0.2495, + "step": 24960 + }, + { + "epoch": 1.732122263169304, + "grad_norm": 1.03125, + "learning_rate": 6.575652173913044e-07, + "loss": 0.2491, + "step": 24970 + }, + { + "epoch": 1.7328159549100368, + "grad_norm": 1.1875, + "learning_rate": 6.558260869565218e-07, + "loss": 0.1996, + "step": 24980 + }, + { + "epoch": 1.7335096466507696, + "grad_norm": 1.265625, + "learning_rate": 6.540869565217392e-07, + "loss": 0.2316, + "step": 24990 + }, + { + "epoch": 1.7342033383915023, + "grad_norm": 1.1484375, + "learning_rate": 6.523478260869566e-07, + "loss": 0.2417, + "step": 25000 + }, + { + "epoch": 1.734897030132235, + "grad_norm": 1.0078125, + "learning_rate": 6.50608695652174e-07, + "loss": 0.2705, + "step": 25010 + }, + { + "epoch": 1.7355907218729678, + "grad_norm": 1.265625, + "learning_rate": 6.488695652173914e-07, + "loss": 0.234, + "step": 25020 + }, + { + "epoch": 1.7362844136137006, + "grad_norm": 1.3515625, + "learning_rate": 6.471304347826088e-07, + "loss": 0.2018, + "step": 25030 + }, + { + "epoch": 1.736978105354433, + "grad_norm": 1.71875, + "learning_rate": 6.453913043478261e-07, + "loss": 0.26, + "step": 25040 + }, + { + "epoch": 1.7376717970951658, + "grad_norm": 1.4375, + "learning_rate": 6.436521739130436e-07, + "loss": 0.2964, + "step": 25050 + }, + { + "epoch": 1.7383654888358986, + "grad_norm": 1.1484375, + "learning_rate": 6.41913043478261e-07, + "loss": 0.2349, + "step": 25060 + }, + { + "epoch": 1.7390591805766311, + "grad_norm": 1.359375, + "learning_rate": 6.401739130434784e-07, + "loss": 0.249, + "step": 25070 + }, + { + "epoch": 1.7397528723173639, + "grad_norm": 1.953125, + "learning_rate": 6.384347826086957e-07, + "loss": 0.2878, + "step": 25080 + }, + { + "epoch": 1.7404465640580966, + "grad_norm": 1.3671875, + "learning_rate": 6.366956521739132e-07, + "loss": 0.2922, + "step": 25090 + }, + { + "epoch": 1.7411402557988294, + "grad_norm": 1.328125, + "learning_rate": 6.349565217391306e-07, + "loss": 0.2339, + "step": 25100 + }, + { + "epoch": 1.741833947539562, + "grad_norm": 1.3671875, + "learning_rate": 6.332173913043478e-07, + "loss": 0.2677, + "step": 25110 + }, + { + "epoch": 1.7425276392802949, + "grad_norm": 1.5546875, + "learning_rate": 6.314782608695652e-07, + "loss": 0.2195, + "step": 25120 + }, + { + "epoch": 1.7432213310210276, + "grad_norm": 1.234375, + "learning_rate": 6.297391304347826e-07, + "loss": 0.2228, + "step": 25130 + }, + { + "epoch": 1.7439150227617604, + "grad_norm": 1.15625, + "learning_rate": 6.28e-07, + "loss": 0.1983, + "step": 25140 + }, + { + "epoch": 1.744608714502493, + "grad_norm": 1.3359375, + "learning_rate": 6.262608695652174e-07, + "loss": 0.2045, + "step": 25150 + }, + { + "epoch": 1.7453024062432256, + "grad_norm": 1.53125, + "learning_rate": 6.245217391304348e-07, + "loss": 0.2562, + "step": 25160 + }, + { + "epoch": 1.7459960979839584, + "grad_norm": 1.3984375, + "learning_rate": 6.227826086956523e-07, + "loss": 0.2394, + "step": 25170 + }, + { + "epoch": 1.7466897897246911, + "grad_norm": 0.98046875, + "learning_rate": 6.210434782608697e-07, + "loss": 0.2807, + "step": 25180 + }, + { + "epoch": 1.7473834814654237, + "grad_norm": 1.21875, + "learning_rate": 6.19304347826087e-07, + "loss": 0.2084, + "step": 25190 + }, + { + "epoch": 1.7480771732061564, + "grad_norm": 1.125, + "learning_rate": 6.175652173913044e-07, + "loss": 0.2275, + "step": 25200 + }, + { + "epoch": 1.7487708649468892, + "grad_norm": 1.9921875, + "learning_rate": 6.158260869565218e-07, + "loss": 0.3696, + "step": 25210 + }, + { + "epoch": 1.749464556687622, + "grad_norm": 1.6484375, + "learning_rate": 6.140869565217391e-07, + "loss": 0.3133, + "step": 25220 + }, + { + "epoch": 1.7501582484283547, + "grad_norm": 1.2265625, + "learning_rate": 6.123478260869565e-07, + "loss": 0.2205, + "step": 25230 + }, + { + "epoch": 1.7508519401690874, + "grad_norm": 0.9765625, + "learning_rate": 6.10608695652174e-07, + "loss": 0.2292, + "step": 25240 + }, + { + "epoch": 1.7515456319098202, + "grad_norm": 1.1640625, + "learning_rate": 6.088695652173914e-07, + "loss": 0.2705, + "step": 25250 + }, + { + "epoch": 1.752239323650553, + "grad_norm": 1.0625, + "learning_rate": 6.071304347826087e-07, + "loss": 0.2202, + "step": 25260 + }, + { + "epoch": 1.7529330153912857, + "grad_norm": 1.546875, + "learning_rate": 6.053913043478261e-07, + "loss": 0.2228, + "step": 25270 + }, + { + "epoch": 1.7536267071320182, + "grad_norm": 1.3046875, + "learning_rate": 6.036521739130436e-07, + "loss": 0.242, + "step": 25280 + }, + { + "epoch": 1.754320398872751, + "grad_norm": 0.96875, + "learning_rate": 6.01913043478261e-07, + "loss": 0.2173, + "step": 25290 + }, + { + "epoch": 1.7550140906134837, + "grad_norm": 1.375, + "learning_rate": 6.001739130434783e-07, + "loss": 0.2418, + "step": 25300 + }, + { + "epoch": 1.7557077823542162, + "grad_norm": 1.375, + "learning_rate": 5.984347826086957e-07, + "loss": 0.3065, + "step": 25310 + }, + { + "epoch": 1.756401474094949, + "grad_norm": 1.1875, + "learning_rate": 5.966956521739132e-07, + "loss": 0.2139, + "step": 25320 + }, + { + "epoch": 1.7570951658356817, + "grad_norm": 1.03125, + "learning_rate": 5.949565217391305e-07, + "loss": 0.2255, + "step": 25330 + }, + { + "epoch": 1.7577888575764145, + "grad_norm": 0.953125, + "learning_rate": 5.932173913043478e-07, + "loss": 0.2343, + "step": 25340 + }, + { + "epoch": 1.7584825493171472, + "grad_norm": 1.265625, + "learning_rate": 5.914782608695653e-07, + "loss": 0.2387, + "step": 25350 + }, + { + "epoch": 1.75917624105788, + "grad_norm": 1.171875, + "learning_rate": 5.897391304347827e-07, + "loss": 0.2096, + "step": 25360 + }, + { + "epoch": 1.7598699327986127, + "grad_norm": 1.6796875, + "learning_rate": 5.88e-07, + "loss": 0.2429, + "step": 25370 + }, + { + "epoch": 1.7605636245393455, + "grad_norm": 1.03125, + "learning_rate": 5.862608695652174e-07, + "loss": 0.2416, + "step": 25380 + }, + { + "epoch": 1.7612573162800782, + "grad_norm": 1.28125, + "learning_rate": 5.845217391304349e-07, + "loss": 0.2747, + "step": 25390 + }, + { + "epoch": 1.7619510080208107, + "grad_norm": 1.2265625, + "learning_rate": 5.827826086956522e-07, + "loss": 0.2036, + "step": 25400 + }, + { + "epoch": 1.7626446997615435, + "grad_norm": 1.1328125, + "learning_rate": 5.810434782608696e-07, + "loss": 0.2278, + "step": 25410 + }, + { + "epoch": 1.7633383915022762, + "grad_norm": 1.2890625, + "learning_rate": 5.79304347826087e-07, + "loss": 0.2085, + "step": 25420 + }, + { + "epoch": 1.7640320832430088, + "grad_norm": 0.98828125, + "learning_rate": 5.775652173913044e-07, + "loss": 0.2329, + "step": 25430 + }, + { + "epoch": 1.7647257749837415, + "grad_norm": 1.3203125, + "learning_rate": 5.758260869565218e-07, + "loss": 0.2339, + "step": 25440 + }, + { + "epoch": 1.7654194667244743, + "grad_norm": 1.140625, + "learning_rate": 5.740869565217392e-07, + "loss": 0.2597, + "step": 25450 + }, + { + "epoch": 1.766113158465207, + "grad_norm": 1.1796875, + "learning_rate": 5.723478260869566e-07, + "loss": 0.2197, + "step": 25460 + }, + { + "epoch": 1.7668068502059397, + "grad_norm": 1.0859375, + "learning_rate": 5.70608695652174e-07, + "loss": 0.2542, + "step": 25470 + }, + { + "epoch": 1.7675005419466725, + "grad_norm": 1.15625, + "learning_rate": 5.688695652173914e-07, + "loss": 0.2738, + "step": 25480 + }, + { + "epoch": 1.7681942336874052, + "grad_norm": 1.4140625, + "learning_rate": 5.671304347826087e-07, + "loss": 0.2285, + "step": 25490 + }, + { + "epoch": 1.768887925428138, + "grad_norm": 1.2265625, + "learning_rate": 5.653913043478261e-07, + "loss": 0.2296, + "step": 25500 + }, + { + "epoch": 1.7695816171688707, + "grad_norm": 1.6015625, + "learning_rate": 5.636521739130435e-07, + "loss": 0.2071, + "step": 25510 + }, + { + "epoch": 1.7702753089096033, + "grad_norm": 1.1953125, + "learning_rate": 5.619130434782609e-07, + "loss": 0.2482, + "step": 25520 + }, + { + "epoch": 1.770969000650336, + "grad_norm": 1.109375, + "learning_rate": 5.601739130434783e-07, + "loss": 0.3006, + "step": 25530 + }, + { + "epoch": 1.7716626923910688, + "grad_norm": 1.21875, + "learning_rate": 5.584347826086957e-07, + "loss": 0.2639, + "step": 25540 + }, + { + "epoch": 1.7723563841318013, + "grad_norm": 1.5546875, + "learning_rate": 5.566956521739131e-07, + "loss": 0.2591, + "step": 25550 + }, + { + "epoch": 1.773050075872534, + "grad_norm": 1.1875, + "learning_rate": 5.549565217391305e-07, + "loss": 0.3238, + "step": 25560 + }, + { + "epoch": 1.7737437676132668, + "grad_norm": 1.8515625, + "learning_rate": 5.532173913043479e-07, + "loss": 0.2856, + "step": 25570 + }, + { + "epoch": 1.7744374593539995, + "grad_norm": 1.1640625, + "learning_rate": 5.514782608695652e-07, + "loss": 0.3195, + "step": 25580 + }, + { + "epoch": 1.7751311510947323, + "grad_norm": 1.59375, + "learning_rate": 5.497391304347826e-07, + "loss": 0.2509, + "step": 25590 + }, + { + "epoch": 1.775824842835465, + "grad_norm": 1.140625, + "learning_rate": 5.480000000000001e-07, + "loss": 0.2397, + "step": 25600 + }, + { + "epoch": 1.7765185345761978, + "grad_norm": 1.1328125, + "learning_rate": 5.462608695652175e-07, + "loss": 0.2355, + "step": 25610 + }, + { + "epoch": 1.7772122263169305, + "grad_norm": 1.21875, + "learning_rate": 5.445217391304348e-07, + "loss": 0.1725, + "step": 25620 + }, + { + "epoch": 1.777905918057663, + "grad_norm": 1.1484375, + "learning_rate": 5.427826086956522e-07, + "loss": 0.3111, + "step": 25630 + }, + { + "epoch": 1.7785996097983958, + "grad_norm": 1.3046875, + "learning_rate": 5.410434782608697e-07, + "loss": 0.2527, + "step": 25640 + }, + { + "epoch": 1.7792933015391286, + "grad_norm": 1.28125, + "learning_rate": 5.393043478260869e-07, + "loss": 0.2612, + "step": 25650 + }, + { + "epoch": 1.779986993279861, + "grad_norm": 1.5390625, + "learning_rate": 5.375652173913043e-07, + "loss": 0.2509, + "step": 25660 + }, + { + "epoch": 1.7806806850205938, + "grad_norm": 1.390625, + "learning_rate": 5.358260869565218e-07, + "loss": 0.2457, + "step": 25670 + }, + { + "epoch": 1.7813743767613266, + "grad_norm": 1.15625, + "learning_rate": 5.340869565217392e-07, + "loss": 0.2486, + "step": 25680 + }, + { + "epoch": 1.7820680685020593, + "grad_norm": 1.3125, + "learning_rate": 5.323478260869565e-07, + "loss": 0.2046, + "step": 25690 + }, + { + "epoch": 1.782761760242792, + "grad_norm": 1.125, + "learning_rate": 5.306086956521739e-07, + "loss": 0.2119, + "step": 25700 + }, + { + "epoch": 1.7834554519835248, + "grad_norm": 1.0, + "learning_rate": 5.288695652173914e-07, + "loss": 0.2471, + "step": 25710 + }, + { + "epoch": 1.7841491437242576, + "grad_norm": 0.93359375, + "learning_rate": 5.271304347826088e-07, + "loss": 0.2273, + "step": 25720 + }, + { + "epoch": 1.7848428354649903, + "grad_norm": 0.95703125, + "learning_rate": 5.253913043478261e-07, + "loss": 0.2592, + "step": 25730 + }, + { + "epoch": 1.785536527205723, + "grad_norm": 1.3359375, + "learning_rate": 5.236521739130435e-07, + "loss": 0.2367, + "step": 25740 + }, + { + "epoch": 1.7862302189464556, + "grad_norm": 1.3046875, + "learning_rate": 5.21913043478261e-07, + "loss": 0.2357, + "step": 25750 + }, + { + "epoch": 1.7869239106871884, + "grad_norm": 1.2734375, + "learning_rate": 5.201739130434783e-07, + "loss": 0.219, + "step": 25760 + }, + { + "epoch": 1.7876176024279211, + "grad_norm": 1.171875, + "learning_rate": 5.184347826086957e-07, + "loss": 0.2937, + "step": 25770 + }, + { + "epoch": 1.7883112941686536, + "grad_norm": 0.984375, + "learning_rate": 5.166956521739131e-07, + "loss": 0.2343, + "step": 25780 + }, + { + "epoch": 1.7890049859093864, + "grad_norm": 1.3515625, + "learning_rate": 5.149565217391305e-07, + "loss": 0.226, + "step": 25790 + }, + { + "epoch": 1.7896986776501191, + "grad_norm": 0.921875, + "learning_rate": 5.132173913043478e-07, + "loss": 0.2387, + "step": 25800 + }, + { + "epoch": 1.790392369390852, + "grad_norm": 1.1015625, + "learning_rate": 5.114782608695652e-07, + "loss": 0.2074, + "step": 25810 + }, + { + "epoch": 1.7910860611315846, + "grad_norm": 1.1328125, + "learning_rate": 5.097391304347827e-07, + "loss": 0.243, + "step": 25820 + }, + { + "epoch": 1.7917797528723174, + "grad_norm": 1.21875, + "learning_rate": 5.08e-07, + "loss": 0.2289, + "step": 25830 + }, + { + "epoch": 1.7924734446130501, + "grad_norm": 1.1875, + "learning_rate": 5.062608695652174e-07, + "loss": 0.2942, + "step": 25840 + }, + { + "epoch": 1.7931671363537829, + "grad_norm": 1.1953125, + "learning_rate": 5.045217391304348e-07, + "loss": 0.2439, + "step": 25850 + }, + { + "epoch": 1.7938608280945156, + "grad_norm": 0.98046875, + "learning_rate": 5.027826086956522e-07, + "loss": 0.2651, + "step": 25860 + }, + { + "epoch": 1.7945545198352482, + "grad_norm": 1.4375, + "learning_rate": 5.010434782608696e-07, + "loss": 0.2474, + "step": 25870 + }, + { + "epoch": 1.795248211575981, + "grad_norm": 1.578125, + "learning_rate": 4.99304347826087e-07, + "loss": 0.2417, + "step": 25880 + }, + { + "epoch": 1.7959419033167137, + "grad_norm": 1.234375, + "learning_rate": 4.975652173913044e-07, + "loss": 0.2635, + "step": 25890 + }, + { + "epoch": 1.7966355950574462, + "grad_norm": 1.84375, + "learning_rate": 4.958260869565218e-07, + "loss": 0.3054, + "step": 25900 + }, + { + "epoch": 1.797329286798179, + "grad_norm": 1.1953125, + "learning_rate": 4.940869565217392e-07, + "loss": 0.2265, + "step": 25910 + }, + { + "epoch": 1.7980229785389117, + "grad_norm": 1.2109375, + "learning_rate": 4.923478260869566e-07, + "loss": 0.2727, + "step": 25920 + }, + { + "epoch": 1.7987166702796444, + "grad_norm": 1.0390625, + "learning_rate": 4.90608695652174e-07, + "loss": 0.2491, + "step": 25930 + }, + { + "epoch": 1.7994103620203772, + "grad_norm": 1.2578125, + "learning_rate": 4.888695652173913e-07, + "loss": 0.2025, + "step": 25940 + }, + { + "epoch": 1.80010405376111, + "grad_norm": 0.96484375, + "learning_rate": 4.871304347826088e-07, + "loss": 0.2422, + "step": 25950 + }, + { + "epoch": 1.8007977455018427, + "grad_norm": 1.4453125, + "learning_rate": 4.853913043478261e-07, + "loss": 0.2329, + "step": 25960 + }, + { + "epoch": 1.8014914372425754, + "grad_norm": 1.609375, + "learning_rate": 4.836521739130435e-07, + "loss": 0.2516, + "step": 25970 + }, + { + "epoch": 1.8021851289833082, + "grad_norm": 1.1953125, + "learning_rate": 4.819130434782609e-07, + "loss": 0.1947, + "step": 25980 + }, + { + "epoch": 1.8028788207240407, + "grad_norm": 1.390625, + "learning_rate": 4.801739130434783e-07, + "loss": 0.2751, + "step": 25990 + }, + { + "epoch": 1.8035725124647735, + "grad_norm": 1.5234375, + "learning_rate": 4.784347826086957e-07, + "loss": 0.2421, + "step": 26000 + }, + { + "epoch": 1.8042662042055062, + "grad_norm": 1.4609375, + "learning_rate": 4.7669565217391305e-07, + "loss": 0.2389, + "step": 26010 + }, + { + "epoch": 1.8049598959462387, + "grad_norm": 1.21875, + "learning_rate": 4.7495652173913047e-07, + "loss": 0.2199, + "step": 26020 + }, + { + "epoch": 1.8056535876869715, + "grad_norm": 1.71875, + "learning_rate": 4.7321739130434784e-07, + "loss": 0.2385, + "step": 26030 + }, + { + "epoch": 1.8063472794277042, + "grad_norm": 1.25, + "learning_rate": 4.7147826086956527e-07, + "loss": 0.2346, + "step": 26040 + }, + { + "epoch": 1.807040971168437, + "grad_norm": 1.3203125, + "learning_rate": 4.6973913043478264e-07, + "loss": 0.2409, + "step": 26050 + }, + { + "epoch": 1.8077346629091697, + "grad_norm": 1.2890625, + "learning_rate": 4.6800000000000006e-07, + "loss": 0.2329, + "step": 26060 + }, + { + "epoch": 1.8084283546499025, + "grad_norm": 1.28125, + "learning_rate": 4.6626086956521743e-07, + "loss": 0.2252, + "step": 26070 + }, + { + "epoch": 1.8091220463906352, + "grad_norm": 1.28125, + "learning_rate": 4.6452173913043486e-07, + "loss": 0.2564, + "step": 26080 + }, + { + "epoch": 1.809815738131368, + "grad_norm": 1.5078125, + "learning_rate": 4.6278260869565223e-07, + "loss": 0.2508, + "step": 26090 + }, + { + "epoch": 1.8105094298721007, + "grad_norm": 1.3828125, + "learning_rate": 4.6104347826086965e-07, + "loss": 0.2399, + "step": 26100 + }, + { + "epoch": 1.8112031216128333, + "grad_norm": 1.5234375, + "learning_rate": 4.5930434782608697e-07, + "loss": 0.2153, + "step": 26110 + }, + { + "epoch": 1.811896813353566, + "grad_norm": 1.3828125, + "learning_rate": 4.5756521739130434e-07, + "loss": 0.2163, + "step": 26120 + }, + { + "epoch": 1.8125905050942988, + "grad_norm": 1.3515625, + "learning_rate": 4.5582608695652177e-07, + "loss": 0.2719, + "step": 26130 + }, + { + "epoch": 1.8132841968350313, + "grad_norm": 0.9609375, + "learning_rate": 4.5408695652173914e-07, + "loss": 0.1888, + "step": 26140 + }, + { + "epoch": 1.813977888575764, + "grad_norm": 1.2734375, + "learning_rate": 4.5234782608695656e-07, + "loss": 0.3202, + "step": 26150 + }, + { + "epoch": 1.8146715803164968, + "grad_norm": 1.1171875, + "learning_rate": 4.5060869565217393e-07, + "loss": 0.21, + "step": 26160 + }, + { + "epoch": 1.8153652720572295, + "grad_norm": 1.1171875, + "learning_rate": 4.4886956521739136e-07, + "loss": 0.194, + "step": 26170 + }, + { + "epoch": 1.8160589637979623, + "grad_norm": 1.296875, + "learning_rate": 4.4713043478260873e-07, + "loss": 0.2186, + "step": 26180 + }, + { + "epoch": 1.816752655538695, + "grad_norm": 0.74609375, + "learning_rate": 4.4539130434782615e-07, + "loss": 0.2346, + "step": 26190 + }, + { + "epoch": 1.8174463472794278, + "grad_norm": 1.375, + "learning_rate": 4.436521739130435e-07, + "loss": 0.2373, + "step": 26200 + }, + { + "epoch": 1.8181400390201605, + "grad_norm": 1.0, + "learning_rate": 4.419130434782609e-07, + "loss": 0.262, + "step": 26210 + }, + { + "epoch": 1.8188337307608933, + "grad_norm": 0.97265625, + "learning_rate": 4.401739130434783e-07, + "loss": 0.2297, + "step": 26220 + }, + { + "epoch": 1.8195274225016258, + "grad_norm": 1.1171875, + "learning_rate": 4.384347826086957e-07, + "loss": 0.2302, + "step": 26230 + }, + { + "epoch": 1.8202211142423586, + "grad_norm": 0.94921875, + "learning_rate": 4.366956521739131e-07, + "loss": 0.2239, + "step": 26240 + }, + { + "epoch": 1.8209148059830913, + "grad_norm": 1.1796875, + "learning_rate": 4.349565217391305e-07, + "loss": 0.2234, + "step": 26250 + }, + { + "epoch": 1.8216084977238238, + "grad_norm": 1.09375, + "learning_rate": 4.332173913043479e-07, + "loss": 0.242, + "step": 26260 + }, + { + "epoch": 1.8223021894645566, + "grad_norm": 1.9609375, + "learning_rate": 4.314782608695652e-07, + "loss": 0.2617, + "step": 26270 + }, + { + "epoch": 1.8229958812052893, + "grad_norm": 1.4765625, + "learning_rate": 4.297391304347826e-07, + "loss": 0.2493, + "step": 26280 + }, + { + "epoch": 1.823689572946022, + "grad_norm": 1.3046875, + "learning_rate": 4.28e-07, + "loss": 0.2419, + "step": 26290 + }, + { + "epoch": 1.8243832646867548, + "grad_norm": 1.2265625, + "learning_rate": 4.262608695652174e-07, + "loss": 0.2478, + "step": 26300 + }, + { + "epoch": 1.8250769564274876, + "grad_norm": 1.3046875, + "learning_rate": 4.245217391304348e-07, + "loss": 0.2801, + "step": 26310 + }, + { + "epoch": 1.8257706481682203, + "grad_norm": 1.03125, + "learning_rate": 4.227826086956522e-07, + "loss": 0.2001, + "step": 26320 + }, + { + "epoch": 1.826464339908953, + "grad_norm": 0.890625, + "learning_rate": 4.210434782608696e-07, + "loss": 0.2197, + "step": 26330 + }, + { + "epoch": 1.8271580316496858, + "grad_norm": 1.28125, + "learning_rate": 4.19304347826087e-07, + "loss": 0.2616, + "step": 26340 + }, + { + "epoch": 1.8278517233904183, + "grad_norm": 1.40625, + "learning_rate": 4.175652173913044e-07, + "loss": 0.2663, + "step": 26350 + }, + { + "epoch": 1.828545415131151, + "grad_norm": 1.3515625, + "learning_rate": 4.158260869565218e-07, + "loss": 0.2307, + "step": 26360 + }, + { + "epoch": 1.8292391068718838, + "grad_norm": 1.3671875, + "learning_rate": 4.140869565217392e-07, + "loss": 0.2576, + "step": 26370 + }, + { + "epoch": 1.8299327986126164, + "grad_norm": 1.109375, + "learning_rate": 4.1234782608695657e-07, + "loss": 0.1986, + "step": 26380 + }, + { + "epoch": 1.8306264903533491, + "grad_norm": 1.3359375, + "learning_rate": 4.1060869565217394e-07, + "loss": 0.2378, + "step": 26390 + }, + { + "epoch": 1.8313201820940819, + "grad_norm": 1.78125, + "learning_rate": 4.0886956521739137e-07, + "loss": 0.2416, + "step": 26400 + }, + { + "epoch": 1.8320138738348146, + "grad_norm": 1.7890625, + "learning_rate": 4.0713043478260874e-07, + "loss": 0.3052, + "step": 26410 + }, + { + "epoch": 1.8327075655755474, + "grad_norm": 1.15625, + "learning_rate": 4.053913043478261e-07, + "loss": 0.2222, + "step": 26420 + }, + { + "epoch": 1.8334012573162801, + "grad_norm": 1.2890625, + "learning_rate": 4.036521739130435e-07, + "loss": 0.2901, + "step": 26430 + }, + { + "epoch": 1.8340949490570129, + "grad_norm": 1.4921875, + "learning_rate": 4.0191304347826085e-07, + "loss": 0.2526, + "step": 26440 + }, + { + "epoch": 1.8347886407977456, + "grad_norm": 1.234375, + "learning_rate": 4.001739130434783e-07, + "loss": 0.2955, + "step": 26450 + }, + { + "epoch": 1.8354823325384784, + "grad_norm": 1.8359375, + "learning_rate": 3.9843478260869565e-07, + "loss": 0.301, + "step": 26460 + }, + { + "epoch": 1.836176024279211, + "grad_norm": 1.28125, + "learning_rate": 3.9669565217391307e-07, + "loss": 0.2689, + "step": 26470 + }, + { + "epoch": 1.8368697160199436, + "grad_norm": 0.921875, + "learning_rate": 3.9495652173913044e-07, + "loss": 0.2165, + "step": 26480 + }, + { + "epoch": 1.8375634077606764, + "grad_norm": 1.96875, + "learning_rate": 3.9321739130434787e-07, + "loss": 0.3363, + "step": 26490 + }, + { + "epoch": 1.838257099501409, + "grad_norm": 1.5625, + "learning_rate": 3.9147826086956524e-07, + "loss": 0.2889, + "step": 26500 + }, + { + "epoch": 1.8389507912421417, + "grad_norm": 1.140625, + "learning_rate": 3.8973913043478266e-07, + "loss": 0.205, + "step": 26510 + }, + { + "epoch": 1.8396444829828744, + "grad_norm": 1.265625, + "learning_rate": 3.8800000000000003e-07, + "loss": 0.2951, + "step": 26520 + }, + { + "epoch": 1.8403381747236072, + "grad_norm": 1.25, + "learning_rate": 3.8626086956521746e-07, + "loss": 0.2299, + "step": 26530 + }, + { + "epoch": 1.84103186646434, + "grad_norm": 1.40625, + "learning_rate": 3.8452173913043483e-07, + "loss": 0.233, + "step": 26540 + }, + { + "epoch": 1.8417255582050727, + "grad_norm": 1.234375, + "learning_rate": 3.8278260869565225e-07, + "loss": 0.2723, + "step": 26550 + }, + { + "epoch": 1.8424192499458054, + "grad_norm": 1.21875, + "learning_rate": 3.810434782608696e-07, + "loss": 0.2245, + "step": 26560 + }, + { + "epoch": 1.8431129416865382, + "grad_norm": 1.0, + "learning_rate": 3.7930434782608705e-07, + "loss": 0.2378, + "step": 26570 + }, + { + "epoch": 1.843806633427271, + "grad_norm": 1.2734375, + "learning_rate": 3.7756521739130437e-07, + "loss": 0.2427, + "step": 26580 + }, + { + "epoch": 1.8445003251680034, + "grad_norm": 1.0703125, + "learning_rate": 3.7582608695652174e-07, + "loss": 0.2357, + "step": 26590 + }, + { + "epoch": 1.8451940169087362, + "grad_norm": 1.359375, + "learning_rate": 3.7408695652173916e-07, + "loss": 0.3183, + "step": 26600 + }, + { + "epoch": 1.845887708649469, + "grad_norm": 1.03125, + "learning_rate": 3.7234782608695653e-07, + "loss": 0.3006, + "step": 26610 + }, + { + "epoch": 1.8465814003902015, + "grad_norm": 1.5546875, + "learning_rate": 3.7060869565217396e-07, + "loss": 0.2224, + "step": 26620 + }, + { + "epoch": 1.8472750921309342, + "grad_norm": 1.140625, + "learning_rate": 3.6886956521739133e-07, + "loss": 0.221, + "step": 26630 + }, + { + "epoch": 1.847968783871667, + "grad_norm": 1.109375, + "learning_rate": 3.671304347826087e-07, + "loss": 0.2246, + "step": 26640 + }, + { + "epoch": 1.8486624756123997, + "grad_norm": 1.53125, + "learning_rate": 3.653913043478261e-07, + "loss": 0.2503, + "step": 26650 + }, + { + "epoch": 1.8493561673531325, + "grad_norm": 1.34375, + "learning_rate": 3.636521739130435e-07, + "loss": 0.1998, + "step": 26660 + }, + { + "epoch": 1.8500498590938652, + "grad_norm": 1.171875, + "learning_rate": 3.619130434782609e-07, + "loss": 0.2254, + "step": 26670 + }, + { + "epoch": 1.850743550834598, + "grad_norm": 1.25, + "learning_rate": 3.601739130434783e-07, + "loss": 0.2482, + "step": 26680 + }, + { + "epoch": 1.8514372425753307, + "grad_norm": 1.0, + "learning_rate": 3.584347826086957e-07, + "loss": 0.2333, + "step": 26690 + }, + { + "epoch": 1.8521309343160635, + "grad_norm": 1.515625, + "learning_rate": 3.566956521739131e-07, + "loss": 0.2732, + "step": 26700 + }, + { + "epoch": 1.852824626056796, + "grad_norm": 1.15625, + "learning_rate": 3.549565217391305e-07, + "loss": 0.2439, + "step": 26710 + }, + { + "epoch": 1.8535183177975287, + "grad_norm": 1.25, + "learning_rate": 3.532173913043479e-07, + "loss": 0.196, + "step": 26720 + }, + { + "epoch": 1.8542120095382615, + "grad_norm": 1.625, + "learning_rate": 3.514782608695652e-07, + "loss": 0.2994, + "step": 26730 + }, + { + "epoch": 1.854905701278994, + "grad_norm": 1.2421875, + "learning_rate": 3.497391304347826e-07, + "loss": 0.2272, + "step": 26740 + }, + { + "epoch": 1.8555993930197268, + "grad_norm": 1.2734375, + "learning_rate": 3.48e-07, + "loss": 0.2331, + "step": 26750 + }, + { + "epoch": 1.8562930847604595, + "grad_norm": 1.4453125, + "learning_rate": 3.462608695652174e-07, + "loss": 0.2274, + "step": 26760 + }, + { + "epoch": 1.8569867765011923, + "grad_norm": 1.53125, + "learning_rate": 3.445217391304348e-07, + "loss": 0.2571, + "step": 26770 + }, + { + "epoch": 1.857680468241925, + "grad_norm": 1.453125, + "learning_rate": 3.427826086956522e-07, + "loss": 0.2547, + "step": 26780 + }, + { + "epoch": 1.8583741599826578, + "grad_norm": 1.140625, + "learning_rate": 3.410434782608696e-07, + "loss": 0.2549, + "step": 26790 + }, + { + "epoch": 1.8590678517233905, + "grad_norm": 1.140625, + "learning_rate": 3.39304347826087e-07, + "loss": 0.278, + "step": 26800 + }, + { + "epoch": 1.8597615434641233, + "grad_norm": 0.97265625, + "learning_rate": 3.375652173913044e-07, + "loss": 0.2091, + "step": 26810 + }, + { + "epoch": 1.860455235204856, + "grad_norm": 2.015625, + "learning_rate": 3.3582608695652175e-07, + "loss": 0.2894, + "step": 26820 + }, + { + "epoch": 1.8611489269455885, + "grad_norm": 0.85546875, + "learning_rate": 3.3408695652173917e-07, + "loss": 0.2035, + "step": 26830 + }, + { + "epoch": 1.8618426186863213, + "grad_norm": 1.65625, + "learning_rate": 3.3234782608695654e-07, + "loss": 0.2096, + "step": 26840 + }, + { + "epoch": 1.862536310427054, + "grad_norm": 1.296875, + "learning_rate": 3.3060869565217397e-07, + "loss": 0.2182, + "step": 26850 + }, + { + "epoch": 1.8632300021677866, + "grad_norm": 1.2109375, + "learning_rate": 3.2886956521739134e-07, + "loss": 0.2323, + "step": 26860 + }, + { + "epoch": 1.8639236939085193, + "grad_norm": 0.96875, + "learning_rate": 3.2713043478260876e-07, + "loss": 0.307, + "step": 26870 + }, + { + "epoch": 1.864617385649252, + "grad_norm": 1.125, + "learning_rate": 3.2539130434782614e-07, + "loss": 0.2165, + "step": 26880 + }, + { + "epoch": 1.8653110773899848, + "grad_norm": 1.1015625, + "learning_rate": 3.2365217391304345e-07, + "loss": 0.2663, + "step": 26890 + }, + { + "epoch": 1.8660047691307176, + "grad_norm": 0.9765625, + "learning_rate": 3.219130434782609e-07, + "loss": 0.2491, + "step": 26900 + }, + { + "epoch": 1.8666984608714503, + "grad_norm": 1.609375, + "learning_rate": 3.2017391304347825e-07, + "loss": 0.3144, + "step": 26910 + }, + { + "epoch": 1.867392152612183, + "grad_norm": 1.578125, + "learning_rate": 3.1843478260869567e-07, + "loss": 0.2495, + "step": 26920 + }, + { + "epoch": 1.8680858443529158, + "grad_norm": 1.28125, + "learning_rate": 3.1669565217391304e-07, + "loss": 0.3146, + "step": 26930 + }, + { + "epoch": 1.8687795360936483, + "grad_norm": 1.2734375, + "learning_rate": 3.1495652173913047e-07, + "loss": 0.2254, + "step": 26940 + }, + { + "epoch": 1.869473227834381, + "grad_norm": 1.2890625, + "learning_rate": 3.1321739130434784e-07, + "loss": 0.2286, + "step": 26950 + }, + { + "epoch": 1.8701669195751138, + "grad_norm": 1.328125, + "learning_rate": 3.1147826086956526e-07, + "loss": 0.238, + "step": 26960 + }, + { + "epoch": 1.8708606113158464, + "grad_norm": 1.1015625, + "learning_rate": 3.0973913043478263e-07, + "loss": 0.2336, + "step": 26970 + }, + { + "epoch": 1.871554303056579, + "grad_norm": 1.15625, + "learning_rate": 3.0800000000000006e-07, + "loss": 0.2426, + "step": 26980 + }, + { + "epoch": 1.8722479947973119, + "grad_norm": 1.0390625, + "learning_rate": 3.0626086956521743e-07, + "loss": 0.2485, + "step": 26990 + }, + { + "epoch": 1.8729416865380446, + "grad_norm": 1.0625, + "learning_rate": 3.045217391304348e-07, + "loss": 0.2637, + "step": 27000 + }, + { + "epoch": 1.8736353782787774, + "grad_norm": 1.0859375, + "learning_rate": 3.0278260869565217e-07, + "loss": 0.2391, + "step": 27010 + }, + { + "epoch": 1.87432907001951, + "grad_norm": 1.3359375, + "learning_rate": 3.010434782608696e-07, + "loss": 0.2819, + "step": 27020 + }, + { + "epoch": 1.8750227617602429, + "grad_norm": 1.21875, + "learning_rate": 2.9930434782608697e-07, + "loss": 0.238, + "step": 27030 + }, + { + "epoch": 1.8757164535009756, + "grad_norm": 1.3125, + "learning_rate": 2.975652173913044e-07, + "loss": 0.2429, + "step": 27040 + }, + { + "epoch": 1.8764101452417083, + "grad_norm": 1.53125, + "learning_rate": 2.9582608695652176e-07, + "loss": 0.2977, + "step": 27050 + }, + { + "epoch": 1.8771038369824409, + "grad_norm": 1.375, + "learning_rate": 2.940869565217392e-07, + "loss": 0.2399, + "step": 27060 + }, + { + "epoch": 1.8777975287231736, + "grad_norm": 1.5234375, + "learning_rate": 2.9234782608695656e-07, + "loss": 0.2749, + "step": 27070 + }, + { + "epoch": 1.8784912204639064, + "grad_norm": 1.3203125, + "learning_rate": 2.9060869565217393e-07, + "loss": 0.2985, + "step": 27080 + }, + { + "epoch": 1.879184912204639, + "grad_norm": 1.3046875, + "learning_rate": 2.888695652173913e-07, + "loss": 0.228, + "step": 27090 + }, + { + "epoch": 1.8798786039453717, + "grad_norm": 1.3828125, + "learning_rate": 2.871304347826087e-07, + "loss": 0.2449, + "step": 27100 + }, + { + "epoch": 1.8805722956861044, + "grad_norm": 1.171875, + "learning_rate": 2.853913043478261e-07, + "loss": 0.1995, + "step": 27110 + }, + { + "epoch": 1.8812659874268371, + "grad_norm": 0.9609375, + "learning_rate": 2.836521739130435e-07, + "loss": 0.2751, + "step": 27120 + }, + { + "epoch": 1.88195967916757, + "grad_norm": 1.234375, + "learning_rate": 2.819130434782609e-07, + "loss": 0.2217, + "step": 27130 + }, + { + "epoch": 1.8826533709083026, + "grad_norm": 1.359375, + "learning_rate": 2.801739130434783e-07, + "loss": 0.2758, + "step": 27140 + }, + { + "epoch": 1.8833470626490354, + "grad_norm": 1.296875, + "learning_rate": 2.784347826086957e-07, + "loss": 0.2817, + "step": 27150 + }, + { + "epoch": 1.8840407543897681, + "grad_norm": 1.703125, + "learning_rate": 2.7669565217391306e-07, + "loss": 0.2296, + "step": 27160 + }, + { + "epoch": 1.884734446130501, + "grad_norm": 1.3046875, + "learning_rate": 2.7495652173913043e-07, + "loss": 0.219, + "step": 27170 + }, + { + "epoch": 1.8854281378712334, + "grad_norm": 1.2421875, + "learning_rate": 2.7321739130434785e-07, + "loss": 0.2468, + "step": 27180 + }, + { + "epoch": 1.8861218296119662, + "grad_norm": 1.6953125, + "learning_rate": 2.714782608695652e-07, + "loss": 0.2361, + "step": 27190 + }, + { + "epoch": 1.886815521352699, + "grad_norm": 1.296875, + "learning_rate": 2.6973913043478265e-07, + "loss": 0.2341, + "step": 27200 + }, + { + "epoch": 1.8875092130934314, + "grad_norm": 1.4453125, + "learning_rate": 2.68e-07, + "loss": 0.258, + "step": 27210 + }, + { + "epoch": 1.8882029048341642, + "grad_norm": 1.390625, + "learning_rate": 2.6626086956521744e-07, + "loss": 0.2434, + "step": 27220 + }, + { + "epoch": 1.888896596574897, + "grad_norm": 1.1171875, + "learning_rate": 2.645217391304348e-07, + "loss": 0.2194, + "step": 27230 + }, + { + "epoch": 1.8895902883156297, + "grad_norm": 1.1328125, + "learning_rate": 2.627826086956522e-07, + "loss": 0.2651, + "step": 27240 + }, + { + "epoch": 1.8902839800563624, + "grad_norm": 1.25, + "learning_rate": 2.6104347826086955e-07, + "loss": 0.2527, + "step": 27250 + }, + { + "epoch": 1.8909776717970952, + "grad_norm": 1.21875, + "learning_rate": 2.59304347826087e-07, + "loss": 0.2297, + "step": 27260 + }, + { + "epoch": 1.891671363537828, + "grad_norm": 1.34375, + "learning_rate": 2.5756521739130435e-07, + "loss": 0.2108, + "step": 27270 + }, + { + "epoch": 1.8923650552785607, + "grad_norm": 1.0234375, + "learning_rate": 2.558260869565218e-07, + "loss": 0.2001, + "step": 27280 + }, + { + "epoch": 1.8930587470192934, + "grad_norm": 0.9921875, + "learning_rate": 2.5408695652173915e-07, + "loss": 0.2906, + "step": 27290 + }, + { + "epoch": 1.893752438760026, + "grad_norm": 1.6796875, + "learning_rate": 2.5234782608695657e-07, + "loss": 0.2958, + "step": 27300 + }, + { + "epoch": 1.8944461305007587, + "grad_norm": 1.140625, + "learning_rate": 2.5060869565217394e-07, + "loss": 0.32, + "step": 27310 + }, + { + "epoch": 1.8951398222414915, + "grad_norm": 1.25, + "learning_rate": 2.488695652173913e-07, + "loss": 0.2223, + "step": 27320 + }, + { + "epoch": 1.895833513982224, + "grad_norm": 1.421875, + "learning_rate": 2.4713043478260874e-07, + "loss": 0.323, + "step": 27330 + }, + { + "epoch": 1.8965272057229567, + "grad_norm": 1.34375, + "learning_rate": 2.453913043478261e-07, + "loss": 0.2378, + "step": 27340 + }, + { + "epoch": 1.8972208974636895, + "grad_norm": 1.2890625, + "learning_rate": 2.436521739130435e-07, + "loss": 0.26, + "step": 27350 + }, + { + "epoch": 1.8979145892044222, + "grad_norm": 1.2421875, + "learning_rate": 2.419130434782609e-07, + "loss": 0.2459, + "step": 27360 + }, + { + "epoch": 1.898608280945155, + "grad_norm": 1.125, + "learning_rate": 2.4017391304347827e-07, + "loss": 0.2385, + "step": 27370 + }, + { + "epoch": 1.8993019726858877, + "grad_norm": 1.390625, + "learning_rate": 2.384347826086957e-07, + "loss": 0.2089, + "step": 27380 + }, + { + "epoch": 1.8999956644266205, + "grad_norm": 0.99609375, + "learning_rate": 2.3669565217391304e-07, + "loss": 0.2246, + "step": 27390 + }, + { + "epoch": 1.9006893561673532, + "grad_norm": 1.3515625, + "learning_rate": 2.3495652173913044e-07, + "loss": 0.2281, + "step": 27400 + }, + { + "epoch": 1.901383047908086, + "grad_norm": 1.4765625, + "learning_rate": 2.3321739130434784e-07, + "loss": 0.2276, + "step": 27410 + }, + { + "epoch": 1.9020767396488185, + "grad_norm": 1.0234375, + "learning_rate": 2.3147826086956523e-07, + "loss": 0.3057, + "step": 27420 + }, + { + "epoch": 1.9027704313895513, + "grad_norm": 1.203125, + "learning_rate": 2.2973913043478263e-07, + "loss": 0.237, + "step": 27430 + }, + { + "epoch": 1.903464123130284, + "grad_norm": 1.3828125, + "learning_rate": 2.2800000000000003e-07, + "loss": 0.2445, + "step": 27440 + }, + { + "epoch": 1.9041578148710165, + "grad_norm": 1.171875, + "learning_rate": 2.2626086956521743e-07, + "loss": 0.2948, + "step": 27450 + }, + { + "epoch": 1.9048515066117493, + "grad_norm": 1.203125, + "learning_rate": 2.2452173913043483e-07, + "loss": 0.2938, + "step": 27460 + }, + { + "epoch": 1.905545198352482, + "grad_norm": 1.21875, + "learning_rate": 2.2278260869565217e-07, + "loss": 0.3153, + "step": 27470 + }, + { + "epoch": 1.9062388900932148, + "grad_norm": 1.234375, + "learning_rate": 2.2104347826086957e-07, + "loss": 0.2448, + "step": 27480 + }, + { + "epoch": 1.9069325818339475, + "grad_norm": 1.046875, + "learning_rate": 2.1930434782608696e-07, + "loss": 0.2489, + "step": 27490 + }, + { + "epoch": 1.9076262735746803, + "grad_norm": 0.95703125, + "learning_rate": 2.1756521739130436e-07, + "loss": 0.2393, + "step": 27500 + }, + { + "epoch": 1.908319965315413, + "grad_norm": 0.90234375, + "learning_rate": 2.1582608695652176e-07, + "loss": 0.2245, + "step": 27510 + }, + { + "epoch": 1.9090136570561458, + "grad_norm": 1.3359375, + "learning_rate": 2.1408695652173916e-07, + "loss": 0.2445, + "step": 27520 + }, + { + "epoch": 1.9097073487968785, + "grad_norm": 1.078125, + "learning_rate": 2.1234782608695656e-07, + "loss": 0.2321, + "step": 27530 + }, + { + "epoch": 1.910401040537611, + "grad_norm": 1.03125, + "learning_rate": 2.1060869565217393e-07, + "loss": 0.2311, + "step": 27540 + }, + { + "epoch": 1.9110947322783438, + "grad_norm": 1.3359375, + "learning_rate": 2.088695652173913e-07, + "loss": 0.2688, + "step": 27550 + }, + { + "epoch": 1.9117884240190766, + "grad_norm": 1.1875, + "learning_rate": 2.071304347826087e-07, + "loss": 0.2381, + "step": 27560 + }, + { + "epoch": 1.912482115759809, + "grad_norm": 1.09375, + "learning_rate": 2.053913043478261e-07, + "loss": 0.2505, + "step": 27570 + }, + { + "epoch": 1.9131758075005418, + "grad_norm": 1.1796875, + "learning_rate": 2.036521739130435e-07, + "loss": 0.2384, + "step": 27580 + }, + { + "epoch": 1.9138694992412746, + "grad_norm": 1.3359375, + "learning_rate": 2.019130434782609e-07, + "loss": 0.2604, + "step": 27590 + }, + { + "epoch": 1.9145631909820073, + "grad_norm": 1.2734375, + "learning_rate": 2.0017391304347829e-07, + "loss": 0.2229, + "step": 27600 + }, + { + "epoch": 1.91525688272274, + "grad_norm": 1.7421875, + "learning_rate": 1.9843478260869568e-07, + "loss": 0.3559, + "step": 27610 + }, + { + "epoch": 1.9159505744634728, + "grad_norm": 1.5390625, + "learning_rate": 1.9669565217391305e-07, + "loss": 0.2457, + "step": 27620 + }, + { + "epoch": 1.9166442662042056, + "grad_norm": 1.21875, + "learning_rate": 1.9495652173913045e-07, + "loss": 0.2954, + "step": 27630 + }, + { + "epoch": 1.9173379579449383, + "grad_norm": 1.1640625, + "learning_rate": 1.9321739130434782e-07, + "loss": 0.3314, + "step": 27640 + }, + { + "epoch": 1.918031649685671, + "grad_norm": 1.0546875, + "learning_rate": 1.9147826086956522e-07, + "loss": 0.2632, + "step": 27650 + }, + { + "epoch": 1.9187253414264036, + "grad_norm": 1.3046875, + "learning_rate": 1.8973913043478262e-07, + "loss": 0.2361, + "step": 27660 + }, + { + "epoch": 1.9194190331671364, + "grad_norm": 1.34375, + "learning_rate": 1.8800000000000002e-07, + "loss": 0.2164, + "step": 27670 + }, + { + "epoch": 1.920112724907869, + "grad_norm": 1.265625, + "learning_rate": 1.8626086956521741e-07, + "loss": 0.2366, + "step": 27680 + }, + { + "epoch": 1.9208064166486016, + "grad_norm": 1.421875, + "learning_rate": 1.845217391304348e-07, + "loss": 0.2236, + "step": 27690 + }, + { + "epoch": 1.9215001083893344, + "grad_norm": 1.0078125, + "learning_rate": 1.8278260869565218e-07, + "loss": 0.2692, + "step": 27700 + }, + { + "epoch": 1.9221938001300671, + "grad_norm": 1.96875, + "learning_rate": 1.8104347826086958e-07, + "loss": 0.3485, + "step": 27710 + }, + { + "epoch": 1.9228874918707999, + "grad_norm": 1.171875, + "learning_rate": 1.7930434782608698e-07, + "loss": 0.3303, + "step": 27720 + }, + { + "epoch": 1.9235811836115326, + "grad_norm": 1.3828125, + "learning_rate": 1.7756521739130437e-07, + "loss": 0.2459, + "step": 27730 + }, + { + "epoch": 1.9242748753522654, + "grad_norm": 1.609375, + "learning_rate": 1.7582608695652175e-07, + "loss": 0.2606, + "step": 27740 + }, + { + "epoch": 1.9249685670929981, + "grad_norm": 1.3359375, + "learning_rate": 1.7408695652173914e-07, + "loss": 0.2583, + "step": 27750 + }, + { + "epoch": 1.9256622588337309, + "grad_norm": 1.015625, + "learning_rate": 1.7234782608695654e-07, + "loss": 0.2222, + "step": 27760 + }, + { + "epoch": 1.9263559505744636, + "grad_norm": 1.1171875, + "learning_rate": 1.706086956521739e-07, + "loss": 0.2098, + "step": 27770 + }, + { + "epoch": 1.9270496423151962, + "grad_norm": 1.1796875, + "learning_rate": 1.688695652173913e-07, + "loss": 0.2314, + "step": 27780 + }, + { + "epoch": 1.927743334055929, + "grad_norm": 1.265625, + "learning_rate": 1.671304347826087e-07, + "loss": 0.2344, + "step": 27790 + }, + { + "epoch": 1.9284370257966617, + "grad_norm": 1.234375, + "learning_rate": 1.653913043478261e-07, + "loss": 0.2579, + "step": 27800 + }, + { + "epoch": 1.9291307175373942, + "grad_norm": 1.078125, + "learning_rate": 1.636521739130435e-07, + "loss": 0.2024, + "step": 27810 + }, + { + "epoch": 1.929824409278127, + "grad_norm": 1.2890625, + "learning_rate": 1.619130434782609e-07, + "loss": 0.2661, + "step": 27820 + }, + { + "epoch": 1.9305181010188597, + "grad_norm": 1.0859375, + "learning_rate": 1.6017391304347827e-07, + "loss": 0.2161, + "step": 27830 + }, + { + "epoch": 1.9312117927595924, + "grad_norm": 1.1171875, + "learning_rate": 1.5843478260869567e-07, + "loss": 0.2477, + "step": 27840 + }, + { + "epoch": 1.9319054845003252, + "grad_norm": 1.140625, + "learning_rate": 1.5669565217391304e-07, + "loss": 0.2482, + "step": 27850 + }, + { + "epoch": 1.932599176241058, + "grad_norm": 1.7265625, + "learning_rate": 1.5495652173913046e-07, + "loss": 0.2296, + "step": 27860 + }, + { + "epoch": 1.9332928679817907, + "grad_norm": 1.09375, + "learning_rate": 1.5321739130434784e-07, + "loss": 0.2083, + "step": 27870 + }, + { + "epoch": 1.9339865597225234, + "grad_norm": 1.2109375, + "learning_rate": 1.5147826086956523e-07, + "loss": 0.2222, + "step": 27880 + }, + { + "epoch": 1.9346802514632562, + "grad_norm": 0.9453125, + "learning_rate": 1.4973913043478263e-07, + "loss": 0.2657, + "step": 27890 + }, + { + "epoch": 1.9353739432039887, + "grad_norm": 1.0078125, + "learning_rate": 1.4800000000000003e-07, + "loss": 0.2672, + "step": 27900 + }, + { + "epoch": 1.9360676349447214, + "grad_norm": 0.9609375, + "learning_rate": 1.462608695652174e-07, + "loss": 0.2201, + "step": 27910 + }, + { + "epoch": 1.9367613266854542, + "grad_norm": 1.078125, + "learning_rate": 1.445217391304348e-07, + "loss": 0.2807, + "step": 27920 + }, + { + "epoch": 1.9374550184261867, + "grad_norm": 1.4375, + "learning_rate": 1.427826086956522e-07, + "loss": 0.2759, + "step": 27930 + }, + { + "epoch": 1.9381487101669195, + "grad_norm": 1.15625, + "learning_rate": 1.410434782608696e-07, + "loss": 0.2086, + "step": 27940 + }, + { + "epoch": 1.9388424019076522, + "grad_norm": 1.3984375, + "learning_rate": 1.3930434782608696e-07, + "loss": 0.2209, + "step": 27950 + }, + { + "epoch": 1.939536093648385, + "grad_norm": 2.015625, + "learning_rate": 1.3756521739130436e-07, + "loss": 0.2617, + "step": 27960 + }, + { + "epoch": 1.9402297853891177, + "grad_norm": 1.3046875, + "learning_rate": 1.3582608695652176e-07, + "loss": 0.25, + "step": 27970 + }, + { + "epoch": 1.9409234771298505, + "grad_norm": 1.2109375, + "learning_rate": 1.3408695652173916e-07, + "loss": 0.1882, + "step": 27980 + }, + { + "epoch": 1.9416171688705832, + "grad_norm": 1.078125, + "learning_rate": 1.3234782608695653e-07, + "loss": 0.279, + "step": 27990 + }, + { + "epoch": 1.942310860611316, + "grad_norm": 1.25, + "learning_rate": 1.3060869565217392e-07, + "loss": 0.2479, + "step": 28000 + }, + { + "epoch": 1.9430045523520487, + "grad_norm": 1.140625, + "learning_rate": 1.2886956521739132e-07, + "loss": 0.2482, + "step": 28010 + }, + { + "epoch": 1.9436982440927812, + "grad_norm": 1.1328125, + "learning_rate": 1.2713043478260872e-07, + "loss": 0.2375, + "step": 28020 + }, + { + "epoch": 1.944391935833514, + "grad_norm": 1.0390625, + "learning_rate": 1.253913043478261e-07, + "loss": 0.215, + "step": 28030 + }, + { + "epoch": 1.9450856275742467, + "grad_norm": 1.3125, + "learning_rate": 1.236521739130435e-07, + "loss": 0.2363, + "step": 28040 + }, + { + "epoch": 1.9457793193149793, + "grad_norm": 0.93359375, + "learning_rate": 1.2191304347826089e-07, + "loss": 0.3115, + "step": 28050 + }, + { + "epoch": 1.946473011055712, + "grad_norm": 1.1328125, + "learning_rate": 1.2017391304347826e-07, + "loss": 0.2421, + "step": 28060 + }, + { + "epoch": 1.9471667027964448, + "grad_norm": 1.1328125, + "learning_rate": 1.1843478260869566e-07, + "loss": 0.204, + "step": 28070 + }, + { + "epoch": 1.9478603945371775, + "grad_norm": 1.0859375, + "learning_rate": 1.1669565217391305e-07, + "loss": 0.2407, + "step": 28080 + }, + { + "epoch": 1.9485540862779103, + "grad_norm": 1.3828125, + "learning_rate": 1.1495652173913045e-07, + "loss": 0.2806, + "step": 28090 + }, + { + "epoch": 1.949247778018643, + "grad_norm": 1.0703125, + "learning_rate": 1.1321739130434782e-07, + "loss": 0.205, + "step": 28100 + }, + { + "epoch": 1.9499414697593758, + "grad_norm": 1.4609375, + "learning_rate": 1.1147826086956522e-07, + "loss": 0.29, + "step": 28110 + }, + { + "epoch": 1.9506351615001085, + "grad_norm": 1.1484375, + "learning_rate": 1.0973913043478262e-07, + "loss": 0.256, + "step": 28120 + }, + { + "epoch": 1.9513288532408413, + "grad_norm": 1.328125, + "learning_rate": 1.0800000000000001e-07, + "loss": 0.2424, + "step": 28130 + }, + { + "epoch": 1.9520225449815738, + "grad_norm": 1.15625, + "learning_rate": 1.062608695652174e-07, + "loss": 0.2434, + "step": 28140 + }, + { + "epoch": 1.9527162367223065, + "grad_norm": 1.34375, + "learning_rate": 1.0452173913043478e-07, + "loss": 0.273, + "step": 28150 + }, + { + "epoch": 1.9534099284630393, + "grad_norm": 1.265625, + "learning_rate": 1.0278260869565218e-07, + "loss": 0.2397, + "step": 28160 + }, + { + "epoch": 1.9541036202037718, + "grad_norm": 1.28125, + "learning_rate": 1.0104347826086958e-07, + "loss": 0.2082, + "step": 28170 + }, + { + "epoch": 1.9547973119445046, + "grad_norm": 1.140625, + "learning_rate": 9.930434782608696e-08, + "loss": 0.2713, + "step": 28180 + }, + { + "epoch": 1.9554910036852373, + "grad_norm": 1.25, + "learning_rate": 9.756521739130436e-08, + "loss": 0.2822, + "step": 28190 + }, + { + "epoch": 1.95618469542597, + "grad_norm": 1.296875, + "learning_rate": 9.582608695652174e-08, + "loss": 0.2366, + "step": 28200 + }, + { + "epoch": 1.9568783871667028, + "grad_norm": 1.734375, + "learning_rate": 9.408695652173914e-08, + "loss": 0.2482, + "step": 28210 + }, + { + "epoch": 1.9575720789074356, + "grad_norm": 1.1171875, + "learning_rate": 9.234782608695653e-08, + "loss": 0.2706, + "step": 28220 + }, + { + "epoch": 1.9582657706481683, + "grad_norm": 1.203125, + "learning_rate": 9.060869565217392e-08, + "loss": 0.2504, + "step": 28230 + }, + { + "epoch": 1.958959462388901, + "grad_norm": 0.8984375, + "learning_rate": 8.886956521739131e-08, + "loss": 0.3229, + "step": 28240 + }, + { + "epoch": 1.9596531541296336, + "grad_norm": 1.09375, + "learning_rate": 8.71304347826087e-08, + "loss": 0.2716, + "step": 28250 + }, + { + "epoch": 1.9603468458703663, + "grad_norm": 1.6640625, + "learning_rate": 8.539130434782609e-08, + "loss": 0.2757, + "step": 28260 + }, + { + "epoch": 1.961040537611099, + "grad_norm": 1.21875, + "learning_rate": 8.365217391304349e-08, + "loss": 0.2666, + "step": 28270 + }, + { + "epoch": 1.9617342293518316, + "grad_norm": 1.203125, + "learning_rate": 8.191304347826089e-08, + "loss": 0.2189, + "step": 28280 + }, + { + "epoch": 1.9624279210925644, + "grad_norm": 1.3828125, + "learning_rate": 8.017391304347827e-08, + "loss": 0.2109, + "step": 28290 + }, + { + "epoch": 1.9631216128332971, + "grad_norm": 1.1640625, + "learning_rate": 7.843478260869565e-08, + "loss": 0.2154, + "step": 28300 + }, + { + "epoch": 1.9638153045740299, + "grad_norm": 1.046875, + "learning_rate": 7.669565217391305e-08, + "loss": 0.2053, + "step": 28310 + }, + { + "epoch": 1.9645089963147626, + "grad_norm": 1.171875, + "learning_rate": 7.495652173913045e-08, + "loss": 0.3179, + "step": 28320 + }, + { + "epoch": 1.9652026880554954, + "grad_norm": 1.234375, + "learning_rate": 7.321739130434783e-08, + "loss": 0.2356, + "step": 28330 + }, + { + "epoch": 1.965896379796228, + "grad_norm": 1.3203125, + "learning_rate": 7.147826086956522e-08, + "loss": 0.223, + "step": 28340 + }, + { + "epoch": 1.9665900715369609, + "grad_norm": 1.2109375, + "learning_rate": 6.973913043478262e-08, + "loss": 0.27, + "step": 28350 + }, + { + "epoch": 1.9672837632776936, + "grad_norm": 1.0859375, + "learning_rate": 6.8e-08, + "loss": 0.2565, + "step": 28360 + }, + { + "epoch": 1.9679774550184261, + "grad_norm": 1.015625, + "learning_rate": 6.62608695652174e-08, + "loss": 0.2209, + "step": 28370 + }, + { + "epoch": 1.9686711467591589, + "grad_norm": 1.1484375, + "learning_rate": 6.452173913043478e-08, + "loss": 0.2149, + "step": 28380 + }, + { + "epoch": 1.9693648384998916, + "grad_norm": 1.3046875, + "learning_rate": 6.278260869565218e-08, + "loss": 0.2276, + "step": 28390 + }, + { + "epoch": 1.9700585302406242, + "grad_norm": 1.5, + "learning_rate": 6.104347826086956e-08, + "loss": 0.2533, + "step": 28400 + }, + { + "epoch": 1.970752221981357, + "grad_norm": 1.296875, + "learning_rate": 5.930434782608696e-08, + "loss": 0.2304, + "step": 28410 + }, + { + "epoch": 1.9714459137220897, + "grad_norm": 0.97265625, + "learning_rate": 5.756521739130435e-08, + "loss": 0.2486, + "step": 28420 + }, + { + "epoch": 1.9721396054628224, + "grad_norm": 1.0, + "learning_rate": 5.5826086956521744e-08, + "loss": 0.234, + "step": 28430 + }, + { + "epoch": 1.9728332972035552, + "grad_norm": 1.421875, + "learning_rate": 5.4086956521739135e-08, + "loss": 0.2633, + "step": 28440 + }, + { + "epoch": 1.973526988944288, + "grad_norm": 1.046875, + "learning_rate": 5.2347826086956526e-08, + "loss": 0.2266, + "step": 28450 + }, + { + "epoch": 1.9742206806850207, + "grad_norm": 1.3359375, + "learning_rate": 5.0608695652173917e-08, + "loss": 0.2356, + "step": 28460 + }, + { + "epoch": 1.9749143724257534, + "grad_norm": 1.0234375, + "learning_rate": 4.886956521739131e-08, + "loss": 0.2128, + "step": 28470 + }, + { + "epoch": 1.9756080641664862, + "grad_norm": 1.09375, + "learning_rate": 4.71304347826087e-08, + "loss": 0.2174, + "step": 28480 + }, + { + "epoch": 1.9763017559072187, + "grad_norm": 1.3046875, + "learning_rate": 4.5391304347826096e-08, + "loss": 0.2517, + "step": 28490 + }, + { + "epoch": 1.9769954476479514, + "grad_norm": 1.25, + "learning_rate": 4.365217391304348e-08, + "loss": 0.227, + "step": 28500 + } + ], + "logging_steps": 10, + "max_steps": 28750, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.3842958733451e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}