{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.013256206314774645,
"eval_steps": 500,
"global_step": 110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012051096649795132,
"grad_norm": 0.7307866811752319,
"learning_rate": 4e-05,
"loss": 1.2502,
"step": 1
},
{
"epoch": 0.00024102193299590263,
"grad_norm": 0.7944597601890564,
"learning_rate": 8e-05,
"loss": 1.0923,
"step": 2
},
{
"epoch": 0.00036153289949385393,
"grad_norm": 0.8116863965988159,
"learning_rate": 0.00012,
"loss": 1.4372,
"step": 3
},
{
"epoch": 0.00048204386599180526,
"grad_norm": 0.6883746981620789,
"learning_rate": 0.00016,
"loss": 1.2503,
"step": 4
},
{
"epoch": 0.0006025548324897565,
"grad_norm": 0.6956741809844971,
"learning_rate": 0.0002,
"loss": 1.135,
"step": 5
},
{
"epoch": 0.0007230657989877079,
"grad_norm": 0.7852187752723694,
"learning_rate": 0.0001980952380952381,
"loss": 1.0132,
"step": 6
},
{
"epoch": 0.0008435767654856592,
"grad_norm": 0.4692592918872833,
"learning_rate": 0.0001961904761904762,
"loss": 0.7826,
"step": 7
},
{
"epoch": 0.0009640877319836105,
"grad_norm": 0.27623867988586426,
"learning_rate": 0.0001942857142857143,
"loss": 0.664,
"step": 8
},
{
"epoch": 0.0010845986984815619,
"grad_norm": 0.21396474540233612,
"learning_rate": 0.0001923809523809524,
"loss": 0.9179,
"step": 9
},
{
"epoch": 0.001205109664979513,
"grad_norm": 0.1967506855726242,
"learning_rate": 0.00019047619047619048,
"loss": 0.6711,
"step": 10
},
{
"epoch": 0.0013256206314774645,
"grad_norm": 0.20955657958984375,
"learning_rate": 0.00018857142857142857,
"loss": 0.8331,
"step": 11
},
{
"epoch": 0.0014461315979754157,
"grad_norm": 0.2680826485157013,
"learning_rate": 0.0001866666666666667,
"loss": 0.8829,
"step": 12
},
{
"epoch": 0.0015666425644733672,
"grad_norm": 0.25052550435066223,
"learning_rate": 0.00018476190476190478,
"loss": 0.7536,
"step": 13
},
{
"epoch": 0.0016871535309713184,
"grad_norm": 0.27972114086151123,
"learning_rate": 0.00018285714285714286,
"loss": 0.8129,
"step": 14
},
{
"epoch": 0.0018076644974692696,
"grad_norm": 0.23484091460704803,
"learning_rate": 0.00018095238095238095,
"loss": 0.8715,
"step": 15
},
{
"epoch": 0.001928175463967221,
"grad_norm": 0.2122180461883545,
"learning_rate": 0.00017904761904761907,
"loss": 0.9421,
"step": 16
},
{
"epoch": 0.0020486864304651723,
"grad_norm": 0.19645242393016815,
"learning_rate": 0.00017714285714285713,
"loss": 0.6596,
"step": 17
},
{
"epoch": 0.0021691973969631237,
"grad_norm": 0.21645572781562805,
"learning_rate": 0.00017523809523809525,
"loss": 0.764,
"step": 18
},
{
"epoch": 0.002289708363461075,
"grad_norm": 0.15910537540912628,
"learning_rate": 0.00017333333333333334,
"loss": 0.7156,
"step": 19
},
{
"epoch": 0.002410219329959026,
"grad_norm": 0.1565140336751938,
"learning_rate": 0.00017142857142857143,
"loss": 0.6023,
"step": 20
},
{
"epoch": 0.0025307302964569776,
"grad_norm": 0.17277204990386963,
"learning_rate": 0.00016952380952380954,
"loss": 0.5594,
"step": 21
},
{
"epoch": 0.002651241262954929,
"grad_norm": 0.17323294281959534,
"learning_rate": 0.00016761904761904763,
"loss": 0.681,
"step": 22
},
{
"epoch": 0.00277175222945288,
"grad_norm": 0.1539444774389267,
"learning_rate": 0.00016571428571428575,
"loss": 0.7535,
"step": 23
},
{
"epoch": 0.0028922631959508315,
"grad_norm": 0.16936075687408447,
"learning_rate": 0.0001638095238095238,
"loss": 0.5524,
"step": 24
},
{
"epoch": 0.003012774162448783,
"grad_norm": 0.1893339455127716,
"learning_rate": 0.00016190476190476192,
"loss": 0.802,
"step": 25
},
{
"epoch": 0.0031332851289467343,
"grad_norm": 0.17078277468681335,
"learning_rate": 0.00016,
"loss": 0.677,
"step": 26
},
{
"epoch": 0.0032537960954446853,
"grad_norm": 0.1889839768409729,
"learning_rate": 0.0001580952380952381,
"loss": 0.5551,
"step": 27
},
{
"epoch": 0.003374307061942637,
"grad_norm": 0.2148876190185547,
"learning_rate": 0.0001561904761904762,
"loss": 0.6161,
"step": 28
},
{
"epoch": 0.0034948180284405882,
"grad_norm": 0.1392691731452942,
"learning_rate": 0.0001542857142857143,
"loss": 0.5348,
"step": 29
},
{
"epoch": 0.0036153289949385392,
"grad_norm": 0.17458081245422363,
"learning_rate": 0.00015238095238095237,
"loss": 0.7913,
"step": 30
},
{
"epoch": 0.0037358399614364907,
"grad_norm": 0.1562052071094513,
"learning_rate": 0.00015047619047619048,
"loss": 0.8158,
"step": 31
},
{
"epoch": 0.003856350927934442,
"grad_norm": 0.1435224562883377,
"learning_rate": 0.00014857142857142857,
"loss": 0.7528,
"step": 32
},
{
"epoch": 0.0039768618944323935,
"grad_norm": 0.14048519730567932,
"learning_rate": 0.00014666666666666666,
"loss": 0.6955,
"step": 33
},
{
"epoch": 0.0040973728609303445,
"grad_norm": 0.16571789979934692,
"learning_rate": 0.00014476190476190475,
"loss": 0.5537,
"step": 34
},
{
"epoch": 0.0042178838274282955,
"grad_norm": 0.165692538022995,
"learning_rate": 0.00014285714285714287,
"loss": 0.7134,
"step": 35
},
{
"epoch": 0.004338394793926247,
"grad_norm": 0.1822883039712906,
"learning_rate": 0.00014095238095238096,
"loss": 0.5432,
"step": 36
},
{
"epoch": 0.004458905760424198,
"grad_norm": 0.1414850652217865,
"learning_rate": 0.00013904761904761905,
"loss": 0.6703,
"step": 37
},
{
"epoch": 0.00457941672692215,
"grad_norm": 0.15394528210163116,
"learning_rate": 0.00013714285714285716,
"loss": 0.6561,
"step": 38
},
{
"epoch": 0.004699927693420101,
"grad_norm": 0.1435491144657135,
"learning_rate": 0.00013523809523809525,
"loss": 0.5644,
"step": 39
},
{
"epoch": 0.004820438659918052,
"grad_norm": 0.16691423952579498,
"learning_rate": 0.00013333333333333334,
"loss": 0.7856,
"step": 40
},
{
"epoch": 0.004940949626416004,
"grad_norm": 0.14211532473564148,
"learning_rate": 0.00013142857142857143,
"loss": 0.6399,
"step": 41
},
{
"epoch": 0.005061460592913955,
"grad_norm": 0.18083994090557098,
"learning_rate": 0.00012952380952380954,
"loss": 0.715,
"step": 42
},
{
"epoch": 0.005181971559411906,
"grad_norm": 0.15873770415782928,
"learning_rate": 0.0001276190476190476,
"loss": 0.7614,
"step": 43
},
{
"epoch": 0.005302482525909858,
"grad_norm": 0.14993314445018768,
"learning_rate": 0.00012571428571428572,
"loss": 0.6105,
"step": 44
},
{
"epoch": 0.005422993492407809,
"grad_norm": 0.18779931962490082,
"learning_rate": 0.0001238095238095238,
"loss": 1.0716,
"step": 45
},
{
"epoch": 0.00554350445890576,
"grad_norm": 0.15650784969329834,
"learning_rate": 0.00012190476190476193,
"loss": 0.738,
"step": 46
},
{
"epoch": 0.005664015425403712,
"grad_norm": 0.1431063711643219,
"learning_rate": 0.00012,
"loss": 0.5219,
"step": 47
},
{
"epoch": 0.005784526391901663,
"grad_norm": 0.1359708309173584,
"learning_rate": 0.0001180952380952381,
"loss": 0.5886,
"step": 48
},
{
"epoch": 0.005905037358399614,
"grad_norm": 0.16217978298664093,
"learning_rate": 0.00011619047619047621,
"loss": 0.7634,
"step": 49
},
{
"epoch": 0.006025548324897566,
"grad_norm": 0.16889767348766327,
"learning_rate": 0.00011428571428571428,
"loss": 0.7717,
"step": 50
},
{
"epoch": 0.006146059291395517,
"grad_norm": 0.21841812133789062,
"learning_rate": 0.00011238095238095239,
"loss": 0.937,
"step": 51
},
{
"epoch": 0.006266570257893469,
"grad_norm": 0.17994704842567444,
"learning_rate": 0.00011047619047619049,
"loss": 0.8443,
"step": 52
},
{
"epoch": 0.00638708122439142,
"grad_norm": 0.15717928111553192,
"learning_rate": 0.00010857142857142856,
"loss": 0.7624,
"step": 53
},
{
"epoch": 0.006507592190889371,
"grad_norm": 0.16110721230506897,
"learning_rate": 0.00010666666666666667,
"loss": 0.7228,
"step": 54
},
{
"epoch": 0.0066281031573873226,
"grad_norm": 0.14764989912509918,
"learning_rate": 0.00010476190476190477,
"loss": 0.6782,
"step": 55
},
{
"epoch": 0.006748614123885274,
"grad_norm": 0.1577727496623993,
"learning_rate": 0.00010285714285714286,
"loss": 0.7367,
"step": 56
},
{
"epoch": 0.006869125090383225,
"grad_norm": 0.17438825964927673,
"learning_rate": 0.00010095238095238096,
"loss": 0.65,
"step": 57
},
{
"epoch": 0.0069896360568811764,
"grad_norm": 0.1775740683078766,
"learning_rate": 9.904761904761905e-05,
"loss": 0.7797,
"step": 58
},
{
"epoch": 0.0071101470233791274,
"grad_norm": 0.18453216552734375,
"learning_rate": 9.714285714285715e-05,
"loss": 0.9153,
"step": 59
},
{
"epoch": 0.0072306579898770785,
"grad_norm": 0.16022688150405884,
"learning_rate": 9.523809523809524e-05,
"loss": 0.7798,
"step": 60
},
{
"epoch": 0.00735116895637503,
"grad_norm": 0.16944445669651031,
"learning_rate": 9.333333333333334e-05,
"loss": 0.8193,
"step": 61
},
{
"epoch": 0.007471679922872981,
"grad_norm": 0.14207735657691956,
"learning_rate": 9.142857142857143e-05,
"loss": 0.5361,
"step": 62
},
{
"epoch": 0.007592190889370932,
"grad_norm": 0.16854678094387054,
"learning_rate": 8.952380952380953e-05,
"loss": 0.7976,
"step": 63
},
{
"epoch": 0.007712701855868884,
"grad_norm": 0.17764142155647278,
"learning_rate": 8.761904761904762e-05,
"loss": 0.6938,
"step": 64
},
{
"epoch": 0.007833212822366835,
"grad_norm": 0.21041354537010193,
"learning_rate": 8.571428571428571e-05,
"loss": 0.8384,
"step": 65
},
{
"epoch": 0.007953723788864787,
"grad_norm": 0.18576891720294952,
"learning_rate": 8.380952380952382e-05,
"loss": 0.6401,
"step": 66
},
{
"epoch": 0.008074234755362737,
"grad_norm": 0.20624496042728424,
"learning_rate": 8.19047619047619e-05,
"loss": 0.7563,
"step": 67
},
{
"epoch": 0.008194745721860689,
"grad_norm": 0.18236589431762695,
"learning_rate": 8e-05,
"loss": 0.748,
"step": 68
},
{
"epoch": 0.008315256688358641,
"grad_norm": 0.15884153544902802,
"learning_rate": 7.80952380952381e-05,
"loss": 0.649,
"step": 69
},
{
"epoch": 0.008435767654856591,
"grad_norm": 0.18527762591838837,
"learning_rate": 7.619047619047618e-05,
"loss": 0.5163,
"step": 70
},
{
"epoch": 0.008556278621354543,
"grad_norm": 0.166184663772583,
"learning_rate": 7.428571428571429e-05,
"loss": 0.7672,
"step": 71
},
{
"epoch": 0.008676789587852495,
"grad_norm": 0.19784916937351227,
"learning_rate": 7.238095238095238e-05,
"loss": 0.7482,
"step": 72
},
{
"epoch": 0.008797300554350447,
"grad_norm": 0.16908536851406097,
"learning_rate": 7.047619047619048e-05,
"loss": 0.7461,
"step": 73
},
{
"epoch": 0.008917811520848397,
"grad_norm": 0.18411517143249512,
"learning_rate": 6.857142857142858e-05,
"loss": 0.5697,
"step": 74
},
{
"epoch": 0.009038322487346349,
"grad_norm": 0.15351906418800354,
"learning_rate": 6.666666666666667e-05,
"loss": 0.6597,
"step": 75
},
{
"epoch": 0.0091588334538443,
"grad_norm": 0.17720364034175873,
"learning_rate": 6.476190476190477e-05,
"loss": 0.808,
"step": 76
},
{
"epoch": 0.00927934442034225,
"grad_norm": 0.18325303494930267,
"learning_rate": 6.285714285714286e-05,
"loss": 0.7917,
"step": 77
},
{
"epoch": 0.009399855386840203,
"grad_norm": 0.1679506152868271,
"learning_rate": 6.0952380952380964e-05,
"loss": 0.6326,
"step": 78
},
{
"epoch": 0.009520366353338154,
"grad_norm": 0.19260190427303314,
"learning_rate": 5.904761904761905e-05,
"loss": 0.5601,
"step": 79
},
{
"epoch": 0.009640877319836105,
"grad_norm": 0.15009605884552002,
"learning_rate": 5.714285714285714e-05,
"loss": 0.6072,
"step": 80
},
{
"epoch": 0.009761388286334056,
"grad_norm": 0.15776121616363525,
"learning_rate": 5.5238095238095244e-05,
"loss": 0.6753,
"step": 81
},
{
"epoch": 0.009881899252832008,
"grad_norm": 0.18575388193130493,
"learning_rate": 5.333333333333333e-05,
"loss": 0.6219,
"step": 82
},
{
"epoch": 0.010002410219329958,
"grad_norm": 0.21978633105754852,
"learning_rate": 5.142857142857143e-05,
"loss": 0.8581,
"step": 83
},
{
"epoch": 0.01012292118582791,
"grad_norm": 0.1704164743423462,
"learning_rate": 4.9523809523809525e-05,
"loss": 0.6461,
"step": 84
},
{
"epoch": 0.010243432152325862,
"grad_norm": 0.18057820200920105,
"learning_rate": 4.761904761904762e-05,
"loss": 0.7416,
"step": 85
},
{
"epoch": 0.010363943118823812,
"grad_norm": 0.15225447714328766,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.4868,
"step": 86
},
{
"epoch": 0.010484454085321764,
"grad_norm": 0.17193946242332458,
"learning_rate": 4.380952380952381e-05,
"loss": 0.8092,
"step": 87
},
{
"epoch": 0.010604965051819716,
"grad_norm": 0.194380983710289,
"learning_rate": 4.190476190476191e-05,
"loss": 0.8461,
"step": 88
},
{
"epoch": 0.010725476018317666,
"grad_norm": 0.2139783650636673,
"learning_rate": 4e-05,
"loss": 0.6548,
"step": 89
},
{
"epoch": 0.010845986984815618,
"grad_norm": 0.16700893640518188,
"learning_rate": 3.809523809523809e-05,
"loss": 0.5584,
"step": 90
},
{
"epoch": 0.01096649795131357,
"grad_norm": 0.1971975564956665,
"learning_rate": 3.619047619047619e-05,
"loss": 0.8535,
"step": 91
},
{
"epoch": 0.01108700891781152,
"grad_norm": 0.19667109847068787,
"learning_rate": 3.428571428571429e-05,
"loss": 0.8635,
"step": 92
},
{
"epoch": 0.011207519884309472,
"grad_norm": 0.18818983435630798,
"learning_rate": 3.2380952380952386e-05,
"loss": 0.8435,
"step": 93
},
{
"epoch": 0.011328030850807424,
"grad_norm": 0.16365501284599304,
"learning_rate": 3.0476190476190482e-05,
"loss": 0.6243,
"step": 94
},
{
"epoch": 0.011448541817305374,
"grad_norm": 0.20358283817768097,
"learning_rate": 2.857142857142857e-05,
"loss": 0.6483,
"step": 95
},
{
"epoch": 0.011569052783803326,
"grad_norm": 0.17696398496627808,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6057,
"step": 96
},
{
"epoch": 0.011689563750301278,
"grad_norm": 0.15508583188056946,
"learning_rate": 2.4761904761904762e-05,
"loss": 0.524,
"step": 97
},
{
"epoch": 0.011810074716799228,
"grad_norm": 0.18458549678325653,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.8364,
"step": 98
},
{
"epoch": 0.01193058568329718,
"grad_norm": 0.1944003403186798,
"learning_rate": 2.0952380952380954e-05,
"loss": 0.5383,
"step": 99
},
{
"epoch": 0.012051096649795132,
"grad_norm": 0.4217074513435364,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.6774,
"step": 100
},
{
"epoch": 0.012171607616293083,
"grad_norm": 0.20350486040115356,
"learning_rate": 1.7142857142857145e-05,
"loss": 0.6871,
"step": 101
},
{
"epoch": 0.012292118582791034,
"grad_norm": 0.19154471158981323,
"learning_rate": 1.5238095238095241e-05,
"loss": 0.7226,
"step": 102
},
{
"epoch": 0.012412629549288986,
"grad_norm": 0.17253194749355316,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7514,
"step": 103
},
{
"epoch": 0.012533140515786937,
"grad_norm": 0.14699283242225647,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.5358,
"step": 104
},
{
"epoch": 0.012653651482284888,
"grad_norm": 0.19192050397396088,
"learning_rate": 9.523809523809523e-06,
"loss": 0.9153,
"step": 105
},
{
"epoch": 0.01277416244878284,
"grad_norm": 0.15646027028560638,
"learning_rate": 7.6190476190476205e-06,
"loss": 0.5182,
"step": 106
},
{
"epoch": 0.012894673415280791,
"grad_norm": 0.18160918354988098,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.5822,
"step": 107
},
{
"epoch": 0.013015184381778741,
"grad_norm": 0.19203059375286102,
"learning_rate": 3.8095238095238102e-06,
"loss": 0.7678,
"step": 108
},
{
"epoch": 0.013135695348276693,
"grad_norm": 0.20908264815807343,
"learning_rate": 1.9047619047619051e-06,
"loss": 0.8563,
"step": 109
},
{
"epoch": 0.013256206314774645,
"grad_norm": 0.16366459429264069,
"learning_rate": 0.0,
"loss": 0.4258,
"step": 110
}
],
"logging_steps": 1,
"max_steps": 110,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 55,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.103938402981235e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}