Model save

Browse files

Files changed (8) hide show

README.md +58 -0
all_results.json +9 -0
config.json +1 -1
generation_config.json +14 -0
model.safetensors +1 -1
runs/Mar11_17-20-56_h100-m-07/events.out.tfevents.1741674065.h100-m-07.1151483.0 +2 -2
train_results.json +9 -0
trainer_state.json +756 -0

README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+---
+base_model: Qwen/Qwen2.5-0.5B-Instruct
+library_name: transformers
+model_name: logsQwen2.5-0.5B-Instruct-math-gsm8k
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+# Model Card for logsQwen2.5-0.5B-Instruct-math-gsm8k
+This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="neurocoder/logsQwen2.5-0.5B-Instruct-math-gsm8k", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.14.0
+- Transformers: 4.48.2
+- PyTorch: 2.5.1
+- Datasets: 3.2.0
+- Tokenizers: 0.21.0
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9995119570522206,
+    "total_flos": 3518281600204800.0,
+    "train_loss": 1.3758189086802304,
+    "train_runtime": 549.4956,
+    "train_samples": 7473,
+    "train_samples_per_second": 14.915,
+    "train_steps_per_second": 0.932
+}

config.json CHANGED Viewed

@@ -23,7 +23,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.2",
-  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.2",
+  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.48.2"
+}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:441f88c7cb29db407e074ac4cf0d227a254396cb3c5365a1064679d4984902b7
 size 988097824

 version https://git-lfs.github.com/spec/v1
+oid sha256:266a2dc7d9bc80e446f38e6d728f239890773ce51e34e46c8ca42f05348f3401
 size 988097824

runs/Mar11_17-20-56_h100-m-07/events.out.tfevents.1741674065.h100-m-07.1151483.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2bc4c472859d4dc42c248242abdd5c02ea8076f4d8a5eea63d8039a69c52760
-size 26545

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff5a893d14961013a6ede0adc3e69aae937f25ffbabeaba1a3dfe53267e64545
+size 27321

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9995119570522206,
+    "total_flos": 3518281600204800.0,
+    "train_loss": 1.3758189086802304,
+    "train_runtime": 549.4956,
+    "train_samples": 7473,
+    "train_samples_per_second": 14.915,
+    "train_steps_per_second": 0.932
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,756 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9995119570522206,
+  "eval_steps": 500,
+  "global_step": 512,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009760858955588092,
+      "grad_norm": 23.375,
+      "learning_rate": 0.0002,
+      "loss": 2.8524,
+      "step": 5
+    },
+    {
+      "epoch": 0.019521717911176184,
+      "grad_norm": 12.9375,
+      "learning_rate": 0.0002,
+      "loss": 1.7471,
+      "step": 10
+    },
+    {
+      "epoch": 0.029282576866764276,
+      "grad_norm": 5.34375,
+      "learning_rate": 0.0002,
+      "loss": 1.4572,
+      "step": 15
+    },
+    {
+      "epoch": 0.03904343582235237,
+      "grad_norm": 16.625,
+      "learning_rate": 0.0002,
+      "loss": 1.3997,
+      "step": 20
+    },
+    {
+      "epoch": 0.04880429477794046,
+      "grad_norm": 6.875,
+      "learning_rate": 0.0002,
+      "loss": 1.5013,
+      "step": 25
+    },
+    {
+      "epoch": 0.05856515373352855,
+      "grad_norm": 5.25,
+      "learning_rate": 0.0002,
+      "loss": 1.4325,
+      "step": 30
+    },
+    {
+      "epoch": 0.06832601268911664,
+      "grad_norm": 4.5,
+      "learning_rate": 0.0002,
+      "loss": 1.3938,
+      "step": 35
+    },
+    {
+      "epoch": 0.07808687164470474,
+      "grad_norm": 4.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.4496,
+      "step": 40
+    },
+    {
+      "epoch": 0.08784773060029283,
+      "grad_norm": 4.90625,
+      "learning_rate": 0.0002,
+      "loss": 1.4919,
+      "step": 45
+    },
+    {
+      "epoch": 0.09760858955588092,
+      "grad_norm": 3.96875,
+      "learning_rate": 0.0002,
+      "loss": 1.4757,
+      "step": 50
+    },
+    {
+      "epoch": 0.10736944851146901,
+      "grad_norm": 5.8125,
+      "learning_rate": 0.0002,
+      "loss": 1.5236,
+      "step": 55
+    },
+    {
+      "epoch": 0.1171303074670571,
+      "grad_norm": 3.71875,
+      "learning_rate": 0.0002,
+      "loss": 1.464,
+      "step": 60
+    },
+    {
+      "epoch": 0.1268911664226452,
+      "grad_norm": 5.0,
+      "learning_rate": 0.0002,
+      "loss": 1.4982,
+      "step": 65
+    },
+    {
+      "epoch": 0.1366520253782333,
+      "grad_norm": 4.15625,
+      "learning_rate": 0.0002,
+      "loss": 1.3583,
+      "step": 70
+    },
+    {
+      "epoch": 0.14641288433382138,
+      "grad_norm": 3.203125,
+      "learning_rate": 0.0002,
+      "loss": 1.3943,
+      "step": 75
+    },
+    {
+      "epoch": 0.15617374328940947,
+      "grad_norm": 3.796875,
+      "learning_rate": 0.0002,
+      "loss": 1.4915,
+      "step": 80
+    },
+    {
+      "epoch": 0.16593460224499756,
+      "grad_norm": 4.15625,
+      "learning_rate": 0.0002,
+      "loss": 1.5084,
+      "step": 85
+    },
+    {
+      "epoch": 0.17569546120058566,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.0002,
+      "loss": 1.4788,
+      "step": 90
+    },
+    {
+      "epoch": 0.18545632015617375,
+      "grad_norm": 7.28125,
+      "learning_rate": 0.0002,
+      "loss": 1.4738,
+      "step": 95
+    },
+    {
+      "epoch": 0.19521717911176184,
+      "grad_norm": 3.734375,
+      "learning_rate": 0.0002,
+      "loss": 1.4498,
+      "step": 100
+    },
+    {
+      "epoch": 0.20497803806734993,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0002,
+      "loss": 1.5028,
+      "step": 105
+    },
+    {
+      "epoch": 0.21473889702293802,
+      "grad_norm": 7.3125,
+      "learning_rate": 0.0002,
+      "loss": 1.4605,
+      "step": 110
+    },
+    {
+      "epoch": 0.22449975597852612,
+      "grad_norm": 3.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.4047,
+      "step": 115
+    },
+    {
+      "epoch": 0.2342606149341142,
+      "grad_norm": 3.640625,
+      "learning_rate": 0.0002,
+      "loss": 1.4216,
+      "step": 120
+    },
+    {
+      "epoch": 0.2440214738897023,
+      "grad_norm": 3.5,
+      "learning_rate": 0.0002,
+      "loss": 1.4401,
+      "step": 125
+    },
+    {
+      "epoch": 0.2537823328452904,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0002,
+      "loss": 1.4301,
+      "step": 130
+    },
+    {
+      "epoch": 0.2635431918008785,
+      "grad_norm": 2.96875,
+      "learning_rate": 0.0002,
+      "loss": 1.4325,
+      "step": 135
+    },
+    {
+      "epoch": 0.2733040507564666,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0002,
+      "loss": 1.3988,
+      "step": 140
+    },
+    {
+      "epoch": 0.28306490971205467,
+      "grad_norm": 3.234375,
+      "learning_rate": 0.0002,
+      "loss": 1.3768,
+      "step": 145
+    },
+    {
+      "epoch": 0.29282576866764276,
+      "grad_norm": 5.0,
+      "learning_rate": 0.0002,
+      "loss": 1.4328,
+      "step": 150
+    },
+    {
+      "epoch": 0.30258662762323085,
+      "grad_norm": 4.0625,
+      "learning_rate": 0.0002,
+      "loss": 1.4189,
+      "step": 155
+    },
+    {
+      "epoch": 0.31234748657881894,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0002,
+      "loss": 1.3605,
+      "step": 160
+    },
+    {
+      "epoch": 0.32210834553440704,
+      "grad_norm": 3.40625,
+      "learning_rate": 0.0002,
+      "loss": 1.4092,
+      "step": 165
+    },
+    {
+      "epoch": 0.33186920448999513,
+      "grad_norm": 3.421875,
+      "learning_rate": 0.0002,
+      "loss": 1.3854,
+      "step": 170
+    },
+    {
+      "epoch": 0.3416300634455832,
+      "grad_norm": 3.375,
+      "learning_rate": 0.0002,
+      "loss": 1.3923,
+      "step": 175
+    },
+    {
+      "epoch": 0.3513909224011713,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0002,
+      "loss": 1.3395,
+      "step": 180
+    },
+    {
+      "epoch": 0.3611517813567594,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0002,
+      "loss": 1.3532,
+      "step": 185
+    },
+    {
+      "epoch": 0.3709126403123475,
+      "grad_norm": 2.765625,
+      "learning_rate": 0.0002,
+      "loss": 1.3764,
+      "step": 190
+    },
+    {
+      "epoch": 0.3806734992679356,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.0002,
+      "loss": 1.3179,
+      "step": 195
+    },
+    {
+      "epoch": 0.3904343582235237,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0002,
+      "loss": 1.309,
+      "step": 200
+    },
+    {
+      "epoch": 0.4001952171791118,
+      "grad_norm": 2.9375,
+      "learning_rate": 0.0002,
+      "loss": 1.4215,
+      "step": 205
+    },
+    {
+      "epoch": 0.40995607613469986,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0002,
+      "loss": 1.4362,
+      "step": 210
+    },
+    {
+      "epoch": 0.41971693509028796,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0002,
+      "loss": 1.3835,
+      "step": 215
+    },
+    {
+      "epoch": 0.42947779404587605,
+      "grad_norm": 2.46875,
+      "learning_rate": 0.0002,
+      "loss": 1.3099,
+      "step": 220
+    },
+    {
+      "epoch": 0.43923865300146414,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0002,
+      "loss": 1.3739,
+      "step": 225
+    },
+    {
+      "epoch": 0.44899951195705223,
+      "grad_norm": 2.890625,
+      "learning_rate": 0.0002,
+      "loss": 1.3374,
+      "step": 230
+    },
+    {
+      "epoch": 0.4587603709126403,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.0002,
+      "loss": 1.3693,
+      "step": 235
+    },
+    {
+      "epoch": 0.4685212298682284,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0002,
+      "loss": 1.3478,
+      "step": 240
+    },
+    {
+      "epoch": 0.4782820888238165,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0002,
+      "loss": 1.3685,
+      "step": 245
+    },
+    {
+      "epoch": 0.4880429477794046,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.2963,
+      "step": 250
+    },
+    {
+      "epoch": 0.4978038067349927,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0002,
+      "loss": 1.3452,
+      "step": 255
+    },
+    {
+      "epoch": 0.5075646656905808,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0002,
+      "loss": 1.3321,
+      "step": 260
+    },
+    {
+      "epoch": 0.5173255246461689,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0002,
+      "loss": 1.3702,
+      "step": 265
+    },
+    {
+      "epoch": 0.527086383601757,
+      "grad_norm": 3.296875,
+      "learning_rate": 0.0002,
+      "loss": 1.4063,
+      "step": 270
+    },
+    {
+      "epoch": 0.5368472425573451,
+      "grad_norm": 2.59375,
+      "learning_rate": 0.0002,
+      "loss": 1.2899,
+      "step": 275
+    },
+    {
+      "epoch": 0.5466081015129332,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.0002,
+      "loss": 1.309,
+      "step": 280
+    },
+    {
+      "epoch": 0.5563689604685212,
+      "grad_norm": 2.5,
+      "learning_rate": 0.0002,
+      "loss": 1.3354,
+      "step": 285
+    },
+    {
+      "epoch": 0.5661298194241093,
+      "grad_norm": 2.578125,
+      "learning_rate": 0.0002,
+      "loss": 1.3682,
+      "step": 290
+    },
+    {
+      "epoch": 0.5758906783796974,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0002,
+      "loss": 1.3351,
+      "step": 295
+    },
+    {
+      "epoch": 0.5856515373352855,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0002,
+      "loss": 1.3483,
+      "step": 300
+    },
+    {
+      "epoch": 0.5954123962908736,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0002,
+      "loss": 1.292,
+      "step": 305
+    },
+    {
+      "epoch": 0.6051732552464617,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.3021,
+      "step": 310
+    },
+    {
+      "epoch": 0.6149341142020498,
+      "grad_norm": 2.453125,
+      "learning_rate": 0.0002,
+      "loss": 1.3805,
+      "step": 315
+    },
+    {
+      "epoch": 0.6246949731576379,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0002,
+      "loss": 1.3212,
+      "step": 320
+    },
+    {
+      "epoch": 0.634455832113226,
+      "grad_norm": 2.40625,
+      "learning_rate": 0.0002,
+      "loss": 1.2793,
+      "step": 325
+    },
+    {
+      "epoch": 0.6442166910688141,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.3733,
+      "step": 330
+    },
+    {
+      "epoch": 0.6539775500244022,
+      "grad_norm": 2.53125,
+      "learning_rate": 0.0002,
+      "loss": 1.2849,
+      "step": 335
+    },
+    {
+      "epoch": 0.6637384089799903,
+      "grad_norm": 2.59375,
+      "learning_rate": 0.0002,
+      "loss": 1.3962,
+      "step": 340
+    },
+    {
+      "epoch": 0.6734992679355783,
+      "grad_norm": 2.796875,
+      "learning_rate": 0.0002,
+      "loss": 1.3527,
+      "step": 345
+    },
+    {
+      "epoch": 0.6832601268911664,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.0002,
+      "loss": 1.3286,
+      "step": 350
+    },
+    {
+      "epoch": 0.6930209858467545,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.0002,
+      "loss": 1.3914,
+      "step": 355
+    },
+    {
+      "epoch": 0.7027818448023426,
+      "grad_norm": 5.125,
+      "learning_rate": 0.0002,
+      "loss": 1.3359,
+      "step": 360
+    },
+    {
+      "epoch": 0.7125427037579307,
+      "grad_norm": 2.609375,
+      "learning_rate": 0.0002,
+      "loss": 1.3115,
+      "step": 365
+    },
+    {
+      "epoch": 0.7223035627135188,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0002,
+      "loss": 1.3279,
+      "step": 370
+    },
+    {
+      "epoch": 0.7320644216691069,
+      "grad_norm": 7.15625,
+      "learning_rate": 0.0002,
+      "loss": 1.2885,
+      "step": 375
+    },
+    {
+      "epoch": 0.741825280624695,
+      "grad_norm": 3.390625,
+      "learning_rate": 0.0002,
+      "loss": 1.322,
+      "step": 380
+    },
+    {
+      "epoch": 0.7515861395802831,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.0002,
+      "loss": 1.3047,
+      "step": 385
+    },
+    {
+      "epoch": 0.7613469985358712,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0002,
+      "loss": 1.2534,
+      "step": 390
+    },
+    {
+      "epoch": 0.7711078574914593,
+      "grad_norm": 2.734375,
+      "learning_rate": 0.0002,
+      "loss": 1.2539,
+      "step": 395
+    },
+    {
+      "epoch": 0.7808687164470474,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0002,
+      "loss": 1.277,
+      "step": 400
+    },
+    {
+      "epoch": 0.7906295754026355,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0002,
+      "loss": 1.2932,
+      "step": 405
+    },
+    {
+      "epoch": 0.8003904343582235,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.0002,
+      "loss": 1.2782,
+      "step": 410
+    },
+    {
+      "epoch": 0.8101512933138116,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0002,
+      "loss": 1.294,
+      "step": 415
+    },
+    {
+      "epoch": 0.8199121522693997,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.0002,
+      "loss": 1.2311,
+      "step": 420
+    },
+    {
+      "epoch": 0.8296730112249878,
+      "grad_norm": 2.28125,
+      "learning_rate": 0.0002,
+      "loss": 1.3014,
+      "step": 425
+    },
+    {
+      "epoch": 0.8394338701805759,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.0002,
+      "loss": 1.325,
+      "step": 430
+    },
+    {
+      "epoch": 0.849194729136164,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0002,
+      "loss": 1.187,
+      "step": 435
+    },
+    {
+      "epoch": 0.8589555880917521,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0002,
+      "loss": 1.2626,
+      "step": 440
+    },
+    {
+      "epoch": 0.8687164470473402,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0002,
+      "loss": 1.1967,
+      "step": 445
+    },
+    {
+      "epoch": 0.8784773060029283,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0002,
+      "loss": 1.3065,
+      "step": 450
+    },
+    {
+      "epoch": 0.8882381649585164,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0002,
+      "loss": 1.2892,
+      "step": 455
+    },
+    {
+      "epoch": 0.8979990239141045,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.0002,
+      "loss": 1.2817,
+      "step": 460
+    },
+    {
+      "epoch": 0.9077598828696926,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.0002,
+      "loss": 1.2344,
+      "step": 465
+    },
+    {
+      "epoch": 0.9175207418252807,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0002,
+      "loss": 1.349,
+      "step": 470
+    },
+    {
+      "epoch": 0.9272816007808687,
+      "grad_norm": 2.21875,
+      "learning_rate": 0.0002,
+      "loss": 1.255,
+      "step": 475
+    },
+    {
+      "epoch": 0.9370424597364568,
+      "grad_norm": 2.03125,
+      "learning_rate": 0.0002,
+      "loss": 1.2741,
+      "step": 480
+    },
+    {
+      "epoch": 0.9468033186920449,
+      "grad_norm": 2.328125,
+      "learning_rate": 0.0002,
+      "loss": 1.3024,
+      "step": 485
+    },
+    {
+      "epoch": 0.956564177647633,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0002,
+      "loss": 1.297,
+      "step": 490
+    },
+    {
+      "epoch": 0.9663250366032211,
+      "grad_norm": 2.125,
+      "learning_rate": 0.0002,
+      "loss": 1.2095,
+      "step": 495
+    },
+    {
+      "epoch": 0.9760858955588092,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.0002,
+      "loss": 1.3445,
+      "step": 500
+    },
+    {
+      "epoch": 0.9858467545143973,
+      "grad_norm": 2.78125,
+      "learning_rate": 0.0002,
+      "loss": 1.349,
+      "step": 505
+    },
+    {
+      "epoch": 0.9956076134699854,
+      "grad_norm": 2.296875,
+      "learning_rate": 0.0002,
+      "loss": 1.2542,
+      "step": 510
+    },
+    {
+      "epoch": 0.9995119570522206,
+      "step": 512,
+      "total_flos": 3518281600204800.0,
+      "train_loss": 1.3758189086802304,
+      "train_runtime": 549.4956,
+      "train_samples_per_second": 14.915,
+      "train_steps_per_second": 0.932
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 512,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3518281600204800.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}