miike-ai committed on Mar 23

Commit

4bfe1c5

verified ·

1 Parent(s): 408ee42

Add files using upload-large-folder tool

Browse files

Files changed (17) hide show

.gitattributes +1 -0
README.md +202 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
added_tokens.json +3 -0
chat_template.json +3 -0
optimizer.pt +3 -0
preprocessor_config.json +29 -0
processor_config.json +4 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0
trainer_state.json +804 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: unsloth/gemma-3-4b-it-unsloth-bnb-4bit
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for a causal language model
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/gemma-3-4b-it-unsloth-bnb-4bit
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_bias": false,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": "(?:.*?(?:language|text).*?(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense).*?(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj).*?)|(?:\\bmodel\\.layers\\.[\\d]{1,}\\.(?:self_attn|attention|attn|mlp|feed_forward|ffn|dense)\\.(?:(?:k_proj|v_proj|q_proj|out_proj|fc1|fc2|o_proj|gate_proj|up_proj|down_proj)))",
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9aaeeaf4f45fc7eaf29202ba22549dfd5940a8a18d471fe6d059e22ad8ccee8f
+size 59675008

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n    {%- if messages[0]['content'] is string -%}\n        {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n    {%- else -%}\n        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n    {%- endif -%}\n    {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n    {%- set first_user_prefix = \"\" -%}\n    {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n        {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n    {%- endif -%}\n    {%- if (message['role'] == 'assistant') -%}\n        {%- set role = \"model\" -%}\n    {%- else -%}\n        {%- set role = message['role'] -%}\n    {%- endif -%}\n    {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n    {%- if message['content'] is string -%}\n        {{ message['content'] | trim }}\n    {%- elif message['content'] is iterable -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'image' -%}\n                {{ '<start_of_image>' }}\n            {%- elif item['type'] == 'text' -%}\n                {{ item['text'] | trim }}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- else -%}\n        {{ raise_exception(\"Invalid content type\") }}\n    {%- endif -%}\n    {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{ '<start_of_turn>model\n' }}\n{%- endif -%}\n"
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2d31ec10ed63ada5221a093e4c7d46f6e36896e0f1a264b1769a0d42ba56493
+size 30825092

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:901abe9c577da6d49bd74f99c6ebe7e9c67dbca0824b753bc4bbc938a94b4eda
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,804 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.013256206314774645,
+  "eval_steps": 500,
+  "global_step": 110,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00012051096649795132,
+      "grad_norm": 0.7307866811752319,
+      "learning_rate": 4e-05,
+      "loss": 1.2502,
+      "step": 1
+    },
+    {
+      "epoch": 0.00024102193299590263,
+      "grad_norm": 0.7944597601890564,
+      "learning_rate": 8e-05,
+      "loss": 1.0923,
+      "step": 2
+    },
+    {
+      "epoch": 0.00036153289949385393,
+      "grad_norm": 0.8116863965988159,
+      "learning_rate": 0.00012,
+      "loss": 1.4372,
+      "step": 3
+    },
+    {
+      "epoch": 0.00048204386599180526,
+      "grad_norm": 0.6883746981620789,
+      "learning_rate": 0.00016,
+      "loss": 1.2503,
+      "step": 4
+    },
+    {
+      "epoch": 0.0006025548324897565,
+      "grad_norm": 0.6956741809844971,
+      "learning_rate": 0.0002,
+      "loss": 1.135,
+      "step": 5
+    },
+    {
+      "epoch": 0.0007230657989877079,
+      "grad_norm": 0.7852187752723694,
+      "learning_rate": 0.0001980952380952381,
+      "loss": 1.0132,
+      "step": 6
+    },
+    {
+      "epoch": 0.0008435767654856592,
+      "grad_norm": 0.4692592918872833,
+      "learning_rate": 0.0001961904761904762,
+      "loss": 0.7826,
+      "step": 7
+    },
+    {
+      "epoch": 0.0009640877319836105,
+      "grad_norm": 0.27623867988586426,
+      "learning_rate": 0.0001942857142857143,
+      "loss": 0.664,
+      "step": 8
+    },
+    {
+      "epoch": 0.0010845986984815619,
+      "grad_norm": 0.21396474540233612,
+      "learning_rate": 0.0001923809523809524,
+      "loss": 0.9179,
+      "step": 9
+    },
+    {
+      "epoch": 0.001205109664979513,
+      "grad_norm": 0.1967506855726242,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 0.6711,
+      "step": 10
+    },
+    {
+      "epoch": 0.0013256206314774645,
+      "grad_norm": 0.20955657958984375,
+      "learning_rate": 0.00018857142857142857,
+      "loss": 0.8331,
+      "step": 11
+    },
+    {
+      "epoch": 0.0014461315979754157,
+      "grad_norm": 0.2680826485157013,
+      "learning_rate": 0.0001866666666666667,
+      "loss": 0.8829,
+      "step": 12
+    },
+    {
+      "epoch": 0.0015666425644733672,
+      "grad_norm": 0.25052550435066223,
+      "learning_rate": 0.00018476190476190478,
+      "loss": 0.7536,
+      "step": 13
+    },
+    {
+      "epoch": 0.0016871535309713184,
+      "grad_norm": 0.27972114086151123,
+      "learning_rate": 0.00018285714285714286,
+      "loss": 0.8129,
+      "step": 14
+    },
+    {
+      "epoch": 0.0018076644974692696,
+      "grad_norm": 0.23484091460704803,
+      "learning_rate": 0.00018095238095238095,
+      "loss": 0.8715,
+      "step": 15
+    },
+    {
+      "epoch": 0.001928175463967221,
+      "grad_norm": 0.2122180461883545,
+      "learning_rate": 0.00017904761904761907,
+      "loss": 0.9421,
+      "step": 16
+    },
+    {
+      "epoch": 0.0020486864304651723,
+      "grad_norm": 0.19645242393016815,
+      "learning_rate": 0.00017714285714285713,
+      "loss": 0.6596,
+      "step": 17
+    },
+    {
+      "epoch": 0.0021691973969631237,
+      "grad_norm": 0.21645572781562805,
+      "learning_rate": 0.00017523809523809525,
+      "loss": 0.764,
+      "step": 18
+    },
+    {
+      "epoch": 0.002289708363461075,
+      "grad_norm": 0.15910537540912628,
+      "learning_rate": 0.00017333333333333334,
+      "loss": 0.7156,
+      "step": 19
+    },
+    {
+      "epoch": 0.002410219329959026,
+      "grad_norm": 0.1565140336751938,
+      "learning_rate": 0.00017142857142857143,
+      "loss": 0.6023,
+      "step": 20
+    },
+    {
+      "epoch": 0.0025307302964569776,
+      "grad_norm": 0.17277204990386963,
+      "learning_rate": 0.00016952380952380954,
+      "loss": 0.5594,
+      "step": 21
+    },
+    {
+      "epoch": 0.002651241262954929,
+      "grad_norm": 0.17323294281959534,
+      "learning_rate": 0.00016761904761904763,
+      "loss": 0.681,
+      "step": 22
+    },
+    {
+      "epoch": 0.00277175222945288,
+      "grad_norm": 0.1539444774389267,
+      "learning_rate": 0.00016571428571428575,
+      "loss": 0.7535,
+      "step": 23
+    },
+    {
+      "epoch": 0.0028922631959508315,
+      "grad_norm": 0.16936075687408447,
+      "learning_rate": 0.0001638095238095238,
+      "loss": 0.5524,
+      "step": 24
+    },
+    {
+      "epoch": 0.003012774162448783,
+      "grad_norm": 0.1893339455127716,
+      "learning_rate": 0.00016190476190476192,
+      "loss": 0.802,
+      "step": 25
+    },
+    {
+      "epoch": 0.0031332851289467343,
+      "grad_norm": 0.17078277468681335,
+      "learning_rate": 0.00016,
+      "loss": 0.677,
+      "step": 26
+    },
+    {
+      "epoch": 0.0032537960954446853,
+      "grad_norm": 0.1889839768409729,
+      "learning_rate": 0.0001580952380952381,
+      "loss": 0.5551,
+      "step": 27
+    },
+    {
+      "epoch": 0.003374307061942637,
+      "grad_norm": 0.2148876190185547,
+      "learning_rate": 0.0001561904761904762,
+      "loss": 0.6161,
+      "step": 28
+    },
+    {
+      "epoch": 0.0034948180284405882,
+      "grad_norm": 0.1392691731452942,
+      "learning_rate": 0.0001542857142857143,
+      "loss": 0.5348,
+      "step": 29
+    },
+    {
+      "epoch": 0.0036153289949385392,
+      "grad_norm": 0.17458081245422363,
+      "learning_rate": 0.00015238095238095237,
+      "loss": 0.7913,
+      "step": 30
+    },
+    {
+      "epoch": 0.0037358399614364907,
+      "grad_norm": 0.1562052071094513,
+      "learning_rate": 0.00015047619047619048,
+      "loss": 0.8158,
+      "step": 31
+    },
+    {
+      "epoch": 0.003856350927934442,
+      "grad_norm": 0.1435224562883377,
+      "learning_rate": 0.00014857142857142857,
+      "loss": 0.7528,
+      "step": 32
+    },
+    {
+      "epoch": 0.0039768618944323935,
+      "grad_norm": 0.14048519730567932,
+      "learning_rate": 0.00014666666666666666,
+      "loss": 0.6955,
+      "step": 33
+    },
+    {
+      "epoch": 0.0040973728609303445,
+      "grad_norm": 0.16571789979934692,
+      "learning_rate": 0.00014476190476190475,
+      "loss": 0.5537,
+      "step": 34
+    },
+    {
+      "epoch": 0.0042178838274282955,
+      "grad_norm": 0.165692538022995,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.7134,
+      "step": 35
+    },
+    {
+      "epoch": 0.004338394793926247,
+      "grad_norm": 0.1822883039712906,
+      "learning_rate": 0.00014095238095238096,
+      "loss": 0.5432,
+      "step": 36
+    },
+    {
+      "epoch": 0.004458905760424198,
+      "grad_norm": 0.1414850652217865,
+      "learning_rate": 0.00013904761904761905,
+      "loss": 0.6703,
+      "step": 37
+    },
+    {
+      "epoch": 0.00457941672692215,
+      "grad_norm": 0.15394528210163116,
+      "learning_rate": 0.00013714285714285716,
+      "loss": 0.6561,
+      "step": 38
+    },
+    {
+      "epoch": 0.004699927693420101,
+      "grad_norm": 0.1435491144657135,
+      "learning_rate": 0.00013523809523809525,
+      "loss": 0.5644,
+      "step": 39
+    },
+    {
+      "epoch": 0.004820438659918052,
+      "grad_norm": 0.16691423952579498,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.7856,
+      "step": 40
+    },
+    {
+      "epoch": 0.004940949626416004,
+      "grad_norm": 0.14211532473564148,
+      "learning_rate": 0.00013142857142857143,
+      "loss": 0.6399,
+      "step": 41
+    },
+    {
+      "epoch": 0.005061460592913955,
+      "grad_norm": 0.18083994090557098,
+      "learning_rate": 0.00012952380952380954,
+      "loss": 0.715,
+      "step": 42
+    },
+    {
+      "epoch": 0.005181971559411906,
+      "grad_norm": 0.15873770415782928,
+      "learning_rate": 0.0001276190476190476,
+      "loss": 0.7614,
+      "step": 43
+    },
+    {
+      "epoch": 0.005302482525909858,
+      "grad_norm": 0.14993314445018768,
+      "learning_rate": 0.00012571428571428572,
+      "loss": 0.6105,
+      "step": 44
+    },
+    {
+      "epoch": 0.005422993492407809,
+      "grad_norm": 0.18779931962490082,
+      "learning_rate": 0.0001238095238095238,
+      "loss": 1.0716,
+      "step": 45
+    },
+    {
+      "epoch": 0.00554350445890576,
+      "grad_norm": 0.15650784969329834,
+      "learning_rate": 0.00012190476190476193,
+      "loss": 0.738,
+      "step": 46
+    },
+    {
+      "epoch": 0.005664015425403712,
+      "grad_norm": 0.1431063711643219,
+      "learning_rate": 0.00012,
+      "loss": 0.5219,
+      "step": 47
+    },
+    {
+      "epoch": 0.005784526391901663,
+      "grad_norm": 0.1359708309173584,
+      "learning_rate": 0.0001180952380952381,
+      "loss": 0.5886,
+      "step": 48
+    },
+    {
+      "epoch": 0.005905037358399614,
+      "grad_norm": 0.16217978298664093,
+      "learning_rate": 0.00011619047619047621,
+      "loss": 0.7634,
+      "step": 49
+    },
+    {
+      "epoch": 0.006025548324897566,
+      "grad_norm": 0.16889767348766327,
+      "learning_rate": 0.00011428571428571428,
+      "loss": 0.7717,
+      "step": 50
+    },
+    {
+      "epoch": 0.006146059291395517,
+      "grad_norm": 0.21841812133789062,
+      "learning_rate": 0.00011238095238095239,
+      "loss": 0.937,
+      "step": 51
+    },
+    {
+      "epoch": 0.006266570257893469,
+      "grad_norm": 0.17994704842567444,
+      "learning_rate": 0.00011047619047619049,
+      "loss": 0.8443,
+      "step": 52
+    },
+    {
+      "epoch": 0.00638708122439142,
+      "grad_norm": 0.15717928111553192,
+      "learning_rate": 0.00010857142857142856,
+      "loss": 0.7624,
+      "step": 53
+    },
+    {
+      "epoch": 0.006507592190889371,
+      "grad_norm": 0.16110721230506897,
+      "learning_rate": 0.00010666666666666667,
+      "loss": 0.7228,
+      "step": 54
+    },
+    {
+      "epoch": 0.0066281031573873226,
+      "grad_norm": 0.14764989912509918,
+      "learning_rate": 0.00010476190476190477,
+      "loss": 0.6782,
+      "step": 55
+    },
+    {
+      "epoch": 0.006748614123885274,
+      "grad_norm": 0.1577727496623993,
+      "learning_rate": 0.00010285714285714286,
+      "loss": 0.7367,
+      "step": 56
+    },
+    {
+      "epoch": 0.006869125090383225,
+      "grad_norm": 0.17438825964927673,
+      "learning_rate": 0.00010095238095238096,
+      "loss": 0.65,
+      "step": 57
+    },
+    {
+      "epoch": 0.0069896360568811764,
+      "grad_norm": 0.1775740683078766,
+      "learning_rate": 9.904761904761905e-05,
+      "loss": 0.7797,
+      "step": 58
+    },
+    {
+      "epoch": 0.0071101470233791274,
+      "grad_norm": 0.18453216552734375,
+      "learning_rate": 9.714285714285715e-05,
+      "loss": 0.9153,
+      "step": 59
+    },
+    {
+      "epoch": 0.0072306579898770785,
+      "grad_norm": 0.16022688150405884,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.7798,
+      "step": 60
+    },
+    {
+      "epoch": 0.00735116895637503,
+      "grad_norm": 0.16944445669651031,
+      "learning_rate": 9.333333333333334e-05,
+      "loss": 0.8193,
+      "step": 61
+    },
+    {
+      "epoch": 0.007471679922872981,
+      "grad_norm": 0.14207735657691956,
+      "learning_rate": 9.142857142857143e-05,
+      "loss": 0.5361,
+      "step": 62
+    },
+    {
+      "epoch": 0.007592190889370932,
+      "grad_norm": 0.16854678094387054,
+      "learning_rate": 8.952380952380953e-05,
+      "loss": 0.7976,
+      "step": 63
+    },
+    {
+      "epoch": 0.007712701855868884,
+      "grad_norm": 0.17764142155647278,
+      "learning_rate": 8.761904761904762e-05,
+      "loss": 0.6938,
+      "step": 64
+    },
+    {
+      "epoch": 0.007833212822366835,
+      "grad_norm": 0.21041354537010193,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 0.8384,
+      "step": 65
+    },
+    {
+      "epoch": 0.007953723788864787,
+      "grad_norm": 0.18576891720294952,
+      "learning_rate": 8.380952380952382e-05,
+      "loss": 0.6401,
+      "step": 66
+    },
+    {
+      "epoch": 0.008074234755362737,
+      "grad_norm": 0.20624496042728424,
+      "learning_rate": 8.19047619047619e-05,
+      "loss": 0.7563,
+      "step": 67
+    },
+    {
+      "epoch": 0.008194745721860689,
+      "grad_norm": 0.18236589431762695,
+      "learning_rate": 8e-05,
+      "loss": 0.748,
+      "step": 68
+    },
+    {
+      "epoch": 0.008315256688358641,
+      "grad_norm": 0.15884153544902802,
+      "learning_rate": 7.80952380952381e-05,
+      "loss": 0.649,
+      "step": 69
+    },
+    {
+      "epoch": 0.008435767654856591,
+      "grad_norm": 0.18527762591838837,
+      "learning_rate": 7.619047619047618e-05,
+      "loss": 0.5163,
+      "step": 70
+    },
+    {
+      "epoch": 0.008556278621354543,
+      "grad_norm": 0.166184663772583,
+      "learning_rate": 7.428571428571429e-05,
+      "loss": 0.7672,
+      "step": 71
+    },
+    {
+      "epoch": 0.008676789587852495,
+      "grad_norm": 0.19784916937351227,
+      "learning_rate": 7.238095238095238e-05,
+      "loss": 0.7482,
+      "step": 72
+    },
+    {
+      "epoch": 0.008797300554350447,
+      "grad_norm": 0.16908536851406097,
+      "learning_rate": 7.047619047619048e-05,
+      "loss": 0.7461,
+      "step": 73
+    },
+    {
+      "epoch": 0.008917811520848397,
+      "grad_norm": 0.18411517143249512,
+      "learning_rate": 6.857142857142858e-05,
+      "loss": 0.5697,
+      "step": 74
+    },
+    {
+      "epoch": 0.009038322487346349,
+      "grad_norm": 0.15351906418800354,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.6597,
+      "step": 75
+    },
+    {
+      "epoch": 0.0091588334538443,
+      "grad_norm": 0.17720364034175873,
+      "learning_rate": 6.476190476190477e-05,
+      "loss": 0.808,
+      "step": 76
+    },
+    {
+      "epoch": 0.00927934442034225,
+      "grad_norm": 0.18325303494930267,
+      "learning_rate": 6.285714285714286e-05,
+      "loss": 0.7917,
+      "step": 77
+    },
+    {
+      "epoch": 0.009399855386840203,
+      "grad_norm": 0.1679506152868271,
+      "learning_rate": 6.0952380952380964e-05,
+      "loss": 0.6326,
+      "step": 78
+    },
+    {
+      "epoch": 0.009520366353338154,
+      "grad_norm": 0.19260190427303314,
+      "learning_rate": 5.904761904761905e-05,
+      "loss": 0.5601,
+      "step": 79
+    },
+    {
+      "epoch": 0.009640877319836105,
+      "grad_norm": 0.15009605884552002,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.6072,
+      "step": 80
+    },
+    {
+      "epoch": 0.009761388286334056,
+      "grad_norm": 0.15776121616363525,
+      "learning_rate": 5.5238095238095244e-05,
+      "loss": 0.6753,
+      "step": 81
+    },
+    {
+      "epoch": 0.009881899252832008,
+      "grad_norm": 0.18575388193130493,
+      "learning_rate": 5.333333333333333e-05,
+      "loss": 0.6219,
+      "step": 82
+    },
+    {
+      "epoch": 0.010002410219329958,
+      "grad_norm": 0.21978633105754852,
+      "learning_rate": 5.142857142857143e-05,
+      "loss": 0.8581,
+      "step": 83
+    },
+    {
+      "epoch": 0.01012292118582791,
+      "grad_norm": 0.1704164743423462,
+      "learning_rate": 4.9523809523809525e-05,
+      "loss": 0.6461,
+      "step": 84
+    },
+    {
+      "epoch": 0.010243432152325862,
+      "grad_norm": 0.18057820200920105,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 0.7416,
+      "step": 85
+    },
+    {
+      "epoch": 0.010363943118823812,
+      "grad_norm": 0.15225447714328766,
+      "learning_rate": 4.5714285714285716e-05,
+      "loss": 0.4868,
+      "step": 86
+    },
+    {
+      "epoch": 0.010484454085321764,
+      "grad_norm": 0.17193946242332458,
+      "learning_rate": 4.380952380952381e-05,
+      "loss": 0.8092,
+      "step": 87
+    },
+    {
+      "epoch": 0.010604965051819716,
+      "grad_norm": 0.194380983710289,
+      "learning_rate": 4.190476190476191e-05,
+      "loss": 0.8461,
+      "step": 88
+    },
+    {
+      "epoch": 0.010725476018317666,
+      "grad_norm": 0.2139783650636673,
+      "learning_rate": 4e-05,
+      "loss": 0.6548,
+      "step": 89
+    },
+    {
+      "epoch": 0.010845986984815618,
+      "grad_norm": 0.16700893640518188,
+      "learning_rate": 3.809523809523809e-05,
+      "loss": 0.5584,
+      "step": 90
+    },
+    {
+      "epoch": 0.01096649795131357,
+      "grad_norm": 0.1971975564956665,
+      "learning_rate": 3.619047619047619e-05,
+      "loss": 0.8535,
+      "step": 91
+    },
+    {
+      "epoch": 0.01108700891781152,
+      "grad_norm": 0.19667109847068787,
+      "learning_rate": 3.428571428571429e-05,
+      "loss": 0.8635,
+      "step": 92
+    },
+    {
+      "epoch": 0.011207519884309472,
+      "grad_norm": 0.18818983435630798,
+      "learning_rate": 3.2380952380952386e-05,
+      "loss": 0.8435,
+      "step": 93
+    },
+    {
+      "epoch": 0.011328030850807424,
+      "grad_norm": 0.16365501284599304,
+      "learning_rate": 3.0476190476190482e-05,
+      "loss": 0.6243,
+      "step": 94
+    },
+    {
+      "epoch": 0.011448541817305374,
+      "grad_norm": 0.20358283817768097,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 0.6483,
+      "step": 95
+    },
+    {
+      "epoch": 0.011569052783803326,
+      "grad_norm": 0.17696398496627808,
+      "learning_rate": 2.6666666666666667e-05,
+      "loss": 0.6057,
+      "step": 96
+    },
+    {
+      "epoch": 0.011689563750301278,
+      "grad_norm": 0.15508583188056946,
+      "learning_rate": 2.4761904761904762e-05,
+      "loss": 0.524,
+      "step": 97
+    },
+    {
+      "epoch": 0.011810074716799228,
+      "grad_norm": 0.18458549678325653,
+      "learning_rate": 2.2857142857142858e-05,
+      "loss": 0.8364,
+      "step": 98
+    },
+    {
+      "epoch": 0.01193058568329718,
+      "grad_norm": 0.1944003403186798,
+      "learning_rate": 2.0952380952380954e-05,
+      "loss": 0.5383,
+      "step": 99
+    },
+    {
+      "epoch": 0.012051096649795132,
+      "grad_norm": 0.4217074513435364,
+      "learning_rate": 1.9047619047619046e-05,
+      "loss": 0.6774,
+      "step": 100
+    },
+    {
+      "epoch": 0.012171607616293083,
+      "grad_norm": 0.20350486040115356,
+      "learning_rate": 1.7142857142857145e-05,
+      "loss": 0.6871,
+      "step": 101
+    },
+    {
+      "epoch": 0.012292118582791034,
+      "grad_norm": 0.19154471158981323,
+      "learning_rate": 1.5238095238095241e-05,
+      "loss": 0.7226,
+      "step": 102
+    },
+    {
+      "epoch": 0.012412629549288986,
+      "grad_norm": 0.17253194749355316,
+      "learning_rate": 1.3333333333333333e-05,
+      "loss": 0.7514,
+      "step": 103
+    },
+    {
+      "epoch": 0.012533140515786937,
+      "grad_norm": 0.14699283242225647,
+      "learning_rate": 1.1428571428571429e-05,
+      "loss": 0.5358,
+      "step": 104
+    },
+    {
+      "epoch": 0.012653651482284888,
+      "grad_norm": 0.19192050397396088,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 0.9153,
+      "step": 105
+    },
+    {
+      "epoch": 0.01277416244878284,
+      "grad_norm": 0.15646027028560638,
+      "learning_rate": 7.6190476190476205e-06,
+      "loss": 0.5182,
+      "step": 106
+    },
+    {
+      "epoch": 0.012894673415280791,
+      "grad_norm": 0.18160918354988098,
+      "learning_rate": 5.7142857142857145e-06,
+      "loss": 0.5822,
+      "step": 107
+    },
+    {
+      "epoch": 0.013015184381778741,
+      "grad_norm": 0.19203059375286102,
+      "learning_rate": 3.8095238095238102e-06,
+      "loss": 0.7678,
+      "step": 108
+    },
+    {
+      "epoch": 0.013135695348276693,
+      "grad_norm": 0.20908264815807343,
+      "learning_rate": 1.9047619047619051e-06,
+      "loss": 0.8563,
+      "step": 109
+    },
+    {
+      "epoch": 0.013256206314774645,
+      "grad_norm": 0.16366459429264069,
+      "learning_rate": 0.0,
+      "loss": 0.4258,
+      "step": 110
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 110,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 55,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.103938402981235e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e78d9e9740351e2b1b3b1897ecee95b7a4bd19239aad7acb92a951d6d1190a45
+size 5624