#!/usr/bin/env python3
"""
Fine-tuning script for SmolLM2-135M model using Unsloth.
This script demonstrates how to:
1. Install and configure Unsloth
2. Prepare and format training data
3. Configure and run the training process
4. Save and evaluate the model
To run this script:
1. Install dependencies: pip install -r requirements.txt
2. Run: python train.py
"""

import os
from typing import Union

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import AutoTokenizer, Trainer, TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

# Configuration
max_seq_length = 2048  # Unsloth handles RoPE scaling internally
dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage


# def install_dependencies():
#     """Install required dependencies."""
#     os.system('pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"')
#     os.system('pip install --no-deps xformers trl peft accelerate bitsandbytes')


def load_model() -> tuple[FastLanguageModel, AutoTokenizer]:
    """Load and configure the model."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Configure LoRA adapters on all attention and MLP projections
    model = FastLanguageModel.get_peft_model(
        model,
        r=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=128,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=True,
        loftq_config=None,
    )
    return model, tokenizer
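
# Note on the LoRA settings above: with use_rslora=True the adapter update is
# scaled by lora_alpha / sqrt(r) = 128 / sqrt(64) = 16, rather than the classic
# lora_alpha / r = 2, which keeps the effective scale stable at higher ranks.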


def load_and_format_dataset(
    tokenizer: AutoTokenizer,
) -> tuple[
    Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], AutoTokenizer
]:
    """Load and format the training dataset."""
    # Load the code-act dataset
    dataset = load_dataset("xingyaoww/code-act", split="codeact")

    # Configure chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
        map_eos_token=True,  # Maps <|im_end|> to </s> instead
    )

    def formatting_prompts_func(examples):
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {"text": texts}

    # Apply formatting to dataset
    dataset = dataset.map(formatting_prompts_func, batched=True)
    return dataset, tokenizer
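
# For reference, with the "chatml" template each entry in dataset["text"] is a
# single flat string roughly of the form (turn contents abbreviated):
#   <|im_start|>user
#   ...human turn...<|im_end|>
#   <|im_start|>assistant
#   ...gpt turn...<|im_end|>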


def create_trainer(
    model: FastLanguageModel,
    tokenizer: AutoTokenizer,
    dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
) -> Trainer:
    """Create and configure the SFTTrainer."""
    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=16,
            warmup_steps=100,
            max_steps=120,
            learning_rate=5e-5,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine_with_restarts",
            seed=3407,
            output_dir="outputs",
            gradient_checkpointing=True,
            save_strategy="steps",
            save_steps=30,
            save_total_limit=2,
        ),
    )
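
# With these settings the effective batch size is 2 * 16 = 32 sequences per
# optimizer step, so the 120-step run sees roughly 32 * 120 = 3,840 sequences.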


def main():
    """Main training function."""
    # Install dependencies (uncomment if not already installed)
    # install_dependencies()

    # Load model and tokenizer
    model, tokenizer = load_model()

    # Load and prepare dataset
    dataset, tokenizer = load_and_format_dataset(tokenizer)

    # Create trainer
    trainer: Trainer = create_trainer(model, tokenizer, dataset)

    # Train
    trainer.train()

    # Save model
    trainer.save_model("final_model")
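
# Optional follow-up (a sketch, not run by this script): reload the saved
# adapter for a quick smoke test. The prompt is illustrative, and it assumes
# the tokenizer saved in final_model keeps the ShareGPT-style ("from"/"value")
# chat template configured above.
# def generate_sample():
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name="final_model",
#         max_seq_length=max_seq_length,
#         dtype=dtype,
#         load_in_4bit=load_in_4bit,
#     )
#     FastLanguageModel.for_inference(model)  # Switch Unsloth to inference mode
#     convo = [{"from": "human", "value": "Write a function that reverses a string."}]
#     input_ids = tokenizer.apply_chat_template(
#         convo, tokenize=True, add_generation_prompt=True, return_tensors="pt"
#     ).to(model.device)
#     outputs = model.generate(input_ids=input_ids, max_new_tokens=128)
#     print(tokenizer.decode(outputs[0], skip_special_tokens=True))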
if __name__ == "__main__":
main()