#!/usr/bin/env python3
"""
Fine-tuning script for the SmolLM2-135M model using Unsloth.

This script demonstrates how to:
1. Install and configure Unsloth
2. Prepare and format training data
3. Configure and run the training process
4. Save and evaluate the model

To run this script:
1. Install dependencies: pip install -r requirements.txt
2. Run: python train.py
"""

from typing import Union

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

# Configuration
max_seq_length = 2048  # Unsloth handles RoPE scaling internally, so other lengths work too
dtype = None  # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

# def install_dependencies():
#     """Install required dependencies (for notebook/Colab environments)."""
#     import os
#     os.system('pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"')
#     os.system('pip install --no-deps xformers trl peft accelerate bitsandbytes')


def load_model() -> tuple[FastLanguageModel, AutoTokenizer]:
    """Load and configure the model."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Configure LoRA
    model = FastLanguageModel.get_peft_model(
        model,
        r=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=128,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=True,
        loftq_config=None,
    )
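
    # Optional sanity check: PEFT's print_trainable_parameters() reports how few
    # weights LoRA actually updates relative to the full model.
    model.print_trainable_parameters()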

    return model, tokenizer


def load_and_format_dataset(
    tokenizer: AutoTokenizer,
) -> tuple[
    Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], AutoTokenizer
]:
    """Load and format the training dataset."""
    # Load the code-act dataset
    dataset = load_dataset("xingyaoww/code-act", split="codeact")

    # Configure chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
        map_eos_token=True,  # Maps <|im_end|> to </s> instead
    )
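
    # The code-act dataset stores conversations ShareGPT-style, roughly:
    #   {"conversations": [{"from": "human", "value": "..."},
    #                      {"from": "gpt", "value": "..."}]}
    # The mapping above tells the ChatML template where to find roles and text.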

    def formatting_prompts_func(examples):
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {"text": texts}

    # Apply formatting to dataset
    dataset = dataset.map(formatting_prompts_func, batched=True)
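
    # Optional: eyeball one formatted example, e.g. print(dataset[0]["text"]),
    # to confirm the ChatML template rendered the conversation as expected.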

    return dataset, tokenizer


def create_trainer(
    model: FastLanguageModel,
    tokenizer: AutoTokenizer,
    dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
) -> SFTTrainer:
    """Create and configure the SFTTrainer."""
    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=16,
            warmup_steps=100,  # NB: warmup spans most of this short 120-step demo run
            max_steps=120,
            learning_rate=5e-5,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine_with_restarts",
            seed=3407,
            output_dir="outputs",
            # Likely redundant: use_gradient_checkpointing="unsloth" is already
            # set in get_peft_model above
            gradient_checkpointing=True,
            save_strategy="steps",
            save_steps=30,
            save_total_limit=2,
        ),
    )
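

# A minimal post-training smoke test, not a benchmark: the prompt and the
# generation settings below are illustrative assumptions, not tuned values.
def run_sample_inference(model: FastLanguageModel, tokenizer: AutoTokenizer) -> None:
    """Generate one completion from the fine-tuned model as a sanity check."""
    FastLanguageModel.for_inference(model)  # Switch Unsloth into inference mode
    messages = [
        {"from": "human", "value": "Write a Python function that reverses a string."}
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids=input_ids, max_new_tokens=128, use_cache=True)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))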


def main():
    """Main training function."""
    # Install dependencies
    # install_dependencies()

    # Load model and tokenizer
    model, tokenizer = load_model()

    # Load and prepare dataset
    dataset, tokenizer = load_and_format_dataset(tokenizer)

    # Create trainer
    trainer = create_trainer(model, tokenizer, dataset)

    # Train
    trainer.train()

    # Save model
    trainer.save_model("final_model")
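
    # Persist the tokenizer too, so the ChatML template ships with the weights.
    tokenizer.save_pretrained("final_model")

    # Quick qualitative check of the fine-tuned model (see run_sample_inference
    # above; assumes a CUDA-capable GPU is available).
    run_sample_inference(model, tokenizer)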


if __name__ == "__main__":
    main()