"""PreFinetuningForRunPod.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1LtsUCcWfL2VpWLJXVkE5076XX5k3PTyg |
|
""" |
|
|
|
|
|
import torch
import transformers
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from multiprocessing import cpu_count
|
model_id = "mistralai/Mistral-7B-v0.1" |
|
print("-----------------------------loading tokenizer-----------------------------------------------------------") |
|
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True) |
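# Note: add_eos_token=True appends the </s> token to every tokenized example so the
# model learns when to stop generating.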
|
print("-----------------------------loading dataset-----------------------------------------------------------") |
|
|
|
data = load_dataset("gbharti/finance-alpaca", split='train') |
|
|
|
|
|
def generate_prompt(data_point):
    """Generate the training prompt from a task instruction, optional input context, and answer.

    :param data_point: dict: a single dataset record with "instruction", "input" and "output" fields
    :return: str: the formatted prompt text
    """
    if data_point['input']:
        text = 'Below is an instruction that describes a task, paired with an input that provides' \
               ' further context. Write a response that appropriately completes the request.\n\n'
        text += f'### Instruction:\n{data_point["instruction"]}\n\n'
        text += f'### Input:\n{data_point["input"]}\n\n'
        text += f'### Response:\n{data_point["output"]}'
    else:
        text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'
        text += f'### Instruction:\n{data_point["instruction"]}\n\n'
        text += f'### Response:\n{data_point["output"]}'
    return text
|
|
print("-----------------------------Preparing dataset for fine-tuning-----------------------------------------------------------") |
|
prompt = [generate_prompt(data_point) for data_point in data] |
|
data = data.add_column("prompt", prompt); |
|
data = data.map(lambda sample: tokenizer(sample["prompt"]),num_proc=cpu_count(), batched=True) |
|
|
|
data = data.shuffle(seed=1234) |
|
data = data.train_test_split(test_size=0.1) |
|
train_data = data["train"] |
|
test_data = data["test"] |
|
|
|
|
|
|
|
# Quantize the base model to 4-bit NF4 with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Place the whole model on the current GPU if one is available.
d_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
|
print("-----------------------------loading model-----------------------------------------------------------") |
|
model = AutoModelForCausalLM.from_pretrained( |
|
model_id, |
|
torch_dtype="auto", |
|
use_cache=False, |
|
quantization_config=bnb_config, |
|
device_map=d_map |
|
) |
|
|
|
|
|
def find_all_linear_names(model):
    """Collect the names of all 4-bit linear layers so LoRA can target each of them."""
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # never adapt the output head
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


# For Mistral-7B this typically yields q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj and down_proj.
modules = find_all_linear_names(model)
|
# Attach rank-8 LoRA adapters to every quantized linear layer found above.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
|
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
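# Optional sanity check: report how many parameters the LoRA adapters leave trainable
# relative to the frozen 4-bit base model.
model.print_trainable_parameters()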
|
# Mistral's tokenizer ships without a pad token, so reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()
|
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        bf16=True,  # match bnb_4bit_compute_dtype=torch.bfloat16 instead of fp16
        warmup_ratio=0.03,  # warmup_steps expects an integer, so express the 3% warmup as a ratio
        max_steps=10,
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
|
print("-----------------------------finetuning starts------------------------------------------------------------") |
|
model.config.use_cache = False |
|
trainer.train() |
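# Persist the trained LoRA adapter explicitly (the directory name is illustrative); it can
# later be reattached to the 4-bit base model via peft's PeftModel.from_pretrained.
trainer.save_model("outputs/adapter")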
|
|