# finetuninggpt2/app.py
import os

# Install dependencies at runtime (kept for Hugging Face Spaces without a requirements.txt)
os.system('pip install torch datasets transformers')

import torch
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch optimizer instead
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load Ultrachat dataset
dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
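# Each record's 'messages' field is a list of chat turns, roughly of the form
# [{'role': 'user', 'content': ...}, {'role': 'assistant', 'content': ...}, ...];
# only the first user/assistant pair is used for training below.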
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
class MyDataset(Dataset):
    def __init__(self, data, max_length=1024):
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Pull the user and assistant turns out of the conversation
        user_content = self.data[idx][0]['content'] if 'content' in self.data[idx][0] else ""
        assistant_content = self.data[idx][1]['content'] if 'content' in self.data[idx][1] else ""
        # Combine both turns into a single training string
        text = f"User: {user_content} Assistant: {assistant_content}"
        # Tokenize; encode() returns a (1, seq_len) Long tensor, which is kept unsqueezed
        input_ids = tokenizer.encode(text, return_tensors='pt').long()
        # Truncate to max_length ...
        input_ids = input_ids[:, :self.max_length]
        # ... and right-pad with token id 0 up to max_length (the padded positions are also
        # used as labels below, so they contribute to the loss)
        input_ids = torch.nn.functional.pad(input_ids, (0, self.max_length - input_ids.size(1)), 'constant', 0)
        return {'input_ids': input_ids}
# Create DataLoader without collate_fn
my_dataset = MyDataset(dataset['messages'])
dataloader = DataLoader(my_dataset, batch_size=4, shuffle=True)
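# Note: each item above has shape (1, max_length), so the default collation yields batches
# of shape (batch_size, 1, max_length); GPT-2 reshapes input_ids to (-1, seq_len) internally,
# so this still trains, just with a slightly unusual batch shape.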
# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Fine-tuning loop
model.train()
for epoch in range(1):
    total_loss = 0.0
    for i, batch in enumerate(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Standard causal language modelling: the (padded) input ids double as labels
        outputs = model(**batch, labels=batch['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        if (i + 1) % 100 == 0:  # Print the running average loss every 100 batches
            average_loss = total_loss / 100
            print(f"Epoch: {epoch + 1}, Batch: {i + 1}, Average Loss: {average_loss:.4f}")
            total_loss = 0.0
print("Training complete!")
# Save model and tokenizer to the same local directory so both load with from_pretrained
model.save_pretrained('./gpt2_finetuned')
tokenizer.save_pretrained('./gpt2_finetuned')
print("Model Saved! \n Enjoy the model Now!")