### Import the modules required for data loading, model setup, training, and metric evaluation

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "src")))
from data_loader import load_and_prepare_data         
from model import get_model, get_tokenizer            
from train import get_training_args, train_model      
from evaluate import compute_metrics                  
from torch.utils.data import Dataset                      
import torch

### Define a class that wraps tokenized data and labels for Hugging Face’s Trainer to use

In [3]:
class EmotionDataset(Dataset):
    """Map-style dataset that pairs tokenized inputs with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name
            to a per-sample indexable container, e.g. tokenizer output with
            ``input_ids`` and ``attention_mask``.
        labels: Indexable collection holding one encoded label per sample.

    Each item is a dict holding one tensor per encoding field plus a
    ``"labels"`` tensor for the same sample.
    """

    def __init__(self, encodings, labels):
        # Keep references only; tensors are created lazily per item.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection determines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        # Convert every encoding field of the requested sample to a tensor.
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        # Attach the label last, under the "labels" key.
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

### Load the dataset from Hugging Face, clean and encode it, then tokenize it using the BERT tokenizer.

In [5]:
# Fetch the train/test texts and labels together with the label encoder
train_texts, test_texts, train_labels, test_labels, label_encoder = load_and_prepare_data()

# Instantiate the BERT tokenizer via the project helper
tokenizer = get_tokenizer()

# Both splits are tokenized with identical settings, so define them once:
# truncate and pad, with sequences capped at 128 tokens.
tokenizer_kwargs = {"truncation": True, "padding": True, "max_length": 128}
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

# Pair each split's encodings with its labels in an EmotionDataset
train_dataset = EmotionDataset(train_encodings, train_labels)
test_dataset = EmotionDataset(test_encodings, test_labels)

### Samples from the dataset

In [7]:
# Preview the first five training examples with their encoded labels
for sample_no in range(1, 6):
    position = sample_no - 1  # zero-based index into the training lists
    print(f"Sample {sample_no}")
    print(f"Text: {train_texts[position]}")
    print(f"Label (encoded): {train_labels[position]}")
    print()

Sample 1
Text: i'd just feel less out of place, i guess. my sa makes me feel like i'm so behind my peers in terms of a social life
Label (encoded): 9

Sample 2
Text: i love the lady in the green jacket chasing after the second car looking back at the first car like "look what you did"
Label (encoded): 18

Sample 3
Text: man. really bad last possession there. bummer.
Label (encoded): 10

Sample 4
Text: never would’ve guessed that one.
Label (encoded): 20

Sample 5
Text: i wasn’t even expecting the reply that’s why i’m literally bamboozled.
Label (encoded): 27



### Set up the BERT model for sequence classification and define training parameters.

In [9]:
# One output unit per class known to the label encoder
num_emotion_classes = len(label_encoder.classes_)

# Build the classification model sized for that many labels
model = get_model(num_labels=num_emotion_classes)

# Training configuration comes from the project helper (defaults used here)
training_args = get_training_args()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train the Model 

In [13]:
# Assemble the trainer through the project helper.
# NOTE(review): the test split is passed as `val_dataset`; if these metrics
# drive model selection, consider carving out a separate validation split.
trainer = train_model(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    val_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run training and show the returned summary as the cell output
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3589,1.335635,0.613467,0.579882
2,0.9471,1.284574,0.615671,0.601428
3,0.9704,1.297894,0.617048,0.606042


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9072,1.365916,0.602313,0.595804
2,0.5491,1.48813,0.595566,0.591464
3,0.5144,1.593286,0.591297,0.589066


TrainOutput(global_step=5448, training_loss=0.7054264770818002, metrics={'train_runtime': 5721.3012, 'train_samples_per_second': 15.23, 'train_steps_per_second': 0.952, 'total_flos': 5733080823638016.0, 'train_loss': 0.7054264770818002, 'epoch': 3.0})

### Save both model weights and tokenizer files for future inference or deployment.

In [23]:
from pathlib import Path

# Output directory, relative to the notebook's working directory
model_path = Path("..").joinpath("outputs", "model")

# Write the model first, then the tokenizer files, into the same directory;
# the tokenizer call is last so its return value is the cell output.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('../outputs/model/tokenizer_config.json',
 '../outputs/model/special_tokens_map.json',
 '../outputs/model/vocab.txt',
 '../outputs/model/added_tokens.json')