{ "cells": [ { "cell_type": "markdown", "id": "80f816c1-0839-41cb-847b-c79a62ca1465", "metadata": {}, "source": [ "### Load all required modules for loading data, model setup, training, and metric evaluation" ] }, { "cell_type": "code", "execution_count": 1, "id": "2554d05b-f08a-4c21-953f-4f507407e426", "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\", \"src\")))\n", "from data_loader import load_and_prepare_data \n", "from model import get_model, get_tokenizer \n", "from train import get_training_args, train_model \n", "from evaluate import compute_metrics \n", "from torch.utils.data import Dataset \n", "import torch" ] }, { "cell_type": "markdown", "id": "3bfbb706-4b0b-43de-a95a-884d46343668", "metadata": {}, "source": [ "### Define a class that wraps tokenized data and labels for Hugging Face’s Trainer to use" ] }, { "cell_type": "code", "execution_count": 3, "id": "c814c354-7962-4a2d-b7bd-5c498f1d004e", "metadata": {}, "outputs": [], "source": [ "class EmotionDataset(Dataset):\n", " def __init__(self, encodings, labels):\n", " self.encodings = encodings # BERT tokenized inputs (input_ids, attention_mask)\n", " self.labels = labels # Encoded labels (integers)\n", "\n", " def __len__(self):\n", " return len(self.labels) # Total number of samples\n", "\n", " def __getitem__(self, idx):\n", " # Return dictionary of input tensors + label tensor for a single sample\n", " return {\n", " key: torch.tensor(val[idx]) for key, val in self.encodings.items()\n", " } | {\"labels\": torch.tensor(self.labels[idx])}" ] }, { "cell_type": "markdown", "id": "f9b87257-f0c0-4532-9eee-939d8747ef79", "metadata": {}, "source": [ "### Load the dataset from Hugging Face, clean and encode it, then tokenize it using the BERT tokenizer." ] }, { "cell_type": "code", "execution_count": 5, "id": "18e312be-5863-4e24-900a-843e42e145cc", "metadata": {}, "outputs": [], "source": [ "# Load train/test splits and label encoder\n", "train_texts, test_texts, train_labels, test_labels, label_encoder = load_and_prepare_data()\n", "\n", "# Load BERT tokenizer\n", "tokenizer = get_tokenizer()\n", "\n", "# Tokenize training and testing texts with truncation and padding\n", "train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n", "test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)\n", "\n", "# Wrap the tokenized data into EmotionDataset objects\n", "train_dataset = EmotionDataset(train_encodings, train_labels)\n", "test_dataset = EmotionDataset(test_encodings, test_labels)" ] }, { "cell_type": "markdown", "id": "66b99b4e-5297-4bc0-8cfb-20dbe22526c0", "metadata": {}, "source": [ "### Samples from the dataset" ] }, { "cell_type": "code", "execution_count": 7, "id": "35db4426-db21-4438-ba0e-ebb51d52edfb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample 1\n", "Text: i'd just feel less out of place, i guess. my sa makes me feel like i'm so behind my peers in terms of a social life\n", "Label (encoded): 9\n", "\n", "Sample 2\n", "Text: i love the lady in the green jacket chasing after the second car looking back at the first car like \"look what you did\"\n", "Label (encoded): 18\n", "\n", "Sample 3\n", "Text: man. really bad last possession there. 
{ "cell_type": "markdown", "id": "66b99b4e-5297-4bc0-8cfb-20dbe22526c0", "metadata": {}, "source": [ "### Samples from the dataset" ] },
{ "cell_type": "code", "execution_count": 7, "id": "35db4426-db21-4438-ba0e-ebb51d52edfb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample 1\n", "Text: i'd just feel less out of place, i guess. my sa makes me feel like i'm so behind my peers in terms of a social life\n", "Label (encoded): 9\n", "\n", "Sample 2\n", "Text: i love the lady in the green jacket chasing after the second car looking back at the first car like \"look what you did\"\n", "Label (encoded): 18\n", "\n", "Sample 3\n", "Text: man. really bad last possession there. bummer.\n", "Label (encoded): 10\n", "\n", "Sample 4\n", "Text: never would’ve guessed that one.\n", "Label (encoded): 20\n", "\n", "Sample 5\n", "Text: i wasn’t even expecting the reply that’s why i’m literally bamboozled.\n", "Label (encoded): 27\n", "\n" ] } ], "source": [ "for i in range(5):\n", "    print(f\"Sample {i+1}\")\n", "    print(f\"Text: {train_texts[i]}\")\n", "    print(f\"Label (encoded): {train_labels[i]}\")\n", "    print()" ] },
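{ "cell_type": "markdown", "id": "d9a6c3f2-5e41-4a0c-9d83-3c4d5e6f7a8b", "metadata": {}, "source": [ "### Decode the numeric labels\n", "\n", "The encoded labels printed above (9, 18, 10, ...) are opaque on their own. Assuming `load_and_prepare_data()` returns a fitted scikit-learn `LabelEncoder` (the `len(label_encoder.classes_)` call in the next cell relies on the same interface), the ids can be mapped back to emotion names:" ] },
{ "cell_type": "code", "execution_count": null, "id": "e0b7d4a3-6f52-4b1d-8e94-4d5e6f7a8b9c", "metadata": {}, "outputs": [], "source": [ "# Show how many emotion classes there are, then decode the five sample labels above\n", "print(f\"Number of classes: {len(label_encoder.classes_)}\")\n", "print(label_encoder.inverse_transform(train_labels[:5]))" ] },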
\n", " \n", " \n", " [5448/5448 1:46:28, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracyF1
11.3589001.3356350.6134670.579882
20.9471001.2845740.6156710.601428
30.9704001.2978940.6170480.606042

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [5448/5448 1:35:20, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracyF1
10.9072001.3659160.6023130.595804
20.5491001.4881300.5955660.591464
30.5144001.5932860.5912970.589066

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=5448, training_loss=0.7054264770818002, metrics={'train_runtime': 5721.3012, 'train_samples_per_second': 15.23, 'train_steps_per_second': 0.952, 'total_flos': 5733080823638016.0, 'train_loss': 0.7054264770818002, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = train_model(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " val_dataset=test_dataset,\n", " compute_metrics=compute_metrics\n", ")\n", "\n", "# Begin training\n", "trainer.train()" ] }, { "cell_type": "markdown", "id": "020729b6-c545-42ba-bd2c-00ee5f9bbb80", "metadata": {}, "source": [ "### Save both model weights and tokenizer files for future inference or deployment." ] }, { "cell_type": "code", "execution_count": 23, "id": "5f12aedb-b3f8-4a1b-8e1f-6a68eb29933f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('../outputs/model/tokenizer_config.json',\n", " '../outputs/model/special_tokens_map.json',\n", " '../outputs/model/vocab.txt',\n", " '../outputs/model/added_tokens.json')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from pathlib import Path\n", "model_path = Path(\"..\") / \"outputs\" / \"model\"\n", "model.save_pretrained(model_path)\n", "tokenizer.save_pretrained(model_path)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }