{ "cells": [ { "cell_type": "markdown", "id": "80f816c1-0839-41cb-847b-c79a62ca1465", "metadata": {}, "source": [ "### Load all required modules for loading data, model setup, training, and metric evaluation" ] }, { "cell_type": "code", "execution_count": 1, "id": "2554d05b-f08a-4c21-953f-4f507407e426", "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), \"..\", \"src\")))\n", "from data_loader import load_and_prepare_data \n", "from model import get_model, get_tokenizer \n", "from train import get_training_args, train_model \n", "from evaluate import compute_metrics \n", "from torch.utils.data import Dataset \n", "import torch" ] }, { "cell_type": "markdown", "id": "3bfbb706-4b0b-43de-a95a-884d46343668", "metadata": {}, "source": [ "### Define a class that wraps tokenized data and labels for Hugging Face’s Trainer to use" ] }, { "cell_type": "code", "execution_count": 3, "id": "c814c354-7962-4a2d-b7bd-5c498f1d004e", "metadata": {}, "outputs": [], "source": [ "class EmotionDataset(Dataset):\n", " def __init__(self, encodings, labels):\n", " self.encodings = encodings # BERT tokenized inputs (input_ids, attention_mask)\n", " self.labels = labels # Encoded labels (integers)\n", "\n", " def __len__(self):\n", " return len(self.labels) # Total number of samples\n", "\n", " def __getitem__(self, idx):\n", " # Return dictionary of input tensors + label tensor for a single sample\n", " return {\n", " key: torch.tensor(val[idx]) for key, val in self.encodings.items()\n", " } | {\"labels\": torch.tensor(self.labels[idx])}" ] }, { "cell_type": "markdown", "id": "f9b87257-f0c0-4532-9eee-939d8747ef79", "metadata": {}, "source": [ "### Load the dataset from Hugging Face, clean and encode it, then tokenize it using the BERT tokenizer." ] }, { "cell_type": "code", "execution_count": 5, "id": "18e312be-5863-4e24-900a-843e42e145cc", "metadata": {}, "outputs": [], "source": [ "# Load train/test splits and label encoder\n", "train_texts, test_texts, train_labels, test_labels, label_encoder = load_and_prepare_data()\n", "\n", "# Load BERT tokenizer\n", "tokenizer = get_tokenizer()\n", "\n", "# Tokenize training and testing texts with truncation and padding\n", "train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)\n", "test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)\n", "\n", "# Wrap the tokenized data into EmotionDataset objects\n", "train_dataset = EmotionDataset(train_encodings, train_labels)\n", "test_dataset = EmotionDataset(test_encodings, test_labels)" ] }, { "cell_type": "markdown", "id": "66b99b4e-5297-4bc0-8cfb-20dbe22526c0", "metadata": {}, "source": [ "### Samples from the dataset" ] }, { "cell_type": "code", "execution_count": 7, "id": "35db4426-db21-4438-ba0e-ebb51d52edfb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample 1\n", "Text: i'd just feel less out of place, i guess. my sa makes me feel like i'm so behind my peers in terms of a social life\n", "Label (encoded): 9\n", "\n", "Sample 2\n", "Text: i love the lady in the green jacket chasing after the second car looking back at the first car like \"look what you did\"\n", "Label (encoded): 18\n", "\n", "Sample 3\n", "Text: man. really bad last possession there. 
{ "cell_type": "markdown", "id": "66b99b4e-5297-4bc0-8cfb-20dbe22526c0", "metadata": {}, "source": [ "### Samples from the dataset" ] },
{ "cell_type": "code", "execution_count": 7, "id": "35db4426-db21-4438-ba0e-ebb51d52edfb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample 1\n", "Text: i'd just feel less out of place, i guess. my sa makes me feel like i'm so behind my peers in terms of a social life\n", "Label (encoded): 9\n", "\n", "Sample 2\n", "Text: i love the lady in the green jacket chasing after the second car looking back at the first car like \"look what you did\"\n", "Label (encoded): 18\n", "\n", "Sample 3\n", "Text: man. really bad last possession there. bummer.\n", "Label (encoded): 10\n", "\n", "Sample 4\n", "Text: never would’ve guessed that one.\n", "Label (encoded): 20\n", "\n", "Sample 5\n", "Text: i wasn’t even expecting the reply that’s why i’m literally bamboozled.\n", "Label (encoded): 27\n", "\n" ] } ], "source": [ "for i in range(5):\n", "    print(f\"Sample {i+1}\")\n", "    print(f\"Text: {train_texts[i]}\")\n", "    print(f\"Label (encoded): {train_labels[i]}\")\n", "    print()" ] },
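{ "cell_type": "markdown", "id": "d9a6c3f2-5e41-4a0c-9d83-3c4d5e6f7a8b", "metadata": {}, "source": [ "### Decode the numeric labels\n", "\n", "The encoded labels printed above (9, 18, 10, ...) are opaque on their own. Assuming `load_and_prepare_data()` returns a fitted scikit-learn `LabelEncoder` (the `len(label_encoder.classes_)` call in the next cell relies on the same interface), the ids can be mapped back to emotion names:" ] },
{ "cell_type": "code", "execution_count": null, "id": "e0b7d4a3-6f52-4b1d-8e94-4d5e6f7a8b9c", "metadata": {}, "outputs": [], "source": [ "# Show how many emotion classes there are, then decode the five sample labels above\n", "print(f\"Number of classes: {len(label_encoder.classes_)}\")\n", "print(label_encoder.inverse_transform(train_labels[:5]))" ] },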
\n", " \n", " \n", " [5448/5448 1:46:28, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracyF1
11.3589001.3356350.6134670.579882
20.9471001.2845740.6156710.601428
30.9704001.2978940.6170480.606042

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [5448/5448 1:35:20, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation LossAccuracyF1
10.9072001.3659160.6023130.595804
20.5491001.4881300.5955660.591464
30.5144001.5932860.5912970.589066

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=5448, training_loss=0.7054264770818002, metrics={'train_runtime': 5721.3012, 'train_samples_per_second': 15.23, 'train_steps_per_second': 0.952, 'total_flos': 5733080823638016.0, 'train_loss': 0.7054264770818002, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = train_model(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " val_dataset=test_dataset,\n", " compute_metrics=compute_metrics\n", ")\n", "\n", "# Begin training\n", "trainer.train()" ] }, { "cell_type": "markdown", "id": "020729b6-c545-42ba-bd2c-00ee5f9bbb80", "metadata": {}, "source": [ "### Save both model weights and tokenizer files for future inference or deployment." ] }, { "cell_type": "code", "execution_count": 23, "id": "5f12aedb-b3f8-4a1b-8e1f-6a68eb29933f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('../outputs/model/tokenizer_config.json',\n", " '../outputs/model/special_tokens_map.json',\n", " '../outputs/model/vocab.txt',\n", " '../outputs/model/added_tokens.json')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from pathlib import Path\n", "model_path = Path(\"..\") / \"outputs\" / \"model\"\n", "model.save_pretrained(model_path)\n", "tokenizer.save_pretrained(model_path)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }