In [26]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

In [27]:
%%capture

import torch
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')
from torch import nn
from transformers import TrainingArguments
from transformers import Trainer
##others
import warnings
warnings.filterwarnings("ignore")
import os
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)


In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing Dataset

In [29]:
data_path= '/content/drive/MyDrive/deep-learning/clean_copy.csv'

In [30]:
data= pd.read_csv(data_path)

In [31]:
data.head()

Unnamed: 0.1,Unnamed: 0,clean_tweet,label,agreement
0,0,amp big homie meanboy stegman st,0.0,1.0
1,1,im thinking devoting career proving autism isn...,1.0,1.0
2,2,vaccines vaccinate child,-1.0,1.0
3,3,mean immunize kid something wont secretly kill...,-1.0,1.0
4,4,thanks catch performing la nuit nyc st ave sho...,0.0,1.0


In [32]:
##dropping the Unnamed: 0 column

data.isna().sum()

Unnamed: 0      0
clean_tweet    29
label           0
agreement       0
dtype: int64

In [33]:
data[data["clean_tweet"].isnull()]

Unnamed: 0.1,Unnamed: 0,clean_tweet,label,agreement
444,444,,0.0,1.0
1523,1523,,0.0,1.0
2155,2155,,0.0,1.0
2515,2515,,0.0,1.0
3062,3062,,0.0,0.666667
3204,3204,,0.0,1.0
3819,3819,,1.0,0.666667
4631,4631,,0.0,1.0
4638,4638,,0.0,1.0
4770,4770,,0.0,1.0


In [34]:
##dropping all missing values and

data= data.dropna()
data= data.drop("Unnamed: 0", axis=1)

In [35]:
##before splitting I will convert each tweet row to a tuple since that't the acceptable format

data['clean_tweet'] = data['clean_tweet'].apply(lambda tweet: tuple(tweet.split(),))

## ii. Data Splitting

In [36]:
train_set, eval_set= train_test_split(data, test_size= 0.2, stratify= data["label"])

In [37]:
train_set

Unnamed: 0,clean_tweet,label,agreement
2671,"(heres, breakdown, measles, cases, dallas, cou...",0.0,1.000000
1078,"(worried, cdc, felt, need, produce, clinical, ...",-1.0,0.333333
7225,"(people, chose, raise, kids, public, matter, p...",1.0,0.666667
7673,"(vaccinate, dogs, children, huh, ok, ok, im, d...",1.0,1.000000
1386,"(flu, now, widespread, county, health, says, t...",1.0,0.666667
...,...,...,...
4667,"(thanks, dr, offit, enlighting, talk, measles,...",0.0,1.000000
6099,"(measles, case, confirmed, dc)",0.0,1.000000
5913,"(kisses, blown, kisses, wasted, kisses, arent,...",0.0,0.666667
7311,"(vaccines, dont, cause, autism, think, please,...",1.0,1.000000


In [38]:

train_set.to_csv("/content/train_set.csv")
eval_set.to_csv("/content/eval_set.csv")

## iii. Loading Datasets using Load_dataset

In [39]:
dataset= load_dataset("csv", data_files={"train_set":"train_set.csv", "eval_set":"eval_set.csv" }, encoding= "ISO-8859-1")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-128b327d42b4e0c1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-128b327d42b4e0c1/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'clean_tweet', 'label', 'agreement'],
        num_rows: 7976
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'clean_tweet', 'label', 'agreement'],
        num_rows: 1994
    })
})

###. Tokenization

In [41]:
tokenizer= AutoTokenizer.from_pretrained("roberta-base")

In [42]:
## our labels are-1, 0, 1 and we will like to transform them into 0,1,2 respectively

def transform_labels(input):
  label= input["label"]
  num =0

  if label== -1:
    num= 0
  elif label== 0:
    num =1
  elif label == 1:
    num = 2
  return {"labels": num}

def tokenize(example):
  return tokenizer(example["clean_tweet"], padding= "max_length", truncation=True, return_tensors= "pt")


In [43]:
dataset= dataset.map(tokenize, batched= True)
remove_columns= ['Unnamed: 0', 'clean_tweet', 'label', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/7976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

Map:   0%|          | 0/7976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

In [44]:
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7976
    })
    eval_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1994
    })
})

## ii.Modelling

In [45]:
model= AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels= 3)

In [46]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  return {"f1": f1}

In [47]:
batch_size= 16

In [48]:


training_args = TrainingArguments( output_dir="Finetuned-Roberta-Base-Sentiment-classifier",
   num_train_epochs=3, load_best_model_at_end=True,evaluation_strategy="steps",save_strategy="steps",push_to_hub=True

)

In [49]:
train_dataset= dataset['train_set'].shuffle(seed=10)
eval_dataset= dataset['eval_set'].shuffle(seed=10)

In [50]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer= Trainer(
    model= model,
      args= training_args,
      train_dataset= train_dataset,
      eval_dataset= eval_dataset,
      tokenizer= tokenizer,
      compute_metrics=compute_metrics

)



In [None]:
trainer.train()

Step,Training Loss,Validation Loss,F1
500,0.8752,0.806804,0.641171
1000,0.831,0.81825,0.642111
1500,0.859,0.800891,0.64305
2000,0.7984,0.809756,0.640259
2500,0.7948,0.810039,0.640747


TrainOutput(global_step=2991, training_loss=0.8307910690894298, metrics={'train_runtime': 2760.4432, 'train_samples_per_second': 8.668, 'train_steps_per_second': 1.084, 'total_flos': 6295777859395584.0, 'train_loss': 0.8307910690894298, 'epoch': 3.0})

In [None]:
trainer.push_to_hub()

Upload file runs/Jul23_09-24-00_b1c6e973c4b5/events.out.tfevents.1690104246.b1c6e973c4b5.287.5:   0%|         …

   f7cd78a..91750a0  main -> main

   91750a0..eb70e66  main -> main



'https://huggingface.co/gArthur98/Finetuned-Roberta-Base-Sentiment-classifier/commit/91750a00c5f8f9bb9e5eb877bc8994eb45878ed7'

## handling imbalance

In [53]:
class_weights= (1-(data["label"].value_counts().sort_index() /len(data))).values
class_weights

array([0.89618857, 0.50992979, 0.59388164])

In [54]:
class_weights= torch.from_numpy(class_weights).float().to("cuda")

In [55]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs["labels"]
        inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits.float()
        labels = labels
        loss_func = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [56]:

weight_training_args = TrainingArguments(

   output_dir="Roberta-classweight-Sentiment-classifier",
   num_train_epochs=3, load_best_model_at_end=True, weight_decay=0.01, evaluation_strategy="steps",save_strategy="steps",push_to_hub=True

)

In [57]:
train_dataset= dataset['train_set'].shuffle(seed=12)
eval_dataset= dataset['eval_set'].shuffle(seed=12)

In [58]:
class_trainer = WeightedLossTrainer(
      model= model,
      args= weight_training_args,
      train_dataset= train_dataset,
      eval_dataset= eval_dataset,
      tokenizer= tokenizer,
      compute_metrics=compute_metrics )



In [59]:
class_trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,F1
500,0.9827,0.889014,0.63951
1000,0.9185,0.870799,0.644917
1500,0.8998,0.86729,0.644889
2000,0.8792,0.864813,0.644889
2500,0.8877,0.865559,0.644889


TrainOutput(global_step=2991, training_loss=0.9072929134898184, metrics={'train_runtime': 2802.765, 'train_samples_per_second': 8.537, 'train_steps_per_second': 1.067, 'total_flos': 6295777859395584.0, 'train_loss': 0.9072929134898184, 'epoch': 3.0})

In [60]:
class_trainer.push_to_hub()

Upload file runs/Jul23_16-00-42_e475fb65e51d/events.out.tfevents.1690128053.e475fb65e51d.304.0:   0%|         …

   ae2ba44..1fdde58  main -> main

   1fdde58..939b0c0  main -> main



'https://huggingface.co/gArthur98/Roberta-classweight-Sentiment-classifier/commit/1fdde585cadbb71c7c5998fb930a2ce40b9a1ad9'

In [61]:
class_trainer.evaluate()

{'eval_loss': 0.8648133277893066,
 'eval_f1': 0.6448887733504817,
 'eval_runtime': 63.249,
 'eval_samples_per_second': 31.526,
 'eval_steps_per_second': 3.953,
 'epoch': 3.0}