File size: 1,384 Bytes
0b6b733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

def load_and_prepare_data(
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    dataset_name: str = "go_emotions",
    split: str = "train",
):
    """Load an emotion dataset and return a stratified train/test split.

    The requested split is fetched with ``load_dataset``, reduced to the
    examples that carry exactly one label, lower-cased, and then divided
    into train and test portions with ``train_test_split``.

    Args:
        test_size: Passed to ``train_test_split`` as the size of the test
            portion. Defaults to ``0.2``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible. Defaults to ``42``.
        dataset_name: Dataset identifier handed to ``load_dataset``.
            Defaults to ``"go_emotions"``.
        split: Name of the dataset split to read. Defaults to ``"train"``.

    Returns:
        A 5-tuple ``(train_texts, test_texts, train_labels, test_labels,
        label_encoder)``. The text entries are lists of lower-cased strings,
        the label entries are lists of integer ids produced by
        ``label_encoder``, and ``label_encoder`` is the fitted
        ``LabelEncoder`` (use ``inverse_transform`` to recover emotion names).

    Raises:
        ValueError: Propagated from ``train_test_split`` when the data
            cannot be stratified (for example, a class with a single
            example).
    """
    source_split = load_dataset(dataset_name)[split]

    # Work on a DataFrame for convenient column-wise manipulation.
    df = pd.DataFrame(source_split)

    # Simplification: keep only the examples annotated with exactly one label.
    has_single_label = df["labels"].map(len) == 1
    df = df[has_single_label].reset_index(drop=True)

    # Every remaining row has a one-element list; unwrap it to a scalar id.
    df["label"] = [labels[0] for labels in df["labels"]]

    # The only text normalisation applied is lower-casing.
    df["clean_text"] = df["text"].str.lower()

    # Translate the dataset's integer ids into their human-readable names.
    label_names = source_split.features["labels"].feature.names
    df["emotion"] = [label_names[label_id] for label_id in df["label"]]

    # Re-encode the names for training. NOTE: LabelEncoder derives ids from
    # the sorted set of names it is fitted on, so these ids need not match the
    # dataset's original integer ids; always decode through the returned
    # encoder.
    label_encoder = LabelEncoder()
    df["label_encoded"] = label_encoder.fit_transform(df["emotion"])

    # Stratify so both portions keep the same class proportions.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["clean_text"].tolist(),
        df["label_encoded"].tolist(),
        test_size=test_size,
        stratify=df["label_encoded"],
        random_state=random_state,
    )

    return train_texts, test_texts, train_labels, test_labels, label_encoder