# emotion-classifier / src / data_loader.py
# Ashwin B
# Move project to Hugging Space
# 0b6b733
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
def load_and_prepare_data(*, test_size=0.2, random_state=42):
    """Load GoEmotions and build a stratified single-label train/test split.

    Only the dataset's "train" split is used. Examples carrying more than
    one label are dropped (a deliberate simplification), the text is
    lower-cased, and the emotion names are re-encoded with a fresh
    ``LabelEncoder``. The returned label ids are therefore the ones
    assigned by that encoder and need not match the dataset's own ids;
    use the returned encoder to map ids back to emotion names.

    Args:
        test_size: Forwarded to ``train_test_split``; with the default
            ``0.2`` a fifth of the examples is held out for testing.
        random_state: Seed forwarded to ``train_test_split`` so the
            split is reproducible.

    Returns:
        A tuple ``(train_texts, test_texts, train_labels, test_labels,
        label_encoder)`` where the texts are lists of lower-cased
        strings, the labels are lists of encoded ids, and
        ``label_encoder`` is the fitted ``LabelEncoder``.
    """
    # Load dataset directly from Hugging Face.
    raw_dataset = load_dataset("go_emotions")
    train_split = raw_dataset["train"]

    # Convert via the Arrow-backed exporter instead of iterating the
    # dataset row by row through the DataFrame constructor.
    df = train_split.to_pandas()

    # Keep only examples with exactly one label (simplification), then
    # unwrap that single label from its container.
    is_single_label = df["labels"].map(len) == 1
    df = df[is_single_label].reset_index(drop=True)
    df["label"] = df["labels"].map(lambda labels: labels[0])

    # Minimal text normalisation: lower-casing only.
    df["clean_text"] = df["text"].str.lower()

    # Map integer label ids to emotion names using the dataset's own
    # label list.
    label_names = train_split.features["labels"].feature.names
    df["emotion"] = df["label"].map(lambda label_id: label_names[label_id])

    # Encode emotion names for training.
    label_encoder = LabelEncoder()
    df["label_encoded"] = label_encoder.fit_transform(df["emotion"])

    # Stratify on the encoded label so both partitions keep the same
    # class proportions.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["clean_text"].tolist(),
        df["label_encoded"].tolist(),
        test_size=test_size,
        stratify=df["label_encoded"],
        random_state=random_state,
    )
    return train_texts, test_texts, train_labels, test_labels, label_encoder