Spaces:
Sleeping
Sleeping
File size: 1,384 Bytes
0b6b733 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
def load_and_prepare_data(*, test_size: float = 0.2, random_state: int = 42):
    """Load GoEmotions and return a stratified single-label train/test split.

    The "train" split of the ``go_emotions`` dataset is loaded, reduced to
    the examples that carry exactly one label, lower-cased, and then split
    with ``sklearn.model_selection.train_test_split``.

    Args:
        test_size: Forwarded to ``train_test_split``; the default of 0.2
            holds out 20% of the examples for testing.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible between runs.

    Returns:
        A 5-tuple ``(train_texts, test_texts, train_labels, test_labels,
        label_encoder)``. The text items are lists of lower-cased strings,
        the label items are lists of ids produced by ``label_encoder``, and
        ``label_encoder`` is the ``LabelEncoder`` fitted on the emotion
        names, so callers can map ids back to names.
    """
    # Load dataset directly from Hugging Face.
    raw_dataset = load_dataset("go_emotions")
    train_split = raw_dataset["train"]

    # Convert to a DataFrame for easy manipulation.
    df = pd.DataFrame(train_split)

    # Simplification: keep only examples with exactly one label, so the task
    # becomes plain multi-class classification instead of multi-label.
    has_single_label = df["labels"].apply(lambda x: len(x) == 1)
    df = df[has_single_label].reset_index(drop=True)

    # Every remaining label list has length 1; unwrap it to a scalar id.
    df["label"] = df["labels"].apply(lambda x: x[0])

    # The only text cleaning applied is lower-casing.
    df["clean_text"] = df["text"].str.lower()

    # Map the integer label id to its string name using the dataset's own
    # label list.
    label_names = train_split.features["labels"].feature.names
    df["emotion"] = df["label"].apply(lambda x: label_names[x])

    # Re-encode emotion names for training. The encoder is fitted on the
    # names present after filtering, so its ids are not guaranteed to equal
    # the dataset's original ids; it is returned for the reverse mapping.
    label_encoder = LabelEncoder()
    df["label_encoded"] = label_encoder.fit_transform(df["emotion"])

    # Stratify on the encoded label so train and test keep the same class
    # proportions.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["clean_text"].tolist(),
        df["label_encoded"].tolist(),
        test_size=test_size,
        stratify=df["label_encoded"],
        random_state=random_state,
    )
    return train_texts, test_texts, train_labels, test_labels, label_encoder
|