Spaces:

ashbeexd
/

emotion-classifier

Sleeping

emotion-classifier / src /data_loader.py

Ashwin B

Move project to Hugging Space

0b6b733 about 1 month ago

1.38 kB

	from datasets import load_dataset
	from sklearn.preprocessing import LabelEncoder
	from sklearn.model_selection import train_test_split
	import pandas as pd

	def load_and_prepare_data():
	# Load dataset directly from Hugging Face
	raw_dataset = load_dataset("go_emotions")

	# Convert to DataFrame for easy manipulation
	df = pd.DataFrame(raw_dataset["train"])

	# Grab only examples with a single label (simplification)
	df = df[df["labels"].apply(lambda x: len(x) == 1)].reset_index(drop=True)

	# Keep just the first label in list
	df["label"] = df["labels"].apply(lambda x: x[0])

	# Clean text if needed
	df["clean_text"] = df["text"].str.lower()

	# Map integer label to string label using Hugging Face's label list
	label_names = raw_dataset["train"].features["labels"].feature.names
	df["emotion"] = df["label"].apply(lambda x: label_names[x])

	# Encode emotion names for training
	label_encoder = LabelEncoder()
	df["label_encoded"] = label_encoder.fit_transform(df["emotion"])

	# Split into train/test
	train_texts, test_texts, train_labels, test_labels = train_test_split(
	df["clean_text"].tolist(),
	df["label_encoded"].tolist(),
	test_size=0.2,
	stratify=df["label_encoded"],
	random_state=42
	)

	return train_texts, test_texts, train_labels, test_labels, label_encoder