# -*- coding: utf-8 -*-
"""Emotion Detection NLP Mental Health
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
"""
'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')
print('Data source import complete.')'''
"""# Introduction
In recent years, mental health awareness has grown, leading to a greater emphasis on making support more accessible to everyone. Artificial Intelligence (AI) is playing a pivotal role in bridging the gap between those in need of mental health advice and the limited number of qualified professionals available. The dataset provided in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.
The dataset used in this project consists of anonymized conversations between patients and experienced psychologists, where we will concentrate on detecting the emotional context of the dialogue. By understanding the emotions present in these exchanges, the NLP model will be able to respond more appropriately and offer tailored advice based on the patient's emotional state.
## Purpose
The notebook will explore, preprocess, and model the data with the goal of improving emotion detection in patient conversations. This will allow us to understand the emotional landscape of mental health discussions and create AI systems capable of providing emotionally aware responses.
# Libraries
"""
'''# Downloading and extracting data from Kaggle
import os
import zipfile'''
# Data Preprocessing
import string
import re
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
# Label Encoding
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud
# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Download necessary NLTK resources (nltk is already imported above)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  # WordNet corpus for lemmatization
"""# Data
## Download and Extracting
"""
'''# Download the dataset using the Kaggle API
os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")
# Extract the downloaded zip file
dataset_zip = 'nlp-mental-health-conversations.zip'
extracted_folder = 'nlp_mental_health_conversations'
# Extract the dataset
with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)
print("Dataset downloaded and extracted successfully.")'''
"""## Explore Data"""
data = pd.read_csv("train.csv")
data.head()
response = data.loc[0, "Response"]
print("Length before text preprocessing:", len(response))
"""## Text Preprocessing
- Normlaize
- Punctuation
- Numbers
- StopWords
- Lemmezation
- Removing Words
"""
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    # Convert to string
    text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    return ' '.join(tokens)
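# Quick sanity check of clean_text on a made-up example sentence
# (an added illustration; the input is not taken from the dataset):
sample = "I haven't slept for 3 days and I'm feeling overwhelmed!"
print(clean_text(sample))  # expected output along the lines of: "havent slept day im feeling overwhelmed"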
# Apply the clean_text function to your 'Context' column
data['Context'] = data['Context'].apply(clean_text)
# Remove rare words:
# Drop words that appear only once in the whole corpus (frequency <= 1)
temp_Context = pd.Series(' '.join(data['Context']).split()).value_counts()
drops = temp_Context[temp_Context <= 1]
data['Context'] = data['Context'].apply(lambda text: " ".join(word for word in text.split() if word not in drops))
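# Optional sanity check (an added illustration, not part of the original
# pipeline): see how much the vocabulary shrank after dropping rare words.
remaining_vocab = pd.Series(' '.join(data['Context']).split()).nunique()
print(f"Rare words dropped: {len(drops)}; remaining vocabulary size: {remaining_vocab}")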
"""## Text visualization"""
tf_Context = data["Context"].apply(lambda x: pd.value_counts(x.split(" "))).sum(axis=0).reset_index()
tf_Context.columns = ["words", "tf"]
tf_Context.sort_values("tf", ascending=False)
# Barplot for Context
tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
plt.show()
"""# Emotions Anaylsis"""
from transformers import pipeline
# Extract and clean 'Context' column
contexts = data['Context']
# Load a pre-trained emotion detection model
emotion_model = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')
# Analyze emotions in 'Context'; truncate inputs beyond the model's 512-token limit
emotions = contexts.apply(lambda x: emotion_model(x, truncation=True)[0]['label'])
# Add detected emotions as a new column
data['Detected_Emotion'] = emotions
data.head()
data['Detected_Emotion'].value_counts()
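# A quick look at the class balance of the detected emotions (an added
# illustration; the labels come from the j-hartmann model's emotion set):
data['Detected_Emotion'].value_counts().plot.bar()
plt.title('Detected emotion distribution')
plt.show()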
"""# Feature Extraction"""
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the data
tfidf_matrix = vectorizer.fit_transform(contexts)
# Convert to a dense array (optional: RandomForestClassifier also accepts
# the sparse matrix directly, which saves memory on large corpora)
tfidf_array = tfidf_matrix.toarray()
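# Illustrative check (added): inspect the dimensionality of the TF-IDF
# feature space and a few of the learned vocabulary terms.
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Sample features:", vectorizer.get_feature_names_out()[:10])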
"""# Model
## Data Spilting
"""
from sklearn.model_selection import train_test_split
# Split the data (stratified so both splits keep the same emotion class balance)
X_train, X_test, y_train, y_test = train_test_split(tfidf_array, data['Detected_Emotion'], test_size=0.3, random_state=42, stratify=data['Detected_Emotion'])
from sklearn.ensemble import RandomForestClassifier
# Initialize the model
model = RandomForestClassifier()
"""## Fine Tuning"""
from sklearn.model_selection import GridSearchCV
# Define the parameter grid
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [None, 10, 20, 30]
}
# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
# Best parameters
print(f'Best parameters: {grid_search.best_params_}')
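# Also report the best cross-validated accuracy for context (added line)
print(f'Best CV accuracy: {grid_search.best_score_:.3f}')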
"""# Train and Evaluation
## Train
"""
# Train the model using the best parameters found by the grid search
model = RandomForestClassifier(**grid_search.best_params_)
model.fit(X_train, y_train)
"""## Evaluation"""
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Generate confusion matrix with an explicit label order so the heatmap
# ticks match the matrix rows and columns
labels = model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
print("Confusion Matrix:")
print(conf_matrix)
# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
"""## Test Unseen Data"""
# Example new text
new_text = ["let's leave i am scared"]
# Clean and transform the new text
new_text_cleaned = [clean_text(text) for text in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)
# Predict emotion
predicted_emotion = model.predict(new_text_tfidf)
print(predicted_emotion)
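# A small convenience wrapper (an added sketch; predict_emotion is a
# hypothetical helper, not part of the original script) bundling the
# cleaning, vectorizing, and prediction steps for a single input.
def predict_emotion(text):
    """Return the predicted emotion label and the model's confidence."""
    cleaned = clean_text(text)
    features = vectorizer.transform([cleaned])
    probabilities = model.predict_proba(features)[0]
    best = probabilities.argmax()
    return model.classes_[best], probabilities[best]
label, confidence = predict_emotion("let's leave i am scared")
print(f"Predicted: {label} (confidence: {confidence:.2f})")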