Spaces:
Running
Running
File size: 8,673 Bytes
276fedc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 |
# -*- coding: utf-8 -*-
"""Emotion Detection NLP Mental Health
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
"""
'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')
print('Data source import complete.')'''
"""# Introduction
In recent years, mental health awareness has grown, leading to a greater emphasis on making support more accessible to everyone. Artificial Intelligence (AI) is playing a pivotal role in bridging the gap between those in need of mental health advice and the limited number of qualified professionals available. The dataset provided in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.
The dataset used in this project consists of anonymized conversations between patients and experienced psychologists, where we will concentrate on detecting the emotional context of the dialogue. By understanding the emotions present in these exchanges, the NLP model will be able to respond more appropriately and offer tailored advice based on the patient's emotional state.
## Purpose
The notebook will explore, preprocess, and model the data with the goal of improving emotion detection in patient conversations. This will allow us to understand the emotional landscape of mental health discussions and create AI systems capable of providing emotionally aware responses.
# Libraries
"""
'''#Download and Extracting Data from Kaggle
import os
import zipfile'''
# Data Preprocessing
import string
import re
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
# Label Encoding
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud
#Feature Extracting
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Suppress every warning so it does not clutter the output.
filterwarnings('ignore')

# Pandas display settings: no limit on the number of columns or on column
# width, a 200-character display width, and floats shown with two decimals.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_colwidth', None),
    ('display.width', 200),
    ('display.float_format', lambda x: '%.2f' % x),
):
    pd.set_option(_option, _value)
# Download necessary NLTK resources: the tokenizer data, the stop-word
# lists, and the WordNet corpus used for lemmatization.
import nltk

for _resource in ('punkt_tab', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)
"""# Data
## Download and Extracting
"""
'''# Download the dataset using the Kaggle API
os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")
#Extract the download zip files
dataset_zip='nlp-mental-health-conversations.zip'
extracted_folder='nlp_mental_health_conversations'
#Extract the dataset
with zipfile.ZipFile(dataset_zip,'r') as zip_ref:
zip_ref.extractall(extracted_folder)
print("Dataset downloaded and extracted successfully.")'''
"""## Explore Data"""
# Load the training CSV into a DataFrame and peek at the first rows.
data = pd.read_csv("/content/train.csv")
data.head()

# Report the character length of the first raw 'Response' entry.
reponse = data.loc[0, "Response"]
print("Length Before text preprocessing : ", len(reponse))
"""## Text Preprocessing
- Normalize
- Punctuation
- Numbers
- StopWords
- Lemmatization
- Removing Words
"""
# Module-level WordNet lemmatizer; clean_text() calls its lemmatize()
# method on every token that survives stop-word removal.
lemmatizer = WordNetLemmatizer()
from functools import lru_cache

# Translation table that deletes every character in string.punctuation.
# Built once instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize a raw text value into a space-joined string of lemmas.

    The value is lowercased, stripped of punctuation and digits, tokenized,
    filtered against the English stop-word list, and lemmatized with the
    module-level ``lemmatizer``.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing text.

    Returns:
        The remaining lemmas joined by single spaces, or ``''`` when the
        input is missing or nothing is left after stripping.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase, then drop punctuation and digits.
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)

    # Nothing but whitespace left: tokenizing would yield no tokens anyway.
    if not text.strip():
        return ''

    tokens = word_tokenize(text)
    stop_words = _english_stop_words()

    # Filter stop words first, then lemmatize what remains.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )
# Clean every entry of the 'Context' column in place.
data['Context'] = data['Context'].apply(clean_text)

# Remove rare words: count each word over the whole cleaned corpus and
# collect those that occur at most once.
_corpus_words = ' '.join(data['Context']).split()
temp_Context = pd.Series(_corpus_words).value_counts()
drops = temp_Context[temp_Context <= 1]


def _without_rare_words(sentence):
    """Return *sentence* with every word listed in ``drops`` removed."""
    # ``word in drops`` tests membership in the Series index, i.e. the words.
    kept = [word for word in sentence.split() if word not in drops]
    return " ".join(kept)


data['Context'] = data['Context'].apply(_without_rare_words)
"""## Text visualization"""
# Corpus-wide term frequencies for the cleaned contexts.
# Each context is split into words, exploded to one word per row, and
# counted in a single pass. This avoids building one value-count Series per
# document (a documents x vocabulary frame) and does not rely on the
# deprecated top-level ``pd.value_counts``. Empty contexts contribute no
# words.
_word_counts = data["Context"].str.split().explode().value_counts()
tf_Context = _word_counts.reset_index()
tf_Context.columns = ["words", "tf"]
# value_counts() returns counts in descending order, so the table is
# already sorted by frequency and needs no separate sort_values() call.

# Barplot for Context: only words that occur more than 300 times.
tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
plt.show()
"""# Emotions Anaylsis"""
from transformers import pipeline

# Cleaned 'Context' column; also the input to the TF-IDF vectorizer.
contexts = data['Context']

# Load pre-trained emotion detection model
emotion_model = pipeline(
    'sentiment-analysis',
    model='j-hartmann/emotion-english-distilroberta-base',
)


def _detect_emotion(text):
    """Return the label of the first result the pipeline reports for *text*."""
    # truncation=True clips inputs longer than the model's maximum sequence
    # length instead of letting inference fail on them.
    return emotion_model(text, truncation=True)[0]['label']


# Analyze emotions in 'Context', one text at a time
emotions = contexts.apply(_detect_emotion)

# Add detected emotions as a new column
data['Detected_Emotion'] = emotions
data.head()
data['Detected_Emotion'].value_counts()
"""# Feature Extraction"""
# Initialize the TF-IDF vectorizer with its default settings. The fitted
# instance is reused at the end of the script to encode unseen text.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the cleaned contexts and encode them in one step.
tfidf_matrix = vectorizer.fit_transform(contexts)
# Convert the result to an array; this is what train_test_split receives
# as the feature matrix.
tfidf_array = tfidf_matrix.toarray()
"""# Model
## Data Splitting
"""
from sklearn.model_selection import train_test_split

# Split the data: 30% of the samples are held out for testing, and the
# fixed random_state makes the split repeatable.
_split_options = dict(test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_array,
    data['Detected_Emotion'],
    **_split_options,
)

from sklearn.ensemble import RandomForestClassifier

# Initialize the model: an untrained random forest with default settings.
model = RandomForestClassifier()
"""## Fine Tuning"""
from sklearn.model_selection import GridSearchCV

# Define the parameter grid: candidate forest sizes and maximum tree depths.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# Perform grid search: every combination is scored by accuracy with
# 5-fold cross-validation on the training split.
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(f'Best parameters: {grid_search.best_params_}')
"""# Train and Evaluation
## Train
"""
# Use the estimator selected by the grid search instead of training a fresh
# default forest, which would discard the tuned hyperparameters.
# GridSearchCV (refit=True by default) has already refit this estimator on
# all of X_train with the best parameters, so no further fit() is needed.
model = grid_search.best_estimator_
"""## Evaluation"""
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix.
# One sorted label array, covering every class seen in either the true or
# the predicted labels, fixes the row/column order of the matrix and is
# reused for both heatmap axes. Deriving tick labels separately from y_pred
# and y_test mislabels the axes whenever a class is absent from one of them.
class_labels = np.union1d(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
"""## Test Unseen Data"""
# Example new text
new_text = ["let's leave i am scared"]

# Clean the new text with clean_text() and encode it with the fitted
# vectorizer.
new_text_cleaned = [clean_text(text) for text in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)

# Predict emotion
predicted_emotion = model.predict(new_text_tfidf)
print(predicted_emotion)