import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
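# Assumption: this app runs headless (e.g., on a Hugging Face Space), so force
# a non-interactive matplotlib backend before pyplot is imported.
import matplotlib
matplotlib.use('Agg')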
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import gradio as gr
import os
# Load the model
model_path = 'career_prediction_model.pkl'
with open(model_path, 'rb') as f:
    saved_data = pickle.load(f)

model = saved_data['model']
label_encoders = saved_data['label_encoders']
target_encoder = saved_data['target_encoder']
features = saved_data['features']

target = 'What would you like to become when you grow up'
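
# Expected layout of the pickled bundle (a sketch inferred from the keys
# above, not a guaranteed schema):
#   'model'          -> fitted scikit-learn classifier
#   'label_encoders' -> {feature column name: LabelEncoder}
#   'target_encoder' -> LabelEncoder for the target column
#   'features'       -> ordered list of feature names used at training time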
# Function for individual prediction
def predict_career(work_env, academic_perf, motivation, leadership, tech_savvy):
    # Prepare input data
    input_data = pd.DataFrame({
        'Preferred Work Environment': [work_env],
        'Academic Performance (CGPA/Percentage)': [float(academic_perf)],
        'Motivation for Career Choice ': [motivation],  # Note the trailing space in the column name
        'Leadership Experience': [leadership],
        'Tech-Savviness': [tech_savvy]
    })

    # Encode categorical features
    for feature in features:
        if feature in label_encoders and input_data[feature].dtype == 'object':
            try:
                input_data[feature] = label_encoders[feature].transform(input_data[feature])
            except ValueError:
                # Unknown category: fall back to encoded class 0
                print(f"Warning: Unknown category in {feature}. Falling back to class 0.")
                input_data[feature] = 0

    # Reorder columns to match the feature order used at training time
    input_data = input_data[features]

    # Make prediction
    prediction = model.predict(input_data)[0]
    predicted_career = target_encoder.inverse_transform([int(prediction)])[0]

    # Get probabilities for all classes, if the model supports them
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(input_data)[0]
        class_probs = {target_encoder.inverse_transform([i])[0]: prob
                       for i, prob in enumerate(probabilities)}
        sorted_probs = dict(sorted(class_probs.items(), key=lambda x: x[1], reverse=True))
        result = f"Predicted career: {predicted_career}\n\nProbabilities:\n"
        for career, prob in sorted_probs.items():
            result += f"{career}: {prob:.2f}\n"
        return result
    else:
        return f"Predicted career: {predicted_career}"
# Function for batch evaluation
def evaluate_model_with_csv(csv_file):
    try:
        # Gradio may pass a tempfile-like object or a plain path, depending on version
        csv_path = getattr(csv_file, 'name', csv_file)

        # Try common encodings until one works
        encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252', 'utf-8-sig']
        for encoding in encodings:
            try:
                test_df = pd.read_csv(csv_path, encoding=encoding)
                break
            except UnicodeDecodeError:
                if encoding == encodings[-1]:
                    return ["Error: Could not decode the CSV file with any common encoding.", None]
                continue
            except Exception as e:
                if encoding == encodings[-1]:
                    return [f"Error reading CSV: {str(e)}", None]
                continue

        # Check that all required columns exist
        missing_cols = [col for col in features + [target] if col not in test_df.columns]
        if missing_cols:
            return [f"Error: The following required columns are missing from the CSV: {missing_cols}", None]

        # Preprocess the test data
        X_eval = test_df[features].copy()

        # Handle missing values
        X_eval = X_eval.fillna('Unknown')

        # Convert Academic Performance to numeric, imputing the mean for bad values
        # (assigning the result avoids pandas' chained-assignment pitfall with inplace=True)
        perf_col = 'Academic Performance (CGPA/Percentage)'
        X_eval[perf_col] = pd.to_numeric(X_eval[perf_col], errors='coerce')
        X_eval[perf_col] = X_eval[perf_col].fillna(X_eval[perf_col].mean())

        # Encode categorical features, mapping unknown categories to class 0
        for feature in features:
            if feature in label_encoders and X_eval[feature].dtype == 'object':
                X_eval[feature] = X_eval[feature].apply(
                    lambda x: label_encoders[feature].transform([x])[0]
                    if x in label_encoders[feature].classes_ else 0
                )

        # Get the true labels; rows with a missing target default to 'Corporate Employee'
        y_true = test_df[target].copy()
        y_true = y_true.fillna('Corporate Employee')

        # Encode the true labels, mapping unseen labels to class 0
        y_true_encoded = y_true.apply(
            lambda x: target_encoder.transform([x])[0]
            if x in target_encoder.classes_ else 0
        ).values

        # Make predictions
        y_pred = model.predict(X_eval)
        y_pred = np.array(y_pred).astype(int)

        # Calculate accuracy
        accuracy = accuracy_score(y_true_encoded, y_pred)

        # Build a DataFrame of actual vs. predicted careers
        results_df = pd.DataFrame({
            'Actual Career': [target_encoder.classes_[i] for i in y_true_encoded],
            'Predicted Career': [target_encoder.classes_[i] for i in y_pred]
        })

        # Count correct predictions
        results_df['Correct'] = results_df['Actual Career'] == results_df['Predicted Career']
        correct_count = results_df['Correct'].sum()
        total_count = len(results_df)

        # Plot and save the confusion matrix
        plt.figure(figsize=(12, 10))
        cm = pd.crosstab(results_df['Actual Career'], results_df['Predicted Career'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('Actual Career')
        plt.xlabel('Predicted Career')
        plt.tight_layout()
        cm_path = 'confusion_matrix.png'
        plt.savefig(cm_path)
        plt.close()

        # Prepare the summary text
        result_text = "Model Evaluation Results:\n\n"
        result_text += f"Total samples: {total_count}\n"
        result_text += f"Correct predictions: {correct_count}\n"
        result_text += f"Accuracy: {accuracy:.4f}\n\n"

        # Generate the classification report; passing explicit labels keeps
        # target_names aligned even when some classes are absent from the test set
        report = classification_report(y_true_encoded, y_pred,
                                       labels=range(len(target_encoder.classes_)),
                                       target_names=target_encoder.classes_,
                                       output_dict=True,
                                       zero_division=0)

        # Add class-wise metrics
        result_text += "Class-wise Performance:\n"
        for class_name in target_encoder.classes_:
            if class_name in report:
                result_text += f"\n{class_name}:\n"
                result_text += f"  Precision: {report[class_name]['precision']:.4f}\n"
                result_text += f"  Recall: {report[class_name]['recall']:.4f}\n"
                result_text += f"  F1-score: {report[class_name]['f1-score']:.4f}\n"

        return [result_text, cm_path]

    except Exception as e:
        import traceback
        print(f"Error in evaluation: {str(e)}\n{traceback.format_exc()}")
        # Render the error message as an image so the image output slot is filled
        plt.figure(figsize=(6, 4))
        plt.text(0.5, 0.5, f"Error: {str(e)}",
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=12, color='red')
        plt.axis('off')
        error_path = 'error_image.png'
        plt.savefig(error_path)
        plt.close()
        return [f"Error: {str(e)}", error_path]
# Get unique values for dropdowns
work_env_options = list(label_encoders['Preferred Work Environment'].classes_)
motivation_options = list(label_encoders['Motivation for Career Choice '].classes_)  # key includes the trailing space
leadership_options = list(label_encoders['Leadership Experience'].classes_)
tech_savvy_options = list(label_encoders['Tech-Savviness'].classes_)
# Create the Gradio interface
iface = gr.Interface(
    fn=predict_career,
    inputs=[
        gr.Dropdown(work_env_options, label="Preferred Work Environment"),
        gr.Number(label="Academic Performance (CGPA/Percentage)", minimum=0, maximum=10),
        gr.Dropdown(motivation_options, label="Motivation for Career Choice"),
        gr.Dropdown(leadership_options, label="Leadership Experience"),
        gr.Dropdown(tech_savvy_options, label="Tech-Savviness")
    ],
    outputs="text",
    title="Career Prediction Model",
    description="Enter your details to predict your future career path",
    theme="huggingface"
)
# Create a separate interface for model evaluation
eval_iface = gr.Interface(
    fn=evaluate_model_with_csv,
    inputs=gr.File(label="Upload Test CSV File"),
    outputs=[
        gr.Textbox(label="Evaluation Results"),
        gr.Image(label="Confusion Matrix")
    ],
    title="Career Prediction Model Evaluation",
    description="Upload a CSV file with test data to evaluate the model's performance",
    theme="huggingface"
)
# Create a tabbed interface
demo = gr.TabbedInterface(
    [iface, eval_iface],
    ["Individual Prediction", "Batch Evaluation"]
)
# Launch the interface
demo.launch()
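# For local debugging you could pass explicit options instead, e.g.
#   demo.launch(server_name="0.0.0.0", server_port=7860)  # assumed values; adjust as needed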