In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found beneath the input
# directory, then print one path per line in os.walk traversal order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/disease-and-symptoms-dataset/DiseaseAndSymptoms.csv
/kaggle/input/disease-and-symptoms-dataset/Disease precaution.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Load the dataset
df = pd.read_csv('/kaggle/input/disease-and-symptoms-dataset/DiseaseAndSymptoms.csv')

# Step 2: Combine all symptom columns into a single list per row.
symptom_columns = [col for col in df.columns if 'Symptom' in col]


def _clean_symptoms(row):
    """Return the non-missing symptoms of one row with surrounding whitespace removed.

    Stripping keeps values that differ only by padding (e.g. " itching" vs
    "itching") from being one-hot encoded as two separate features; entries
    that are empty after stripping are dropped.
    """
    stripped = (str(value).strip() for value in row if pd.notna(value))
    return [symptom for symptom in stripped if symptom]


df['Symptoms'] = df[symptom_columns].apply(_clean_symptoms, axis=1)

# Keep only the target and the combined symptom list.
df = df[['Disease', 'Symptoms']]

# Step 3: One-hot encode the symptom lists (one column per distinct symptom).
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['Symptoms'])  # Features: Symptoms
y = df['Disease']  # Target: Disease

# Step 4: Split into training and testing sets.
# Stratify on the disease label so every disease is represented in both
# splits: the training cell fits its LabelEncoder on y_train only, and
# transforming y_test fails for a label that never appeared in y_train.
# Stratification needs at least two rows per class, so fall back to a plain
# random split when that does not hold.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)
assert len(X_train) + len(X_test) == len(df), "split lost or duplicated rows"

# NOTE(review): if the CSV contains repeated rows, identical samples can land
# in both splits and inflate the test accuracy -- TODO confirm and consider
# de-duplicating before splitting.

# Step 5: Save processed data for the training cell.
pd.DataFrame(X_train, columns=mlb.classes_).to_csv('X_train.csv', index=False)
pd.DataFrame(X_test, columns=mlb.classes_).to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("Data Preprocessing Completed Successfully!")

Data Preprocessing Completed Successfully!


In [5]:
import xgboost as xgb
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import pickle  # used below to persist the fitted label encoder

# Load the preprocessed feature matrices and labels written by the
# preprocessing cell.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()  # Convert to 1D array
y_test = pd.read_csv('y_test.csv').values.ravel()

# Step 1: Encode disease labels into integers.
# The encoder is fitted on the training labels only, so transform() raises a
# ValueError if y_test contains a disease that never appeared in y_train.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 2: Initialize XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Step 3: Train the model
model.fit(X_train, y_train_encoded)

# Step 4: Predict on test set (predictions are encoded class indices)
y_pred = model.predict(X_test)

# Step 5: Evaluate accuracy against the encoded test labels
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Step 6: Save the trained model and label encoder.
model.save_model("symptom_disease_model.json")
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Model Training Completed and Saved!")


Model Accuracy: 1.00
Model Training Completed and Saved!


In [11]:
import pandas as pd

# Read the disease -> precautions table.
precaution_df = pd.read_csv('/kaggle/input/disease-and-symptoms-dataset/Disease precaution.csv')

# Build a lookup keyed by the normalized (stripped, lower-cased) disease name;
# each value is that row's non-missing precautions in column order.
# When the same normalized name occurs more than once, the last row wins.
precaution_columns = [f'Precaution_{i}' for i in range(1, 5)]
precaution_dict = {
    record['Disease'].strip().lower(): [
        record[column] for column in precaution_columns if pd.notna(record[column])
    ]
    for record in precaution_df.to_dict('records')
}

# Function to Get Precautions
def get_precautions(disease_name):
    """Return the precautions recorded for ``disease_name``.

    The name is stripped and lower-cased before it is looked up in the
    module-level ``precaution_dict``. When no entry exists, the
    single-item list ``["No precautions found"]`` is returned.
    """
    lookup_key = disease_name.strip().lower()
    if lookup_key in precaution_dict:
        return precaution_dict[lookup_key]
    return ["No precautions found"]

# Example usage: look up and print the precautions for a hard-coded disease
# name. The name is a placeholder and is not taken from the trained model.
predicted_disease = "Diabetes"  # Replace with actual model prediction
precautions = get_precautions(predicted_disease)
print(f"Precautions for {predicted_disease}: {precautions}")


Precautions for Diabetes: ['have balanced diet', 'exercise', 'consult doctor', 'follow up']
