Obesity_Level_Prediction-aim.ipynb
|
#%% md
Source: https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks
#%% md
## **Do not forget to write your student number in the assignment given in 4 cells below**
#%%
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
#%% md
Obesity is a major health challenge worldwide: it causes various diseases and reduces quality of life. With the help of this dataset we aim to estimate the obesity level of individuals based on their physical activity, eating habits, family history of overweight, etc. We will apply various methods to this dataset, which was gathered from a public online survey, to help prevent and reduce obesity.
The target of this dataset is 'NObeyesdad'.
#%% md
Here's a description of each column in the dataset:
1. **Age**: The age of the individual.
2. **Gender**: The gender of the individual (e.g., Male, Female).
3. **Height**: The height of the individual in meters.
4. **Weight**: The weight of the individual in kilograms.
5. **CALC**: Frequency of alcohol consumption (e.g., no, Sometimes, Frequently, Always).
6. **FAVC**: Whether the individual frequently consumes high caloric food (e.g., yes, no).
7. **FCVC**: Frequency of consumption of vegetables (numeric scale).
8. **NCP**: Number of main meals per day (numeric scale).
9. **SCC**: Whether the individual monitors their daily calorie consumption (e.g., yes, no).
10. **SMOKE**: Whether the individual smokes (e.g., yes, no).
11. **CH2O**: Consumption of water daily (numeric scale).
12. **family_history_with_overweight**: Whether the individual has a family history of overweight (e.g., yes, no).
13. **FAF**: Physical activity frequency (numeric scale).
14. **TUE**: Time using technology devices (numeric scale).
15. **CAEC**: Frequency of eating food between meals (e.g., no, Sometimes, Frequently, Always).
16. **MTRANS**: Mode of transportation (e.g., Public Transportation, Walking).
17. **NObeyesdad**: Obesity level of the individual (e.g., Normal_Weight, Overweight_Level_I).
Some columns have numeric values, some are categorical, and others seem to be ordinal.
#%%
# Google Colab only: mount Google Drive so the dataset can be read from it.
# On Colab you have to create a DataSets folder and put the obesity dataset
# there. When the notebook runs locally the google.colab package is not
# installed, so the mount step is skipped instead of crashing the cell.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab
if drive is not None:
    drive.mount('/content/drive')
#%%
# Load the obesity dataset into a DataFrame and preview the first 10 rows.
# Default location: the DataSets directory inside the project folder.
dataset_path = "DataSets/ObesityDataSet_raw_and_data_sinthetic.csv"
# When reading from Google Drive instead, use this path:
# dataset_path = "/content/drive/MyDrive/Colab Notebooks/ObesityDataSet_raw_and_data_sinthetic.csv"
data = pd.read_csv(dataset_path)
data.head(10)
#%%
# Change dataset according to Student Number: the last three digits of the
# number become constant offsets that are added to six numeric columns.
StudentNo = 11230199
strStudentNo = str(StudentNo)
LastDigit1 = strStudentNo[-1]  # last digit
LastDigit2 = strStudentNo[-2]  # second-to-last digit
LastDigit3 = strStudentNo[-3]  # third-to-last digit
# Update data. Every offset is identical for all rows, so it is applied to the
# whole column at once instead of visiting each row through data.loc. The
# order of the arithmetic is kept the same as a per-row update would use.
data["Age"] = data["Age"] + int(LastDigit1)
data["Height"] = data["Height"] + int(LastDigit2) / 10.0
data["Weight"] = data["Weight"] + int(LastDigit3) - 2
data["FCVC"] = data["FCVC"] + int(LastDigit1) / 7
data["NCP"] = data["NCP"] + int(LastDigit2) / 8
data["CH2O"] = data["CH2O"] + int(LastDigit3) / 10.0
data.head(10)
#%% md
The following columns could be considered ordinal:
#%%
# Target column: print how many distinct values it has, then show them.
target_column = data['NObeyesdad']
print(target_column.nunique())
target_column.unique()
#%%
# CALC column: distinct-value count followed by the values themselves.
calc_column = data['CALC']
print(calc_column.nunique())
calc_column.unique()
#%%
# CAEC column: distinct-value count followed by the values themselves.
caec_column = data['CAEC']
print(caec_column.nunique())
caec_column.unique()
#%% md
## **Exploratory Data Analysis**
#%%
# Dimensions of the dataset as (rows, columns)
data.shape
#%%
# Per-column overview: non-null counts and dtypes
data.info()
#%%
# Column labels
data.columns
#%%
# Number of missing values in each column
data.isna().sum()
#%%
# Summary statistics (describe() covers the numeric columns by default)
data.describe()
#%%
# Number of rows that are exact duplicates of an earlier row
data.duplicated().sum()
#%%
# Remove the duplicated rows; the index is not reset afterwards
data=data.drop_duplicates()
#%%
# Dimensions again, now without the duplicates
data.shape
#%%
# Height Distribution: histogram with a KDE curve, configured through the
# Axes object that seaborn returns.
plt.figure(figsize=(10, 6))
height_ax = sns.histplot(data=data, x='Height', kde=True, color='skyblue', label='Height')
height_ax.set(title='Height Distribution', xlabel='Height', ylabel='Frequency')
height_ax.legend()
plt.show()
#%%
# Weight Distribution: histogram with a KDE curve, configured through the
# Axes object that seaborn returns.
plt.figure(figsize=(10, 6))
weight_ax = sns.histplot(data=data, x='Weight', kde=True, color='salmon', label='Weight')
weight_ax.set(title='Weight Distribution', xlabel='Weight', ylabel='Frequency')
weight_ax.legend()
plt.show()
#%%
# Top 10 Ages with Highest Weight: mean weight per distinct age value,
# keeping the ten largest means.
mean_weight_by_age = data.groupby('Age')['Weight'].mean()
top_10_ages = mean_weight_by_age.nlargest(10)
plt.figure(figsize=(10, 6))
ages_ax = sns.barplot(x=top_10_ages.index, y=top_10_ages.values, color='skyblue')
ages_ax.set(title='Top 10 Ages with Highest Weight', xlabel='Age', ylabel='Average Weight')
# Tilt the age labels so neighbouring ticks do not overlap.
plt.setp(ages_ax.get_xticklabels(), rotation=45)
plt.show()
#%%
# Distribution of CAEC values: one bar per category with its row count.
plt.figure(figsize=(8, 5))
caec_ax = sns.countplot(x='CAEC', data=data, palette='viridis')
caec_ax.set(title='Distribution of CAEC values', xlabel='CAEC', ylabel='Count')
plt.show()
#%%
# Box plot of the age distribution, split by family history with overweight.
plt.figure(figsize=(8, 5))
box_ax = sns.boxplot(x='family_history_with_overweight', y='Age', data=data, palette='Set2')
box_ax.set(
    title='Distribution of Ages by Family History of Overweight',
    xlabel='Family History with Overweight',
    ylabel='Age',
)
plt.show()
#%%
# Violin plot of the same age-by-family-history split, on a larger figure.
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(x='family_history_with_overweight', y='Age', data=data, palette='Set2')
violin_ax.set(
    title='Distribution of Ages by Family History of Overweight',
    xlabel='Family History with Overweight',
    ylabel='Age',
)
plt.show()
#%%
# Correlation matrix of the numeric columns only (non-numeric columns are
# left out before calling corr()).
numeric_df = data.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
# Header line first, then the matrix on the following lines.
print("Correlation Matrix:", correlation_matrix, sep="\n")
#%%
import seaborn as sns
import matplotlib.pyplot as plt
# Draw the previously computed correlation_matrix as an annotated heatmap,
# with two decimals per cell.
plt.figure(figsize=(10, 8))
heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
corr_ax = sns.heatmap(correlation_matrix, **heatmap_options)
corr_ax.set_title('Correlation Heatmap')
plt.show()
#%% md
## **Encoding of Categorical Variables**
#%%
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
#%%
# Names of the columns that are numeric at this point, i.e. before any
# categorical column has been encoded. The encoded categorical columns
# therefore never end up in this list.
numeric_cols = list(data.select_dtypes(include='number').columns)
#%%
# Ordinal encoding for ordinal categorical variables.
# CALC and CAEC share the same four ordered answers, so both receive the same
# 1-4 scale. Series.map turns any value missing from the scale into NaN.
frequency_scale = {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}
ordinal_mapping = {name: dict(frequency_scale) for name in ('CALC', 'CAEC')}
for column, mapping in ordinal_mapping.items():
    data[column] = data[column].map(mapping)
#%%
data.head(10)
#%%
from sklearn.preprocessing import LabelEncoder
# Select categorical columns
categorical_columns = ['Gender', 'FAVC', 'SCC', 'SMOKE', 'family_history_with_overweight', 'MTRANS', 'NObeyesdad']
# Fit one LabelEncoder per column and keep every fitted encoder. Re-fitting a
# single shared encoder would throw away the learned classes of all columns
# but the last one, making the integer codes impossible to decode later.
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])
# Backward-compatible name: the encoder fitted on the target column, which is
# the last entry of categorical_columns.
label_encoder = label_encoders['NObeyesdad']
# Display the updated dataset
data.head(10)
#%% md
## **Standardization of Numeric Variables**
#%%
# Separate the features from the target and split into train and test sets
# BEFORE scaling, so that no statistic of the test rows influences the
# preprocessing (fitting the scaler on the full dataset leaks test data).
X = data.drop(columns=['NObeyesdad'])
y = data['NObeyesdad']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#%%
# Standard scaling for continuous columns: the mean and standard deviation
# are learned from the training rows only and then applied to both splits.
scaler = StandardScaler()
# Work on explicit copies so the assignments below never write into `data`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
#%%
# Preview the scaled training features
X_train.head(10)
#%% md
## **Classification of Obesity Level in Individuals Using ML**
#%% md
The 'NObeyesdad' column represents the obesity level of individuals and consists of seven discrete categories: 'Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', and 'Obesity_Type_III'. Thus, predicting 'NObeyesdad' is a classification problem since the goal is to classify each individual into one of these discrete categories.
Given that it's a classification problem, here are some algorithms you could consider:
1. **Logistic Regression**: Despite its name, logistic regression is a classification algorithm suitable for binary (two-class) or multi-class classification problems.
2. **Decision Trees**: Decision trees are versatile and can handle both classification and regression tasks. They're easy to interpret and can handle both numerical and categorical data.
3. **Random Forest**: Random Forest is an ensemble learning method based on decision trees. It creates multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.
4. **Gradient Boosting**: Gradient Boosting algorithms like XGBoost, LightGBM, and CatBoost are powerful techniques for classification tasks. They build trees sequentially, where each new tree corrects the errors made by the previous ones.
5. **Support Vector Machines (SVM)**: SVM is effective for both linear and non-linear classification problems. It finds the optimal hyperplane that separates the classes in a high-dimensional space.
#%%
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
#%%
# List of classifiers, keyed by the display name used in the reports.
# The tree-based models get a fixed random_state so that repeated runs give
# the same scores. LogisticRegression gets a higher iteration cap because the
# notebook silences warnings, which would hide a failure to converge within
# the default number of iterations.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC()
}
#%%
# Fit every classifier on the training split, predict the test split, and
# print the accuracy plus the per-class report for each model in turn.
for name, clf in classifiers.items():
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.3f}")
    print(f"{name} Classification Report:")
    print(report)
    print("---------------------------------------------------------")
#%% md
Based on the accuracy scores provided, the Random Forest algorithm performed the best with an accuracy of 0.964. Let's now provide its classification report and confusion matrix:
#%%
from sklearn.metrics import confusion_matrix
#%%
# The y_pred variable left over from the training loop holds the predictions
# of the LAST classifier in the dictionary (the SVM), not of Random Forest,
# so the Random Forest predictions are computed explicitly here.
rf_pred = classifiers['Random Forest'].predict(X_test)
# Print classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))
# Calculate and print confusion matrix
conf_matrix = confusion_matrix(y_test, rf_pred)
print("Confusion Matrix:")
print(conf_matrix)
#%%
# Confusion matrix heatmap for the Random Forest model.
# The predictions are recomputed here because the y_pred variable left over
# from the training loop belongs to the last classifier trained (the SVM).
rf_pred = classifiers['Random Forest'].predict(X_test)
# LabelEncoder numbers the classes in sorted order, so the tick labels must
# be listed in that same order to line up with rows/columns 0..6.
class_names = sorted(['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
                      'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
                      'Obesity_Type_III'])
# Passing every class index keeps the matrix 7x7 even if a class happens to
# be absent from the test split.
class_indices = list(range(len(class_names)))
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, rf_pred, labels=class_indices), annot=True, ax=ax, fmt='g')
ax.set_title('Confusion Matrix', fontsize=10)
ax.xaxis.set_ticklabels(class_names, fontsize=7)
ax.xaxis.tick_top()
ax.yaxis.set_ticklabels(class_names, fontsize=7)
# Rows of a scikit-learn confusion matrix are true labels, columns predicted.
ax.set_xlabel('Predicted label')
ax.xaxis.set_label_position('top')
ax.set_ylabel('True label')
plt.show()