Obesity_Level_Prediction-aim.ipynb
#%% md 

Source: https://github.com/Saba-Gul/Exploratory-Data-Analysis-and-Statistical-Analysis-Notebooks 
#%% md ## **Do not forget to write your student number in the assignment cell given 4 cells below**
#%%
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
#%% md Obesity is a major worldwide health challenge that causes various diseases and reduces quality of life. Using this dataset, we aim to estimate the obesity level of individuals based on their physical activity, eating habits, family history of overweight, and similar factors. We will apply various methods to this dataset, gathered from a public online survey, with the goal of helping to prevent and reduce obesity. The target column of the dataset is 'NObeyesdad'.
#%% md Here's a description of each column in the dataset:

1. **Age**: The age of the individual.
2. **Gender**: The gender of the individual (e.g., Male, Female).
3. **Height**: The height of the individual in meters.
4. **Weight**: The weight of the individual in kilograms.
5. **CALC**: How often the individual consumes alcohol (e.g., no, Sometimes, Frequently, Always).
6. **FAVC**: Whether the individual frequently consumes high-caloric food (e.g., yes, no).
7. **FCVC**: Frequency of consumption of vegetables (numeric scale).
8. **NCP**: Number of main meals per day (numeric scale).
9. **SCC**: Whether the individual monitors their daily calorie consumption (e.g., yes, no).
10. **SMOKE**: Whether the individual smokes (e.g., yes, no).
11. **CH2O**: Daily water consumption (numeric scale).
12. **family_history_with_overweight**: Whether the individual has a family history of overweight (e.g., yes, no).
13. **FAF**: Physical activity frequency (numeric scale).
14. **TUE**: Time using technology devices (numeric scale).
15. **CAEC**: How often the individual eats food between meals (e.g., no, Sometimes, Frequently, Always).
16. **MTRANS**: Mode of transportation (e.g., Public Transportation, Walking).
17. **NObeyesdad**: Obesity level of the individual (e.g., Normal_Weight, Overweight_Level_I).

Some columns are numeric, some are categorical, and others are ordinal.
#%%
# If you are using Colab: create a DataSets folder and put your obesity dataset there
from google.colab import drive
drive.mount('/content/drive')
#%%
# Load the dataset
# From your project folder under the DataSets directory
data = pd.read_csv("DataSets/ObesityDataSet_raw_and_data_sinthetic.csv")
# From your Google Drive
#data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ObesityDataSet_raw_and_data_sinthetic.csv")
data.head(10)
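#%% md To confirm the mix of numeric and categorical columns described above, a quick dtype check (a small sketch; the variable names here are just for illustration):
#%%
# Partition the columns by dtype: object columns are categorical, the rest numeric
numeric_like = data.select_dtypes(include=['number']).columns.tolist()
categorical_like = data.select_dtypes(include=['object']).columns.tolist()
print("Numeric columns:", numeric_like)
print("Categorical columns:", categorical_like)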
#%%
# Change dataset according to Student Number
StudentNo = 11230199
strStudentNo = str(StudentNo)
LastDigit1 = strStudentNo[-1]
LastDigit2 = strStudentNo[-2]
LastDigit3 = strStudentNo[-3]

# Update data
for x in data.index:
    data.loc[x, "Age"] = data.loc[x, "Age"] + int(LastDigit1)
    data.loc[x, "Height"] = data.loc[x, "Height"] + int(LastDigit2) / 10.0
    data.loc[x, "Weight"] = data.loc[x, "Weight"] + int(LastDigit3) - 2
    data.loc[x, "FCVC"] = data.loc[x, "FCVC"] + int(LastDigit1) / 7
    data.loc[x, "NCP"] = data.loc[x, "NCP"] + int(LastDigit2) / 8
    data.loc[x, "CH2O"] = data.loc[x, "CH2O"] + int(LastDigit3) / 10.0
data.head(10)
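#%% md The row-wise loop above works but is slow on larger frames; pandas can apply the same offsets column-wise in one shot. A vectorized equivalent, left commented out so the offsets are not applied twice:
#%%
# Vectorized equivalent of the perturbation loop above (do not run both cells)
#data["Age"] += int(LastDigit1)
#data["Height"] += int(LastDigit2) / 10.0
#data["Weight"] += int(LastDigit3) - 2
#data["FCVC"] += int(LastDigit1) / 7
#data["NCP"] += int(LastDigit2) / 8
#data["CH2O"] += int(LastDigit3) / 10.0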
#%% md The following columns could be considered ordinal; the cells below list the unique values of 'NObeyesdad', 'CALC', and 'CAEC':
#%%
print(data['NObeyesdad'].nunique())
data['NObeyesdad'].unique()
#%%
print(data['CALC'].nunique())
data['CALC'].unique()
#%%
print(data['CAEC'].nunique())
data['CAEC'].unique()
#%% md ## **Exploratory Data Analysis**
#%% data.shape
#%% data.info()
#%% data.columns
#%% data.isna().sum()
#%% data.describe()
#%% data.duplicated().sum()
#%% data = data.drop_duplicates()
#%% data.shape
#%%
# Height Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='Height', color='skyblue', kde=True, label='Height')
plt.title('Height Distribution')
plt.xlabel('Height')
plt.ylabel('Frequency')
plt.legend()
plt.show()
#%%
# Weight Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='Weight', color='salmon', kde=True, label='Weight')
plt.title('Weight Distribution')
plt.xlabel('Weight')
plt.ylabel('Frequency')
plt.legend()
plt.show()
#%%
# Top 10 ages with the highest average weight
top_10_ages = data.groupby('Age')['Weight'].mean().nlargest(10)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_10_ages.index, y=top_10_ages.values, color='skyblue')
plt.title('Top 10 Ages with Highest Average Weight')
plt.xlabel('Age')
plt.ylabel('Average Weight')
plt.xticks(rotation=45)
plt.show()
#%%
# Distribution of CAEC values
plt.figure(figsize=(8, 5))
sns.countplot(data=data, x='CAEC', palette='viridis')
plt.title('Distribution of CAEC values')
plt.xlabel('CAEC')
plt.ylabel('Count')
plt.show()
#%%
# Age distribution by family history of overweight
plt.figure(figsize=(8, 5))
sns.boxplot(data=data, x='family_history_with_overweight', y='Age', palette='Set2')
plt.title('Distribution of Ages by Family History of Overweight')
plt.xlabel('Family History with Overweight')
plt.ylabel('Age')
plt.show()
#%%
# Violin plot of the same distribution
plt.figure(figsize=(10, 6))
sns.violinplot(data=data, x='family_history_with_overweight', y='Age', palette='Set2')
plt.title('Distribution of Ages by Family History of Overweight')
plt.xlabel('Family History with Overweight')
plt.ylabel('Age')
plt.show()
#%%
# Calculate the correlation matrix over the numeric columns only
numeric_df = data.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)
#%%
# Visualize the correlation matrix calculated above as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
#%% md ## **Encoding of Categorical Variables**
#%%
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
#%%
# List of numeric columns, captured before the categorical columns are encoded
# so that only the original numeric features get scaled later
numeric_cols = data.select_dtypes(include=['number']).columns.tolist()
#%%
# Ordinal encoding for the ordinal categorical variables
# (note: .map returns NaN for any value missing from the mapping)
ordinal_mapping = {
    'CALC': {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4},
    'CAEC': {'no': 1, 'Sometimes': 2, 'Frequently': 3, 'Always': 4}
}

for column, mapping in ordinal_mapping.items():
    data[column] = data[column].map(mapping)
#%% data.head(10)
#%%
from sklearn.preprocessing import LabelEncoder

# Select categorical columns
categorical_columns = ['Gender', 'FAVC', 'SCC', 'SMOKE',
                       'family_history_with_overweight', 'MTRANS', 'NObeyesdad']

# Fit a separate LabelEncoder per column so each class mapping can be recovered later
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Display the updated dataset
data.head(10)
#%% md ## **Standardization of Numeric Variables**
#%%
# Standard scaling for continuous columns
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
#%% data.head(10)
#%%
# Separate features and target
X = data.drop(columns=['NObeyesdad'])
y = data['NObeyesdad']
#%%
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
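#%% md Note: the StandardScaler above was fit on the full dataset before splitting, which leaks test-set statistics into training. A leak-free variant (a sketch, assuming unscaled features) fits the scaler on the training split only; it is left commented out here because the data is already scaled:
#%%
# Leak-free scaling (sketch): fit on the training split, transform both splits
#scaler = StandardScaler()
#X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
#X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])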
#%% md ## **Classification of Obesity Level in Individuals Using ML**
#%% md The 'NObeyesdad' column represents the obesity level of individuals as discrete categories such as 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', and 'Obesity_Type_III'. Predicting 'NObeyesdad' is therefore a classification problem, since the goal is to assign each individual to one of these discrete categories. Some algorithms worth considering:

1. **Logistic Regression**: Despite its name, logistic regression is a classification algorithm suitable for binary (two-class) or multi-class classification problems.
2. **Decision Trees**: Decision trees are versatile and can handle both classification and regression tasks. They are easy to interpret and can handle both numerical and categorical data.
3. **Random Forest**: Random Forest is an ensemble learning method based on decision trees. It builds multiple decision trees and combines their predictions to improve accuracy and reduce overfitting.
4. **Gradient Boosting**: Gradient boosting algorithms such as XGBoost, LightGBM, and CatBoost are powerful techniques for classification tasks. They build trees sequentially, where each new tree corrects the errors made by the previous ones.
5. **Support Vector Machines (SVM)**: SVM is effective for both linear and non-linear classification problems. It finds the optimal hyperplane that separates the classes in a high-dimensional space.
#%%
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
#%%
# Dictionary of classifiers to compare
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC()
}
#%%
# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.3f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print("---------------------------------------------------------")
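#%% md Accuracy on a single 80/20 split can vary with the random seed. As a sanity check, here is a quick 5-fold cross-validation sketch over the same classifiers:
#%%
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy gives a more stable comparison than a single split
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print(f"{name}: mean accuracy {scores.mean():.3f} (std {scores.std():.3f})")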
#%% md Based on the accuracy scores above, the Random Forest classifier performed best, with an accuracy of 0.964. We refit it below and report its classification report and confusion matrix:
#%% from sklearn.metrics import confusion_matrix
#%%
# Refit the best model (Random Forest) so that y_pred belongs to it,
# not to the last classifier trained in the loop above
best_clf = classifiers['Random Forest']
best_clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

# Print classification report
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
#%%
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, ax=ax, fmt='g')
ax.set_title('Confusion Matrix', fontsize=10)
# LabelEncoder assigns integer codes in alphabetical order, so take the tick
# labels from the fitted encoder instead of hard-coding them
class_labels = label_encoders['NObeyesdad'].classes_
ax.xaxis.set_ticklabels(class_labels, fontsize=7)
ax.xaxis.tick_top()
ax.yaxis.set_ticklabels(class_labels, fontsize=7)
plt.show()
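#%% md Since the target was label-encoded, predictions can be mapped back to the original class names with the fitted encoder. A short example:
#%%
# Decode the integer predictions back to obesity-level names
decoded_preds = label_encoders['NObeyesdad'].inverse_transform(y_pred)
print(decoded_preds[:10])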