Number Preprocessing#

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

%matplotlib inline
# Build the path to the diabetes CSV. expanduser() only rewrites a leading
# "~", so this relative path is passed through unchanged.
data_path = os.path.expanduser(path="../../data/diabetes.csv")
# Load the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1

Inspecting the dataset#

# Print the column names, non-null counts and dtypes of the raw dataset.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

Visualizing the dataset#

# Draw one 50-bin histogram per column on a single large figure.
df.hist(bins=50, figsize=(25, 20))
plt.show()
../_images/c48a2f0b1da9c1bfab3cb68ce45ac18d280122c8a57238ad9be5dd150164e4a0.png

Removing duplicated data points#

# Drop fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

Creating training and testing set#

# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split (and every statistic derived from train_df below) reproducible
# between runs; without it each execution yields a different partition.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

Gaining further insight#

# Pairwise Pearson correlation between all columns, computed on the training
# split only.
correlation_matrix = train_df.corr(method="pearson")
correlation_matrix
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
Pregnancies 1.000000 0.135109 0.164122 -0.041149 -0.056988 0.017513 -0.015260 0.537494 0.217668
Glucose 0.135109 1.000000 0.164170 0.048504 0.328315 0.210757 0.125436 0.269642 0.475968
BloodPressure 0.164122 0.164170 1.000000 0.214924 0.080210 0.249102 0.044764 0.241728 0.114867
SkinThickness -0.041149 0.048504 0.214924 1.000000 0.424873 0.391442 0.176407 -0.088288 0.097148
Insulin -0.056988 0.328315 0.080210 0.424873 1.000000 0.193953 0.140380 -0.034457 0.140022
BMI 0.017513 0.210757 0.249102 0.391442 0.193953 1.000000 0.126622 0.020420 0.306521
DiabetesPedigreeFunction -0.015260 0.125436 0.044764 0.176407 0.140380 0.126622 1.000000 0.051460 0.170353
Age 0.537494 0.269642 0.241728 -0.088288 -0.034457 0.020420 0.051460 1.000000 0.234581
Outcome 0.217668 0.475968 0.114867 0.097148 0.140022 0.306521 0.170353 0.234581 1.000000
# Correlation of every column with the Outcome label, sorted ascending so the
# most strongly positively correlated attributes appear last.
correlation_matrix["Outcome"].sort_values()
SkinThickness               0.097148
BloodPressure               0.114867
Insulin                     0.140022
DiabetesPedigreeFunction    0.170353
Pregnancies                 0.217668
Age                         0.234581
BMI                         0.306521
Glucose                     0.475968
Outcome                     1.000000
Name: Outcome, dtype: float64

Handling missing data#

# Columns in which a recorded 0 is treated as a placeholder for a missing
# measurement rather than a real value.
names = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Replace the 0 placeholders with NaN in a single DataFrame-level call and
# rebind the result. Calling `.replace(..., inplace=True)` on `train_df[name]`
# is chained assignment on an intermediate Series: pandas emits a
# FutureWarning for it and, from pandas 3.0 on, it no longer modifies
# `train_df` at all.
train_df = train_df.replace({name: 0 for name in names}, np.nan)
/var/folders/7w/fv5n0x414253d7dv5g2wwmb40000gn/T/ipykernel_1701/551462321.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[name].replace(0, np.nan, inplace=True)
# Preview the training rows to check that the 0 placeholders now show as NaN.
train_df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
683 4 125.0 80.0 NaN NaN 32.3 0.536 27 1
394 4 158.0 78.0 NaN NaN 32.9 0.803 31 1
190 3 111.0 62.0 NaN NaN 22.6 0.142 21 0
274 13 106.0 70.0 NaN NaN 34.2 0.251 52 0
162 0 114.0 80.0 34.0 285.0 44.2 0.167 27 0
# Per-column medians of the training split. Series.median() skips NaN by
# default, so the missing entries do not influence the statistic.
glucose_median = train_df["Glucose"].median()
blood_pressure_median = train_df["BloodPressure"].median()
skin_thickness_median = train_df["SkinThickness"].median()
insulin_median = train_df["Insulin"].median()
bmi_median = train_df["BMI"].median()
age_median = train_df["Age"].median()
dpf_median = train_df["DiabetesPedigreeFunction"].median()

# Fill the missing values column by column in one DataFrame-level call and
# rebind the result. The per-column `train_df[col].fillna(..., inplace=True)`
# form is chained assignment on an intermediate Series: pandas emits a
# FutureWarning for it and, from pandas 3.0 on, it leaves `train_df` unchanged.
train_df = train_df.fillna(
    {
        "Glucose": glucose_median,
        "BloodPressure": blood_pressure_median,
        "SkinThickness": skin_thickness_median,
        "Insulin": insulin_median,
        "BMI": bmi_median,
        "Age": age_median,
        "DiabetesPedigreeFunction": dpf_median,
    }
)
/var/folders/7w/fv5n0x414253d7dv5g2wwmb40000gn/T/ipykernel_1701/2039840509.py:13: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["BMI"].fillna(bmi_median, inplace=True)
/var/folders/7w/fv5n0x414253d7dv5g2wwmb40000gn/T/ipykernel_1701/2039840509.py:14: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["Age"].fillna(age_median, inplace=True)
/var/folders/7w/fv5n0x414253d7dv5g2wwmb40000gn/T/ipykernel_1701/2039840509.py:15: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["DiabetesPedigreeFunction"].fillna(dpf_median, inplace=True)
# Preview the training rows again to check that the NaN entries were filled.
train_df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
683 4 125.0 80.0 29.0 125.0 32.3 0.536 27 1
394 4 158.0 78.0 29.0 125.0 32.9 0.803 31 1
190 3 111.0 62.0 29.0 125.0 22.6 0.142 21 0
274 13 106.0 70.0 29.0 125.0 34.2 0.251 52 0
162 0 114.0 80.0 34.0 285.0 44.2 0.167 27 0

Encoding categorical attributes#

# Work on a copy of the training data and attach a synthetic categorical
# column whose labels are drawn at random, one per row.
fitness_values = ["bad", "moderate", "good", "very good"]
temp_df = train_df.copy()
temp_df["fitness"] = np.random.choice(fitness_values, size=len(temp_df))

temp_df.head(n=5)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome fitness
683 4 125.0 80.0 29.0 125.0 32.3 0.536 27 1 very good
394 4 158.0 78.0 29.0 125.0 32.9 0.803 31 1 very good
190 3 111.0 62.0 29.0 125.0 22.6 0.142 21 0 bad
274 13 106.0 70.0 29.0 125.0 34.2 0.251 52 0 moderate
162 0 114.0 80.0 34.0 285.0 44.2 0.167 27 0 very good
from sklearn.preprocessing import LabelEncoder

# Map each distinct fitness label to an integer id.
encoder = LabelEncoder()
fitness_encoded = encoder.fit_transform(temp_df["fitness"])

# encoder.classes_ is indexed by the encoded id, so enumerate() yields the
# (id, label) mapping. NOTE(review): the printed mapping follows sorted label
# order ("bad", "good", "moderate", "very good"), not the bad-to-very-good
# ranking -- confirm that is acceptable wherever these ids are used.
for id_, class_ in enumerate(encoder.classes_):
    print(f"class id {id_} has label {class_}")

print()
print(f"Encoded fitness values for first 10 entries: {fitness_encoded[:10]}")
class id 0 has label bad
class id 1 has label good
class id 2 has label moderate
class id 3 has label very good

Encoded fitness values for first 10 entries: [3 3 0 2 3 0 1 3 2 1]

Rescaling or standardizing attributes#

from sklearn.preprocessing import MinMaxScaler

# Rescale a copy of the training data so the original frame stays untouched.
temp1_df = train_df.copy()
column_names = list(temp1_df.columns)

# Fit the min-max scaler on every column and overwrite the copy with the
# rescaled values.
mm_scaler = MinMaxScaler()
temp1_df[column_names] = mm_scaler.fit_transform(temp1_df[column_names])

# Show the first rows in index order.
temp1_df = temp1_df.sort_index()
temp1_df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
2 0.470588 0.896774 0.444444 0.239130 0.133413 0.104294 0.253629 0.183333 1.0
3 0.058824 0.290323 0.466667 0.173913 0.096154 0.202454 0.038002 0.000000 0.0
5 0.294118 0.464516 0.555556 0.239130 0.133413 0.151329 0.052519 0.150000 0.0
6 0.176471 0.219355 0.288889 0.271739 0.088942 0.261759 0.072588 0.083333 1.0
7 0.588235 0.458065 0.533333 0.239130 0.133413 0.349693 0.023911 0.133333 0.0
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

# Standardize a copy of the training data so the original frame stays untouched.
temp2_df = train_df.copy()

# Transform all attributes with the StandardScaler. The earlier version called
# `mm_scaler.fit_transform` here, which re-applied min-max scaling and left
# `standard_scaler` unused.
# NOTE(review): `column_names` also contains the "Outcome" label, so the label
# is standardized too -- confirm this is intended before training on temp2_df.
temp2_df[column_names] = standard_scaler.fit_transform(temp2_df[column_names])
temp2_df.sort_index(inplace=True)
temp2_df.head()
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
2 0.470588 0.896774 0.444444 0.239130 0.133413 0.104294 0.253629 0.183333 1.0
3 0.058824 0.290323 0.466667 0.173913 0.096154 0.202454 0.038002 0.000000 0.0
5 0.294118 0.464516 0.555556 0.239130 0.133413 0.151329 0.052519 0.150000 0.0
6 0.176471 0.219355 0.288889 0.271739 0.088942 0.261759 0.072588 0.083333 1.0
7 0.588235 0.458065 0.533333 0.239130 0.133413 0.349693 0.023911 0.133333 0.0