{ "cells": [ { "cell_type": "markdown", "id": "bf709519-f491-4f9b-b9ed-53c3869b114c", "metadata": {}, "source": [ "# Number Preprocessing" ] }, { "cell_type": "code", "execution_count": 1, "id": "23eba203-ea09-4786-8935-65742a920b32", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "id": "49fe9e33-847e-4184-b35b-8cf730130b7e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "0 6 148 72 35 0 33.6 \n", "1 1 85 66 29 0 26.6 \n", "2 8 183 64 0 0 23.3 \n", "3 1 89 66 23 94 28.1 \n", "4 0 137 40 35 168 43.1 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "0 0.627 50 1 \n", "1 0.351 31 0 \n", "2 0.672 32 1 \n", "3 0.167 21 0 \n", "4 2.288 33 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_path = os.path.expanduser(\"../../data/diabetes.csv\")\n", "df = pd.read_csv(data_path)\n", "df.head()" ] }, { "cell_type": "markdown", "id": "33980054-2917-4392-9312-e53fef085d09", "metadata": {}, "source": [ "## Inspecting the dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "a2977276-cc8d-4088-8c6f-28ca6811a9be", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 768 entries, 0 to 767\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Pregnancies 768 non-null int64 \n", " 1 Glucose 768 non-null int64 \n", " 2 BloodPressure 768 non-null int64 \n", " 3 SkinThickness 768 non-null int64 \n", " 4 Insulin 768 non-null int64 \n", " 5 BMI 768 non-null float64\n", " 6 DiabetesPedigreeFunction 768 non-null float64\n", " 7 Age 768 non-null int64 \n", " 8 Outcome 768 non-null int64 \n", "dtypes: float64(2), int64(7)\n", "memory usage: 54.1 KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 4, "id": "391cbf96-dcc0-4fc7-bc08-3f9402615f78", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
count768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000
mean3.845052120.89453169.10546920.53645879.79947931.9925780.47187633.2408850.348958
std3.36957831.97261819.35580715.952218115.2440027.8841600.33132911.7602320.476951
min0.0000000.0000000.0000000.0000000.0000000.0000000.07800021.0000000.000000
25%1.00000099.00000062.0000000.0000000.00000027.3000000.24375024.0000000.000000
50%3.000000117.00000072.00000023.00000030.50000032.0000000.37250029.0000000.000000
75%6.000000140.25000080.00000032.000000127.25000036.6000000.62625041.0000001.000000
max17.000000199.000000122.00000099.000000846.00000067.1000002.42000081.0000001.000000
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n", "count 768.000000 768.000000 768.000000 768.000000 768.000000 \n", "mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n", "std 3.369578 31.972618 19.355807 15.952218 115.244002 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n", "50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n", "75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n", "max 17.000000 199.000000 122.000000 99.000000 846.000000 \n", "\n", " BMI DiabetesPedigreeFunction Age Outcome \n", "count 768.000000 768.000000 768.000000 768.000000 \n", "mean 31.992578 0.471876 33.240885 0.348958 \n", "std 7.884160 0.331329 11.760232 0.476951 \n", "min 0.000000 0.078000 21.000000 0.000000 \n", "25% 27.300000 0.243750 24.000000 0.000000 \n", "50% 32.000000 0.372500 29.000000 0.000000 \n", "75% 36.600000 0.626250 41.000000 1.000000 \n", "max 67.100000 2.420000 81.000000 1.000000 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "id": "854f88fe-855a-4fe8-8fb8-fc075b4d30d3", "metadata": {}, "source": [ "## Visualizing the dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "ebc2cb8c-8a75-4e3d-a6c9-bb65b9ea3da1", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.hist(bins=50, figsize=(25, 20))\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "5f6d42f5-1976-4db3-9952-5a149e407b63", "metadata": {}, "source": [ "## Removing duplicated data points" ] }, { "cell_type": "code", "execution_count": 6, "id": "b301472c-3a8d-4e33-a253-b69208dd47a3", "metadata": {}, "outputs": [], "source": [ "df.drop_duplicates(keep=\"first\", inplace=True)" ] }, { "cell_type": "markdown", "id": "1c2415b4-4041-434f-8c25-f614cfae3bec", "metadata": {}, "source": [ "## Creating training and testing set" ] }, { "cell_type": "code", "execution_count": 8, "id": "ecc191ce-4933-460b-ad5c-0f2a60a65633", "metadata": {}, "outputs": [], "source": [ "# Fixed seed: without it every run produces a different split, so the\n", "# statistics computed from train_df below could not be reproduced.\n", "train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)" ] }, { "cell_type": "markdown", "id": "cf75f778-f9ed-4f28-a357-9ed5e035c55b", "metadata": {}, "source": [ "## Gaining further insight" ] }, { "cell_type": "code", "execution_count": 9, "id": "3ad46965-615c-4d08-84de-76d0902138d9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
Pregnancies1.0000000.1351090.164122-0.041149-0.0569880.017513-0.0152600.5374940.217668
Glucose0.1351091.0000000.1641700.0485040.3283150.2107570.1254360.2696420.475968
BloodPressure0.1641220.1641701.0000000.2149240.0802100.2491020.0447640.2417280.114867
SkinThickness-0.0411490.0485040.2149241.0000000.4248730.3914420.176407-0.0882880.097148
Insulin-0.0569880.3283150.0802100.4248731.0000000.1939530.140380-0.0344570.140022
BMI0.0175130.2107570.2491020.3914420.1939531.0000000.1266220.0204200.306521
DiabetesPedigreeFunction-0.0152600.1254360.0447640.1764070.1403800.1266221.0000000.0514600.170353
Age0.5374940.2696420.241728-0.088288-0.0344570.0204200.0514601.0000000.234581
Outcome0.2176680.4759680.1148670.0971480.1400220.3065210.1703530.2345811.000000
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness \\\n", "Pregnancies 1.000000 0.135109 0.164122 -0.041149 \n", "Glucose 0.135109 1.000000 0.164170 0.048504 \n", "BloodPressure 0.164122 0.164170 1.000000 0.214924 \n", "SkinThickness -0.041149 0.048504 0.214924 1.000000 \n", "Insulin -0.056988 0.328315 0.080210 0.424873 \n", "BMI 0.017513 0.210757 0.249102 0.391442 \n", "DiabetesPedigreeFunction -0.015260 0.125436 0.044764 0.176407 \n", "Age 0.537494 0.269642 0.241728 -0.088288 \n", "Outcome 0.217668 0.475968 0.114867 0.097148 \n", "\n", " Insulin BMI DiabetesPedigreeFunction \\\n", "Pregnancies -0.056988 0.017513 -0.015260 \n", "Glucose 0.328315 0.210757 0.125436 \n", "BloodPressure 0.080210 0.249102 0.044764 \n", "SkinThickness 0.424873 0.391442 0.176407 \n", "Insulin 1.000000 0.193953 0.140380 \n", "BMI 0.193953 1.000000 0.126622 \n", "DiabetesPedigreeFunction 0.140380 0.126622 1.000000 \n", "Age -0.034457 0.020420 0.051460 \n", "Outcome 0.140022 0.306521 0.170353 \n", "\n", " Age Outcome \n", "Pregnancies 0.537494 0.217668 \n", "Glucose 0.269642 0.475968 \n", "BloodPressure 0.241728 0.114867 \n", "SkinThickness -0.088288 0.097148 \n", "Insulin -0.034457 0.140022 \n", "BMI 0.020420 0.306521 \n", "DiabetesPedigreeFunction 0.051460 0.170353 \n", "Age 1.000000 0.234581 \n", "Outcome 0.234581 1.000000 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "correlation_matrix = train_df.corr(method=\"pearson\")\n", "correlation_matrix" ] }, { "cell_type": "code", "execution_count": 10, "id": "e2faef62-4795-4963-a011-66cac38465a0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SkinThickness 0.097148\n", "BloodPressure 0.114867\n", "Insulin 0.140022\n", "DiabetesPedigreeFunction 0.170353\n", "Pregnancies 0.217668\n", "Age 0.234581\n", "BMI 0.306521\n", "Glucose 0.475968\n", "Outcome 1.000000\n", "Name: Outcome, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "correlation_matrix[\"Outcome\"].sort_values()" ] }, { "cell_type": "markdown", "id": "0d750195-c1f1-4c6b-8415-871aafab4218", "metadata": {}, "source": [ "## Handling missing data" ] }, { "cell_type": "code", "execution_count": 15, "id": "a7cd2214-4e2d-4609-bf8c-d1c81b212250", "metadata": {}, "outputs": [], "source": [ "# Columns in which a value of 0 is treated as a missing measurement.\n", "names = [\n", "    \"Glucose\",\n", "    \"BloodPressure\",\n", "    \"SkinThickness\",\n", "    \"Insulin\",\n", "    \"BMI\",\n", "    \"DiabetesPedigreeFunction\",\n", "    \"Age\",\n", "]\n", "\n", "# Assign the result back instead of calling replace(..., inplace=True) on\n", "# train_df[name]: that is chained assignment, which pandas warns about and\n", "# which no longer modifies train_df from pandas 3.0 on. The explicit copy\n", "# detaches train_df from df so the column assignment is unambiguous.\n", "train_df = train_df.copy()\n", "train_df[names] = train_df[names].replace(0, np.nan)" ] }, { "cell_type": "code", "execution_count": 16, "id": "9fae1487-4f92-4802-9df0-afd8fd67349f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
6834125.080.0NaNNaN32.30.536271
3944158.078.0NaNNaN32.90.803311
1903111.062.0NaNNaN22.60.142210
27413106.070.0NaNNaN34.20.251520
1620114.080.034.0285.044.20.167270
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "683 4 125.0 80.0 NaN NaN 32.3 \n", "394 4 158.0 78.0 NaN NaN 32.9 \n", "190 3 111.0 62.0 NaN NaN 22.6 \n", "274 13 106.0 70.0 NaN NaN 34.2 \n", "162 0 114.0 80.0 34.0 285.0 44.2 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "683 0.536 27 1 \n", "394 0.803 31 1 \n", "190 0.142 21 0 \n", "274 0.251 52 0 \n", "162 0.167 27 0 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": 17, "id": "7542d555-810d-4811-bf5b-010c4ce4471d", "metadata": {}, "outputs": [], "source": [ "# Per-column medians of the training data, kept in one Series so the same\n", "# values can be reused later (e.g. to impute test_df).\n", "train_medians = train_df[names].median()\n", "\n", "# fillna with a Series fills each column with the value stored under that\n", "# column's label. Re-binding train_df replaces the chained\n", "# train_df[col].fillna(..., inplace=True) calls, which pandas warns will\n", "# stop modifying train_df in pandas 3.0.\n", "train_df = train_df.fillna(train_medians)" ] }, { "cell_type": "code", "execution_count": 18, "id": "51d1257e-be6e-48a2-9927-70f50635015d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
6834125.080.029.0125.032.30.536271
3944158.078.029.0125.032.90.803311
1903111.062.029.0125.022.60.142210
27413106.070.029.0125.034.20.251520
1620114.080.034.0285.044.20.167270
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "683 4 125.0 80.0 29.0 125.0 32.3 \n", "394 4 158.0 78.0 29.0 125.0 32.9 \n", "190 3 111.0 62.0 29.0 125.0 22.6 \n", "274 13 106.0 70.0 29.0 125.0 34.2 \n", "162 0 114.0 80.0 34.0 285.0 44.2 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "683 0.536 27 1 \n", "394 0.803 31 1 \n", "190 0.142 21 0 \n", "274 0.251 52 0 \n", "162 0.167 27 0 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "markdown", "id": "ffaa8cf2-5956-4c16-95ce-6f03ebd941d5", "metadata": {}, "source": [ "## Encoding categorical attributes" ] }, { "cell_type": "code", "execution_count": 19, "id": "f96db2cd-7241-4d41-bdbf-291cd4d7b1f6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcomefitness
6834125.080.029.0125.032.30.536271very good
3944158.078.029.0125.032.90.803311very good
1903111.062.029.0125.022.60.142210bad
27413106.070.029.0125.034.20.251520moderate
1620114.080.034.0285.044.20.167270very good
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "683 4 125.0 80.0 29.0 125.0 32.3 \n", "394 4 158.0 78.0 29.0 125.0 32.9 \n", "190 3 111.0 62.0 29.0 125.0 22.6 \n", "274 13 106.0 70.0 29.0 125.0 34.2 \n", "162 0 114.0 80.0 34.0 285.0 44.2 \n", "\n", " DiabetesPedigreeFunction Age Outcome fitness \n", "683 0.536 27 1 very good \n", "394 0.803 31 1 very good \n", "190 0.142 21 0 bad \n", "274 0.251 52 0 moderate \n", "162 0.167 27 0 very good " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Copy original dataframe and add new column with random fitness values\n", "temp_df = train_df.copy()\n", "fitness_values = [\n", " \"bad\",\n", " \"moderate\",\n", " \"good\",\n", " \"very good\",\n", "]\n", "temp_df[\"fitness\"] = np.random.choice(fitness_values, temp_df.shape[0])\n", "\n", "temp_df.head(5)" ] }, { "cell_type": "code", "execution_count": 20, "id": "ce5946b5-30a3-4628-afb9-68190fe37792", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "class id 0 has label bad\n", "class id 1 has label good\n", "class id 2 has label moderate\n", "class id 3 has label very good\n", "\n", "Encoded fitness values for first 10 entries: [3 3 0 2 3 0 1 3 2 1]\n" ] } ], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "encoder = LabelEncoder()\n", "fitness_encoded = encoder.fit_transform(temp_df[\"fitness\"])\n", "\n", "for id_, class_ in enumerate(encoder.classes_):\n", " print(f\"class id {id_} has label {class_}\")\n", "\n", "print()\n", "print(f\"Encoded fitness values for first 10 entries: {fitness_encoded[:10]}\")" ] }, { "cell_type": "markdown", "id": "e91b169e-4532-414f-a4d6-37cb9539bfb7", "metadata": {}, "source": [ "## Rescaling or standardizing attributes" ] }, { "cell_type": "code", "execution_count": 21, "id": "e5958be0-dd02-41c8-9cac-4ffe8a40a2ea", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
20.4705880.8967740.4444440.2391300.1334130.1042940.2536290.1833331.0
30.0588240.2903230.4666670.1739130.0961540.2024540.0380020.0000000.0
50.2941180.4645160.5555560.2391300.1334130.1513290.0525190.1500000.0
60.1764710.2193550.2888890.2717390.0889420.2617590.0725880.0833331.0
70.5882350.4580650.5333330.2391300.1334130.3496930.0239110.1333330.0
\n", "
" ], "text/plain": [ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", "2 0.470588 0.896774 0.444444 0.239130 0.133413 0.104294 \n", "3 0.058824 0.290323 0.466667 0.173913 0.096154 0.202454 \n", "5 0.294118 0.464516 0.555556 0.239130 0.133413 0.151329 \n", "6 0.176471 0.219355 0.288889 0.271739 0.088942 0.261759 \n", "7 0.588235 0.458065 0.533333 0.239130 0.133413 0.349693 \n", "\n", " DiabetesPedigreeFunction Age Outcome \n", "2 0.253629 0.183333 1.0 \n", "3 0.038002 0.000000 0.0 \n", "5 0.052519 0.150000 0.0 \n", "6 0.072588 0.083333 1.0 \n", "7 0.023911 0.133333 0.0 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "\n", "# initialize min-max scaler\n", "mm_scaler = MinMaxScaler()\n", "\n", "temp1_df = train_df.copy()\n", "column_names = temp1_df.columns.tolist()\n", "\n", "# transform all attributes\n", "temp1_df[column_names] = mm_scaler.fit_transform(temp1_df[column_names])\n", "\n", "temp1_df.sort_index(inplace=True)\n", "temp1_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b6c101ee-c8d5-4fbc-9982-cb6e711c6bc6", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", "standard_scaler = StandardScaler()\n", "\n", "temp2_df = train_df.copy()\n", "\n", "# Standardize all attributes with the StandardScaler created above.\n", "# Calling mm_scaler here would only repeat the min-max rescaling.\n", "# Note: column_names comes from the min-max cell and includes the Outcome label.\n", "temp2_df[column_names] = standard_scaler.fit_transform(temp2_df[column_names])\n", "temp2_df.sort_index(inplace=True)\n", "temp2_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "8a34d793-33f6-477d-bf01-01a331d9e0d7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }