From d5109afe8db968c05f9571002bb361b5d72548a3 Mon Sep 17 00:00:00 2001 From: Sarah Nabelsi Date: Mon, 23 Jan 2023 10:22:32 -0800 Subject: [PATCH 1/6] video file for 03_04 --- 03_04/03_04 General Cleaning Techniques.ipynb | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 03_04/03_04 General Cleaning Techniques.ipynb diff --git a/03_04/03_04 General Cleaning Techniques.ipynb b/03_04/03_04 General Cleaning Techniques.ipynb new file mode 100644 index 0000000..36b3030 --- /dev/null +++ b/03_04/03_04 General Cleaning Techniques.ipynb @@ -0,0 +1,295 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Credit Card Retention Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import iplot\n", + "sns.set()\n", + "pd.options.display.max_columns = 999" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('data/BankChurners_v2.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## General Cleaning Techniques" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next important step when working with any dataset, is to perform any necessary cleaning steps. This includes (but is not limited to): 1) converting incorrect variable data types, 2) dropping or imputing missing (`NULL`) values, 3) finding and fixing erroneous values, and 4) handling outliers. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Checking for Duplicates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A good high-level check to run at the start is to look for duplicates in the dataset. If there is a unique identifier you can key on, such as a customer or advertiser ID, that is the best variable to check for uniqueness using the `nunique()` method. Otherwise, you can use the `.drop_duplicates()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10127, 23)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10127" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['CLIENTNUM'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "data.drop_duplicates(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10127, 23)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No duplicates based on `CLIENTNUM`--good to go! " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subsetting Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will look at the column names and see whether we want to rename columns or subset the data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],\n", + " dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will likely not need the naive bayes classifiers that they've created, so for simplicity, we will subset to remove them with the following code: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "data = data[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datatypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "Next, we will take a look at datatypes. To check the datatypes, we run `data.dtypes` to see how each variable has been read in. The main things to look for here are dates and numbers that have been read in as string values; these will need to be converted to their proper types in order to work with those variables as intended. Here we see three different datatypes: `int64`, `float64`, and `object`. The `object` dtype is roughly analogous to `str` in native Python. You can reference the [user guide](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) for more on pandas dtypes." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM                     int64\n", + "Attrition_Flag               object\n", + "Customer_Age                  int64\n", + "Gender                       object\n", + "Dependent_count               int64\n", + "Education_Level              object\n", + "Marital_Status               object\n", + "Income_Category              object\n", + "Card_Category                object\n", + "Months_on_book                int64\n", + "Total_Relationship_Count      int64\n", + "Months_Inactive_12_mon        int64\n", + "Contacts_Count_12_mon         int64\n", + "Credit_Limit                float64\n", + "Total_Revolving_Bal           int64\n", + "Avg_Open_To_Buy             float64\n", + "Total_Amt_Chng_Q4_Q1        float64\n", + "Total_Trans_Amt               int64\n", + "Total_Trans_Ct                int64\n", + "Total_Ct_Chng_Q4_Q1         float64\n", + "Avg_Utilization_Ratio       float64\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.dtypes" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ebabe2a1068f0e33ad9cfdad37c68e7ca8e47caf
Mon Sep 17 00:00:00 2001 From: Sarah Nabelsi Date: Mon, 23 Jan 2023 10:54:37 -0800 Subject: [PATCH 2/6] adding beginning notebook --- ... General Cleaning Techniques [Begin].ipynb | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 03_04/03_04 General Cleaning Techniques [Begin].ipynb diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb new file mode 100644 index 0000000..205b79c --- /dev/null +++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Credit Card Retention Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import iplot\n", + "sns.set()\n", + "pd.options.display.max_columns = 999" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('data/BankChurners_v2.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## General Cleaning Techniques" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next important step when working with any dataset, is to perform any necessary cleaning steps. This includes (but is not limited to): 1) converting incorrect variable data types, 2) dropping or imputing missing (`NULL`) values, 3) finding and fixing erroneous values, and 4) handling outliers. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Checking for Duplicates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A really good high level check to do at the start is to check for duplicates in the dataset. If there is a unique index you can check on like customer or advertiser IDs then that's the best variable to check uniques on by using the `nunique()` method. Else, you can use the `.drop_duplicates()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No duplicates based on `CLIENTNUM`--good to go! " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subsetting Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will look at the column names and see if we want to change column names or subset the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will likely not need the naive bayes classifiers that they've created, so for simplicity, we will subset to remove them with the following code: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datatypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will take a look at datatypes. 
To check for datatypes, we will type `df.dtypes` to see how each variable has been read in. Main things to check here are dates and numbers that have been read in as string values and will need to be converted into their respective types in order to work with those variables as intended. Here we see that we have three different datatypes `int64`, `float64` and `object`. The `object` dtype is roughly analogous to str in native Python. You can reference the [user guide](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) for pandas dtypes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From db38c57b900811e0c0449b5661c39a92f374d44d Mon Sep 17 00:00:00 2001 From: Sarah Nabelsi Date: Tue, 24 Jan 2023 05:41:10 -0800 Subject: [PATCH 3/6] modify final --- ... General Cleaning Techniques [Begin].ipynb | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb index 205b79c..466ee72 100644 --- a/03_04/03_04 General Cleaning Techniques [Begin].ipynb +++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb @@ -53,13 +53,6 @@ "## General Cleaning Techniques" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next important step when working with any dataset, is to perform any necessary cleaning steps. 
This includes (but is not limited to): 1) converting incorrect variable data types, 2) dropping or imputing missing (`NULL`) values, 3) finding and fixing erroneous values, and 4) handling outliers. " - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -67,13 +60,6 @@ "### Checking for Duplicates" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A really good high level check to do at the start is to check for duplicates in the dataset. If there is a unique index you can check on like customer or advertiser IDs then that's the best variable to check uniques on by using the `nunique()` method. Else, you can use the `.drop_duplicates()` method." - ] - }, { "cell_type": "code", "execution_count": null, @@ -116,13 +102,6 @@ "### Subsetting Data" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will look at the column names and see if we want to change column names or subset the data." - ] - }, { "cell_type": "code", "execution_count": null, @@ -130,13 +109,6 @@ "outputs": [], "source": [] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will likely not need the naive bayes classifiers that they've created, so for simplicity, we will subset to remove them with the following code: " - ] - }, { "cell_type": "code", "execution_count": 10, @@ -151,13 +123,6 @@ "### Datatypes" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will take a look at datatypes. To check for datatypes, we will type `df.dtypes` to see how each variable has been read in. Main things to check here are dates and numbers that have been read in as string values and will need to be converted into their respective types in order to work with those variables as intended. Here we see that we have three different datatypes `int64`, `float64` and `object`. The `object` dtype is roughly analogous to str in native Python. 
You can reference the [user guide](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) for pandas dtypes." - ] - }, { "cell_type": "code", "execution_count": null, From dfa5e90769c591c6e45b9177e950beb1d3012d33 Mon Sep 17 00:00:00 2001 From: Sarah Nabelsi Date: Tue, 24 Jan 2023 17:28:47 +0000 Subject: [PATCH 4/6] small change --- 03_04/03_04 General Cleaning Techniques [Begin].ipynb | 9 +++++++-- 03_04/03_04 General Cleaning Techniques.ipynb | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb index 466ee72..d11b556 100644 --- a/03_04/03_04 General Cleaning Techniques [Begin].ipynb +++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.read_csv('data/BankChurners_v2.csv')" + "data = pd.read_csv('../data/BankChurners_v2.csv')" ] }, { @@ -154,7 +154,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.4 (main, Jan 18 2023, 00:26:41) [GCC 9.4.0]" + }, + "vscode": { + "interpreter": { + "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858" + } } }, "nbformat": 4, diff --git a/03_04/03_04 General Cleaning Techniques.ipynb b/03_04/03_04 General Cleaning Techniques.ipynb index 36b3030..e7ab95e 100644 --- a/03_04/03_04 General Cleaning Techniques.ipynb +++ b/03_04/03_04 General Cleaning Techniques.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.read_csv('data/BankChurners_v2.csv')" + "data = pd.read_csv('../data/BankChurners_v2.csv')" ] }, { From 33eedb1732b8377d54772ba6b35471938691b6ab Mon Sep 17 00:00:00 2001 From: Sarah Nabelsi Date: Tue, 24 Jan 2023 17:29:38 +0000 Subject: [PATCH 5/6] small change --- 03_04/03_04 General Cleaning Techniques.ipynb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/03_04/03_04 
General Cleaning Techniques.ipynb b/03_04/03_04 General Cleaning Techniques.ipynb index e7ab95e..00a8cb8 100644 --- a/03_04/03_04 General Cleaning Techniques.ipynb +++ b/03_04/03_04 General Cleaning Techniques.ipynb @@ -287,7 +287,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.4 (main, Jan 18 2023, 00:26:41) [GCC 9.4.0]" + }, + "vscode": { + "interpreter": { + "hash": "3ad933181bd8a04b432d3370b9dc3b0662ad032c4dfaa4e4f1596c548f763858" + } } }, "nbformat": 4, From f0dbdcfee78b08c7e5bac3fe6d3e67fc7a134ff0 Mon Sep 17 00:00:00 2001 From: Rob Michael <84167666+robmichael93@users.noreply.github.com> Date: Fri, 14 Nov 2025 05:40:00 +0000 Subject: [PATCH 6/6] Ch 3.2 --- ... General Cleaning Techniques [Begin].ipynb | 397 +++++++++++++++++- 1 file changed, 375 insertions(+), 22 deletions(-) diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb index d11b556..57e1448 100644 --- a/03_04/03_04 General Cleaning Techniques [Begin].ipynb +++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb @@ -16,7 +16,8 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 23, + "id": "dce2d54d", "metadata": {}, "outputs": [], "source": [ @@ -26,13 +27,14 @@ "import numpy as np\n", "import plotly.graph_objs as go\n", "from plotly.offline import iplot\n", - "sns.set()\n", + "sns.set_theme()\n", "pd.options.display.max_columns = 999" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 24, + "id": "e1d972af", "metadata": {}, "outputs": [], "source": [ @@ -41,6 +43,7 @@ }, { "cell_type": "markdown", + "id": "dc713314", "metadata": {}, "source": [ "***" @@ -48,6 +51,7 @@ }, { "cell_type": "markdown", + "id": "26e113b3", "metadata": {}, "source": [ "## General Cleaning Techniques" @@ -55,6 +59,7 @@ }, { "cell_type": "markdown", + "id": "c9172797", "metadata": {}, "source": [ "### Checking for Duplicates" @@ 
-62,34 +67,313 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, + "id": "6e0a6169", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "(10127, 23)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.shape" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, + "id": "6c005b1e", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_bookTotal_Relationship_CountMonths_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_RatioNaive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
090032Existing Customer45M3High SchoolMarried$60K - $80KBlue3951312691.077711914.01.3351144421.6250.0610.0000930.99991
190033Existing Customer49F5GraduateSingleLess than $40KBlue446128256.08647392.01.5411291333.7140.1050.0000570.99994
290034Existing Customer51M3GraduateMarried$80K - $120KBlue364103418.003418.02.5941887202.3330.0000.0000210.99998
390035Existing Customer40F4High SchoolNaNLess than $40KBlue343413313.02517796.01.4051171202.3330.7600.0001340.99987
490036Existing Customer40M3UneducatedMarried$60K - $80KBlue215104716.004716.02.175816282.5000.0000.0000220.99998
\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count \\\n", + "0 90032 Existing Customer 45 M 3 \n", + "1 90033 Existing Customer 49 F 5 \n", + "2 90034 Existing Customer 51 M 3 \n", + "3 90035 Existing Customer 40 F 4 \n", + "4 90036 Existing Customer 40 M 3 \n", + "\n", + " Education_Level Marital_Status Income_Category Card_Category \\\n", + "0 High School Married $60K - $80K Blue \n", + "1 Graduate Single Less than $40K Blue \n", + "2 Graduate Married $80K - $120K Blue \n", + "3 High School NaN Less than $40K Blue \n", + "4 Uneducated Married $60K - $80K Blue \n", + "\n", + " Months_on_book Total_Relationship_Count Months_Inactive_12_mon \\\n", + "0 39 5 1 \n", + "1 44 6 1 \n", + "2 36 4 1 \n", + "3 34 3 4 \n", + "4 21 5 1 \n", + "\n", + " Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy \\\n", + "0 3 12691.0 777 11914.0 \n", + "1 2 8256.0 864 7392.0 \n", + "2 0 3418.0 0 3418.0 \n", + "3 1 3313.0 2517 796.0 \n", + "4 0 4716.0 0 4716.0 \n", + "\n", + " Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 \\\n", + "0 1.335 1144 42 1.625 \n", + "1 1.541 1291 33 3.714 \n", + "2 2.594 1887 20 2.333 \n", + "3 1.405 1171 20 2.333 \n", + "4 2.175 816 28 2.500 \n", + "\n", + " Avg_Utilization_Ratio \\\n", + "0 0.061 \n", + "1 0.105 \n", + "2 0.000 \n", + "3 0.760 \n", + "4 0.000 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n", + "0 0.000093 \n", + "1 0.000057 \n", + "2 0.000021 \n", + "3 0.000134 \n", + "4 0.000022 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n", + "0 0.99991 \n", + "1 0.99994 \n", + "2 0.99998 \n", + "3 0.99987 \n", + "4 0.99998 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 27, + "id": "76e76248", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "10127" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['CLIENTNUM'].nunique()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, + "id": "428c055d", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data.drop_duplicates(inplace=True)" + ] }, { "cell_type": "markdown", + "id": "a08a6dab", "metadata": {}, "source": [ "No duplicates based on `CLIENTNUM`--good to go! " @@ -97,6 +381,7 @@ }, { "cell_type": "markdown", + "id": "9c3bc621", "metadata": {}, "source": [ "### Subsetting Data" @@ -104,20 +389,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, + "id": "5ba3e7e4", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],\n", + " dtype='object')" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 30, + 
"id": "d975075d", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data = data[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']]" + ] }, { "cell_type": "markdown", + "id": "ffc43c21", "metadata": {}, "source": [ "### Datatypes" @@ -125,10 +443,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, + "id": "272689b7", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM int64\n", + "Attrition_Flag object\n", + "Customer_Age int64\n", + "Gender object\n", + "Dependent_count int64\n", + "Education_Level object\n", + "Marital_Status object\n", + "Income_Category object\n", + "Card_Category object\n", + "Months_on_book int64\n", + "Total_Relationship_Count int64\n", + "Months_Inactive_12_mon int64\n", + "Contacts_Count_12_mon int64\n", + "Credit_Limit float64\n", + "Total_Revolving_Bal int64\n", + "Avg_Open_To_Buy float64\n", + "Total_Amt_Chng_Q4_Q1 float64\n", + "Total_Trans_Amt int64\n", + "Total_Trans_Ct int64\n", + "Total_Ct_Chng_Q4_Q1 float64\n", + "Avg_Utilization_Ratio float64\n", + "dtype: object" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.dtypes" + ] }, { "cell_type": "code",