diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2369810..5eef4fc 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,7 +3,7 @@
"editor.cursorBlinking": "solid",
"editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace",
"editor.fontLigatures": false,
- "editor.fontSize": 22,
+ "editor.fontSize": 8,
"editor.formatOnPaste": true,
"editor.formatOnSave": true,
"editor.lineNumbers": "on",
@@ -17,8 +17,7 @@
"files.autoSave": "afterDelay",
"screencastMode.onlyKeyboardShortcuts": true,
"terminal.integrated.fontSize": 18,
- "workbench.activityBar.visible": true,
"workbench.colorTheme": "Visual Studio Dark",
"workbench.fontAliasing": "antialiased",
"workbench.statusBar.visible": true
-}
+}
\ No newline at end of file
diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb
index d11b556..7dfceee 100644
--- a/03_04/03_04 General Cleaning Techniques [Begin].ipynb
+++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -62,17 +62,363 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CLIENTNUM | \n",
+ " Customer_Age | \n",
+ " Dependent_count | \n",
+ " Months_on_book | \n",
+ " Total_Relationship_Count | \n",
+ " Months_Inactive_12_mon | \n",
+ " Contacts_Count_12_mon | \n",
+ " Credit_Limit | \n",
+ " Total_Revolving_Bal | \n",
+ " Avg_Open_To_Buy | \n",
+ " Total_Amt_Chng_Q4_Q1 | \n",
+ " Total_Trans_Amt | \n",
+ " Total_Trans_Ct | \n",
+ " Total_Ct_Chng_Q4_Q1 | \n",
+ " Avg_Utilization_Ratio | \n",
+ " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | \n",
+ " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ " 10127.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 95095.000000 | \n",
+ " 46.325960 | \n",
+ " 2.346203 | \n",
+ " 35.928409 | \n",
+ " 3.812580 | \n",
+ " 2.341167 | \n",
+ " 2.455317 | \n",
+ " 8631.953698 | \n",
+ " 1162.814061 | \n",
+ " 7469.139637 | \n",
+ " 0.759941 | \n",
+ " 4404.086304 | \n",
+ " 64.858695 | \n",
+ " 0.712222 | \n",
+ " 0.274894 | \n",
+ " 0.159997 | \n",
+ " 0.840003 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 2923.557422 | \n",
+ " 8.016814 | \n",
+ " 1.298908 | \n",
+ " 7.986416 | \n",
+ " 1.554408 | \n",
+ " 1.010622 | \n",
+ " 1.106225 | \n",
+ " 9088.776650 | \n",
+ " 814.987335 | \n",
+ " 9090.685324 | \n",
+ " 0.219207 | \n",
+ " 3397.129254 | \n",
+ " 23.472570 | \n",
+ " 0.238086 | \n",
+ " 0.275691 | \n",
+ " 0.365301 | \n",
+ " 0.365301 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 90032.000000 | \n",
+ " 26.000000 | \n",
+ " 0.000000 | \n",
+ " 13.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1438.300000 | \n",
+ " 0.000000 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 510.000000 | \n",
+ " 10.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000008 | \n",
+ " 0.000420 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 92563.500000 | \n",
+ " 41.000000 | \n",
+ " 1.000000 | \n",
+ " 31.000000 | \n",
+ " 3.000000 | \n",
+ " 2.000000 | \n",
+ " 2.000000 | \n",
+ " 2555.000000 | \n",
+ " 359.000000 | \n",
+ " 1324.500000 | \n",
+ " 0.631000 | \n",
+ " 2155.500000 | \n",
+ " 45.000000 | \n",
+ " 0.582000 | \n",
+ " 0.023000 | \n",
+ " 0.000099 | \n",
+ " 0.999660 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 95095.000000 | \n",
+ " 46.000000 | \n",
+ " 2.000000 | \n",
+ " 36.000000 | \n",
+ " 4.000000 | \n",
+ " 2.000000 | \n",
+ " 2.000000 | \n",
+ " 4549.000000 | \n",
+ " 1276.000000 | \n",
+ " 3474.000000 | \n",
+ " 0.736000 | \n",
+ " 3899.000000 | \n",
+ " 67.000000 | \n",
+ " 0.702000 | \n",
+ " 0.176000 | \n",
+ " 0.000181 | \n",
+ " 0.999820 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 97626.500000 | \n",
+ " 52.000000 | \n",
+ " 3.000000 | \n",
+ " 40.000000 | \n",
+ " 5.000000 | \n",
+ " 3.000000 | \n",
+ " 3.000000 | \n",
+ " 11067.500000 | \n",
+ " 1784.000000 | \n",
+ " 9859.000000 | \n",
+ " 0.859000 | \n",
+ " 4741.000000 | \n",
+ " 81.000000 | \n",
+ " 0.818000 | \n",
+ " 0.503000 | \n",
+ " 0.000337 | \n",
+ " 0.999900 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 100158.000000 | \n",
+ " 73.000000 | \n",
+ " 5.000000 | \n",
+ " 56.000000 | \n",
+ " 6.000000 | \n",
+ " 6.000000 | \n",
+ " 6.000000 | \n",
+ " 34516.000000 | \n",
+ " 2517.000000 | \n",
+ " 34516.000000 | \n",
+ " 3.397000 | \n",
+ " 18484.000000 | \n",
+ " 139.000000 | \n",
+ " 3.714000 | \n",
+ " 0.999000 | \n",
+ " 0.999580 | \n",
+ " 0.999990 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CLIENTNUM Customer_Age Dependent_count Months_on_book \\\n",
+ "count 10127.000000 10127.000000 10127.000000 10127.000000 \n",
+ "mean 95095.000000 46.325960 2.346203 35.928409 \n",
+ "std 2923.557422 8.016814 1.298908 7.986416 \n",
+ "min 90032.000000 26.000000 0.000000 13.000000 \n",
+ "25% 92563.500000 41.000000 1.000000 31.000000 \n",
+ "50% 95095.000000 46.000000 2.000000 36.000000 \n",
+ "75% 97626.500000 52.000000 3.000000 40.000000 \n",
+ "max 100158.000000 73.000000 5.000000 56.000000 \n",
+ "\n",
+ " Total_Relationship_Count Months_Inactive_12_mon \\\n",
+ "count 10127.000000 10127.000000 \n",
+ "mean 3.812580 2.341167 \n",
+ "std 1.554408 1.010622 \n",
+ "min 1.000000 0.000000 \n",
+ "25% 3.000000 2.000000 \n",
+ "50% 4.000000 2.000000 \n",
+ "75% 5.000000 3.000000 \n",
+ "max 6.000000 6.000000 \n",
+ "\n",
+ " Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal \\\n",
+ "count 10127.000000 10127.000000 10127.000000 \n",
+ "mean 2.455317 8631.953698 1162.814061 \n",
+ "std 1.106225 9088.776650 814.987335 \n",
+ "min 0.000000 1438.300000 0.000000 \n",
+ "25% 2.000000 2555.000000 359.000000 \n",
+ "50% 2.000000 4549.000000 1276.000000 \n",
+ "75% 3.000000 11067.500000 1784.000000 \n",
+ "max 6.000000 34516.000000 2517.000000 \n",
+ "\n",
+ " Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n",
+ "count 10127.000000 10127.000000 10127.000000 10127.000000 \n",
+ "mean 7469.139637 0.759941 4404.086304 64.858695 \n",
+ "std 9090.685324 0.219207 3397.129254 23.472570 \n",
+ "min 3.000000 0.000000 510.000000 10.000000 \n",
+ "25% 1324.500000 0.631000 2155.500000 45.000000 \n",
+ "50% 3474.000000 0.736000 3899.000000 67.000000 \n",
+ "75% 9859.000000 0.859000 4741.000000 81.000000 \n",
+ "max 34516.000000 3.397000 18484.000000 139.000000 \n",
+ "\n",
+ " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \\\n",
+ "count 10127.000000 10127.000000 \n",
+ "mean 0.712222 0.274894 \n",
+ "std 0.238086 0.275691 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.582000 0.023000 \n",
+ "50% 0.702000 0.176000 \n",
+ "75% 0.818000 0.503000 \n",
+ "max 3.714000 0.999000 \n",
+ "\n",
+ " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n",
+ "count 10127.000000 \n",
+ "mean 0.159997 \n",
+ "std 0.365301 \n",
+ "min 0.000008 \n",
+ "25% 0.000099 \n",
+ "50% 0.000181 \n",
+ "75% 0.000337 \n",
+ "max 0.999580 \n",
+ "\n",
+ " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n",
+ "count 10127.000000 \n",
+ "mean 0.840003 \n",
+ "std 0.365301 \n",
+ "min 0.000420 \n",
+ "25% 0.999660 \n",
+ "50% 0.999820 \n",
+ "75% 0.999900 \n",
+ "max 0.999990 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.describe()"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CLIENTNUM | \n",
+ " Attrition_Flag | \n",
+ " Customer_Age | \n",
+ " Gender | \n",
+ " Dependent_count | \n",
+ " Education_Level | \n",
+ " Marital_Status | \n",
+ " Income_Category | \n",
+ " Card_Category | \n",
+ " Months_on_book | \n",
+ " Total_Relationship_Count | \n",
+ " Months_Inactive_12_mon | \n",
+ " Contacts_Count_12_mon | \n",
+ " Credit_Limit | \n",
+ " Total_Revolving_Bal | \n",
+ " Avg_Open_To_Buy | \n",
+ " Total_Amt_Chng_Q4_Q1 | \n",
+ " Total_Trans_Amt | \n",
+ " Total_Trans_Ct | \n",
+ " Total_Ct_Chng_Q4_Q1 | \n",
+ " Avg_Utilization_Ratio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [CLIENTNUM, Attrition_Flag, Customer_Age, Gender, Dependent_count, Education_Level, Marital_Status, Income_Category, Card_Category, Months_on_book, Total_Relationship_Count, Months_Inactive_12_mon, Contacts_Count_12_mon, Credit_Limit, Total_Revolving_Bal, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[data[\"CLIENTNUM\"].duplicated()]"
+ ]
},
{
"cell_type": "code",
@@ -104,17 +450,60 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n",
+ " 'Dependent_count', 'Education_Level', 'Marital_Status',\n",
+ " 'Income_Category', 'Card_Category', 'Months_on_book',\n",
+ " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n",
+ " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n",
+ " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n",
+ " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',\n",
+ " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n",
+ " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.columns"
+ ]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(10127, 21)"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = data[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n",
+ " 'Dependent_count', 'Education_Level', 'Marital_Status',\n",
+ " 'Income_Category', 'Card_Category', 'Months_on_book',\n",
+ " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n",
+ " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n",
+ " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n",
+ " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']]\n",
+ "\n",
+ "data.shape"
+ ]
},
{
"cell_type": "markdown",
@@ -125,10 +514,44 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CLIENTNUM int64\n",
+ "Attrition_Flag object\n",
+ "Customer_Age int64\n",
+ "Gender object\n",
+ "Dependent_count int64\n",
+ "Education_Level object\n",
+ "Marital_Status object\n",
+ "Income_Category object\n",
+ "Card_Category object\n",
+ "Months_on_book int64\n",
+ "Total_Relationship_Count int64\n",
+ "Months_Inactive_12_mon int64\n",
+ "Contacts_Count_12_mon int64\n",
+ "Credit_Limit float64\n",
+ "Total_Revolving_Bal int64\n",
+ "Avg_Open_To_Buy float64\n",
+ "Total_Amt_Chng_Q4_Q1 float64\n",
+ "Total_Trans_Amt int64\n",
+ "Total_Trans_Ct int64\n",
+ "Total_Ct_Chng_Q4_Q1 float64\n",
+ "Avg_Utilization_Ratio float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.dtypes"
+ ]
},
{
"cell_type": "code",
@@ -154,7 +577,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.4 (main, Jan 18 2023, 00:26:41) [GCC 9.4.0]"
+ "version": "3.12.1"
},
"vscode": {
"interpreter": {