diff --git a/.vscode/settings.json b/.vscode/settings.json index 2369810..5eef4fc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,7 +3,7 @@ "editor.cursorBlinking": "solid", "editor.fontFamily": "ui-monospace, Menlo, Monaco, 'Cascadia Mono', 'Segoe UI Mono', 'Roboto Mono', 'Oxygen Mono', 'Ubuntu Monospace', 'Source Code Pro', 'Fira Mono', 'Droid Sans Mono', 'Courier New', monospace", "editor.fontLigatures": false, - "editor.fontSize": 22, + "editor.fontSize": 8, "editor.formatOnPaste": true, "editor.formatOnSave": true, "editor.lineNumbers": "on", @@ -17,8 +17,7 @@ "files.autoSave": "afterDelay", "screencastMode.onlyKeyboardShortcuts": true, "terminal.integrated.fontSize": 18, - "workbench.activityBar.visible": true, "workbench.colorTheme": "Visual Studio Dark", "workbench.fontAliasing": "antialiased", "workbench.statusBar.visible": true -} +} \ No newline at end of file diff --git a/03_04/03_04 General Cleaning Techniques [Begin].ipynb b/03_04/03_04 General Cleaning Techniques [Begin].ipynb index d11b556..7dfceee 100644 --- a/03_04/03_04 General Cleaning Techniques [Begin].ipynb +++ b/03_04/03_04 General Cleaning Techniques [Begin].ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -62,17 +62,363 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMCustomer_AgeDependent_countMonths_on_bookTotal_Relationship_CountMonths_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_RatioNaive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
count10127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.00000010127.000000
mean95095.00000046.3259602.34620335.9284093.8125802.3411672.4553178631.9536981162.8140617469.1396370.7599414404.08630464.8586950.7122220.2748940.1599970.840003
std2923.5574228.0168141.2989087.9864161.5544081.0106221.1062259088.776650814.9873359090.6853240.2192073397.12925423.4725700.2380860.2756910.3653010.365301
min90032.00000026.0000000.00000013.0000001.0000000.0000000.0000001438.3000000.0000003.0000000.000000510.00000010.0000000.0000000.0000000.0000080.000420
25%92563.50000041.0000001.00000031.0000003.0000002.0000002.0000002555.000000359.0000001324.5000000.6310002155.50000045.0000000.5820000.0230000.0000990.999660
50%95095.00000046.0000002.00000036.0000004.0000002.0000002.0000004549.0000001276.0000003474.0000000.7360003899.00000067.0000000.7020000.1760000.0001810.999820
75%97626.50000052.0000003.00000040.0000005.0000003.0000003.00000011067.5000001784.0000009859.0000000.8590004741.00000081.0000000.8180000.5030000.0003370.999900
max100158.00000073.0000005.00000056.0000006.0000006.0000006.00000034516.0000002517.00000034516.0000003.39700018484.000000139.0000003.7140000.9990000.9995800.999990
\n", + "
" + ], + "text/plain": [ + " CLIENTNUM Customer_Age Dependent_count Months_on_book \\\n", + "count 10127.000000 10127.000000 10127.000000 10127.000000 \n", + "mean 95095.000000 46.325960 2.346203 35.928409 \n", + "std 2923.557422 8.016814 1.298908 7.986416 \n", + "min 90032.000000 26.000000 0.000000 13.000000 \n", + "25% 92563.500000 41.000000 1.000000 31.000000 \n", + "50% 95095.000000 46.000000 2.000000 36.000000 \n", + "75% 97626.500000 52.000000 3.000000 40.000000 \n", + "max 100158.000000 73.000000 5.000000 56.000000 \n", + "\n", + " Total_Relationship_Count Months_Inactive_12_mon \\\n", + "count 10127.000000 10127.000000 \n", + "mean 3.812580 2.341167 \n", + "std 1.554408 1.010622 \n", + "min 1.000000 0.000000 \n", + "25% 3.000000 2.000000 \n", + "50% 4.000000 2.000000 \n", + "75% 5.000000 3.000000 \n", + "max 6.000000 6.000000 \n", + "\n", + " Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal \\\n", + "count 10127.000000 10127.000000 10127.000000 \n", + "mean 2.455317 8631.953698 1162.814061 \n", + "std 1.106225 9088.776650 814.987335 \n", + "min 0.000000 1438.300000 0.000000 \n", + "25% 2.000000 2555.000000 359.000000 \n", + "50% 2.000000 4549.000000 1276.000000 \n", + "75% 3.000000 11067.500000 1784.000000 \n", + "max 6.000000 34516.000000 2517.000000 \n", + "\n", + " Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct \\\n", + "count 10127.000000 10127.000000 10127.000000 10127.000000 \n", + "mean 7469.139637 0.759941 4404.086304 64.858695 \n", + "std 9090.685324 0.219207 3397.129254 23.472570 \n", + "min 3.000000 0.000000 510.000000 10.000000 \n", + "25% 1324.500000 0.631000 2155.500000 45.000000 \n", + "50% 3474.000000 0.736000 3899.000000 67.000000 \n", + "75% 9859.000000 0.859000 4741.000000 81.000000 \n", + "max 34516.000000 3.397000 18484.000000 139.000000 \n", + "\n", + " Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio \\\n", + "count 10127.000000 10127.000000 \n", + "mean 0.712222 0.274894 \n", + "std 0.238086 0.275691 \n", + "min 0.000000 0.000000 \n", + "25% 0.582000 0.023000 \n", + "50% 0.702000 0.176000 \n", + "75% 0.818000 0.503000 \n", + "max 3.714000 0.999000 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 \\\n", + "count 10127.000000 \n", + "mean 0.159997 \n", + "std 0.365301 \n", + "min 0.000008 \n", + "25% 0.000099 \n", + "50% 0.000181 \n", + "75% 0.000337 \n", + "max 0.999580 \n", + "\n", + " Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 \n", + "count 10127.000000 \n", + "mean 0.840003 \n", + "std 0.365301 \n", + "min 0.000420 \n", + "25% 0.999660 \n", + "50% 0.999820 \n", + "75% 0.999900 \n", + "max 0.999990 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.describe()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CLIENTNUMAttrition_FlagCustomer_AgeGenderDependent_countEducation_LevelMarital_StatusIncome_CategoryCard_CategoryMonths_on_bookTotal_Relationship_CountMonths_Inactive_12_monContacts_Count_12_monCredit_LimitTotal_Revolving_BalAvg_Open_To_BuyTotal_Amt_Chng_Q4_Q1Total_Trans_AmtTotal_Trans_CtTotal_Ct_Chng_Q4_Q1Avg_Utilization_Ratio
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [CLIENTNUM, Attrition_Flag, Customer_Age, Gender, Dependent_count, Education_Level, Marital_Status, Income_Category, Card_Category, Months_on_book, Total_Relationship_Count, Months_Inactive_12_mon, Contacts_Count_12_mon, Credit_Limit, Total_Revolving_Bal, Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio]\n", + "Index: []" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[data[\"CLIENTNUM\"].duplicated()]" + ] }, { "cell_type": "code", @@ -104,17 +450,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',\n", + " 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],\n", + " dtype='object')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "(10127, 21)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',\n", + " 'Dependent_count', 'Education_Level', 'Marital_Status',\n", + " 'Income_Category', 'Card_Category', 'Months_on_book',\n", + " 'Total_Relationship_Count', 'Months_Inactive_12_mon',\n", + " 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',\n", + " 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',\n", + " 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']]\n", + "\n", + "data.shape" + ] }, { "cell_type": "markdown", @@ -125,10 +514,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "CLIENTNUM int64\n", + "Attrition_Flag object\n", + "Customer_Age int64\n", + "Gender object\n", + "Dependent_count int64\n", + "Education_Level object\n", + "Marital_Status object\n", + "Income_Category object\n", + "Card_Category object\n", + "Months_on_book int64\n", + "Total_Relationship_Count int64\n", + "Months_Inactive_12_mon int64\n", + "Contacts_Count_12_mon int64\n", + "Credit_Limit float64\n", + "Total_Revolving_Bal int64\n", + "Avg_Open_To_Buy float64\n", + "Total_Amt_Chng_Q4_Q1 float64\n", + "Total_Trans_Amt int64\n", + "Total_Trans_Ct int64\n", + "Total_Ct_Chng_Q4_Q1 float64\n", + "Avg_Utilization_Ratio float64\n", + "dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.dtypes" + ] }, { "cell_type": "code", @@ -154,7 +577,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4 (main, Jan 18 2023, 00:26:41) [GCC 9.4.0]" + "version": "3.12.1" }, "vscode": { "interpreter": {