Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -464,9 +464,9 @@
"metadata": {},
"outputs": [],
"source": [
"# numerical variables: every non-object column except the target\n",
"vars_num = [var for var in data.columns if data[var].dtype != 'O' and var != target]\n",
"\n",
"# categorical variables: every object-dtype column\n",
"vars_cat = [var for var in data.columns if data[var].dtype == 'O']\n",
"\n",
"print('Number of numerical variables: {}'.format(len(vars_num)))\n",
"print('Number of categorical variables: {}'.format(len(vars_cat)))"
Expand All @@ -486,7 +486,8 @@
"outputs": [],
"source": [
"# share of missing values in each numerical variable\n",
"\n",
"data[vars_num].isnull().mean()"
]
},
{
Expand All @@ -496,7 +497,8 @@
"outputs": [],
"source": [
"# share of missing values in each categorical variable\n",
"\n",
"data[vars_cat].isnull().mean()"
]
},
{
Expand All @@ -511,7 +513,11 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# number of distinct labels per categorical variable, largest first\n",
"\n",
"cat_cardinality = data[vars_cat].nunique()\n",
"cat_cardinality.sort_values(ascending=False)"
]
},
{
"cell_type": "markdown",
Expand All @@ -525,7 +531,13 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# one histogram per numerical variable\n",
"\n",
"data[vars_num].hist(figsize=(12, 10), bins=30)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -565,7 +577,12 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# keep only the first character (the deck letter) of each cabin value\n",
"\n",
"X_train['cabin'] = X_train['cabin'].str.get(0)\n",
"X_test['cabin'] = X_test['cabin'].str.get(0)"
]
},
{
"cell_type": "markdown",
Expand All @@ -582,7 +599,21 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# add missing indicators and fill numerical variables with the median\n",
"\n",
"# decide which variables need an indicator from the train set only, so that\n",
"# train and test always receive the same '_na' columns\n",
"vars_with_na = [var for var in vars_num if X_train[var].isnull().any()]\n",
"\n",
"for dataset in [X_train, X_test]:\n",
"    for var in vars_with_na:\n",
"        # 1 where the original value was missing, 0 otherwise\n",
"        dataset[var + '_na'] = np.where(dataset[var].isnull(), 1, 0)\n",
"\n",
"# capture the median values from the train set only\n",
"train_median = X_train[vars_num].median()\n",
"\n",
"# fill missing values in train and test with the train medians\n",
"X_train[vars_num] = X_train[vars_num].fillna(train_median)\n",
"X_test[vars_num] = X_test[vars_num].fillna(train_median)"
]
},
{
"cell_type": "markdown",
Expand All @@ -596,21 +627,34 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# label missing categorical values with the explicit string 'Missing'\n",
"\n",
"X_train[vars_cat] = X_train[vars_cat].fillna('Missing')\n",
"X_test[vars_cat] = X_test[vars_cat].fillna('Missing')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# the five columns with the highest share of missing values (all should be 0)\n",
"\n",
"na_share = X_train.isna().mean()\n",
"na_share.sort_values(ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# distinct labels per categorical variable once 'Missing' has been added\n",
"\n",
"train_cardinality = X_train[vars_cat].nunique()\n",
"train_cardinality.sort_values(ascending=False)"
]
},
{
"cell_type": "markdown",
Expand All @@ -626,21 +670,43 @@
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# keep labels present in at least 5% of the train rows; group the rest as 'Rare'\n",
"\n",
"def find_frequent_labels(df, var, rare_perc):\n",
"    \"\"\"Return the labels of `var` whose relative frequency in `df` is >= `rare_perc`.\"\"\"\n",
"    label_share = df[var].value_counts(normalize=True)\n",
"    return label_share.loc[label_share >= rare_perc].index\n",
"\n",
"for var in vars_cat:\n",
"    # frequent labels are learned from the train set and reused for the test set\n",
"    frequent_ls = find_frequent_labels(X_train, var, 0.05)\n",
"\n",
"    train_is_frequent = X_train[var].isin(frequent_ls)\n",
"    test_is_frequent = X_test[var].isin(frequent_ls)\n",
"\n",
"    X_train[var] = np.where(train_is_frequent, X_train[var], 'Rare')\n",
"    X_test[var] = np.where(test_is_frequent, X_test[var], 'Rare')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# distinct labels per categorical variable after grouping rare labels\n",
"\n",
"grouped_cardinality = X_train[vars_cat].nunique()\n",
"grouped_cardinality.sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# relative frequency of every label, one categorical variable at a time\n",
"\n",
"for var in vars_cat:\n",
"    print(X_train[var].value_counts(normalize=True), end='\\n\\n')"
]
},
{
"cell_type": "markdown",
Expand All @@ -657,28 +723,45 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# one hot encode categorical variables\n",
"\n",
"# Pin each variable's categories to the sorted labels seen in the train set so\n",
"# that drop_first removes the same reference level in both sets. Encoding the\n",
"# sets independently drops whichever label sorts first in each set, which can\n",
"# differ when the test set lacks a train label. Test labels that never occur\n",
"# in the train set become NaN and are encoded as all zeros.\n",
"for var in vars_cat:\n",
"    train_labels = sorted(X_train[var].dropna().unique())\n",
"    X_train[var] = pd.Categorical(X_train[var], categories=train_labels)\n",
"    X_test[var] = pd.Categorical(X_test[var], categories=train_labels)\n",
"\n",
"X_train = pd.get_dummies(X_train, columns=vars_cat, drop_first=True)\n",
"X_test = pd.get_dummies(X_test, columns=vars_cat, drop_first=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# give the test set exactly the train set's columns, filling gaps with 0\n",
"\n",
"X_train, X_test = X_train.align(\n",
"    X_test,\n",
"    join='left',\n",
"    axis=1,\n",
"    fill_value=0,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# (rows, columns) of train and test after encoding\n",
"\n",
"(X_train.shape, X_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# preview the first five rows of the encoded train set\n",
"\n",
"X_train.head(n=5)"
]
},
{
"cell_type": "markdown",
Expand All @@ -694,7 +777,18 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# standardise features using statistics fitted on the train set only\n",
"\n",
"scaler = StandardScaler().fit(X_train)\n",
"\n",
"# apply the fitted scaler to both sets\n",
"X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)"
]
},
{
"cell_type": "markdown",
Expand All @@ -711,7 +805,14 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# logistic regression with C=0.0005, fitted on the train set\n",
"\n",
"logit = LogisticRegression(max_iter=200, C=0.0005, random_state=0)\n",
"\n",
"logit.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
Expand All @@ -731,7 +832,18 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# class predictions (for accuracy) and positive-class probabilities (for roc-auc)\n",
"\n",
"pred = logit.predict(X_test)\n",
"pred_proba = logit.predict_proba(X_test)[:, 1]\n",
"\n",
"roc_auc = roc_auc_score(y_test, pred_proba)\n",
"accuracy = accuracy_score(y_test, pred)\n",
"\n",
"print('roc-auc: {}'.format(roc_auc))\n",
"print('accuracy: {}'.format(accuracy))"
]
},
{
"cell_type": "markdown",
Expand Down