diff --git a/section-04-research-and-development/titanic-assignment/01-predicting-survival-titanic-assignement.ipynb b/section-04-research-and-development/titanic-assignment/01-predicting-survival-titanic-assignement.ipynb index 13a218de6..79a6036e8 100644 --- a/section-04-research-and-development/titanic-assignment/01-predicting-survival-titanic-assignement.ipynb +++ b/section-04-research-and-development/titanic-assignment/01-predicting-survival-titanic-assignement.ipynb @@ -464,9 +464,9 @@ "metadata": {}, "outputs": [], "source": [ - "vars_num = # fill your code here\n", + "vars_num = [var for var in data.columns if data[var].dtype != 'O' and var != target]\n", "\n", - "vars_cat = # fill your code here\n", + "vars_cat = [var for var in data.columns if data[var].dtype == 'O']\n", "\n", "print('Number of numerical variables: {}'.format(len(vars_num)))\n", "print('Number of categorical variables: {}'.format(len(vars_cat)))" @@ -486,7 +486,8 @@ "outputs": [], "source": [ "# first in numerical variables\n", - "\n" + "\n", + "data[vars_num].isnull().mean()" ] }, { @@ -496,7 +497,8 @@ "outputs": [], "source": [ "# now in categorical variables\n", - "\n" + "\n", + "data[vars_cat].isnull().mean()" ] }, { @@ -511,7 +513,11 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# determine cardinality of categorical variables\n", + "\n", + "data[vars_cat].nunique().sort_values(ascending=False)" + ] }, { "cell_type": "markdown", @@ -525,7 +531,13 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# display numerical variables distributions\n", + "\n", + "data[vars_num].hist(bins=30, figsize=(12, 10))\n", + "plt.tight_layout()\n", + "plt.show()" + ] }, { "cell_type": "markdown", @@ -565,7 +577,12 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# extract the letter from the cabin\n", + "\n", + "for dataset in [X_train, X_test]:\n", + " dataset['cabin'] = dataset['cabin'].str[0]" + ] }, { "cell_type": "markdown", @@ -582,7 +599,21 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# add missing indicator and fill numerical variables with median\n", + "\n", + "for dataset in [X_train, X_test]:\n", + " for var in vars_num:\n", + " if dataset[var].isnull().sum() > 0:\n", + " dataset[var + '_na'] = np.where(dataset[var].isnull(), 1, 0)\n", + "\n", + "# capture the median values from train set\n", + "train_median = X_train[vars_num].median()\n", + "\n", + "# fill missing values in train and test\n", + "X_train[vars_num] = X_train[vars_num].fillna(train_median)\n", + "X_test[vars_num] = X_test[vars_num].fillna(train_median)" + ] }, { "cell_type": "markdown", @@ -596,21 +627,34 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# replace missing values in categorical variables with string Missing\n", + "\n", + "for dataset in [X_train, X_test]:\n", + " dataset[vars_cat] = dataset[vars_cat].fillna('Missing')" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# check that there are no missing values left\n", + "\n", + "X_train.isnull().mean().sort_values(ascending=False).head()" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# display categorical variable cardinality after missing value replacement\n", + "\n", + "X_train[vars_cat].nunique().sort_values(ascending=False)" + ] }, { "cell_type": "markdown", @@ -626,21 +670,43 @@ "execution_count": 21, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# find frequent labels (>=5%) for each categorical variable\n", + "\n", + "def find_frequent_labels(df, var, rare_perc):\n", + " tmp = df[var].value_counts(normalize=True)\n", + " return tmp[tmp >= rare_perc].index\n", + "\n", + "for var in vars_cat:\n", + " frequent_ls = find_frequent_labels(X_train, var, 0.05)\n", + "\n", + " X_train[var] = np.where(X_train[var].isin(frequent_ls), X_train[var], 'Rare')\n", + " X_test[var] = np.where(X_test[var].isin(frequent_ls), X_test[var], 'Rare')" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# check new category cardinality\n", + "\n", + "X_train[vars_cat].nunique().sort_values(ascending=False)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# visualize categorical frequencies\n", + "\n", + "for var in vars_cat:\n", + " print(X_train[var].value_counts(normalize=True))\n", + " print()" + ] }, { "cell_type": "markdown", @@ -657,28 +723,45 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# one hot encode categorical variables\n", + "\n", + "X_train = pd.get_dummies(X_train, columns=vars_cat, drop_first=True)\n", + "X_test = pd.get_dummies(X_test, columns=vars_cat, drop_first=True)" + ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# align train and test set columns\n", + "\n", + "X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# check shape after encoding\n", + "\n", + "X_train.shape, X_test.shape" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# display first rows\n", + "\n", + "X_train.head()" + ] }, { "cell_type": "markdown", @@ -694,7 +777,18 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# set up the scaler\n", + "\n", + "scaler = StandardScaler()\n", + "\n", + "# fit on train set only\n", + "scaler.fit(X_train)\n", + "\n", + "# transform the train and test set\n", + "X_train = scaler.transform(X_train)\n", + "X_test = scaler.transform(X_test)" + ] }, { "cell_type": "markdown", @@ -711,7 +805,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# set up the model\n", + "\n", + "logit = LogisticRegression(C=0.0005, random_state=0, max_iter=200)\n", + "\n", + "# train the model\n", + "logit.fit(X_train, y_train)" + ] }, { "cell_type": "markdown", @@ -731,7 +832,18 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# make predictions\n", + "\n", + "pred = logit.predict(X_test)\n", + "\n", + "# probabilities for roc-auc\n", + "pred_proba = logit.predict_proba(X_test)[:, 1]\n", + "\n", + "# determine metrics\n", + "print('roc-auc: {}'.format(roc_auc_score(y_test, pred_proba)))\n", + "print('accuracy: {}'.format(accuracy_score(y_test, pred)))" + ] }, { "cell_type": "markdown",