77< meta name ="viewport " content ="width=device-width, initial-scale=1.0, user-scalable=yes ">
88
99
10- < title > 3 Regression – Machine Learning</ title >
10+ < title > 3 Regression (work in progress) – Machine Learning</ title >
1111< style >
1212code {white-space : pre-wrap;}
1313span .smallcaps {font-variant : small-caps;}
150150 < i class ="bi bi-layout-text-sidebar-reverse "> </ i >
151151 </ button >
152152 < a class ="flex-grow-1 no-decor " role ="navigation " data-bs-toggle ="collapse " data-bs-target =".quarto-sidebar-collapse-item " aria-controls ="quarto-sidebar " aria-expanded ="false " aria-label ="Toggle sidebar navigation " onclick ="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); } ">
153- < h1 class ="quarto-secondary-nav-title "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression</ span > </ h1 >
153+ < h1 class ="quarto-secondary-nav-title "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression (work in progress) </ span > </ h1 >
154154 </ a >
155155 < button type ="button " class ="btn quarto-search-button " aria-label ="Search " onclick ="window.quartoOpenSearch(); ">
156156 < i class ="bi bi-search "> </ i >
@@ -195,7 +195,7 @@ <h1 class="quarto-secondary-nav-title"><span class="chapter-number">3</span>&nbs
195195 < li class ="sidebar-item ">
196196 < div class ="sidebar-item-container ">
197197 < a href ="./02-Regression.html " class ="sidebar-item-text sidebar-link active ">
198- < span class ="menu-text "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression</ span > </ span > </ a >
198+ < span class ="menu-text "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression (work in progress) </ span > </ span > </ a >
199199 </ div >
200200</ li >
201201 < li class ="sidebar-item ">
@@ -249,7 +249,7 @@ <h2 id="toc-title">Table of contents</h2>
249249
250250< header id ="title-block-header " class ="quarto-title-block default ">
251251< div class ="quarto-title ">
252- < h1 class ="title d-none d-lg-block "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression</ span > </ h1 >
252+ < h1 class ="title d-none d-lg-block "> < span class ="chapter-number "> 3</ span > < span class ="chapter-title "> Regression (work in progress) </ span > </ h1 >
253253</ div >
254254
255255
@@ -277,7 +277,7 @@ <h3 data-number="3.1.1" class="anchored" data-anchor-id="one-predictor"><span cl
277277MeanBloodPressure= \beta_0 + \beta_1 \cdot Age
278278\]</ span > </ p >
279279< p > Our model would look like the following, shown as the red line fit to our Training data:</ p >
280- < div id ="ad9e4955 " class ="cell " data-execution_count ="1 ">
280+ < div id ="696bbdfc " class ="cell " data-execution_count ="1 ">
281281< div class ="sourceCode cell-code " id ="cb1 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb1-1 "> < a href ="#cb1-1 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > pandas < span class ="im "> as</ span > pd</ span >
282282< span id ="cb1-2 "> < a href ="#cb1-2 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > seaborn < span class ="im "> as</ span > sns</ span >
283283< span id ="cb1-3 "> < a href ="#cb1-3 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > numpy < span class ="im "> as</ span > np</ span >
@@ -353,7 +353,7 @@ <h3 data-number="3.3.1" class="anchored" data-anchor-id="underfitting-and-overfi
353353< p > < span class ="math display "> \[
354354MeanBloodPressure= \beta_0 + \beta_1 \cdot Age
355355\]</ span > </ p >
356- < div id ="c59662c3 " class ="cell " data-execution_count ="2 ">
356+ < div id ="1872e11e " class ="cell " data-execution_count ="2 ">
357357< div class ="sourceCode cell-code " id ="cb2 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb2-1 "> < a href ="#cb2-1 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > pandas < span class ="im "> as</ span > pd</ span >
358358< span id ="cb2-2 "> < a href ="#cb2-2 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > seaborn < span class ="im "> as</ span > sns</ span >
359359< span id ="cb2-3 "> < a href ="#cb2-3 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="im "> import</ span > numpy < span class ="im "> as</ span > np</ span >
@@ -416,7 +416,7 @@ <h3 data-number="3.3.1" class="anchored" data-anchor-id="underfitting-and-overfi
416416< li > < p > The relationship between a predictor and response is fit via a higher order polynomial or smooth function</ p > </ li >
417417</ ul >
418418< p > Let’s look at what happens if we increase the complexity of the model by fitting it with a smoother function. We use a polynomial function of order 2.</ p >
419- < div id ="dbf103b9 " class ="cell " data-execution_count ="3 ">
419+ < div id ="4ae213a5 " class ="cell " data-execution_count ="3 ">
420420< div class ="sourceCode cell-code " id ="cb4 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb4-1 "> < a href ="#cb4-1 " aria-hidden ="true " tabindex ="-1 "> </ a > p_degree < span class ="op "> =</ span > < span class ="dv "> 2</ span > </ span >
421421< span id ="cb4-2 "> < a href ="#cb4-2 " aria-hidden ="true " tabindex ="-1 "> </ a > y, X < span class ="op "> =</ span > model_matrix(< span class ="st "> "BloodPressure ~ BMI + poly(BMI, degree="</ span > < span class ="op "> +</ span > < span class ="bu "> str</ span > (p_degree) < span class ="op "> +</ span > < span class ="st "> ")"</ span > , nhanes_tiny)</ span >
422422< span id ="cb4-3 "> < a href ="#cb4-3 " aria-hidden ="true " tabindex ="-1 "> </ a > </ span >
@@ -465,7 +465,7 @@ <h3 data-number="3.3.1" class="anchored" data-anchor-id="underfitting-and-overfi
465465</ div >
466466< p > We see that both Training and Testing error decreased slightly!</ p >
467467< p > What happens if we keep increasing the model complexity?</ p >
468- < div id ="13bc74c9 " class ="cell " data-execution_count ="4 ">
468+ < div id ="54be1bb4 " class ="cell " data-execution_count ="4 ">
469469< div class ="sourceCode cell-code " id ="cb6 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb6-1 "> < a href ="#cb6-1 " aria-hidden ="true " tabindex ="-1 "> </ a > < span class ="cf "> for</ span > p_degree < span class ="kw "> in</ span > [< span class ="dv "> 4</ span > , < span class ="dv "> 10</ span > ]:</ span >
470470< span id ="cb6-2 "> < a href ="#cb6-2 " aria-hidden ="true " tabindex ="-1 "> </ a > y, X < span class ="op "> =</ span > model_matrix(< span class ="st "> "BloodPressure ~ BMI + poly(BMI, degree="</ span > < span class ="op "> +</ span > < span class ="bu "> str</ span > (p_degree) < span class ="op "> +</ span > < span class ="st "> ")"</ span > , nhanes_tiny)</ span >
471471< span id ="cb6-3 "> < a href ="#cb6-3 " aria-hidden ="true " tabindex ="-1 "> </ a > </ span >
@@ -523,7 +523,7 @@ <h3 data-number="3.3.1" class="anchored" data-anchor-id="underfitting-and-overfi
523523</ div >
524524</ div >
525525< p > Let’s summarize it:</ p >
526- < div id ="93f3ed51 " class ="cell " data-execution_count ="5 ">
526+ < div id ="af04bf84 " class ="cell " data-execution_count ="5 ">
527527< div class ="sourceCode cell-code " id ="cb9 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb9-1 "> < a href ="#cb9-1 " aria-hidden ="true " tabindex ="-1 "> </ a > train_err < span class ="op "> =</ span > []</ span >
528528< span id ="cb9-2 "> < a href ="#cb9-2 " aria-hidden ="true " tabindex ="-1 "> </ a > test_err < span class ="op "> =</ span > []</ span >
529529< span id ="cb9-3 "> < a href ="#cb9-3 " aria-hidden ="true " tabindex ="-1 "> </ a > polynomials < span class ="op "> =</ span > < span class ="bu "> list</ span > (< span class ="bu "> range</ span > (< span class ="dv "> 1</ span > , < span class ="dv "> 10</ span > ))</ span >
@@ -588,14 +588,14 @@ <h3 data-number="3.4.1" class="anchored" data-anchor-id="linear-model"><span cla
588588< p > < span class ="math inline "> \(\beta_0\)</ span > is a parameter describing the intercept of the line, and < span class ="math inline "> \(\beta_1\)</ span > is a parameter describing the slope of the line.</ p >
589589< p > Suppose that fitting the model on the Training Set gives < span class ="math inline "> \(\beta_1=2\)</ span > . That means an increase of < span class ="math inline "> \(BMI\)</ span > by 1 is associated with an average increase of < span class ="math inline "> \(BloodPressure\)</ span > by 2. This measures the strength of association between a variable and the outcome.</ p >
590590< p > Let’s see this in practice:</ p >
591- < div id ="78cfd7ec " class ="cell " data-execution_count ="6 ">
591+ < div id ="368e9426 " class ="cell " data-execution_count ="6 ">
592592< div class ="sourceCode cell-code " id ="cb10 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb10-1 "> < a href ="#cb10-1 " aria-hidden ="true " tabindex ="-1 "> </ a > y, X < span class ="op "> =</ span > model_matrix(< span class ="st "> "BloodPressure ~ BMI"</ span > , nhanes_tiny)</ span >
593593< span id ="cb10-2 "> < a href ="#cb10-2 " aria-hidden ="true " tabindex ="-1 "> </ a > </ span >
594594< span id ="cb10-3 "> < a href ="#cb10-3 " aria-hidden ="true " tabindex ="-1 "> </ a > X_train, X_test, y_train, y_test < span class ="op "> =</ span > train_test_split(X, y, test_size< span class ="op "> =</ span > < span class ="fl "> 0.5</ span > , random_state< span class ="op "> =</ span > < span class ="dv "> 42</ span > )</ span >
595595< span id ="cb10-4 "> < a href ="#cb10-4 " aria-hidden ="true " tabindex ="-1 "> </ a > linear_model < span class ="op "> =</ span > sm.OLS(y_train, X_train).fit()</ span >
596596< span id ="cb10-5 "> < a href ="#cb10-5 " aria-hidden ="true " tabindex ="-1 "> </ a > </ span >
597597< span id ="cb10-6 "> < a href ="#cb10-6 " aria-hidden ="true " tabindex ="-1 "> </ a > linear_model.summary()</ span > </ code > < button title ="Copy to Clipboard " class ="code-copy-button "> < i class ="bi "> </ i > </ button > </ pre > </ div >
598- < div class ="cell-output cell-output-display " data-execution_count ="32 ">
598+ < div class ="cell-output cell-output-display " data-execution_count ="6 ">
599599< table class ="simpletable caption-top table table-sm table-striped small " data-quarto-postprocess ="true ">
600600< caption > OLS Regression Results</ caption >
601601< tbody >
@@ -619,13 +619,13 @@ <h3 data-number="3.4.1" class="anchored" data-anchor-id="linear-model"><span cla
619619</ tr >
620620< tr class ="even ">
621621< td data-quarto-table-cell-role ="th "> Date:</ td >
622- < td > Wed, 21 Jan 2026</ td >
622+ < td > Fri, 23 Jan 2026</ td >
623623< td data-quarto-table-cell-role ="th "> Prob (F-statistic):</ td >
624624< td > 0.00118</ td >
625625</ tr >
626626< tr class ="odd ">
627627< td data-quarto-table-cell-role ="th "> Time:</ td >
628- < td > 15:25:31 </ td >
628+ < td > 20:04:22 </ td >
629629< td data-quarto-table-cell-role ="th "> Log-Likelihood:</ td >
630630< td > -520.69</ td >
631631</ tr >
@@ -747,7 +747,7 @@ <h4 data-number="3.6.0.1" class="anchored" data-anchor-id="correlations-and-inte
747747</ section >
748748< section id ="exercise-correlations " class ="level4 " data-number ="3.6.0.2 ">
749749< h4 data-number ="3.6.0.2 " class ="anchored " data-anchor-id ="exercise-correlations "> < span class ="header-section-number "> 3.6.0.2</ span > Exercise: Correlations</ h4 >
750- < div id ="6ce1ed28 " class ="cell " data-execution_count ="7 ">
750+ < div id ="2310c641 " class ="cell " data-execution_count ="7 ">
751751< div class ="sourceCode cell-code " id ="cb11 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb11-1 "> < a href ="#cb11-1 " aria-hidden ="true " tabindex ="-1 "> </ a > plt.clf()</ span >
752752< span id ="cb11-2 "> < a href ="#cb11-2 " aria-hidden ="true " tabindex ="-1 "> </ a > ax < span class ="op "> =</ span > sns.regplot(y< span class ="op "> =</ span > < span class ="st "> "Age"</ span > , x< span class ="op "> =</ span > < span class ="st "> "BMI"</ span > , data< span class ="op "> =</ span > nhanes_train, lowess< span class ="op "> =</ span > < span class ="va "> True</ span > , scatter_kws< span class ="op "> =</ span > {< span class ="st "> 'alpha'</ span > :< span class ="fl "> 0.1</ span > }, line_kws< span class ="op "> =</ span > {< span class ="st "> 'color'</ span > :< span class ="st "> "r"</ span > })</ span >
753753< span id ="cb11-3 "> < a href ="#cb11-3 " aria-hidden ="true " tabindex ="-1 "> </ a > ax.set_xlim([< span class ="dv "> 10</ span > , < span class ="dv "> 50</ span > ])</ span >
@@ -763,7 +763,7 @@ <h4 data-number="3.6.0.2" class="anchored" data-anchor-id="exercise-correlations
763763</ section >
764764< section id ="exercise-interactions " class ="level4 " data-number ="3.6.0.3 ">
765765< h4 data-number ="3.6.0.3 " class ="anchored " data-anchor-id ="exercise-interactions "> < span class ="header-section-number "> 3.6.0.3</ span > Exercise: Interactions</ h4 >
766- < div id ="43e93275 " class ="cell " data-execution_count ="8 ">
766+ < div id ="b8bbfdab " class ="cell " data-execution_count ="8 ">
767767< div class ="sourceCode cell-code " id ="cb12 "> < pre class ="sourceCode python code-with-copy "> < code class ="sourceCode python "> < span id ="cb12-1 "> < a href ="#cb12-1 " aria-hidden ="true " tabindex ="-1 "> </ a > plt.clf()</ span >
768768< span id ="cb12-2 "> < a href ="#cb12-2 " aria-hidden ="true " tabindex ="-1 "> </ a > ax < span class ="op "> =</ span > sns.lmplot(y< span class ="op "> =</ span > < span class ="st "> "MeanBloodPressure"</ span > , x< span class ="op "> =</ span > < span class ="st "> "BMI"</ span > , hue< span class ="op "> =</ span > < span class ="st "> "Gender"</ span > , data< span class ="op "> =</ span > nhanes_train, lowess< span class ="op "> =</ span > < span class ="va "> True</ span > , scatter_kws< span class ="op "> =</ span > {< span class ="st "> 'alpha'</ span > :< span class ="fl "> 0.1</ span > })</ span >
769769< span id ="cb12-3 "> < a href ="#cb12-3 " aria-hidden ="true " tabindex ="-1 "> </ a > ax.< span class ="bu "> set</ span > (xlim< span class ="op "> =</ span > (< span class ="dv "> 10</ span > , < span class ="dv "> 50</ span > )) </ span >
0 commit comments