From 7dc29680fc0dee10506ac6389afe2f162af0d7db Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 2 Mar 2026 15:32:40 -0600 Subject: [PATCH 01/23] Add dataset utilities and tests --- src/dataset.jl | 133 ++++++++++++++++++++++++++++++++++++++++++ test/dataset_tests.jl | 0 2 files changed, 133 insertions(+) create mode 100644 src/dataset.jl create mode 100644 test/dataset_tests.jl diff --git a/src/dataset.jl b/src/dataset.jl new file mode 100644 index 0000000..52c0153 --- /dev/null +++ b/src/dataset.jl @@ -0,0 +1,133 @@ + +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset + +""" + Dataset(name, X, y) + +Contains datasets for ridge regression experiments. + +# Fields +- `name::String`: Name of dataset +- `X::Matrix{Float64}`: Matrix of variables/features +- `y::Vector{Float64}`: Target vector + +# Throws +- `ArgumentError`: If rows in `X` does not equal length of `y`. + +# Notes +Used as the experimental unit for ridge regression experiments. +""" +struct Dataset + name::String + X::Matrix{Float64} + y::Vector{Float64} + + function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) + size(X, 1) == length(y) || + throw(ArgumentError("X and y must have same number of rows")) + + new(name, Matrix{Float64}(X), Vector{Float64}(y)) + end +end + +""" + one_hot_encode(dataset::Dataset; drop_first=true) + +One-hot encode categorical (string-like) features in `dataset.X`. + +# Arguments +- `dataset::Dataset`: Input dataset containing feature matrix `X` + and response vector `y`. + +# Keyword Arguments +- `drop_first::Bool=true`: If `true`, drop the first dummy column for + each categorical feature to avoid multicollinearity. + +# Returns +A new `Dataset` with numeric `X` and unchanged `y`. 
+""" +function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} + n = nrow(Xdf) + cols = Vector{Vector{Float64}}() + + for name in names(Xdf) + col = Xdf[!, name] + if eltype(col) <: Real + push!(cols, Float64.(col)) + continue + end + + scol = string.(col) + lv = unique(scol) + ind = scol .== permutedims(lv) + + println("Variable: $name") + for (j, level) in enumerate(lv) + println(" Dummy column (before drop) $j → $name = $level") + end + + if drop_first && size(ind, 2) > 1 + ind = ind[:, 2:end] + end + + for j in 1:size(ind, 2) + push!(cols, Float64.(ind[:, j])) + end + end + + p = length(cols) + X = Matrix{Float64}(undef, n, p) + for j in 1:p + X[:, j] = cols[j] + end + + return Matrix{Float64}(X) + +end +""" + csv_dataset(path_or_url; target_col, name="csv_dataset") + +Load a dataset from a CSV file or URL. + +# Arguments +- `path_or_url::String` + Local file path or web URL that has CSV data. + +- `target_col` + Column index OR column name containing the response variable. + +- `name::String` + Dataset name. + +# Returns +`Dataset` +""" +function csv_dataset(path_or_url::String; + target_col, + name::String = "csv_dataset" +) + + filepath = + startswith(path_or_url, "http") ? + Downloads.download(path_or_url) : + path_or_url + + df = DataFrame(CSV.File(filepath)) + df = dropmissing(df) + Xdf = select(df, DataFrames.Not(target_col)) + + y = target_col isa Int ? 
+ df[:, target_col] : + df[:, Symbol(target_col)] + + + X = one_hot_encode(Xdf; drop_first = true) + + + + return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) +end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl new file mode 100644 index 0000000..e69de29 From 47fc954ddfe6fdf09c3c998929432b782079f760 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:27:21 -0600 Subject: [PATCH 02/23] Adding dataset_tests.jl --- Project.toml | 10 ++++++- docs/make.jl | 1 + docs/src/design.md | 32 +++++++++++++++++++++ src/dataset.jl | 1 - test/dataset_tests.jl | 66 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 docs/src/design.md diff --git a/Project.toml b/Project.toml index 86ea0a5..427e8aa 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,15 @@ name = "RidgeRegression" uuid = "739161c8-60e1-4c49-8f89-ff30998444b1" -authors = ["Vivak Patel "] version = "0.1.0" +authors = ["Eton Tackett ", "Vivak Patel "] + +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [compat] +CSV = "0.10.15" +DataFrames = "1.8.1" +Downloads = "1.7.0" julia = "1.12.4" diff --git a/docs/make.jl b/docs/make.jl index a1097bb..d42cfbe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; ), pages=[ "Home" => "index.md", + "Design" => "design.md", ], ) diff --git a/docs/src/design.md b/docs/src/design.md new file mode 100644 index 0000000..c83592e --- /dev/null +++ b/docs/src/design.md @@ -0,0 +1,32 @@ +# Motivation and Background +Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. 
The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. + +# Questions +Key Questions: +Which ridge regression algorithm is provides the best balance between: +-Numerical stability +-Computational aspects (GPU/CPU, runtime, etc) +-Predicative accuracy +# Experimental Units +The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. + +| Blocking System | Factor | Blocks | +|:----------------|:-------|:-------| +| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +# Treatments +The treatments are the ridge regression solution methods: +-Gradient descent +-Stochastic gradient descent +-Closed-form solutions +# Observational Units and Measurements +The observational units are each algorithm-dataset pair. 
For each combination we will observe the following +| Measurement System | Factor | Measurements | +|:--------------------------|:--------------------------|:-------------| +| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | +| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) +| Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | +| Numerical Stability | Solution accuracy | Perturbation sensitivity | +| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file diff --git a/src/dataset.jl b/src/dataset.jl index 52c0153..d135557 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,4 +1,3 @@ - using CSV using DataFrames using Downloads diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index e69de29..cefa8ce 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -0,0 +1,66 @@ +using Test +using DataFrames +using CSV + +include("../src/dataset.jl") +@testset "Dataset" begin + X = [1 2; 3 4] + y = [10, 20] + d = Dataset("toy", X, y) + + @test d.name == "toy" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + @test size(d.X) == (2, 2) + @test length(d.y) == 2 + @test d.X[1, 1] == 1.0 + @test d.y[2] == 20.0 + + @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) +end + +@testset "one_hot_encode" begin + df = DataFrame( + A = ["red", "blue", "red", "green"], + B = [1, 2, 3, 4], + C = ["small", "large", "medium", "small"] + ) + + X = redirect_stdout(devnull) do + one_hot_encode(df; drop_first = true) + end + + @test size(X) == (4, 5) + @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] + @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) + @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) + @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) +end + +@testset "csv_dataset" begin + tmp = tempname() * ".csv" + df = DataFrame( + a = [1.0, 2.0, missing, 4.0], + b = ["x", "y", "y", "x"], + y = [10.0, 20.0, 30.0, 40.0] + ) + CSV.write(tmp, df) + + d = 
redirect_stdout(devnull) do + csv_dataset(tmp; target_col=:y, name="tmp") + end + + @test d.name == "tmp" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + + @test length(d.y) == 3 + @test size(d.X, 1) == 3 + @test d.y == [10.0, 20.0, 40.0] + + d2 = redirect_stdout(devnull) do + csv_dataset(tmp; target_col=3, name="tmp2") + end + @test d2.y == [10.0, 20.0, 40.0] + @test size(d2.X, 1) == 3 +end From 787a88dc8a58861c69e32562f73a1727db0d52bc Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:31:54 -0600 Subject: [PATCH 03/23] Small changes to design.md --- docs/src/design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design.md b/docs/src/design.md index c83592e..5c02373 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -14,7 +14,7 @@ In addition to dimensional regime, matrix conditioning will be incorporated as a | Blocking System | Factor | Blocks | |:----------------|:-------|:-------| -| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| | Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | # Treatments The treatments are the ridge regression solution methods: From 2dd229507d2dc9b07c2867c06a4779a55d1944f9 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:28:05 -0500 Subject: [PATCH 04/23] March 16 Updates --- src/dataset.jl | 42 ++++++++++++++++++------------------------ test/dataset_tests.jl | 2 +- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index d135557..18d7b6b 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -53,28 +53,25 @@ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64 n = nrow(Xdf) cols = Vector{Vector{Float64}}() - for name in names(Xdf) + for name in names(Xdf) #Selecting columns that aren't the target variable and 
pushing them to the columns. col = Xdf[!, name] if eltype(col) <: Real push!(cols, Float64.(col)) continue end - scol = string.(col) - lv = unique(scol) - ind = scol .== permutedims(lv) + scol = string.(col) # Convert to string for categorical processing. + lv = unique(scol) #Get unique category levels. + ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. + #Permutedims is used to align the dimensions for broadcasting. + #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. - println("Variable: $name") - for (j, level) in enumerate(lv) - println(" Dummy column (before drop) $j → $name = $level") - end - - if drop_first && size(ind, 2) > 1 + if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. ind = ind[:, 2:end] end for j in 1:size(ind, 2) - push!(cols, Float64.(ind[:, j])) + push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. end end @@ -93,17 +90,14 @@ end Load a dataset from a CSV file or URL. # Arguments -- `path_or_url::String` - Local file path or web URL that has CSV data. +- `path_or_url::String`: Local file path or web URL containing CSV data. -- `target_col` - Column index OR column name containing the response variable. - -- `name::String` - Dataset name. +# Keyword Arguments +- `target_col`: Column index or column name containing the response variable. +- `name::String="csv_dataset"`: Dataset name. # Returns -`Dataset` +A `Dataset`. 
""" function csv_dataset(path_or_url::String; target_col, @@ -115,13 +109,13 @@ function csv_dataset(path_or_url::String; Downloads.download(path_or_url) : path_or_url - df = DataFrame(CSV.File(filepath)) - df = dropmissing(df) - Xdf = select(df, DataFrames.Not(target_col)) + df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. + df = dropmissing(df) #Remove rows with missing values. + Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. y = target_col isa Int ? - df[:, target_col] : - df[:, Symbol(target_col)] + df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. + df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. X = one_hot_encode(Xdf; drop_first = true) diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index cefa8ce..5652c96 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -2,7 +2,7 @@ using Test using DataFrames using CSV -include("../src/dataset.jl") +using RidgeRegression @testset "Dataset" begin X = [1 2; 3 4] y = [10, 20] From 173cdd14a1197e6bbe003723fa6e8df87a58575d Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:29:22 -0500 Subject: [PATCH 05/23] Ridge Regression file --- src/RidgeRegression.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl index c32de91..5ef574e 100644 --- a/src/RidgeRegression.jl +++ b/src/RidgeRegression.jl @@ -1,5 +1,11 @@ module RidgeRegression +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset, one_hot_encode + +include("dataset.jl") -# Write your package code here. 
end From 042d5d60253d7b4e708fbc92a13ba1062df1d71a Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:12:52 -0500 Subject: [PATCH 06/23] dataset.jl small update --- src/dataset.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 18d7b6b..5f1b77c 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,9 +1,3 @@ -using CSV -using DataFrames -using Downloads - -export Dataset, csv_dataset - """ Dataset(name, X, y) @@ -17,8 +11,8 @@ Contains datasets for ridge regression experiments. # Throws - `ArgumentError`: If rows in `X` does not equal length of `y`. -# Notes -Used as the experimental unit for ridge regression experiments. +!!! note + Used as the experimental unit for ridge regression experiments. """ struct Dataset name::String From 70cfa67631c488de2828e9fa2a9e377ab4d2abbe Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:19:22 -0500 Subject: [PATCH 07/23] Updated Experimental Units and Treatments Sections --- docs/src/design.md | 64 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 5c02373..4795866 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -1,32 +1,68 @@ # Motivation and Background -Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. +Many modern science problems involve regression problems with extremely large numbers of predictors. 
Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. +There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. + +The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. # Questions Key Questions: Which ridge regression algorithm is provides the best balance between: --Numerical stability --Computational aspects (GPU/CPU, runtime, etc) --Predicative accuracy + +- Numerical stability +- Computational aspects (GPU/CPU, runtime, etc) # Experimental Units -The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. 
These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. +The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. + +Then + +$$ +X^\top X = V \Sigma^\top \Sigma V^\top += V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . +$$ + +Adding the ridge term gives -In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. 
+$$ +X^\top X + \lambda I += +V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . +$$ +$$ +\kappa_2(X^\top X+\lambda I) += +\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. +$$ + +When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. + +Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. 
| Blocking System | Factor | Blocks | |:----------------|:-------|:-------| | Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| +| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| # Treatments +(How many treatments should we expect) +(What treatments correspond to which blocking system) + The treatments are the ridge regression solution methods: --Gradient descent --Stochastic gradient descent --Closed-form solutions + +- Gradient-based optimization +- Stochastic gradient descent +- Direct Methods + +For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. If there are b possible combinations of the levels of each blocking factor + +The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. # Observational Units and Measurements +Explanation needed +(How many rows/columns to expect) +(Explain measurements: Floating point, real, etc. Meaning of values, what values it can take.) The observational units are each algorithm-dataset pair. 
For each combination we will observe the following | Measurement System | Factor | Measurements | |:--------------------------|:--------------------------|:-------------| -| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | -| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) | Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | -| Numerical Stability | Solution accuracy | Perturbation sensitivity | -| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file +| Numerical Stability | Solution accuracy | Perturbation sensitivity | \ No newline at end of file From 9b8c48ce777462643bc71f9b849b7b8fc1ca78ac Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:38:17 -0500 Subject: [PATCH 08/23] Small changes --- src/dataset.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 5f1b77c..4e20c89 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -28,20 +28,19 @@ struct Dataset end """ - one_hot_encode(dataset::Dataset; drop_first=true) + one_hot_encode(Xdf::DataFrame; drop_first=true) -One-hot encode categorical (string-like) features in `dataset.X`. +One-hot encode categorical (string-like) features in `Xdf`. # Arguments -- `dataset::Dataset`: Input dataset containing feature matrix `X` - and response vector `y`. +- `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. # Keyword Arguments - `drop_first::Bool=true`: If `true`, drop the first dummy column for each categorical feature to avoid multicollinearity. # Returns -A new `Dataset` with numeric `X` and unchanged `y`. +- `Matrix{Float64}`: A numeric matrix containing the encoded feature. """ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} n = nrow(Xdf) @@ -91,7 +90,7 @@ Load a dataset from a CSV file or URL. - `name::String="csv_dataset"`: Dataset name. 
# Returns -A `Dataset`. +- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. """ function csv_dataset(path_or_url::String; target_col, From 6c923abdc5770c213f8cfe223dbc94682a301f9f Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 17 Mar 2026 10:34:42 -0500 Subject: [PATCH 09/23] Bidiagonalization Stuff --- src/bidiagonalization.jl | 146 ++++++++++++++++++++++++++++++++ test/bidiagonalization_tests.jl | 137 ++++++++++++++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 src/bidiagonalization.jl create mode 100644 test/bidiagonalization_tests.jl diff --git a/src/bidiagonalization.jl b/src/bidiagonalization.jl new file mode 100644 index 0000000..5405e7e --- /dev/null +++ b/src/bidiagonalization.jl @@ -0,0 +1,146 @@ +""" + compute_givens(a, b) + +Compute Givens rotation coefficients for scalars `a` and `b`. + +# Arguments +- `a::Number`: First scalar +- `b::Number`: Second scalar + +# Returns +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +""" +function compute_givens(a::Number, b::Number) # Compute Givens rotation coefficients for scalars a and b + if b == 0 + return one(a), zero(a) + elseif a == 0 + throw(ArgumentError("a is zero, cannot compute Givens rotation")) + else + r = hypot(a, b) + c = a / r + s = b / r + return c, s + end +end + +""" + rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to rows `i` and `j` of matrix `M`. 
+ +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First row index +- `j::Int`: Second row index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,2) # Loop over columns + temp = M[i,k] # Store the original value of M[i,k] before modification + M[i,k] = c*temp + s*M[j,k] + M[j,k] = -conj(s)*temp + c*M[j,k] #Apply the Givens rotation to the elements in rows i and j + end + return M +end + + +""" + rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to columns `i` and `j` of matrix `M`. + +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First column index +- `j::Int`: Second column index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,1) # Loop over rows + temp = M[k,i] # Store the original value of M[k,i] before modification + M[k,i] = c*temp + s*M[k,j] + M[k,j] = -conj(s)*temp + c*M[k,j] # Apply the Givens rotation to the elements in columns i and j + end + return M +end +""" + bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) + +Performs bidiagonalization of a matrix A using a sequence of Givens transformations while explicitly accumulating +the orthogonal left factor `H` and right factor `K` such that + + H' * A * K = B. 
+
+# Arguments
+- `A::AbstractMatrix`: The matrix to be bidiagonalized
+- `L::AbstractMatrix`: The banded matrix to be updated in-place
+- `b::AbstractVector`: The vector to be transformed
+
+# Returns
+- `B::AbstractMatrix`: The bidiagonalized form of the input matrix A with dimension (m,n)
+- `C::AbstractMatrix`: The matrix resulting from the sequence of Givens transformations
+- `H::AbstractMatrix`: The orthogonal left factor
+- `K::AbstractMatrix`: The orthogonal right factor
+- `Ht::AbstractMatrix`: The transpose of the orthogonal left factor
+- `b::AbstractVector`: The vector resulting from applying the Givens transformations to the input vector b
+
+"""
+
+function bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector)
+    m, n = size(A)
+
+    B = copy(A) #Will be transformed into bidiagonal form
+    C = copy(L)
+    bhat = copy(b) #Will receive same left transformations applied to A
+
+    Ht = Matrix{eltype(A)}(I, m, m) #Ht will accumulate the left transformations, initialized as identity
+    K = Matrix{eltype(A)}(I, n, n) #K will accumulate the right transformations, initialized as identity
+
+    imax = min(m, n)
+
+    for i in 1:imax
+        # Left Givens rotations: zero below diagonal in column i
+        for j in m:-1:(i + 1)
+            if B[j, i] != 0
+                c, s = compute_givens(B[i, i], B[j, i]) #Build Givens rotation to zero B[j, i]
+                rotate_rows!(B, i, j, c, s) #Apply the Givens rotation to rows i and j of B
+                rotate_rows!(Ht, i, j, c, s) #Accumulate the left transformations in Ht
+
+                temp = bhat[i]
+                bhat[i] = c*temp + s*bhat[j]
+                bhat[j] = -conj(s)*temp + c*bhat[j] #Applying same left rotation to bhat, bhat tracks how b is transformed by the left transformations.
+ + B[j, i] = zero(eltype(B)) + end + end + + # Right Givens rotations: zero entries right of superdiagonal + if i <= n - 2 + for k in n:-1:(i + 2) + if B[i, k] != 0 + c, s = compute_givens(B[i, i + 1], B[i, k]) #Build Givens rotation to zero B[i, k] + #s = -s + rotate_cols!(B, i + 1, k, c, s) #Apply the Givens rotation to columns i+1 and k of B + rotate_cols!(C, i + 1, k, c, s) #Apply the same right rotation to C, since C is updated by the right transformations + rotate_cols!(K, i + 1, k, c, s) #Accumulate the right transformations in K + B[i, k] = zero(eltype(B)) + end + end + end + end + + H = transpose(Ht) + return B, C, H, K, Ht, bhat +end diff --git a/test/bidiagonalization_tests.jl b/test/bidiagonalization_tests.jl new file mode 100644 index 0000000..5fa22e8 --- /dev/null +++ b/test/bidiagonalization_tests.jl @@ -0,0 +1,137 @@ +using Test +using LinearAlgebra +using RidgeRegression + +@testset "compute_givens" begin + c, s = compute_givens(3.0, 0.0) + @test c == 1.0 + @test s == 0.0 + a = 3.0 + b = 4.0 + c, s = compute_givens(a, b) + v1 = c*a + s*b + v2 = -s*a + c*b + @test isapprox(v2, 0.0; atol=1e-12, rtol=0) + @test isapprox(abs(v1), hypot(a, b); atol=1e-12, rtol=0) + @test_throws ArgumentError compute_givens(0.0, 2.0) +end +@testset "rotate_rows!" begin + M = [1.0 2.0; + 3.0 4.0] + + c, s = compute_givens(1.0, 3.0) + rotate_rows!(M, 1, 2, c, s) + + @test isapprox(M[2, 1], 0.0; atol=1e-12, rtol=0) +end + +@testset "rotate_cols!" 
begin + M = [3.0 4.0; + 1.0 2.0] + + c, s = compute_givens(3.0, 4.0) + rotate_cols!(M, 1, 2, c, s) + + @test isapprox(M[1, 2], 0.0; atol=1e-12, rtol=0) +end +@testset "bidiagonalize_with_H basic properties" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 10.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + # Ht should be the transpose of H + @test Ht ≈ H' + + # Orthogonality of H and K + @test H' * H ≈ Matrix{Float64}(I, 3, 3) + @test H * H' ≈ Matrix{Float64}(I, 3, 3) + @test K' * K ≈ Matrix{Float64}(I, 3, 3) + @test K * K' ≈ Matrix{Float64}(I, 3, 3) + + # Main factorization identity + @test H' * A * K ≈ B + + @test H' * b ≈ bhat + + # Applies right transforms to L, implicitly assuming J = I + @test L * K ≈ C +end + +@testset "bidiagonalize_with_H produces upper bidiagonal matrix" begin + A = [2.0 1.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, -1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(B) + + for i in 1:m + for j in 1:n + # In an upper bidiagonal matrix, only diagonal and superdiagonal may be nonzero + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + +@testset "bidiagonalize_with_H rectangular tall matrix" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0; + 1.0 0.0 1.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0, 4.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(A) + + @test size(B) == (m, n) + @test size(C) == size(L) + @test size(H) == (m, m) + @test size(K) == (n, n) + @test size(Ht) == (m, m) + @test length(bhat) == length(b) + + @test H' * H ≈ Matrix{Float64}(I, m, m) + @test K' * K ≈ Matrix{Float64}(I, n, n) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + for i in 1:m + for j in 1:n + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + 
+@testset "bidiagonalize_with_H identity-like simple case" begin + A = [3.0 0.0; + 4.0 5.0] + + L = Matrix{Float64}(I, 2, 2) + b = [1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + @test isapprox(B[2,1], 0.0; atol=1e-12, rtol=0) +end From 6727a9006ede2a5ee3f95fa40371c9745974acee Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 17 Mar 2026 11:00:18 -0500 Subject: [PATCH 10/23] Adding Linear Algebra to Project.toml --- test/Project.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Project.toml b/test/Project.toml index 0c36332..73141b0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,2 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" \ No newline at end of file From 358e88189de5f076f6b2a99430143f997175fae5 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 17 Mar 2026 11:32:20 -0500 Subject: [PATCH 11/23] Design Branch --- docs/src/design.md | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 4795866..78e1244 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -11,7 +11,7 @@ Which ridge regression algorithm is provides the best balance between: - Numerical stability - Computational aspects (GPU/CPU, runtime, etc) # Experimental Units -The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. 
Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. +The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. 
@@ -42,11 +42,9 @@ Another blocking factor that will be considered is how sparse or dense the matri | Blocking System | Factor | Blocks | |:----------------|:-------|:-------| | Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| -| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| +| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (determined by comparing 𝜆 to the magnitude of the singular values of $$X^\top X$$ estimated via the Frobenius norm and SVD)| +| Matrix Sparsity| Density of non-zero values in X | Sparse (< 10% non-zero), Moderate (10%-50% non-zero), Dense (> 50% non-zero)| # Treatments -(How many treatments should we expect) -(What treatments correspond to which blocking system) The treatments are the ridge regression solution methods: @@ -54,15 +52,20 @@ The treatments are the ridge regression solution methods: - Stochastic gradient descent - Direct Methods -For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. If there are b possible combinations of the levels of each blocking factor +For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. + +Blocks are defined by combinations of the experimental blocking factors, including dimensional regime, matrix sparsity, and ridge penalty magnitude. 
Each block represents datasets with similar structural properties. Within each block, multiple datasets will be generated, and each dataset forms an experimental unit. For every experimental unit all treatments are applied. + + The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. We will also denote r to be the number of replicated datasets in each block. Here, we mean the number datasets within a block. The total number of experimental units is then ${b * r}$. Since each experimental unit will recieves all t treatments, the total number of algorithm runs in the experiment is ${t * b * r}$. For this experiment, ${t=3}$. -The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. # Observational Units and Measurements -Explanation needed -(How many rows/columns to expect) -(Explain measurements: Floating point, real, etc. Meaning of values, what values it can take.) -The observational units are each algorithm-dataset pair. For each combination we will observe the following -| Measurement System | Factor | Measurements | -|:--------------------------|:--------------------------|:-------------| -| Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | -| Numerical Stability | Solution accuracy | Perturbation sensitivity | \ No newline at end of file +\The observational units are each algorithm-dataset pairs. 
For each combination we will observe the following + +| Measurement System | Factor | Measurement | Data Type | Description | +| :------------------------ | :---------------- | :------------------------ | :------------- | :--------------------------------------------------------------------------------------------------- | +| Computational Performance | Efficiency | Runtime (seconds) | Floating-point | Time required for the algorithm to compute the ridge regression solution. | +| Computational Performance | Efficiency | Iterations to convergence | Integer | Number of iterations required | +| Numerical Stability | Residuals | Residual Norm | Floating-point | Unsure how to quantify (Norm of the Error?), measurement of how well solution satisfies the regression problem | +| Numerical Stability | Robustness | Perturbation sensitivity | Floating-point | Change in the solution under small perturbations to the input data. | + +The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms * Datasets}$ number of rows and each row will contain roughly 6-8 columns. \ No newline at end of file From 2ad65a9cd302d1d2d5ec9f8d7b0c8e759686ce86 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Thu, 19 Mar 2026 12:01:00 -0500 Subject: [PATCH 12/23] Updated design 3/19 --- docs/src/design.md | 63 +++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 78e1244..b03c9d0 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -1,47 +1,55 @@ # Motivation and Background -Many modern science problems involve regression problems with extremely large numbers of predictors. 
Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. +Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabilize the solution. -There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. +There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational costs and numerical stability. The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. 
To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. # Questions Key Questions: +(adding the above) Which ridge regression algorithm is provides the best balance between: - Numerical stability -- Computational aspects (GPU/CPU, runtime, etc) +- Computational costs +- # Experimental Units -The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. +The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Owing to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. + +Blocks are defined by combinations of the experimental blocking factors, including dimensional regime, matrix sparsity, and ridge penalty magnitude. Each block represents datasets with similar structural properties. 
Within each block, multiple datasets will be generated, and each dataset forms an experimental unit. For every experimental unit all treatments are applied. + +Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. Then - -$$ +```math X^\top X = V \Sigma^\top \Sigma V^\top = V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . -$$ +``` Adding the ridge term gives -$$ +```math X^\top X + \lambda I = V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . -$$ -$$ +``` + +```math \kappa_2(X^\top X+\lambda I) = \frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. -$$ +``` When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. 
Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency.

+The total number of block combinations is determined by the product of the number of levels in each blocking factor, denoted b. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. We will also denote r to be the number of replicated datasets in each block. Here, we mean the number of datasets within a block. The total number of experimental units is then ${b * r}$.
+
| Blocking System | Factor | Blocks |
|:----------------|:-------|:-------|
-| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$|
+| Dataset | Dimensional regime (`(p/n)`) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$|
| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (determined by comparing 𝜆 to the magnitude of the singular values of $$X^\top X$$ estimated via the Frobenius norm and SVD)|
| Matrix Sparsity| Density of non-zero values in X | Sparse (< 10% non-zero), Moderate (10%-50% non-zero), Dense (> 50% non-zero)|
# Treatments
@@ -51,21 +59,24 @@ The treatments are the ridge regression solution methods:
- Gradient-based optimization
- Stochastic gradient descent
- Direct Methods
+ - Bidiagonalization
+
+ Since each experimental unit will receive all t treatments, the total number of algorithm runs in the experiment is ${t * b * r}$. For this experiment, ${t=3}$. To ensure fair comparison between algorithms, each treatment will be applied under a fixed time constraint. Each algorithm will be run for a maximum of two hours per experimental unit.
+# Observational Units and Measurements
-For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias.
-
-Blocks are defined by combinations of the experimental blocking factors, including dimensional regime, matrix sparsity, and ridge penalty magnitude. Each block represents datasets with similar structural properties. Within each block, multiple datasets will be generated, and each dataset forms an experimental unit. For every experimental unit all treatments are applied.
-
- The total number of block combinations is determined by the product of the number of levels in each blocking factor.
For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. We will also denote r to be the number of replicated datasets in each block. Here, we mean the number datasets within a block. The total number of experimental units is then ${b * r}$. Since each experimental unit will recieves all t treatments, the total number of algorithm runs in the experiment is ${t * b * r}$. For this experiment, ${t=3}$. +The observational units are each algorithm-dataset pair. For each combination we will observe the following -# Observational Units and Measurements -\The observational units are each algorithm-dataset pairs. For each combination we will observe the following +| Column Name | Data Type | Description | +|:---|:---|:---| +| `dataset_id` | Integer | Identifier for the generated dataset (experimental unit). | +| `dimensional_regime` | String | Relationship between predictors and observations: `p << n`, `p ≈ n`, or `p >> n`. | +| `sparsity_level` | String | Density of the matrix `X`: `Sparse`, `Moderate`, or `Dense`. | +| `lambda_level` | String | Relative magnitude of the ridge penalty parameter `λ`: `Small` or `Large`. | +| `algorithm` | String | Ridge regression solution method used: `GradientDescent`, `SGD`, or `DirectMethod`. | +| `runtime_seconds` | Positive Floating-point | Time required for the algorithm to compute a solution. | +| `iterations` | Positive Integer | Number of iterations performed by the algorithm (`NA` for direct methods). | +| `residual_norm` | Positive Floating-point | Norm, measuring how well the solution fits the regression problem. 
| -| Measurement System | Factor | Measurement | Data Type | Description | -| :------------------------ | :---------------- | :------------------------ | :------------- | :--------------------------------------------------------------------------------------------------- | -| Computational Performance | Efficiency | Runtime (seconds) | Floating-point | Time required for the algorithm to compute the ridge regression solution. | -| Computational Performance | Efficiency | Iterations to convergence | Integer | Number of iterations required | -| Numerical Stability | Residuals | Residual Norm | Floating-point | Unsure how to quantify (Norm of the Error?), measurement of how well solution satisfies the regression problem | -| Numerical Stability | Robustness | Perturbation sensitivity | Floating-point | Change in the solution under small perturbations to the input data. | +The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms * Datasets}$ number of rows and each row will contain exactly eight columns. -The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms * Datasets}$ number of rows and each row will contain roughly 6-8 columns. 
\ No newline at end of file +We also plan to observe the residual norm and the \ No newline at end of file From fc2595f6940a442ea8abc3ad5c79b50a8b48909e Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Thu, 19 Mar 2026 12:06:55 -0500 Subject: [PATCH 13/23] 3/19 edits --- docs/src/design.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index b03c9d0..4618cd1 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -11,7 +11,7 @@ Which ridge regression algorithm is provides the best balance between: - Numerical stability - Computational costs -- + # Experimental Units The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Owing to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. @@ -75,7 +75,7 @@ The observational units are each algorithm-dataset pair. For each combination we | `algorithm` | String | Ridge regression solution method used: `GradientDescent`, `SGD`, or `DirectMethod`. | | `runtime_seconds` | Positive Floating-point | Time required for the algorithm to compute a solution. | | `iterations` | Positive Integer | Number of iterations performed by the algorithm (`NA` for direct methods). | -| `residual_norm` | Positive Floating-point | Norm, measuring how well the solution fits the regression problem. | +| `beta_error`| Positive Floating-point| Norm, measuring the difference between the estimated regression coefficients and the true coefficients The collected measurements will be written to a CSV file. 
Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms * Datasets}$ number of rows and each row will contain exactly eight columns. From 788665ce17f10104721b0538c4cfa5d3f13c31f9 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 08:49:29 -0500 Subject: [PATCH 14/23] Update experimental design document --- docs/src/design.md | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 4618cd1..b02e497 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -1,19 +1,36 @@ # Motivation and Background -Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabilize the solution. +Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. 
Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabilize the solution.
+Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because of noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator.
+
+Mathematically, ridge regression estimates the regression coefficients by solving the penalized least squares problem
+$$
+\hat{\boldsymbol{\beta}} =
+\arg\min_{\boldsymbol{\beta}}
+\left(
+\| \mathbf{y} - X\boldsymbol{\beta} \|^2
++
+\lambda \| \boldsymbol{\beta} \|^2
+\right)
+$$
+where $\lambda > 0$ is a regularization parameter that controls the strength of the penalty.
+
+The purpose of ridge regression is to stabilize regression estimates where the predictors are highly correlated or the design matrix $X$ is almost singular. Ridge regression shrinks the estimated coefficient vector in a way such that the coefficient estimates minimize the sum of squared residuals subject to a constraint on the $\ell_2$ norm of the coefficient vector, $\|\boldsymbol{\beta}\|^2 \leq t$, which shrinks the least squares estimates toward the origin. This reduces the variance of the coefficient estimates and mitigates the effects of multicollinearity.

There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational costs and numerical stability.

-The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem.
To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ may be constructed with varying dimensions, sparsity patterns, and conditioning properties. # Questions -Key Questions: -(adding the above) -Which ridge regression algorithm is provides the best balance between: +The primary goal of this experiment is to compare numerical algorithms for computing ridge regression estimates under various conditions. In particular, we aim to address the following questions: -- Numerical stability -- Computational costs +1. How does algorithm performance vary across regression problems with different dimensional regimes? + +2. How does the sparsity structure of the design matrix $X$ influence computational efficiency and convergence behavior of different algorithms? + +3. How does the magnitude of the ridge penalty parameter $\lambda$, which affects the conditioning of $X^\top X + \lambda I$, influence numerical stability and runtime across algorithms? + +4. Which algorithm provides the best tradeoff between numerical stability and computational cost across these varying problem regimes? # Experimental Units -The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Owing to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. +The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. 
This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. Owing to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used.
+The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a fixed regularization parameter ${\lambda}$.

Blocks are defined by combinations of the experimental blocking factors, including dimensional regime, matrix sparsity, and ridge penalty magnitude. Each block represents datasets with similar structural properties. Within each block, multiple datasets will be generated, and each dataset forms an experimental unit. For every experimental unit all treatments are applied.
@@ -41,7 +58,7 @@
V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top .
\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}.
```
-When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings.
+Because the performance of numerical algorithms is strongly influenced by the conditioning of the system they solve, the ridge penalty effectively creates regression problems with different numerical difficulty. This provides a way to assess how algorithm performance, convergence behavior, and computational cost depend on the numerical stability of the problem. In this experiment, the magnitude of $\lambda$ is selected relative to the smallest and largest singular values of $X$.
A weak regularization regime corresponds to $\lambda \approx \sigma_{\min}^2$, where the ridge penalty begins to influence the smallest singular directions but the system remains moderately ill-conditioned. A moderate regularization regime corresponds to $\lambda \approx \sigma_{\min}\sigma_{\max}$, which substantially improves the conditioning of the problem by increasing the smallest eigenvalues of $X^\top X + \lambda I$. Finally, a strong regularization regime corresponds to $\lambda \approx \sigma_{\max}^2$, where the ridge penalty dominates the spectral scale of the problem and produces a well-conditioned system. Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. 
@@ -49,9 +66,9 @@ The total number of block combinations is determined by the product of the numbe | Blocking System | Factor | Blocks | |:----------------|:-------|:-------| -| Dataset | Dimensional regime (`(p/n)`) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (determined by comparing 𝜆 to the magnitude of the singular values of $$X^\top X$$ estimated via the Frobenius norm and SVD)| -| Matrix Sparsity| Density of non-zero values in X | Sparse (< 10% non-zero), Moderate (10%-50% non-zero), Dense (> 50% non-zero)| +| Dataset | Dimensional regime | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| +| Ridge Penalty | Magnitude of ${\lambda}$ relative to the spectral scale of $X^\top X$ | Weak ($\lambda \approx \sigma_{\min}^2$), Moderate ($\lambda \approx \sigma_{\min}\sigma_{\max}$), Strong ($\lambda \approx \sigma_{\max}^2$), where $\sigma_{\min}$ and $\sigma_{\max}$ denote the smallest and largest singular values of $X$. | +| Matrix Sparsity| Density of non-zero values in $X$ | Sparse (< 10% non-zero), Moderate (10%-50% non-zero), Dense (> 50% non-zero)| # Treatments The treatments are the ridge regression solution methods: @@ -59,7 +76,7 @@ The treatments are the ridge regression solution methods: - Gradient-based optimization - Stochastic gradient descent - Direct Methods - - Bidiagonalization + - Golub Kahan Bidiagonalization Since each experimental unit will receive all t treatments, the total number of algorithm runs in the experiment is ${t * b * r}$. For this experiment, ${t=3}$. To ensure fair comparison between algorithms, each treatment will be applied under a fixed time constraint. Each algorithm will be run for a maximum of two hours per experimental unit. # Observational Units and Measurements @@ -68,15 +85,13 @@ The observational units are each algorithm-dataset pair. 
For each combination we | Column Name | Data Type | Description | |:---|:---|:---| -| `dataset_id` | Integer | Identifier for the generated dataset (experimental unit). | +| `dataset_id` | Positive Integer | Identifier for the generated dataset (experimental unit). | | `dimensional_regime` | String | Relationship between predictors and observations: `p << n`, `p ≈ n`, or `p >> n`. | | `sparsity_level` | String | Density of the matrix `X`: `Sparse`, `Moderate`, or `Dense`. | -| `lambda_level` | String | Relative magnitude of the ridge penalty parameter `λ`: `Small` or `Large`. | +| `lambda_level` | String | Relative magnitude of the ridge penalty parameter `λ`: `Weak`, `Moderate`, or `Strong`. | | `algorithm` | String | Ridge regression solution method used: `GradientDescent`, `SGD`, or `DirectMethod`. | | `runtime_seconds` | Positive Floating-point | Time required for the algorithm to compute a solution. | | `iterations` | Positive Integer | Number of iterations performed by the algorithm (`NA` for direct methods). | -| `beta_error`| Positive Floating-point| Norm, measuring the difference between the estimated regression coefficients and the true coefficients -The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. After the experiment, the resulting CSV file should contain ${Algorithms * Datasets}$ number of rows and each row will contain exactly eight columns. -We also plan to observe the residual norm and the \ No newline at end of file +The collected measurements will be written to a CSV file. Each row in the file corresponds to a single algorithm–dataset pair, which forms the observational unit of the experiment. The columns represent the recorded measurements. 
After the experiment, the resulting CSV file should contain ${Algorithms∗Datasets}$ number of rows and each row will contain exactly seven columns. \ No newline at end of file From 698aff72cc49a78c6372986be99db02ead13e3cd Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 08:52:39 -0500 Subject: [PATCH 15/23] Remove code files from design branch --- src/RidgeRegression.jl | 11 --- src/bidiagonalization.jl | 146 -------------------------------- src/dataset.jl | 119 -------------------------- test/bidiagonalization_tests.jl | 137 ------------------------------ test/dataset_tests.jl | 66 --------------- test/runtests.jl | 6 -- 6 files changed, 485 deletions(-) delete mode 100644 src/RidgeRegression.jl delete mode 100644 src/bidiagonalization.jl delete mode 100644 src/dataset.jl delete mode 100644 test/bidiagonalization_tests.jl delete mode 100644 test/dataset_tests.jl delete mode 100644 test/runtests.jl diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl deleted file mode 100644 index 5ef574e..0000000 --- a/src/RidgeRegression.jl +++ /dev/null @@ -1,11 +0,0 @@ -module RidgeRegression -using CSV -using DataFrames -using Downloads - -export Dataset, csv_dataset, one_hot_encode - -include("dataset.jl") - - -end diff --git a/src/bidiagonalization.jl b/src/bidiagonalization.jl deleted file mode 100644 index 5405e7e..0000000 --- a/src/bidiagonalization.jl +++ /dev/null @@ -1,146 +0,0 @@ -""" - compute_givens(a, b) - -Compute Givens rotation coefficients for scalars `a` and `b`. 
- -# Arguments -- `a::Number`: First scalar -- `b::Number`: Second scalar - -# Returns -- `c::Number`: Cosine coefficient of the Givens rotation -- `s::Number`: Sine coefficient of the Givens rotation - -""" -function compute_givens(a::Number, b::Number) # Compute Givens rotation coefficients for scalars a and b - if b == 0 - return one(a), zero(a) - elseif a == 0 - throw(ArgumentError("a is zero, cannot compute Givens rotation")) - else - r = hypot(a, b) - c = a / r - s = b / r - return c, s - end -end - -""" - rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) - -Apply a Givens rotation to rows `i` and `j` of matrix `M`. - -# Arguments -- `M::AbstractMatrix`: The matrix to be rotated -- `i::Int`: First row index -- `j::Int`: Second row index -- `c::Number`: Cosine coefficient of the Givens rotation -- `s::Number`: Sine coefficient of the Givens rotation - -# Returns -- `M::AbstractMatrix`: The rotated matrix - -""" -function rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) - for k in 1:size(M,2) # Loop over columns - temp = M[i,k] # Store the original value of M[i,k] before modification - M[i,k] = c*temp + s*M[j,k] - M[j,k] = -conj(s)*temp + c*M[j,k] #Apply the Givens rotation to the elements in rows i and j - end - return M -end - - -""" - rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) - -Apply a Givens rotation to columns `i` and `j` of matrix `M`. 
- -# Arguments -- `M::AbstractMatrix`: The matrix to be rotated -- `i::Int`: First column index -- `j::Int`: Second column index -- `c::Number`: Cosine coefficient of the Givens rotation -- `s::Number`: Sine coefficient of the Givens rotation - -# Returns -- `M::AbstractMatrix`: The rotated matrix - -""" -function rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) - for k in 1:size(M,1) # Loop over rows - temp = M[k,i] # Store the original value of M[k,i] before modification - M[k,i] = c*temp + s*M[k,j] - M[k,j] = -conj(s)*temp + c*M[k,j] # Apply the Givens rotation to the elements in columns i and j - end - return M -end -""" - bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) - -Performs bidiagonalization of a matrix A using a sequence of Givens transformations while explicitly accumulating -the orthogonal left factor `H` and right factor `K` such that - - H' * A * K = B. - -# Arguments -- `A::AbstractMatrix`: The matrix to be bidiagonalized -- `L::AbstractMatrix`: The banded matrix to be updated in-place -- `b::AbstractVector`: The vector to be transformed - -# Returns -- `B::AbstractMatrix`: The bidiagonalized form of the input matrix A with dimension (n,n) -- `C::AbstractMatrix`: The matrix resulting from the sequence of Givens transformations -- `H::AbstractMatrix`: The orthogonal left factor -- `K::AbstractMatrix`: The orthogonal right factor -- `Ht::AbstractMatrix`: The transpose of the orthogonal left factor -- `b::AbstractVector`: The vector resulting from applying the Givens transformations to the input vector b - -""" - -function bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) - m, n = size(A) - - B = copy(A) #Will be transformed into bidiagonal form - C = copy(L) - bhat = copy(b) #Will recieve same left transformations applied to A - - Ht = Matrix{eltype(A)}(I, m, m) #Ht will accumulate the left transformations, initialized as identity - K = Matrix{eltype(A)}(I, n, n) #K will 
accumulate the right transformations, initialized as identity - - imax = min(m, n) - - for i in 1:imax - # Left Givens rotations: zero below diagonal in column i - for j in m:-1:(i + 1) - if B[j, i] != 0 - c, s = compute_givens(B[i, i], B[j, i]) #Build Givens rotation to zero B[j, i] - rotate_rows!(B, i, j, c, s) #Apply the Givens rotation to rows i and j of B - rotate_rows!(Ht, i, j, c, s) #Accumulate the left transformations in Ht - - temp = bhat[i] - bhat[i] = c*temp + s*bhat[j] - bhat[j] = -conj(s)*temp + c*bhat[j] #Applying same left rotation to bhat, bhat tracks how b is transformed by the left transformations. - - B[j, i] = zero(eltype(B)) - end - end - - # Right Givens rotations: zero entries right of superdiagonal - if i <= n - 2 - for k in n:-1:(i + 2) - if B[i, k] != 0 - c, s = compute_givens(B[i, i + 1], B[i, k]) #Build Givens rotation to zero B[i, k] - #s = -s - rotate_cols!(B, i + 1, k, c, s) #Apply the Givens rotation to columns i+1 and k of B - rotate_cols!(C, i + 1, k, c, s) #Apply the same right rotation to C, since C is updated by the right transformations - rotate_cols!(K, i + 1, k, c, s) #Accumulate the right transformations in K - B[i, k] = zero(eltype(B)) - end - end - end - end - - H = transpose(Ht) - return B, C, H, K, Ht, bhat -end diff --git a/src/dataset.jl b/src/dataset.jl deleted file mode 100644 index 4e20c89..0000000 --- a/src/dataset.jl +++ /dev/null @@ -1,119 +0,0 @@ -""" - Dataset(name, X, y) - -Contains datasets for ridge regression experiments. - -# Fields -- `name::String`: Name of dataset -- `X::Matrix{Float64}`: Matrix of variables/features -- `y::Vector{Float64}`: Target vector - -# Throws -- `ArgumentError`: If rows in `X` does not equal length of `y`. - -!!! note - Used as the experimental unit for ridge regression experiments. 
-""" -struct Dataset - name::String - X::Matrix{Float64} - y::Vector{Float64} - - function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) - size(X, 1) == length(y) || - throw(ArgumentError("X and y must have same number of rows")) - - new(name, Matrix{Float64}(X), Vector{Float64}(y)) - end -end - -""" - one_hot_encode(Xdf::DataFrame; drop_first=true) - -One-hot encode categorical (string-like) features in `Xdf`. - -# Arguments -- `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. - -# Keyword Arguments -- `drop_first::Bool=true`: If `true`, drop the first dummy column for - each categorical feature to avoid multicollinearity. - -# Returns -- `Matrix{Float64}`: A numeric matrix containing the encoded feature. -""" -function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} - n = nrow(Xdf) - cols = Vector{Vector{Float64}}() - - for name in names(Xdf) #Selecting columns that aren't the target variable and pushing them to the columns. - col = Xdf[!, name] - if eltype(col) <: Real - push!(cols, Float64.(col)) - continue - end - - scol = string.(col) # Convert to string for categorical processing. - lv = unique(scol) #Get unique category levels. - ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. - #Permutedims is used to align the dimensions for broadcasting. - #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. - - if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. - ind = ind[:, 2:end] - end - - for j in 1:size(ind, 2) - push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. 
- end - end - - p = length(cols) - X = Matrix{Float64}(undef, n, p) - for j in 1:p - X[:, j] = cols[j] - end - - return Matrix{Float64}(X) - -end -""" - csv_dataset(path_or_url; target_col, name="csv_dataset") - -Load a dataset from a CSV file or URL. - -# Arguments -- `path_or_url::String`: Local file path or web URL containing CSV data. - -# Keyword Arguments -- `target_col`: Column index or column name containing the response variable. -- `name::String="csv_dataset"`: Dataset name. - -# Returns -- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. -""" -function csv_dataset(path_or_url::String; - target_col, - name::String = "csv_dataset" -) - - filepath = - startswith(path_or_url, "http") ? - Downloads.download(path_or_url) : - path_or_url - - df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. - df = dropmissing(df) #Remove rows with missing values. - Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. - - y = target_col isa Int ? - df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. - df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. 
- - - X = one_hot_encode(Xdf; drop_first = true) - - - - return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) -end diff --git a/test/bidiagonalization_tests.jl b/test/bidiagonalization_tests.jl deleted file mode 100644 index 5fa22e8..0000000 --- a/test/bidiagonalization_tests.jl +++ /dev/null @@ -1,137 +0,0 @@ -using Test -using LinearAlgebra -using RidgeRegression - -@testset "compute_givens" begin - c, s = compute_givens(3.0, 0.0) - @test c == 1.0 - @test s == 0.0 - a = 3.0 - b = 4.0 - c, s = compute_givens(a, b) - v1 = c*a + s*b - v2 = -s*a + c*b - @test isapprox(v2, 0.0; atol=1e-12, rtol=0) - @test isapprox(abs(v1), hypot(a, b); atol=1e-12, rtol=0) - @test_throws ArgumentError compute_givens(0.0, 2.0) -end -@testset "rotate_rows!" begin - M = [1.0 2.0; - 3.0 4.0] - - c, s = compute_givens(1.0, 3.0) - rotate_rows!(M, 1, 2, c, s) - - @test isapprox(M[2, 1], 0.0; atol=1e-12, rtol=0) -end - -@testset "rotate_cols!" begin - M = [3.0 4.0; - 1.0 2.0] - - c, s = compute_givens(3.0, 4.0) - rotate_cols!(M, 1, 2, c, s) - - @test isapprox(M[1, 2], 0.0; atol=1e-12, rtol=0) -end -@testset "bidiagonalize_with_H basic properties" begin - A = [1.0 2.0 3.0; - 4.0 5.0 6.0; - 7.0 8.0 10.0] - - L = Matrix{Float64}(I, 3, 3) - b = [1.0, 2.0, 3.0] - - B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) - - # Ht should be the transpose of H - @test Ht ≈ H' - - # Orthogonality of H and K - @test H' * H ≈ Matrix{Float64}(I, 3, 3) - @test H * H' ≈ Matrix{Float64}(I, 3, 3) - @test K' * K ≈ Matrix{Float64}(I, 3, 3) - @test K * K' ≈ Matrix{Float64}(I, 3, 3) - - # Main factorization identity - @test H' * A * K ≈ B - - @test H' * b ≈ bhat - - # Applies right transforms to L, implicitly assuming J = I - @test L * K ≈ C -end - -@testset "bidiagonalize_with_H produces upper bidiagonal matrix" begin - A = [2.0 1.0 3.0; - 4.0 5.0 6.0; - 7.0 8.0 9.0] - - L = Matrix{Float64}(I, 3, 3) - b = [1.0, -1.0, 2.0] - - B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) - - m, n = size(B) - - for i 
in 1:m - for j in 1:n - # In an upper bidiagonal matrix, only diagonal and superdiagonal may be nonzero - if !(j == i || j == i + 1) - @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) - end - end - end -end - -@testset "bidiagonalize_with_H rectangular tall matrix" begin - A = [1.0 2.0 3.0; - 4.0 5.0 6.0; - 7.0 8.0 9.0; - 1.0 0.0 1.0] - - L = Matrix{Float64}(I, 3, 3) - b = [1.0, 2.0, 3.0, 4.0] - - B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) - - m, n = size(A) - - @test size(B) == (m, n) - @test size(C) == size(L) - @test size(H) == (m, m) - @test size(K) == (n, n) - @test size(Ht) == (m, m) - @test length(bhat) == length(b) - - @test H' * H ≈ Matrix{Float64}(I, m, m) - @test K' * K ≈ Matrix{Float64}(I, n, n) - - @test H' * A * K ≈ B - @test H' * b ≈ bhat - @test L * K ≈ C - - for i in 1:m - for j in 1:n - if !(j == i || j == i + 1) - @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) - end - end - end -end - -@testset "bidiagonalize_with_H identity-like simple case" begin - A = [3.0 0.0; - 4.0 5.0] - - L = Matrix{Float64}(I, 2, 2) - b = [1.0, 2.0] - - B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) - - @test H' * A * K ≈ B - @test H' * b ≈ bhat - @test L * K ≈ C - - @test isapprox(B[2,1], 0.0; atol=1e-12, rtol=0) -end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl deleted file mode 100644 index 5652c96..0000000 --- a/test/dataset_tests.jl +++ /dev/null @@ -1,66 +0,0 @@ -using Test -using DataFrames -using CSV - -using RidgeRegression -@testset "Dataset" begin - X = [1 2; 3 4] - y = [10, 20] - d = Dataset("toy", X, y) - - @test d.name == "toy" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - @test size(d.X) == (2, 2) - @test length(d.y) == 2 - @test d.X[1, 1] == 1.0 - @test d.y[2] == 20.0 - - @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) -end - -@testset "one_hot_encode" begin - df = DataFrame( - A = ["red", "blue", "red", "green"], - B = [1, 2, 3, 4], - C = ["small", "large", "medium", "small"] - ) - - X = 
redirect_stdout(devnull) do - one_hot_encode(df; drop_first = true) - end - - @test size(X) == (4, 5) - @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] - @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) - @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) - @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) -end - -@testset "csv_dataset" begin - tmp = tempname() * ".csv" - df = DataFrame( - a = [1.0, 2.0, missing, 4.0], - b = ["x", "y", "y", "x"], - y = [10.0, 20.0, 30.0, 40.0] - ) - CSV.write(tmp, df) - - d = redirect_stdout(devnull) do - csv_dataset(tmp; target_col=:y, name="tmp") - end - - @test d.name == "tmp" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - - @test length(d.y) == 3 - @test size(d.X, 1) == 3 - @test d.y == [10.0, 20.0, 40.0] - - d2 = redirect_stdout(devnull) do - csv_dataset(tmp; target_col=3, name="tmp2") - end - @test d2.y == [10.0, 20.0, 40.0] - @test size(d2.X, 1) == 3 -end diff --git a/test/runtests.jl b/test/runtests.jl deleted file mode 100644 index dbbe06f..0000000 --- a/test/runtests.jl +++ /dev/null @@ -1,6 +0,0 @@ -using RidgeRegression -using Test - -@testset "RidgeRegression.jl" begin - # Write your tests here. -end From f904351bd75f3867708de4ab05cf89b09596e6f5 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 08:59:25 -0500 Subject: [PATCH 16/23] Compiling issues --- docs/src/design.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index b02e497..a5da56c 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -2,7 +2,7 @@ Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. 
Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator. Mathematically, ridge regression estimates the regression coefficients by solving the penalized least squares problem -$$ +$ \hat{\boldsymbol{\beta}} = \arg\min_{\boldsymbol{\beta}} \left( @@ -10,7 +10,7 @@ $$ + \lambda \| \boldsymbol{\beta} \|^2 \right) -$$ +$ where $\lambda > 0$ is a regularization parameter that controls the strength of the penalty. The purpose of ridge regression is to stabilize regression estimates where the predictors are highly correlated or the design matrix $X$ is almost singular. Ridge regression shrinks the estimated coefficient vector in a way such that the coefficient estimates minimize the sum of squared residuals subject to a constraint on the $\ell_2$ norm of the coefficient vector, $\|\boldsymbol{\beta}\|^2 \leq t$, which shrinks the least squares estimates toward the origin. This reduces the variance of the coefficient estimates and mitigates the effects of multicollinearity. From 49c8a817bcfcd2c8f3dc4d2ef0beefbd67e3d737 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 09:53:08 -0500 Subject: [PATCH 17/23] Attempt 1 --- docs/src/design.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index a5da56c..b9b68e4 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -2,14 +2,14 @@ Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. 
Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator. Mathematically, ridge regression estimates the regression coefficients by solving the penalized least squares problem -$ +${ \hat{\boldsymbol{\beta}} = \arg\min_{\boldsymbol{\beta}} \left( \| \mathbf{y} - X\boldsymbol{\beta} \|^2 + \lambda \| \boldsymbol{\beta} \|^2 -\right) +\right)} $ where $\lambda > 0$ is a regularization parameter that controls the strength of the penalty. @@ -21,13 +21,9 @@ The goal of this experiment is to investigate the performance of these algorithm # Questions The primary goal of this experiment is to compare numerical algorithms for computing ridge regression estimates under various conditions. In particular, we aim to address the following questions: -1. How does algorithm performance vary across regression problems with different dimensional regimes? - -2. How does the sparsity structure of the design matrix $X$ influence computational efficiency and convergence behavior of different algorithms? - -3. How does the magnitude of the ridge penalty parameter $\lambda$, which affects the conditioning of $X^\top X + \lambda I$, influence numerical stability and runtime across algorithms? +1. How does the performance of ridge regression algorithms change as the structural and numerical properties of the regression problem vary? -4. Which algorithm provides the best tradeoff between numerical stability and computational cost across these varying problem regimes? +2. Which ridge regression algorithm provides the best balance between numerical stability and computational cost across these problem regimes? # Experimental Units The experimental units are the datasets under fixed penalty weights. For each experimental unit, all treatments will be applied to the dataset. 
This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$ for some specific ${\lambda}$. From 8529e348bee87e4f723810e8c092732a5f4eb56b Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 10:03:29 -0500 Subject: [PATCH 18/23] Minor adjustment --- docs/src/design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design.md b/docs/src/design.md index b9b68e4..794fece 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -56,7 +56,7 @@ V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . Because the performance of numerical algorithms is strongly influenced by the conditioning of the system they solve, the ridge penalty effectively creates regression problems with different numerical difficulty. This provides a way to assess how algorithm performance, convergence behavior, and computational cost depend on the numerical stability of the problem. In this experiment, the magnitude of $\lambda$ is selected relative to the smallest and largest singular values of $X$. A weak regularization regime corresponds to $\lambda \approx \sigma_{\min}^2$, where the ridge penalty begins to influence the smallest singular directions but the system remains moderately ill-conditioned. A moderate regularization regime corresponds to $\lambda \approx \sigma_{\min}\sigma_{\max}$, which substantially improves the conditioning of the problem by increasing the smallest eigenvalues of $X^\top X + \lambda I$. Finally, a strong regularization regime corresponds to $\lambda \approx \sigma_{\max}^2$, where the ridge penalty dominates the spectral scale of the problem and produces a well-conditioned system. -Another blocking factor that will be considered is how sparse or dense the matrix X is. 
Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. +Another blocking factor that will be considered is how sparse or dense the matrix $X$ is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving $X$ including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas with a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of $X$, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. The total number of block combinations is determined by the product of the number of levels in each blocking factor, denoted b. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. We will also denote r to be the number of replicated datasets in each block. Here, we mean the number of datasets within a block. The total number of experimental units is then ${b * r}$. 
From 3734a4638be84ea7da7097e00b65d433710c96bc Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 11:53:29 -0500 Subject: [PATCH 19/23] restoring .jl files --- docs/RidgeRegression.jl | 5 +++++ test/runtests.jl | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 docs/RidgeRegression.jl create mode 100644 test/runtests.jl diff --git a/docs/RidgeRegression.jl b/docs/RidgeRegression.jl new file mode 100644 index 0000000..c32de91 --- /dev/null +++ b/docs/RidgeRegression.jl @@ -0,0 +1,5 @@ +module RidgeRegression + +# Write your package code here. + +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..dbbe06f --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,6 @@ +using RidgeRegression +using Test + +@testset "RidgeRegression.jl" begin + # Write your tests here. +end From 48edb295588a60db8bcc369285e705445e92782b Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 11:54:18 -0500 Subject: [PATCH 20/23] moving to src --- docs/{ => src}/RidgeRegression.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{ => src}/RidgeRegression.jl (100%) diff --git a/docs/RidgeRegression.jl b/docs/src/RidgeRegression.jl similarity index 100% rename from docs/RidgeRegression.jl rename to docs/src/RidgeRegression.jl From 8c2e5bcf19644783109e4abaa972f7ee99e7109b Mon Sep 17 00:00:00 2001 From: Vivak Patel Date: Tue, 24 Mar 2026 11:59:03 -0500 Subject: [PATCH 21/23] Put source file back into the folder. --- src/RidgeRegression.jl | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 src/RidgeRegression.jl diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl new file mode 100644 index 0000000..c32de91 --- /dev/null +++ b/src/RidgeRegression.jl @@ -0,0 +1,5 @@ +module RidgeRegression + +# Write your package code here. 
+ +end From 8192f16382ad31e33af25224f6cd8c35ae2d7b0a Mon Sep 17 00:00:00 2001 From: Vivak Patel Date: Tue, 24 Mar 2026 12:00:32 -0500 Subject: [PATCH 22/23] Delete docs/src/RidgeRegression.jl --- docs/src/RidgeRegression.jl | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/src/RidgeRegression.jl diff --git a/docs/src/RidgeRegression.jl b/docs/src/RidgeRegression.jl deleted file mode 100644 index c32de91..0000000 --- a/docs/src/RidgeRegression.jl +++ /dev/null @@ -1,5 +0,0 @@ -module RidgeRegression - -# Write your package code here. - -end From dc1059095c388a2124e2afcece8cadac9d57831b Mon Sep 17 00:00:00 2001 From: Vivak Patel Date: Tue, 24 Mar 2026 12:05:27 -0500 Subject: [PATCH 23/23] Fixed math notation --- docs/src/design.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 794fece..987fb4a 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -2,15 +2,15 @@ Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator. Mathematically, ridge regression estimates the regression coefficients by solving the penalized least squares problem -${ +```math \hat{\boldsymbol{\beta}} = \arg\min_{\boldsymbol{\beta}} \left( \| \mathbf{y} - X\boldsymbol{\beta} \|^2 + \lambda \| \boldsymbol{\beta} \|^2 -\right)} -$ +\right) +``` where $\lambda > 0$ is a regularization parameter that controls the strength of the penalty. 
The purpose of ridge regression is to stabilize regression estimates where the predictors are highly correlated or the design matrix $X$ is almost singular. Ridge regression shrinks the estimated coefficient vector in a way such that the coefficient estimates minimize the sum of squared residuals subject to a constraint on the $\ell_2$ norm of the coefficient vector, $\|\boldsymbol{\beta}\|^2 \leq t$, which shrinks the least squares estimates toward the origin. This reduces the variance of the coefficient estimates and mitigates the effects of multicollinearity.