From 7dc29680fc0dee10506ac6389afe2f162af0d7db Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 2 Mar 2026 15:32:40 -0600 Subject: [PATCH 01/14] Add dataset utilities and tests --- src/dataset.jl | 133 ++++++++++++++++++++++++++++++++++++++++++ test/dataset_tests.jl | 0 2 files changed, 133 insertions(+) create mode 100644 src/dataset.jl create mode 100644 test/dataset_tests.jl diff --git a/src/dataset.jl b/src/dataset.jl new file mode 100644 index 0000000..52c0153 --- /dev/null +++ b/src/dataset.jl @@ -0,0 +1,133 @@ + +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset + +""" + Dataset(name, X, y) + +Contains datasets for ridge regression experiments. + +# Fields +- `name::String`: Name of dataset +- `X::Matrix{Float64}`: Matrix of variables/features +- `y::Vector{Float64}`: Target vector + +# Throws +- `ArgumentError`: If rows in `X` does not equal length of `y`. + +# Notes +Used as the experimental unit for ridge regression experiments. +""" +struct Dataset + name::String + X::Matrix{Float64} + y::Vector{Float64} + + function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) + size(X, 1) == length(y) || + throw(ArgumentError("X and y must have same number of rows")) + + new(name, Matrix{Float64}(X), Vector{Float64}(y)) + end +end + +""" + one_hot_encode(dataset::Dataset; drop_first=true) + +One-hot encode categorical (string-like) features in `dataset.X`. + +# Arguments +- `dataset::Dataset`: Input dataset containing feature matrix `X` + and response vector `y`. + +# Keyword Arguments +- `drop_first::Bool=true`: If `true`, drop the first dummy column for + each categorical feature to avoid multicollinearity. + +# Returns +A new `Dataset` with numeric `X` and unchanged `y`. 
+""" +function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} + n = nrow(Xdf) + cols = Vector{Vector{Float64}}() + + for name in names(Xdf) + col = Xdf[!, name] + if eltype(col) <: Real + push!(cols, Float64.(col)) + continue + end + + scol = string.(col) + lv = unique(scol) + ind = scol .== permutedims(lv) + + println("Variable: $name") + for (j, level) in enumerate(lv) + println(" Dummy column (before drop) $j → $name = $level") + end + + if drop_first && size(ind, 2) > 1 + ind = ind[:, 2:end] + end + + for j in 1:size(ind, 2) + push!(cols, Float64.(ind[:, j])) + end + end + + p = length(cols) + X = Matrix{Float64}(undef, n, p) + for j in 1:p + X[:, j] = cols[j] + end + + return Matrix{Float64}(X) + +end +""" + csv_dataset(path_or_url; target_col, name="csv_dataset") + +Load a dataset from a CSV file or URL. + +# Arguments +- `path_or_url::String` + Local file path or web URL that has CSV data. + +- `target_col` + Column index OR column name containing the response variable. + +- `name::String` + Dataset name. + +# Returns +`Dataset` +""" +function csv_dataset(path_or_url::String; + target_col, + name::String = "csv_dataset" +) + + filepath = + startswith(path_or_url, "http") ? + Downloads.download(path_or_url) : + path_or_url + + df = DataFrame(CSV.File(filepath)) + df = dropmissing(df) + Xdf = select(df, DataFrames.Not(target_col)) + + y = target_col isa Int ? 
+ df[:, target_col] : + df[:, Symbol(target_col)] + + + X = one_hot_encode(Xdf; drop_first = true) + + + + return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) +end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl new file mode 100644 index 0000000..e69de29 From 47fc954ddfe6fdf09c3c998929432b782079f760 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:27:21 -0600 Subject: [PATCH 02/14] Adding dataset_tests.jl --- Project.toml | 10 ++++++- docs/make.jl | 1 + docs/src/design.md | 32 +++++++++++++++++++++ src/dataset.jl | 1 - test/dataset_tests.jl | 66 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 docs/src/design.md diff --git a/Project.toml b/Project.toml index 86ea0a5..427e8aa 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,15 @@ name = "RidgeRegression" uuid = "739161c8-60e1-4c49-8f89-ff30998444b1" -authors = ["Vivak Patel "] version = "0.1.0" +authors = ["Eton Tackett ", "Vivak Patel "] + +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [compat] +CSV = "0.10.15" +DataFrames = "1.8.1" +Downloads = "1.7.0" julia = "1.12.4" diff --git a/docs/make.jl b/docs/make.jl index a1097bb..d42cfbe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; ), pages=[ "Home" => "index.md", + "Design" => "design.md", ], ) diff --git a/docs/src/design.md b/docs/src/design.md new file mode 100644 index 0000000..c83592e --- /dev/null +++ b/docs/src/design.md @@ -0,0 +1,32 @@ +# Motivation and Background +Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. 
The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. + +# Questions +Key Questions: +Which ridge regression algorithm is provides the best balance between: +-Numerical stability +-Computational aspects (GPU/CPU, runtime, etc) +-Predicative accuracy +# Experimental Units +The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. + +| Blocking System | Factor | Blocks | +|:----------------|:-------|:-------| +| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +# Treatments +The treatments are the ridge regression solution methods: +-Gradient descent +-Stochastic gradient descent +-Closed-form solutions +# Observational Units and Measurements +The observational units are each algorithm-dataset pair. 
For each combination we will observe the following +| Measurement System | Factor | Measurements | +|:--------------------------|:--------------------------|:-------------| +| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | +| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) +| Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | +| Numerical Stability | Solution accuracy | Perturbation sensitivity | +| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file diff --git a/src/dataset.jl b/src/dataset.jl index 52c0153..d135557 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,4 +1,3 @@ - using CSV using DataFrames using Downloads diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index e69de29..cefa8ce 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -0,0 +1,66 @@ +using Test +using DataFrames +using CSV + +include("../src/dataset.jl") +@testset "Dataset" begin + X = [1 2; 3 4] + y = [10, 20] + d = Dataset("toy", X, y) + + @test d.name == "toy" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + @test size(d.X) == (2, 2) + @test length(d.y) == 2 + @test d.X[1, 1] == 1.0 + @test d.y[2] == 20.0 + + @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) +end + +@testset "one_hot_encode" begin + df = DataFrame( + A = ["red", "blue", "red", "green"], + B = [1, 2, 3, 4], + C = ["small", "large", "medium", "small"] + ) + + X = redirect_stdout(devnull) do + one_hot_encode(df; drop_first = true) + end + + @test size(X) == (4, 5) + @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] + @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) + @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) + @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) +end + +@testset "csv_dataset" begin + tmp = tempname() * ".csv" + df = DataFrame( + a = [1.0, 2.0, missing, 4.0], + b = ["x", "y", "y", "x"], + y = [10.0, 20.0, 30.0, 40.0] + ) + CSV.write(tmp, df) + + d = 
redirect_stdout(devnull) do + csv_dataset(tmp; target_col=:y, name="tmp") + end + + @test d.name == "tmp" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + + @test length(d.y) == 3 + @test size(d.X, 1) == 3 + @test d.y == [10.0, 20.0, 40.0] + + d2 = redirect_stdout(devnull) do + csv_dataset(tmp; target_col=3, name="tmp2") + end + @test d2.y == [10.0, 20.0, 40.0] + @test size(d2.X, 1) == 3 +end From 787a88dc8a58861c69e32562f73a1727db0d52bc Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:31:54 -0600 Subject: [PATCH 03/14] Small changes to design.md --- docs/src/design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design.md b/docs/src/design.md index c83592e..5c02373 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -14,7 +14,7 @@ In addition to dimensional regime, matrix conditioning will be incorporated as a | Blocking System | Factor | Blocks | |:----------------|:-------|:-------| -| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| | Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | # Treatments The treatments are the ridge regression solution methods: From 2dd229507d2dc9b07c2867c06a4779a55d1944f9 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:28:05 -0500 Subject: [PATCH 04/14] March 16 Updates --- src/dataset.jl | 42 ++++++++++++++++++------------------------ test/dataset_tests.jl | 2 +- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index d135557..18d7b6b 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -53,28 +53,25 @@ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64 n = nrow(Xdf) cols = Vector{Vector{Float64}}() - for name in names(Xdf) + for name in names(Xdf) #Selecting columns that aren't the target variable and 
pushing them to the columns. col = Xdf[!, name] if eltype(col) <: Real push!(cols, Float64.(col)) continue end - scol = string.(col) - lv = unique(scol) - ind = scol .== permutedims(lv) + scol = string.(col) # Convert to string for categorical processing. + lv = unique(scol) #Get unique category levels. + ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. + #Permutedims is used to align the dimensions for broadcasting. + #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. - println("Variable: $name") - for (j, level) in enumerate(lv) - println(" Dummy column (before drop) $j → $name = $level") - end - - if drop_first && size(ind, 2) > 1 + if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. ind = ind[:, 2:end] end for j in 1:size(ind, 2) - push!(cols, Float64.(ind[:, j])) + push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. end end @@ -93,17 +90,14 @@ end Load a dataset from a CSV file or URL. # Arguments -- `path_or_url::String` - Local file path or web URL that has CSV data. +- `path_or_url::String`: Local file path or web URL containing CSV data. -- `target_col` - Column index OR column name containing the response variable. - -- `name::String` - Dataset name. +# Keyword Arguments +- `target_col`: Column index or column name containing the response variable. +- `name::String="csv_dataset"`: Dataset name. # Returns -`Dataset` +A `Dataset`. 
""" function csv_dataset(path_or_url::String; target_col, @@ -115,13 +109,13 @@ function csv_dataset(path_or_url::String; Downloads.download(path_or_url) : path_or_url - df = DataFrame(CSV.File(filepath)) - df = dropmissing(df) - Xdf = select(df, DataFrames.Not(target_col)) + df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. + df = dropmissing(df) #Remove rows with missing values. + Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. y = target_col isa Int ? - df[:, target_col] : - df[:, Symbol(target_col)] + df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. + df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. X = one_hot_encode(Xdf; drop_first = true) diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index cefa8ce..5652c96 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -2,7 +2,7 @@ using Test using DataFrames using CSV -include("../src/dataset.jl") +using RidgeRegression @testset "Dataset" begin X = [1 2; 3 4] y = [10, 20] From 173cdd14a1197e6bbe003723fa6e8df87a58575d Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:29:22 -0500 Subject: [PATCH 05/14] Ridge Regression file --- src/RidgeRegression.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl index c32de91..5ef574e 100644 --- a/src/RidgeRegression.jl +++ b/src/RidgeRegression.jl @@ -1,5 +1,11 @@ module RidgeRegression +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset, one_hot_encode + +include("dataset.jl") -# Write your package code here. 
end From 042d5d60253d7b4e708fbc92a13ba1062df1d71a Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:12:52 -0500 Subject: [PATCH 06/14] dataset.jl small update --- src/dataset.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 18d7b6b..5f1b77c 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,9 +1,3 @@ -using CSV -using DataFrames -using Downloads - -export Dataset, csv_dataset - """ Dataset(name, X, y) @@ -17,8 +11,8 @@ Contains datasets for ridge regression experiments. # Throws - `ArgumentError`: If rows in `X` does not equal length of `y`. -# Notes -Used as the experimental unit for ridge regression experiments. +!!! note + Used as the experimental unit for ridge regression experiments. """ struct Dataset name::String From 70cfa67631c488de2828e9fa2a9e377ab4d2abbe Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:19:22 -0500 Subject: [PATCH 07/14] Updated Experimental Units and Treatments Sections --- docs/src/design.md | 64 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 5c02373..4795866 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -1,32 +1,68 @@ # Motivation and Background -Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. +Many modern science problems involve regression problems with extremely large numbers of predictors. 
Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. +There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. + +The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. # Questions Key Questions: Which ridge regression algorithm is provides the best balance between: --Numerical stability --Computational aspects (GPU/CPU, runtime, etc) --Predicative accuracy + +- Numerical stability +- Computational aspects (GPU/CPU, runtime, etc) # Experimental Units -The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. 
These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. +The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. + +Then + +$$ +X^\top X = V \Sigma^\top \Sigma V^\top += V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . +$$ + +Adding the ridge term gives -In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. 
+$$ +X^\top X + \lambda I += +V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . +$$ +$$ +\kappa_2(X^\top X+\lambda I) += +\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. +$$ + +When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. + +Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. 
| Blocking System | Factor | Blocks | |:----------------|:-------|:-------| | Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| +| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| # Treatments +(How many treatments should we expect) +(What treatments correspond to which blocking system) + The treatments are the ridge regression solution methods: --Gradient descent --Stochastic gradient descent --Closed-form solutions + +- Gradient-based optimization +- Stochastic gradient descent +- Direct Methods + +For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. If there are b possible combinations of the levels of each blocking factor + +The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. # Observational Units and Measurements +Explanation needed +(How many rows/columns to expect) +(Explain measurements: Floating point, real, etc. Meaning of values, what values it can take.) The observational units are each algorithm-dataset pair. 
For each combination we will observe the following | Measurement System | Factor | Measurements | |:--------------------------|:--------------------------|:-------------| -| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | -| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) | Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | -| Numerical Stability | Solution accuracy | Perturbation sensitivity | -| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file +| Numerical Stability | Solution accuracy | Perturbation sensitivity | \ No newline at end of file From 9b8c48ce777462643bc71f9b849b7b8fc1ca78ac Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:38:17 -0500 Subject: [PATCH 08/14] Small changes --- src/dataset.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 5f1b77c..4e20c89 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -28,20 +28,19 @@ struct Dataset end """ - one_hot_encode(dataset::Dataset; drop_first=true) + one_hot_encode(Xdf::DataFrame; drop_first=true) -One-hot encode categorical (string-like) features in `dataset.X`. +One-hot encode categorical (string-like) features in `Xdf`. # Arguments -- `dataset::Dataset`: Input dataset containing feature matrix `X` - and response vector `y`. +- `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. # Keyword Arguments - `drop_first::Bool=true`: If `true`, drop the first dummy column for each categorical feature to avoid multicollinearity. # Returns -A new `Dataset` with numeric `X` and unchanged `y`. +- `Matrix{Float64}`: A numeric matrix containing the encoded feature. """ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} n = nrow(Xdf) @@ -91,7 +90,7 @@ Load a dataset from a CSV file or URL. - `name::String="csv_dataset"`: Dataset name. 
# Returns -A `Dataset`. +- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. """ function csv_dataset(path_or_url::String; target_col, From bf9707b36959950aa3d5c9646998bf17f6fbe4cb Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Fri, 20 Mar 2026 18:45:54 -0500 Subject: [PATCH 09/14] Changes --- test/runtests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index dbbe06f..209d515 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,5 +2,6 @@ using RidgeRegression using Test @testset "RidgeRegression.jl" begin - # Write your tests here. + include("dataset_tests.jl") + include("bidiagonalization_tests.jl") end From 214e74d0fcabf336b87f7528502ae39c3b615cf3 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Fri, 20 Mar 2026 18:47:02 -0500 Subject: [PATCH 10/14] Ridge Regreesion jl changes --- src/RidgeRegression.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl index 5ef574e..53f2cde 100644 --- a/src/RidgeRegression.jl +++ b/src/RidgeRegression.jl @@ -3,9 +3,8 @@ using CSV using DataFrames using Downloads -export Dataset, csv_dataset, one_hot_encode - include("dataset.jl") +export Dataset, csv_dataset, one_hot_encode end From a87c905508ea53799a632521b50802b5206a9769 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Sat, 21 Mar 2026 15:15:16 -0500 Subject: [PATCH 11/14] Fix test environment and dataset tests --- Project.toml | 1 + src/RidgeRegression.jl | 1 + test/Project.toml | 7 +++++++ test/dataset_tests.jl | 1 - test/runtests.jl | 1 - 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 427e8aa..9c27dee 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ authors = ["Eton Tackett ", "Vivak Patel Date: Sat, 21 Mar 2026 15:34:04 -0500 Subject: [PATCH 12/14] Remove design.md from this PR (separate design branch) --- docs/src/design.md | 
68 ---------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 docs/src/design.md diff --git a/docs/src/design.md b/docs/src/design.md deleted file mode 100644 index 4795866..0000000 --- a/docs/src/design.md +++ /dev/null @@ -1,68 +0,0 @@ -# Motivation and Background -Many modern science problems involve regression problems with extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. - -There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. - -The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. -# Questions -Key Questions: -Which ridge regression algorithm is provides the best balance between: - -- Numerical stability -- Computational aspects (GPU/CPU, runtime, etc) -# Experimental Units -The experimental units are the datasets under fixed penalty weights. 
Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. - -In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. - -Then - -$$ -X^\top X = V \Sigma^\top \Sigma V^\top -= V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . -$$ - -Adding the ridge term gives - -$$ -X^\top X + \lambda I -= -V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . -$$ -$$ -\kappa_2(X^\top X+\lambda I) -= -\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. -$$ - -When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. 
This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. - -Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. - -| Blocking System | Factor | Blocks | -|:----------------|:-------|:-------| -| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| -| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| -# Treatments -(How many treatments should we expect) -(What treatments correspond to which blocking system) - -The treatments are the ridge regression solution methods: - -- Gradient-based optimization -- Stochastic gradient descent -- Direct Methods - -For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. 
The number of possible block combinations, $b$, is obtained by crossing the levels of all blocking factors.
- Dataset(name, X, y) + Dataset <: ExperimentalUnit -Contains datasets for ridge regression experiments. +A dataset for Ridge Regression experiements. + +# Description + +A `Dataset` object stores the design matrix ``X`` and response vector ``y`` +for a regression problem. These datasets serve as the experimental units for ridge regression experiments, allowing us to evaluate the performance of ridge regression models on various datasets. # Fields - `name::String`: Name of dataset -- `X::Matrix{Float64}`: Matrix of variables/features -- `y::Vector{Float64}`: Target vector +- `X::TX`: Matrix of variables/features +- `y::TY`: Target vector + +# Constructor + + Dataset(name::String, X::AbstractMatrix, y::AbstractVector) + +## Arguments +- `name::String`: Name of dataset +- `X::TX`: Matrix of variables/features +- `y::TY`: Target vector -# Throws +## Returns +- A `Dataset` object containing the numeric design matrix and response vector. + +## Throws - `ArgumentError`: If rows in `X` does not equal length of `y`. !!! note - Used as the experimental unit for ridge regression experiments. + `Dataset` objects are used as experimental units when evaluating + ridge regression algorithms. The parametric design allows both dense + and sparse matrices to be stored without forcing conversion to a + dense `Matrix{Float64}`. """ -struct Dataset +struct Dataset{TX<:AbstractMatrix, TY<:AbstractVector} name::String - X::Matrix{Float64} - y::Vector{Float64} + X::TX + y::TY - function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) + function Dataset(name::String, X::TX, y::TY) where {TX<:AbstractMatrix, TY<:AbstractVector} size(X, 1) == length(y) || throw(ArgumentError("X and y must have same number of rows")) - new(name, Matrix{Float64}(X), Vector{Float64}(y)) + new{TX, TY}(name, X, y) end end @@ -36,28 +56,28 @@ One-hot encode categorical (string-like) features in `Xdf`. - `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. 
# Keyword Arguments +- `cols_to_encode`: A collection of column names or indices to one-hot encode. - `drop_first::Bool=true`: If `true`, drop the first dummy column for each categorical feature to avoid multicollinearity. # Returns - `Matrix{Float64}`: A numeric matrix containing the encoded feature. """ -function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} +function one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first::Bool = true)::Matrix{Float64} n = nrow(Xdf) cols = Vector{Vector{Float64}}() + encode_names = Set(c isa Int ? Symbol(names(Xdf)[c]) : Symbol(c) for c in cols_to_encode) + for name in names(Xdf) #Selecting columns that aren't the target variable and pushing them to the columns. col = Xdf[!, name] - if eltype(col) <: Real - push!(cols, Float64.(col)) - continue - end - - scol = string.(col) # Convert to string for categorical processing. - lv = unique(scol) #Get unique category levels. - ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. - #Permutedims is used to align the dimensions for broadcasting. - #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. + name_sym = Symbol(name) + if name_sym in encode_names + scol = string.(col) # Convert to string for categorical processing. + lv = unique(scol) #Get unique category levels. + ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. + #Permutedims is used to align the dimensions for broadcasting. + #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. 
if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. ind = ind[:, 2:end] @@ -66,6 +86,12 @@ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64 for j in 1:size(ind, 2) push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. end + else + eltype(col) <: Real || + throw(ArgumentError("Column $name must be numeric unless it is listed in cols_to_encode")) + + push!(cols, Float64.(col)) + end end p = length(cols) @@ -78,7 +104,7 @@ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64 end """ - csv_dataset(path_or_url; target_col, name="csv_dataset") + load_csv_dataset(path_or_url; target_col, name="csv_dataset") Load a dataset from a CSV file or URL. @@ -86,16 +112,14 @@ Load a dataset from a CSV file or URL. - `path_or_url::String`: Local file path or web URL containing CSV data. # Keyword Arguments +- `cols_to_encode=Symbol[]`: Column names or indices in the feature data to one-hot encode. - `target_col`: Column index or column name containing the response variable. - `name::String="csv_dataset"`: Dataset name. # Returns - `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. """ -function csv_dataset(path_or_url::String; - target_col, - name::String = "csv_dataset" -) +function load_csv_dataset(path_or_url::String; cols_to_encode=Symbol[], target_col, name::String = "csv_dataset") filepath = startswith(path_or_url, "http") ? @@ -111,9 +135,10 @@ function csv_dataset(path_or_url::String; df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. - X = one_hot_encode(Xdf; drop_first = true) - + feature_names = names(Xdf) + encode_cols = [c isa Int ? 
Symbol(names(Xdf)[c]) : Symbol(c) for c in cols_to_encode] + X = one_hot_encode(Xdf; cols_to_encode=encode_cols, drop_first = true) - return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) + return Dataset(name, X, collect(Float64, y)) end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index 1d8ee80..a0176a9 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -1,65 +1,19 @@ -using Test -using DataFrames -using CSV -using RidgeRegression -@testset "Dataset" begin +@testset "Testset 1" begin X = [1 2; 3 4] y = [10, 20] d = Dataset("toy", X, y) - @test d.name == "toy" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - @test size(d.X) == (2, 2) - @test length(d.y) == 2 - @test d.X[1, 1] == 1.0 - @test d.y[2] == 20.0 - - @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) -end - -@testset "one_hot_encode" begin - df = DataFrame( - A = ["red", "blue", "red", "green"], - B = [1, 2, 3, 4], - C = ["small", "large", "medium", "small"] - ) - - X = redirect_stdout(devnull) do - one_hot_encode(df; drop_first = true) - end - - @test size(X) == (4, 5) - @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] - @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) - @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) - @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) + @test "toy" == d.name + @test X == d.X + @test y == d.y + @test (2, 2) == size(d.X) + @test 2 == length(d.y) + @test 1.0 == d.X[1, 1] + @test 20.0 == d.y[2] end -@testset "csv_dataset" begin - tmp = tempname() * ".csv" - df = DataFrame( - a = [1.0, 2.0, missing, 4.0], - b = ["x", "y", "y", "x"], - y = [10.0, 20.0, 30.0, 40.0] - ) - CSV.write(tmp, df) - - d = redirect_stdout(devnull) do - csv_dataset(tmp; target_col=:y, name="tmp") - end - - @test d.name == "tmp" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - - @test length(d.y) == 3 - @test size(d.X, 1) == 3 - @test d.y == [10.0, 20.0, 40.0] +@testset "Testset 2" begin + X = [1 2; 3 4] - d2 = redirect_stdout(devnull) do - 
csv_dataset(tmp; target_col=3, name="tmp2") - end - @test d2.y == [10.0, 20.0, 40.0] - @test size(d2.X, 1) == 3 + @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) end diff --git a/test/encoding_tests.jl b/test/encoding_tests.jl new file mode 100644 index 0000000..ca67e61 --- /dev/null +++ b/test/encoding_tests.jl @@ -0,0 +1,38 @@ +@testset "Testset 1" begin + df = DataFrame( + A = ["red", "blue", "red", "green"], + B = [1, 2, 3, 4], + C = ["small", "large", "medium", "small"] + ) + + X = one_hot_encode(df; cols_to_encode=[:A, :C], drop_first=true) + + @test (4, 5) == size(X) + @test [1.0, 2.0, 3.0, 4.0] == X[:, 3] + @test all(x -> x == 0.0 || x == 1.0, X[:, [1, 2, 4, 5]]) + @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) + @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) +end + +@testset "Testset 2" begin + df = DataFrame( + A = ["red", "blue", "red", "green"], + B = [1, 2, 3, 4], + C = ["small", "large", "medium", "small"] + ) + + @test_throws ArgumentError one_hot_encode(df; cols_to_encode=[:A], drop_first=true) +end + +@testset "Testset 3" begin + df = DataFrame( + group = [1, 2, 1, 3], + x = [10.0, 20.0, 30.0, 40.0] + ) + + X = one_hot_encode(df; cols_to_encode=[:group], drop_first=true) + + @test (4, 3) == size(X) + @test [10.0, 20.0, 30.0, 40.0] == X[:, 3] + @test all(x -> x == 0.0 || x == 1.0, X[:, 1:2]) +end diff --git a/test/load_csv_dataset_tests.jl b/test/load_csv_dataset_tests.jl new file mode 100644 index 0000000..67f41ed --- /dev/null +++ b/test/load_csv_dataset_tests.jl @@ -0,0 +1,38 @@ +@testset "Testset 1" begin + tmp = tempname() * ".csv" + + df = DataFrame( + a = [1.0, 2.0, missing, 4.0], + b = ["x", "y", "y", "x"], + y = [10.0, 20.0, 30.0, 40.0] + ) + + CSV.write(tmp, df) + + d = load_csv_dataset(tmp; target_col=:y, cols_to_encode=[:b], name="tmp") + + @test "tmp" == d.name + @test 3 == length(d.y) + @test 3 == size(d.X, 1) + @test [10.0, 20.0, 40.0] == d.y + @test (3, 2) == size(d.X) +end + +@testset "Testset 2" begin + tmp = tempname() * 
".csv" + + df = DataFrame( + a = [1.0, 2.0, missing, 4.0], + b = ["x", "y", "y", "x"], + y = [10.0, 20.0, 30.0, 40.0] + ) + + CSV.write(tmp, df) + + d = load_csv_dataset(tmp; target_col=3, cols_to_encode=[:b], name="tmp2") + + @test "tmp2" == d.name + @test [10.0, 20.0, 40.0] == d.y + @test 3 == size(d.X, 1) + @test (3, 2) == size(d.X) +end diff --git a/test/runtests.jl b/test/runtests.jl index 3154bcd..c8b78c5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,18 @@ using RidgeRegression using Test +using DataFrames +using CSV @testset "RidgeRegression.jl" begin - include("dataset_tests.jl") + @testset "Dataset tests" begin + include("dataset_tests.jl") + end + + @testset "Encoding tests" begin + include("encoding_tests.jl") + end + + @testset "Load CSV dataset tests" begin + include("load_csv_dataset_tests.jl") + end end From 5cbeb0b0ac5001a35841f49f02b6c56b1b227f09 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 24 Mar 2026 10:08:16 -0500 Subject: [PATCH 14/14] New Edits --- test/runtests.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index c8b78c5..9545091 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,18 +1,19 @@ using RidgeRegression using Test using DataFrames -using CSV +using LinearAlgebra @testset "RidgeRegression.jl" begin - @testset "Dataset tests" begin + @testset "Dataset Tests" begin include("dataset_tests.jl") end - @testset "Encoding tests" begin + @testset "One-Hot Encoding Tests" begin include("encoding_tests.jl") end - @testset "Load CSV dataset tests" begin + @testset "Load CSV Dataset Tests" begin include("load_csv_dataset_tests.jl") end + end