vp314 · etontackett · Mar 2, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 17, 2026
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,15 @@
 name = "RidgeRegression"
 uuid = "739161c8-60e1-4c49-8f89-ff30998444b1"
-authors = ["Vivak Patel <vp314@users.noreply.github.com>"]
 version = "0.1.0"
+authors = ["Eton Tackett <etont@icloud.com>", "Vivak Patel <vp314@users.noreply.github.com>"]
+
+[deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 
 [compat]
+CSV = "0.10.15"
+DataFrames = "1.8.1"
+Downloads = "1.7.0"
 julia = "1.12.4"
diff --git a/docs/make.jl b/docs/make.jl
@@ -14,6 +14,7 @@ makedocs(;
     ),
     pages=[
         "Home" => "index.md",
+        "Design" => "design.md",
     ],
 )
 

diff --git a/docs/src/design.md b/docs/src/design.md
@@ -0,0 +1,32 @@
+# Motivation and Background
+Many modern applications, such as genome-wide association studies (GWAS), involve regression problems with a large number of predictors. Traditional least squares methods can fail in this setting because of noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term that shrinks the parameter estimates. The goal is to select the best possible model, "best" in the sense of the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, uses the squared Euclidean norm of the coefficient vector as its penalty (regularization) term.
+
+# Questions
+Key question: which ridge regression algorithm provides the best balance among the following?
+
+- Numerical stability
+- Computational aspects (GPU/CPU, runtime, etc.)
+- Predictive accuracy
+
+# Experimental Units
+The experimental units are the datasets under fixed penalty weights. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations.
+
+In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects the numerical stability and convergence behavior of ridge solution methods. Ill-conditioned problems slow the convergence of iterative solvers and amplify errors, while well-conditioned problems tend to yield stable, rapidly convergent behavior.
+
+| Blocking System | Factor | Blocks |
+|:----------------|:-------|:-------|
+| Dataset | Dimensional regime (``p/n``) | ``p \ll n``, ``p \approx n``, ``p \gg n`` |
+| Matrix conditioning | Condition number of ``X`` or ``X^T X`` | Low, Medium, High |
+
+# Treatments
+The treatments are the ridge regression solution methods:
+
+- Gradient descent
+- Stochastic gradient descent
+- Closed-form solutions
+
+# Observational Units and Measurements
+The observational units are the algorithm-dataset pairs. For each pair, we will record the following measurements:
+
+| Measurement System        | Factor                    | Measurements |
+|:--------------------------|:--------------------------|:-------------|
+| Predictive Performance    | Prediction error          | Training MSE, Test MSE, RMSE, R² |
+| Estimation Accuracy       | Parameter recovery        | ‖β̂ − β_true‖₂² (if β_true is known) |
+| Computational Performance | Efficiency                | Runtime (seconds), Iterations to convergence |
+| Numerical Stability       | Solution accuracy         | Perturbation sensitivity |
+| Model Complexity          | Coefficient magnitude     | ‖β̂‖₂ |
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -0,0 +1,132 @@
+using CSV
+using DataFrames
+using Downloads
+
+export Dataset, csv_dataset
+
+"""
+    Dataset(name, X, y)
+
+Contains datasets for ridge regression experiments.
+
+# Fields
+- `name::String`: Name of dataset
+- `X::Matrix{Float64}`: Matrix of variables/features
+- `y::Vector{Float64}`: Target vector
+
+# Throws
+- `ArgumentError`: If rows in `X` does not equal length of `y`.
+
+# Notes
+Used as the experimental unit for ridge regression experiments.
+"""
+struct Dataset
+    name::String
+    X::Matrix{Float64}
+    y::Vector{Float64}
+
+    function Dataset(name::String, X::AbstractMatrix, y::AbstractVector)
+        size(X, 1) == length(y) ||
+            throw(ArgumentError("X and y must have same number of rows"))
+
+        new(name, Matrix{Float64}(X), Vector{Float64}(y))
+    end
+end
+
+"""
+    one_hot_encode(dataset::Dataset; drop_first=true)
+
+One-hot encode categorical (string-like) features in `dataset.X`.
+
+# Arguments
+- `dataset::Dataset`: Input dataset containing feature matrix `X`
+  and response vector `y`.
+
+# Keyword Arguments
+- `drop_first::Bool=true`: If `true`, drop the first dummy column for
+  each categorical feature to avoid multicollinearity.
+
+# Returns
+A new `Dataset` with numeric `X` and unchanged `y`.
+"""
+function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64}
+    n = nrow(Xdf)
+    cols = Vector{Vector{Float64}}()
+
+    for name in names(Xdf)
+        col = Xdf[!, name]
+        if eltype(col) <: Real
+            push!(cols, Float64.(col))
+            continue
+        end
+
+        scol = string.(col)
+        lv = unique(scol)
+        ind = scol .== permutedims(lv)
+
+        println("Variable: $name")
+        for (j, level) in enumerate(lv)
+            println("  Dummy column (before drop) $j → $name = $level")
+        end
+
+        if drop_first && size(ind, 2) > 1
+            ind = ind[:, 2:end]
+        end
+
+        for j in 1:size(ind, 2)
+            push!(cols, Float64.(ind[:, j]))
+        end
+    end
+
+    p = length(cols)
+    X = Matrix{Float64}(undef, n, p)
+    for j in 1:p
+        X[:, j] = cols[j]
+    end
+
+    return Matrix{Float64}(X)
+
+end
+"""
+    csv_dataset(path_or_url; target_col, name="csv_dataset")
+
+Load a dataset from a CSV file or URL.
+
+# Arguments
+- `path_or_url::String`
+    Local file path or web URL that has CSV data.
+
+- `target_col`
+    Column index OR column name containing the response variable.
+
+- `name::String`
+    Dataset name.
+
+# Returns
+`Dataset`
+"""
+function csv_dataset(path_or_url::String;
+    target_col,
+    name::String = "csv_dataset"
+)
+
+    filepath =
+        startswith(path_or_url, "http") ?
+        Downloads.download(path_or_url) :
+        path_or_url
+
+    df = DataFrame(CSV.File(filepath))
+    df = dropmissing(df)
+    Xdf = select(df, DataFrames.Not(target_col))
+
+    y = target_col isa Int ?
+        df[:, target_col] :
+        df[:, Symbol(target_col)]
+
+
+    X = one_hot_encode(Xdf; drop_first = true)
+
+
+
+    return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y))
+end
diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl
@@ -0,0 +1,66 @@
+using Test
+using DataFrames
+using CSV
+
+include("../src/dataset.jl")
+@testset "Dataset" begin
+    features = [1 2; 3 4]
+    response = [10, 20]
+    toy = Dataset("toy", features, response)
+
+    # The name is kept and both arrays are stored in Float64 containers.
+    @test toy.name == "toy"
+    @test toy.X isa Matrix{Float64}
+    @test toy.y isa Vector{Float64}
+
+    # Shapes and values survive the conversion from integer input.
+    @test size(toy.X) == (2, 2)
+    @test length(toy.y) == 2
+    @test toy.X[1, 1] == 1.0
+    @test toy.y[2] == 20.0
+
+    # A response whose length differs from the number of rows is rejected.
+    @test_throws ArgumentError Dataset("bad", features, [1, 2, 3])
+end
+
+@testset "one_hot_encode" begin
+    df = DataFrame(
+        A = ["red", "blue", "red", "green"],
+        B = [1, 2, 3, 4],
+        C = ["small", "large", "medium", "small"]
+    )
+
+    X = redirect_stdout(devnull) do
+        one_hot_encode(df; drop_first = true)
+    end
+
+    @test size(X) == (4, 5)
+    @test X[:, 3] == [1.0, 2.0, 3.0, 4.0]
+    @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]])
+    @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1)
+    @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1)
+
+    # Pin the exact encoding. Levels are ordered by first appearance, so the dropped
+    # reference levels are "red" and "small"; the remaining columns are
+    # A=blue, A=green, B, C=large, C=medium.
+    @test X == [
+        0 0 1 0 0
+        1 0 2 1 0
+        0 0 3 0 1
+        0 1 4 0 0
+    ]
+
+    # Without dropping, each categorical variable contributes one column per level and
+    # every row has exactly one active indicator per variable.
+    Xfull = redirect_stdout(devnull) do
+        one_hot_encode(df; drop_first = false)
+    end
+
+    @test size(Xfull) == (4, 7)
+    @test Xfull[:, 4] == [1.0, 2.0, 3.0, 4.0]
+    @test all(vec(sum(Xfull[:, 1:3]; dims=2)) .== 1)
+    @test all(vec(sum(Xfull[:, 5:7]; dims=2)) .== 1)
+end
+
+@testset "csv_dataset" begin
+    # `mktempdir() do ... end` removes the fixture afterwards instead of leaking a
+    # file in the system temp directory on every test run.
+    mktempdir() do dir
+        tmp = joinpath(dir, "data.csv")
+        df = DataFrame(
+            a = [1.0, 2.0, missing, 4.0],
+            b = ["x", "y", "y", "x"],
+            y = [10.0, 20.0, 30.0, 40.0]
+        )
+        CSV.write(tmp, df)
+
+        # After the row with the missing `a` is dropped, the features are
+        # a = [1, 2, 4] and the indicator for b == "y" ("x" is the dropped level).
+        expected_X = [1.0 0.0; 2.0 1.0; 4.0 0.0]
+
+        d = redirect_stdout(devnull) do
+            csv_dataset(tmp; target_col=:y, name="tmp")
+        end
+
+        @test d.name == "tmp"
+        @test d.X isa Matrix{Float64}
+        @test d.y isa Vector{Float64}
+
+        @test length(d.y) == 3
+        @test size(d.X, 1) == 3
+        @test d.y == [10.0, 20.0, 40.0]
+        @test d.X == expected_X
+
+        # Selecting the target by position must give the same dataset.
+        d2 = redirect_stdout(devnull) do
+            csv_dataset(tmp; target_col=3, name="tmp2")
+        end
+        @test d2.y == [10.0, 20.0, 40.0]
+        @test size(d2.X, 1) == 3
+        @test d2.X == expected_X
+    end
+end
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,6 +14,7 @@ makedocs(; @@
         ),
         pages=[
             "Home" => "index.md",
+            "Design" => "design.md",
         ],
     )
@@ Expand Down @@