diff --git a/Project.toml b/Project.toml
index 86ea0a5..9c27dee 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,15 @@
 name = "RidgeRegression"
 uuid = "739161c8-60e1-4c49-8f89-ff30998444b1"
-authors = ["Vivak Patel "]
 version = "0.1.0"
+authors = ["Eton Tackett ", "Vivak Patel "]
+
+[deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 
 [compat]
+CSV = "0.10.15"
+DataFrames = "1.8.1"
+Downloads = "1.7.0"
 julia = "1.12.4"
diff --git a/docs/make.jl b/docs/make.jl
index a1097bb..d42cfbe 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -14,6 +14,7 @@ makedocs(;
     ),
     pages=[
         "Home" => "index.md",
+        "Design" => "design.md",
     ],
 )
 
diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl
index c32de91..f1bbc19 100644
--- a/src/RidgeRegression.jl
+++ b/src/RidgeRegression.jl
@@ -1,5 +1,11 @@
 module RidgeRegression
 
-# Write your package code here.
+using CSV
+using DataFrames
+using Downloads
+
+include("dataset.jl")
+
+export Dataset, load_csv_dataset, one_hot_encode
 
 end
diff --git a/src/dataset.jl b/src/dataset.jl
new file mode 100644
index 0000000..16b3246
--- /dev/null
+++ b/src/dataset.jl
@@ -0,0 +1,172 @@
+"""
+    Dataset{TX<:AbstractMatrix, TY<:AbstractVector}
+
+A dataset for ridge regression experiments.
+
+# Description
+
+A `Dataset` object stores the design matrix ``X`` and response vector ``y``
+for a regression problem. These datasets serve as the experimental units for
+ridge regression experiments, allowing the performance of ridge regression
+models to be evaluated on a variety of datasets.
+
+# Fields
+- `name::String`: Name of the dataset.
+- `X::TX`: Matrix of variables/features, one row per observation.
+- `y::TY`: Target vector, one entry per observation.
+
+# Constructor
+
+    Dataset(name::String, X::AbstractMatrix, y::AbstractVector)
+
+## Arguments
+- `name::String`: Name of the dataset.
+- `X::AbstractMatrix`: Matrix of variables/features.
+- `y::AbstractVector`: Target vector.
+
+## Returns
+- A `Dataset` object storing `name`, `X`, and `y` as given.
+
+## Throws
+- `ArgumentError`: If the number of rows in `X` does not equal the length
+  of `y`.
+
+!!! note
+    `Dataset` objects are used as experimental units when evaluating
+    ridge regression algorithms. The parametric design allows both dense
+    and sparse matrices to be stored without forcing conversion to a
+    dense `Matrix{Float64}`.
+"""
+struct Dataset{TX<:AbstractMatrix,TY<:AbstractVector}
+    name::String
+    X::TX
+    y::TY
+
+    function Dataset(
+        name::String, X::TX, y::TY
+    ) where {TX<:AbstractMatrix,TY<:AbstractVector}
+        nrows, nobs = size(X, 1), length(y)
+        nrows == nobs || throw(
+            ArgumentError(
+                "X and y must have same number of rows (got $nrows and $nobs)"
+            ),
+        )
+
+        return new{TX,TY}(name, X, y)
+    end
+end
+
+"""
+    one_hot_encode(Xdf::DataFrame; cols_to_encode=Symbol[], drop_first=true)
+
+One-hot encode the selected features of `Xdf` and return every feature as a
+numeric matrix.
+
+Output columns follow the column order of `Xdf`. Each column selected by
+`cols_to_encode` is replaced by one 0/1 indicator column per level (levels are
+compared by their string form and ordered by first appearance); every other
+column must be numeric and is converted to `Float64`.
+
+# Arguments
+- `Xdf::DataFrame`: Feature table. It should not contain the response column.
+
+# Keyword Arguments
+- `cols_to_encode=Symbol[]`: Collection of column names or integer column
+  indices of `Xdf` to one-hot encode. Names that do not occur in `Xdf` are
+  ignored.
+- `drop_first::Bool=true`: If `true`, drop the first indicator column of each
+  encoded feature that has more than one level, to avoid multicollinearity.
+
+# Returns
+- `Matrix{Float64}`: The encoded feature matrix with `nrow(Xdf)` rows.
+
+# Throws
+- `ArgumentError`: If a column that is not selected by `cols_to_encode` is not
+  numeric.
+"""
+function one_hot_encode(
+    Xdf::DataFrame; cols_to_encode=Symbol[], drop_first::Bool=true
+)::Matrix{Float64}
+    colnames = names(Xdf)
+    # Normalise names and integer indices to one set of column-name symbols.
+    encode_names = Set{Symbol}(
+        c isa Integer ? Symbol(colnames[c]) : Symbol(c) for c in cols_to_encode
+    )
+
+    cols = Vector{Float64}[]
+    for name in colnames
+        col = Xdf[!, name]
+        if Symbol(name) in encode_names
+            # Compare string forms so columns of any element type can be encoded.
+            scol = string.(col)
+            levels = unique(scol)
+            first_kept = drop_first && length(levels) > 1 ? 2 : 1
+            for level in @view levels[first_kept:end]
+                push!(cols, Float64.(scol .== level))
+            end
+        else
+            eltype(col) <: Real || throw(
+                ArgumentError(
+                    "Column $name must be numeric unless it is listed in cols_to_encode",
+                ),
+            )
+            push!(cols, Float64.(col))
+        end
+    end
+
+    X = Matrix{Float64}(undef, nrow(Xdf), length(cols))
+    for (j, col) in enumerate(cols)
+        X[:, j] = col
+    end
+    return X
+end
+
+"""
+    load_csv_dataset(path_or_url; target_col, cols_to_encode=Symbol[], name="csv_dataset")
+
+Load a dataset from a local CSV file or an HTTP(S) URL.
+
+Rows with a missing value in any column are dropped before the features are
+encoded.
+
+# Arguments
+- `path_or_url::AbstractString`: Local file path, or an `http://`/`https://`
+  URL, of the CSV data.
+
+# Keyword Arguments
+- `cols_to_encode=Symbol[]`: Column names, or integer indices into the feature
+  columns (i.e. after `target_col` has been removed), to one-hot encode.
+- `target_col`: Column index or column name containing the response variable.
+- `name::String="csv_dataset"`: Dataset name.
+
+# Returns
+- `Dataset`: A dataset containing the encoded feature matrix `X`, the response
+  vector `y` as a `Vector{Float64}`, and the dataset name.
+
+# Throws
+- `ArgumentError`: If the response column is not numeric, or if a feature
+  column is neither numeric nor listed in `cols_to_encode`.
+"""
+function load_csv_dataset(
+    path_or_url::AbstractString;
+    cols_to_encode=Symbol[],
+    target_col,
+    name::String="csv_dataset",
+)
+    # Require a full scheme so that a local path which merely starts with
+    # "http" (e.g. "http_logs.csv") is not mistaken for a URL.
+    is_url = occursin(r"^https?://"i, path_or_url)
+    filepath = is_url ? Downloads.download(path_or_url) : path_or_url
+
+    df = dropmissing(DataFrame(CSV.File(filepath)))
+
+    # DataFrames accepts integer indices and names alike as column selectors.
+    y = df[!, target_col]
+    eltype(y) <: Real ||
+        throw(ArgumentError("Target column $target_col must be numeric"))
+
+    Xdf = select(df, DataFrames.Not(target_col))
+    X = one_hot_encode(Xdf; cols_to_encode=cols_to_encode, drop_first=true)
+
+    return Dataset(name, X, collect(Float64, y))
+end
diff --git a/test/Project.toml b/test/Project.toml
index 0c36332..b8a3f94 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,2 +1,10 @@
 [deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+RidgeRegression = "739161c8-60e1-4c49-8f89-ff30998444b1"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+CSV = "0.10"
+DataFrames = "1"
diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl
new file mode 100644
index 0000000..a0176a9
--- /dev/null
+++ b/test/dataset_tests.jl
@@ -0,0 +1,19 @@
+@testset "stores name, X and y as given" begin
+    X = [1 2; 3 4]
+    y = [10, 20]
+    d = Dataset("toy", X, y)
+
+    @test "toy" == d.name
+    @test X == d.X
+    @test y == d.y
+    @test (2, 2) == size(d.X)
+    @test 2 == length(d.y)
+    @test 1.0 == d.X[1, 1]
+    @test 20.0 == d.y[2]
+end
+
+@testset "rejects mismatched row counts" begin
+    X = [1 2; 3 4]
+
+    @test_throws ArgumentError Dataset("bad", X, [1, 2, 3])
+end
diff --git a/test/encoding_tests.jl b/test/encoding_tests.jl
new file mode 100644
index 0000000..ca67e61
--- /dev/null
+++ b/test/encoding_tests.jl
@@ -0,0 +1,46 @@
+@testset "encodes string columns and keeps numeric columns" begin
+    df = DataFrame(
+        A = ["red", "blue", "red", "green"],
+        B = [1, 2, 3, 4],
+        C = ["small", "large", "medium", "small"]
+    )
+
+    X = one_hot_encode(df; cols_to_encode=[:A, :C], drop_first=true)
+
+    @test (4, 5) == size(X)
+    @test [1.0, 2.0, 3.0, 4.0] == X[:, 3]
+    @test all(x -> x == 0.0 || x == 1.0, X[:, [1, 2, 4, 5]])
+    @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1)
+    @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1)
+end
+
+@testset "rejects non-numeric columns that are not encoded" begin
+    df = DataFrame(
+        A = ["red", "blue", "red", "green"],
+        B = [1, 2, 3, 4],
+        C = ["small", "large", "medium", "small"]
+    )
+
+    @test_throws ArgumentError one_hot_encode(df; cols_to_encode=[:A], drop_first=true)
+end
+
+@testset "encodes numeric columns on request" begin
+    df = DataFrame(
+        group = [1, 2, 1, 3],
+        x = [10.0, 20.0, 30.0, 40.0]
+    )
+
+    X = one_hot_encode(df; cols_to_encode=[:group], drop_first=true)
+
+    @test (4, 3) == size(X)
+    @test [10.0, 20.0, 30.0, 40.0] == X[:, 3]
+    @test all(x -> x == 0.0 || x == 1.0, X[:, 1:2])
+end
+
+@testset "accepts integer column indices" begin
+    df = DataFrame(A = ["red", "blue", "red", "green"], B = [1, 2, 3, 4])
+
+    X = one_hot_encode(df; cols_to_encode=[1], drop_first=false)
+
+    @test [1.0 0.0 0.0 1.0; 0.0 1.0 0.0 2.0; 1.0 0.0 0.0 3.0; 0.0 0.0 1.0 4.0] == X
+end
diff --git a/test/load_csv_dataset_tests.jl b/test/load_csv_dataset_tests.jl
new file mode 100644
index 0000000..67f41ed
--- /dev/null
+++ b/test/load_csv_dataset_tests.jl
@@ -0,0 +1,64 @@
+@testset "target selected by name" begin
+    mktempdir() do dir
+        tmp = joinpath(dir, "data.csv")
+
+        df = DataFrame(
+            a = [1.0, 2.0, missing, 4.0],
+            b = ["x", "y", "y", "x"],
+            y = [10.0, 20.0, 30.0, 40.0]
+        )
+
+        CSV.write(tmp, df)
+
+        d = load_csv_dataset(tmp; target_col=:y, cols_to_encode=[:b], name="tmp")
+
+        @test "tmp" == d.name
+        @test 3 == length(d.y)
+        @test 3 == size(d.X, 1)
+        @test [10.0, 20.0, 40.0] == d.y
+        @test (3, 2) == size(d.X)
+    end
+end
+
+@testset "target selected by index" begin
+    mktempdir() do dir
+        tmp = joinpath(dir, "data.csv")
+
+        df = DataFrame(
+            a = [1.0, 2.0, missing, 4.0],
+            b = ["x", "y", "y", "x"],
+            y = [10.0, 20.0, 30.0, 40.0]
+        )
+
+        CSV.write(tmp, df)
+
+        d = load_csv_dataset(tmp; target_col=3, cols_to_encode=[:b], name="tmp2")
+
+        @test "tmp2" == d.name
+        @test [10.0, 20.0, 40.0] == d.y
+        @test 3 == size(d.X, 1)
+        @test (3, 2) == size(d.X)
+    end
+end
+
+@testset "relative path starting with http is read from disk" begin
+    mktempdir() do dir
+        cd(dir) do
+            CSV.write("http_data.csv", DataFrame(a = [1.0, 2.0], y = [3.0, 4.0]))
+
+            d = load_csv_dataset("http_data.csv"; target_col=:y)
+
+            @test reshape([1.0, 2.0], 2, 1) == d.X
+            @test [3.0, 4.0] == d.y
+        end
+    end
+end
+
+@testset "rejects non-numeric target" begin
+    mktempdir() do dir
+        tmp = joinpath(dir, "data.csv")
+        CSV.write(tmp, DataFrame(a = [1.0, 2.0], y = ["u", "v"]))
+
+        @test_throws ArgumentError load_csv_dataset(tmp; target_col=:y)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index dbbe06f..9545091 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,19 @@
 using RidgeRegression
 using Test
+using CSV
+using DataFrames
+using LinearAlgebra
 
 @testset "RidgeRegression.jl" begin
-    # Write your tests here.
+    @testset "Dataset Tests" begin
+        include("dataset_tests.jl")
+    end
+
+    @testset "One-Hot Encoding Tests" begin
+        include("encoding_tests.jl")
+    end
+
+    @testset "Load CSV Dataset Tests" begin
+        include("load_csv_dataset_tests.jl")
+    end
 end