# FullyConnected.jl
# Assumed imports (not shown in this excerpt; the file is presumably `include`d
# in a module that already loads these — listed here so the file is self-contained):
using Flux
using Flux: @functor, PairwiseFusion
using Optimisers
using Random
using Statistics: mean
import MLJFlux

mutable struct FullyConnected
    layers::PairwiseFusion
    pass_through::Vector{<:Dense}
end
@functor FullyConnected # register with Functors.jl so Flux can train the fields and move them to GPU
function (model::FullyConnected)(x)
    # Project the raw input through each pass-through layer, then feed the
    # input and all projections into the fusion chain; keep the final output.
    pass_through_inputs = [layer(x) for layer in model.pass_through]
    return model.layers((x, pass_through_inputs...))[end]
end
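# Data flow (a sketch): `PairwiseFusion(vcat, l1, l2, l3)` applied to the tuple
# `(x, p1, p2)` computes
#
#     y1 = l1(x)
#     y2 = l2(vcat(y1, p1))
#     y3 = l3(vcat(y2, p2))
#
# and returns `(y1, y2, y3)`, so the `[end]` above selects the network's output.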
function Base.show(io::IO, model::FullyConnected)
    println(io, "FullyConnected(")
    println(io, "  Layers: ", model.layers, ",")
    println(io, "  Pass-Through: ", model.pass_through)
    return println(io, ")")
end
"""
FullyConnected(input_size::Int, hidden_sizes::Vector{Int}, output_size::Int)::FullyConnected
Create a fully connected neural network with `input_size` inputs, `hidden_sizes` hidden layers and `output_size` outputs.
Adds a pass through layer for each layer.
"""
function FullyConnected(
    input_size::Int,
    hidden_sizes::Vector{Int},
    output_size::Int;
    init = Flux.glorot_uniform(Random.GLOBAL_RNG),
)::FullyConnected
    # Collect the fusion layers
    layers = []
    # One pass-through layer (input -> 1 unit) per layer after the first
    pass_through = [Dense(input_size, 1; init = init) for _ = 2:(length(hidden_sizes)+1)]
    # First layer, connected directly to the input
    push!(layers, Dense(input_size, hidden_sizes[1], relu; init = init))
    # Hidden layers; the +1 accounts for the fused 1-unit pass-through output
    for i = 2:length(hidden_sizes)
        push!(layers, Dense(hidden_sizes[i-1] + 1, hidden_sizes[i], relu; init = init))
    end
    # Output layer, connected to the last hidden layer plus its pass-through unit
    push!(layers, Dense(hidden_sizes[end] + 1, output_size; init = init))
    return FullyConnected(PairwiseFusion(vcat, layers...), pass_through) |> gpu
end
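# A minimal usage sketch (sizes are illustrative only; `gpu` falls back to the
# CPU silently when no GPU is available):
#
#     model = FullyConnected(3, [16, 16], 1)
#     x = rand(Float32, 3, 32) |> gpu   # 3 features, batch of 32
#     model(x)                          # 1×32 output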
mutable struct FullyConnectedBuilder <: MLJFlux.Builder
    hidden_sizes::Vector{Int}
end

function MLJFlux.build(builder::FullyConnectedBuilder, rng, n_in, n_out)
    init = Flux.glorot_uniform(rng)
    return Chain(FullyConnected(n_in, builder.hidden_sizes, n_out; init = init)) |> gpu
end
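# A sketch of plugging the builder into an MLJFlux model (hyperparameters are
# illustrative only):
#
#     using MLJ
#     regressor = MLJFlux.NeuralNetworkRegressor(
#         builder = FullyConnectedBuilder([32, 32]),
#         epochs = 50,
#     )
#     mach = machine(regressor, X, y)
#     fit!(mach)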
# mutable struct ConvexRegressor <: MLJFlux.MLJFluxDeterministic
# model::MLJFlux.Regressor
# end
# @forward((ConvexRegressor, :model), MLJFlux.Regressor)
# A wrapper rule carrying the convexity tolerance; it delegates all optimiser
# work to the wrapped rule and exists so `MLJFlux.train` below can dispatch on it:
struct ConvexRule <: Optimisers.AbstractRule
    rule::Optimisers.AbstractRule
    tol::Real
end
function ConvexRule(rule::Optimisers.AbstractRule; tol = 1e-6)
    return ConvexRule(rule, tol)
end

Optimisers.init(o::ConvexRule, x::AbstractArray) = Optimisers.init(o.rule, x)
function Optimisers.apply!(o::ConvexRule, mvel, x::AbstractArray{T}, dx) where {T}
    return Optimisers.apply!(o.rule, mvel, x, dx)
end
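# A minimal sketch of wrapping a standard optimiser (the Adam settings are
# illustrative only):
#
#     rule = ConvexRule(Optimisers.Adam(1e-3); tol = 1e-6)
#     opt_state = Optimisers.setup(rule, model)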
"""
function make_convex!(chain::PairwiseFusion; tol = 1e-6)
Make a `PairwiseFusion` model convex by making sure all dense layers (a part from the first) have positive weights.
This procedure only makes sense for single output fully connected models.
"""
function make_convex!(chain::PairwiseFusion; tol = 1e-6)
    for layer in chain.layers[2:end]
        layer.weight .= max.(layer.weight, tol)
    end
end
"""
function make_convex!(model::FullyConnected; tol = 1e-6)
Make a `FullyConnected` model convex by making sure all dense layers (a part from the first) have positive weights.
This procedure only makes sense for single output fully connected models.
"""
function make_convex!(model::FullyConnected; tol = 1e-6)
    return make_convex!(model.layers; tol = tol)
end

function make_convex!(model::Chain; tol = 1e-6)
    for i = 1:length(model.layers)
        make_convex!(model.layers[i]; tol = tol)
    end
end
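# Why this yields input convexity (a sketch): with `relu` activations each layer
# is convex and non-decreasing in its input whenever its weights are
# non-negative, so the composition stays convex as long as every layer after
# the first has positive weights; the first layer may keep signed weights,
# since composing a convex function with an affine map preserves convexity.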
function MLJFlux.train(
    model,
    chain,
    optimiser::ConvexRule,
    optimiser_state,
    epochs,
    verbosity,
    X,
    y,
)
    loss = model.loss

    # Initialize and start the progress meter:
    meter = MLJFlux.Progress(
        epochs + 1,
        dt = 0,
        desc = "Optimising neural net:",
        barglyphs = MLJFlux.BarGlyphs("[=> ]"),
        barlen = 25,
        color = :yellow,
    )
    verbosity != 1 || MLJFlux.next!(meter)

    # Initiate history:
    n_batches = length(y)
    losses = (loss(chain(X[i]), y[i]) for i = 1:n_batches)
    history = [mean(losses)]

    for i = 1:epochs
        chain, optimiser_state, current_loss =
            MLJFlux.train_epoch(model, chain, optimiser, optimiser_state, X, y)
        # Re-project the weights so the network stays convex after the update
        make_convex!(chain; tol = optimiser.tol)
        verbosity < 2 || @info "Loss is $(round(current_loss; sigdigits=4))"
        verbosity != 1 || MLJFlux.next!(meter)
        push!(history, current_loss)
    end

    return chain, optimiser_state, history
end
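# Design note: Julia dispatch selects this method whenever the optimiser is a
# `ConvexRule`, so passing a `ConvexRule` to an MLJFlux model routes training
# through the convexity projection after every epoch with no other changes.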
function train!(model, loss, opt_state, X, Y; _batchsize = 32, shuffle = true)
    batchsize = min(size(X, 2), _batchsize)
    X = X |> gpu
    Y = Y |> gpu
    data = Flux.DataLoader((X, Y), batchsize = batchsize, shuffle = shuffle)
    for d in data
        # Calculate the gradient with respect to the model parameters
        ∇model, _ = Flux.gradient(model, d...) do m, x, y
            loss(m(x), y)
        end
        # Any code that needs the gradient goes here, e.g. logging it as a
        # histogram with TensorBoardLogger.jl to catch exploding gradients
        # In-place update (`update!`) so the caller's model is actually trained
        opt_state, model = Optimisers.update!(opt_state, model, ∇model)
        # A validation-set check for early stopping could also go here
    end
    return loss(model(X), Y)
end
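# A minimal usage sketch of the manual loop (loss and sizes are illustrative
# only):
#
#     model = FullyConnected(3, [16, 16], 1)
#     opt_state = Optimisers.setup(ConvexRule(Optimisers.Adam(1e-3)), model)
#     for epoch = 1:100
#         final_loss = train!(model, Flux.mse, opt_state, X, Y)
#         make_convex!(model)   # keep the network convex after each epoch
#     end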