From bf1478e538bc52051b3e3394de24f9be75820ce8 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 23 May 2025 09:52:14 -0400 Subject: [PATCH 01/13] WIP optimizer refactor w/ pointers --- src/nf/nf_dense_layer.f90 | 7 +++ src/nf/nf_dense_layer_submodule.f90 | 9 ++++ src/nf/nf_network_submodule.f90 | 17 +++++-- src/nf/nf_optimizers.f90 | 76 ++++++++++++++++++++--------- 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..462434f6 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -36,6 +36,7 @@ module nf_dense_layer procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,6 +97,12 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..d0ac015a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,6 +77,15 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + w_ptr => self % weights + b_ptr => self % biases + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(dense_layer), intent(in), target :: self real, allocatable :: gradients(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d8f5ff50..e7c39716 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,6 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), gradient(:) integer :: n ! Passing the optimizer instance is optional. If not provided, and if the @@ -693,9 +694,19 @@ module subroutine update(self, optimizer, batch_size) end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) + !params = self % get_params() + !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + !call self % set_params(params) + + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + !call this_layer % set_params(weights, biases) + end select + end do + ! Flush network gradients to zero. 
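The commented-out get_params / minimize / set_params sequence above is the copy-based path this patch starts to replace: get_params gathers every parameter into a freshly allocated flat array, the optimizer updates the copy, and set_params copies the result back into the layer. With get_params_ptr the optimizer instead writes straight into the layer's own weights and biases storage. A minimal standalone sketch of that idea, with illustrative names that are not part of the library:

    program inplace_update_sketch
      implicit none
      real, allocatable, target :: biases(:)
      real, pointer :: b_ptr(:)
      biases = [1.0, 2.0, 3.0]
      b_ptr => biases                     ! what a get_params_ptr-style getter hands out
      call sgd_step(b_ptr, [0.5, 0.5, 0.5])
      print *, biases                     ! 0.95 1.95 2.95: layer storage already updated
    contains
      pure subroutine sgd_step(param, gradient)
        real, intent(inout) :: param(:)
        real, intent(in) :: gradient(:)
        param = param - 0.1 * gradient    ! updates the caller's array in place, no copy made
      end subroutine sgd_step
    end program inplace_update_sketch

Because b_ptr aliases biases, the update made inside sgd_step lands directly in the array it points to, which is the effect the pointer getters are after.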
do n = 2, size(self % layers) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..1caf8c1e 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -30,11 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, param, gradient) + pure subroutine minimize(self, weights, biases, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) end subroutine minimize end interface @@ -116,12 +117,13 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, param, gradient) + pure subroutine minimize_sgd(self, weights, biases, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -129,14 +131,18 @@ pure subroutine minimize_sgd(self, param, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + weights = weights + self % momentum * self % velocity & + - self % learning_rate * gradient + biases = biases + self % momentum * self % velocity & - self % learning_rate * gradient else - param = param + self % velocity + weights = weights + self % velocity + biases = biases + self % velocity end if else ! Apply regular update - param = param - self % learning_rate * gradient + weights = weights - self % learning_rate * gradient + biases = biases - self % learning_rate * gradient end if end subroutine minimize_sgd @@ -152,18 +158,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, param, gradient) + pure subroutine minimize_rmsprop(self, weights, biases, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & + weights = weights - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + biases = biases - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient end subroutine minimize_rmsprop @@ -180,17 +189,18 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, param, gradient) + pure subroutine minimize_adam(self, weights, biases, gradient) !! Concrete implementation of an Adam optimizer update rule. 
class(adam), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * param) + associate(g => gradient + self % weight_decay_l2 * weights) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -202,9 +212,15 @@ pure subroutine minimize_adam(self, param, gradient) ) ! Update parameters. - param = param & + weights = weights & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * param) + + self % weight_decay_decoupled * weights) + + ! Update biases (without weight decay for biases) + associate(g => gradient) + biases = biases & + - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) + end associate end associate @@ -221,19 +237,21 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, param, gradient) + pure subroutine minimize_adagrad(self, weights, biases, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Update the current time step self % t = self % t + 1 + ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * param, & + g => gradient + self % weight_decay_l2 * weights, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -241,10 +259,20 @@ pure subroutine minimize_adagrad(self, param, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate + + ! 
For biases (without weight decay) + associate( & + g => gradient, & + learning_rate => self % learning_rate & + / (1 + (self % t - 1) * self % learning_rate_decay) & + ) + biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & + + self % epsilon) + end associate end subroutine minimize_adagrad From 38896cc57abc017987f8b46b9650cb0ec3151545 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 11:53:57 -0400 Subject: [PATCH 02/13] WIP optimizer optimization --- src/nf/nf_dense_layer.f90 | 7 + src/nf/nf_dense_layer_submodule.f90 | 9 ++ src/nf/nf_network_submodule.f90 | 6 +- src/nf/nf_optimizers.f90 | 201 ++++++++++++++++++++-------- 4 files changed, 164 insertions(+), 59 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 462434f6..ba6c33c4 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -34,6 +34,7 @@ module nf_dense_layer procedure :: backward procedure :: forward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params procedure :: get_params_ptr @@ -112,6 +113,12 @@ module function get_gradients(self) result(gradients) !! Gradients of this layer end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index d0ac015a..a1ca6ce5 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -102,6 +102,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + dw_ptr => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e7c39716..1d36c5e8 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:), biases(:), gradient(:) + real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the @@ -702,7 +702,9 @@ module subroutine update(self, optimizer, batch_size) select type(this_layer => self % layers(n) % p) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) - call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) !call this_layer % set_params(weights, biases) end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 1caf8c1e..400fbfa2 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,7 +19,9 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize), deferred :: minimize + procedure(minimize_1d), deferred :: minimize_1d + procedure(minimize_2d), deferred :: minimize_2d + generic :: minimize => minimize_1d, minimize_2d end type optimizer_base_type abstract interface @@ -30,13 +32,19 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, weights, biases, gradient) + pure subroutine minimize_1d(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) - end subroutine minimize + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + end subroutine minimize_1d + + pure subroutine minimize_2d(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + end subroutine minimize_2d end interface @@ -47,7 +55,8 @@ end subroutine minimize real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize => minimize_sgd + procedure :: minimize_1d => minimize_sgd_1d + procedure :: minimize_2d => minimize_sgd_2d end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -62,7 +71,8 @@ end subroutine minimize real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize => minimize_rmsprop + procedure :: minimize_1d => minimize_rmsprop_1d + procedure :: minimize_2d => minimize_rmsprop_2d end type rmsprop type, extends(optimizer_base_type) :: adam @@ -85,7 +95,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize => minimize_adam + procedure :: minimize_1d => minimize_adam_1d + procedure :: minimize_2d => minimize_adam_2d end type adam type, extends(optimizer_base_type) :: adagrad @@ -102,7 +113,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize => minimize_adagrad + procedure :: minimize_1d => minimize_adagrad_1d + procedure :: minimize_2d => minimize_adagrad_2d end type adagrad contains @@ -117,13 +129,12 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, weights, biases, gradient) + pure subroutine minimize_sgd_1d(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. 
class(sgd), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -131,21 +142,17 @@ pure subroutine minimize_sgd(self, weights, biases, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - weights = weights + self % momentum * self % velocity & - - self % learning_rate * gradient - biases = biases + self % momentum * self % velocity & + param = param + self % momentum * self % velocity & - self % learning_rate * gradient else - weights = weights + self % velocity - biases = biases + self % velocity + param = param + self % velocity end if else ! Apply regular update - weights = weights - self % learning_rate * gradient - biases = biases - self % learning_rate * gradient + param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd + end subroutine minimize_sgd_1d impure elemental subroutine init_rmsprop(self, num_params) @@ -158,24 +165,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, weights, biases, gradient) + pure subroutine minimize_rmsprop_1d(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - weights = weights - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient - biases = biases - self % learning_rate & + param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop + end subroutine minimize_rmsprop_1d impure elemental subroutine init_adam(self, num_params) @@ -189,18 +193,17 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, weights, biases, gradient) + pure subroutine minimize_adam_1d(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * weights) + associate(g => gradient + self % weight_decay_l2 * param) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -212,19 +215,13 @@ pure subroutine minimize_adam(self, weights, biases, gradient) ) ! Update parameters. - weights = weights & + param = param & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * weights) - - ! 
Update biases (without weight decay for biases) - associate(g => gradient) - biases = biases & - - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) - end associate + + self % weight_decay_decoupled * param) end associate - end subroutine minimize_adam + end subroutine minimize_adam_1d impure elemental subroutine init_adagrad(self, num_params) @@ -237,21 +234,19 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, weights, biases, gradient) + pure subroutine minimize_adagrad_1d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Update the current time step self % t = self % t + 1 - ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * weights, & + g => gradient + self % weight_decay_l2 * param, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -259,21 +254,113 @@ pure subroutine minimize_adagrad(self, weights, biases, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate - - ! For biases (without weight decay) + + end subroutine minimize_adagrad_1d + + + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. 
+ class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. + param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + + pure subroutine minimize_adagrad_2d(self, param, gradient) + !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. + class(adagrad), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Update the current time step + self % t = self % t + 1 + associate( & - g => gradient, & + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adagrad. + g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & + ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & ) - biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & - + self % epsilon) + + self % sum_squared_gradient = self % sum_squared_gradient + g**2 + + param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & + + self % epsilon), shape(param)) + end associate - end subroutine minimize_adagrad + end subroutine minimize_adagrad_2d end module nf_optimizers From 21c5707af2e7f0b7cbc816e9378848ea06c9a591 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 13:57:48 -0400 Subject: [PATCH 03/13] Send the data to optimizer without a copy works for dense layers --- src/nf/nf_network_submodule.f90 | 12 +-- src/nf/nf_optimizers.f90 | 150 ++++++++++++++++---------------- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 1d36c5e8..eccea580 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -694,10 +694,6 @@ module subroutine update(self, optimizer, batch_size) end do #endif - !params = self % get_params() - !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - !call self % set_params(params) - do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) @@ -705,11 +701,15 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - !call this_layer % set_params(weights, biases) + type is(locally_connected1d_layer) + !TODO + type is(conv1d_layer) + !TODO + type is(conv2d_layer) + !TODO end select end do - ! Flush network gradients to zero. 
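The update loop above, which calls minimize once with the rank-2 dw and once with the rank-1 db, leans on the generic binding added to nf_optimizers in the previous patch: generic :: minimize => minimize_1d, minimize_2d lets the same call self % optimizer % minimize(...) accept both, with the compiler selecting the specific routine from the rank of the arguments. A reduced sketch of that mechanism, using a hypothetical toy_optimizer type rather than the library's optimizer classes:

    module generic_minimize_sketch
      implicit none
      type :: toy_optimizer
        real :: learning_rate = 0.01
      contains
        procedure :: minimize_1d
        procedure :: minimize_2d
        generic :: minimize => minimize_1d, minimize_2d   ! resolved by argument rank
      end type toy_optimizer
    contains
      pure subroutine minimize_1d(self, param, gradient)
        class(toy_optimizer), intent(in) :: self
        real, intent(inout) :: param(:)
        real, intent(in) :: gradient(:)
        param = param - self % learning_rate * gradient
      end subroutine minimize_1d
      pure subroutine minimize_2d(self, param, gradient)
        class(toy_optimizer), intent(in) :: self
        real, intent(inout) :: param(:,:)
        real, intent(in) :: gradient(:,:)
        param = param - self % learning_rate * gradient
      end subroutine minimize_2d
    end module generic_minimize_sketch

Given type(toy_optimizer) :: opt, a call opt % minimize(weights, dw) with rank-2 arguments dispatches to minimize_2d, while the bias update with rank-1 arguments reaches minimize_1d.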
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 400fbfa2..f6759d67 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -155,6 +155,32 @@ pure subroutine minimize_sgd_1d(self, param, gradient) end subroutine minimize_sgd_1d + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + impure elemental subroutine init_rmsprop(self, num_params) class(rmsprop), intent(inout) :: self integer, intent(in) :: num_params @@ -182,6 +208,23 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) end subroutine minimize_rmsprop_1d + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + impure elemental subroutine init_adam(self, num_params) class(adam), intent(inout) :: self integer, intent(in) :: num_params @@ -224,6 +267,37 @@ pure subroutine minimize_adam_1d(self, param, gradient) end subroutine minimize_adam_1d + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. + class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. 
+ param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + impure elemental subroutine init_adagrad(self, num_params) class(adagrad), intent(inout) :: self integer, intent(in) :: num_params @@ -262,80 +336,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end subroutine minimize_adagrad_1d - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. - param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d - - pure subroutine minimize_adagrad_2d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. 
class(adagrad), intent(inout) :: self @@ -363,4 +363,4 @@ pure subroutine minimize_adagrad_2d(self, param, gradient) end subroutine minimize_adagrad_2d -end module nf_optimizers +end module nf_optimizers \ No newline at end of file From 9d68828f7e29d66f435a6701996f1cb65f08416e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 30 May 2025 13:47:28 -0400 Subject: [PATCH 04/13] Get weights and weight gradients as 1d --- src/nf/nf_dense_layer.f90 | 8 +- src/nf/nf_dense_layer_submodule.f90 | 12 +-- src/nf/nf_network_submodule.f90 | 2 +- src/nf/nf_optimizers.f90 | 145 +++------------------------- 4 files changed, 26 insertions(+), 141 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ba6c33c4..a55ec892 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -100,8 +100,8 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr module function get_gradients(self) result(gradients) @@ -115,8 +115,8 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a1ca6ce5..bb27c54a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -79,9 +79,9 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) - w_ptr => self % weights + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights b_ptr => self % biases end subroutine get_params_ptr @@ -104,9 +104,9 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) - dw_ptr => self % dw + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw db_ptr => self % db end subroutine get_gradients_ptr diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index eccea580..3508ec50 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index f6759d67..24089ccd 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,9 +19,7 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize_1d), deferred :: minimize_1d - procedure(minimize_2d), deferred :: minimize_2d - generic :: minimize => minimize_1d, minimize_2d + procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface @@ -32,19 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize_1d(self, param, gradient) + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) - end subroutine minimize_1d - - pure subroutine minimize_2d(self, param, gradient) - import :: optimizer_base_type - class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - end subroutine minimize_2d + end subroutine minimize end interface @@ -55,8 +46,7 @@ end subroutine minimize_2d real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize_1d => minimize_sgd_1d - procedure :: minimize_2d => minimize_sgd_2d + procedure :: minimize => minimize_sgd end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -71,8 +61,7 @@ end subroutine minimize_2d real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize_1d => minimize_rmsprop_1d - procedure :: minimize_2d => minimize_rmsprop_2d + procedure :: minimize => minimize_rmsprop end type rmsprop type, extends(optimizer_base_type) :: adam @@ -95,8 +84,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize_1d => minimize_adam_1d - procedure :: minimize_2d => minimize_adam_2d + procedure :: minimize => minimize_adam end type adam type, extends(optimizer_base_type) :: adagrad @@ -113,8 +101,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize_1d => minimize_adagrad_1d - procedure :: minimize_2d => minimize_adagrad_2d + procedure :: minimize => minimize_adagrad end type adagrad contains @@ -129,7 +116,7 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd_1d(self, param, gradient) + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self @@ -152,33 +139,7 @@ pure subroutine minimize_sgd_1d(self, param, gradient) param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd_1d - - - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! 
Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d + end subroutine minimize_sgd impure elemental subroutine init_rmsprop(self, num_params) @@ -191,7 +152,7 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop_1d(self, param, gradient) + pure subroutine minimize_rmsprop(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) @@ -205,24 +166,7 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop_1d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d + end subroutine minimize_rmsprop impure elemental subroutine init_adam(self, num_params) @@ -236,7 +180,7 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam_1d(self, param, gradient) + pure subroutine minimize_adam(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self real, intent(inout) :: param(:) @@ -264,38 +208,7 @@ pure subroutine minimize_adam_1d(self, param, gradient) end associate - end subroutine minimize_adam_1d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. 
- param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d + end subroutine minimize_adam impure elemental subroutine init_adagrad(self, num_params) @@ -308,7 +221,7 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad_1d(self, param, gradient) + pure subroutine minimize_adagrad(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) @@ -333,34 +246,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end associate - end subroutine minimize_adagrad_1d - - - pure subroutine minimize_adagrad_2d(self, param, gradient) - !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. - class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Update the current time step - self % t = self % t + 1 - - associate( & - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adagrad. - g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & - ! Amortize the learning rate as function of the current time step. - learning_rate => self % learning_rate & - / (1 + (self % t - 1) * self % learning_rate_decay) & - ) - - self % sum_squared_gradient = self % sum_squared_gradient + g**2 - - param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & - + self % epsilon), shape(param)) - - end associate - - end subroutine minimize_adagrad_2d + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From 2160f97f8a6ffac1b62f6f25e38b752c4ba2d65b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Thu, 19 Jun 2025 23:49:05 -0400 Subject: [PATCH 05/13] get_params_ptr and get_gradients_ptr for conv1d, conv2d, and locally_connected1d --- src/nf/nf_conv1d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_conv2d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv2d_layer_submodule.f90 | 18 ++++++++++++ src/nf/nf_locally_connected1d_layer.f90 | 14 +++++++++ ...nf_locally_connected1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_network_submodule.f90 | 29 +++++++++---------- 7 files changed, 122 insertions(+), 15 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..871eef02 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -32,8 +32,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! 
The gradients are ordered as weights first, biases second. @@ -106,6 +118,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..05bcde70 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,6 +152,14 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv1d_layer), intent(in), target :: self real, allocatable :: gradients(:) @@ -160,6 +168,14 @@ module function get_gradients(self) result(gradients) gradients = [ dw_, self % db ] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..3f7b28db 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -33,8 +33,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,6 +100,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -107,6 +119,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! 
Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..b617ec34 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,6 +204,15 @@ module function get_params(self) result(params) end function get_params + + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv2d_layer), intent(in), target :: self @@ -221,6 +230,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 index beca76d5..6fea2c5c 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -32,8 +32,10 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,12 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -106,6 +114,12 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(locally_connected1d_layer), intent(in out) :: self diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 index 053c520b..fa6110d5 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -128,12 +128,28 @@ module function get_params(self) result(params) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(locally_connected1d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(locally_connected1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 3508ec50..60c0e151 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -701,28 +701,27 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - type is(locally_connected1d_layer) - !TODO - type is(conv1d_layer) - !TODO - type is(conv2d_layer) - !TODO - end select - end do - - ! Flush network gradients to zero. 
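Every get_params_ptr / get_gradients_ptr implementation above uses the same Fortran 2008 feature, a bounds-remapping pointer assignment such as w_ptr(1:size(self % kernel)) => self % kernel, which presents a contiguous multi-dimensional array as a flat rank-1 view without copying it. That flat view is what lets one rank-1 optimizer update serve dense, conv1d, conv2d, and locally_connected1d layers alike. A standalone sketch of the rule, with illustrative names, assuming a contiguous target (an allocatable array always is):

    program remap_sketch
      implicit none
      real, allocatable, target :: kernel(:,:)
      real, pointer :: w_flat(:)
      allocate(kernel(2, 3))
      kernel = reshape([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [2, 3])
      ! Bounds-remapping pointer assignment: a rank-1 view of the rank-2 array.
      w_flat(1:size(kernel)) => kernel
      w_flat = w_flat - 0.1                 ! updates kernel in place, in column-major order
      print *, kernel(1, 1), kernel(2, 3)   ! 0.9 and 5.9
    end program remap_sketch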
- do n = 2, size(self % layers) - select type(this_layer => self % layers(n) % p) - type is(dense_layer) this_layer % dw = 0 this_layer % db = 0 - type is(conv2d_layer) + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From 0e11f1016828f229dbb5d1f50d7c573ff9a9c918 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 20 Jun 2025 13:59:22 -0400 Subject: [PATCH 06/13] Define optimizer instance per layer to preserve memory across layers --- src/nf/nf_layer.f90 | 1 + src/nf/nf_network_submodule.f90 | 46 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..b12592f3 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,6 +22,7 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 60c0e151..876070bc 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -597,12 +597,26 @@ module subroutine train(self, input_data, output_data, batch_size, & ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if call self % optimizer % init(self % get_num_params()) + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). if (present(loss)) then @@ -662,10 +676,26 @@ module subroutine update(self, optimizer, batch_size) if (.not. 
allocated(self % optimizer)) then if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if + call self % optimizer % init(self % get_num_params()) + + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + end if if (present(batch_size)) then @@ -699,29 +729,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From dc55df09d872b4373ca8894523babab7fbdf8416 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 4 Jul 2025 10:10:27 -0400 Subject: [PATCH 07/13] Initialization of network-wide optimizer no longer needed now that we switched to per-layer optimizer instances --- src/nf/nf_network_submodule.f90 | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 876070bc..f434eab0 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -611,8 +611,6 @@ module subroutine train(self, input_data, output_data, batch_size, & end if - call self % optimizer % init(self % get_num_params()) - do n = 1, size(self % layers) call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do @@ -690,8 +688,6 @@ module subroutine update(self, optimizer, batch_size) end if - call self % optimizer % init(self % get_num_params()) - do n = 1, size(self % layers) call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do @@ -729,29 +725,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call 
this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From e9ba73e8f81cfecd0d8244929a7294f81bfc64f2 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Mon, 28 Jul 2025 14:55:18 -0400 Subject: [PATCH 08/13] Bookkeeping for velocity, rms_gradient, etc.; optimizer tests now pass --- src/nf/nf_optimizers.f90 | 88 ++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 24089ccd..2926c959 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -44,6 +44,7 @@ end subroutine minimize real :: momentum = 0 logical :: nesterov = .false. real, allocatable, private :: velocity(:) + integer, private :: start_index = 1 contains procedure :: init => init_sgd procedure :: minimize => minimize_sgd @@ -59,6 +60,7 @@ end subroutine minimize real :: decay_rate = 0.9 real :: epsilon = 1e-8 real, allocatable, private :: rms_gradient(:) + integer, private :: start_index = 1 contains procedure :: init => init_rmsprop procedure :: minimize => minimize_rmsprop @@ -82,6 +84,7 @@ end subroutine minimize real :: weight_decay_decoupled = 0 ! decoupled weight decay regularization (AdamW) real, allocatable, private :: m(:), v(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adam procedure :: minimize => minimize_adam @@ -99,6 +102,7 @@ end subroutine minimize real :: learning_rate_decay = 0 real, allocatable, private :: sum_squared_gradient(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adagrad procedure :: minimize => minimize_adagrad @@ -121,19 +125,38 @@ pure subroutine minimize_sgd(self, param, gradient) !! update rule. 
class(sgd), intent(inout) :: self real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(in) :: gradient(:) ! Always the same size as param + integer :: end_index if (self % momentum > 0) then + + ! end_index is part of the bookkeeping for updating velocity because each + ! batch update makes two calls to minimize, one for the weights and one for + ! the biases. + ! We use start_index and end_index to update the appropriate sections + ! of the velocity array. + end_index = self % start_index + size(param) - 1 + ! Apply momentum update - self % velocity = self % momentum * self % velocity & + self % velocity(self % start_index:end_index) = & + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + param = param + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient else - param = param + self % velocity + param = param + self % velocity(self % start_index:end_index) + end if + + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 end if + else ! Apply regular update param = param - self % learning_rate * gradient @@ -157,14 +180,27 @@ pure subroutine minimize_rmsprop(self, param, gradient) class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & + self % rms_gradient(self % start_index:end_index) = & + self % decay_rate * self % rms_gradient(self % start_index:end_index) & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient param = param - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient + / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & + * gradient + + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if end subroutine minimize_rmsprop @@ -185,20 +221,27 @@ pure subroutine minimize_adam(self, param, gradient) class(adam), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. associate(g => gradient + self % weight_decay_l2 * param) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + self % m(self % start_index:end_index) = & + self % beta1 * self % m(self % start_index:end_index) & + + (1 - self % beta1) * g + self % v(self % start_index:end_index) = & + self % beta2 * self % v(self % start_index:end_index) & + + (1 - self % beta2) * g**2 end associate ! Compute bias-corrected first and second moment estimates. 
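    ! Illustrative note with hypothetical sizes: for a dense layer with a
    ! 3 x 4 weight matrix and 4 biases, the per-layer optimizer is initialized
    ! with num_params = 16, so m and v each hold 16 elements. The first
    ! minimize call of a batch passes the 12 weights and updates m(1:12) and
    ! v(1:12); the second call passes the 4 biases and updates m(13:16) and
    ! v(13:16), after which start_index returns to 1 for the next batch.
    ! The bias-corrected estimates below operate on the same slice.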
associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & + m_hat => self % m(self % start_index:end_index) / (1 - self % beta1**self % t), & + v_hat => self % v(self % start_index:end_index) / (1 - self % beta2**self % t) & ) ! Update parameters. @@ -208,6 +251,14 @@ pure subroutine minimize_adam(self, param, gradient) end associate + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adam @@ -226,6 +277,9 @@ pure subroutine minimize_adagrad(self, param, gradient) class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Update the current time step self % t = self % t + 1 @@ -239,13 +293,23 @@ pure subroutine minimize_adagrad(self, param, gradient) / (1 + (self % t - 1) * self % learning_rate_decay) & ) - self % sum_squared_gradient = self % sum_squared_gradient + g**2 + self % sum_squared_gradient(self % start_index:end_index) = & + self % sum_squared_gradient(self % start_index:end_index) + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g & + / (sqrt(self % sum_squared_gradient(self % start_index:end_index)) & + self % epsilon) end associate + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From ad176ea847ac4fc9a2674d44a3894429a7db26af Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 13:25:06 -0400 Subject: [PATCH 09/13] Update optimizer flow for linear2d --- src/nf/nf_linear2d_layer.f90 | 12 +++++++++ src/nf/nf_linear2d_layer_submodule.f90 | 34 ++++++++++++++------------ 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/nf/nf_linear2d_layer.f90 b/src/nf/nf_linear2d_layer.f90 index f785a14c..f2c8fd16 100644 --- a/src/nf/nf_linear2d_layer.f90 +++ b/src/nf/nf_linear2d_layer.f90 @@ -25,7 +25,9 @@ module nf_linear2d_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type linear2d_layer @@ -64,11 +66,21 @@ module function get_params(self) result(params) real, allocatable :: params(:) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(linear2d_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_linear2d_layer_submodule.f90 
b/src/nf/nf_linear2d_layer_submodule.f90 index 0dfe7e27..513527f0 100644 --- a/src/nf/nf_linear2d_layer_submodule.f90 +++ b/src/nf/nf_linear2d_layer_submodule.f90 @@ -82,33 +82,35 @@ end function get_num_params module function get_params(self) result(params) class(linear2d_layer), intent(in), target :: self real, allocatable :: params(:) - real, pointer :: w_(:) => null() + w_(1: size(self % weights)) => self % weights + params = [w_, self % biases] + end function get_params - w_(1: product(shape(self % weights))) => self % weights - - params = [ & - w_, & - self % biases & - ] - end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() + dw_(1:size(self % dw)) => self % dw + gradients = [dw_, self % db] + end function get_gradients - dw_(1: product(shape(self % dw))) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) From e5072d3cd94ded9f97dd8781af38841cc3b16ee1 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 13:43:30 -0400 Subject: [PATCH 10/13] Update optimizer flow for layernorm --- src/nf/nf_layernorm.f90 | 14 ++++++++++++++ src/nf/nf_layernorm_submodule.f90 | 26 ++++++++++++++++---------- src/nf/nf_network_submodule.f90 | 14 ++++++++++++++ test/test_layernorm.f90 | 30 +++++++++++++++++------------- 4 files changed, 61 insertions(+), 23 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 36ef56f0..7bffc06a 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -38,7 +38,9 @@ module nf_layernorm_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type layernorm_layer @@ -78,12 +80,24 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(layernorm_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4eaa4382..5e357b33 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -112,25 +112,31 @@ end function get_num_params module function get_params(self) result(params) class(layernorm_layer), 
intent(in), target :: self real, allocatable :: params(:) + params = [self % gamma, self % beta] + end function get_params - params = [ & - self % gamma, & - self % beta & - ] - end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + g_ptr => self % gamma + b_ptr => self % beta + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) + gradients = [self % d_gamma, self % d_beta] + end function get_gradients - gradients = [ & - self % d_gamma, & - self % d_beta & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + dg_ptr => self % d_gamma + db_ptr => self % d_beta + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index f434eab0..76937ade 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -750,6 +750,20 @@ module subroutine update(self, optimizer, batch_size) call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 + type is(linear2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % dw = 0 + this_layer % db = 0 + type is(layernorm_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % d_gamma = 0 + this_layer % d_beta = 0 end select end do diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 6a897575..9e8bfccf 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -27,14 +27,14 @@ program test_layernorm_instance end if contains - function allclose(x, y) result(res) - real, intent(in) :: x(:) - real, intent(in) :: y(:) - logical :: res - res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + logical function allclose(x, y) result(res) + real, intent(in) :: x(:), y(:) + !res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + res = all(abs(x - y) <= 1e-05) end function allclose + subroutine test_layernorm_forward(layernorm_instance, input, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -61,6 +61,7 @@ subroutine test_layernorm_forward(layernorm_instance, input, ok) end if end subroutine test_layernorm_forward + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -103,6 +104,7 @@ subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) @@ -152,6 +154,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) end if end subroutine test_layernorm_gradients + subroutine test_layernorm_integration(ok) 
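  ! Integration test: build a small input -> linear2d -> layernorm -> flatten
  ! network and verify that repeated forward/backward/update calls with SGD
  ! bring the predictions to within tolerance of the target y.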
logical, intent(in out) :: ok @@ -160,13 +163,13 @@ subroutine test_layernorm_integration(ok) real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] real :: tolerance = 0.1 integer :: epoch - integer :: epochs = 10000 + integer, parameter :: num_epochs = 100000 - net = network([& - input(2, 3),& - linear2d(3),& - layernorm(),& - flatten()& + net = network([ & + input(2, 3), & + linear2d(3), & + layernorm(), & + flatten() & ]) ! Kaiming weights to achieve semblance of convergance @@ -177,17 +180,18 @@ subroutine test_layernorm_integration(ok) l % biases = 0.2 end select - do epoch = 1, epochs + do epoch = 1, num_epochs call net % forward(x) call net % backward(y) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - if (.not. epoch <= epochs) then + if (.not. epoch <= num_epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' ok = .false. end if end subroutine test_layernorm_integration + end program test_layernorm_instance From 86ed7b3a7ee68d9204055e70e34ab13f28a3e9d9 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 14:03:06 -0400 Subject: [PATCH 11/13] Previous bookkeeping for successive calls to optim % minimize() assumed 2 calls per batch; this is now generalized to allow any number of calls until size(params) is exhausted --- src/nf/nf_optimizers.f90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 2926c959..9a6b1e1f 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -149,7 +149,7 @@ pure subroutine minimize_sgd(self, param, gradient) param = param + self % velocity(self % start_index:end_index) end if - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -194,7 +194,7 @@ pure subroutine minimize_rmsprop(self, param, gradient) / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & * gradient - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -251,7 +251,7 @@ pure subroutine minimize_adam(self, param, gradient) end associate - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -302,7 +302,7 @@ pure subroutine minimize_adagrad(self, param, gradient) end associate - if (self % start_index == 1) then + if (end_index < size(param)) then ! 
We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else From 309ef6e82e1f665fcab9c78b6c5b5574d253f322 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 14:19:33 -0400 Subject: [PATCH 12/13] Remove get_gradients from network, layer, dense, conv1d, conv2d --- src/nf/nf_conv1d_layer.f90 | 10 ------- src/nf/nf_conv1d_layer_submodule.f90 | 8 ----- src/nf/nf_conv2d_layer.f90 | 10 ------- src/nf/nf_conv2d_layer_submodule.f90 | 16 ---------- src/nf/nf_dense_layer.f90 | 10 ------- src/nf/nf_dense_layer_submodule.f90 | 16 ---------- src/nf/nf_layer.f90 | 9 ------ src/nf/nf_layer_submodule.f90 | 44 ---------------------------- src/nf/nf_network.f90 | 9 ------ src/nf/nf_network_submodule.f90 | 19 ------------ 10 files changed, 151 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index 871eef02..65f82347 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -31,7 +31,6 @@ module nf_conv1d_layer procedure :: forward procedure :: backward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -109,15 +108,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) !! Pointer to the biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(conv1d_layer), intent(in), target :: self - !! A `conv1d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) !! Return pointers to the gradients of this layer. class(conv1d_layer), intent(in), target :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 05bcde70..98856689 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -160,14 +160,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) b_ptr => self % biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(conv1d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - gradients = [ dw_, self % db ] - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(conv1d_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 3f7b28db..d6c92c31 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -32,7 +32,6 @@ module nf_conv2d_layer procedure :: forward procedure :: backward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -110,15 +109,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) !! Pointer to the biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(conv2d_layer), intent(in), target :: self - !! A `conv2d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) !! Return pointers to the gradients of this layer. 
class(conv2d_layer), intent(in), target :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index b617ec34..56b398fc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -214,22 +214,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(conv2d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() - - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients - - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(conv2d_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index a55ec892..e93a57ca 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,7 +33,6 @@ module nf_dense_layer procedure :: backward procedure :: forward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -104,15 +103,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(dense_layer), intent(in), target :: self - !! Dense layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index bb27c54a..c2f7e236 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -86,22 +86,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(dense_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() - - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients - - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index b12592f3..79569845 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -29,7 +29,6 @@ module nf_layer procedure :: forward procedure :: get_num_params procedure :: get_params - procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info @@ -161,14 +160,6 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Returns the gradients of this layer. - class(layer), intent(in) :: self - !! Layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine set_params(self, params) !! Returns the parameters of this layer. 
class(layer), intent(in out) :: self diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index eebedaa9..778d227a 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -682,50 +682,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(layer), intent(in) :: self - real, allocatable :: gradients(:) - - select type (this_layer => self % p) - type is (input1d_layer) - ! No gradients to get. - type is (input2d_layer) - ! No gradients to get. - type is (input3d_layer) - ! No gradients to get. - type is (dense_layer) - gradients = this_layer % get_gradients() - type is (dropout_layer) - ! No gradients to get. - type is (conv1d_layer) - gradients = this_layer % get_gradients() - type is (conv2d_layer) - gradients = this_layer % get_gradients() - type is (locally_connected1d_layer) - gradients = this_layer % get_gradients() - type is (maxpool1d_layer) - ! No gradients to get. - type is (maxpool2d_layer) - ! No gradients to get. - type is (flatten_layer) - ! No gradients to get. - type is (reshape2d_layer) - ! No parameters to get. - type is (reshape3d_layer) - ! No gradients to get. - type is (linear2d_layer) - gradients = this_layer % get_gradients() - type is (self_attention_layer) - gradients = this_layer % get_gradients() - type is (embedding_layer) - gradients = this_layer % get_gradients() - type is (layernorm_layer) - gradients = this_layer % get_gradients() - class default - error stop 'Unknown layer type.' - end select - - end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 2bd7ce8c..ac165adf 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -21,7 +21,6 @@ module nf_network contains procedure :: backward - procedure :: get_gradients procedure :: get_num_params procedure :: get_params procedure :: print_info @@ -216,7 +215,6 @@ module integer function get_num_params(self) !! Network instance end function get_num_params - module function get_params(self) result(params) !! Get the network parameters (weights and biases). class(network), intent(in) :: self @@ -225,13 +223,6 @@ module function get_params(self) result(params) !! Network parameters to get end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - !! Network instance - real, allocatable :: gradients(:) - !! Network gradients to set - end function get_gradients - module subroutine set_params(self, params) !! Set the network parameters (weights and biases). 
class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 76937ade..d550f264 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -524,25 +524,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - real, allocatable :: gradients(:) - integer :: n, nstart, nend - - allocate(gradients(self % get_num_params())) - - nstart = 1 - do n = 1, size(self % layers) - - if (self % layers(n) % get_num_params() < 1) cycle - - nend = nstart + self % layers(n) % get_num_params() - 1 - gradients(nstart:nend) = self % layers(n) % get_gradients() - nstart = nend + 1 - end do - - end function get_gradients - module subroutine set_params(self, params) class(network), intent(in out) :: self From e61f29ed6955ed12c5001ae48982ee5d5f4affd6 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Wed, 30 Jul 2025 12:19:55 -0400 Subject: [PATCH 13/13] Remove optimizer as component to the network class --- src/nf/nf_network.f90 | 1 - src/nf/nf_network_submodule.f90 | 51 +++++++-------------------------- 2 files changed, 10 insertions(+), 42 deletions(-) diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index ac165adf..2743ff5b 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -16,7 +16,6 @@ module nf_network type(layer), allocatable :: layers(:) class(loss_type), allocatable :: loss - class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d550f264..df4498be 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -574,27 +574,8 @@ module subroutine train(self, input_data, output_data, batch_size, & integer :: i, j, n integer :: istart, iend, indices(2) - ! Passing the optimizer instance is optional. - ! If not provided, we default to SGD with its default settings. - if (present(optimizer)) then - self % optimizer = optimizer - - do n = 1, size(self % layers) - self % layers(n) % optimizer = optimizer - end do - - else - self % optimizer = sgd() - - do n = 1, size(self % layers) - self % layers(n) % optimizer = sgd() - end do - - end if - - do n = 1, size(self % layers) - call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) - end do + ! The optional optimizer instance is passed through to the update() method + ! where it is optional as well. ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). @@ -628,7 +609,7 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - call self % update(batch_size=batch_size) + call self % update(optimizer=optimizer, batch_size=batch_size) end do batch_loop end do epoch_loop @@ -645,34 +626,22 @@ module subroutine update(self, optimizer, batch_size) real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n - ! Passing the optimizer instance is optional. If not provided, and if the - ! optimizer has not already been set, we default to the default SGD. The - ! instantiation and initialization below of the optimizer is normally done - ! at the beginning of the network % train() method. However, if the user - ! wants to call network % update() directly, for example if they use their - ! own custom mini-batching routine, we initialize the optimizer here as - ! well. 
If it's initialized already, this step is a cheap no-op. - if (.not. allocated(self % optimizer)) then + ! You can optionally pass an optimizer instance to the update() method. + ! This is necessary if you're not using the train() method, for example if + ! you're using your own custom mini-batching routine and calling the + ! forward(), backward(), and update() methods directly. + if (.not. allocated(self % layers(1) % optimizer)) then if (present(optimizer)) then - self % optimizer = optimizer - do n = 1, size(self % layers) self % layers(n) % optimizer = optimizer + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do - else - self % optimizer = sgd() - do n = 1, size(self % layers) self % layers(n) % optimizer = sgd() + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do - end if - - do n = 1, size(self % layers) - call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) - end do - end if if (present(batch_size)) then
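      ! When update() is driven directly from a user-managed mini-batch loop,
      ! batch_size is the number of samples whose gradients were accumulated
      ! since the last update, so that the gradients are averaged below.
      ! A minimal sketch with hypothetical names (net, x, y, istart, iend):
      !
      !   do i = istart, iend
      !     call net % forward(x(:,i))
      !     call net % backward(y(:,i))
      !   end do
      !   call net % update(optimizer=sgd(learning_rate=0.01), batch_size=iend-istart+1)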