From bf1478e538bc52051b3e3394de24f9be75820ce8 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 23 May 2025 09:52:14 -0400 Subject: [PATCH 01/13] WIP optimizer refactor w/ pointers --- src/nf/nf_dense_layer.f90 | 7 +++ src/nf/nf_dense_layer_submodule.f90 | 9 ++++ src/nf/nf_network_submodule.f90 | 17 +++++-- src/nf/nf_optimizers.f90 | 76 ++++++++++++++++++++--------- 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 862f4cdf..462434f6 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -36,6 +36,7 @@ module nf_dense_layer procedure :: get_gradients procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -96,6 +97,12 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a424cf9c..d0ac015a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -77,6 +77,15 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: w_ptr(:,:) + real, pointer :: b_ptr(:) + w_ptr => self % weights + b_ptr => self % biases + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(dense_layer), intent(in), target :: self real, allocatable :: gradients(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d8f5ff50..e7c39716 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,6 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) + real, pointer :: weights(:), biases(:), gradient(:) integer :: n ! Passing the optimizer instance is optional. If not provided, and if the @@ -693,9 +694,19 @@ module subroutine update(self, optimizer, batch_size) end do #endif - params = self % get_params() - call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - call self % set_params(params) + !params = self % get_params() + !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) + !call self % set_params(params) + + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p) + type is(dense_layer) + call this_layer % get_params_ptr(weights, biases) + call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + !call this_layer % set_params(weights, biases) + end select + end do + ! Flush network gradients to zero. 
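The commented-out get_params / minimize / set_params sequence above is the copy-based path this patch starts to replace: get_params gathers every parameter into a freshly allocated flat array, the optimizer updates the copy, and set_params copies the result back into the layer. With get_params_ptr the optimizer instead writes straight into the layer's own weights and biases storage. A minimal standalone sketch of that idea, with illustrative names that are not part of the library:

    program inplace_update_sketch
      implicit none
      real, allocatable, target :: biases(:)
      real, pointer :: b_ptr(:)
      biases = [1.0, 2.0, 3.0]
      b_ptr => biases                     ! what a get_params_ptr-style getter hands out
      call sgd_step(b_ptr, [0.5, 0.5, 0.5])
      print *, biases                     ! 0.95 1.95 2.95: layer storage already updated
    contains
      pure subroutine sgd_step(param, gradient)
        real, intent(inout) :: param(:)
        real, intent(in) :: gradient(:)
        param = param - 0.1 * gradient    ! updates the caller's array in place, no copy made
      end subroutine sgd_step
    end program inplace_update_sketch

Because b_ptr aliases biases, the update made inside sgd_step lands directly in the array it points to, which is the effect the pointer getters are after.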
do n = 2, size(self % layers) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index c64cefed..1caf8c1e 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -30,11 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, param, gradient) + pure subroutine minimize(self, weights, biases, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) end subroutine minimize end interface @@ -116,12 +117,13 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, param, gradient) + pure subroutine minimize_sgd(self, weights, biases, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -129,14 +131,18 @@ pure subroutine minimize_sgd(self, param, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + weights = weights + self % momentum * self % velocity & + - self % learning_rate * gradient + biases = biases + self % momentum * self % velocity & - self % learning_rate * gradient else - param = param + self % velocity + weights = weights + self % velocity + biases = biases + self % velocity end if else ! Apply regular update - param = param - self % learning_rate * gradient + weights = weights - self % learning_rate * gradient + biases = biases - self % learning_rate * gradient end if end subroutine minimize_sgd @@ -152,18 +158,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, param, gradient) + pure subroutine minimize_rmsprop(self, weights, biases, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & + weights = weights - self % learning_rate & + / sqrt(self % rms_gradient + self % epsilon) * gradient + biases = biases - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient end subroutine minimize_rmsprop @@ -180,17 +189,18 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, param, gradient) + pure subroutine minimize_adam(self, weights, biases, gradient) !! Concrete implementation of an Adam optimizer update rule. 
class(adam), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * param) + associate(g => gradient + self % weight_decay_l2 * weights) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -202,9 +212,15 @@ pure subroutine minimize_adam(self, param, gradient) ) ! Update parameters. - param = param & + weights = weights & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * param) + + self % weight_decay_decoupled * weights) + + ! Update biases (without weight decay for biases) + associate(g => gradient) + biases = biases & + - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) + end associate end associate @@ -221,19 +237,21 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, param, gradient) + pure subroutine minimize_adagrad(self, weights, biases, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(inout), pointer :: weights(:) + real, intent(inout), pointer :: biases(:) + real, intent(in), pointer :: gradient(:) ! Update the current time step self % t = self % t + 1 + ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * param, & + g => gradient + self % weight_decay_l2 * weights, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -241,10 +259,20 @@ pure subroutine minimize_adagrad(self, param, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate + + ! 
For biases (without weight decay) + associate( & + g => gradient, & + learning_rate => self % learning_rate & + / (1 + (self % t - 1) * self % learning_rate_decay) & + ) + biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & + + self % epsilon) + end associate end subroutine minimize_adagrad From 38896cc57abc017987f8b46b9650cb0ec3151545 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 11:53:57 -0400 Subject: [PATCH 02/13] WIP optimizer optimization --- src/nf/nf_dense_layer.f90 | 7 + src/nf/nf_dense_layer_submodule.f90 | 9 ++ src/nf/nf_network_submodule.f90 | 6 +- src/nf/nf_optimizers.f90 | 201 ++++++++++++++++++++-------- 4 files changed, 164 insertions(+), 59 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index 462434f6..ba6c33c4 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -34,6 +34,7 @@ module nf_dense_layer procedure :: backward procedure :: forward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params procedure :: get_params_ptr @@ -112,6 +113,12 @@ module function get_gradients(self) result(gradients) !! Gradients of this layer end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of this layer. !! The parameters are ordered as weights first, biases second. diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index d0ac015a..a1ca6ce5 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -102,6 +102,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(dense_layer), intent(in), target :: self + real, pointer :: dw_ptr(:,:) + real, pointer :: db_ptr(:) + dw_ptr => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(dense_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index e7c39716..1d36c5e8 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:), biases(:), gradient(:) + real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the @@ -702,7 +702,9 @@ module subroutine update(self, optimizer, batch_size) select type(this_layer => self % layers(n) % p) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) - call self % optimizer % minimize(weights, biases, self % get_gradients() / batch_size_) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) !call this_layer % set_params(weights, biases) end select end do diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 1caf8c1e..400fbfa2 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,7 +19,9 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize), deferred :: minimize + procedure(minimize_1d), deferred :: minimize_1d + procedure(minimize_2d), deferred :: minimize_2d + generic :: minimize => minimize_1d, minimize_2d end type optimizer_base_type abstract interface @@ -30,13 +32,19 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize(self, weights, biases, gradient) + pure subroutine minimize_1d(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) - end subroutine minimize + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) + end subroutine minimize_1d + + pure subroutine minimize_2d(self, param, gradient) + import :: optimizer_base_type + class(optimizer_base_type), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + end subroutine minimize_2d end interface @@ -47,7 +55,8 @@ end subroutine minimize real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize => minimize_sgd + procedure :: minimize_1d => minimize_sgd_1d + procedure :: minimize_2d => minimize_sgd_2d end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -62,7 +71,8 @@ end subroutine minimize real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize => minimize_rmsprop + procedure :: minimize_1d => minimize_rmsprop_1d + procedure :: minimize_2d => minimize_rmsprop_2d end type rmsprop type, extends(optimizer_base_type) :: adam @@ -85,7 +95,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize => minimize_adam + procedure :: minimize_1d => minimize_adam_1d + procedure :: minimize_2d => minimize_adam_2d end type adam type, extends(optimizer_base_type) :: adagrad @@ -102,7 +113,8 @@ end subroutine minimize integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize => minimize_adagrad + procedure :: minimize_1d => minimize_adagrad_1d + procedure :: minimize_2d => minimize_adagrad_2d end type adagrad contains @@ -117,13 +129,12 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd(self, weights, biases, gradient) + pure subroutine minimize_sgd_1d(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. 
class(sgd), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) if (self % momentum > 0) then ! Apply momentum update @@ -131,21 +142,17 @@ pure subroutine minimize_sgd(self, weights, biases, gradient) - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - weights = weights + self % momentum * self % velocity & - - self % learning_rate * gradient - biases = biases + self % momentum * self % velocity & + param = param + self % momentum * self % velocity & - self % learning_rate * gradient else - weights = weights + self % velocity - biases = biases + self % velocity + param = param + self % velocity end if else ! Apply regular update - weights = weights - self % learning_rate * gradient - biases = biases - self % learning_rate * gradient + param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd + end subroutine minimize_sgd_1d impure elemental subroutine init_rmsprop(self, num_params) @@ -158,24 +165,21 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop(self, weights, biases, gradient) + pure subroutine minimize_rmsprop_1d(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Compute the RMS of the gradient using the RMSProp rule self % rms_gradient = self % decay_rate * self % rms_gradient & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient - weights = weights - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient - biases = biases - self % learning_rate & + param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop + end subroutine minimize_rmsprop_1d impure elemental subroutine init_adam(self, num_params) @@ -189,18 +193,17 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam(self, weights, biases, gradient) + pure subroutine minimize_adam_1d(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. - associate(g => gradient + self % weight_decay_l2 * weights) + associate(g => gradient + self % weight_decay_l2 * param) self % m = self % beta1 * self % m + (1 - self % beta1) * g self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 end associate @@ -212,19 +215,13 @@ pure subroutine minimize_adam(self, weights, biases, gradient) ) ! Update parameters. - weights = weights & + param = param & - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon) & - + self % weight_decay_decoupled * weights) - - ! 
Update biases (without weight decay for biases) - associate(g => gradient) - biases = biases & - - self % learning_rate * (m_hat / (sqrt(v_hat) + self % epsilon)) - end associate + + self % weight_decay_decoupled * param) end associate - end subroutine minimize_adam + end subroutine minimize_adam_1d impure elemental subroutine init_adagrad(self, num_params) @@ -237,21 +234,19 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad(self, weights, biases, gradient) + pure subroutine minimize_adagrad_1d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self - real, intent(inout), pointer :: weights(:) - real, intent(inout), pointer :: biases(:) - real, intent(in), pointer :: gradient(:) + real, intent(inout) :: param(:) + real, intent(in) :: gradient(:) ! Update the current time step self % t = self % t + 1 - ! For weights associate( & ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adagrad. - g => gradient + self % weight_decay_l2 * weights, & + g => gradient + self % weight_decay_l2 * param, & ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & @@ -259,21 +254,113 @@ pure subroutine minimize_adagrad(self, weights, biases, gradient) self % sum_squared_gradient = self % sum_squared_gradient + g**2 - weights = weights - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + self % epsilon) end associate - - ! For biases (without weight decay) + + end subroutine minimize_adagrad_1d + + + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. 
+ class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. + param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + + pure subroutine minimize_adagrad_2d(self, param, gradient) + !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. + class(adagrad), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Update the current time step + self % t = self % t + 1 + associate( & - g => gradient, & + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adagrad. + g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & + ! Amortize the learning rate as function of the current time step. learning_rate => self % learning_rate & / (1 + (self % t - 1) * self % learning_rate_decay) & ) - biases = biases - learning_rate * g / (sqrt(self % sum_squared_gradient) & - + self % epsilon) + + self % sum_squared_gradient = self % sum_squared_gradient + g**2 + + param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & + + self % epsilon), shape(param)) + end associate - end subroutine minimize_adagrad + end subroutine minimize_adagrad_2d end module nf_optimizers From 21c5707af2e7f0b7cbc816e9378848ea06c9a591 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 27 May 2025 13:57:48 -0400 Subject: [PATCH 03/13] Send the data to optimizer without a copy works for dense layers --- src/nf/nf_network_submodule.f90 | 12 +-- src/nf/nf_optimizers.f90 | 150 ++++++++++++++++---------------- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 1d36c5e8..eccea580 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -694,10 +694,6 @@ module subroutine update(self, optimizer, batch_size) end do #endif - !params = self % get_params() - !call self % optimizer % minimize(params, self % get_gradients() / batch_size_) - !call self % set_params(params) - do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) type is(dense_layer) @@ -705,11 +701,15 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - !call this_layer % set_params(weights, biases) + type is(locally_connected1d_layer) + !TODO + type is(conv1d_layer) + !TODO + type is(conv2d_layer) + !TODO end select end do - ! Flush network gradients to zero. 
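The update loop above, which calls minimize once with the rank-2 dw and once with the rank-1 db, leans on the generic binding added to nf_optimizers in the previous patch: generic :: minimize => minimize_1d, minimize_2d lets the same call self % optimizer % minimize(...) accept both, with the compiler selecting the specific routine from the rank of the arguments. A reduced sketch of that mechanism, using a hypothetical toy_optimizer type rather than the library's optimizer classes:

    module generic_minimize_sketch
      implicit none
      type :: toy_optimizer
        real :: learning_rate = 0.01
      contains
        procedure :: minimize_1d
        procedure :: minimize_2d
        generic :: minimize => minimize_1d, minimize_2d   ! resolved by argument rank
      end type toy_optimizer
    contains
      pure subroutine minimize_1d(self, param, gradient)
        class(toy_optimizer), intent(in) :: self
        real, intent(inout) :: param(:)
        real, intent(in) :: gradient(:)
        param = param - self % learning_rate * gradient
      end subroutine minimize_1d
      pure subroutine minimize_2d(self, param, gradient)
        class(toy_optimizer), intent(in) :: self
        real, intent(inout) :: param(:,:)
        real, intent(in) :: gradient(:,:)
        param = param - self % learning_rate * gradient
      end subroutine minimize_2d
    end module generic_minimize_sketch

Given type(toy_optimizer) :: opt, a call opt % minimize(weights, dw) with rank-2 arguments dispatches to minimize_2d, while the bias update with rank-1 arguments reaches minimize_1d.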
do n = 2, size(self % layers) select type(this_layer => self % layers(n) % p) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 400fbfa2..f6759d67 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -155,6 +155,32 @@ pure subroutine minimize_sgd_1d(self, param, gradient) end subroutine minimize_sgd_1d + pure subroutine minimize_sgd_2d(self, param, gradient) + !! Concrete implementation of a stochastic gradient descent optimizer + !! update rule for 2D arrays. + class(sgd), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + if (self % momentum > 0) then + ! Apply momentum update + self % velocity = self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]) + if (self % nesterov) then + ! Apply Nesterov update + param = param + reshape(self % momentum * self % velocity & + - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) + else + param = param + reshape(self % velocity, shape(param)) + end if + else + ! Apply regular update + param = param - self % learning_rate * gradient + end if + + end subroutine minimize_sgd_2d + + impure elemental subroutine init_rmsprop(self, num_params) class(rmsprop), intent(inout) :: self integer, intent(in) :: num_params @@ -182,6 +208,23 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) end subroutine minimize_rmsprop_1d + pure subroutine minimize_rmsprop_2d(self, param, gradient) + !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. + class(rmsprop), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + ! Compute the RMS of the gradient using the RMSProp rule + self % rms_gradient = self % decay_rate * self % rms_gradient & + + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 + + ! Update the network parameters based on the new RMS of the gradient + param = param - self % learning_rate & + / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient + + end subroutine minimize_rmsprop_2d + + impure elemental subroutine init_adam(self, num_params) class(adam), intent(inout) :: self integer, intent(in) :: num_params @@ -224,6 +267,37 @@ pure subroutine minimize_adam_1d(self, param, gradient) end subroutine minimize_adam_1d + pure subroutine minimize_adam_2d(self, param, gradient) + !! Concrete implementation of an Adam optimizer update rule for 2D arrays. + class(adam), intent(inout) :: self + real, intent(inout) :: param(:,:) + real, intent(in) :: gradient(:,:) + + self % t = self % t + 1 + + ! If weight_decay_l2 > 0, use L2 regularization; + ! otherwise, default to regular Adam. + associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) + self % m = self % beta1 * self % m + (1 - self % beta1) * g + self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + end associate + + ! Compute bias-corrected first and second moment estimates. + associate( & + m_hat => self % m / (1 - self % beta1**self % t), & + v_hat => self % v / (1 - self % beta2**self % t) & + ) + + ! Update parameters. 
+ param = param & + - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & + - self % learning_rate * self % weight_decay_decoupled * param + + end associate + + end subroutine minimize_adam_2d + + impure elemental subroutine init_adagrad(self, num_params) class(adagrad), intent(inout) :: self integer, intent(in) :: num_params @@ -262,80 +336,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end subroutine minimize_adagrad_1d - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. - param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d - - pure subroutine minimize_adagrad_2d(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. 
class(adagrad), intent(inout) :: self @@ -363,4 +363,4 @@ pure subroutine minimize_adagrad_2d(self, param, gradient) end subroutine minimize_adagrad_2d -end module nf_optimizers +end module nf_optimizers \ No newline at end of file From 9d68828f7e29d66f435a6701996f1cb65f08416e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 30 May 2025 13:47:28 -0400 Subject: [PATCH 04/13] Get weights and weight gradients as 1d --- src/nf/nf_dense_layer.f90 | 8 +- src/nf/nf_dense_layer_submodule.f90 | 12 +-- src/nf/nf_network_submodule.f90 | 2 +- src/nf/nf_optimizers.f90 | 145 +++------------------------- 4 files changed, 26 insertions(+), 141 deletions(-) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index ba6c33c4..a55ec892 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -100,8 +100,8 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr module function get_gradients(self) result(gradients) @@ -115,8 +115,8 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index a1ca6ce5..bb27c54a 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -79,9 +79,9 @@ end function get_params module subroutine get_params_ptr(self, w_ptr, b_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: w_ptr(:,:) - real, pointer :: b_ptr(:) - w_ptr => self % weights + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights b_ptr => self % biases end subroutine get_params_ptr @@ -104,9 +104,9 @@ end function get_gradients module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self - real, pointer :: dw_ptr(:,:) - real, pointer :: db_ptr(:) - dw_ptr => self % dw + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw db_ptr => self % db end subroutine get_gradients_ptr diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index eccea580..3508ec50 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -649,7 +649,7 @@ module subroutine update(self, optimizer, batch_size) integer, intent(in), optional :: batch_size integer :: batch_size_ real, allocatable :: params(:) - real, pointer :: weights(:,:), biases(:), dw(:,:), db(:) + real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n ! Passing the optimizer instance is optional. 
If not provided, and if the diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index f6759d67..24089ccd 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -19,9 +19,7 @@ module nf_optimizers real :: learning_rate = 0.01 contains procedure(init), deferred :: init - procedure(minimize_1d), deferred :: minimize_1d - procedure(minimize_2d), deferred :: minimize_2d - generic :: minimize => minimize_1d, minimize_2d + procedure(minimize), deferred :: minimize end type optimizer_base_type abstract interface @@ -32,19 +30,12 @@ impure elemental subroutine init(self, num_params) integer, intent(in) :: num_params end subroutine init - pure subroutine minimize_1d(self, param, gradient) + pure subroutine minimize(self, param, gradient) import :: optimizer_base_type class(optimizer_base_type), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) - end subroutine minimize_1d - - pure subroutine minimize_2d(self, param, gradient) - import :: optimizer_base_type - class(optimizer_base_type), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - end subroutine minimize_2d + end subroutine minimize end interface @@ -55,8 +46,7 @@ end subroutine minimize_2d real, allocatable, private :: velocity(:) contains procedure :: init => init_sgd - procedure :: minimize_1d => minimize_sgd_1d - procedure :: minimize_2d => minimize_sgd_2d + procedure :: minimize => minimize_sgd end type sgd type, extends(optimizer_base_type) :: rmsprop @@ -71,8 +61,7 @@ end subroutine minimize_2d real, allocatable, private :: rms_gradient(:) contains procedure :: init => init_rmsprop - procedure :: minimize_1d => minimize_rmsprop_1d - procedure :: minimize_2d => minimize_rmsprop_2d + procedure :: minimize => minimize_rmsprop end type rmsprop type, extends(optimizer_base_type) :: adam @@ -95,8 +84,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adam - procedure :: minimize_1d => minimize_adam_1d - procedure :: minimize_2d => minimize_adam_2d + procedure :: minimize => minimize_adam end type adam type, extends(optimizer_base_type) :: adagrad @@ -113,8 +101,7 @@ end subroutine minimize_2d integer, private :: t = 0 contains procedure :: init => init_adagrad - procedure :: minimize_1d => minimize_adagrad_1d - procedure :: minimize_2d => minimize_adagrad_2d + procedure :: minimize => minimize_adagrad end type adagrad contains @@ -129,7 +116,7 @@ impure elemental subroutine init_sgd(self, num_params) end subroutine init_sgd - pure subroutine minimize_sgd_1d(self, param, gradient) + pure subroutine minimize_sgd(self, param, gradient) !! Concrete implementation of a stochastic gradient descent optimizer !! update rule. class(sgd), intent(inout) :: self @@ -152,33 +139,7 @@ pure subroutine minimize_sgd_1d(self, param, gradient) param = param - self % learning_rate * gradient end if - end subroutine minimize_sgd_1d - - - pure subroutine minimize_sgd_2d(self, param, gradient) - !! Concrete implementation of a stochastic gradient descent optimizer - !! update rule for 2D arrays. - class(sgd), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - if (self % momentum > 0) then - ! Apply momentum update - self % velocity = self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]) - if (self % nesterov) then - ! 
Apply Nesterov update - param = param + reshape(self % momentum * self % velocity & - - self % learning_rate * reshape(gradient, [size(gradient)]), shape(param)) - else - param = param + reshape(self % velocity, shape(param)) - end if - else - ! Apply regular update - param = param - self % learning_rate * gradient - end if - - end subroutine minimize_sgd_2d + end subroutine minimize_sgd impure elemental subroutine init_rmsprop(self, num_params) @@ -191,7 +152,7 @@ impure elemental subroutine init_rmsprop(self, num_params) end subroutine init_rmsprop - pure subroutine minimize_rmsprop_1d(self, param, gradient) + pure subroutine minimize_rmsprop(self, param, gradient) !! Concrete implementation of a RMSProp optimizer update rule. class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) @@ -205,24 +166,7 @@ pure subroutine minimize_rmsprop_1d(self, param, gradient) param = param - self % learning_rate & / sqrt(self % rms_gradient + self % epsilon) * gradient - end subroutine minimize_rmsprop_1d - - - pure subroutine minimize_rmsprop_2d(self, param, gradient) - !! Concrete implementation of a RMSProp optimizer update rule for 2D arrays. - class(rmsprop), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & - + (1 - self % decay_rate) * reshape(gradient, [size(gradient)])**2 - - ! Update the network parameters based on the new RMS of the gradient - param = param - self % learning_rate & - / sqrt(reshape(self % rms_gradient, shape(param)) + self % epsilon) * gradient - - end subroutine minimize_rmsprop_2d + end subroutine minimize_rmsprop impure elemental subroutine init_adam(self, num_params) @@ -236,7 +180,7 @@ impure elemental subroutine init_adam(self, num_params) end subroutine init_adam - pure subroutine minimize_adam_1d(self, param, gradient) + pure subroutine minimize_adam(self, param, gradient) !! Concrete implementation of an Adam optimizer update rule. class(adam), intent(inout) :: self real, intent(inout) :: param(:) @@ -264,38 +208,7 @@ pure subroutine minimize_adam_1d(self, param, gradient) end associate - end subroutine minimize_adam_1d - - - pure subroutine minimize_adam_2d(self, param, gradient) - !! Concrete implementation of an Adam optimizer update rule for 2D arrays. - class(adam), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - self % t = self % t + 1 - - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adam. - associate(g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)])) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 - end associate - - ! Compute bias-corrected first and second moment estimates. - associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & - ) - - ! Update parameters. 
- param = param & - - self % learning_rate * reshape(m_hat / (sqrt(v_hat) + self % epsilon), shape(param)) & - - self % learning_rate * self % weight_decay_decoupled * param - - end associate - - end subroutine minimize_adam_2d + end subroutine minimize_adam impure elemental subroutine init_adagrad(self, num_params) @@ -308,7 +221,7 @@ impure elemental subroutine init_adagrad(self, num_params) end subroutine init_adagrad - pure subroutine minimize_adagrad_1d(self, param, gradient) + pure subroutine minimize_adagrad(self, param, gradient) !! Concrete implementation of an Adagrad optimizer update rule. class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) @@ -333,34 +246,6 @@ pure subroutine minimize_adagrad_1d(self, param, gradient) end associate - end subroutine minimize_adagrad_1d - - - pure subroutine minimize_adagrad_2d(self, param, gradient) - !! Concrete implementation of an Adagrad optimizer update rule for 2D arrays. - class(adagrad), intent(inout) :: self - real, intent(inout) :: param(:,:) - real, intent(in) :: gradient(:,:) - - ! Update the current time step - self % t = self % t + 1 - - associate( & - ! If weight_decay_l2 > 0, use L2 regularization; - ! otherwise, default to regular Adagrad. - g => reshape(gradient, [size(gradient)]) + self % weight_decay_l2 * reshape(param, [size(param)]), & - ! Amortize the learning rate as function of the current time step. - learning_rate => self % learning_rate & - / (1 + (self % t - 1) * self % learning_rate_decay) & - ) - - self % sum_squared_gradient = self % sum_squared_gradient + g**2 - - param = param - learning_rate * reshape(g / (sqrt(self % sum_squared_gradient) & - + self % epsilon), shape(param)) - - end associate - - end subroutine minimize_adagrad_2d + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From 2160f97f8a6ffac1b62f6f25e38b752c4ba2d65b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Thu, 19 Jun 2025 23:49:05 -0400 Subject: [PATCH 05/13] get_params_ptr and get_gradients_ptr for conv1d, conv2d, and locally_connected1d --- src/nf/nf_conv1d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_conv2d_layer.f90 | 22 ++++++++++++++ src/nf/nf_conv2d_layer_submodule.f90 | 18 ++++++++++++ src/nf/nf_locally_connected1d_layer.f90 | 14 +++++++++ ...nf_locally_connected1d_layer_submodule.f90 | 16 ++++++++++ src/nf/nf_network_submodule.f90 | 29 +++++++++---------- 7 files changed, 122 insertions(+), 15 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index c39b11fc..871eef02 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -32,8 +32,10 @@ module nf_conv1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! 
The gradients are ordered as weights first, biases second. @@ -106,6 +118,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv1d_layer), intent(in), target :: self + !! A `conv1d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv1d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 5404b9c7..05bcde70 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -152,6 +152,14 @@ module function get_params(self) result(params) params = [ w_, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv1d_layer), intent(in), target :: self real, allocatable :: gradients(:) @@ -160,6 +168,14 @@ module function get_gradients(self) result(gradients) gradients = [ dw_, self % db ] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(conv1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 4b79376e..3f7b28db 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -33,8 +33,10 @@ module nf_conv2d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -98,6 +100,16 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + !! Return pointers to the parameters (weights and biases) of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: w_ptr(:) + !! Pointer to the kernel weights (flattened) + real, pointer, intent(out) :: b_ptr(:) + !! Pointer to the biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -107,6 +119,16 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + !! Return pointers to the gradients of this layer. + class(conv2d_layer), intent(in), target :: self + !! A `conv2d_layer` instance + real, pointer, intent(out) :: dw_ptr(:) + !! 
Pointer to the kernel weight gradients (flattened) + real, pointer, intent(out) :: db_ptr(:) + !! Pointer to the bias gradients + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. class(conv2d_layer), intent(in out) :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index 45a2c1da..b617ec34 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -204,6 +204,15 @@ module function get_params(self) result(params) end function get_params + + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(conv2d_layer), intent(in), target :: self @@ -221,6 +230,15 @@ module function get_gradients(self) result(gradients) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(conv2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(conv2d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_locally_connected1d_layer.f90 b/src/nf/nf_locally_connected1d_layer.f90 index beca76d5..6fea2c5c 100644 --- a/src/nf/nf_locally_connected1d_layer.f90 +++ b/src/nf/nf_locally_connected1d_layer.f90 @@ -32,8 +32,10 @@ module nf_locally_connected1d_layer procedure :: forward procedure :: backward procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: init procedure :: set_params @@ -97,6 +99,12 @@ module function get_params(self) result(params) !! Parameters to get end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) !! Return the gradients of this layer. !! The gradients are ordered as weights first, biases second. @@ -106,6 +114,12 @@ module function get_gradients(self) result(gradients) !! Gradients to get end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) !! Set the parameters of the layer. 
class(locally_connected1d_layer), intent(in out) :: self diff --git a/src/nf/nf_locally_connected1d_layer_submodule.f90 b/src/nf/nf_locally_connected1d_layer_submodule.f90 index 053c520b..fa6110d5 100644 --- a/src/nf/nf_locally_connected1d_layer_submodule.f90 +++ b/src/nf/nf_locally_connected1d_layer_submodule.f90 @@ -128,12 +128,28 @@ module function get_params(self) result(params) params = [self % kernel, self % biases] end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:) + real, pointer, intent(out) :: b_ptr(:) + w_ptr(1:size(self % kernel)) => self % kernel + b_ptr(1:size(self % biases)) => self % biases + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(locally_connected1d_layer), intent(in), target :: self real, allocatable :: gradients(:) gradients = [self % dw, self % db] end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(locally_connected1d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:) + real, pointer, intent(out) :: db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr(1:size(self % db)) => self % db + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(locally_connected1d_layer), intent(in out) :: self real, intent(in) :: params(:) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 3508ec50..60c0e151 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -701,28 +701,27 @@ module subroutine update(self, optimizer, batch_size) call this_layer % get_gradients_ptr(dw, db) call self % optimizer % minimize(weights, dw / batch_size_) call self % optimizer % minimize(biases, db / batch_size_) - type is(locally_connected1d_layer) - !TODO - type is(conv1d_layer) - !TODO - type is(conv2d_layer) - !TODO - end select - end do - - ! Flush network gradients to zero. 
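Every get_params_ptr / get_gradients_ptr implementation above uses the same Fortran 2008 feature, a bounds-remapping pointer assignment such as w_ptr(1:size(self % kernel)) => self % kernel, which presents a contiguous multi-dimensional array as a flat rank-1 view without copying it. That flat view is what lets one rank-1 optimizer update serve dense, conv1d, conv2d, and locally_connected1d layers alike. A standalone sketch of the rule, with illustrative names, assuming a contiguous target (an allocatable array always is):

    program remap_sketch
      implicit none
      real, allocatable, target :: kernel(:,:)
      real, pointer :: w_flat(:)
      allocate(kernel(2, 3))
      kernel = reshape([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [2, 3])
      ! Bounds-remapping pointer assignment: a rank-1 view of the rank-2 array.
      w_flat(1:size(kernel)) => kernel
      w_flat = w_flat - 0.1                 ! updates kernel in place, in column-major order
      print *, kernel(1, 1), kernel(2, 3)   ! 0.9 and 5.9
    end program remap_sketch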
- do n = 2, size(self % layers) - select type(this_layer => self % layers(n) % p) - type is(dense_layer) this_layer % dw = 0 this_layer % db = 0 - type is(conv2d_layer) + type is(conv1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 - type is(conv1d_layer) + type is(conv2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % optimizer % minimize(weights, dw / batch_size_) + call self % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From 0e11f1016828f229dbb5d1f50d7c573ff9a9c918 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 20 Jun 2025 13:59:22 -0400 Subject: [PATCH 06/13] Define optimizer instance per layer to preserve memory across layers --- src/nf/nf_layer.f90 | 1 + src/nf/nf_network_submodule.f90 | 46 +++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 517622b0..b12592f3 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -22,6 +22,7 @@ module nf_layer integer, allocatable :: layer_shape(:) integer, allocatable :: input_layer_shape(:) logical :: initialized = .false. + class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 60c0e151..876070bc 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -597,12 +597,26 @@ module subroutine train(self, input_data, output_data, batch_size, & ! If not provided, we default to SGD with its default settings. if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if call self % optimizer % init(self % get_num_params()) + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). if (present(loss)) then @@ -662,10 +676,26 @@ module subroutine update(self, optimizer, batch_size) if (.not. 
allocated(self % optimizer)) then if (present(optimizer)) then self % optimizer = optimizer + + do n = 1, size(self % layers) + self % layers(n) % optimizer = optimizer + end do + else self % optimizer = sgd() + + do n = 1, size(self % layers) + self % layers(n) % optimizer = sgd() + end do + end if + call self % optimizer % init(self % get_num_params()) + + do n = 1, size(self % layers) + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) + end do + end if if (present(batch_size)) then @@ -699,29 +729,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % optimizer % minimize(weights, dw / batch_size_) - call self % optimizer % minimize(biases, db / batch_size_) + call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) %optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From dc55df09d872b4373ca8894523babab7fbdf8416 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Fri, 4 Jul 2025 10:10:27 -0400 Subject: [PATCH 07/13] Initialization of network-wide optimizer no longer needed now that we switched to per-layer optimizer instances --- src/nf/nf_network_submodule.f90 | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 876070bc..f434eab0 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -611,8 +611,6 @@ module subroutine train(self, input_data, output_data, batch_size, & end if - call self % optimizer % init(self % get_num_params()) - do n = 1, size(self % layers) call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do @@ -690,8 +688,6 @@ module subroutine update(self, optimizer, batch_size) end if - call self % optimizer % init(self % get_num_params()) - do n = 1, size(self % layers) call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do @@ -729,29 +725,29 @@ module subroutine update(self, optimizer, batch_size) type is(dense_layer) call this_layer % get_params_ptr(weights, biases) call 
this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(conv2d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 type is(locally_connected1d_layer) call this_layer % get_params_ptr(weights, biases) call this_layer % get_gradients_ptr(dw, db) - call self % layers(n) %optimizer % minimize(weights, dw / batch_size_) - call self % layers(n) %optimizer % minimize(biases, db / batch_size_) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 end select From e9ba73e8f81cfecd0d8244929a7294f81bfc64f2 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Mon, 28 Jul 2025 14:55:18 -0400 Subject: [PATCH 08/13] Bookkeeping for velocity, rms_gradient, etc.; optimizer tests now pass --- src/nf/nf_optimizers.f90 | 88 ++++++++++++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 24089ccd..2926c959 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -44,6 +44,7 @@ end subroutine minimize real :: momentum = 0 logical :: nesterov = .false. real, allocatable, private :: velocity(:) + integer, private :: start_index = 1 contains procedure :: init => init_sgd procedure :: minimize => minimize_sgd @@ -59,6 +60,7 @@ end subroutine minimize real :: decay_rate = 0.9 real :: epsilon = 1e-8 real, allocatable, private :: rms_gradient(:) + integer, private :: start_index = 1 contains procedure :: init => init_rmsprop procedure :: minimize => minimize_rmsprop @@ -82,6 +84,7 @@ end subroutine minimize real :: weight_decay_decoupled = 0 ! decoupled weight decay regularization (AdamW) real, allocatable, private :: m(:), v(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adam procedure :: minimize => minimize_adam @@ -99,6 +102,7 @@ end subroutine minimize real :: learning_rate_decay = 0 real, allocatable, private :: sum_squared_gradient(:) integer, private :: t = 0 + integer, private :: start_index = 1 contains procedure :: init => init_adagrad procedure :: minimize => minimize_adagrad @@ -121,19 +125,38 @@ pure subroutine minimize_sgd(self, param, gradient) !! update rule. 
class(sgd), intent(inout) :: self real, intent(inout) :: param(:) - real, intent(in) :: gradient(:) + real, intent(in) :: gradient(:) ! Always the same size as param + integer :: end_index if (self % momentum > 0) then + + ! end_index is part of the bookkeeping for updating velocity because each + ! batch update makes two calls to minimize, one for the weights and one for + ! the biases. + ! We use start_index and end_index to update the appropriate sections + ! of the velocity array. + end_index = self % start_index + size(param) - 1 + ! Apply momentum update - self % velocity = self % momentum * self % velocity & + self % velocity(self % start_index:end_index) = & + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient if (self % nesterov) then ! Apply Nesterov update - param = param + self % momentum * self % velocity & + param = param + self % momentum * self % velocity(self % start_index:end_index) & - self % learning_rate * gradient else - param = param + self % velocity + param = param + self % velocity(self % start_index:end_index) + end if + + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 end if + else ! Apply regular update param = param - self % learning_rate * gradient @@ -157,14 +180,27 @@ pure subroutine minimize_rmsprop(self, param, gradient) class(rmsprop), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Compute the RMS of the gradient using the RMSProp rule - self % rms_gradient = self % decay_rate * self % rms_gradient & + self % rms_gradient(self % start_index:end_index) = & + self % decay_rate * self % rms_gradient(self % start_index:end_index) & + (1 - self % decay_rate) * gradient**2 ! Update the network parameters based on the new RMS of the gradient param = param - self % learning_rate & - / sqrt(self % rms_gradient + self % epsilon) * gradient + / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & + * gradient + + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if end subroutine minimize_rmsprop @@ -185,20 +221,27 @@ pure subroutine minimize_adam(self, param, gradient) class(adam), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 self % t = self % t + 1 ! If weight_decay_l2 > 0, use L2 regularization; ! otherwise, default to regular Adam. associate(g => gradient + self % weight_decay_l2 * param) - self % m = self % beta1 * self % m + (1 - self % beta1) * g - self % v = self % beta2 * self % v + (1 - self % beta2) * g**2 + self % m(self % start_index:end_index) = & + self % beta1 * self % m(self % start_index:end_index) & + + (1 - self % beta1) * g + self % v(self % start_index:end_index) = & + self % beta2 * self % v(self % start_index:end_index) & + + (1 - self % beta2) * g**2 end associate ! Compute bias-corrected first and second moment estimates. 
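    ! Illustrative note with hypothetical sizes: for a dense layer with a
    ! 3 x 4 weight matrix and 4 biases, the per-layer optimizer is initialized
    ! with num_params = 16, so m and v each hold 16 elements. The first
    ! minimize call of a batch passes the 12 weights and updates m(1:12) and
    ! v(1:12); the second call passes the 4 biases and updates m(13:16) and
    ! v(13:16), after which start_index returns to 1 for the next batch.
    ! The bias-corrected estimates below operate on the same slice.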
associate( & - m_hat => self % m / (1 - self % beta1**self % t), & - v_hat => self % v / (1 - self % beta2**self % t) & + m_hat => self % m(self % start_index:end_index) / (1 - self % beta1**self % t), & + v_hat => self % v(self % start_index:end_index) / (1 - self % beta2**self % t) & ) ! Update parameters. @@ -208,6 +251,14 @@ pure subroutine minimize_adam(self, param, gradient) end associate + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adam @@ -226,6 +277,9 @@ pure subroutine minimize_adagrad(self, param, gradient) class(adagrad), intent(inout) :: self real, intent(inout) :: param(:) real, intent(in) :: gradient(:) + integer :: end_index + + end_index = self % start_index + size(param) - 1 ! Update the current time step self % t = self % t + 1 @@ -239,13 +293,23 @@ pure subroutine minimize_adagrad(self, param, gradient) / (1 + (self % t - 1) * self % learning_rate_decay) & ) - self % sum_squared_gradient = self % sum_squared_gradient + g**2 + self % sum_squared_gradient(self % start_index:end_index) = & + self % sum_squared_gradient(self % start_index:end_index) + g**2 - param = param - learning_rate * g / (sqrt(self % sum_squared_gradient) & + param = param - learning_rate * g & + / (sqrt(self % sum_squared_gradient(self % start_index:end_index)) & + self % epsilon) end associate + if (self % start_index == 1) then + ! We updated the weights part, now we shift forward for the biases part + self % start_index = end_index + 1 + else + ! We updated the biases part, now we shift back to start for the next batch + self % start_index = 1 + end if + end subroutine minimize_adagrad end module nf_optimizers \ No newline at end of file From ad176ea847ac4fc9a2674d44a3894429a7db26af Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 13:25:06 -0400 Subject: [PATCH 09/13] Update optimizer flow for linear2d --- src/nf/nf_linear2d_layer.f90 | 12 +++++++++ src/nf/nf_linear2d_layer_submodule.f90 | 34 ++++++++++++++------------ 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/nf/nf_linear2d_layer.f90 b/src/nf/nf_linear2d_layer.f90 index f785a14c..f2c8fd16 100644 --- a/src/nf/nf_linear2d_layer.f90 +++ b/src/nf/nf_linear2d_layer.f90 @@ -25,7 +25,9 @@ module nf_linear2d_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type linear2d_layer @@ -64,11 +66,21 @@ module function get_params(self) result(params) real, allocatable :: params(:) end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + end subroutine get_params_ptr + module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + module subroutine set_params(self, params) class(linear2d_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_linear2d_layer_submodule.f90 
b/src/nf/nf_linear2d_layer_submodule.f90 index 0dfe7e27..513527f0 100644 --- a/src/nf/nf_linear2d_layer_submodule.f90 +++ b/src/nf/nf_linear2d_layer_submodule.f90 @@ -82,33 +82,35 @@ end function get_num_params module function get_params(self) result(params) class(linear2d_layer), intent(in), target :: self real, allocatable :: params(:) - real, pointer :: w_(:) => null() + w_(1: size(self % weights)) => self % weights + params = [w_, self % biases] + end function get_params - w_(1: product(shape(self % weights))) => self % weights - - params = [ & - w_, & - self % biases & - ] - end function get_params + module subroutine get_params_ptr(self, w_ptr, b_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: w_ptr(:), b_ptr(:) + w_ptr(1:size(self % weights)) => self % weights + b_ptr => self % biases + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(linear2d_layer), intent(in), target :: self real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() + dw_(1:size(self % dw)) => self % dw + gradients = [dw_, self % db] + end function get_gradients - dw_(1: product(shape(self % dw))) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) + class(linear2d_layer), intent(in), target :: self + real, pointer, intent(out) :: dw_ptr(:), db_ptr(:) + dw_ptr(1:size(self % dw)) => self % dw + db_ptr => self % db + end subroutine get_gradients_ptr module subroutine set_params(self, params) From e5072d3cd94ded9f97dd8781af38841cc3b16ee1 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 13:43:30 -0400 Subject: [PATCH 10/13] Update optimizer flow for layernorm --- src/nf/nf_layernorm.f90 | 14 ++++++++++++++ src/nf/nf_layernorm_submodule.f90 | 26 ++++++++++++++++---------- src/nf/nf_network_submodule.f90 | 14 ++++++++++++++ test/test_layernorm.f90 | 30 +++++++++++++++++------------- 4 files changed, 61 insertions(+), 23 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 36ef56f0..7bffc06a 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -38,7 +38,9 @@ module nf_layernorm_layer procedure :: init procedure :: get_num_params procedure :: get_params + procedure :: get_params_ptr procedure :: get_gradients + procedure :: get_gradients_ptr procedure :: set_params end type layernorm_layer @@ -78,12 +80,24 @@ module function get_params(self) result(params) end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + end subroutine get_params_ptr + + module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + end subroutine get_gradients_ptr + + module subroutine set_params(self, params) class(layernorm_layer), intent(in out) :: self real, intent(in), target :: params(:) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4eaa4382..5e357b33 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -112,25 +112,31 @@ end function get_num_params module function get_params(self) result(params) class(layernorm_layer), 
intent(in), target :: self real, allocatable :: params(:) + params = [self % gamma, self % beta] + end function get_params - params = [ & - self % gamma, & - self % beta & - ] - end function get_params + module subroutine get_params_ptr(self, g_ptr, b_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: g_ptr(:), b_ptr(:) + g_ptr => self % gamma + b_ptr => self % beta + end subroutine get_params_ptr module function get_gradients(self) result(gradients) class(layernorm_layer), intent(in), target :: self real, allocatable :: gradients(:) + gradients = [self % d_gamma, self % d_beta] + end function get_gradients - gradients = [ & - self % d_gamma, & - self % d_beta & - ] - end function get_gradients + module subroutine get_gradients_ptr(self, dg_ptr, db_ptr) + class(layernorm_layer), intent(in), target :: self + real, pointer, intent(out) :: dg_ptr(:), db_ptr(:) + dg_ptr => self % d_gamma + db_ptr => self % d_beta + end subroutine get_gradients_ptr module subroutine set_params(self, params) diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index f434eab0..76937ade 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -750,6 +750,20 @@ module subroutine update(self, optimizer, batch_size) call self % layers(n) % optimizer % minimize(biases, db / batch_size_) this_layer % dw = 0 this_layer % db = 0 + type is(linear2d_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % dw = 0 + this_layer % db = 0 + type is(layernorm_layer) + call this_layer % get_params_ptr(weights, biases) + call this_layer % get_gradients_ptr(dw, db) + call self % layers(n) % optimizer % minimize(weights, dw / batch_size_) + call self % layers(n) % optimizer % minimize(biases, db / batch_size_) + this_layer % d_gamma = 0 + this_layer % d_beta = 0 end select end do diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 6a897575..9e8bfccf 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -27,14 +27,14 @@ program test_layernorm_instance end if contains - function allclose(x, y) result(res) - real, intent(in) :: x(:) - real, intent(in) :: y(:) - logical :: res - res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + logical function allclose(x, y) result(res) + real, intent(in) :: x(:), y(:) + !res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + res = all(abs(x - y) <= 1e-05) end function allclose + subroutine test_layernorm_forward(layernorm_instance, input, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -61,6 +61,7 @@ subroutine test_layernorm_forward(layernorm_instance, input, ok) end if end subroutine test_layernorm_forward + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) @@ -103,6 +104,7 @@ subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) @@ -152,6 +154,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) end if end subroutine test_layernorm_gradients + subroutine test_layernorm_integration(ok) 
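  ! Integration test: build a small input -> linear2d -> layernorm -> flatten
  ! network and verify that repeated forward/backward/update calls with SGD
  ! bring the predictions to within tolerance of the target y.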
logical, intent(in out) :: ok @@ -160,13 +163,13 @@ subroutine test_layernorm_integration(ok) real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] real :: tolerance = 0.1 integer :: epoch - integer :: epochs = 10000 + integer, parameter :: num_epochs = 100000 - net = network([& - input(2, 3),& - linear2d(3),& - layernorm(),& - flatten()& + net = network([ & + input(2, 3), & + linear2d(3), & + layernorm(), & + flatten() & ]) ! Kaiming weights to achieve semblance of convergance @@ -177,17 +180,18 @@ subroutine test_layernorm_integration(ok) l % biases = 0.2 end select - do epoch = 1, epochs + do epoch = 1, num_epochs call net % forward(x) call net % backward(y) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - if (.not. epoch <= epochs) then + if (.not. epoch <= num_epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' ok = .false. end if end subroutine test_layernorm_integration + end program test_layernorm_instance From 86ed7b3a7ee68d9204055e70e34ab13f28a3e9d9 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 14:03:06 -0400 Subject: [PATCH 11/13] Previous bookkeeping for successive calls to optim % minimize() assumed 2 calls per batch; this is now generalized to allow any number of calls until size(params) is exhausted --- src/nf/nf_optimizers.f90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nf/nf_optimizers.f90 b/src/nf/nf_optimizers.f90 index 2926c959..9a6b1e1f 100644 --- a/src/nf/nf_optimizers.f90 +++ b/src/nf/nf_optimizers.f90 @@ -149,7 +149,7 @@ pure subroutine minimize_sgd(self, param, gradient) param = param + self % velocity(self % start_index:end_index) end if - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -194,7 +194,7 @@ pure subroutine minimize_rmsprop(self, param, gradient) / sqrt(self % rms_gradient(self % start_index:end_index) + self % epsilon) & * gradient - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -251,7 +251,7 @@ pure subroutine minimize_adam(self, param, gradient) end associate - if (self % start_index == 1) then + if (end_index < size(param)) then ! We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else @@ -302,7 +302,7 @@ pure subroutine minimize_adagrad(self, param, gradient) end associate - if (self % start_index == 1) then + if (end_index < size(param)) then ! 
We updated the weights part, now we shift forward for the biases part self % start_index = end_index + 1 else From 309ef6e82e1f665fcab9c78b6c5b5574d253f322 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 29 Jul 2025 14:19:33 -0400 Subject: [PATCH 12/13] Remove get_gradients from network, layer, dense, conv1d, conv2d --- src/nf/nf_conv1d_layer.f90 | 10 ------- src/nf/nf_conv1d_layer_submodule.f90 | 8 ----- src/nf/nf_conv2d_layer.f90 | 10 ------- src/nf/nf_conv2d_layer_submodule.f90 | 16 ---------- src/nf/nf_dense_layer.f90 | 10 ------- src/nf/nf_dense_layer_submodule.f90 | 16 ---------- src/nf/nf_layer.f90 | 9 ------ src/nf/nf_layer_submodule.f90 | 44 ---------------------------- src/nf/nf_network.f90 | 9 ------ src/nf/nf_network_submodule.f90 | 19 ------------ 10 files changed, 151 deletions(-) diff --git a/src/nf/nf_conv1d_layer.f90 b/src/nf/nf_conv1d_layer.f90 index 871eef02..65f82347 100644 --- a/src/nf/nf_conv1d_layer.f90 +++ b/src/nf/nf_conv1d_layer.f90 @@ -31,7 +31,6 @@ module nf_conv1d_layer procedure :: forward procedure :: backward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -109,15 +108,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) !! Pointer to the biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(conv1d_layer), intent(in), target :: self - !! A `conv1d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) !! Return pointers to the gradients of this layer. class(conv1d_layer), intent(in), target :: self diff --git a/src/nf/nf_conv1d_layer_submodule.f90 b/src/nf/nf_conv1d_layer_submodule.f90 index 05bcde70..98856689 100644 --- a/src/nf/nf_conv1d_layer_submodule.f90 +++ b/src/nf/nf_conv1d_layer_submodule.f90 @@ -160,14 +160,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) b_ptr => self % biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(conv1d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - real, pointer :: dw_(:) => null() - dw_(1:size(self % dw)) => self % dw - gradients = [ dw_, self % db ] - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(conv1d_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_conv2d_layer.f90 b/src/nf/nf_conv2d_layer.f90 index 3f7b28db..d6c92c31 100644 --- a/src/nf/nf_conv2d_layer.f90 +++ b/src/nf/nf_conv2d_layer.f90 @@ -32,7 +32,6 @@ module nf_conv2d_layer procedure :: forward procedure :: backward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -110,15 +109,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) !! Pointer to the biases end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(conv2d_layer), intent(in), target :: self - !! A `conv2d_layer` instance - real, allocatable :: gradients(:) - !! Gradients to get - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) !! Return pointers to the gradients of this layer. 
class(conv2d_layer), intent(in), target :: self diff --git a/src/nf/nf_conv2d_layer_submodule.f90 b/src/nf/nf_conv2d_layer_submodule.f90 index b617ec34..56b398fc 100644 --- a/src/nf/nf_conv2d_layer_submodule.f90 +++ b/src/nf/nf_conv2d_layer_submodule.f90 @@ -214,22 +214,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(conv2d_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() - - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients - - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(conv2d_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_dense_layer.f90 b/src/nf/nf_dense_layer.f90 index a55ec892..e93a57ca 100644 --- a/src/nf/nf_dense_layer.f90 +++ b/src/nf/nf_dense_layer.f90 @@ -33,7 +33,6 @@ module nf_dense_layer procedure :: backward procedure :: forward - procedure :: get_gradients procedure :: get_gradients_ptr procedure :: get_num_params procedure :: get_params @@ -104,15 +103,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) real, pointer, intent(out) :: b_ptr(:) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - !! Return the gradients of this layer. - !! The gradients are ordered as weights first, biases second. - class(dense_layer), intent(in), target :: self - !! Dense layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_dense_layer_submodule.f90 b/src/nf/nf_dense_layer_submodule.f90 index bb27c54a..c2f7e236 100644 --- a/src/nf/nf_dense_layer_submodule.f90 +++ b/src/nf/nf_dense_layer_submodule.f90 @@ -86,22 +86,6 @@ module subroutine get_params_ptr(self, w_ptr, b_ptr) end subroutine get_params_ptr - module function get_gradients(self) result(gradients) - class(dense_layer), intent(in), target :: self - real, allocatable :: gradients(:) - - real, pointer :: dw_(:) => null() - - dw_(1:size(self % dw)) => self % dw - - gradients = [ & - dw_, & - self % db & - ] - - end function get_gradients - - module subroutine get_gradients_ptr(self, dw_ptr, db_ptr) class(dense_layer), intent(in), target :: self real, pointer, intent(out) :: dw_ptr(:) diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index b12592f3..79569845 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -29,7 +29,6 @@ module nf_layer procedure :: forward procedure :: get_num_params procedure :: get_params - procedure :: get_gradients procedure :: set_params procedure :: init procedure :: print_info @@ -161,14 +160,6 @@ module function get_params(self) result(params) !! Parameters of this layer end function get_params - module function get_gradients(self) result(gradients) - !! Returns the gradients of this layer. - class(layer), intent(in) :: self - !! Layer instance - real, allocatable :: gradients(:) - !! Gradients of this layer - end function get_gradients - module subroutine set_params(self, params) !! Returns the parameters of this layer. 
class(layer), intent(in out) :: self diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index eebedaa9..778d227a 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -682,50 +682,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(layer), intent(in) :: self - real, allocatable :: gradients(:) - - select type (this_layer => self % p) - type is (input1d_layer) - ! No gradients to get. - type is (input2d_layer) - ! No gradients to get. - type is (input3d_layer) - ! No gradients to get. - type is (dense_layer) - gradients = this_layer % get_gradients() - type is (dropout_layer) - ! No gradients to get. - type is (conv1d_layer) - gradients = this_layer % get_gradients() - type is (conv2d_layer) - gradients = this_layer % get_gradients() - type is (locally_connected1d_layer) - gradients = this_layer % get_gradients() - type is (maxpool1d_layer) - ! No gradients to get. - type is (maxpool2d_layer) - ! No gradients to get. - type is (flatten_layer) - ! No gradients to get. - type is (reshape2d_layer) - ! No parameters to get. - type is (reshape3d_layer) - ! No gradients to get. - type is (linear2d_layer) - gradients = this_layer % get_gradients() - type is (self_attention_layer) - gradients = this_layer % get_gradients() - type is (embedding_layer) - gradients = this_layer % get_gradients() - type is (layernorm_layer) - gradients = this_layer % get_gradients() - class default - error stop 'Unknown layer type.' - end select - - end function get_gradients module subroutine set_params(self, params) class(layer), intent(in out) :: self diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index 2bd7ce8c..ac165adf 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -21,7 +21,6 @@ module nf_network contains procedure :: backward - procedure :: get_gradients procedure :: get_num_params procedure :: get_params procedure :: print_info @@ -216,7 +215,6 @@ module integer function get_num_params(self) !! Network instance end function get_num_params - module function get_params(self) result(params) !! Get the network parameters (weights and biases). class(network), intent(in) :: self @@ -225,13 +223,6 @@ module function get_params(self) result(params) !! Network parameters to get end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - !! Network instance - real, allocatable :: gradients(:) - !! Network gradients to set - end function get_gradients - module subroutine set_params(self, params) !! Set the network parameters (weights and biases). 
class(network), intent(in out) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index 76937ade..d550f264 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -524,25 +524,6 @@ module function get_params(self) result(params) end function get_params - module function get_gradients(self) result(gradients) - class(network), intent(in) :: self - real, allocatable :: gradients(:) - integer :: n, nstart, nend - - allocate(gradients(self % get_num_params())) - - nstart = 1 - do n = 1, size(self % layers) - - if (self % layers(n) % get_num_params() < 1) cycle - - nend = nstart + self % layers(n) % get_num_params() - 1 - gradients(nstart:nend) = self % layers(n) % get_gradients() - nstart = nend + 1 - end do - - end function get_gradients - module subroutine set_params(self, params) class(network), intent(in out) :: self From e61f29ed6955ed12c5001ae48982ee5d5f4affd6 Mon Sep 17 00:00:00 2001 From: milancurcic Date: Wed, 30 Jul 2025 12:19:55 -0400 Subject: [PATCH 13/13] Remove optimizer as component to the network class --- src/nf/nf_network.f90 | 1 - src/nf/nf_network_submodule.f90 | 51 +++++++-------------------------- 2 files changed, 10 insertions(+), 42 deletions(-) diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index ac165adf..2743ff5b 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -16,7 +16,6 @@ module nf_network type(layer), allocatable :: layers(:) class(loss_type), allocatable :: loss - class(optimizer_base_type), allocatable :: optimizer contains diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index d550f264..df4498be 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -574,27 +574,8 @@ module subroutine train(self, input_data, output_data, batch_size, & integer :: i, j, n integer :: istart, iend, indices(2) - ! Passing the optimizer instance is optional. - ! If not provided, we default to SGD with its default settings. - if (present(optimizer)) then - self % optimizer = optimizer - - do n = 1, size(self % layers) - self % layers(n) % optimizer = optimizer - end do - - else - self % optimizer = sgd() - - do n = 1, size(self % layers) - self % layers(n) % optimizer = sgd() - end do - - end if - - do n = 1, size(self % layers) - call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) - end do + ! The optional optimizer instance is passed through to the update() method + ! where it is optional as well. ! Passing the loss instance is optional. ! If not provided, we default to quadratic(). @@ -628,7 +609,7 @@ module subroutine train(self, input_data, output_data, batch_size, & call self % backward(output_data(:,j)) end do - call self % update(batch_size=batch_size) + call self % update(optimizer=optimizer, batch_size=batch_size) end do batch_loop end do epoch_loop @@ -645,34 +626,22 @@ module subroutine update(self, optimizer, batch_size) real, pointer :: weights(:), biases(:), dw(:), db(:) integer :: n - ! Passing the optimizer instance is optional. If not provided, and if the - ! optimizer has not already been set, we default to the default SGD. The - ! instantiation and initialization below of the optimizer is normally done - ! at the beginning of the network % train() method. However, if the user - ! wants to call network % update() directly, for example if they use their - ! own custom mini-batching routine, we initialize the optimizer here as - ! well. 
If it's initialized already, this step is a cheap no-op. - if (.not. allocated(self % optimizer)) then + ! You can optionally pass an optimizer instance to the update() method. + ! This is necessary if you're not using the train() method, for example if + ! you're using your own custom mini-batching routine and calling the + ! forward(), backward(), and update() methods directly. + if (.not. allocated(self % layers(1) % optimizer)) then if (present(optimizer)) then - self % optimizer = optimizer - do n = 1, size(self % layers) self % layers(n) % optimizer = optimizer + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do - else - self % optimizer = sgd() - do n = 1, size(self % layers) self % layers(n) % optimizer = sgd() + call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) end do - end if - - do n = 1, size(self % layers) - call self % layers(n) % optimizer % init(self % layers(n) % get_num_params()) - end do - end if if (present(batch_size)) then
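      ! When update() is driven directly from a user-managed mini-batch loop,
      ! batch_size is the number of samples whose gradients were accumulated
      ! since the last update, so that the gradients are averaged below.
      ! A minimal sketch with hypothetical names (net, x, y, istart, iend):
      !
      !   do i = istart, iend
      !     call net % forward(x(:,i))
      !     call net % backward(y(:,i))
      !   end do
      !   call net % update(optimizer=sgd(learning_rate=0.01), batch_size=iend-istart+1)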