From 362015dccd878d93cac1387e53c8a61e84be31b5 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 11 Feb 2025 00:55:01 +0400 Subject: [PATCH 01/18] layernorm: initial implementation --- src/nf/layernorm.f90 | 159 ++++++++++++++++++++++++++++++++++++++++ test/test_layernorm.f90 | 86 ++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 src/nf/layernorm.f90 create mode 100644 test/test_layernorm.f90 diff --git a/src/nf/layernorm.f90 b/src/nf/layernorm.f90 new file mode 100644 index 00000000..a0219758 --- /dev/null +++ b/src/nf/layernorm.f90 @@ -0,0 +1,159 @@ +module nf_layernorm_layer + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + + implicit none + + private + public :: layernorm_layer + + type, extends(base_layer) :: layernorm_layer + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! https://arxiv.org/abs/1607.06450v1 + integer :: sequence_length + integer :: model_dimension + + real :: eps + real, allocatable :: gamma(:) + real, allocatable :: beta(:) + + real, allocatable :: d_gamma(:) + real, allocatable :: d_beta(:) + real, allocatable :: gradient(:, :) + + real, allocatable :: mu(:, :) + real, allocatable :: sigma(:) + + real, allocatable :: output(:, :) + + contains + procedure :: forward + procedure :: backward + procedure :: spread_by_sequence + procedure :: spread_by_model_dim + procedure :: init + end type layernorm_layer + + interface layernorm_layer + module function layernorm_layer_cons(sequence_length, model_dimension) & + result(res) + integer, intent(in) :: sequence_length, model_dimension + type(layernorm_layer) :: res + end function layernorm_layer_cons + end interface layernorm_layer + +contains + module function layernorm_layer_cons(sequence_length, model_dimension) & + result(res) + integer, intent(in) :: sequence_length, model_dimension + type(layernorm_layer) :: res + + res % sequence_length = sequence_length + res % model_dimension = model_dimension + res % eps = 1e-5 + end function layernorm_layer_cons + + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, allocatable :: normalized(:, :) + integer :: i + + allocate(normalized(self % sequence_length, self % model_dimension)) + + ! mu = x - MEAN_last_dim(x) + do concurrent(i = 1: self % model_dimension) + self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) + end do + + ! square root of variance shifted be eps + self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) + + ! normalize mu by variance by first axis + do concurrent(i = 1: self % model_dimension) + normalized(:, i) = self % mu(:, i) / self % sigma + end do + + ! 
forward through trainable params gamma and beta + do concurrent(i = 1: self % sequence_length) + self % output(i, :) = normalized(i, :) * self % gamma + self % beta + end do + + deallocate(normalized) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + real, allocatable :: one_over_sigma(:, :) + real, allocatable :: gradient_by_gamma_over_sigma(:, :) + + allocate(one_over_sigma(self % sequence_length, self % model_dimension)) + allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) + + one_over_sigma = (1 / self % spread_by_model_dim(self % sigma)) + gradient_by_gamma_over_sigma = gradient * self % spread_by_sequence(self % gamma) * one_over_sigma + + ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) + self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + + ! d_output/d_beta = sum(d_output/d_y) * 1 + self % d_beta = sum(gradient, dim=1) + + ! From this article: + ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ + ! d_output/d_x = d_output/d_y * gamma/sigma + ! - d_output/d_y + ! - sum(d_output/d_y * gamma/sigma) / len + ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len + self % gradient = & + gradient_by_gamma_over_sigma & + - self % spread_by_model_dim(sum(gradient_by_gamma_over_sigma, dim=2)) / self % model_dimension & + - self % mu * self % spread_by_model_dim(sum(& + gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2),& + dim=2)& + ) / self % model_dimension + + deallocate(one_over_sigma) + deallocate(gradient_by_gamma_over_sigma) + end subroutine backward + + pure function spread_by_sequence(self, input) result(output) + class(layernorm_layer), intent(in) :: self + real, intent(in) :: input(:) + real :: output(self % sequence_length, self % model_dimension) + + output = spread(input, dim=1, ncopies=self % sequence_length) + end function spread_by_sequence + + pure function spread_by_model_dim(self, input) result(output) + class(layernorm_layer), intent(in) :: self + real, intent(in) :: input(:) + real :: output(self % sequence_length, self % model_dimension) + + output = spread(input, dim=2, ncopies=self % model_dimension) + end function spread_by_model_dim + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + ! default initialization from PyTorch + allocate(self % gamma(self % model_dimension)) + self % gamma = 1. + allocate(self % beta(self % model_dimension)) + self % beta = 0. + + allocate(self % d_gamma(self % model_dimension)) + allocate(self % d_beta(self % model_dimension)) + allocate(self % gradient(self % sequence_length, self % model_dimension)) + + allocate(self % mu(self % sequence_length, self % model_dimension)) + allocate(self % sigma(self % sequence_length)) + + allocate(self % output(self % sequence_length, self % model_dimension)) + end subroutine init +end module nf_layernorm_layer \ No newline at end of file diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 new file mode 100644 index 00000000..85326fad --- /dev/null +++ b/test/test_layernorm.f90 @@ -0,0 +1,86 @@ +program test_layernorm + use iso_fortran_env, only: stderr => error_unit + use nf_layernorm_layer, only: layernorm_layer + implicit none + + logical :: ok = .true. 
+ type(layernorm_layer) :: layernorm + real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) + real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) + + layernorm = layernorm_layer(3, 4) + call layernorm % init([0]) + + call test_layernorm_forward(layernorm, sample_input, ok) + call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + +contains + subroutine test_layernorm_forward(layernorm, input, ok) + type(layernorm_layer), intent(in out) :: layernorm + real, intent(in out) :: input(:, :) + logical, intent(in out) :: ok + real :: output_shape(2) + real :: output_flat(12) + real :: expected_shape(2) = [3, 4] + real :: expected_output_flat(12) = [& + -0.693158746, 0.939844191, -0.992156327, 1.72702277, -0.970368207, 0.971188426,& + -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564& + ] + + call layernorm % forward(input) + + output_shape = shape(layernorm % output) + if (.not. all(output_shape.eq.expected_shape)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect shape.. failed' + end if + output_flat = reshape(layernorm % output, shape(output_flat)) + if (.not. all(output_flat.eq.expected_output_flat)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect values.. failed' + end if + end subroutine test_layernorm_forward + + subroutine test_layernorm_backward(layernorm, input, gradient, ok) + type(layernorm_layer), intent(in out) :: layernorm + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + + real :: gradient_shape(2) + real :: gradient_flat(12) + real :: expected_gradient_shape(2) = [3, 4] + real :: expected_gradient_flat(12) = [& + -0.227230772, 0.103088334, -9.88590196E-02, -2.86390483E-02, 0.283811331, 0.277955681,& + -0.215662330, -0.105019525, -0.269407451, 0.471532196, -0.281880081, 9.03107598E-02& + ] + + real :: d_gamma(4) + real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449] + real :: d_beta(4) + real :: expected_d_beta(4) = [5.09999990, 6.09999990, 2.19999981, 6.09999990] + + call layernorm % backward(input, gradient) + + gradient_shape = shape(layernorm % gradient) + if (.not. all(gradient_shape.eq.expected_gradient_shape)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' + end if + gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) + if (.not. all(gradient_flat.eq.expected_gradient_flat)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' + end if + + if (.not. all(layernorm % d_gamma.eq.expected_d_gamma)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' + end if + if (.not. all(layernorm % d_beta.eq.expected_d_beta)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' + end if + end subroutine test_layernorm_backward + +end program test_layernorm From 005daf21a5286eb94a53f16d1a31f44e40be9bd8 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 11 Feb 2025 00:58:46 +0400 Subject: [PATCH 02/18] layernorm: rename source file --- src/nf/{layernorm.f90 => nf_layernorm.f90} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/nf/{layernorm.f90 => nf_layernorm.f90} (100%) diff --git a/src/nf/layernorm.f90 b/src/nf/nf_layernorm.f90 similarity index 100% rename from src/nf/layernorm.f90 rename to src/nf/nf_layernorm.f90 From d657fa787a9f18d54084295ceac7e8752f3f3407 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 17:31:05 +0400 Subject: [PATCH 03/18] layernorm: remove redundant arguments --- src/nf/nf_layernorm.f90 | 14 ++++++++------ test/test_layernorm.f90 | 11 +++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index a0219758..dc81bcd1 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -37,21 +37,17 @@ module nf_layernorm_layer end type layernorm_layer interface layernorm_layer - module function layernorm_layer_cons(sequence_length, model_dimension) & + module function layernorm_layer_cons() & result(res) - integer, intent(in) :: sequence_length, model_dimension type(layernorm_layer) :: res end function layernorm_layer_cons end interface layernorm_layer contains - module function layernorm_layer_cons(sequence_length, model_dimension) & + module function layernorm_layer_cons() & result(res) - integer, intent(in) :: sequence_length, model_dimension type(layernorm_layer) :: res - res % sequence_length = sequence_length - res % model_dimension = model_dimension res % eps = 1e-5 end function layernorm_layer_cons @@ -141,6 +137,12 @@ module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) + if (size(input_shape) /= 2) then + error stop "LayerNorm Layer accepts 2D input" + end if + self % sequence_length = input_shape(1) + self % model_dimension = input_shape(2) + ! default initialization from PyTorch allocate(self % gamma(self % model_dimension)) self % gamma = 1. diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 85326fad..f51d6221 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -8,12 +8,19 @@ program test_layernorm real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) - layernorm = layernorm_layer(3, 4) - call layernorm % init([0]) + layernorm = layernorm_layer() + call layernorm % init([3, 4]) call test_layernorm_forward(layernorm, sample_input, ok) call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + if (ok) then + print '(a)', 'test_layernorm_layer: All tests passed.' + else + write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.' 
+ stop 1 + end if + contains subroutine test_layernorm_forward(layernorm, input, ok) type(layernorm_layer), intent(in out) :: layernorm From 0dbaf07aecea1d9f3710f63e7d4e1626a3a44dc1 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 18:41:55 +0400 Subject: [PATCH 04/18] layernorm: remove stack allocated arrays --- src/nf/nf_layernorm.f90 | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index dc81bcd1..ac3044f8 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -31,8 +31,6 @@ module nf_layernorm_layer contains procedure :: forward procedure :: backward - procedure :: spread_by_sequence - procedure :: spread_by_model_dim procedure :: init end type layernorm_layer @@ -90,8 +88,11 @@ pure module subroutine backward(self, input, gradient) allocate(one_over_sigma(self % sequence_length, self % model_dimension)) allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - one_over_sigma = (1 / self % spread_by_model_dim(self % sigma)) - gradient_by_gamma_over_sigma = gradient * self % spread_by_sequence(self % gamma) * one_over_sigma + one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + gradient_by_gamma_over_sigma = & + gradient & + * spread(self % gamma, dim=1, ncopies=self % sequence_length) & + * one_over_sigma ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) @@ -107,32 +108,21 @@ pure module subroutine backward(self, input, gradient) ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len self % gradient = & gradient_by_gamma_over_sigma & - - self % spread_by_model_dim(sum(gradient_by_gamma_over_sigma, dim=2)) / self % model_dimension & - - self % mu * self % spread_by_model_dim(sum(& - gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2),& - dim=2)& - ) / self % model_dimension + - spread(& + sum(gradient_by_gamma_over_sigma, dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension & + - self % mu * spread(& + sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension deallocate(one_over_sigma) deallocate(gradient_by_gamma_over_sigma) end subroutine backward - pure function spread_by_sequence(self, input) result(output) - class(layernorm_layer), intent(in) :: self - real, intent(in) :: input(:) - real :: output(self % sequence_length, self % model_dimension) - - output = spread(input, dim=1, ncopies=self % sequence_length) - end function spread_by_sequence - - pure function spread_by_model_dim(self, input) result(output) - class(layernorm_layer), intent(in) :: self - real, intent(in) :: input(:) - real :: output(self % sequence_length, self % model_dimension) - - output = spread(input, dim=2, ncopies=self % model_dimension) - end function spread_by_model_dim - module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) From 612db46e10f03a0f766cf1f62ade4c183cb24add Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 18:45:07 +0400 Subject: [PATCH 05/18] layernorm: rearrange into submodule --- src/nf/nf_layernorm.f90 | 124 ++++-------------------------- src/nf/nf_layernorm_submodule.f90 | 110 ++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 107 deletions(-) create mode 100644 
src/nf/nf_layernorm_submodule.f90 diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index ac3044f8..245f2870 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -41,111 +41,21 @@ module function layernorm_layer_cons() & end function layernorm_layer_cons end interface layernorm_layer -contains - module function layernorm_layer_cons() & - result(res) - type(layernorm_layer) :: res - - res % eps = 1e-5 - end function layernorm_layer_cons - - pure module subroutine forward(self, input) - class(layernorm_layer), intent(in out) :: self - real, intent(in) :: input(:, :) - real, allocatable :: normalized(:, :) - integer :: i - - allocate(normalized(self % sequence_length, self % model_dimension)) - - ! mu = x - MEAN_last_dim(x) - do concurrent(i = 1: self % model_dimension) - self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) - end do - - ! square root of variance shifted be eps - self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) - - ! normalize mu by variance by first axis - do concurrent(i = 1: self % model_dimension) - normalized(:, i) = self % mu(:, i) / self % sigma - end do - - ! forward through trainable params gamma and beta - do concurrent(i = 1: self % sequence_length) - self % output(i, :) = normalized(i, :) * self % gamma + self % beta - end do - - deallocate(normalized) - end subroutine forward - - pure module subroutine backward(self, input, gradient) - class(layernorm_layer), intent(in out) :: self - real, intent(in) :: input(:, :) - real, intent(in) :: gradient(:, :) - real, allocatable :: one_over_sigma(:, :) - real, allocatable :: gradient_by_gamma_over_sigma(:, :) - - allocate(one_over_sigma(self % sequence_length, self % model_dimension)) - allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - - one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) - gradient_by_gamma_over_sigma = & - gradient & - * spread(self % gamma, dim=1, ncopies=self % sequence_length) & - * one_over_sigma - - ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) - self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) - - ! d_output/d_beta = sum(d_output/d_y) * 1 - self % d_beta = sum(gradient, dim=1) - - ! From this article: - ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ - ! d_output/d_x = d_output/d_y * gamma/sigma - ! - d_output/d_y - ! - sum(d_output/d_y * gamma/sigma) / len - ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len - self % gradient = & - gradient_by_gamma_over_sigma & - - spread(& - sum(gradient_by_gamma_over_sigma, dim=2),& - dim=2,& - ncopies=self % model_dimension& - ) / self % model_dimension & - - self % mu * spread(& - sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& - dim=2,& - ncopies=self % model_dimension& - ) / self % model_dimension - - deallocate(one_over_sigma) - deallocate(gradient_by_gamma_over_sigma) - end subroutine backward - - module subroutine init(self, input_shape) - class(layernorm_layer), intent(in out) :: self - integer, intent(in) :: input_shape(:) - - if (size(input_shape) /= 2) then - error stop "LayerNorm Layer accepts 2D input" - end if - self % sequence_length = input_shape(1) - self % model_dimension = input_shape(2) - - ! default initialization from PyTorch - allocate(self % gamma(self % model_dimension)) - self % gamma = 1. - allocate(self % beta(self % model_dimension)) - self % beta = 0. 
- - allocate(self % d_gamma(self % model_dimension)) - allocate(self % d_beta(self % model_dimension)) - allocate(self % gradient(self % sequence_length, self % model_dimension)) - - allocate(self % mu(self % sequence_length, self % model_dimension)) - allocate(self % sigma(self % sequence_length)) - - allocate(self % output(self % sequence_length, self % model_dimension)) - end subroutine init + interface + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + end subroutine init + end interface end module nf_layernorm_layer \ No newline at end of file diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 new file mode 100644 index 00000000..52b7a426 --- /dev/null +++ b/src/nf/nf_layernorm_submodule.f90 @@ -0,0 +1,110 @@ +submodule(nf_layernorm_layer) nf_layernorm_layer_submodule + implicit none +contains + module function layernorm_layer_cons() & + result(res) + type(layernorm_layer) :: res + + res % eps = 1e-5 + end function layernorm_layer_cons + + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, allocatable :: normalized(:, :) + integer :: i + + allocate(normalized(self % sequence_length, self % model_dimension)) + + ! mu = x - MEAN_last_dim(x) + do concurrent(i = 1: self % model_dimension) + self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) + end do + + ! square root of variance shifted be eps + self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) + + ! normalize mu by variance by first axis + do concurrent(i = 1: self % model_dimension) + normalized(:, i) = self % mu(:, i) / self % sigma + end do + + ! forward through trainable params gamma and beta + do concurrent(i = 1: self % sequence_length) + self % output(i, :) = normalized(i, :) * self % gamma + self % beta + end do + + deallocate(normalized) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + real, allocatable :: one_over_sigma(:, :) + real, allocatable :: gradient_by_gamma_over_sigma(:, :) + + allocate(one_over_sigma(self % sequence_length, self % model_dimension)) + allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) + + one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + gradient_by_gamma_over_sigma = & + gradient & + * spread(self % gamma, dim=1, ncopies=self % sequence_length) & + * one_over_sigma + + ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) + self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + + ! d_output/d_beta = sum(d_output/d_y) * 1 + self % d_beta = sum(gradient, dim=1) + + ! From this article: + ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ + ! d_output/d_x = d_output/d_y * gamma/sigma + ! - d_output/d_y + ! - sum(d_output/d_y * gamma/sigma) / len + ! 
- mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len + self % gradient = & + gradient_by_gamma_over_sigma & + - spread(& + sum(gradient_by_gamma_over_sigma, dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension & + - self % mu * spread(& + sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension + + deallocate(one_over_sigma) + deallocate(gradient_by_gamma_over_sigma) + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + if (size(input_shape) /= 2) then + error stop "LayerNorm Layer accepts 2D input" + end if + self % sequence_length = input_shape(1) + self % model_dimension = input_shape(2) + + ! default initialization from PyTorch + allocate(self % gamma(self % model_dimension)) + self % gamma = 1. + allocate(self % beta(self % model_dimension)) + self % beta = 0. + + allocate(self % d_gamma(self % model_dimension)) + allocate(self % d_beta(self % model_dimension)) + allocate(self % gradient(self % sequence_length, self % model_dimension)) + + allocate(self % mu(self % sequence_length, self % model_dimension)) + allocate(self % sigma(self % sequence_length)) + + allocate(self % output(self % sequence_length, self % model_dimension)) + end subroutine init +end submodule nf_layernorm_layer_submodule From c4a3e3cfc41f88fc1aa9651087c1289852be1bc4 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Fri, 21 Feb 2025 22:21:06 +0400 Subject: [PATCH 06/18] layernorm: add error to stop in test --- test/test_layernorm.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index f51d6221..fec4b67b 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -18,7 +18,7 @@ program test_layernorm print '(a)', 'test_layernorm_layer: All tests passed.' else write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.' 
- stop 1 + error stop 1 end if contains From bdefd02a12a10164dc6c79145f747a12f4d8497c Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:00:52 +0400 Subject: [PATCH 07/18] layernorm: add gradient updates --- src/nf/nf_layernorm.f90 | 27 +++++++++++++++++ src/nf/nf_layernorm_submodule.f90 | 48 +++++++++++++++++++++++++++++ test/test_layernorm.f90 | 50 +++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 245f2870..b148c534 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -32,6 +32,10 @@ module nf_layernorm_layer procedure :: forward procedure :: backward procedure :: init + procedure :: get_num_params + procedure :: get_params + procedure :: get_gradients + procedure :: set_params end type layernorm_layer interface layernorm_layer @@ -57,5 +61,28 @@ module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + end subroutine set_params end interface end module nf_layernorm_layer \ No newline at end of file diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 52b7a426..4f6eae78 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -107,4 +107,52 @@ module subroutine init(self, input_shape) allocate(self % output(self % sequence_length, self % model_dimension)) end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + + ! Number of weights times number of biases + num_params = 2 * self % model_dimension + + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + + params = [ & + self % gamma, & + self % beta & + ] + + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + + gradients = [ & + self % d_gamma, & + self % d_beta & + ] + + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + + ! 
check if the number of parameters is correct + if (size(params) /= self % get_num_params()) then + error stop 'Error: number of parameters does not match' + end if + + self % gamma = params(1: self % model_dimension) + self % beta = params(self % model_dimension + 1: 2 * self % model_dimension) + + end subroutine set_params end submodule nf_layernorm_layer_submodule diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index fec4b67b..636089d5 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -1,6 +1,7 @@ program test_layernorm use iso_fortran_env, only: stderr => error_unit use nf_layernorm_layer, only: layernorm_layer + use nf, only: sgd implicit none logical :: ok = .true. @@ -13,6 +14,7 @@ program test_layernorm call test_layernorm_forward(layernorm, sample_input, ok) call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + call test_layernorm_gradients(sample_input, sample_gradient, ok) if (ok) then print '(a)', 'test_layernorm_layer: All tests passed.' @@ -90,4 +92,52 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + type(layernorm_layer) :: layernorm + type(sgd) :: optim + + real :: parameters(8) + real :: expected_parameters(8) + real :: updated_output(12) + real :: expected_updated_output(12) = [& + -0.738849819, 0.881645918, -1.03555739,& + 1.66299772, -1.02966857, 0.908487320,& + -0.562230229, 1.01311040, 0.984123051,& + -0.564699769, -1.13543355, -1.11444426& + ] + + layernorm = layernorm_layer() + call layernorm % init([3, 4]) + + call layernorm % forward(input) + call layernorm % backward(input, gradient) + + if (layernorm % get_num_params() /= 8) then + ok = .false. + write(stderr, '(a)') 'incorrect number of parameters.. failed' + end if + + expected_parameters(1: 4) = 1. + expected_parameters(5: 8) = 0. + parameters = layernorm % get_params() + if (.not. all(parameters.eq.expected_parameters)) then + ok = .false. + write(stderr, '(a)') 'incorrect parameters.. failed' + end if + + optim = SGD(learning_rate=0.01) + call optim % minimize(parameters, layernorm % get_gradients()) + call layernorm % set_params(parameters) + + call layernorm % forward(input) + + updated_output = reshape(layernorm % output, [12]) + if (.not. all(updated_output.eq.expected_updated_output)) then + ok = .false. + write(stderr, '(a)') 'incorrect output after parameters update.. 
failed' + end if + end subroutine test_layernorm_gradients end program test_layernorm From ccc180ee427efa6b65a0866680466738d5e0a484 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:24:19 +0400 Subject: [PATCH 08/18] layernorm: public api --- src/nf.f90 | 3 +- src/nf/nf_layer_constructors.f90 | 29 ++++++++----- src/nf/nf_layer_constructors_submodule.f90 | 8 ++++ src/nf/nf_layer_submodule.f90 | 50 ++++++++++++++++++++-- src/nf/nf_network_submodule.f90 | 3 ++ 5 files changed, 78 insertions(+), 15 deletions(-) diff --git a/src/nf.f90 b/src/nf.f90 index 39f67ea3..67d18ea2 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -11,7 +11,8 @@ module nf linear2d, & maxpool2d, & reshape, & - self_attention + self_attention, & + layer_normalization use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index db60cf0f..e2fd50a8 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -17,7 +17,8 @@ module nf_layer_constructors linear2d, & maxpool2d, & reshape, & - self_attention + self_attention, & + layer_normalization interface input @@ -222,15 +223,23 @@ module function linear2d(out_features) result(res) !! Resulting layer instance end function linear2d - module function self_attention(num_heads) result(res) - !! Rank-2 (sequence_length, out_features) self attention constructor. - !! sequence_length and model_dimension are determined at layer initialization, based on the - !! output shape of the previous layer. - integer, intent(in) :: num_heads - !! Number of attention heads - type(layer) :: res - !! Resulting layer instance - end function self_attention + module function self_attention(num_heads) result(res) + !! Rank-2 (sequence_length, out_features) self attention constructor. + !! sequence_length and model_dimension are determined at layer initialization, based on the + !! output shape of the previous layer. + integer, intent(in) :: num_heads + !! Number of attention heads + type(layer) :: res + !! Resulting layer instance + end function self_attention + + module function layer_normalization() result(res) + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! 
https://arxiv.org/abs/1607.06450v1 + type(layer) :: res + end function layer_normalization end interface diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 9e5322c1..0b33f3c6 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -12,6 +12,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_activation, only: activation_function, relu, sigmoid implicit none @@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res) allocate(res % p, source=self_attention_layer(num_heads)) end function self_attention + module function layer_normalization() result(res) + type(layer) :: res + + res % name = 'layer_normalization' + allocate(res % p, source=layernorm_layer()) + end function layer_normalization + end submodule nf_layer_constructors_submodule diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index ecdeb41d..801c7754 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -12,6 +12,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_optimizers, only: optimizer_base_type contains @@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select end select @@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select type is(self_attention_layer) @@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select + type is(layernorm_layer) + + select type(prev_layer => previous % p) + type is(linear2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(self_attention_layer) + call this_layer % backward(prev_layer % output, gradient) + end select end select end subroutine backward_2d @@ -250,7 +265,7 @@ module subroutine forward(self, input) type is(linear2d_layer) - ! Upstream layers permitted: input2d, linear2d + ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -258,11 +273,13 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(self_attention_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(self_attention_layer) - ! Upstream layers permitted: input2d, linear2d + ! 
Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -270,6 +287,18 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(self_attention_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(layernorm_layer) + + ! Upstream layers permitted: linear2d, self_attention + select type(prev_layer => input % p) + type is(linear2d_layer) + call this_layer % forward(prev_layer % output) + type is(self_attention_layer) + call this_layer % forward(prev_layer % output) end select end select @@ -311,6 +340,8 @@ pure module subroutine get_output_2d(self, output) allocate(output, source=this_layer % output) type is(self_attention_layer) allocate(output, source=this_layer % output) + type is(layernorm_layer) + allocate(output, source=this_layer % output) class default error stop '2-d output can only be read from an input2d or linear2d layer.' @@ -354,8 +385,8 @@ impure elemental module subroutine init(self, input) call this_layer % init(input % layer_shape) end select - ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or - ! self_attention layers is not known until we receive an input layer. + ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, + ! self_attention or layernorm layers is not known until we receive an input layer. select type(this_layer => self % p) type is(conv2d_layer) self % layer_shape = shape(this_layer % output) @@ -367,6 +398,8 @@ impure elemental module subroutine init(self, input) self % layer_shape = shape(this_layer % output) type is(self_attention_layer) self % layer_shape = shape(this_layer % output) + type is(layernorm_layer) + self % layer_shape = shape(this_layer % output) type is(maxpool2d_layer) self % layer_shape = shape(this_layer % output) end select @@ -425,6 +458,8 @@ elemental module function get_num_params(self) result(num_params) num_params = this_layer % get_num_params() type is (self_attention_layer) num_params = this_layer % get_num_params() + type is (layernorm_layer) + num_params = this_layer % get_num_params() class default error stop 'Unknown layer type.' end select @@ -458,6 +493,8 @@ module function get_params(self) result(params) params = this_layer % get_params() type is (self_attention_layer) params = this_layer % get_params() + type is (layernorm_layer) + params = this_layer % get_params() class default error stop 'Unknown layer type.' end select @@ -491,6 +528,8 @@ module function get_gradients(self) result(gradients) gradients = this_layer % get_gradients() type is (self_attention_layer) gradients = this_layer % get_gradients() + type is (layernorm_layer) + gradients = this_layer % get_gradients() class default error stop 'Unknown layer type.' end select @@ -549,6 +588,9 @@ module subroutine set_params(self, params) type is (self_attention_layer) call this_layer % set_params(params) + type is (layernorm_layer) + call this_layer % set_params(params) + type is (maxpool2d_layer) ! No parameters to set. 
write(stderr, '(a)') 'Warning: calling set_params() ' & diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index f344c5c5..a6b7657c 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -11,6 +11,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_layer, only: layer use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape use nf_loss, only: quadratic @@ -163,6 +164,8 @@ module subroutine backward(self, output, loss) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(self_attention_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(layernorm_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) end select end if From 06670008815a0cf11baa8054ab6599839fdeb383 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:34:35 +0400 Subject: [PATCH 09/18] layernorm: update tests --- test/test_layernorm.f90 | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 636089d5..75af7f26 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -24,6 +24,14 @@ program test_layernorm end if contains + function allclose(x, y) result(res) + real, intent(in) :: x(:) + real, intent(in) :: y(:) + logical :: res + + res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + end function allclose + subroutine test_layernorm_forward(layernorm, input, ok) type(layernorm_layer), intent(in out) :: layernorm real, intent(in out) :: input(:, :) @@ -44,7 +52,7 @@ subroutine test_layernorm_forward(layernorm, input, ok) write(stderr, '(a)') 'forward returned incorrect shape.. failed' end if output_flat = reshape(layernorm % output, shape(output_flat)) - if (.not. all(output_flat.eq.expected_output_flat)) then + if (.not. allclose(output_flat, expected_output_flat)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect values.. failed' end if @@ -67,7 +75,7 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) real :: d_gamma(4) real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449] real :: d_beta(4) - real :: expected_d_beta(4) = [5.09999990, 6.09999990, 2.19999981, 6.09999990] + real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1] call layernorm % backward(input, gradient) @@ -77,16 +85,16 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' end if gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) - if (.not. all(gradient_flat.eq.expected_gradient_flat)) then + if (.not. allclose(gradient_flat, expected_gradient_flat)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' end if - if (.not. all(layernorm % d_gamma.eq.expected_d_gamma)) then + if (.not. allclose(layernorm % d_gamma, expected_d_gamma)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' end if - if (.not. all(layernorm % d_beta.eq.expected_d_beta)) then + if (.not. allclose(layernorm % d_beta, expected_d_beta)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' end if @@ -135,7 +143,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) call layernorm % forward(input) updated_output = reshape(layernorm % output, [12]) - if (.not. all(updated_output.eq.expected_updated_output)) then + if (.not. allclose(updated_output, expected_updated_output)) then ok = .false. write(stderr, '(a)') 'incorrect output after parameters update.. failed' end if From c2a1e7052806d18bad466b82dd95136f7c03123e Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:34:52 +0400 Subject: [PATCH 10/18] layernorm: update cmake --- CMakeLists.txt | 2 ++ test/CMakeLists.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1bf2231..906bdd0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,8 @@ add_library(neural-fortran src/nf/nf_input3d_layer_submodule.f90 src/nf/nf_layer_constructors.f90 src/nf/nf_layer_constructors_submodule.f90 + src/nf/nf_layernorm.f90 + src/nf/nf_layernorm_submodule.f90 src/nf/nf_layer.f90 src/nf/nf_layer_submodule.f90 src/nf/nf_linear2d_layer.f90 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 741e9930..46d349c1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,6 +12,7 @@ foreach(execid insert_flatten reshape_layer multihead_attention_layer + layernorm dense_network get_set_network_params conv2d_network From ddcd204464eba88527b9806df8d643bfbcbab7ed Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 20:50:27 +0400 Subject: [PATCH 11/18] layernorm: use mold for temp allocation --- src/nf/nf_layernorm_submodule.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4f6eae78..744888bd 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -14,7 +14,7 @@ pure module subroutine forward(self, input) real, allocatable :: normalized(:, :) integer :: i - allocate(normalized(self % sequence_length, self % model_dimension)) + allocate(normalized, mold=self % mu) ! mu = x - MEAN_last_dim(x) do concurrent(i = 1: self % model_dimension) From 54d081feebfc1923d1987aff03be1c8518585846 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 10:51:52 +0400 Subject: [PATCH 12/18] layernorm: rename to layernorm --- src/nf.f90 | 2 +- src/nf/nf_layer_constructors.f90 | 6 +++--- src/nf/nf_layer_constructors_submodule.f90 | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nf.f90 b/src/nf.f90 index 67d18ea2..d089c4ac 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -12,7 +12,7 @@ module nf maxpool2d, & reshape, & self_attention, & - layer_normalization + layernorm use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index e2fd50a8..ce00b6bc 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -18,7 +18,7 @@ module nf_layer_constructors maxpool2d, & reshape, & self_attention, & - layer_normalization + layernorm interface input @@ -233,13 +233,13 @@ module function self_attention(num_heads) result(res) !! Resulting layer instance end function self_attention - module function layer_normalization() result(res) + module function layernorm() result(res) !! Layer Normalization !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: !! 
https://arxiv.org/abs/1607.06450v1 type(layer) :: res - end function layer_normalization + end function layernorm end interface diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 0b33f3c6..5c2e8893 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -180,11 +180,11 @@ module function self_attention(num_heads) result(res) allocate(res % p, source=self_attention_layer(num_heads)) end function self_attention - module function layer_normalization() result(res) + module function layernorm() result(res) type(layer) :: res - res % name = 'layer_normalization' + res % name = 'layernorm' allocate(res % p, source=layernorm_layer()) - end function layer_normalization + end function layernorm end submodule nf_layer_constructors_submodule From 6ec65acb10821046e5343fc6f19f0541d777da01 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:49:23 +0400 Subject: [PATCH 13/18] layernorm: allow usage of layernorm at the end --- src/nf/nf_layer_submodule.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 801c7754..39cdac1a 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -249,6 +249,8 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(linear2d_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(reshape3d_layer) From 720b79b37420d26d99bcf04c15dca34a8d959843 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:49:54 +0400 Subject: [PATCH 14/18] layernorm: integration test for layernorm --- test/test_layernorm.f90 | 106 ++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 75af7f26..15f45aed 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -1,20 +1,23 @@ -program test_layernorm +program test_layernorm_instance use iso_fortran_env, only: stderr => error_unit use nf_layernorm_layer, only: layernorm_layer - use nf, only: sgd + use nf_linear2d_layer, only: linear2d_layer + use nf_layer, only: layer + use nf, only: sgd, layernorm, network, input, flatten, linear2d implicit none logical :: ok = .true. - type(layernorm_layer) :: layernorm + type(layernorm_layer) :: layernorm_instance real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) - layernorm = layernorm_layer() - call layernorm % init([3, 4]) + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) - call test_layernorm_forward(layernorm, sample_input, ok) - call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + call test_layernorm_forward(layernorm_instance, sample_input, ok) + call test_layernorm_backward(layernorm_instance, sample_input, sample_gradient, ok) call test_layernorm_gradients(sample_input, sample_gradient, ok) + call test_layernorm_integration(ok) if (ok) then print '(a)', 'test_layernorm_layer: All tests passed.' 
@@ -32,8 +35,8 @@ function allclose(x, y) result(res) res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) end function allclose - subroutine test_layernorm_forward(layernorm, input, ok) - type(layernorm_layer), intent(in out) :: layernorm + subroutine test_layernorm_forward(layernorm_instance, input, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) logical, intent(in out) :: ok real :: output_shape(2) @@ -44,22 +47,22 @@ subroutine test_layernorm_forward(layernorm, input, ok) -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564& ] - call layernorm % forward(input) + call layernorm_instance % forward(input) - output_shape = shape(layernorm % output) + output_shape = shape(layernorm_instance % output) if (.not. all(output_shape.eq.expected_shape)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect shape.. failed' end if - output_flat = reshape(layernorm % output, shape(output_flat)) + output_flat = reshape(layernorm_instance % output, shape(output_flat)) if (.not. allclose(output_flat, expected_output_flat)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect values.. failed' end if end subroutine test_layernorm_forward - subroutine test_layernorm_backward(layernorm, input, gradient, ok) - type(layernorm_layer), intent(in out) :: layernorm + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) logical, intent(in out) :: ok @@ -77,24 +80,24 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) real :: d_beta(4) real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1] - call layernorm % backward(input, gradient) + call layernorm_instance % backward(input, gradient) - gradient_shape = shape(layernorm % gradient) + gradient_shape = shape(layernorm_instance % gradient) if (.not. all(gradient_shape.eq.expected_gradient_shape)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' end if - gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) + gradient_flat = reshape(layernorm_instance % gradient, shape(gradient_flat)) if (.not. allclose(gradient_flat, expected_gradient_flat)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' end if - if (.not. allclose(layernorm % d_gamma, expected_d_gamma)) then + if (.not. allclose(layernorm_instance % d_gamma, expected_d_gamma)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' end if - if (.not. allclose(layernorm % d_beta, expected_d_beta)) then + if (.not. allclose(layernorm_instance % d_beta, expected_d_beta)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' end if @@ -104,7 +107,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) logical, intent(in out) :: ok - type(layernorm_layer) :: layernorm + type(layernorm_layer) :: layernorm_instance type(sgd) :: optim real :: parameters(8) @@ -117,35 +120,76 @@ subroutine test_layernorm_gradients(input, gradient, ok) -0.564699769, -1.13543355, -1.11444426& ] - layernorm = layernorm_layer() - call layernorm % init([3, 4]) + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) - call layernorm % forward(input) - call layernorm % backward(input, gradient) + call layernorm_instance % forward(input) + call layernorm_instance % backward(input, gradient) - if (layernorm % get_num_params() /= 8) then + if (layernorm_instance % get_num_params() /= 8) then ok = .false. write(stderr, '(a)') 'incorrect number of parameters.. failed' end if expected_parameters(1: 4) = 1. expected_parameters(5: 8) = 0. - parameters = layernorm % get_params() + parameters = layernorm_instance % get_params() if (.not. all(parameters.eq.expected_parameters)) then ok = .false. write(stderr, '(a)') 'incorrect parameters.. failed' end if optim = SGD(learning_rate=0.01) - call optim % minimize(parameters, layernorm % get_gradients()) - call layernorm % set_params(parameters) + call optim % minimize(parameters, layernorm_instance % get_gradients()) + call layernorm_instance % set_params(parameters) - call layernorm % forward(input) + call layernorm_instance % forward(input) - updated_output = reshape(layernorm % output, [12]) + updated_output = reshape(layernorm_instance % output, [12]) if (.not. allclose(updated_output, expected_updated_output)) then ok = .false. write(stderr, '(a)') 'incorrect output after parameters update.. failed' end if end subroutine test_layernorm_gradients -end program test_layernorm + + subroutine test_layernorm_integration(ok) + logical, intent(in out) :: ok + + type(network) :: net + real :: x(2, 3) = reshape([0.1, 2., 0.3, 4., 0.5, 6.], [2, 3]) + real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] + real :: tolerance = 0.1 + integer :: epoch + integer :: epochs = 10000 + + net = network([& + input(2, 3),& + linear2d(3),& + layernorm(),& + flatten()& + ]) + + ! Kaiming weights to achieve semblance of convergance + select type(l => net % layers(2) % p) + type is(linear2d_layer) + call random_number(l % weights) + l % weights = l % weights * sqrt(2. / 6.) + l % biases = 0.2 + end select + + do epoch = 1, epochs + call net % forward(x) + call net % backward(y) + call net % update(optimizer=sgd(learning_rate=0.001)) + if (all(abs(net % predict(x) - y) < tolerance)) exit + end do + print *, abs(net % predict(x) - y) + + print *, epoch + if (.not. epoch <= epochs) then + write(stderr, '(a)') & + 'linear2d + layernorm should converge in simple training.. failed' + ok = .false. 
+ end if + end subroutine test_layernorm_integration +end program test_layernorm_instance From 981addd1c8435732d6559a87e1c2d9ee587df249 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:59:48 +0400 Subject: [PATCH 15/18] layernorm: memory allocation optimization --- src/nf/nf_layernorm.f90 | 4 ++++ src/nf/nf_layernorm_submodule.f90 | 35 ++++++++++++------------------- test/test_layernorm.f90 | 2 -- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index b148c534..36ef56f0 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -28,6 +28,10 @@ module nf_layernorm_layer real, allocatable :: output(:, :) + ! temp storages + real, allocatable, private :: normalized(:, :) + real, allocatable, private :: one_over_sigma(:, :) + real, allocatable, private :: gradient_by_gamma_over_sigma(:, :) contains procedure :: forward procedure :: backward diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 744888bd..4eaa4382 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -11,11 +11,8 @@ end function layernorm_layer_cons pure module subroutine forward(self, input) class(layernorm_layer), intent(in out) :: self real, intent(in) :: input(:, :) - real, allocatable :: normalized(:, :) integer :: i - allocate(normalized, mold=self % mu) - ! mu = x - MEAN_last_dim(x) do concurrent(i = 1: self % model_dimension) self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) @@ -26,35 +23,28 @@ pure module subroutine forward(self, input) ! normalize mu by variance by first axis do concurrent(i = 1: self % model_dimension) - normalized(:, i) = self % mu(:, i) / self % sigma + self % normalized(:, i) = self % mu(:, i) / self % sigma end do ! forward through trainable params gamma and beta do concurrent(i = 1: self % sequence_length) - self % output(i, :) = normalized(i, :) * self % gamma + self % beta + self % output(i, :) = self % normalized(i, :) * self % gamma + self % beta end do - - deallocate(normalized) end subroutine forward pure module subroutine backward(self, input, gradient) class(layernorm_layer), intent(in out) :: self real, intent(in) :: input(:, :) real, intent(in) :: gradient(:, :) - real, allocatable :: one_over_sigma(:, :) - real, allocatable :: gradient_by_gamma_over_sigma(:, :) - - allocate(one_over_sigma(self % sequence_length, self % model_dimension)) - allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) - gradient_by_gamma_over_sigma = & + self % one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + self % gradient_by_gamma_over_sigma = & gradient & * spread(self % gamma, dim=1, ncopies=self % sequence_length) & - * one_over_sigma + * self % one_over_sigma ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) - self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + self % d_gamma = sum(gradient * self % mu * self % one_over_sigma, dim=1) ! d_output/d_beta = sum(d_output/d_y) * 1 self % d_beta = sum(gradient, dim=1) @@ -66,20 +56,17 @@ pure module subroutine backward(self, input, gradient) ! - sum(d_output/d_y * gamma/sigma) / len ! 
- mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len self % gradient = & - gradient_by_gamma_over_sigma & + self % gradient_by_gamma_over_sigma & - spread(& - sum(gradient_by_gamma_over_sigma, dim=2),& + sum(self % gradient_by_gamma_over_sigma, dim=2),& dim=2,& ncopies=self % model_dimension& ) / self % model_dimension & - self % mu * spread(& - sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + sum(self % gradient_by_gamma_over_sigma * self % mu * (self % one_over_sigma ** 2), dim=2),& dim=2,& ncopies=self % model_dimension& ) / self % model_dimension - - deallocate(one_over_sigma) - deallocate(gradient_by_gamma_over_sigma) end subroutine backward module subroutine init(self, input_shape) @@ -106,6 +93,10 @@ module subroutine init(self, input_shape) allocate(self % sigma(self % sequence_length)) allocate(self % output(self % sequence_length, self % model_dimension)) + + allocate(self % normalized, mold=self % mu) + allocate(self % one_over_sigma, mold=self % mu) + allocate(self % gradient_by_gamma_over_sigma, mold=self % mu) end subroutine init pure module function get_num_params(self) result(num_params) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 15f45aed..6a897575 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -183,9 +183,7 @@ subroutine test_layernorm_integration(ok) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - print *, abs(net % predict(x) - y) - print *, epoch if (.not. epoch <= epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' From 55077b3d98eec53679ff85041ad2bf6bba18b83b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 11:59:49 -0500 Subject: [PATCH 16/18] Tidy up --- src/nf/nf_layer_submodule.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 39cdac1a..a3b42434 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -47,7 +47,7 @@ pure module subroutine backward_1d(self, previous, gradient) type is(flatten_layer) - ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d + ! Upstream layers permitted: input2d, input3d, conv2d, layernorm, maxpool2d select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) @@ -267,7 +267,7 @@ module subroutine forward(self, input) type is(linear2d_layer) - ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization + ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -281,7 +281,7 @@ module subroutine forward(self, input) type is(self_attention_layer) - ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization + ! 
Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) From 249485fb6ee8e357c9806f9bcaedff552ee5354e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 12:00:02 -0500 Subject: [PATCH 17/18] Bump version --- fpm.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpm.toml b/fpm.toml index ebcceeb6..15a746e4 100644 --- a/fpm.toml +++ b/fpm.toml @@ -1,5 +1,5 @@ name = "neural-fortran" -version = "0.19.0" +version = "0.20.0" license = "MIT" author = "Milan Curcic" maintainer = "mcurcic@miami.edu" From 3e3776b2a54906b3ea8cf1985eb5d5e581504edd Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 12:00:13 -0500 Subject: [PATCH 18/18] Add layernorm to the table of layers --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a04ac32a..9fe3fab0 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | -| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | -| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ | | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
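
For readers arriving from the table above, here is a minimal usage sketch of the new layernorm layer, modeled on the integration test added earlier in this series. It is a sketch rather than part of the patches: it assumes the top-level nf module exports network, input, linear2d, layernorm, flatten, and sgd the same way it exports the other layer constructors in the table, and the program name and training settings are illustrative only.

program layernorm_usage_sketch
  ! Sketch only: assumes the top-level nf module exports these names,
  ! as it does for the other layer constructors in the README table.
  use nf, only: network, input, linear2d, layernorm, flatten, sgd
  implicit none

  type(network) :: net
  real :: x(2, 3)   ! sequence_length x model_dimension input
  real :: y(6)      ! flattened target, 2 * 3 = 6 values
  integer :: epoch

  ! layernorm sits between 2-d layers; flatten returns the output to 1-d.
  net = network([ &
    input(2, 3), &
    linear2d(3), &
    layernorm(), &
    flatten() &
  ])

  call random_number(x)
  y = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]

  do epoch = 1, 1000
    call net % forward(x)
    call net % backward(y)
    call net % update(optimizer=sgd(learning_rate=0.001))
  end do

  print *, net % predict(x)
end program layernorm_usage_sketch

As in the integration test, layernorm() takes no constructor arguments; the layer's shapes come from its init call with the upstream output shape, matching the test's explicit init([3, 4]).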
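
On the implementation side, the backward pass in nf_layernorm_submodule.f90 follows the derivation linked in its comments, where the sigma^(03) shorthand reads as a sigma to the minus 3 factor. Written out per sequence position t, with D the model dimension and i, j indexing the model dimension, the quantities the code computes are, as I read it (a restatement in LaTeX, not a new derivation):

% Forward-pass statistics, matching the mu and sigma arrays in the code.
\[
  \mu_{t,i} = x_{t,i} - \frac{1}{D} \sum_{j} x_{t,j}, \qquad
  \sigma_t = \sqrt{\frac{1}{D} \sum_{j} \mu_{t,j}^{2} + \varepsilon}
\]
% Parameter gradients: d_gamma and d_beta sum over the sequence dimension.
\[
  \frac{\partial L}{\partial \gamma_i} = \sum_{t} \frac{\partial L}{\partial y_{t,i}} \frac{\mu_{t,i}}{\sigma_t}, \qquad
  \frac{\partial L}{\partial \beta_i} = \sum_{t} \frac{\partial L}{\partial y_{t,i}}
\]
% Input gradient: the three terms map onto gradient_by_gamma_over_sigma and
% the two spread(sum(...)) corrections in the backward subroutine.
\[
  \frac{\partial L}{\partial x_{t,i}} =
    \frac{\gamma_i}{\sigma_t} \frac{\partial L}{\partial y_{t,i}}
    - \frac{1}{D} \sum_{j} \frac{\gamma_j}{\sigma_t} \frac{\partial L}{\partial y_{t,j}}
    - \frac{\mu_{t,i}}{D\,\sigma_t^{3}} \sum_{j} \gamma_j\, \mu_{t,j}\, \frac{\partial L}{\partial y_{t,j}}
\]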