diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1bf2231..906bdd0d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,8 @@ add_library(neural-fortran
   src/nf/nf_input3d_layer_submodule.f90
   src/nf/nf_layer_constructors.f90
   src/nf/nf_layer_constructors_submodule.f90
+  src/nf/nf_layernorm.f90
+  src/nf/nf_layernorm_submodule.f90
   src/nf/nf_layer.f90
   src/nf/nf_layer_submodule.f90
   src/nf/nf_linear2d_layer.f90
diff --git a/README.md b/README.md
index a04ac32a..9fe3fab0 100644
--- a/README.md
+++ b/README.md
@@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
-| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
-| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
 
diff --git a/fpm.toml b/fpm.toml
index ebcceeb6..15a746e4 100644
--- a/fpm.toml
+++ b/fpm.toml
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.19.0"
+version = "0.20.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "mcurcic@miami.edu"
diff --git a/src/nf.f90 b/src/nf.f90
index 39f67ea3..d089c4ac 100644
--- a/src/nf.f90
+++ b/src/nf.f90
@@ -11,7 +11,8 @@ module nf
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90
index db60cf0f..ce00b6bc 100644
--- a/src/nf/nf_layer_constructors.f90
+++ b/src/nf/nf_layer_constructors.f90
@@ -17,7 +17,8 @@ module nf_layer_constructors
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm
 
   interface input
 
@@ -222,15 +223,23 @@ module function linear2d(out_features) result(res)
       !! Resulting layer instance
   end function linear2d
 
-    module function self_attention(num_heads) result(res)
-      !! Rank-2 (sequence_length, out_features) self attention constructor.
-      !! sequence_length and model_dimension are determined at layer initialization, based on the
-      !! output shape of the previous layer.
-      integer, intent(in) :: num_heads
-        !! Number of attention heads
-      type(layer) :: res
-        !! Resulting layer instance
-    end function self_attention
+  module function self_attention(num_heads) result(res)
+    !! Rank-2 (sequence_length, out_features) self attention constructor.
+    !! sequence_length and model_dimension are determined at layer initialization, based on the
+    !! output shape of the previous layer.
+    integer, intent(in) :: num_heads
+      !! Number of attention heads
+    type(layer) :: res
+      !! Resulting layer instance
+  end function self_attention
+
+  module function layernorm() result(res)
+    !! Layer Normalization
+    !! (x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    type(layer) :: res
+  end function layernorm
 
 
   end interface
diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90
index 9e5322c1..5c2e8893 100644
--- a/src/nf/nf_layer_constructors_submodule.f90
+++ b/src/nf/nf_layer_constructors_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid
 
   implicit none
@@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention
 
+  module function layernorm() result(res)
+    type(layer) :: res
+
+    res % name = 'layernorm'
+    allocate(res % p, source=layernorm_layer())
+  end function layernorm
+
 end submodule nf_layer_constructors_submodule
diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90
index ecdeb41d..a3b42434 100644
--- a/src/nf/nf_layer_submodule.f90
+++ b/src/nf/nf_layer_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_optimizers, only: optimizer_base_type
 
 contains
 
@@ -46,7 +47,7 @@ pure module subroutine backward_1d(self, previous, gradient)
 
     type is(flatten_layer)
 
-      ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d
+      ! Upstream layers permitted: input2d, input3d, conv2d, layernorm, maxpool2d
       select type(prev_layer => previous % p)
         type is(input2d_layer)
           call this_layer % backward(prev_layer % output, gradient)
@@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(self_attention_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(layernorm_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select
 
   end select
@@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient)
          call this_layer % backward(prev_layer % output, gradient)
        type is(self_attention_layer)
          call this_layer % backward(prev_layer % output, gradient)
+       type is(layernorm_layer)
+         call this_layer % backward(prev_layer % output, gradient)
       end select
 
     type is(self_attention_layer)
@@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient)
          call this_layer % backward(prev_layer % output, gradient)
        type is(self_attention_layer)
          call this_layer % backward(prev_layer % output, gradient)
+       type is(layernorm_layer)
+         call this_layer % backward(prev_layer % output, gradient)
       end select
 
+    type is(layernorm_layer)
+
+      select type(prev_layer => previous % p)
+        type is(linear2d_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+      end select
   end select
 
 end subroutine backward_2d
@@ -234,6 +249,8 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(linear2d_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
       end select
 
     type is(reshape3d_layer)
@@ -250,7 +267,7 @@ module subroutine forward(self, input)
 
     type is(linear2d_layer)
 
-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
@@ -258,11 +275,13 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(self_attention_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
       end select
 
     type is(self_attention_layer)
 
-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
@@ -270,6 +289,18 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(self_attention_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
+      end select
+
+    type is(layernorm_layer)
+
+      ! Upstream layers permitted: linear2d, self_attention
+      select type(prev_layer => input % p)
+        type is(linear2d_layer)
+          call this_layer % forward(prev_layer % output)
+        type is(self_attention_layer)
+          call this_layer % forward(prev_layer % output)
       end select
 
   end select
@@ -311,6 +342,8 @@ pure module subroutine get_output_2d(self, output)
       allocate(output, source=this_layer % output)
     type is(self_attention_layer)
       allocate(output, source=this_layer % output)
+    type is(layernorm_layer)
+      allocate(output, source=this_layer % output)
     class default
       error stop '2-d output can only be read from an input2d or linear2d layer.'
 
@@ -354,8 +387,8 @@ impure elemental module subroutine init(self, input)
       call this_layer % init(input % layer_shape)
     end select
 
-    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or
-    ! self_attention layers is not known until we receive an input layer.
+    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d,
+    ! self_attention or layernorm layers is not known until we receive an input layer.
     select type(this_layer => self % p)
       type is(conv2d_layer)
         self % layer_shape = shape(this_layer % output)
@@ -367,6 +400,8 @@ impure elemental module subroutine init(self, input)
        self % layer_shape = shape(this_layer % output)
      type is(self_attention_layer)
        self % layer_shape = shape(this_layer % output)
+      type is(layernorm_layer)
+        self % layer_shape = shape(this_layer % output)
      type is(maxpool2d_layer)
        self % layer_shape = shape(this_layer % output)
     end select
@@ -425,6 +460,8 @@ elemental module function get_num_params(self) result(num_params)
        num_params = this_layer % get_num_params()
      type is (self_attention_layer)
        num_params = this_layer % get_num_params()
+      type is (layernorm_layer)
+        num_params = this_layer % get_num_params()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -458,6 +495,8 @@ module function get_params(self) result(params)
        params = this_layer % get_params()
      type is (self_attention_layer)
        params = this_layer % get_params()
+      type is (layernorm_layer)
+        params = this_layer % get_params()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -491,6 +530,8 @@ module function get_gradients(self) result(gradients)
        gradients = this_layer % get_gradients()
      type is (self_attention_layer)
        gradients = this_layer % get_gradients()
+      type is (layernorm_layer)
+        gradients = this_layer % get_gradients()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -549,6 +590,9 @@ module subroutine set_params(self, params)
       type is (self_attention_layer)
         call this_layer % set_params(params)
 
+      type is (layernorm_layer)
+        call this_layer % set_params(params)
+
       type is (maxpool2d_layer)
         ! No parameters to set.
         write(stderr, '(a)') 'Warning: calling set_params() ' &
diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90
new file mode 100644
index 00000000..36ef56f0
--- /dev/null
+++ b/src/nf/nf_layernorm.f90
@@ -0,0 +1,92 @@
+module nf_layernorm_layer
+  use nf_activation, only: activation_function
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: layernorm_layer
+
+  type, extends(base_layer) :: layernorm_layer
+    !! Layer Normalization
+    !! (x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    integer :: sequence_length
+    integer :: model_dimension
+
+    real :: eps
+    real, allocatable :: gamma(:)
+    real, allocatable :: beta(:)
+
+    real, allocatable :: d_gamma(:)
+    real, allocatable :: d_beta(:)
+    real, allocatable :: gradient(:, :)
+
+    real, allocatable :: mu(:, :)
+    real, allocatable :: sigma(:)
+
+    real, allocatable :: output(:, :)
+
+    ! temp storages
+    real, allocatable, private :: normalized(:, :)
+    real, allocatable, private :: one_over_sigma(:, :)
+    real, allocatable, private :: gradient_by_gamma_over_sigma(:, :)
+  contains
+    procedure :: forward
+    procedure :: backward
+    procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
+  end type layernorm_layer
+
+  interface layernorm_layer
+    module function layernorm_layer_cons() &
+      result(res)
+      type(layernorm_layer) :: res
+    end function layernorm_layer_cons
+  end interface layernorm_layer
+
+  interface
+    pure module subroutine forward(self, input)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    module subroutine init(self, input_shape)
+      class(layernorm_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+      class(layernorm_layer), intent(in) :: self
+      integer :: num_params
+    end function get_num_params
+
+
+    module function get_params(self) result(params)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+
+    module function get_gradients(self) result(gradients)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+
+    module subroutine set_params(self, params)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
+  end interface
+end module nf_layernorm_layer
\ No newline at end of file
diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90
new file mode 100644
index 00000000..4eaa4382
--- /dev/null
+++ b/src/nf/nf_layernorm_submodule.f90
@@ -0,0 +1,149 @@
+submodule(nf_layernorm_layer) nf_layernorm_layer_submodule
+  implicit none
+contains
+  module function layernorm_layer_cons() &
+    result(res)
+    type(layernorm_layer) :: res
+
+    res % eps = 1e-5
+  end function layernorm_layer_cons
+
+  pure module subroutine forward(self, input)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    integer :: i
+
+    ! mu = x - MEAN_last_dim(x)
+    do concurrent(i = 1: self % model_dimension)
+      self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
+    end do
+
+    ! square root of variance shifted by eps
+    self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)
+
+    ! normalize mu by sigma (one sigma value per sequence position)
+    do concurrent(i = 1: self % model_dimension)
+      self % normalized(:, i) = self % mu(:, i) / self % sigma
+    end do
+
+    ! forward through trainable params gamma and beta
+    do concurrent(i = 1: self % sequence_length)
+      self % output(i, :) = self % normalized(i, :) * self % gamma + self % beta
+    end do
+  end subroutine forward
+
+  pure module subroutine backward(self, input, gradient)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    real, intent(in) :: gradient(:, :)
+
+    self % one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
+    self % gradient_by_gamma_over_sigma = &
+        gradient &
+        * spread(self % gamma, dim=1, ncopies=self % sequence_length) &
+        * self % one_over_sigma
+
+    ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
+    self % d_gamma = sum(gradient * self % mu * self % one_over_sigma, dim=1)
+
+    ! d_output/d_beta = sum(d_output/d_y) * 1
+    self % d_beta = sum(gradient, dim=1)
+
+    ! From this article:
+    ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
+    ! d_output/d_x = d_output/d_y * gamma/sigma
+    !   - sum(d_output/d_y * gamma/sigma) / len
+    !   - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
+    ! where len is the model dimension
+    self % gradient = &
+        self % gradient_by_gamma_over_sigma &
+        - spread(&
+            sum(self % gradient_by_gamma_over_sigma, dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension &
+        - self % mu * spread(&
+            sum(self % gradient_by_gamma_over_sigma * self % mu * (self % one_over_sigma ** 2), dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension
+  end subroutine backward
+
+  module subroutine init(self, input_shape)
+    class(layernorm_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    if (size(input_shape) /= 2) then
+      error stop "LayerNorm Layer accepts 2D input"
+    end if
+    self % sequence_length = input_shape(1)
+    self % model_dimension = input_shape(2)
+
+    ! default initialization from PyTorch
+    allocate(self % gamma(self % model_dimension))
+    self % gamma = 1.
+    allocate(self % beta(self % model_dimension))
+    self % beta = 0.
+
+    allocate(self % d_gamma(self % model_dimension))
+    allocate(self % d_beta(self % model_dimension))
+    allocate(self % gradient(self % sequence_length, self % model_dimension))
+
+    allocate(self % mu(self % sequence_length, self % model_dimension))
+    allocate(self % sigma(self % sequence_length))
+
+    allocate(self % output(self % sequence_length, self % model_dimension))
+
+    allocate(self % normalized, mold=self % mu)
+    allocate(self % one_over_sigma, mold=self % mu)
+    allocate(self % gradient_by_gamma_over_sigma, mold=self % mu)
+  end subroutine init
+
+  pure module function get_num_params(self) result(num_params)
+    class(layernorm_layer), intent(in) :: self
+    integer :: num_params
+
+    ! gamma and beta each have model_dimension parameters
+    num_params = 2 * self % model_dimension
+
+  end function get_num_params
+
+
+  module function get_params(self) result(params)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: params(:)
+
+    params = [ &
+        self % gamma, &
+        self % beta &
+    ]
+
+  end function get_params
+
+
+  module function get_gradients(self) result(gradients)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: gradients(:)
+
+    gradients = [ &
+        self % d_gamma, &
+        self % d_beta &
+    ]
+
+  end function get_gradients
+
+
+  module subroutine set_params(self, params)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in), target :: params(:)
+
+    ! check if the number of parameters is correct
+    if (size(params) /= self % get_num_params()) then
+      error stop 'Error: number of parameters does not match'
+    end if
+
+    self % gamma = params(1: self % model_dimension)
+    self % beta = params(self % model_dimension + 1: 2 * self % model_dimension)
+
+  end subroutine set_params
+end submodule nf_layernorm_layer_submodule
diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90
index f344c5c5..a6b7657c 100644
--- a/src/nf/nf_network_submodule.f90
+++ b/src/nf/nf_network_submodule.f90
@@ -11,6 +11,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_layer, only: layer
   use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_loss, only: quadratic
@@ -163,6 +164,8 @@ module subroutine backward(self, output, loss)
            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
          type is(self_attention_layer)
            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+          type is(layernorm_layer)
+            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
         end select
 
       end if
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 741e9930..46d349c1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -12,6 +12,7 @@ foreach(execid
   insert_flatten
   reshape_layer
   multihead_attention_layer
+  layernorm
   dense_network
   get_set_network_params
   conv2d_network
diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90
new file mode 100644
index 00000000..6a897575
--- /dev/null
+++ b/test/test_layernorm.f90
@@ -0,0 +1,193 @@
+program test_layernorm_instance
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_layernorm_layer, only: layernorm_layer
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_layer, only: layer
+  use nf, only: sgd, layernorm, network, input, flatten, linear2d
+  implicit none
+
+  logical :: ok = .true.
+  type(layernorm_layer) :: layernorm_instance
+  real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4])
+  real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4])
+
+  layernorm_instance = layernorm_layer()
+  call layernorm_instance % init([3, 4])
+
+  call test_layernorm_forward(layernorm_instance, sample_input, ok)
+  call test_layernorm_backward(layernorm_instance, sample_input, sample_gradient, ok)
+  call test_layernorm_gradients(sample_input, sample_gradient, ok)
+  call test_layernorm_integration(ok)
+
+  if (ok) then
+    print '(a)', 'test_layernorm_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.'
+    error stop 1
+  end if
+
+contains
+  function allclose(x, y) result(res)
+    real, intent(in) :: x(:)
+    real, intent(in) :: y(:)
+    logical :: res
+
+    res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y)))
+  end function allclose
+
+  subroutine test_layernorm_forward(layernorm_instance, input, ok)
+    type(layernorm_layer), intent(in out) :: layernorm_instance
+    real, intent(in out) :: input(:, :)
+    logical, intent(in out) :: ok
+    real :: output_shape(2)
+    real :: output_flat(12)
+    real :: expected_shape(2) = [3, 4]
+    real :: expected_output_flat(12) = [&
+        -0.693158746, 0.939844191, -0.992156327, 1.72702277, -0.970368207, 0.971188426,&
+        -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564&
+    ]
+
+    call layernorm_instance % forward(input)
+
+    output_shape = shape(layernorm_instance % output)
+    if (.not. all(output_shape.eq.expected_shape)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect shape.. failed'
+    end if
+    output_flat = reshape(layernorm_instance % output, shape(output_flat))
+    if (.not. allclose(output_flat, expected_output_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect values.. failed'
+    end if
+  end subroutine test_layernorm_forward
+
+  subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok)
+    type(layernorm_layer), intent(in out) :: layernorm_instance
+    real, intent(in out) :: input(:, :)
+    real, intent(in out) :: gradient(:, :)
+    logical, intent(in out) :: ok
+
+    real :: gradient_shape(2)
+    real :: gradient_flat(12)
+    real :: expected_gradient_shape(2) = [3, 4]
+    real :: expected_gradient_flat(12) = [&
+        -0.227230772, 0.103088334, -9.88590196E-02, -2.86390483E-02, 0.283811331, 0.277955681,&
+        -0.215662330, -0.105019525, -0.269407451, 0.471532196, -0.281880081, 9.03107598E-02&
+    ]
+
+    real :: d_gamma(4)
+    real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449]
+    real :: d_beta(4)
+    real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1]
+
+    call layernorm_instance % backward(input, gradient)
+
+    gradient_shape = shape(layernorm_instance % gradient)
+    if (.not. all(gradient_shape.eq.expected_gradient_shape)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed'
+    end if
+    gradient_flat = reshape(layernorm_instance % gradient, shape(gradient_flat))
+    if (.not. allclose(gradient_flat, expected_gradient_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect gradient values.. failed'
+    end if
+
+    if (.not. allclose(layernorm_instance % d_gamma, expected_d_gamma)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed'
+    end if
+    if (.not. allclose(layernorm_instance % d_beta, expected_d_beta)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect d_beta values.. failed'
+    end if
+  end subroutine test_layernorm_backward
+
+  subroutine test_layernorm_gradients(input, gradient, ok)
+    real, intent(in out) :: input(:, :)
+    real, intent(in out) :: gradient(:, :)
+    logical, intent(in out) :: ok
+    type(layernorm_layer) :: layernorm_instance
+    type(sgd) :: optim
+
+    real :: parameters(8)
+    real :: expected_parameters(8)
+    real :: updated_output(12)
+    real :: expected_updated_output(12) = [&
+        -0.738849819, 0.881645918, -1.03555739,&
+        1.66299772, -1.02966857, 0.908487320,&
+        -0.562230229, 1.01311040, 0.984123051,&
+        -0.564699769, -1.13543355, -1.11444426&
+    ]
+
+    layernorm_instance = layernorm_layer()
+    call layernorm_instance % init([3, 4])
+
+    call layernorm_instance % forward(input)
+    call layernorm_instance % backward(input, gradient)
+
+    if (layernorm_instance % get_num_params() /= 8) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect number of parameters.. failed'
+    end if
+
+    expected_parameters(1: 4) = 1.
+    expected_parameters(5: 8) = 0.
+    parameters = layernorm_instance % get_params()
+    if (.not. all(parameters.eq.expected_parameters)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect parameters.. failed'
+    end if
+
+    optim = SGD(learning_rate=0.01)
+    call optim % minimize(parameters, layernorm_instance % get_gradients())
+    call layernorm_instance % set_params(parameters)
+
+    call layernorm_instance % forward(input)
+
+    updated_output = reshape(layernorm_instance % output, [12])
+    if (.not. allclose(updated_output, expected_updated_output)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect output after parameters update.. failed'
+    end if
+  end subroutine test_layernorm_gradients
+
+  subroutine test_layernorm_integration(ok)
+    logical, intent(in out) :: ok
+
+    type(network) :: net
+    real :: x(2, 3) = reshape([0.1, 2., 0.3, 4., 0.5, 6.], [2, 3])
+    real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]
+    real :: tolerance = 0.1
+    integer :: epoch
+    integer :: epochs = 10000
+
+    net = network([&
+        input(2, 3),&
+        linear2d(3),&
+        layernorm(),&
+        flatten()&
+    ])
+
+    ! Kaiming weights to achieve semblance of convergence
+    select type(l => net % layers(2) % p)
+      type is(linear2d_layer)
+        call random_number(l % weights)
+        l % weights = l % weights * sqrt(2. / 6.)
+        l % biases = 0.2
+    end select
+
+    do epoch = 1, epochs
+      call net % forward(x)
+      call net % backward(y)
+      call net % update(optimizer=sgd(learning_rate=0.001))
+      if (all(abs(net % predict(x) - y) < tolerance)) exit
+    end do
+
+    if (.not. epoch <= epochs) then
+      write(stderr, '(a)') &
+        'linear2d + layernorm should converge in simple training.. failed'
+      ok = .false.
+    end if
+  end subroutine test_layernorm_integration
+end program test_layernorm_instance
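
Usage sketch (illustrative, not part of the patch): the snippet below shows how the new layernorm constructor slots between rank-2 layers, mirroring test_layernorm_integration above. The shapes, epoch count, and learning rate are arbitrary choices made for the example; only constructors and network methods that this patch exports or already uses are assumed.

program layernorm_usage_sketch
  ! Minimal sketch of the public API touched by this patch; mirrors the
  ! integration test above. Shapes and hyperparameters are illustrative.
  use nf, only: network, input, linear2d, layernorm, flatten, sgd
  implicit none

  type(network) :: net
  real :: x(2, 3)
  real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]
  integer :: epoch

  call random_number(x)

  ! layernorm() takes no arguments; sequence_length and model_dimension are
  ! inferred at initialization from the preceding layer, which must produce
  ! rank-2 output (linear2d or self_attention).
  net = network([ &
    input(2, 3), &
    linear2d(3), &
    layernorm(), &
    flatten() &
  ])

  do epoch = 1, 1000
    call net % forward(x)
    call net % backward(y)
    call net % update(optimizer=sgd(learning_rate=0.001))
  end do
end program layernorm_usage_sketch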