diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1bf2231..906bdd0d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,8 @@ add_library(neural-fortran
   src/nf/nf_input3d_layer_submodule.f90
   src/nf/nf_layer_constructors.f90
   src/nf/nf_layer_constructors_submodule.f90
+  src/nf/nf_layernorm.f90
+  src/nf/nf_layernorm_submodule.f90
   src/nf/nf_layer.f90
   src/nf/nf_layer_submodule.f90
   src/nf/nf_linear2d_layer.f90
diff --git a/README.md b/README.md
index a04ac32a..9fe3fab0 100644
--- a/README.md
+++ b/README.md
@@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
 | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
 | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
 | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
-| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
-| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ |
+| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ |
 | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
 | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
 
diff --git a/fpm.toml b/fpm.toml
index ebcceeb6..15a746e4 100644
--- a/fpm.toml
+++ b/fpm.toml
@@ -1,5 +1,5 @@
 name = "neural-fortran"
-version = "0.19.0"
+version = "0.20.0"
 license = "MIT"
 author = "Milan Curcic"
 maintainer = "mcurcic@miami.edu"
diff --git a/src/nf.f90 b/src/nf.f90
index 39f67ea3..d089c4ac 100644
--- a/src/nf.f90
+++ b/src/nf.f90
@@ -11,7 +11,8 @@ module nf
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm
   use nf_loss, only: mse, quadratic
   use nf_metrics, only: corr, maxabs
   use nf_network, only: network
diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90
index db60cf0f..ce00b6bc 100644
--- a/src/nf/nf_layer_constructors.f90
+++ b/src/nf/nf_layer_constructors.f90
@@ -17,7 +17,8 @@ module nf_layer_constructors
     linear2d, &
     maxpool2d, &
     reshape, &
-    self_attention
+    self_attention, &
+    layernorm
 
   interface input
 
@@ -222,15 +223,23 @@ module function linear2d(out_features) result(res)
       !! Resulting layer instance
   end function linear2d
 
-    module function self_attention(num_heads) result(res)
-      !! Rank-2 (sequence_length, out_features) self attention constructor.
-      !! sequence_length and model_dimension are determined at layer initialization, based on the
-      !! output shape of the previous layer.
-      integer, intent(in) :: num_heads
-        !! Number of attention heads
-      type(layer) :: res
-        !! Resulting layer instance
-    end function self_attention
+  module function self_attention(num_heads) result(res)
+    !! Rank-2 (sequence_length, out_features) self attention constructor.
+    !! sequence_length and model_dimension are determined at layer initialization, based on the
+    !! output shape of the previous layer.
+    integer, intent(in) :: num_heads
+      !! Number of attention heads
+    type(layer) :: res
+      !! Resulting layer instance
+  end function self_attention
+
+  module function layernorm() result(res)
+    !! Layer Normalization
+    !! (x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    type(layer) :: res
+  end function layernorm
 
 
   end interface
diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90
index 9e5322c1..5c2e8893 100644
--- a/src/nf/nf_layer_constructors_submodule.f90
+++ b/src/nf/nf_layer_constructors_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_activation, only: activation_function, relu, sigmoid
 
   implicit none
@@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res)
     allocate(res % p, source=self_attention_layer(num_heads))
   end function self_attention
 
+  module function layernorm() result(res)
+    type(layer) :: res
+
+    res % name = 'layernorm'
+    allocate(res % p, source=layernorm_layer())
+  end function layernorm
+
 end submodule nf_layer_constructors_submodule
diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90
index ecdeb41d..a3b42434 100644
--- a/src/nf/nf_layer_submodule.f90
+++ b/src/nf/nf_layer_submodule.f90
@@ -12,6 +12,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_optimizers, only: optimizer_base_type
 
 contains
 
@@ -46,7 +47,7 @@ pure module subroutine backward_1d(self, previous, gradient)
 
     type is(flatten_layer)
 
-      ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d
+      ! Upstream layers permitted: input2d, input3d, conv2d, layernorm, maxpool2d
       select type(prev_layer => previous % p)
         type is(input2d_layer)
           call this_layer % backward(prev_layer % output, gradient)
@@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient)
           call this_layer % backward(prev_layer % output, gradient)
         type is(self_attention_layer)
           call this_layer % backward(prev_layer % output, gradient)
+        type is(layernorm_layer)
+          call this_layer % backward(prev_layer % output, gradient)
       end select
 
   end select
@@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient)
          call this_layer % backward(prev_layer % output, gradient)
        type is(self_attention_layer)
          call this_layer % backward(prev_layer % output, gradient)
+       type is(layernorm_layer)
+         call this_layer % backward(prev_layer % output, gradient)
       end select
 
     type is(self_attention_layer)
@@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient)
          call this_layer % backward(prev_layer % output, gradient)
        type is(self_attention_layer)
          call this_layer % backward(prev_layer % output, gradient)
+       type is(layernorm_layer)
+         call this_layer % backward(prev_layer % output, gradient)
       end select
 
+    type is(layernorm_layer)
+
+      select type(prev_layer => previous % p)
+        type is(linear2d_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+        type is(self_attention_layer)
+          call this_layer % backward(prev_layer % output, gradient)
+      end select
   end select
 
 end subroutine backward_2d
@@ -234,6 +249,8 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(linear2d_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
       end select
 
     type is(reshape3d_layer)
@@ -250,7 +267,7 @@ module subroutine forward(self, input)
 
     type is(linear2d_layer)
 
-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
@@ -258,11 +275,13 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(self_attention_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
       end select
 
     type is(self_attention_layer)
 
-      ! Upstream layers permitted: input2d, linear2d
+      ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm
       select type(prev_layer => input % p)
         type is(input2d_layer)
           call this_layer % forward(prev_layer % output)
@@ -270,6 +289,18 @@ module subroutine forward(self, input)
          call this_layer % forward(prev_layer % output)
        type is(self_attention_layer)
          call this_layer % forward(prev_layer % output)
+       type is(layernorm_layer)
+         call this_layer % forward(prev_layer % output)
+      end select
+
+    type is(layernorm_layer)
+
+      ! Upstream layers permitted: linear2d, self_attention
+      select type(prev_layer => input % p)
+        type is(linear2d_layer)
+          call this_layer % forward(prev_layer % output)
+        type is(self_attention_layer)
+          call this_layer % forward(prev_layer % output)
       end select
 
   end select
@@ -311,6 +342,8 @@ pure module subroutine get_output_2d(self, output)
       allocate(output, source=this_layer % output)
     type is(self_attention_layer)
       allocate(output, source=this_layer % output)
+    type is(layernorm_layer)
+      allocate(output, source=this_layer % output)
     class default
       error stop '2-d output can only be read from an input2d or linear2d layer.'
 
@@ -354,8 +387,8 @@ impure elemental module subroutine init(self, input)
       call this_layer % init(input % layer_shape)
     end select
 
-    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or
-    ! self_attention layers is not known until we receive an input layer.
+    ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d,
+    ! self_attention or layernorm layers is not known until we receive an input layer.
     select type(this_layer => self % p)
       type is(conv2d_layer)
         self % layer_shape = shape(this_layer % output)
@@ -367,6 +400,8 @@ impure elemental module subroutine init(self, input)
        self % layer_shape = shape(this_layer % output)
      type is(self_attention_layer)
        self % layer_shape = shape(this_layer % output)
+      type is(layernorm_layer)
+        self % layer_shape = shape(this_layer % output)
      type is(maxpool2d_layer)
        self % layer_shape = shape(this_layer % output)
     end select
@@ -425,6 +460,8 @@ elemental module function get_num_params(self) result(num_params)
        num_params = this_layer % get_num_params()
      type is (self_attention_layer)
        num_params = this_layer % get_num_params()
+      type is (layernorm_layer)
+        num_params = this_layer % get_num_params()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -458,6 +495,8 @@ module function get_params(self) result(params)
        params = this_layer % get_params()
      type is (self_attention_layer)
        params = this_layer % get_params()
+      type is (layernorm_layer)
+        params = this_layer % get_params()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -491,6 +530,8 @@ module function get_gradients(self) result(gradients)
        gradients = this_layer % get_gradients()
      type is (self_attention_layer)
        gradients = this_layer % get_gradients()
+      type is (layernorm_layer)
+        gradients = this_layer % get_gradients()
      class default
        error stop 'Unknown layer type.'
     end select
@@ -549,6 +590,9 @@ module subroutine set_params(self, params)
       type is (self_attention_layer)
         call this_layer % set_params(params)
 
+      type is (layernorm_layer)
+        call this_layer % set_params(params)
+
       type is (maxpool2d_layer)
         ! No parameters to set.
         write(stderr, '(a)') 'Warning: calling set_params() ' &
diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90
new file mode 100644
index 00000000..36ef56f0
--- /dev/null
+++ b/src/nf/nf_layernorm.f90
@@ -0,0 +1,92 @@
+module nf_layernorm_layer
+  use nf_activation, only: activation_function
+  use nf_base_layer, only: base_layer
+
+  implicit none
+
+  private
+  public :: layernorm_layer
+
+  type, extends(base_layer) :: layernorm_layer
+    !! Layer Normalization
+    !! (x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta
+    !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton (2016)`:
+    !! https://arxiv.org/abs/1607.06450v1
+    integer :: sequence_length
+    integer :: model_dimension
+
+    real :: eps
+    real, allocatable :: gamma(:)
+    real, allocatable :: beta(:)
+
+    real, allocatable :: d_gamma(:)
+    real, allocatable :: d_beta(:)
+    real, allocatable :: gradient(:, :)
+
+    real, allocatable :: mu(:, :)
+    real, allocatable :: sigma(:)
+
+    real, allocatable :: output(:, :)
+
+    ! temp storages
+    real, allocatable, private :: normalized(:, :)
+    real, allocatable, private :: one_over_sigma(:, :)
+    real, allocatable, private :: gradient_by_gamma_over_sigma(:, :)
+  contains
+    procedure :: forward
+    procedure :: backward
+    procedure :: init
+    procedure :: get_num_params
+    procedure :: get_params
+    procedure :: get_gradients
+    procedure :: set_params
+  end type layernorm_layer
+
+  interface layernorm_layer
+    module function layernorm_layer_cons() &
+      result(res)
+      type(layernorm_layer) :: res
+    end function layernorm_layer_cons
+  end interface layernorm_layer
+
+  interface
+    pure module subroutine forward(self, input)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+    end subroutine forward
+
+    pure module subroutine backward(self, input, gradient)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in) :: input(:, :)
+      real, intent(in) :: gradient(:, :)
+    end subroutine backward
+
+    module subroutine init(self, input_shape)
+      class(layernorm_layer), intent(in out) :: self
+      integer, intent(in) :: input_shape(:)
+    end subroutine init
+
+    pure module function get_num_params(self) result(num_params)
+      class(layernorm_layer), intent(in) :: self
+      integer :: num_params
+    end function get_num_params
+
+
+    module function get_params(self) result(params)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: params(:)
+    end function get_params
+
+
+    module function get_gradients(self) result(gradients)
+      class(layernorm_layer), intent(in), target :: self
+      real, allocatable :: gradients(:)
+    end function get_gradients
+
+
+    module subroutine set_params(self, params)
+      class(layernorm_layer), intent(in out) :: self
+      real, intent(in), target :: params(:)
+    end subroutine set_params
+  end interface
+end module nf_layernorm_layer
\ No newline at end of file
diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90
new file mode 100644
index 00000000..4eaa4382
--- /dev/null
+++ b/src/nf/nf_layernorm_submodule.f90
@@ -0,0 +1,149 @@
+submodule(nf_layernorm_layer) nf_layernorm_layer_submodule
+  implicit none
+contains
+  module function layernorm_layer_cons() &
+    result(res)
+    type(layernorm_layer) :: res
+
+    res % eps = 1e-5
+  end function layernorm_layer_cons
+
+  pure module subroutine forward(self, input)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    integer :: i
+
+    ! mu = x - MEAN_last_dim(x)
+    do concurrent(i = 1: self % model_dimension)
+      self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension)
+    end do
+
+    ! square root of variance shifted by eps
+    self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps)
+
+    ! normalize mu by sigma (one sigma value per sequence position)
+    do concurrent(i = 1: self % model_dimension)
+      self % normalized(:, i) = self % mu(:, i) / self % sigma
+    end do
+
+    ! forward through trainable params gamma and beta
+    do concurrent(i = 1: self % sequence_length)
+      self % output(i, :) = self % normalized(i, :) * self % gamma + self % beta
+    end do
+  end subroutine forward
+
+  pure module subroutine backward(self, input, gradient)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in) :: input(:, :)
+    real, intent(in) :: gradient(:, :)
+
+    self % one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension))
+    self % gradient_by_gamma_over_sigma = &
+        gradient &
+        * spread(self % gamma, dim=1, ncopies=self % sequence_length) &
+        * self % one_over_sigma
+
+    ! d_output/d_gamma = sum(d_output/d_y * mu/sigma)
+    self % d_gamma = sum(gradient * self % mu * self % one_over_sigma, dim=1)
+
+    ! d_output/d_beta = sum(d_output/d_y) * 1
+    self % d_beta = sum(gradient, dim=1)
+
+    ! From this article:
+    ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/
+    ! d_output/d_x = d_output/d_y * gamma/sigma
+    !   - sum(d_output/d_y * gamma/sigma) / len
+    !   - mu * sum(d_output/d_y * gamma * mu * sigma^(-3)) / len
+    ! where len is the model dimension
+    self % gradient = &
+        self % gradient_by_gamma_over_sigma &
+        - spread(&
+            sum(self % gradient_by_gamma_over_sigma, dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension &
+        - self % mu * spread(&
+            sum(self % gradient_by_gamma_over_sigma * self % mu * (self % one_over_sigma ** 2), dim=2),&
+            dim=2,&
+            ncopies=self % model_dimension&
+        ) / self % model_dimension
+  end subroutine backward
+
+  module subroutine init(self, input_shape)
+    class(layernorm_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    if (size(input_shape) /= 2) then
+      error stop "LayerNorm Layer accepts 2D input"
+    end if
+    self % sequence_length = input_shape(1)
+    self % model_dimension = input_shape(2)
+
+    ! default initialization from PyTorch
+    allocate(self % gamma(self % model_dimension))
+    self % gamma = 1.
+    allocate(self % beta(self % model_dimension))
+    self % beta = 0.
+
+    allocate(self % d_gamma(self % model_dimension))
+    allocate(self % d_beta(self % model_dimension))
+    allocate(self % gradient(self % sequence_length, self % model_dimension))
+
+    allocate(self % mu(self % sequence_length, self % model_dimension))
+    allocate(self % sigma(self % sequence_length))
+
+    allocate(self % output(self % sequence_length, self % model_dimension))
+
+    allocate(self % normalized, mold=self % mu)
+    allocate(self % one_over_sigma, mold=self % mu)
+    allocate(self % gradient_by_gamma_over_sigma, mold=self % mu)
+  end subroutine init
+
+  pure module function get_num_params(self) result(num_params)
+    class(layernorm_layer), intent(in) :: self
+    integer :: num_params
+
+    ! gamma and beta each have model_dimension parameters
+    num_params = 2 * self % model_dimension
+
+  end function get_num_params
+
+
+  module function get_params(self) result(params)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: params(:)
+
+    params = [ &
+        self % gamma, &
+        self % beta &
+    ]
+
+  end function get_params
+
+
+  module function get_gradients(self) result(gradients)
+    class(layernorm_layer), intent(in), target :: self
+    real, allocatable :: gradients(:)
+
+    gradients = [ &
+        self % d_gamma, &
+        self % d_beta &
+    ]
+
+  end function get_gradients
+
+
+  module subroutine set_params(self, params)
+    class(layernorm_layer), intent(in out) :: self
+    real, intent(in), target :: params(:)
+
+    ! check if the number of parameters is correct
+    if (size(params) /= self % get_num_params()) then
+      error stop 'Error: number of parameters does not match'
+    end if
+
+    self % gamma = params(1: self % model_dimension)
+    self % beta = params(self % model_dimension + 1: 2 * self % model_dimension)
+
+  end subroutine set_params
+end submodule nf_layernorm_layer_submodule
diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90
index f344c5c5..a6b7657c 100644
--- a/src/nf/nf_network_submodule.f90
+++ b/src/nf/nf_network_submodule.f90
@@ -11,6 +11,7 @@
   use nf_reshape_layer, only: reshape3d_layer
   use nf_linear2d_layer, only: linear2d_layer
   use nf_self_attention_layer, only: self_attention_layer
+  use nf_layernorm_layer, only: layernorm_layer
   use nf_layer, only: layer
   use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape
   use nf_loss, only: quadratic
@@ -163,6 +164,8 @@ module subroutine backward(self, output, loss)
            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
          type is(self_attention_layer)
            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
+          type is(layernorm_layer)
+            call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient)
         end select
 
       end if
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 741e9930..46d349c1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -12,6 +12,7 @@ foreach(execid
   insert_flatten
   reshape_layer
   multihead_attention_layer
+  layernorm
   dense_network
   get_set_network_params
   conv2d_network
diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90
new file mode 100644
index 00000000..6a897575
--- /dev/null
+++ b/test/test_layernorm.f90
@@ -0,0 +1,193 @@
+program test_layernorm_instance
+  use iso_fortran_env, only: stderr => error_unit
+  use nf_layernorm_layer, only: layernorm_layer
+  use nf_linear2d_layer, only: linear2d_layer
+  use nf_layer, only: layer
+  use nf, only: sgd, layernorm, network, input, flatten, linear2d
+  implicit none
+
+  logical :: ok = .true.
+  type(layernorm_layer) :: layernorm_instance
+  real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4])
+  real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4])
+
+  layernorm_instance = layernorm_layer()
+  call layernorm_instance % init([3, 4])
+
+  call test_layernorm_forward(layernorm_instance, sample_input, ok)
+  call test_layernorm_backward(layernorm_instance, sample_input, sample_gradient, ok)
+  call test_layernorm_gradients(sample_input, sample_gradient, ok)
+  call test_layernorm_integration(ok)
+
+  if (ok) then
+    print '(a)', 'test_layernorm_layer: All tests passed.'
+  else
+    write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.'
+    error stop 1
+  end if
+
+contains
+  function allclose(x, y) result(res)
+    real, intent(in) :: x(:)
+    real, intent(in) :: y(:)
+    logical :: res
+
+    res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y)))
+  end function allclose
+
+  subroutine test_layernorm_forward(layernorm_instance, input, ok)
+    type(layernorm_layer), intent(in out) :: layernorm_instance
+    real, intent(in out) :: input(:, :)
+    logical, intent(in out) :: ok
+    real :: output_shape(2)
+    real :: output_flat(12)
+    real :: expected_shape(2) = [3, 4]
+    real :: expected_output_flat(12) = [&
+        -0.693158746, 0.939844191, -0.992156327, 1.72702277, -0.970368207, 0.971188426,&
+        -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564&
+    ]
+
+    call layernorm_instance % forward(input)
+
+    output_shape = shape(layernorm_instance % output)
+    if (.not. all(output_shape.eq.expected_shape)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect shape.. failed'
+    end if
+    output_flat = reshape(layernorm_instance % output, shape(output_flat))
+    if (.not. allclose(output_flat, expected_output_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'forward returned incorrect values.. failed'
+    end if
+  end subroutine test_layernorm_forward
+
+  subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok)
+    type(layernorm_layer), intent(in out) :: layernorm_instance
+    real, intent(in out) :: input(:, :)
+    real, intent(in out) :: gradient(:, :)
+    logical, intent(in out) :: ok
+
+    real :: gradient_shape(2)
+    real :: gradient_flat(12)
+    real :: expected_gradient_shape(2) = [3, 4]
+    real :: expected_gradient_flat(12) = [&
+        -0.227230772, 0.103088334, -9.88590196E-02, -2.86390483E-02, 0.283811331, 0.277955681,&
+        -0.215662330, -0.105019525, -0.269407451, 0.471532196, -0.281880081, 9.03107598E-02&
+    ]
+
+    real :: d_gamma(4)
+    real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449]
+    real :: d_beta(4)
+    real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1]
+
+    call layernorm_instance % backward(input, gradient)
+
+    gradient_shape = shape(layernorm_instance % gradient)
+    if (.not. all(gradient_shape.eq.expected_gradient_shape)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed'
+    end if
+    gradient_flat = reshape(layernorm_instance % gradient, shape(gradient_flat))
+    if (.not. allclose(gradient_flat, expected_gradient_flat)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect gradient values.. failed'
+    end if
+
+    if (.not. allclose(layernorm_instance % d_gamma, expected_d_gamma)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed'
+    end if
+    if (.not. allclose(layernorm_instance % d_beta, expected_d_beta)) then
+      ok = .false.
+      write(stderr, '(a)') 'backward returned incorrect d_beta values.. failed'
+    end if
+  end subroutine test_layernorm_backward
+
+  subroutine test_layernorm_gradients(input, gradient, ok)
+    real, intent(in out) :: input(:, :)
+    real, intent(in out) :: gradient(:, :)
+    logical, intent(in out) :: ok
+    type(layernorm_layer) :: layernorm_instance
+    type(sgd) :: optim
+
+    real :: parameters(8)
+    real :: expected_parameters(8)
+    real :: updated_output(12)
+    real :: expected_updated_output(12) = [&
+        -0.738849819, 0.881645918, -1.03555739,&
+        1.66299772, -1.02966857, 0.908487320,&
+        -0.562230229, 1.01311040, 0.984123051,&
+        -0.564699769, -1.13543355, -1.11444426&
+    ]
+
+    layernorm_instance = layernorm_layer()
+    call layernorm_instance % init([3, 4])
+
+    call layernorm_instance % forward(input)
+    call layernorm_instance % backward(input, gradient)
+
+    if (layernorm_instance % get_num_params() /= 8) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect number of parameters.. failed'
+    end if
+
+    expected_parameters(1: 4) = 1.
+    expected_parameters(5: 8) = 0.
+    parameters = layernorm_instance % get_params()
+    if (.not. all(parameters.eq.expected_parameters)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect parameters.. failed'
+    end if
+
+    optim = SGD(learning_rate=0.01)
+    call optim % minimize(parameters, layernorm_instance % get_gradients())
+    call layernorm_instance % set_params(parameters)
+
+    call layernorm_instance % forward(input)
+
+    updated_output = reshape(layernorm_instance % output, [12])
+    if (.not. allclose(updated_output, expected_updated_output)) then
+      ok = .false.
+      write(stderr, '(a)') 'incorrect output after parameters update.. failed'
+    end if
+  end subroutine test_layernorm_gradients
+
+  subroutine test_layernorm_integration(ok)
+    logical, intent(in out) :: ok
+
+    type(network) :: net
+    real :: x(2, 3) = reshape([0.1, 2., 0.3, 4., 0.5, 6.], [2, 3])
+    real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]
+    real :: tolerance = 0.1
+    integer :: epoch
+    integer :: epochs = 10000
+
+    net = network([&
+        input(2, 3),&
+        linear2d(3),&
+        layernorm(),&
+        flatten()&
+    ])
+
+    ! Kaiming weights to achieve semblance of convergence
+    select type(l => net % layers(2) % p)
+      type is(linear2d_layer)
+        call random_number(l % weights)
+        l % weights = l % weights * sqrt(2. / 6.)
+        l % biases = 0.2
+    end select
+
+    do epoch = 1, epochs
+      call net % forward(x)
+      call net % backward(y)
+      call net % update(optimizer=sgd(learning_rate=0.001))
+      if (all(abs(net % predict(x) - y) < tolerance)) exit
+    end do
+
+    if (.not. epoch <= epochs) then
+      write(stderr, '(a)') &
+        'linear2d + layernorm should converge in simple training.. failed'
+      ok = .false.
+    end if
+  end subroutine test_layernorm_integration
+end program test_layernorm_instance
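
Usage sketch (illustrative, not part of the patch): the snippet below shows how the new layernorm constructor slots between rank-2 layers, mirroring test_layernorm_integration above. The shapes, epoch count, and learning rate are arbitrary choices made for the example; only constructors and network methods that this patch exports or already uses are assumed.

program layernorm_usage_sketch
  ! Minimal sketch of the public API touched by this patch; mirrors the
  ! integration test above. Shapes and hyperparameters are illustrative.
  use nf, only: network, input, linear2d, layernorm, flatten, sgd
  implicit none

  type(network) :: net
  real :: x(2, 3)
  real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]
  integer :: epoch

  call random_number(x)

  ! layernorm() takes no arguments; sequence_length and model_dimension are
  ! inferred at initialization from the preceding layer, which must produce
  ! rank-2 output (linear2d or self_attention).
  net = network([ &
    input(2, 3), &
    linear2d(3), &
    layernorm(), &
    flatten() &
  ])

  do epoch = 1, 1000
    call net % forward(x)
    call net % backward(y)
    call net % update(optimizer=sgd(learning_rate=0.001))
  end do
end program layernorm_usage_sketch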