From 362015dccd878d93cac1387e53c8a61e84be31b5 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 11 Feb 2025 00:55:01 +0400 Subject: [PATCH 01/18] layernorm: initial implementation --- src/nf/layernorm.f90 | 159 ++++++++++++++++++++++++++++++++++++++++ test/test_layernorm.f90 | 86 ++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 src/nf/layernorm.f90 create mode 100644 test/test_layernorm.f90 diff --git a/src/nf/layernorm.f90 b/src/nf/layernorm.f90 new file mode 100644 index 00000000..a0219758 --- /dev/null +++ b/src/nf/layernorm.f90 @@ -0,0 +1,159 @@ +module nf_layernorm_layer + use nf_activation, only: activation_function + use nf_base_layer, only: base_layer + + implicit none + + private + public :: layernorm_layer + + type, extends(base_layer) :: layernorm_layer + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! https://arxiv.org/abs/1607.06450v1 + integer :: sequence_length + integer :: model_dimension + + real :: eps + real, allocatable :: gamma(:) + real, allocatable :: beta(:) + + real, allocatable :: d_gamma(:) + real, allocatable :: d_beta(:) + real, allocatable :: gradient(:, :) + + real, allocatable :: mu(:, :) + real, allocatable :: sigma(:) + + real, allocatable :: output(:, :) + + contains + procedure :: forward + procedure :: backward + procedure :: spread_by_sequence + procedure :: spread_by_model_dim + procedure :: init + end type layernorm_layer + + interface layernorm_layer + module function layernorm_layer_cons(sequence_length, model_dimension) & + result(res) + integer, intent(in) :: sequence_length, model_dimension + type(layernorm_layer) :: res + end function layernorm_layer_cons + end interface layernorm_layer + +contains + module function layernorm_layer_cons(sequence_length, model_dimension) & + result(res) + integer, intent(in) :: sequence_length, model_dimension + type(layernorm_layer) :: res + + res % sequence_length = sequence_length + res % model_dimension = model_dimension + res % eps = 1e-5 + end function layernorm_layer_cons + + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, allocatable :: normalized(:, :) + integer :: i + + allocate(normalized(self % sequence_length, self % model_dimension)) + + ! mu = x - MEAN_last_dim(x) + do concurrent(i = 1: self % model_dimension) + self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) + end do + + ! square root of variance shifted be eps + self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) + + ! normalize mu by variance by first axis + do concurrent(i = 1: self % model_dimension) + normalized(:, i) = self % mu(:, i) / self % sigma + end do + + ! 
forward through trainable params gamma and beta + do concurrent(i = 1: self % sequence_length) + self % output(i, :) = normalized(i, :) * self % gamma + self % beta + end do + + deallocate(normalized) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + real, allocatable :: one_over_sigma(:, :) + real, allocatable :: gradient_by_gamma_over_sigma(:, :) + + allocate(one_over_sigma(self % sequence_length, self % model_dimension)) + allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) + + one_over_sigma = (1 / self % spread_by_model_dim(self % sigma)) + gradient_by_gamma_over_sigma = gradient * self % spread_by_sequence(self % gamma) * one_over_sigma + + ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) + self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + + ! d_output/d_beta = sum(d_output/d_y) * 1 + self % d_beta = sum(gradient, dim=1) + + ! From this article: + ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ + ! d_output/d_x = d_output/d_y * gamma/sigma + ! - d_output/d_y + ! - sum(d_output/d_y * gamma/sigma) / len + ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len + self % gradient = & + gradient_by_gamma_over_sigma & + - self % spread_by_model_dim(sum(gradient_by_gamma_over_sigma, dim=2)) / self % model_dimension & + - self % mu * self % spread_by_model_dim(sum(& + gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2),& + dim=2)& + ) / self % model_dimension + + deallocate(one_over_sigma) + deallocate(gradient_by_gamma_over_sigma) + end subroutine backward + + pure function spread_by_sequence(self, input) result(output) + class(layernorm_layer), intent(in) :: self + real, intent(in) :: input(:) + real :: output(self % sequence_length, self % model_dimension) + + output = spread(input, dim=1, ncopies=self % sequence_length) + end function spread_by_sequence + + pure function spread_by_model_dim(self, input) result(output) + class(layernorm_layer), intent(in) :: self + real, intent(in) :: input(:) + real :: output(self % sequence_length, self % model_dimension) + + output = spread(input, dim=2, ncopies=self % model_dimension) + end function spread_by_model_dim + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + ! default initialization from PyTorch + allocate(self % gamma(self % model_dimension)) + self % gamma = 1. + allocate(self % beta(self % model_dimension)) + self % beta = 0. + + allocate(self % d_gamma(self % model_dimension)) + allocate(self % d_beta(self % model_dimension)) + allocate(self % gradient(self % sequence_length, self % model_dimension)) + + allocate(self % mu(self % sequence_length, self % model_dimension)) + allocate(self % sigma(self % sequence_length)) + + allocate(self % output(self % sequence_length, self % model_dimension)) + end subroutine init +end module nf_layernorm_layer \ No newline at end of file diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 new file mode 100644 index 00000000..85326fad --- /dev/null +++ b/test/test_layernorm.f90 @@ -0,0 +1,86 @@ +program test_layernorm + use iso_fortran_env, only: stderr => error_unit + use nf_layernorm_layer, only: layernorm_layer + implicit none + + logical :: ok = .true. 
+ type(layernorm_layer) :: layernorm + real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) + real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) + + layernorm = layernorm_layer(3, 4) + call layernorm % init([0]) + + call test_layernorm_forward(layernorm, sample_input, ok) + call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + +contains + subroutine test_layernorm_forward(layernorm, input, ok) + type(layernorm_layer), intent(in out) :: layernorm + real, intent(in out) :: input(:, :) + logical, intent(in out) :: ok + real :: output_shape(2) + real :: output_flat(12) + real :: expected_shape(2) = [3, 4] + real :: expected_output_flat(12) = [& + -0.693158746, 0.939844191, -0.992156327, 1.72702277, -0.970368207, 0.971188426,& + -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564& + ] + + call layernorm % forward(input) + + output_shape = shape(layernorm % output) + if (.not. all(output_shape.eq.expected_shape)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect shape.. failed' + end if + output_flat = reshape(layernorm % output, shape(output_flat)) + if (.not. all(output_flat.eq.expected_output_flat)) then + ok = .false. + write(stderr, '(a)') 'forward returned incorrect values.. failed' + end if + end subroutine test_layernorm_forward + + subroutine test_layernorm_backward(layernorm, input, gradient, ok) + type(layernorm_layer), intent(in out) :: layernorm + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + + real :: gradient_shape(2) + real :: gradient_flat(12) + real :: expected_gradient_shape(2) = [3, 4] + real :: expected_gradient_flat(12) = [& + -0.227230772, 0.103088334, -9.88590196E-02, -2.86390483E-02, 0.283811331, 0.277955681,& + -0.215662330, -0.105019525, -0.269407451, 0.471532196, -0.281880081, 9.03107598E-02& + ] + + real :: d_gamma(4) + real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449] + real :: d_beta(4) + real :: expected_d_beta(4) = [5.09999990, 6.09999990, 2.19999981, 6.09999990] + + call layernorm % backward(input, gradient) + + gradient_shape = shape(layernorm % gradient) + if (.not. all(gradient_shape.eq.expected_gradient_shape)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' + end if + gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) + if (.not. all(gradient_flat.eq.expected_gradient_flat)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' + end if + + if (.not. all(layernorm % d_gamma.eq.expected_d_gamma)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' + end if + if (.not. all(layernorm % d_beta.eq.expected_d_beta)) then + ok = .false. + write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' + end if + end subroutine test_layernorm_backward + +end program test_layernorm From 005daf21a5286eb94a53f16d1a31f44e40be9bd8 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 11 Feb 2025 00:58:46 +0400 Subject: [PATCH 02/18] layernorm: rename source file --- src/nf/{layernorm.f90 => nf_layernorm.f90} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/nf/{layernorm.f90 => nf_layernorm.f90} (100%) diff --git a/src/nf/layernorm.f90 b/src/nf/nf_layernorm.f90 similarity index 100% rename from src/nf/layernorm.f90 rename to src/nf/nf_layernorm.f90 From d657fa787a9f18d54084295ceac7e8752f3f3407 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 17:31:05 +0400 Subject: [PATCH 03/18] layernorm: remove redundant arguments --- src/nf/nf_layernorm.f90 | 14 ++++++++------ test/test_layernorm.f90 | 11 +++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index a0219758..dc81bcd1 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -37,21 +37,17 @@ module nf_layernorm_layer end type layernorm_layer interface layernorm_layer - module function layernorm_layer_cons(sequence_length, model_dimension) & + module function layernorm_layer_cons() & result(res) - integer, intent(in) :: sequence_length, model_dimension type(layernorm_layer) :: res end function layernorm_layer_cons end interface layernorm_layer contains - module function layernorm_layer_cons(sequence_length, model_dimension) & + module function layernorm_layer_cons() & result(res) - integer, intent(in) :: sequence_length, model_dimension type(layernorm_layer) :: res - res % sequence_length = sequence_length - res % model_dimension = model_dimension res % eps = 1e-5 end function layernorm_layer_cons @@ -141,6 +137,12 @@ module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) + if (size(input_shape) /= 2) then + error stop "LayerNorm Layer accepts 2D input" + end if + self % sequence_length = input_shape(1) + self % model_dimension = input_shape(2) + ! default initialization from PyTorch allocate(self % gamma(self % model_dimension)) self % gamma = 1. diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 85326fad..f51d6221 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -8,12 +8,19 @@ program test_layernorm real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) - layernorm = layernorm_layer(3, 4) - call layernorm % init([0]) + layernorm = layernorm_layer() + call layernorm % init([3, 4]) call test_layernorm_forward(layernorm, sample_input, ok) call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + if (ok) then + print '(a)', 'test_layernorm_layer: All tests passed.' + else + write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.' 
+ stop 1 + end if + contains subroutine test_layernorm_forward(layernorm, input, ok) type(layernorm_layer), intent(in out) :: layernorm From 0dbaf07aecea1d9f3710f63e7d4e1626a3a44dc1 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 18:41:55 +0400 Subject: [PATCH 04/18] layernorm: remove stack allocated arrays --- src/nf/nf_layernorm.f90 | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index dc81bcd1..ac3044f8 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -31,8 +31,6 @@ module nf_layernorm_layer contains procedure :: forward procedure :: backward - procedure :: spread_by_sequence - procedure :: spread_by_model_dim procedure :: init end type layernorm_layer @@ -90,8 +88,11 @@ pure module subroutine backward(self, input, gradient) allocate(one_over_sigma(self % sequence_length, self % model_dimension)) allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - one_over_sigma = (1 / self % spread_by_model_dim(self % sigma)) - gradient_by_gamma_over_sigma = gradient * self % spread_by_sequence(self % gamma) * one_over_sigma + one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + gradient_by_gamma_over_sigma = & + gradient & + * spread(self % gamma, dim=1, ncopies=self % sequence_length) & + * one_over_sigma ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) @@ -107,32 +108,21 @@ pure module subroutine backward(self, input, gradient) ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len self % gradient = & gradient_by_gamma_over_sigma & - - self % spread_by_model_dim(sum(gradient_by_gamma_over_sigma, dim=2)) / self % model_dimension & - - self % mu * self % spread_by_model_dim(sum(& - gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2),& - dim=2)& - ) / self % model_dimension + - spread(& + sum(gradient_by_gamma_over_sigma, dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension & + - self % mu * spread(& + sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension deallocate(one_over_sigma) deallocate(gradient_by_gamma_over_sigma) end subroutine backward - pure function spread_by_sequence(self, input) result(output) - class(layernorm_layer), intent(in) :: self - real, intent(in) :: input(:) - real :: output(self % sequence_length, self % model_dimension) - - output = spread(input, dim=1, ncopies=self % sequence_length) - end function spread_by_sequence - - pure function spread_by_model_dim(self, input) result(output) - class(layernorm_layer), intent(in) :: self - real, intent(in) :: input(:) - real :: output(self % sequence_length, self % model_dimension) - - output = spread(input, dim=2, ncopies=self % model_dimension) - end function spread_by_model_dim - module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) From 612db46e10f03a0f766cf1f62ade4c183cb24add Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Mon, 17 Feb 2025 18:45:07 +0400 Subject: [PATCH 05/18] layernorm: rearrange into submodule --- src/nf/nf_layernorm.f90 | 124 ++++-------------------------- src/nf/nf_layernorm_submodule.f90 | 110 ++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 107 deletions(-) create mode 100644 
src/nf/nf_layernorm_submodule.f90 diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index ac3044f8..245f2870 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -41,111 +41,21 @@ module function layernorm_layer_cons() & end function layernorm_layer_cons end interface layernorm_layer -contains - module function layernorm_layer_cons() & - result(res) - type(layernorm_layer) :: res - - res % eps = 1e-5 - end function layernorm_layer_cons - - pure module subroutine forward(self, input) - class(layernorm_layer), intent(in out) :: self - real, intent(in) :: input(:, :) - real, allocatable :: normalized(:, :) - integer :: i - - allocate(normalized(self % sequence_length, self % model_dimension)) - - ! mu = x - MEAN_last_dim(x) - do concurrent(i = 1: self % model_dimension) - self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) - end do - - ! square root of variance shifted be eps - self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) - - ! normalize mu by variance by first axis - do concurrent(i = 1: self % model_dimension) - normalized(:, i) = self % mu(:, i) / self % sigma - end do - - ! forward through trainable params gamma and beta - do concurrent(i = 1: self % sequence_length) - self % output(i, :) = normalized(i, :) * self % gamma + self % beta - end do - - deallocate(normalized) - end subroutine forward - - pure module subroutine backward(self, input, gradient) - class(layernorm_layer), intent(in out) :: self - real, intent(in) :: input(:, :) - real, intent(in) :: gradient(:, :) - real, allocatable :: one_over_sigma(:, :) - real, allocatable :: gradient_by_gamma_over_sigma(:, :) - - allocate(one_over_sigma(self % sequence_length, self % model_dimension)) - allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - - one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) - gradient_by_gamma_over_sigma = & - gradient & - * spread(self % gamma, dim=1, ncopies=self % sequence_length) & - * one_over_sigma - - ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) - self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) - - ! d_output/d_beta = sum(d_output/d_y) * 1 - self % d_beta = sum(gradient, dim=1) - - ! From this article: - ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ - ! d_output/d_x = d_output/d_y * gamma/sigma - ! - d_output/d_y - ! - sum(d_output/d_y * gamma/sigma) / len - ! - mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len - self % gradient = & - gradient_by_gamma_over_sigma & - - spread(& - sum(gradient_by_gamma_over_sigma, dim=2),& - dim=2,& - ncopies=self % model_dimension& - ) / self % model_dimension & - - self % mu * spread(& - sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& - dim=2,& - ncopies=self % model_dimension& - ) / self % model_dimension - - deallocate(one_over_sigma) - deallocate(gradient_by_gamma_over_sigma) - end subroutine backward - - module subroutine init(self, input_shape) - class(layernorm_layer), intent(in out) :: self - integer, intent(in) :: input_shape(:) - - if (size(input_shape) /= 2) then - error stop "LayerNorm Layer accepts 2D input" - end if - self % sequence_length = input_shape(1) - self % model_dimension = input_shape(2) - - ! default initialization from PyTorch - allocate(self % gamma(self % model_dimension)) - self % gamma = 1. - allocate(self % beta(self % model_dimension)) - self % beta = 0. 
- - allocate(self % d_gamma(self % model_dimension)) - allocate(self % d_beta(self % model_dimension)) - allocate(self % gradient(self % sequence_length, self % model_dimension)) - - allocate(self % mu(self % sequence_length, self % model_dimension)) - allocate(self % sigma(self % sequence_length)) - - allocate(self % output(self % sequence_length, self % model_dimension)) - end subroutine init + interface + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + end subroutine init + end interface end module nf_layernorm_layer \ No newline at end of file diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 new file mode 100644 index 00000000..52b7a426 --- /dev/null +++ b/src/nf/nf_layernorm_submodule.f90 @@ -0,0 +1,110 @@ +submodule(nf_layernorm_layer) nf_layernorm_layer_submodule + implicit none +contains + module function layernorm_layer_cons() & + result(res) + type(layernorm_layer) :: res + + res % eps = 1e-5 + end function layernorm_layer_cons + + pure module subroutine forward(self, input) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, allocatable :: normalized(:, :) + integer :: i + + allocate(normalized(self % sequence_length, self % model_dimension)) + + ! mu = x - MEAN_last_dim(x) + do concurrent(i = 1: self % model_dimension) + self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) + end do + + ! square root of variance shifted be eps + self % sigma = sqrt((sum(self % mu ** 2, dim=2) / self % model_dimension) + self % eps) + + ! normalize mu by variance by first axis + do concurrent(i = 1: self % model_dimension) + normalized(:, i) = self % mu(:, i) / self % sigma + end do + + ! forward through trainable params gamma and beta + do concurrent(i = 1: self % sequence_length) + self % output(i, :) = normalized(i, :) * self % gamma + self % beta + end do + + deallocate(normalized) + end subroutine forward + + pure module subroutine backward(self, input, gradient) + class(layernorm_layer), intent(in out) :: self + real, intent(in) :: input(:, :) + real, intent(in) :: gradient(:, :) + real, allocatable :: one_over_sigma(:, :) + real, allocatable :: gradient_by_gamma_over_sigma(:, :) + + allocate(one_over_sigma(self % sequence_length, self % model_dimension)) + allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) + + one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + gradient_by_gamma_over_sigma = & + gradient & + * spread(self % gamma, dim=1, ncopies=self % sequence_length) & + * one_over_sigma + + ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) + self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + + ! d_output/d_beta = sum(d_output/d_y) * 1 + self % d_beta = sum(gradient, dim=1) + + ! From this article: + ! https://robotchinwag.com/posts/layer-normalization-deriving-the-gradient-for-the-backward-pass/ + ! d_output/d_x = d_output/d_y * gamma/sigma + ! - d_output/d_y + ! - sum(d_output/d_y * gamma/sigma) / len + ! 
- mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len + self % gradient = & + gradient_by_gamma_over_sigma & + - spread(& + sum(gradient_by_gamma_over_sigma, dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension & + - self % mu * spread(& + sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + dim=2,& + ncopies=self % model_dimension& + ) / self % model_dimension + + deallocate(one_over_sigma) + deallocate(gradient_by_gamma_over_sigma) + end subroutine backward + + module subroutine init(self, input_shape) + class(layernorm_layer), intent(in out) :: self + integer, intent(in) :: input_shape(:) + + if (size(input_shape) /= 2) then + error stop "LayerNorm Layer accepts 2D input" + end if + self % sequence_length = input_shape(1) + self % model_dimension = input_shape(2) + + ! default initialization from PyTorch + allocate(self % gamma(self % model_dimension)) + self % gamma = 1. + allocate(self % beta(self % model_dimension)) + self % beta = 0. + + allocate(self % d_gamma(self % model_dimension)) + allocate(self % d_beta(self % model_dimension)) + allocate(self % gradient(self % sequence_length, self % model_dimension)) + + allocate(self % mu(self % sequence_length, self % model_dimension)) + allocate(self % sigma(self % sequence_length)) + + allocate(self % output(self % sequence_length, self % model_dimension)) + end subroutine init +end submodule nf_layernorm_layer_submodule From c4a3e3cfc41f88fc1aa9651087c1289852be1bc4 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Fri, 21 Feb 2025 22:21:06 +0400 Subject: [PATCH 06/18] layernorm: add error to stop in test --- test/test_layernorm.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index f51d6221..fec4b67b 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -18,7 +18,7 @@ program test_layernorm print '(a)', 'test_layernorm_layer: All tests passed.' else write(stderr, '(a)') 'test_layernorm_layer: One or more tests failed.' 
- stop 1 + error stop 1 end if contains From bdefd02a12a10164dc6c79145f747a12f4d8497c Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:00:52 +0400 Subject: [PATCH 07/18] layernorm: add gradient updates --- src/nf/nf_layernorm.f90 | 27 +++++++++++++++++ src/nf/nf_layernorm_submodule.f90 | 48 +++++++++++++++++++++++++++++ test/test_layernorm.f90 | 50 +++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index 245f2870..b148c534 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -32,6 +32,10 @@ module nf_layernorm_layer procedure :: forward procedure :: backward procedure :: init + procedure :: get_num_params + procedure :: get_params + procedure :: get_gradients + procedure :: set_params end type layernorm_layer interface layernorm_layer @@ -57,5 +61,28 @@ module subroutine init(self, input_shape) class(layernorm_layer), intent(in out) :: self integer, intent(in) :: input_shape(:) end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + end subroutine set_params end interface end module nf_layernorm_layer \ No newline at end of file diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 52b7a426..4f6eae78 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -107,4 +107,52 @@ module subroutine init(self, input_shape) allocate(self % output(self % sequence_length, self % model_dimension)) end subroutine init + + pure module function get_num_params(self) result(num_params) + class(layernorm_layer), intent(in) :: self + integer :: num_params + + ! Number of weights times number of biases + num_params = 2 * self % model_dimension + + end function get_num_params + + + module function get_params(self) result(params) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: params(:) + + params = [ & + self % gamma, & + self % beta & + ] + + end function get_params + + + module function get_gradients(self) result(gradients) + class(layernorm_layer), intent(in), target :: self + real, allocatable :: gradients(:) + + gradients = [ & + self % d_gamma, & + self % d_beta & + ] + + end function get_gradients + + + module subroutine set_params(self, params) + class(layernorm_layer), intent(in out) :: self + real, intent(in), target :: params(:) + + ! 
check if the number of parameters is correct + if (size(params) /= self % get_num_params()) then + error stop 'Error: number of parameters does not match' + end if + + self % gamma = params(1: self % model_dimension) + self % beta = params(self % model_dimension + 1: 2 * self % model_dimension) + + end subroutine set_params end submodule nf_layernorm_layer_submodule diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index fec4b67b..636089d5 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -1,6 +1,7 @@ program test_layernorm use iso_fortran_env, only: stderr => error_unit use nf_layernorm_layer, only: layernorm_layer + use nf, only: sgd implicit none logical :: ok = .true. @@ -13,6 +14,7 @@ program test_layernorm call test_layernorm_forward(layernorm, sample_input, ok) call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + call test_layernorm_gradients(sample_input, sample_gradient, ok) if (ok) then print '(a)', 'test_layernorm_layer: All tests passed.' @@ -90,4 +92,52 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) end if end subroutine test_layernorm_backward + subroutine test_layernorm_gradients(input, gradient, ok) + real, intent(in out) :: input(:, :) + real, intent(in out) :: gradient(:, :) + logical, intent(in out) :: ok + type(layernorm_layer) :: layernorm + type(sgd) :: optim + + real :: parameters(8) + real :: expected_parameters(8) + real :: updated_output(12) + real :: expected_updated_output(12) = [& + -0.738849819, 0.881645918, -1.03555739,& + 1.66299772, -1.02966857, 0.908487320,& + -0.562230229, 1.01311040, 0.984123051,& + -0.564699769, -1.13543355, -1.11444426& + ] + + layernorm = layernorm_layer() + call layernorm % init([3, 4]) + + call layernorm % forward(input) + call layernorm % backward(input, gradient) + + if (layernorm % get_num_params() /= 8) then + ok = .false. + write(stderr, '(a)') 'incorrect number of parameters.. failed' + end if + + expected_parameters(1: 4) = 1. + expected_parameters(5: 8) = 0. + parameters = layernorm % get_params() + if (.not. all(parameters.eq.expected_parameters)) then + ok = .false. + write(stderr, '(a)') 'incorrect parameters.. failed' + end if + + optim = SGD(learning_rate=0.01) + call optim % minimize(parameters, layernorm % get_gradients()) + call layernorm % set_params(parameters) + + call layernorm % forward(input) + + updated_output = reshape(layernorm % output, [12]) + if (.not. all(updated_output.eq.expected_updated_output)) then + ok = .false. + write(stderr, '(a)') 'incorrect output after parameters update.. 
failed' + end if + end subroutine test_layernorm_gradients end program test_layernorm From ccc180ee427efa6b65a0866680466738d5e0a484 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:24:19 +0400 Subject: [PATCH 08/18] layernorm: public api --- src/nf.f90 | 3 +- src/nf/nf_layer_constructors.f90 | 29 ++++++++----- src/nf/nf_layer_constructors_submodule.f90 | 8 ++++ src/nf/nf_layer_submodule.f90 | 50 ++++++++++++++++++++-- src/nf/nf_network_submodule.f90 | 3 ++ 5 files changed, 78 insertions(+), 15 deletions(-) diff --git a/src/nf.f90 b/src/nf.f90 index 39f67ea3..67d18ea2 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -11,7 +11,8 @@ module nf linear2d, & maxpool2d, & reshape, & - self_attention + self_attention, & + layer_normalization use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index db60cf0f..e2fd50a8 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -17,7 +17,8 @@ module nf_layer_constructors linear2d, & maxpool2d, & reshape, & - self_attention + self_attention, & + layer_normalization interface input @@ -222,15 +223,23 @@ module function linear2d(out_features) result(res) !! Resulting layer instance end function linear2d - module function self_attention(num_heads) result(res) - !! Rank-2 (sequence_length, out_features) self attention constructor. - !! sequence_length and model_dimension are determined at layer initialization, based on the - !! output shape of the previous layer. - integer, intent(in) :: num_heads - !! Number of attention heads - type(layer) :: res - !! Resulting layer instance - end function self_attention + module function self_attention(num_heads) result(res) + !! Rank-2 (sequence_length, out_features) self attention constructor. + !! sequence_length and model_dimension are determined at layer initialization, based on the + !! output shape of the previous layer. + integer, intent(in) :: num_heads + !! Number of attention heads + type(layer) :: res + !! Resulting layer instance + end function self_attention + + module function layer_normalization() result(res) + !! Layer Normalization + !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta + !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: + !! 
https://arxiv.org/abs/1607.06450v1 + type(layer) :: res + end function layer_normalization end interface diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 9e5322c1..0b33f3c6 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -12,6 +12,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_activation, only: activation_function, relu, sigmoid implicit none @@ -179,4 +180,11 @@ module function self_attention(num_heads) result(res) allocate(res % p, source=self_attention_layer(num_heads)) end function self_attention + module function layer_normalization() result(res) + type(layer) :: res + + res % name = 'layer_normalization' + allocate(res % p, source=layernorm_layer()) + end function layer_normalization + end submodule nf_layer_constructors_submodule diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index ecdeb41d..801c7754 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -12,6 +12,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_optimizers, only: optimizer_base_type contains @@ -60,6 +61,8 @@ pure module subroutine backward_1d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select end select @@ -84,6 +87,8 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select type is(self_attention_layer) @@ -95,8 +100,18 @@ pure module subroutine backward_2d(self, previous, gradient) call this_layer % backward(prev_layer % output, gradient) type is(self_attention_layer) call this_layer % backward(prev_layer % output, gradient) + type is(layernorm_layer) + call this_layer % backward(prev_layer % output, gradient) end select + type is(layernorm_layer) + + select type(prev_layer => previous % p) + type is(linear2d_layer) + call this_layer % backward(prev_layer % output, gradient) + type is(self_attention_layer) + call this_layer % backward(prev_layer % output, gradient) + end select end select end subroutine backward_2d @@ -250,7 +265,7 @@ module subroutine forward(self, input) type is(linear2d_layer) - ! Upstream layers permitted: input2d, linear2d + ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -258,11 +273,13 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(self_attention_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(self_attention_layer) - ! Upstream layers permitted: input2d, linear2d + ! 
Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -270,6 +287,18 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(self_attention_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(layernorm_layer) + + ! Upstream layers permitted: linear2d, self_attention + select type(prev_layer => input % p) + type is(linear2d_layer) + call this_layer % forward(prev_layer % output) + type is(self_attention_layer) + call this_layer % forward(prev_layer % output) end select end select @@ -311,6 +340,8 @@ pure module subroutine get_output_2d(self, output) allocate(output, source=this_layer % output) type is(self_attention_layer) allocate(output, source=this_layer % output) + type is(layernorm_layer) + allocate(output, source=this_layer % output) class default error stop '2-d output can only be read from an input2d or linear2d layer.' @@ -354,8 +385,8 @@ impure elemental module subroutine init(self, input) call this_layer % init(input % layer_shape) end select - ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, or - ! self_attention layers is not known until we receive an input layer. + ! The shape of conv2d, dropout, flatten, linear2d, maxpool2d, + ! self_attention or layernorm layers is not known until we receive an input layer. select type(this_layer => self % p) type is(conv2d_layer) self % layer_shape = shape(this_layer % output) @@ -367,6 +398,8 @@ impure elemental module subroutine init(self, input) self % layer_shape = shape(this_layer % output) type is(self_attention_layer) self % layer_shape = shape(this_layer % output) + type is(layernorm_layer) + self % layer_shape = shape(this_layer % output) type is(maxpool2d_layer) self % layer_shape = shape(this_layer % output) end select @@ -425,6 +458,8 @@ elemental module function get_num_params(self) result(num_params) num_params = this_layer % get_num_params() type is (self_attention_layer) num_params = this_layer % get_num_params() + type is (layernorm_layer) + num_params = this_layer % get_num_params() class default error stop 'Unknown layer type.' end select @@ -458,6 +493,8 @@ module function get_params(self) result(params) params = this_layer % get_params() type is (self_attention_layer) params = this_layer % get_params() + type is (layernorm_layer) + params = this_layer % get_params() class default error stop 'Unknown layer type.' end select @@ -491,6 +528,8 @@ module function get_gradients(self) result(gradients) gradients = this_layer % get_gradients() type is (self_attention_layer) gradients = this_layer % get_gradients() + type is (layernorm_layer) + gradients = this_layer % get_gradients() class default error stop 'Unknown layer type.' end select @@ -549,6 +588,9 @@ module subroutine set_params(self, params) type is (self_attention_layer) call this_layer % set_params(params) + type is (layernorm_layer) + call this_layer % set_params(params) + type is (maxpool2d_layer) ! No parameters to set. 
write(stderr, '(a)') 'Warning: calling set_params() ' & diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index f344c5c5..a6b7657c 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -11,6 +11,7 @@ use nf_reshape_layer, only: reshape3d_layer use nf_linear2d_layer, only: linear2d_layer use nf_self_attention_layer, only: self_attention_layer + use nf_layernorm_layer, only: layernorm_layer use nf_layer, only: layer use nf_layer_constructors, only: conv2d, dense, flatten, input, maxpool2d, reshape use nf_loss, only: quadratic @@ -163,6 +164,8 @@ module subroutine backward(self, output, loss) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(self_attention_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) + type is(layernorm_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) end select end if From 06670008815a0cf11baa8054ab6599839fdeb383 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:34:35 +0400 Subject: [PATCH 09/18] layernorm: update tests --- test/test_layernorm.f90 | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 636089d5..75af7f26 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -24,6 +24,14 @@ program test_layernorm end if contains + function allclose(x, y) result(res) + real, intent(in) :: x(:) + real, intent(in) :: y(:) + logical :: res + + res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) + end function allclose + subroutine test_layernorm_forward(layernorm, input, ok) type(layernorm_layer), intent(in out) :: layernorm real, intent(in out) :: input(:, :) @@ -44,7 +52,7 @@ subroutine test_layernorm_forward(layernorm, input, ok) write(stderr, '(a)') 'forward returned incorrect shape.. failed' end if output_flat = reshape(layernorm % output, shape(output_flat)) - if (.not. all(output_flat.eq.expected_output_flat)) then + if (.not. allclose(output_flat, expected_output_flat)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect values.. failed' end if @@ -67,7 +75,7 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) real :: d_gamma(4) real :: expected_d_gamma(4) = [0.765904069, 0.175162792, 2.16362262, -4.57002449] real :: d_beta(4) - real :: expected_d_beta(4) = [5.09999990, 6.09999990, 2.19999981, 6.09999990] + real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1] call layernorm % backward(input, gradient) @@ -77,16 +85,16 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' end if gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) - if (.not. all(gradient_flat.eq.expected_gradient_flat)) then + if (.not. allclose(gradient_flat, expected_gradient_flat)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' end if - if (.not. all(layernorm % d_gamma.eq.expected_d_gamma)) then + if (.not. allclose(layernorm % d_gamma, expected_d_gamma)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' end if - if (.not. all(layernorm % d_beta.eq.expected_d_beta)) then + if (.not. allclose(layernorm % d_beta, expected_d_beta)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' end if @@ -135,7 +143,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) call layernorm % forward(input) updated_output = reshape(layernorm % output, [12]) - if (.not. all(updated_output.eq.expected_updated_output)) then + if (.not. allclose(updated_output, expected_updated_output)) then ok = .false. write(stderr, '(a)') 'incorrect output after parameters update.. failed' end if From c2a1e7052806d18bad466b82dd95136f7c03123e Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 17:34:52 +0400 Subject: [PATCH 10/18] layernorm: update cmake --- CMakeLists.txt | 2 ++ test/CMakeLists.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1bf2231..906bdd0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,8 @@ add_library(neural-fortran src/nf/nf_input3d_layer_submodule.f90 src/nf/nf_layer_constructors.f90 src/nf/nf_layer_constructors_submodule.f90 + src/nf/nf_layernorm.f90 + src/nf/nf_layernorm_submodule.f90 src/nf/nf_layer.f90 src/nf/nf_layer_submodule.f90 src/nf/nf_linear2d_layer.f90 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 741e9930..46d349c1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,6 +12,7 @@ foreach(execid insert_flatten reshape_layer multihead_attention_layer + layernorm dense_network get_set_network_params conv2d_network From ddcd204464eba88527b9806df8d643bfbcbab7ed Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Sun, 23 Feb 2025 20:50:27 +0400 Subject: [PATCH 11/18] layernorm: use mold for temp allocation --- src/nf/nf_layernorm_submodule.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 4f6eae78..744888bd 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -14,7 +14,7 @@ pure module subroutine forward(self, input) real, allocatable :: normalized(:, :) integer :: i - allocate(normalized(self % sequence_length, self % model_dimension)) + allocate(normalized, mold=self % mu) ! mu = x - MEAN_last_dim(x) do concurrent(i = 1: self % model_dimension) From 54d081feebfc1923d1987aff03be1c8518585846 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 10:51:52 +0400 Subject: [PATCH 12/18] layernorm: rename to layernorm --- src/nf.f90 | 2 +- src/nf/nf_layer_constructors.f90 | 6 +++--- src/nf/nf_layer_constructors_submodule.f90 | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nf.f90 b/src/nf.f90 index 67d18ea2..d089c4ac 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -12,7 +12,7 @@ module nf maxpool2d, & reshape, & self_attention, & - layer_normalization + layernorm use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index e2fd50a8..ce00b6bc 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -18,7 +18,7 @@ module nf_layer_constructors maxpool2d, & reshape, & self_attention, & - layer_normalization + layernorm interface input @@ -233,13 +233,13 @@ module function self_attention(num_heads) result(res) !! Resulting layer instance end function self_attention - module function layer_normalization() result(res) + module function layernorm() result(res) !! Layer Normalization !! ((x − mean(x)) / sqrt(variance(x) + eps) * gamma + beta !! Based upon `Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton(2016)`: !! 
https://arxiv.org/abs/1607.06450v1 type(layer) :: res - end function layer_normalization + end function layernorm end interface diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index 0b33f3c6..5c2e8893 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -180,11 +180,11 @@ module function self_attention(num_heads) result(res) allocate(res % p, source=self_attention_layer(num_heads)) end function self_attention - module function layer_normalization() result(res) + module function layernorm() result(res) type(layer) :: res - res % name = 'layer_normalization' + res % name = 'layernorm' allocate(res % p, source=layernorm_layer()) - end function layer_normalization + end function layernorm end submodule nf_layer_constructors_submodule From 6ec65acb10821046e5343fc6f19f0541d777da01 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:49:23 +0400 Subject: [PATCH 13/18] layernorm: allow usage of layernorm at the end --- src/nf/nf_layer_submodule.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 801c7754..39cdac1a 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -249,6 +249,8 @@ module subroutine forward(self, input) call this_layer % forward(prev_layer % output) type is(linear2d_layer) call this_layer % forward(prev_layer % output) + type is(layernorm_layer) + call this_layer % forward(prev_layer % output) end select type is(reshape3d_layer) From 720b79b37420d26d99bcf04c15dca34a8d959843 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:49:54 +0400 Subject: [PATCH 14/18] layernorm: integration test for layernorm --- test/test_layernorm.f90 | 106 ++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 75af7f26..15f45aed 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -1,20 +1,23 @@ -program test_layernorm +program test_layernorm_instance use iso_fortran_env, only: stderr => error_unit use nf_layernorm_layer, only: layernorm_layer - use nf, only: sgd + use nf_linear2d_layer, only: linear2d_layer + use nf_layer, only: layer + use nf, only: sgd, layernorm, network, input, flatten, linear2d implicit none logical :: ok = .true. - type(layernorm_layer) :: layernorm + type(layernorm_layer) :: layernorm_instance real :: sample_input(3, 4) = reshape([0.0, 10.1, 0.2, 10.3, 0.4, 10.5, 0.6, 10.7, 10.8, 0.9, 0.11, 0.12], [3, 4]) real :: sample_gradient(3, 4) = reshape([0.1, 3., 2., 0.1, 3., 3., 0.1, 2., 0.1, 3., 0.1, 3.], [3, 4]) - layernorm = layernorm_layer() - call layernorm % init([3, 4]) + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) - call test_layernorm_forward(layernorm, sample_input, ok) - call test_layernorm_backward(layernorm, sample_input, sample_gradient, ok) + call test_layernorm_forward(layernorm_instance, sample_input, ok) + call test_layernorm_backward(layernorm_instance, sample_input, sample_gradient, ok) call test_layernorm_gradients(sample_input, sample_gradient, ok) + call test_layernorm_integration(ok) if (ok) then print '(a)', 'test_layernorm_layer: All tests passed.' 
@@ -32,8 +35,8 @@ function allclose(x, y) result(res) res = all(abs(x - y) <= (1e-06 + 1e-05 * abs(y))) end function allclose - subroutine test_layernorm_forward(layernorm, input, ok) - type(layernorm_layer), intent(in out) :: layernorm + subroutine test_layernorm_forward(layernorm_instance, input, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) logical, intent(in out) :: ok real :: output_shape(2) @@ -44,22 +47,22 @@ subroutine test_layernorm_forward(layernorm, input, ok) -0.552177250, 1.05800152, 1.02837324, -0.481686622, -1.02747762, -1.00740564& ] - call layernorm % forward(input) + call layernorm_instance % forward(input) - output_shape = shape(layernorm % output) + output_shape = shape(layernorm_instance % output) if (.not. all(output_shape.eq.expected_shape)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect shape.. failed' end if - output_flat = reshape(layernorm % output, shape(output_flat)) + output_flat = reshape(layernorm_instance % output, shape(output_flat)) if (.not. allclose(output_flat, expected_output_flat)) then ok = .false. write(stderr, '(a)') 'forward returned incorrect values.. failed' end if end subroutine test_layernorm_forward - subroutine test_layernorm_backward(layernorm, input, gradient, ok) - type(layernorm_layer), intent(in out) :: layernorm + subroutine test_layernorm_backward(layernorm_instance, input, gradient, ok) + type(layernorm_layer), intent(in out) :: layernorm_instance real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) logical, intent(in out) :: ok @@ -77,24 +80,24 @@ subroutine test_layernorm_backward(layernorm, input, gradient, ok) real :: d_beta(4) real :: expected_d_beta(4) = [5.1, 6.1, 2.2, 6.1] - call layernorm % backward(input, gradient) + call layernorm_instance % backward(input, gradient) - gradient_shape = shape(layernorm % gradient) + gradient_shape = shape(layernorm_instance % gradient) if (.not. all(gradient_shape.eq.expected_gradient_shape)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient shape.. failed' end if - gradient_flat = reshape(layernorm % gradient, shape(gradient_flat)) + gradient_flat = reshape(layernorm_instance % gradient, shape(gradient_flat)) if (.not. allclose(gradient_flat, expected_gradient_flat)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect gradient values.. failed' end if - if (.not. allclose(layernorm % d_gamma, expected_d_gamma)) then + if (.not. allclose(layernorm_instance % d_gamma, expected_d_gamma)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_gamma values.. failed' end if - if (.not. allclose(layernorm % d_beta, expected_d_beta)) then + if (.not. allclose(layernorm_instance % d_beta, expected_d_beta)) then ok = .false. write(stderr, '(a)') 'backward returned incorrect d_beta values.. 
failed' end if @@ -104,7 +107,7 @@ subroutine test_layernorm_gradients(input, gradient, ok) real, intent(in out) :: input(:, :) real, intent(in out) :: gradient(:, :) logical, intent(in out) :: ok - type(layernorm_layer) :: layernorm + type(layernorm_layer) :: layernorm_instance type(sgd) :: optim real :: parameters(8) @@ -117,35 +120,76 @@ subroutine test_layernorm_gradients(input, gradient, ok) -0.564699769, -1.13543355, -1.11444426& ] - layernorm = layernorm_layer() - call layernorm % init([3, 4]) + layernorm_instance = layernorm_layer() + call layernorm_instance % init([3, 4]) - call layernorm % forward(input) - call layernorm % backward(input, gradient) + call layernorm_instance % forward(input) + call layernorm_instance % backward(input, gradient) - if (layernorm % get_num_params() /= 8) then + if (layernorm_instance % get_num_params() /= 8) then ok = .false. write(stderr, '(a)') 'incorrect number of parameters.. failed' end if expected_parameters(1: 4) = 1. expected_parameters(5: 8) = 0. - parameters = layernorm % get_params() + parameters = layernorm_instance % get_params() if (.not. all(parameters.eq.expected_parameters)) then ok = .false. write(stderr, '(a)') 'incorrect parameters.. failed' end if optim = SGD(learning_rate=0.01) - call optim % minimize(parameters, layernorm % get_gradients()) - call layernorm % set_params(parameters) + call optim % minimize(parameters, layernorm_instance % get_gradients()) + call layernorm_instance % set_params(parameters) - call layernorm % forward(input) + call layernorm_instance % forward(input) - updated_output = reshape(layernorm % output, [12]) + updated_output = reshape(layernorm_instance % output, [12]) if (.not. allclose(updated_output, expected_updated_output)) then ok = .false. write(stderr, '(a)') 'incorrect output after parameters update.. failed' end if end subroutine test_layernorm_gradients -end program test_layernorm + + subroutine test_layernorm_integration(ok) + logical, intent(in out) :: ok + + type(network) :: net + real :: x(2, 3) = reshape([0.1, 2., 0.3, 4., 0.5, 6.], [2, 3]) + real :: y(6) = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9] + real :: tolerance = 0.1 + integer :: epoch + integer :: epochs = 10000 + + net = network([& + input(2, 3),& + linear2d(3),& + layernorm(),& + flatten()& + ]) + + ! Kaiming weights to achieve semblance of convergance + select type(l => net % layers(2) % p) + type is(linear2d_layer) + call random_number(l % weights) + l % weights = l % weights * sqrt(2. / 6.) + l % biases = 0.2 + end select + + do epoch = 1, epochs + call net % forward(x) + call net % backward(y) + call net % update(optimizer=sgd(learning_rate=0.001)) + if (all(abs(net % predict(x) - y) < tolerance)) exit + end do + print *, abs(net % predict(x) - y) + + print *, epoch + if (.not. epoch <= epochs) then + write(stderr, '(a)') & + 'linear2d + layernorm should converge in simple training.. failed' + ok = .false. 
+ end if + end subroutine test_layernorm_integration +end program test_layernorm_instance From 981addd1c8435732d6559a87e1c2d9ee587df249 Mon Sep 17 00:00:00 2001 From: Mikhail Voronov Date: Tue, 25 Feb 2025 11:59:48 +0400 Subject: [PATCH 15/18] layernorm: memory allocation optimization --- src/nf/nf_layernorm.f90 | 4 ++++ src/nf/nf_layernorm_submodule.f90 | 35 ++++++++++++------------------- test/test_layernorm.f90 | 2 -- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/nf/nf_layernorm.f90 b/src/nf/nf_layernorm.f90 index b148c534..36ef56f0 100644 --- a/src/nf/nf_layernorm.f90 +++ b/src/nf/nf_layernorm.f90 @@ -28,6 +28,10 @@ module nf_layernorm_layer real, allocatable :: output(:, :) + ! temp storages + real, allocatable, private :: normalized(:, :) + real, allocatable, private :: one_over_sigma(:, :) + real, allocatable, private :: gradient_by_gamma_over_sigma(:, :) contains procedure :: forward procedure :: backward diff --git a/src/nf/nf_layernorm_submodule.f90 b/src/nf/nf_layernorm_submodule.f90 index 744888bd..4eaa4382 100644 --- a/src/nf/nf_layernorm_submodule.f90 +++ b/src/nf/nf_layernorm_submodule.f90 @@ -11,11 +11,8 @@ end function layernorm_layer_cons pure module subroutine forward(self, input) class(layernorm_layer), intent(in out) :: self real, intent(in) :: input(:, :) - real, allocatable :: normalized(:, :) integer :: i - allocate(normalized, mold=self % mu) - ! mu = x - MEAN_last_dim(x) do concurrent(i = 1: self % model_dimension) self % mu(:, i) = input(:, i) - (sum(input, dim=2) / self % model_dimension) @@ -26,35 +23,28 @@ pure module subroutine forward(self, input) ! normalize mu by variance by first axis do concurrent(i = 1: self % model_dimension) - normalized(:, i) = self % mu(:, i) / self % sigma + self % normalized(:, i) = self % mu(:, i) / self % sigma end do ! forward through trainable params gamma and beta do concurrent(i = 1: self % sequence_length) - self % output(i, :) = normalized(i, :) * self % gamma + self % beta + self % output(i, :) = self % normalized(i, :) * self % gamma + self % beta end do - - deallocate(normalized) end subroutine forward pure module subroutine backward(self, input, gradient) class(layernorm_layer), intent(in out) :: self real, intent(in) :: input(:, :) real, intent(in) :: gradient(:, :) - real, allocatable :: one_over_sigma(:, :) - real, allocatable :: gradient_by_gamma_over_sigma(:, :) - - allocate(one_over_sigma(self % sequence_length, self % model_dimension)) - allocate(gradient_by_gamma_over_sigma(self % sequence_length, self % model_dimension)) - one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) - gradient_by_gamma_over_sigma = & + self % one_over_sigma = (1 / spread(self % sigma, dim=2, ncopies=self % model_dimension)) + self % gradient_by_gamma_over_sigma = & gradient & * spread(self % gamma, dim=1, ncopies=self % sequence_length) & - * one_over_sigma + * self % one_over_sigma ! d_output/d_gamma = sum(d_output/d_y * mu/sigma) - self % d_gamma = sum(gradient * self % mu * one_over_sigma, dim=1) + self % d_gamma = sum(gradient * self % mu * self % one_over_sigma, dim=1) ! d_output/d_beta = sum(d_output/d_y) * 1 self % d_beta = sum(gradient, dim=1) @@ -66,20 +56,17 @@ pure module subroutine backward(self, input, gradient) ! - sum(d_output/d_y * gamma/sigma) / len ! 
- mu * sum(d_output/d_y * gamma * mu * sigma^(03)) / len self % gradient = & - gradient_by_gamma_over_sigma & + self % gradient_by_gamma_over_sigma & - spread(& - sum(gradient_by_gamma_over_sigma, dim=2),& + sum(self % gradient_by_gamma_over_sigma, dim=2),& dim=2,& ncopies=self % model_dimension& ) / self % model_dimension & - self % mu * spread(& - sum(gradient_by_gamma_over_sigma * self % mu * (one_over_sigma ** 2), dim=2),& + sum(self % gradient_by_gamma_over_sigma * self % mu * (self % one_over_sigma ** 2), dim=2),& dim=2,& ncopies=self % model_dimension& ) / self % model_dimension - - deallocate(one_over_sigma) - deallocate(gradient_by_gamma_over_sigma) end subroutine backward module subroutine init(self, input_shape) @@ -106,6 +93,10 @@ module subroutine init(self, input_shape) allocate(self % sigma(self % sequence_length)) allocate(self % output(self % sequence_length, self % model_dimension)) + + allocate(self % normalized, mold=self % mu) + allocate(self % one_over_sigma, mold=self % mu) + allocate(self % gradient_by_gamma_over_sigma, mold=self % mu) end subroutine init pure module function get_num_params(self) result(num_params) diff --git a/test/test_layernorm.f90 b/test/test_layernorm.f90 index 15f45aed..6a897575 100644 --- a/test/test_layernorm.f90 +++ b/test/test_layernorm.f90 @@ -183,9 +183,7 @@ subroutine test_layernorm_integration(ok) call net % update(optimizer=sgd(learning_rate=0.001)) if (all(abs(net % predict(x) - y) < tolerance)) exit end do - print *, abs(net % predict(x) - y) - print *, epoch if (.not. epoch <= epochs) then write(stderr, '(a)') & 'linear2d + layernorm should converge in simple training.. failed' From 55077b3d98eec53679ff85041ad2bf6bba18b83b Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 11:59:49 -0500 Subject: [PATCH 16/18] Tidy up --- src/nf/nf_layer_submodule.f90 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 39cdac1a..a3b42434 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -47,7 +47,7 @@ pure module subroutine backward_1d(self, previous, gradient) type is(flatten_layer) - ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d + ! Upstream layers permitted: input2d, input3d, conv2d, layernorm, maxpool2d select type(prev_layer => previous % p) type is(input2d_layer) call this_layer % backward(prev_layer % output, gradient) @@ -267,7 +267,7 @@ module subroutine forward(self, input) type is(linear2d_layer) - ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization + ! Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) @@ -281,7 +281,7 @@ module subroutine forward(self, input) type is(self_attention_layer) - ! Upstream layers permitted: input2d, linear2d, self_attention, layer_normalization + ! 
Upstream layers permitted: input2d, linear2d, self_attention, layernorm select type(prev_layer => input % p) type is(input2d_layer) call this_layer % forward(prev_layer % output) From 249485fb6ee8e357c9806f9bcaedff552ee5354e Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 12:00:02 -0500 Subject: [PATCH 17/18] Bump version --- fpm.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpm.toml b/fpm.toml index ebcceeb6..15a746e4 100644 --- a/fpm.toml +++ b/fpm.toml @@ -1,5 +1,5 @@ name = "neural-fortran" -version = "0.19.0" +version = "0.20.0" license = "MIT" author = "Milan Curcic" maintainer = "mcurcic@miami.edu" From 3e3776b2a54906b3ea8cf1985eb5d5e581504edd Mon Sep 17 00:00:00 2001 From: milancurcic Date: Tue, 25 Feb 2025 12:00:13 -0500 Subject: [PATCH 18/18] Add layernorm to the table of layers --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a04ac32a..9fe3fab0 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,9 @@ Read the paper [here](https://arxiv.org/abs/1902.06714). | Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | -| Linear (2-d) | `linear2d` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | -| Self-attention | `self_attention` | `input2d`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Linear (2-d) | `linear2d` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Self-attention | `self_attention` | `input2d`, `layernorm`, `linear2d`, `self_attention` | 2 | ✅ | ✅ | +| Layer Normalization | `layernorm` | `linear2d`, `self_attention` | 2 | ✅ | ✅ | | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |
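
For readers arriving from the table above, here is a minimal usage sketch of the new layernorm layer, modeled on the integration test added earlier in this series. It is a sketch rather than part of the patches: it assumes the top-level nf module exports network, input, linear2d, layernorm, flatten, and sgd the same way it exports the other layer constructors in the table, and the program name and training settings are illustrative only.

program layernorm_usage_sketch
  ! Sketch only: assumes the top-level nf module exports these names,
  ! as it does for the other layer constructors in the README table.
  use nf, only: network, input, linear2d, layernorm, flatten, sgd
  implicit none

  type(network) :: net
  real :: x(2, 3)   ! sequence_length x model_dimension input
  real :: y(6)      ! flattened target, 2 * 3 = 6 values
  integer :: epoch

  ! layernorm sits between 2-d layers; flatten returns the output to 1-d.
  net = network([ &
    input(2, 3), &
    linear2d(3), &
    layernorm(), &
    flatten() &
  ])

  call random_number(x)
  y = [0.7, 0.2, 0.1, 0.1, 0.01, 0.9]

  do epoch = 1, 1000
    call net % forward(x)
    call net % backward(y)
    call net % update(optimizer=sgd(learning_rate=0.001))
  end do

  print *, net % predict(x)
end program layernorm_usage_sketch

As in the integration test, layernorm() takes no constructor arguments; the layer's shapes come from its init call with the upstream output shape, matching the test's explicit init([3, 4]).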
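
On the implementation side, the backward pass in nf_layernorm_submodule.f90 follows the derivation linked in its comments, where the sigma^(03) shorthand reads as a sigma to the minus 3 factor. Written out per sequence position t, with D the model dimension and i, j indexing the model dimension, the quantities the code computes are, as I read it (a restatement in LaTeX, not a new derivation):

% Forward-pass statistics, matching the mu and sigma arrays in the code.
\[
  \mu_{t,i} = x_{t,i} - \frac{1}{D} \sum_{j} x_{t,j}, \qquad
  \sigma_t = \sqrt{\frac{1}{D} \sum_{j} \mu_{t,j}^{2} + \varepsilon}
\]
% Parameter gradients: d_gamma and d_beta sum over the sequence dimension.
\[
  \frac{\partial L}{\partial \gamma_i} = \sum_{t} \frac{\partial L}{\partial y_{t,i}} \frac{\mu_{t,i}}{\sigma_t}, \qquad
  \frac{\partial L}{\partial \beta_i} = \sum_{t} \frac{\partial L}{\partial y_{t,i}}
\]
% Input gradient: the three terms map onto gradient_by_gamma_over_sigma and
% the two spread(sum(...)) corrections in the backward subroutine.
\[
  \frac{\partial L}{\partial x_{t,i}} =
    \frac{\gamma_i}{\sigma_t} \frac{\partial L}{\partial y_{t,i}}
    - \frac{1}{D} \sum_{j} \frac{\gamma_j}{\sigma_t} \frac{\partial L}{\partial y_{t,j}}
    - \frac{\mu_{t,i}}{D\,\sigma_t^{3}} \sum_{j} \gamma_j\, \mu_{t,j}\, \frac{\partial L}{\partial y_{t,j}}
\]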