diff --git a/CMakeLists.txt b/CMakeLists.txt index 758b30b9..701f6f4e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,23 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES Cray) endif() # library to archive (libneural.a) -add_library(neural src/mod_activation.f90 src/mod_io.f90 src/mod_kinds.f90 src/mod_layer.f90 src/mod_mnist.f90 src/mod_network.f90 src/mod_parallel.f90 src/mod_random.f90) +add_library(neural + src/mod_activation.f90 + src/mod_activation_submodule.f90 + src/mod_io.f90 + src/mod_io_submodule.f90 + src/mod_kinds.f90 + src/mod_layer.f90 + src/mod_layer_submodule.f90 + src/mod_mnist.f90 + src/mod_mnist_submodule.f90 + src/mod_network.f90 + src/mod_network_submodule.f90 + src/mod_parallel.f90 + src/mod_parallel_submodule.f90 + src/mod_random.f90 + src/mod_random_submodule.f90 +) # Remove leading or trailing whitespace string(REGEX REPLACE "^ | $" "" LIBS "${LIBS}") diff --git a/README.md b/README.md index 41e55f67..e85b093f 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,12 @@ Dependencies: * OpenCoarrays (optional, for parallel execution, GFortran only) * BLAS, MKL (optional) +Compilers tested include: + +* gfortran-10.3.0 +* ifort-2021.4 +* ifx-2021.4 + ### Building with fpm #### Building in serial mode diff --git a/fpm.toml b/fpm.toml index b9cb2f3a..a3caa168 100644 --- a/fpm.toml +++ b/fpm.toml @@ -1,6 +1,6 @@ name = "neural-fortran" -version = "0.1.0" +version = "0.2.0" license = "MIT" author = "Milan Curcic" maintainer = "milancurcic@hey.com" -copyright = "Copyright 2018-2021, neural-fortran contributors" +copyright = "Copyright 2018-2022, neural-fortran contributors" diff --git a/src/mod_activation.f90 b/src/mod_activation.f90 index 8997e581..4aa19317 100644 --- a/src/mod_activation.f90 +++ b/src/mod_activation.f90 @@ -16,94 +16,86 @@ module mod_activation public :: tanhf, tanh_prime interface + pure function activation_function(x) import :: rk real(rk), intent(in) :: x(:) real(rk) :: activation_function(size(x)) end function activation_function - end interface -contains - - pure function gaussian(x) result(res) - !! Gaussian activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = exp(-x**2) - end function gaussian - - pure function gaussian_prime(x) result(res) - !! First derivative of the Gaussian activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = -2 * x * gaussian(x) - end function gaussian_prime - - pure function relu(x) result(res) - !! REctified Linear Unit (RELU) activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = max(0., x) - end function relu - - pure function relu_prime(x) result(res) - !! First derivative of the REctified Linear Unit (RELU) activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - where (x > 0) - res = 1 - elsewhere - res = 0 - end where - end function relu_prime - - pure function sigmoid(x) result(res) - !! Sigmoid activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = 1 / (1 + exp(-x)) - endfunction sigmoid - - pure function sigmoid_prime(x) result(res) - !! First derivative of the sigmoid activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = sigmoid(x) * (1 - sigmoid(x)) - end function sigmoid_prime - - pure function step(x) result(res) - !! Step activation function. 
- real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - where (x > 0) - res = 1 - elsewhere - res = 0 - end where - end function step - - pure function step_prime(x) result(res) - !! First derivative of the step activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = 0 - end function step_prime - - pure function tanhf(x) result(res) - !! Tangent hyperbolic activation function. - !! Same as the intrinsic tanh, but must be - !! defined here so that we can use procedure - !! pointer with it. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = tanh(x) - end function tanhf - - pure function tanh_prime(x) result(res) - !! First derivative of the tanh activation function. - real(rk), intent(in) :: x(:) - real(rk) :: res(size(x)) - res = 1 - tanh(x)**2 - end function tanh_prime + pure module function gaussian(x) result(res) + !! Gaussian activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function gaussian + + pure module function gaussian_prime(x) result(res) + !! First derivative of the Gaussian activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function gaussian_prime + + pure module function relu(x) result(res) + !! REctified Linear Unit (RELU) activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function relu + + pure module function relu_prime(x) result(res) + !! First derivative of the REctified Linear Unit (RELU) activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function relu_prime + + pure module function sigmoid(x) result(res) + !! Sigmoid activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function sigmoid + + pure module function sigmoid_prime(x) result(res) + !! First derivative of the sigmoid activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function sigmoid_prime + + pure module function step(x) result(res) + !! Step activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function step + + pure module function step_prime(x) result(res) + !! First derivative of the step activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function step_prime + + pure module function tanhf(x) result(res) + !! Tangent hyperbolic activation function. + !! Same as the intrinsic tanh, but must be + !! defined here so that we can use procedure + !! pointer with it. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function tanhf + + pure module function tanh_prime(x) result(res) + !! First derivative of the tanh activation function. + implicit none + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + end function tanh_prime + + end interface end module mod_activation diff --git a/src/mod_activation_submodule.f90 b/src/mod_activation_submodule.f90 new file mode 100644 index 00000000..c01fa2f9 --- /dev/null +++ b/src/mod_activation_submodule.f90 @@ -0,0 +1,77 @@ +submodule(mod_activation) mod_activation_submodule + + !! A collection of activation functions and their derivatives. 
+ + implicit none + +contains + + pure module function gaussian(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = exp(-x**2) + end function gaussian + + pure module function gaussian_prime(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = -2 * x * gaussian(x) + end function gaussian_prime + + pure module function relu(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = max(0., x) + end function relu + + pure module function relu_prime(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + where (x > 0) + res = 1 + elsewhere + res = 0 + end where + end function relu_prime + + pure module function sigmoid(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = 1 / (1 + exp(-x)) + endfunction sigmoid + + pure module function sigmoid_prime(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = sigmoid(x) * (1 - sigmoid(x)) + end function sigmoid_prime + + pure module function step(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + where (x > 0) + res = 1 + elsewhere + res = 0 + end where + end function step + + pure module function step_prime(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = 0 + end function step_prime + + pure module function tanhf(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = tanh(x) + end function tanhf + + pure module function tanh_prime(x) result(res) + real(rk), intent(in) :: x(:) + real(rk) :: res(size(x)) + res = 1 - tanh(x)**2 + end function tanh_prime + +end submodule mod_activation_submodule diff --git a/src/mod_io.f90 b/src/mod_io.f90 index 00374c56..0d40ad57 100644 --- a/src/mod_io.f90 +++ b/src/mod_io.f90 @@ -9,35 +9,21 @@ module mod_io public :: read_binary_file interface read_binary_file - module procedure :: read_binary_file_1d, read_binary_file_2d - end interface read_binary_file -contains - - subroutine read_binary_file_1d(filename, dtype, nrec, array) - character(len=*), intent(in) :: filename - integer(ik), intent(in) :: dtype, nrec - real(rk), allocatable, intent(in out) :: array(:) - integer(ik) :: fileunit - allocate(array(nrec)) - open(newunit=fileunit, file=filename, access='direct',& - action='read', recl=dtype * nrec, status='old') - read(fileunit, rec=1) array - close(fileunit) - end subroutine read_binary_file_1d - - subroutine read_binary_file_2d(filename, dtype, dsize, nrec, array) - character(len=*), intent(in) :: filename - integer(ik), intent(in) :: dtype, dsize, nrec - real(rk), allocatable, intent(in out) :: array(:,:) - integer(ik) :: fileunit, i - allocate(array(dsize, nrec)) - open(newunit=fileunit, file=filename, access='direct',& - action='read', recl=dtype * dsize, status='old') - do i = 1, nrec - read(fileunit, rec=i) array(:,i) - end do - close(fileunit) - end subroutine read_binary_file_2d + module subroutine read_binary_file_1d(filename, dtype, nrec, array) + implicit none + character(len=*), intent(in) :: filename + integer(ik), intent(in) :: dtype, nrec + real(rk), allocatable, intent(in out) :: array(:) + end subroutine read_binary_file_1d + + module subroutine read_binary_file_2d(filename, dtype, dsize, nrec, array) + implicit none + character(len=*), intent(in) :: filename + integer(ik), intent(in) :: dtype, dsize, nrec + real(rk), allocatable, intent(in out) :: array(:,:) + end subroutine read_binary_file_2d + + end interface read_binary_file end module mod_io diff --git 
a/src/mod_io_submodule.f90 b/src/mod_io_submodule.f90 new file mode 100644 index 00000000..59f88c7c --- /dev/null +++ b/src/mod_io_submodule.f90 @@ -0,0 +1,33 @@ +submodule(mod_io) mod_io_submodule + + implicit none + +contains + + module subroutine read_binary_file_1d(filename, dtype, nrec, array) + character(len=*), intent(in) :: filename + integer(ik), intent(in) :: dtype, nrec + real(rk), allocatable, intent(in out) :: array(:) + integer(ik) :: fileunit + allocate(array(nrec)) + open(newunit=fileunit, file=filename, access='direct',& + action='read', recl=dtype * nrec, status='old') + read(fileunit, rec=1) array + close(fileunit) + end subroutine read_binary_file_1d + + module subroutine read_binary_file_2d(filename, dtype, dsize, nrec, array) + character(len=*), intent(in) :: filename + integer(ik), intent(in) :: dtype, dsize, nrec + real(rk), allocatable, intent(in out) :: array(:,:) + integer(ik) :: fileunit, i + allocate(array(dsize, nrec)) + open(newunit=fileunit, file=filename, access='direct',& + action='read', recl=dtype * dsize, status='old') + do i = 1, nrec + read(fileunit, rec=i) array(:,i) + end do + close(fileunit) + end subroutine read_binary_file_2d + +end submodule mod_io_submodule diff --git a/src/mod_layer.f90 b/src/mod_layer.f90 index 4ac409cd..e7244241 100644 --- a/src/mod_layer.f90 +++ b/src/mod_layer.f90 @@ -4,7 +4,6 @@ module mod_layer use mod_activation use mod_kinds, only: ik, rk - use mod_random, only: randn implicit none @@ -32,126 +31,70 @@ module mod_layer end type array2d interface layer_type - module procedure :: constructor + module function constructor(this_size, next_size) result(layer) + !! Layer class constructor. this_size is the number of neurons in the layer. + !! next_size is the number of neurons in the next layer, used to allocate + !! the weights. + implicit none + integer(ik), intent(in) :: this_size, next_size + type(layer_type) layer + end function constructor end interface layer_type interface array1d - module procedure :: array1d_constructor + pure module function array1d_constructor(length) result(a) + !! Overloads the default type constructor. + implicit none + integer(ik), intent(in) :: length + type(array1d) :: a + end function array1d_constructor end interface array1d - interface array2d - module procedure :: array2d_constructor + interface array2d + pure module function array2d_constructor(dims) result(a) + !! Overloads the default type constructor. + integer(ik), intent(in) :: dims(2) + type(array2d) :: a + end function array2d_constructor end interface array2d - -contains - - type(layer_type) function constructor(this_size, next_size) result(layer) - !! Layer class constructor. this_size is the number of neurons in the layer. - !! next_size is the number of neurons in the next layer, used to allocate - !! the weights. - integer(ik), intent(in) :: this_size, next_size - allocate(layer % a(this_size)) - allocate(layer % z(this_size)) - layer % a = 0 - layer % z = 0 - layer % w = randn(this_size, next_size) / this_size - layer % b = randn(this_size) - end function constructor - - pure type(array1d) function array1d_constructor(length) result(a) - !! Overloads the default type constructor. - integer(ik), intent(in) :: length - allocate(a % array(length)) - a % array = 0 - end function array1d_constructor - - pure type(array2d) function array2d_constructor(dims) result(a) - !! Overloads the default type constructor. 
- integer(ik), intent(in) :: dims(2) - allocate(a % array(dims(1), dims(2))) - a % array = 0 - end function array2d_constructor - - pure subroutine db_init(db, dims) - !! Initialises biases structure. - type(array1d), allocatable, intent(in out) :: db(:) - integer(ik), intent(in) :: dims(:) - integer(ik) :: n, nm - nm = size(dims) - allocate(db(nm)) - do n = 1, nm - 1 - db(n) = array1d(dims(n)) - end do - db(n) = array1d(dims(n)) - end subroutine db_init - - pure subroutine dw_init(dw, dims) - !! Initialises weights structure. - type(array2d), allocatable, intent(in out) :: dw(:) - integer(ik), intent(in) :: dims(:) - integer(ik) :: n, nm - nm = size(dims) - allocate(dw(nm)) - do n = 1, nm - 1 - dw(n) = array2d(dims(n:n+1)) - end do - dw(n) = array2d([dims(n), 1]) - end subroutine dw_init - - subroutine db_co_sum(db) - !! Performs a collective sum of bias tendencies. - type(array1d), allocatable, intent(in out) :: db(:) - integer(ik) :: n - do n = 2, size(db) -#ifdef CAF - call co_sum(db(n) % array) -#endif - end do - end subroutine db_co_sum - - subroutine dw_co_sum(dw) - !! Performs a collective sum of weights tendencies. - type(array2d), allocatable, intent(in out) :: dw(:) - integer(ik) :: n - do n = 1, size(dw) - 1 -#ifdef CAF - call co_sum(dw(n) % array) -#endif - end do - end subroutine dw_co_sum - - pure elemental subroutine set_activation(self, activation) - !! Sets the activation function. Input string must match one of - !! provided activation functions, otherwise it defaults to sigmoid. - !! If activation not present, defaults to sigmoid. - class(layer_type), intent(in out) :: self - character(len=*), intent(in) :: activation - select case(trim(activation)) - case('gaussian') - self % activation => gaussian - self % activation_prime => gaussian_prime - self % activation_str = 'gaussian' - case('relu') - self % activation => relu - self % activation_prime => relu_prime - self % activation_str = 'relu' - case('sigmoid') - self % activation => sigmoid - self % activation_prime => sigmoid_prime - self % activation_str = 'sigmoid' - case('step') - self % activation => step - self % activation_prime => step_prime - self % activation_str = 'step' - case('tanh') - self % activation => tanhf - self % activation_prime => tanh_prime - self % activation_str = 'tanh' - case default - self % activation => sigmoid - self % activation_prime => sigmoid_prime - self % activation_str = 'sigmoid' - end select - end subroutine set_activation + + interface + + pure module subroutine db_init(db, dims) + !! Initialises biases structure. + implicit none + type(array1d), allocatable, intent(in out) :: db(:) + integer(ik), intent(in) :: dims(:) + end subroutine db_init + + pure module subroutine dw_init(dw, dims) + !! Initialises weights structure. + implicit none + type(array2d), allocatable, intent(in out) :: dw(:) + integer(ik), intent(in) :: dims(:) + end subroutine dw_init + + module subroutine db_co_sum(db) + !! Performs a collective sum of bias tendencies. + implicit none + type(array1d), allocatable, intent(in out) :: db(:) + end subroutine db_co_sum + + module subroutine dw_co_sum(dw) + !! Performs a collective sum of weights tendencies. + implicit none + type(array2d), allocatable, intent(in out) :: dw(:) + end subroutine dw_co_sum + + pure elemental module subroutine set_activation(self, activation) + !! Sets the activation function. Input string must match one of + !! provided activation functions, otherwise it defaults to sigmoid. + !! If activation not present, defaults to sigmoid. 
+ implicit none + class(layer_type), intent(in out) :: self + character(len=*), intent(in) :: activation + end subroutine set_activation + + end interface end module mod_layer diff --git a/src/mod_layer_submodule.f90 b/src/mod_layer_submodule.f90 new file mode 100644 index 00000000..6bbfcf61 --- /dev/null +++ b/src/mod_layer_submodule.f90 @@ -0,0 +1,110 @@ +submodule(mod_layer) mod_layer_submodule + + use mod_random, only: randn + + implicit none + +contains + + module function constructor(this_size, next_size) result(layer) + integer(ik), intent(in) :: this_size, next_size + type(layer_type) :: layer + allocate(layer % a(this_size)) + allocate(layer % z(this_size)) + layer % a = 0 + layer % z = 0 + layer % w = randn(this_size, next_size) / this_size + layer % b = randn(this_size) + end function constructor + + pure module function array1d_constructor(length) result(a) + integer(ik), intent(in) :: length + type(array1d) :: a + allocate(a % array(length)) + a % array = 0 + end function array1d_constructor + + pure module function array2d_constructor(dims) result(a) + integer(ik), intent(in) :: dims(2) + type(array2d) :: a + allocate(a % array(dims(1), dims(2))) + a % array = 0 + end function array2d_constructor + + pure module subroutine db_init(db, dims) + type(array1d), allocatable, intent(in out) :: db(:) + integer(ik), intent(in) :: dims(:) + integer(ik) :: n, nm + nm = size(dims) + allocate(db(nm)) + do n = 1, nm - 1 + db(n) = array1d(dims(n)) + end do + db(n) = array1d(dims(n)) + end subroutine db_init + + pure module subroutine dw_init(dw, dims) + type(array2d), allocatable, intent(in out) :: dw(:) + integer(ik), intent(in) :: dims(:) + integer(ik) :: n, nm + nm = size(dims) + allocate(dw(nm)) + do n = 1, nm - 1 + dw(n) = array2d(dims(n:n+1)) + end do + dw(n) = array2d([dims(n), 1]) + end subroutine dw_init + + module subroutine db_co_sum(db) + type(array1d), allocatable, intent(in out) :: db(:) + integer(ik) :: n + do n = 2, size(db) +#ifdef CAF + call co_sum(db(n) % array) +#endif + end do + end subroutine db_co_sum + + module subroutine dw_co_sum(dw) + type(array2d), allocatable, intent(in out) :: dw(:) + integer(ik) :: n + do n = 1, size(dw) - 1 +#ifdef CAF + call co_sum(dw(n) % array) +#endif + end do + end subroutine dw_co_sum + + pure elemental module subroutine set_activation(self, activation) + class(layer_type), intent(in out) :: self + character(len=*), intent(in) :: activation + select case(trim(activation)) + case('gaussian') + self % activation => gaussian + self % activation_prime => gaussian_prime + self % activation_str = 'gaussian' + case('relu') + self % activation => relu + self % activation_prime => relu_prime + self % activation_str = 'relu' + case('sigmoid') + self % activation => sigmoid + self % activation_prime => sigmoid_prime + self % activation_str = 'sigmoid' + case('step') + self % activation => step + self % activation_prime => step_prime + self % activation_str = 'step' + case('tanh') + self % activation => tanhf + self % activation_prime => tanh_prime + self % activation_str = 'tanh' + case default + self % activation => sigmoid + self % activation_prime => sigmoid_prime + self % activation_str = 'sigmoid' + end select + end subroutine set_activation + + +end submodule mod_layer_submodule diff --git a/src/mod_mnist.f90 b/src/mod_mnist.f90 index 7c5f897b..e8af54cb 100644 --- a/src/mod_mnist.f90 +++ b/src/mod_mnist.f90 @@ -3,8 +3,6 @@ module mod_mnist !! Procedures to work with MNIST dataset, usable with data format !! 
as provided in this repo and not the original data format (idx). - use iso_fortran_env, only: real32 !! TODO make MNIST work with arbitrary precision - use mod_io, only: read_binary_file use mod_kinds, only: ik, rk implicit none @@ -13,75 +11,33 @@ module mod_mnist public :: label_digits, load_mnist, print_image -contains - - pure function digits(x) - !! Returns an array of 10 reals, with zeros everywhere - !! and a one corresponding to the input number, for example: - !! digits(0) = [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.] - !! digits(1) = [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.] - !! digits(6) = [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.] - real(rk), intent(in) :: x - real(rk) :: digits(10) - digits = 0 - digits(int(x + 1)) = 1 - end function digits - - pure function label_digits(labels) result(res) - !! Converts an array of MNIST labels into a form - !! that can be input to the network_type instance. - real(rk), intent(in) :: labels(:) - real(rk) :: res(10, size(labels)) - integer(ik) :: i - do i = 1, size(labels) - res(:,i) = digits(labels(i)) - end do - end function label_digits - - subroutine load_mnist(tr_images, tr_labels, te_images,& - te_labels, va_images, va_labels) - !! Loads the MNIST dataset into arrays. - real(rk), allocatable, intent(in out) :: tr_images(:,:), tr_labels(:) - real(rk), allocatable, intent(in out) :: te_images(:,:), te_labels(:) - real(rk), allocatable, intent(in out), optional :: va_images(:,:), va_labels(:) - integer(ik), parameter :: dtype = 4, image_size = 784 - integer(ik), parameter :: tr_nimages = 50000 - integer(ik), parameter :: te_nimages = 10000 - integer(ik), parameter :: va_nimages = 10000 - - call read_binary_file('data/mnist/mnist_training_images.dat',& - dtype, image_size, tr_nimages, tr_images) - call read_binary_file('data/mnist/mnist_training_labels.dat',& - dtype, tr_nimages, tr_labels) - - call read_binary_file('data/mnist/mnist_testing_images.dat',& - dtype, image_size, te_nimages, te_images) - call read_binary_file('data/mnist/mnist_testing_labels.dat',& - dtype, te_nimages, te_labels) - - if (present(va_images) .and. present(va_labels)) then - call read_binary_file('data/mnist/mnist_validation_images.dat',& - dtype, image_size, va_nimages, va_images) - call read_binary_file('data/mnist/mnist_validation_labels.dat',& - dtype, va_nimages, va_labels) - end if - - end subroutine load_mnist - - subroutine print_image(images, labels, n) - !! Prints a single image and label to screen. - real(rk), intent(in) :: images(:,:), labels(:) - integer(ik), intent(in) :: n - real(rk) :: image(28, 28) - character(len=1) :: char_image(28, 28) - integer(ik) i, j - image = reshape(images(:,n), [28, 28]) - char_image = '.' - where (image > 0) char_image = '#' - print *, labels(n) - do j = 1, 28 - print *, char_image(:,j) - end do - end subroutine print_image + interface + + pure module function label_digits(labels) result(res) + !! Converts an array of MNIST labels into a form + !! that can be input to the network_type instance. + implicit none + real(rk), intent(in) :: labels(:) + real(rk) :: res(10, size(labels)) + end function label_digits + + module subroutine load_mnist(tr_images, tr_labels, te_images,& + + te_labels, va_images, va_labels) + !! Loads the MNIST dataset into arrays. 
+ implicit none + real(rk), allocatable, intent(in out) :: tr_images(:,:), tr_labels(:) + real(rk), allocatable, intent(in out) :: te_images(:,:), te_labels(:) + real(rk), allocatable, intent(in out), optional :: va_images(:,:), va_labels(:) + end subroutine load_mnist + + module subroutine print_image(images, labels, n) + !! Prints a single image and label to screen. + implicit none + real(rk), intent(in) :: images(:,:), labels(:) + integer(ik), intent(in) :: n + end subroutine print_image + + end interface end module mod_mnist diff --git a/src/mod_mnist_submodule.f90 b/src/mod_mnist_submodule.f90 new file mode 100644 index 00000000..0d33edfc --- /dev/null +++ b/src/mod_mnist_submodule.f90 @@ -0,0 +1,80 @@ +submodule(mod_mnist) mod_mnist_submodule + + !! Procedures to work with MNIST dataset, usable with data format + !! as provided in this repo and not the original data format (idx). + + ! TODO make MNIST work with arbitrary precision + + use mod_io, only: read_binary_file + use mod_kinds, only: ik, rk + + implicit none + +contains + + pure module function label_digits(labels) result(res) + real(rk), intent(in) :: labels(:) + real(rk) :: res(10, size(labels)) + integer(ik) :: i + do i = 1, size(labels) + res(:,i) = digits(labels(i)) + end do + contains + pure function digits(x) + !! Returns an array of 10 reals, with zeros everywhere + !! and a one corresponding to the input number, for example: + !! digits(0) = [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.] + !! digits(1) = [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.] + !! digits(6) = [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.] + real(rk), intent(in) :: x + real(rk) :: digits(10) + digits = 0 + digits(int(x + 1)) = 1 + end function digits + end function label_digits + + module subroutine load_mnist(tr_images, tr_labels, te_images,& + te_labels, va_images, va_labels) + real(rk), allocatable, intent(in out) :: tr_images(:,:), tr_labels(:) + real(rk), allocatable, intent(in out) :: te_images(:,:), te_labels(:) + real(rk), allocatable, intent(in out), optional :: va_images(:,:), va_labels(:) + integer(ik), parameter :: dtype = 4, image_size = 784 + integer(ik), parameter :: tr_nimages = 50000 + integer(ik), parameter :: te_nimages = 10000 + integer(ik), parameter :: va_nimages = 10000 + + call read_binary_file('data/mnist/mnist_training_images.dat',& + dtype, image_size, tr_nimages, tr_images) + call read_binary_file('data/mnist/mnist_training_labels.dat',& + dtype, tr_nimages, tr_labels) + + call read_binary_file('data/mnist/mnist_testing_images.dat',& + dtype, image_size, te_nimages, te_images) + call read_binary_file('data/mnist/mnist_testing_labels.dat',& + dtype, te_nimages, te_labels) + + if (present(va_images) .and. present(va_labels)) then + call read_binary_file('data/mnist/mnist_validation_images.dat',& + dtype, image_size, va_nimages, va_images) + call read_binary_file('data/mnist/mnist_validation_labels.dat',& + dtype, va_nimages, va_labels) + end if + + end subroutine load_mnist + + module subroutine print_image(images, labels, n) + real(rk), intent(in) :: images(:,:), labels(:) + integer(ik), intent(in) :: n + real(rk) :: image(28, 28) + character(len=1) :: char_image(28, 28) + integer(ik) i, j + image = reshape(images(:,n), [28, 28]) + char_image = '.' 
+ where (image > 0) char_image = '#' + print *, labels(n) + do j = 1, 28 + print *, char_image(:,j) + end do + end subroutine print_image + +end submodule mod_mnist_submodule diff --git a/src/mod_network.f90 b/src/mod_network.f90 index 405099d8..7e8d777a 100644 --- a/src/mod_network.f90 +++ b/src/mod_network.f90 @@ -1,9 +1,7 @@ module mod_network use mod_kinds, only: ik, rk - use mod_layer, only: array1d, array2d, db_init, dw_init,& - db_co_sum, dw_co_sum, layer_type - use mod_parallel, only: tile_indices + use mod_layer, only: array1d, array2d, layer_type implicit none @@ -41,332 +39,166 @@ module mod_network end type network_type interface network_type - module procedure :: net_constructor + + module function net_constructor(dims, activation) result(net) + !! Network class constructor. Size of input array dims indicates the total + !! number of layers (input + hidden + output), and the value of its elements + !! corresponds the size of each layer. + implicit none + integer(ik), intent(in) :: dims(:) + character(len=*), intent(in), optional :: activation + type(network_type) :: net + end function net_constructor + end interface network_type -contains - - type(network_type) function net_constructor(dims, activation) result(net) - !! Network class constructor. Size of input array dims indicates the total - !! number of layers (input + hidden + output), and the value of its elements - !! corresponds the size of each layer. - integer(ik), intent(in) :: dims(:) - character(len=*), intent(in), optional :: activation - call net % init(dims) - if (present(activation)) then - call net % set_activation(activation) - else - call net % set_activation('sigmoid') - end if - call net % sync(1) - end function net_constructor - - - pure real(rk) function accuracy(self, x, y) - !! Given input x and output y, evaluates the position of the - !! maximum value of the output and returns the number of matches - !! relative to the size of the dataset. - class(network_type), intent(in) :: self - real(rk), intent(in) :: x(:,:), y(:,:) - integer(ik) :: i, good - good = 0 - do i = 1, size(x, dim=2) - if (all(maxloc(self % output(x(:,i))) == maxloc(y(:,i)))) then - good = good + 1 - end if - end do - accuracy = real(good, kind=rk) / size(x, dim=2) - end function accuracy - - - pure subroutine backprop(self, y, dw, db) - !! Applies a backward propagation through the network - !! and returns the weight and bias gradients. - class(network_type), intent(in out) :: self - real(rk), intent(in) :: y(:) - type(array2d), allocatable, intent(out) :: dw(:) - type(array1d), allocatable, intent(out) :: db(:) - integer(ik) :: n, nm - - associate(dims => self % dims, layers => self % layers) - - call db_init(db, dims) - call dw_init(dw, dims) - - n = size(dims) - db(n) % array = (layers(n) % a - y) * self % layers(n) % activation_prime(layers(n) % z) - dw(n-1) % array = matmul(reshape(layers(n-1) % a, [dims(n-1), 1]),& - reshape(db(n) % array, [1, dims(n)])) - - do n = size(dims) - 1, 2, -1 - db(n) % array = matmul(layers(n) % w, db(n+1) % array)& - * self % layers(n) % activation_prime(layers(n) % z) - dw(n-1) % array = matmul(reshape(layers(n-1) % a, [dims(n-1), 1]),& - reshape(db(n) % array, [1, dims(n)])) - end do - - end associate - - end subroutine backprop - - - pure subroutine fwdprop(self, x) - !! Performs the forward propagation and stores arguments to activation - !! functions and activations themselves for use in backprop. 
- class(network_type), intent(in out) :: self - real(rk), intent(in) :: x(:) - integer(ik) :: n - associate(layers => self % layers) - layers(1) % a = x - do n = 2, size(layers) - layers(n) % z = matmul(transpose(layers(n-1) % w), layers(n-1) % a) + layers(n) % b - layers(n) % a = self % layers(n) % activation(layers(n) % z) - end do - end associate - end subroutine fwdprop - - - subroutine init(self, dims) - !! Allocates and initializes the layers with given dimensions dims. - class(network_type), intent(in out) :: self - integer(ik), intent(in) :: dims(:) - integer(ik) :: n - self % dims = dims - if (.not. allocated(self % layers)) allocate(self % layers(size(dims))) - do n = 1, size(dims) - 1 - self % layers(n) = layer_type(dims(n), dims(n+1)) - end do - self % layers(n) = layer_type(dims(n), 1) - self % layers(1) % b = 0 - self % layers(size(dims)) % w = 0 - end subroutine init - - - subroutine load(self, filename) - !! Loads the network from file. - class(network_type), intent(in out) :: self - character(len=*), intent(in) :: filename - integer(ik) :: fileunit, n, num_layers, layer_idx - integer(ik), allocatable :: dims(:) - character(len=100) :: buffer !! activation string - open(newunit=fileunit, file=filename, status='old', action='read') - read(fileunit, *) num_layers - allocate(dims(num_layers)) - read(fileunit, *) dims - call self % init(dims) - do n = 1, num_layers - read(fileunit, *) layer_idx, buffer - call self % layers(layer_idx) % set_activation(trim(buffer)) - end do - do n = 2, size(self % dims) - read(fileunit, *) self % layers(n) % b - end do - do n = 1, size(self % dims) - 1 - read(fileunit, *) self % layers(n) % w - end do - close(fileunit) - end subroutine load - - - pure real(rk) function loss(self, x, y) - !! Given input x and expected output y, returns the loss of the network. - class(network_type), intent(in) :: self - real(rk), intent(in) :: x(:), y(:) - loss = 0.5 * sum((y - self % output(x))**2) / size(x) - end function loss - - - pure function output_single(self, x) result(a) - !! Use forward propagation to compute the output of the network. - !! This specific procedure is for a single sample of 1-d input data. - class(network_type), intent(in) :: self - real(rk), intent(in) :: x(:) - real(rk), allocatable :: a(:) - integer(ik) :: n - associate(layers => self % layers) - a = self % layers(2) % activation(matmul(transpose(layers(1) % w), x) + layers(2) % b) - do n = 3, size(layers) - a = self % layers(n) % activation(matmul(transpose(layers(n-1) % w), a) + layers(n) % b) - end do - end associate - end function output_single - - - pure function output_batch(self, x) result(a) - !! Use forward propagation to compute the output of the network. - !! This specific procedure is for a batch of 1-d input data. - class(network_type), intent(in) :: self - real(rk), intent(in) :: x(:,:) - real(rk), allocatable :: a(:,:) - integer(ik) :: i - allocate(a(self % dims(size(self % dims)), size(x, dim=2))) - do i = 1, size(x, dim=2) - a(:,i) = self % output_single(x(:,i)) - end do - end function output_batch - - - subroutine save(self, filename) - !! Saves the network to a file. 
- class(network_type), intent(in out) :: self - character(len=*), intent(in) :: filename - integer(ik) :: fileunit, n - open(newunit=fileunit, file=filename) - write(fileunit, fmt=*) size(self % dims) - write(fileunit, fmt=*) self % dims - do n = 1, size(self % dims) - write(fileunit, fmt=*) n, self % layers(n) % activation_str - end do - do n = 2, size(self % dims) - write(fileunit, fmt=*) self % layers(n) % b - end do - do n = 1, size(self % dims) - 1 - write(fileunit, fmt=*) self % layers(n) % w - end do - close(fileunit) - end subroutine save - - - pure subroutine set_activation_equal(self, activation) - !! A thin wrapper around layer % set_activation(). - !! This method can be used to set an activation function - !! for all layers at once. - class(network_type), intent(in out) :: self - character(len=*), intent(in) :: activation - call self % layers(:) % set_activation(activation) - end subroutine set_activation_equal - - - pure subroutine set_activation_layers(self, activation) - !! A thin wrapper around layer % set_activation(). - !! This method can be used to set different activation functions - !! for each layer separately. - class(network_type), intent(in out) :: self - character(len=*), intent(in) :: activation(size(self % layers)) - call self % layers(:) % set_activation(activation) - end subroutine set_activation_layers - - subroutine sync(self, image) - !! Broadcasts network weights and biases from - !! specified image to all others. - class(network_type), intent(in out) :: self - integer(ik), intent(in) :: image - integer(ik) :: n - if (num_images() == 1) return - layers: do n = 1, size(self % dims) -#ifdef CAF - call co_broadcast(self % layers(n) % b, image) - call co_broadcast(self % layers(n) % w, image) -#endif - end do layers - end subroutine sync - - - subroutine train_batch(self, x, y, eta) - !! Trains a network using input data x and output data y, - !! and learning rate eta. The learning rate is normalized - !! with the size of the data batch. - class(network_type), intent(in out) :: self - real(rk), intent(in) :: x(:,:), y(:,:), eta - type(array1d), allocatable :: db(:), db_batch(:) - type(array2d), allocatable :: dw(:), dw_batch(:) - integer(ik) :: i, im, n, nm - integer(ik) :: is, ie, indices(2) - - im = size(x, dim=2) !! mini-batch size - nm = size(self % dims) !! number of layers - - ! get start and end index for mini-batch - indices = tile_indices(im) - is = indices(1) - ie = indices(2) - - call db_init(db_batch, self % dims) - call dw_init(dw_batch, self % dims) - - do concurrent(i = is:ie) - call self % fwdprop(x(:,i)) - call self % backprop(y(:,i), dw, db) - do concurrent(n = 1:nm) - dw_batch(n) % array = dw_batch(n) % array + dw(n) % array - db_batch(n) % array = db_batch(n) % array + db(n) % array - end do - end do - - if (num_images() > 1) then - call dw_co_sum(dw_batch) - call db_co_sum(db_batch) - end if - - call self % update(dw_batch, db_batch, eta / im) - - end subroutine train_batch - - - subroutine train_epochs(self, x, y, eta, num_epochs, batch_size) - !! Trains for num_epochs epochs with mini-bachtes of size equal to batch_size. 
- class(network_type), intent(in out) :: self - integer(ik), intent(in) :: num_epochs, batch_size - real(rk), intent(in) :: x(:,:), y(:,:), eta - - integer(ik) :: i, n, nsamples, nbatch - integer(ik) :: batch_start, batch_end - - real(rk) :: pos - - nsamples = size(y, dim=2) - nbatch = nsamples / batch_size - - epochs: do n = 1, num_epochs - batches: do i = 1, nbatch - - !pull a random mini-batch from the dataset - call random_number(pos) - batch_start = int(pos * (nsamples - batch_size + 1)) - if (batch_start == 0) batch_start = 1 - batch_end = batch_start + batch_size - 1 - - call self % train(x(:,batch_start:batch_end), y(:,batch_start:batch_end), eta) - - end do batches - end do epochs - - end subroutine train_epochs - - - pure subroutine train_single(self, x, y, eta) - !! Trains a network using a single set of input data x and output data y, - !! and learning rate eta. - class(network_type), intent(in out) :: self - real(rk), intent(in) :: x(:), y(:), eta - type(array2d), allocatable :: dw(:) - type(array1d), allocatable :: db(:) - call self % fwdprop(x) - call self % backprop(y, dw, db) - call self % update(dw, db, eta) - end subroutine train_single - - - pure subroutine update(self, dw, db, eta) - !! Updates network weights and biases with gradients dw and db, - !! scaled by learning rate eta. - class(network_type), intent(in out) :: self - class(array2d), intent(in) :: dw(:) - class(array1d), intent(in) :: db(:) - real(rk), intent(in) :: eta - integer(ik) :: n - - associate(layers => self % layers, nm => size(self % dims)) - !! update biases - do concurrent(n = 2:nm) - layers(n) % b = layers(n) % b - eta * db(n) % array - end do - !! update weights - do concurrent(n = 1:nm-1) - layers(n) % w = layers(n) % w - eta * dw(n) % array - end do - end associate - - end subroutine update + interface + + pure real(rk) module function accuracy(self, x, y) + !! Given input x and output y, evaluates the position of the + !! maximum value of the output and returns the number of matches + !! relative to the size of the dataset. + implicit none + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:,:), y(:,:) + end function accuracy + + pure module subroutine backprop(self, y, dw, db) + !! Applies a backward propagation through the network + !! and returns the weight and bias gradients. + implicit none + class(network_type), intent(in out) :: self + real(rk), intent(in) :: y(:) + type(array2d), allocatable, intent(out) :: dw(:) + type(array1d), allocatable, intent(out) :: db(:) + end subroutine backprop + + + pure module subroutine fwdprop(self, x) + !! Performs the forward propagation and stores arguments to activation + !! functions and activations themselves for use in backprop. + implicit none + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:) + end subroutine fwdprop + + module subroutine init(self, dims) + !! Allocates and initializes the layers with given dimensions dims. + implicit none + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: dims(:) + end subroutine init + + + module subroutine load(self, filename) + !! Loads the network from file. + implicit none + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: filename + end subroutine load + + + pure module real(rk) function loss(self, x, y) + !! Given input x and expected output y, returns the loss of the network. 
+ implicit none + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:), y(:) + end function loss + + + pure module function output_single(self, x) result(a) + !! Use forward propagation to compute the output of the network. + !! This specific procedure is for a single sample of 1-d input data. + implicit none + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:) + real(rk), allocatable :: a(:) + end function output_single + + + pure module function output_batch(self, x) result(a) + !! Use forward propagation to compute the output of the network. + !! This specific procedure is for a batch of 1-d input data. + implicit none + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:,:) + real(rk), allocatable :: a(:,:) + end function output_batch + + module subroutine save(self, filename) + !! Saves the network to a file. + implicit none + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: filename + end subroutine save + + + pure module subroutine set_activation_equal(self, activation) + !! A thin wrapper around layer % set_activation(). + !! This method can be used to set an activation function + !! for all layers at once. + implicit none + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: activation + end subroutine set_activation_equal + + + pure module subroutine set_activation_layers(self, activation) + !! A thin wrapper around layer % set_activation(). + !! This method can be used to set different activation functions + !! for each layer separately. + implicit none + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: activation(size(self % layers)) + end subroutine set_activation_layers + + module subroutine sync(self, image) + !! Broadcasts network weights and biases from + !! specified image to all others. + implicit none + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: image + end subroutine sync + + + module subroutine train_batch(self, x, y, eta) + !! Trains a network using input data x and output data y, + !! and learning rate eta. The learning rate is normalized + !! with the size of the data batch. + implicit none + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:,:), y(:,:), eta + end subroutine train_batch + + + module subroutine train_epochs(self, x, y, eta, num_epochs, batch_size) + !! Trains for num_epochs epochs with mini-bachtes of size equal to batch_size. + implicit none + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: num_epochs, batch_size + real(rk), intent(in) :: x(:,:), y(:,:), eta + end subroutine train_epochs + + + pure module subroutine train_single(self, x, y, eta) + !! Trains a network using a single set of input data x and output data y, + !! and learning rate eta. + implicit none + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:), y(:), eta + end subroutine train_single + + + pure module subroutine update(self, dw, db, eta) + !! Updates network weights and biases with gradients dw and db, + !! scaled by learning rate eta. 
+ implicit none + class(network_type), intent(in out) :: self + class(array2d), intent(in) :: dw(:) + class(array1d), intent(in) :: db(:) + real(rk), intent(in) :: eta + end subroutine update + + end interface end module mod_network diff --git a/src/mod_network_submodule.f90 b/src/mod_network_submodule.f90 new file mode 100644 index 00000000..2a32d376 --- /dev/null +++ b/src/mod_network_submodule.f90 @@ -0,0 +1,298 @@ +submodule(mod_network) mod_network_submodule + + use mod_kinds, only: ik, rk + use mod_layer, only: db_init, dw_init, db_co_sum, dw_co_sum + use mod_parallel, only: tile_indices + + implicit none + +contains + + module function net_constructor(dims, activation) result(net) + integer(ik), intent(in) :: dims(:) + character(len=*), intent(in), optional :: activation + type(network_type) :: net + call net % init(dims) + if (present(activation)) then + call net % set_activation(activation) + else + call net % set_activation('sigmoid') + end if + call net % sync(1) + end function net_constructor + + pure real(rk) module function accuracy(self, x, y) + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:,:), y(:,:) + integer(ik) :: i, good + good = 0 + do i = 1, size(x, dim=2) + if (all(maxloc(self % output(x(:,i))) == maxloc(y(:,i)))) then + good = good + 1 + end if + end do + accuracy = real(good, kind=rk) / size(x, dim=2) + end function accuracy + + + pure module subroutine backprop(self, y, dw, db) + class(network_type), intent(in out) :: self + real(rk), intent(in) :: y(:) + type(array2d), allocatable, intent(out) :: dw(:) + type(array1d), allocatable, intent(out) :: db(:) + integer(ik) :: n, nm + + associate(dims => self % dims, layers => self % layers) + + call db_init(db, dims) + call dw_init(dw, dims) + + n = size(dims) + db(n) % array = (layers(n) % a - y) * self % layers(n) % activation_prime(layers(n) % z) + dw(n-1) % array = matmul(reshape(layers(n-1) % a, [dims(n-1), 1]),& + reshape(db(n) % array, [1, dims(n)])) + + do n = size(dims) - 1, 2, -1 + db(n) % array = matmul(layers(n) % w, db(n+1) % array)& + * self % layers(n) % activation_prime(layers(n) % z) + dw(n-1) % array = matmul(reshape(layers(n-1) % a, [dims(n-1), 1]),& + reshape(db(n) % array, [1, dims(n)])) + end do + + end associate + + end subroutine backprop + + + pure module subroutine fwdprop(self, x) + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:) + integer(ik) :: n + associate(layers => self % layers) + layers(1) % a = x + do n = 2, size(layers) + layers(n) % z = matmul(transpose(layers(n-1) % w), layers(n-1) % a) + layers(n) % b + layers(n) % a = self % layers(n) % activation(layers(n) % z) + end do + end associate + end subroutine fwdprop + + + module subroutine init(self, dims) + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: dims(:) + integer(ik) :: n + self % dims = dims + if (.not. allocated(self % layers)) allocate(self % layers(size(dims))) + do n = 1, size(dims) - 1 + self % layers(n) = layer_type(dims(n), dims(n+1)) + end do + self % layers(n) = layer_type(dims(n), 1) + self % layers(1) % b = 0 + self % layers(size(dims)) % w = 0 + end subroutine init + + + module subroutine load(self, filename) + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: filename + integer(ik) :: fileunit, n, num_layers, layer_idx + integer(ik), allocatable :: dims(:) + character(len=100) :: buffer !! 
activation string + open(newunit=fileunit, file=filename, status='old', action='read') + read(fileunit, *) num_layers + allocate(dims(num_layers)) + read(fileunit, *) dims + call self % init(dims) + do n = 1, num_layers + read(fileunit, *) layer_idx, buffer + call self % layers(layer_idx) % set_activation(trim(buffer)) + end do + do n = 2, size(self % dims) + read(fileunit, *) self % layers(n) % b + end do + do n = 1, size(self % dims) - 1 + read(fileunit, *) self % layers(n) % w + end do + close(fileunit) + end subroutine load + + + pure real(rk) module function loss(self, x, y) + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:), y(:) + loss = 0.5 * sum((y - self % output(x))**2) / size(x) + end function loss + + + pure module function output_single(self, x) result(a) + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:) + real(rk), allocatable :: a(:) + integer(ik) :: n + associate(layers => self % layers) + a = self % layers(2) % activation(matmul(transpose(layers(1) % w), x) + layers(2) % b) + do n = 3, size(layers) + a = self % layers(n) % activation(matmul(transpose(layers(n-1) % w), a) + layers(n) % b) + end do + end associate + end function output_single + + + pure module function output_batch(self, x) result(a) + class(network_type), intent(in) :: self + real(rk), intent(in) :: x(:,:) + real(rk), allocatable :: a(:,:) + integer(ik) :: i + allocate(a(self % dims(size(self % dims)), size(x, dim=2))) + do i = 1, size(x, dim=2) + a(:,i) = self % output_single(x(:,i)) + end do + end function output_batch + + + module subroutine save(self, filename) + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: filename + integer(ik) :: fileunit, n + open(newunit=fileunit, file=filename) + write(fileunit, fmt=*) size(self % dims) + write(fileunit, fmt=*) self % dims + do n = 1, size(self % dims) + write(fileunit, fmt=*) n, self % layers(n) % activation_str + end do + do n = 2, size(self % dims) + write(fileunit, fmt=*) self % layers(n) % b + end do + do n = 1, size(self % dims) - 1 + write(fileunit, fmt=*) self % layers(n) % w + end do + close(fileunit) + end subroutine save + + + pure module subroutine set_activation_equal(self, activation) + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: activation + call self % layers(:) % set_activation(activation) + end subroutine set_activation_equal + + + pure module subroutine set_activation_layers(self, activation) + class(network_type), intent(in out) :: self + character(len=*), intent(in) :: activation(size(self % layers)) + call self % layers(:) % set_activation(activation) + end subroutine set_activation_layers + + module subroutine sync(self, image) + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: image + integer(ik) :: n + if (num_images() == 1) return + layers: do n = 1, size(self % dims) +#ifdef CAF + call co_broadcast(self % layers(n) % b, image) + call co_broadcast(self % layers(n) % w, image) +#endif + end do layers + end subroutine sync + + module subroutine train_batch(self, x, y, eta) + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:,:), y(:,:), eta + type(array1d), allocatable :: db(:), db_batch(:) + type(array2d), allocatable :: dw(:), dw_batch(:) + integer(ik) :: i, im, n, nm + integer(ik) :: is, ie, indices(2) + + im = size(x, dim=2) ! mini-batch size + nm = size(self % dims) ! number of layers + + ! 
get start and end index for mini-batch + indices = tile_indices(im) + is = indices(1) + ie = indices(2) + + call db_init(db_batch, self % dims) + call dw_init(dw_batch, self % dims) + + do concurrent(i = is:ie) + call self % fwdprop(x(:,i)) + call self % backprop(y(:,i), dw, db) + do concurrent(n = 1:nm) + dw_batch(n) % array = dw_batch(n) % array + dw(n) % array + db_batch(n) % array = db_batch(n) % array + db(n) % array + end do + end do + + if (num_images() > 1) then + call dw_co_sum(dw_batch) + call db_co_sum(db_batch) + end if + + call self % update(dw_batch, db_batch, eta / im) + + end subroutine train_batch + + module subroutine train_epochs(self, x, y, eta, num_epochs, batch_size) + class(network_type), intent(in out) :: self + integer(ik), intent(in) :: num_epochs, batch_size + real(rk), intent(in) :: x(:,:), y(:,:), eta + + integer(ik) :: i, n, nsamples, nbatch + integer(ik) :: batch_start, batch_end + + real(rk) :: pos + + nsamples = size(y, dim=2) + nbatch = nsamples / batch_size + + epochs: do n = 1, num_epochs + batches: do i = 1, nbatch + + !pull a random mini-batch from the dataset + call random_number(pos) + batch_start = int(pos * (nsamples - batch_size + 1)) + if (batch_start == 0) batch_start = 1 + batch_end = batch_start + batch_size - 1 + + call self % train(x(:,batch_start:batch_end), y(:,batch_start:batch_end), eta) + + end do batches + end do epochs + + end subroutine train_epochs + + + pure module subroutine train_single(self, x, y, eta) + class(network_type), intent(in out) :: self + real(rk), intent(in) :: x(:), y(:), eta + type(array2d), allocatable :: dw(:) + type(array1d), allocatable :: db(:) + call self % fwdprop(x) + call self % backprop(y, dw, db) + call self % update(dw, db, eta) + end subroutine train_single + + + pure module subroutine update(self, dw, db, eta) + class(network_type), intent(in out) :: self + class(array2d), intent(in) :: dw(:) + class(array1d), intent(in) :: db(:) + real(rk), intent(in) :: eta + integer(ik) :: n + + associate(layers => self % layers, nm => size(self % dims)) + ! update biases + do concurrent(n = 2:nm) + layers(n) % b = layers(n) % b - eta * db(n) % array + end do + ! update weights + do concurrent(n = 1:nm-1) + layers(n) % w = layers(n) % w - eta * dw(n) % array + end do + end associate + + end subroutine update + +end submodule mod_network_submodule diff --git a/src/mod_parallel.f90 b/src/mod_parallel.f90 index 0d7503da..2c558cea 100644 --- a/src/mod_parallel.f90 +++ b/src/mod_parallel.f90 @@ -6,28 +6,16 @@ module mod_parallel private public :: tile_indices -contains - - pure function tile_indices(dims) - !! Given input global array size, return start and end index - !! of a parallel 1-d tile that correspond to this image. - integer(ik), intent(in) :: dims - integer(ik) :: tile_indices(2) - integer(ik) :: offset, tile_size - - tile_size = dims / num_images() - - !! start and end indices assuming equal tile sizes - tile_indices(1) = (this_image() - 1) * tile_size + 1 - tile_indices(2) = tile_indices(1) + tile_size - 1 - - !! if we have any remainder, distribute it to the tiles at the end - offset = num_images() - mod(dims, num_images()) - if (this_image() > offset) then - tile_indices(1) = tile_indices(1) + this_image() - offset - 1 - tile_indices(2) = tile_indices(2) + this_image() - offset - end if - - end function tile_indices + interface + + pure module function tile_indices(dims) + !! Given input global array size, return start and end index + !! of a parallel 1-d tile that correspond to this image. 
+ implicit none + integer(ik), intent(in) :: dims + integer(ik) :: tile_indices(2) + end function tile_indices + + end interface end module mod_parallel diff --git a/src/mod_parallel_submodule.f90 b/src/mod_parallel_submodule.f90 new file mode 100644 index 00000000..9f7cb9ce --- /dev/null +++ b/src/mod_parallel_submodule.f90 @@ -0,0 +1,28 @@ +submodule(mod_parallel) mod_parallel_submodule + + use mod_kinds, only: ik, rk + implicit none + +contains + + pure module function tile_indices(dims) + integer(ik), intent(in) :: dims + integer(ik) :: tile_indices(2) + integer(ik) :: offset, tile_size + + tile_size = dims / num_images() + + !! start and end indices assuming equal tile sizes + tile_indices(1) = (this_image() - 1) * tile_size + 1 + tile_indices(2) = tile_indices(1) + tile_size - 1 + + !! if we have any remainder, distribute it to the tiles at the end + offset = num_images() - mod(dims, num_images()) + if (this_image() > offset) then + tile_indices(1) = tile_indices(1) + this_image() - offset - 1 + tile_indices(2) = tile_indices(2) + this_image() - offset + end if + + end function tile_indices + +end submodule mod_parallel_submodule diff --git a/src/mod_random.f90 b/src/mod_random.f90 index 78814c40..6470d2c9 100644 --- a/src/mod_random.f90 +++ b/src/mod_random.f90 @@ -10,30 +10,22 @@ module mod_random private public :: randn - real(rk), parameter :: pi = 4 * atan(1._rk) - interface randn - module procedure :: randn1d, randn2d - end interface randn -contains - - function randn1d(n) result(r) - !! Generates n random numbers with a normal distribution. - integer(ik), intent(in) :: n - real(rk) :: r(n), r2(n) - call random_number(r) - call random_number(r2) - r = sqrt(-2 * log(r)) * cos(2 * pi * r2) - end function randn1d - - function randn2d(m, n) result(r) - !! Generates m x n random numbers with a normal distribution. - integer(ik), intent(in) :: m, n - real(rk) :: r(m, n), r2(m, n) - call random_number(r) - call random_number(r2) - r = sqrt(-2 * log(r)) * cos(2 * pi * r2) - end function randn2d + module function randn1d(n) result(r) + !! Generates n random numbers with a normal distribution. + implicit none + integer(ik), intent(in) :: n + real(rk) :: r(n) + end function randn1d + + module function randn2d(m, n) result(r) + !! Generates m x n random numbers with a normal distribution. + implicit none + integer(ik), intent(in) :: m, n + real(rk) :: r(m, n) + end function randn2d + + end interface randn end module mod_random diff --git a/src/mod_random_submodule.f90 b/src/mod_random_submodule.f90 new file mode 100644 index 00000000..c75e4e7c --- /dev/null +++ b/src/mod_random_submodule.f90 @@ -0,0 +1,24 @@ +submodule(mod_random) mod_random_submodule + implicit none + + real(rk), parameter :: pi = 4 * atan(1._rk) + +contains + + module function randn1d(n) result(r) + integer(ik), intent(in) :: n + real(rk) :: r(n), r2(n) + call random_number(r) + call random_number(r2) + r = sqrt(-2 * log(r)) * cos(2 * pi * r2) + end function randn1d + + module function randn2d(m, n) result(r) + integer(ik), intent(in) :: m, n + real(rk) :: r(m, n), r2(m, n) + call random_number(r) + call random_number(r2) + r = sqrt(-2 * log(r)) * cos(2 * pi * r2) + end function randn2d + +end submodule mod_random_submodule
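
Note on the pattern applied throughout this patch: each module now declares its public procedures in an interface block, and the procedure bodies live in a matching submodule. A minimal standalone sketch of that layout (hypothetical names, not part of neural-fortran):

    module demo_mod
      implicit none
      private
      public :: twice
      interface
        pure module function twice(x) result(res)
          implicit none
          real, intent(in) :: x
          real :: res
        end function twice
      end interface
    end module demo_mod

    submodule(demo_mod) demo_submodule
      implicit none
    contains
      pure module function twice(x) result(res)
        real, intent(in) :: x
        real :: res
        ! implementation lives here, compiled separately from the interface above
        res = 2 * x
      end function twice
    end submodule demo_submodule

With this split, editing only the submodule leaves the module's interface unchanged, so code that uses demo_mod does not need to be recompiled; that is the usual motivation for module/submodule refactors like this one.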