# Source code for nnabla.initializer

# Copyright 2017,2018,2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
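#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.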

import numpy as np
from . import random

# Float type used for the arrays produced by the initializers in this module.
#
# Apply it as a call, "random_float_type(x)", not as
# "x.astype(random_float_type)": the call form is applicable both to a
# numpy.array and to a 0-dimensional numpy.array (or Python scalar), which
# appears when an Initializer takes shape=(), for example
# self.rng.randn(*shape) where shape = ().
random_float_type = np.float32

class BaseInitializer(object):
    """Common base class for all parameter initializers.

    An initializer is a callable object: calling it with a shape yields
    an array of that shape. This base class only fixes that calling
    convention and provides no implementation.
    """

    def __call__(self, shape):
        """Create an array of the requested shape.

        Args:
            shape (:obj:`tuple` of :obj:`int`): Shape of the
                :obj:`numpy.ndarray` to create.

        Returns:
            :obj:`numpy.ndarray`: The generated array.

        Raises:
            NotImplementedError: Always, in this base class.

        Note:
            Subclasses of :class:`~nnabla.initializer.BaseInitializer`
            must override this method.
        """
        raise NotImplementedError

class NormalInitializer(BaseInitializer):
    r"""Initializer that samples from a zero-mean normal distribution.

    .. math::
        \mathbf x \sim {\cal N} (\mathbf 0 | \sigma^2 \mathbf I)

    Args:
        sigma (float): :math:`\sigma`, the factor applied to the standard
            normal samples.
        rng (numpy.random.RandomState): Random number generator. When it
            is ``None``, ``random.prng`` of this package is used.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        w = I.NormalInitializer(5e-5)
        b = I.NormalInitializer(0.0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')
    """

    def __init__(self, sigma=1.0, rng=None):
        # Fall back to the package-level generator when none is supplied.
        self.rng = random.prng if rng is None else rng
        self.sigma = sigma

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{}({})'.format(cls_name, self.sigma)

    def __call__(self, shape):
        # Draw standard normal samples, scale them by sigma, then cast with a
        # call so that a scalar result (shape == ()) is handled as well.
        standard_samples = self.rng.randn(*shape)
        return random_float_type(standard_samples * self.sigma)

class UniformInitializer(BaseInitializer):
    r"""Initializer that samples from a uniform distribution.

    .. math::
        \mathbf x \sim {\cal U} (a, b)

    Args:
        lim (:obj:`tuple` of :obj:`float`): A tuple of two floats,
            :math:`(a, b)`.
        rng (numpy.random.RandomState): Random number generator. When it
            is ``None``, ``random.prng`` of this package is used.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        w = I.UniformInitializer() # this generates uniform distribution within the default range of (-1,1)
        b = I.UniformInitializer((-0.5,0.5))
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')
    """

    def __init__(self, lim=(-1, 1), rng=None):
        # Fall back to the package-level generator when none is supplied.
        self.rng = random.prng if rng is None else rng
        self.lim = lim

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{}({})'.format(cls_name, repr(self.lim))

    def __call__(self, shape):
        lower = self.lim[0]
        upper = self.lim[1]
        samples = self.rng.uniform(lower, upper, size=shape)
        # Cast with a call so that a scalar result is handled as well.
        return random_float_type(samples)

class UniformIntInitializer(BaseInitializer):
    r"""Initializer that samples integers from a uniform distribution.

    .. math::
        \mathbf x \sim {\cal U} ([a, b))

    Args:
        lim (:obj:`tuple` of :obj:`int`): A tuple of two ints,
            :math:`(a, b)`.
        rng (numpy.random.RandomState): Random number generator. When it
            is ``None``, ``random.prng`` of this package is used.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        w = I.UniformIntInitializer() # this generates uniform integer distribution within the default range of (0,10)
        b = I.UniformIntInitializer((-1,1))
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')
    """

    def __init__(self, lim=(0, 10), rng=None):
        # Fall back to the package-level generator when none is supplied.
        self.rng = random.prng if rng is None else rng
        self.lim = lim

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{}({})'.format(cls_name, repr(self.lim))

    def __call__(self, shape):
        # The integer samples are returned as produced by ``randint``; unlike
        # the float initializers, no cast is applied here.
        lower = self.lim[0]
        upper = self.lim[1]
        return self.rng.randint(lower, upper, size=shape)

class RangeInitializer(BaseInitializer):
    r"""Initializer that fills an array with an arithmetic sequence.

    The sequence runs along the last axis and is broadcast over all
    leading axes.

    .. math::
        \mathbf x[i] = start + step * i

    Args:
        start (int): A start value.
        step (int): A step value.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.initializer as I

        x = nn.Variable([100])
        x.d = I.RangeInitializer(0, 1)(x.shape)
    """

    def __init__(self, start=0, step=1):
        self.start = start
        self.step = step

    def __call__(self, shape):
        # Indices 0 .. shape[-1] - 1 along the last axis.
        last_axis_indices = np.arange(0, shape[-1], 1)
        sequence = self.start + last_axis_indices * self.step
        # Repeat the 1-D sequence over the leading axes of ``shape``.
        return np.broadcast_to(sequence, shape)

class ConstantInitializer(BaseInitializer):
    """Initializer that fills an array with a single constant.

    Args:
        value (float): A constant value.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        w = I.ConstantInitializer(0.1)
        b = I.ConstantInitializer() # this generates constant valued array of default value 0
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')
    """

    def __init__(self, value=0):
        self.value = value

    def __call__(self, shape):
        # Scale an all-ones array by the constant, then cast with a call so
        # that a scalar result (shape == ()) is handled as well.
        filled = np.ones(shape) * self.value
        return random_float_type(filled)

class OrthogonalInitializer(BaseInitializer):
    r"""Initializer producing orthogonal weights as proposed by Saxe et al.

    The requested shape is flattened to a 2-D matrix
    ``(shape[0], prod(shape[1:]))``, a Gaussian random matrix of that size
    is decomposed with SVD, and the factor whose shape matches the
    flattened shape is reshaped back and scaled by ``gain``.

    Args:
        gain (float): scaling factor which should be decided depending on a type of units.
        rng (numpy.random.RandomState): Random number generator. When it
            is ``None``, ``random.prng`` of this package is used.

    Example:

    .. code-block:: python

        import numpy as np
        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        w = I.OrthogonalInitializer(np.sqrt(2.0))
        b = I.ConstantInitializer(0.0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')

    References:
        * `Saxe, et al. Exact solutions to the nonlinear dynamics of
          learning in deep linear neural networks.
          <https://arxiv.org/abs/1312.6120>`_
    """

    def __init__(self, gain=1.0, rng=None):
        # Fall back to the package-level generator when none is supplied.
        self.rng = random.prng if rng is None else rng
        self.gain = gain

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{}({})'.format(cls_name, self.gain)

    def __call__(self, shape):
        # Collapse all trailing axes into one so that SVD sees a matrix.
        n_rows = shape[0]
        n_cols = int(np.prod(shape[1:]))
        flat_shape = (n_rows, n_cols)

        gaussian = self.rng.normal(0.0, 1.0, flat_shape)
        left, _, right = np.linalg.svd(gaussian, full_matrices=False)

        # With full_matrices=False only one of the two factors has the
        # flattened shape (both do for a square matrix, in which case the
        # left factor is taken).
        if left.shape == flat_shape:
            basis = left
        else:
            basis = right
        return random_float_type(basis.reshape(shape) * self.gain)

class WeightNormalizationScaleInitializer(BaseInitializer):
    r"""Compute the L2-norm for each weight kernel.

    This initializer is specific to the weight normalization scale. It keeps
    the same magnitude as the originally initialized weights even after the
    application of the weight normalization, at initialization only.

    Args:
        w (:obj:`Variable`): Weight to which the weight normalization is
            applied. Its ``shape`` and ``d`` attributes are read when the
            initializer is called.
        dim (:obj:`int`): Output dimension of the weight normalization.
            The norm is taken over every other axis of ``w``.
        eps (:obj:`float`): Epsilon of the weight normalization, added to
            the sum of squares before the square root.
    """

    def __init__(self, w, dim=0, eps=1e-12):
        # ``w`` must be kept: ``__call__`` reads ``self.w`` to compute norms.
        self.w = w
        self.dim = dim
        self.eps = eps

    def __repr__(self):
        # One replacement field per supplied argument; a surplus '{}' field
        # would make ``str.format`` raise IndexError.
        return '{}(dim={}, eps={})'.format(self.__class__.__name__,
                                           self.dim, self.eps)

    def __call__(self, shape):
        """Return the per-kernel L2-norms of the stored weight.

        Args:
            shape (:obj:`tuple` of :obj:`int`): Accepted for compatibility
                with the initializer calling convention; it is not used,
                the result shape follows from ``w`` and ``dim``.

        Returns:
            :obj:`numpy.ndarray`: ``sqrt(sum(w ** 2) + eps)`` reduced over
            all axes of ``w`` except ``dim``.
        """
        # Reduce over every axis except the output dimension.
        reduce_axes = tuple(
            a for a in range(len(self.w.shape)) if a != self.dim)
        w_norm_data = np.sqrt(
            np.sum(self.w.d ** 2, axis=reduce_axes) + self.eps)
        return random_float_type(w_norm_data)

def calc_normal_std_he_forward(inmaps, outmaps, kernel=(1, 1)):
    r"""Compute the standard deviation proposed by He et al. (forward case).

    .. math::
        \sigma = \sqrt{\frac{2}{NK}}

    Args:
        inmaps (int): Map size of an input Variable, :math:`N`.
        outmaps (int): Map size of an output Variable, :math:`M`.
            It does not enter this formula.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel spatial shape.
            In above definition, :math:`K` is the product of shape dimensions.
            In Affine, the default value should be used.

    Returns:
        The standard deviation :math:`\sigma`.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        s = I.calc_normal_std_he_forward(x.shape[1],64)
        w = I.NormalInitializer(s)
        b = I.ConstantInitializer(0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')

    References:
        * `He, et al. Delving Deep into Rectifiers: Surpassing Human-Level
          Performance on ImageNet Classification.
          <https://arxiv.org/abs/1502.01852>`_

    """
    # K * N: kernel size times the number of input maps.
    fan_in = np.prod(kernel) * inmaps
    return np.sqrt(2. / fan_in)

def calc_normal_std_he_backward(inmaps, outmaps, kernel=(1, 1)):
    r"""Compute the standard deviation of He et al. (backward case).

    .. math::
        \sigma = \sqrt{\frac{2}{MK}}

    Args:
        inmaps (int): Map size of an input Variable, :math:`N`.
            It does not enter this formula.
        outmaps (int): Map size of an output Variable, :math:`M`.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel spatial shape.
            In above definition, :math:`K` is the product of shape dimensions.
            In Affine, the default value should be used.

    Returns:
        The standard deviation :math:`\sigma`.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        s = I.calc_normal_std_he_backward(x.shape[1],64)
        w = I.NormalInitializer(s)
        b = I.ConstantInitializer(0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')

    References:
        * `He, et al. Delving Deep into Rectifiers: Surpassing Human-Level
          Performance on ImageNet Classification.
          <https://arxiv.org/abs/1502.01852>`_

    """
    # K * M: kernel size times the number of output maps.
    fan_out = np.prod(kernel) * outmaps
    return np.sqrt(2. / fan_out)

def calc_normal_std_glorot(inmaps, outmaps, kernel=(1, 1)):
    r"""Compute the standard deviation proposed by Glorot et al.

    Note:
        We have updated the definition as following from v.1.2. It may affect the
        behavior of existing scripts that rely on the default initialization.

    .. math::
        \sigma = \sqrt{\frac{2}{K(N + M)}}

    Args:
        inmaps (int): Map size of an input Variable, :math:`N`.
        outmaps (int): Map size of an output Variable, :math:`M`.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel spatial shape.
            In above definition, :math:`K` is the product of shape dimensions.
            In Affine, the default value should be used.

    Returns:
        The standard deviation :math:`\sigma`.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        s = I.calc_normal_std_glorot(x.shape[1],64)
        w = I.NormalInitializer(s)
        b = I.ConstantInitializer(0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')

    References:
        * `Glorot and Bengio. Understanding the difficulty of training deep
          feedforward neural networks
          <http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf>`_

    """
    # K * (N + M): kernel size times the sum of input and output maps.
    denominator = np.prod(kernel) * (inmaps + outmaps)
    return np.sqrt(2. / denominator)

def calc_uniform_lim_glorot(inmaps, outmaps, kernel=(1, 1)):
    r"""Compute the bounds of the uniform distribution proposed by Glorot et al.

    Note:
        We have updated the definition as following from v.1.3. It may affect the
        behavior of existing scripts that rely on the default initialization.

    .. math::
        b &= \sqrt{\frac{6}{K(N + M)}}\\
        a &= -b

    Args:
        inmaps (int): Map size of an input Variable, :math:`N`.
        outmaps (int): Map size of an output Variable, :math:`M`.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel spatial shape.
            In above definition, :math:`K` is the product of shape dimensions.
            In Affine, the default value should be used.

    Returns:
        :obj:`tuple`: The pair :math:`(a, b)` of lower and upper bound.

    Example:

    .. code-block:: python

        import nnabla as nn
        import nnabla.parametric_functions as PF
        import nnabla.initializer as I

        x = nn.Variable([60,1,28,28])
        lb,ub= I.calc_uniform_lim_glorot(x.shape[1],64)
        w = I.UniformInitializer((lb,ub))
        b = I.ConstantInitializer(0)
        h = PF.convolution(x, 64, [3, 3], w_init=w, b_init=b, pad=[1, 1], name='conv')

    References:
        * `Glorot and Bengio. Understanding the difficulty of training deep
          feedforward neural networks
          <http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf>`_

    """
    # K * (N + M): kernel size times the sum of input and output maps.
    denominator = np.prod(kernel) * (inmaps + outmaps)
    upper = np.sqrt(6. / denominator)
    # The interval is symmetric around zero.
    return -upper, upper