Source code for nnabla.parametric_functions

# Copyright 2017,2018,2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from six import exec_
import numpy as np

import nnabla as nn
import nnabla.functions as F
from nnabla.parameter import get_parameter_or_create, get_parameter
from nnabla.initializer import (
    calc_uniform_lim_glorot,
    ConstantInitializer, NormalInitializer, UniformInitializer,
    WeightNormalizationScaleInitializer)


def parametric_function_api(scope_name=None, param_desc=None):
    """Decorator for parametric functions.

    The decorated function is always called under a parameter scope ``scope_name``.
    Also, the decorator adds an additional argument ``name`` (:obj:`str`, default is ``None``) at the end. If ``name`` is specified, the scope ``scope_name`` comes under a scope ``name``. This feature could reduce vertical space usage of the source code. Any parametric function should be decorated by this.

    Args:
        scope_name (str, optional): The original function will be called under a parameter scope named by ``scope_name``.
        param_desc (list, optional): Descriptions of parameters will be automatically included into the docstring. This must be a list of tuples with 4 elements composed of (name (str), description (str), shape info (str), need_grad (bool)).

    Returns:
        function: A decorated parametric function.

    """
    if scope_name is None:
        scope_name = name

    def parametric_function_api_inside(func):
        from .utils.signature_utils import SignatureEx
        name = func.__name__
        doc = func.__doc__

        if param_desc:
            indent = 8
            try:
                desc = map(lambda d: ' ' * indent + '* {} (``need_grad={}``) : {}. (shape: ``{}``)'.format(d[0], d[3], d[1], d[2]), param_desc)
            except:
                raise ValueError(
                    'param_desc argument of parametric_function_api must be '
                    'None or a list of tuples with four elements composed of '
                    '(name (str), description (str), shape info (str), need_grad (bool)).')
            doc += '''
    Parameters to be registered
        The following variables are registered in a parameter scope ``"{}"``;

{}

    '''.format(scope_name, '\n'.join(desc))

        doc += """
    Note:

        If the ``name`` option is passed, the parameters become wrapped inside the parameter scope with the specified name, yielding the same results as the following code. This can be used to simplify the code.

        .. code-block:: python

            with parameter_scope(name):
                output = {name}(<args>)

    """.format(name=name)

        # Parsing argspecs
        sig = SignatureEx.from_callable(func)
        sig2 = sig.add_arg('name', default=None)
        signature = '(' + sig2.format_argument_signature() + ')' + \
            sig.format_return_annotation()
        shortsignature = sig.format_caller_argument_signature()

        # Check required argument
        assert 'fix_parameters' in sig.parameters, \
            "A parametric function must take `fix_parameters` as an argument." \
            " `{}{}` doesn't have it.".format(name, signature)

        code = """
def {name}{signature}:
    if name is None:
        with parameter_scope(scope_name):
            return func({shortsignature})
    with parameter_scope(name):
        with parameter_scope(scope_name):
            return func({shortsignature})
""".format(**locals())
        execdict = dict(
            func=func, parameter_scope=nn.parameter_scope, scope_name=scope_name)
        exec_(code, execdict)
        newfunc = execdict[name]
        newfunc.__doc__ = doc
        newfunc.__parametric_function_api_base__ = func
        newfunc.__scope_name__ = scope_name
        newfunc.__module__ = __name__
        return newfunc
    return parametric_function_api_inside
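
# Illustrative sketch, not part of the original module: how the ``name``
# argument added by ``parametric_function_api`` nests the parameter scope.
# The helper name, scope names, and shapes below are arbitrary choices.
def _example_name_scoping():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    x = nn.Variable((4, 16))
    y1 = PF.affine(x, 8, name="block1")   # registers "block1/affine/W" and "block1/affine/b"
    with nn.parameter_scope("block2"):    # equivalent explicit scoping
        y2 = PF.affine(x, 8)              # registers "block2/affine/W" and "block2/affine/b"
    return sorted(nn.get_parameters().keys())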
@parametric_function_api("affine", [
    ('W', 'Weight matrix', '(inmaps, outmaps)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def affine(inp, n_outmaps,
           base_axis=1,
           w_init=None, b_init=None,
           fix_parameters=False, rng=None, with_bias=True,
           apply_w=None, apply_b=None):
    """
    The affine layer, also known as the fully connected layer. Computes

    .. math::
        {\\mathbf y} = {\\mathbf A} {\\mathbf x} + {\\mathbf b}.

    where :math:`{\\mathbf x}, {\\mathbf y}` are the inputs and outputs respectively, and :math:`{\\mathbf A}, {\\mathbf b}` are constants.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix.
        n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        apply_w (function): Lambda, function, or callable object applied to the weights.
        apply_b (function): Lambda, function, or callable object applied to the bias.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        inmaps = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, True, not fix_parameters)
    if apply_w is not None:
        w = apply_w(w)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
        if apply_b is not None:
            b = apply_b(b)
    return F.affine(inp, w, b, base_axis)
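
# Illustrative usage sketch for ``affine``, not part of the original module.
# It assumes a (batch, channel, height, width) input; with ``base_axis=1`` the
# trailing dimensions are flattened, so "fc/affine/W" has shape (3*8*8, 10).
# The helper name, shapes, and scope name are arbitrary.
def _example_affine():
    import numpy as np
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    x = nn.Variable((2, 3, 8, 8))
    x.d = np.random.randn(*x.shape)
    y = PF.affine(x, 10, base_axis=1, name="fc")  # output shape: (2, 10)
    y.forward()
    return y.d.shape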
@parametric_function_api("svd_affine", [
    ('U', ':math:`{\\mathbf U}`', '(inmaps, r)', True),
    ('V', ':math:`{\\mathbf V}`', '(r, outmaps)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def svd_affine(inp, n_outmaps, r, base_axis=1, uv_init=None, b_init=None,
               fix_parameters=False, rng=None, with_bias=True):
    """SVD affine is a low rank approximation of the affine layer. It can be seen as two consecutive affine layers with a bottleneck. It computes:

    .. math::
        {\\mathbf y} = {\\mathbf U} {\\mathbf V} {\\mathbf x} + {\\mathbf b}.

    where :math:`{\\mathbf x}, {\\mathbf y}` are the inputs and outputs respectively, and :math:`{\\mathbf U}, {\\mathbf V}, {\\mathbf b}` are constants.

    The weights :math:`{\\mathbf U}` and :math:`{\\mathbf V}` are approximated with singular value decomposition (SVD) of the original weight matrix :math:`{\\mathbf W}` and by selecting the :math:`{R}` dominant singular values and the corresponding singular vectors. Therefore the low rank :math:`{R}` is the size of the bottleneck.

    If `uv_init` is a numpy array, :math:`{\\mathbf U}` and :math:`{\\mathbf V}` are computed such that `uv_init` is approximated by :math:`{\\mathbf{UV}}`. If `uv_init` is `None` or an initializer, the product of :math:`{\\mathbf U}` and :math:`{\\mathbf V}` approximates the random initialization.

    If :math:`{\\mathbf U}` and :math:`{\\mathbf V}` exist in the context, they take precedence over `uv_init`.

    Suppose the weight of the affine is of :math:`{I \\times O}` and the compression rate you want to specify is :math:`{CR}`, then you set :math:`{R}` as

    .. math::
        R = \\left\\lfloor \\frac{(1 - CR)OI}{O + I} \\right\\rfloor.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix.
        n_outmaps (int or tuple): Number of output neurons per data.
        r (int): Rank of the factorized layer (size of the bottleneck).
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        uv_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        ~nnabla.Variable: :math:`(B + 1)`-D array. (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))

    inmaps = np.prod(inp.shape[base_axis:])
    if uv_init is None:
        uv_init = UniformInitializer(
            calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng)
    if type(uv_init) is np.ndarray:
        # TODO: Assert that size of uv_init is correct
        # uv is initialized with a numpy array
        uv = uv_init
    else:
        # uv is initialized from the initializer
        uv = uv_init([int(np.prod(inp.shape[base_axis:])), ] + list(n_outmaps))

    u = get_parameter('U')
    v = get_parameter('V')
    if (u is None) or (v is None):
        assert r > 0, "svd_affine: The rank must be larger than zero"
        u_, s_, v_ = np.linalg.svd(uv.reshape(inmaps, n_outmap),
                                   full_matrices=False)
        u_ = np.dot(u_, np.diag(s_))  # fold s into u
        u_ = u_[:, :r]
        v_ = v_[:r, :]
        v_ = v_.reshape([r] + n_outmaps)
        u = nn.Variable([int(np.prod(inp.shape[base_axis:])), r],
                        need_grad=True)
        u.d = u_
        nn.parameter.set_parameter("U", u)
        v = nn.Variable([r] + n_outmaps, need_grad=True)
        v.d = v_
        nn.parameter.set_parameter("V", v)
    if fix_parameters == u.need_grad:
        u = u.get_unlinked_variable(need_grad=not fix_parameters)
    if fix_parameters == v.need_grad:
        v = v.get_unlinked_variable(need_grad=not fix_parameters)
    v.need_grad = not fix_parameters
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
    return F.affine(F.affine(inp, u, bias=None, base_axis=base_axis),
                    v, bias=b, base_axis=base_axis)
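
# Illustrative sketch, not part of the original module: choosing the rank r of
# ``svd_affine`` from a target compression rate CR using the docstring formula
# R = floor((1 - CR) * O * I / (O + I)). The helper name and values are arbitrary.
def _example_svd_affine_rank():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    inmaps, outmaps, cr = 256, 128, 0.75
    r = int((1.0 - cr) * outmaps * inmaps / (outmaps + inmaps))
    x = nn.Variable((4, inmaps))
    y = PF.svd_affine(x, outmaps, r, name="svd_fc")  # registers U: (256, r), V: (r, 128)
    return r, y.shape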
@parametric_function_api("bicon_affine", [
    ('W', 'Weight matrix in floating type', '(inmaps, outmaps)', True),
    ('Wb', 'Binarized weights', '(inmaps, outmaps)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def binary_connect_affine(inp, n_outmaps,
                          base_axis=1, quantize_zero_to=1.0,
                          w_init=None, wb_init=None, b_init=None,
                          fix_parameters=False, rng=None, with_bias=True):
    """Binary Connect Affine, multiplier-less inner-product.

    Binary Connect Affine is an affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows:

    .. math::
        y_j = \sum_{i} sign(w_{ji}) x_i.

    Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1`, and the inner product simplifies to addition. This function should be used together with Batch Normalization.

    References:

        M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015.

    .. note::

        1) If you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync.

        3) Quantized values are stored as floating point number for `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        quantize_zero_to (float): Input value at zero is quantized to this value.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        wb_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for binary weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))
    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if wb_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, True, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        wb_init, False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
    return F.binary_connect_affine(inp, w, wb, b, base_axis, quantize_zero_to)
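
# Illustrative sketch, not part of the original module: ``binary_connect_affine``
# followed by batch normalization, as the docstring recommends. The helper name,
# shapes, and scope names are arbitrary.
def _example_binary_connect_affine():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    x = nn.Variable((8, 64))
    h = PF.binary_connect_affine(x, 32, name="bc_fc")   # registers W, Wb, b
    h = PF.batch_normalization(h, batch_stat=True, name="bc_bn")
    return h.shape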
@parametric_function_api("bwn_affine", [
    ('W', 'Weight matrix in floating type', '(inmaps, outmaps)', True),
    ('Wb', 'Binarized weights', '(inmaps, outmaps)', False),
    ('alpha', 'Scaling factor :math:`\\alpha`', '(outmaps,)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def binary_weight_affine(inp, n_outmaps,
                         base_axis=1, quantize_zero_to=1.0,
                         w_init=None, wb_init=None, b_init=None,
                         fix_parameters=False, rng=None, with_bias=True):
    """Binary Weight Affine, multiplier-less inner-product with a scale factor.

    Binary Weight Affine is the affine function, but the inner product in this function is the following,

    .. math::
        y_j = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}} \sum_{i} sign(w_{ji}) x_i

    Therefore :math:`sign(w_{ji})` is either :math:`1` or :math:`-1`, and the inner product simplifies to addition followed by the scaling factor :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_j\\|_{\\ell_1}}`. The number of :math:`\\alpha` coefficients equals the number of outmaps of the affine function.

    References:

        Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks." arXiv preprint arXiv:1603.05279 (2016).

    .. note::

        1) If you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync.

        3) Quantized values are stored as floating point number for `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        quantize_zero_to (float): Input value at zero is quantized to this value.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        wb_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the binary weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weight and bias will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))

    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if wb_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, True, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        wb_init, False)
    alpha = get_parameter_or_create(
        "alpha", n_outmaps, ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
    return F.binary_weight_affine(inp, w, wb, alpha, b, base_axis,
                                  quantize_zero_to)
@parametric_function_api("inq_affine", [
    ('W', 'Weight matrix in floating type', '(inmaps, outmaps)', True),
    ('I', 'Binary indicator matrix of fixed weights', '(inmaps, outmaps)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def inq_affine(inp, n_outmaps, base_axis=1, num_bits=4, inq_iterations=(),
               selection_algorithm='random', seed=-1, w_init=None, i_init=None,
               b_init=None, fix_parameters=False, rng=None, with_bias=True):
    """Incremental Network Quantization Affine Layer

    During training, the weights are sequentially quantized to power-of-two values, which allows the training of a multiplier-less network. Using `inq_iterations`, one can specify after how many forward passes half of the learnable weights are fixed and quantized to powers-of-two. After reaching the last value in `inq_iterations`, all weights are fixed.

    For more details, please refer to the reference.

    Reference:

        Zhou A, Yao A, Guo Y, Xu L, Chen Y. Incremental network quantization: Towards lossless CNNs with low-precision weights. <https://arxiv.org/abs/1702.03044>

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix.
        n_outmaps (int or :obj:`tuple` of :obj:`int`): Number of output neurons per data.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        num_bits (int): Number of bits per weight. Value has to be larger than 1 as one bit is already used to code the value "0".
        inq_iterations (tuple of int): Tuple of iteration numbers at which we fix half of the weights.
        selection_algorithm (str): Chooses the algorithm that is used to decide which weights are fixed. ("largest_abs" ... fix weights with largest absolute value, "random" ... fix weights randomly)
        seed (int): Random seed for the INQ algorithm.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        i_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for indicators (0 ... learnable, 1 ... fixed). By default, it is initialized with zeros.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weight and bias will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if not hasattr(n_outmaps, '__iter__'):
        n_outmaps = [n_outmaps]
    n_outmaps = list(n_outmaps)
    n_outmap = int(np.prod(n_outmaps))

    if w_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(fan_in, n_outmap), rng=rng)
    if i_init is None:
        fan_in = np.prod(inp.shape[base_axis:])
        i_init = ConstantInitializer()
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        w_init, True, not fix_parameters)
    i = get_parameter_or_create(
        "I", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps,
        i_init, False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", n_outmaps, b_init, True, not fix_parameters)
    return F.inq_affine(inp, w, i, b, base_axis, num_bits, inq_iterations,
                        selection_algorithm, seed)
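
# Illustrative sketch, not part of the original module: ``inq_affine`` with an
# INQ schedule. With the (hypothetical) schedule below, half of the remaining
# learnable weights are fixed and quantized after iterations 1000 and 2000, and
# all weights are fixed after 3000. The helper name and values are arbitrary.
def _example_inq_affine():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    x = nn.Variable((16, 128))
    y = PF.inq_affine(x, 10, num_bits=4,
                      inq_iterations=(1000, 2000, 3000),
                      selection_algorithm='largest_abs', name="inq_fc")
    return y.shape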
@parametric_function_api("conv", [
    ('W', 'Filter weights', '(outmaps, inmaps // group, *kernel)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def convolution(inp, outmaps, kernel,
                pad=None, stride=None, dilation=None, group=1,
                channel_last=False,
                w_init=None, b_init=None,
                base_axis=1, fix_parameters=False, rng=None, with_bias=True,
                apply_w=None, apply_b=None):
    """N-D Convolution with a bias term.

    For Dilated Convolution (a.k.a. Atrous Convolution), refer to:

    - Chen et al., DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs. https://arxiv.org/abs/1606.00915

    - Yu et al., Multi-Scale Context Aggregation by Dilated Convolutions. https://arxiv.org/abs/1511.07122

    Note:

        Convolution is a computationally intensive operation that should preferably be run with the `cudnn` backend. NNabla then uses CuDNN library functions to determine and cache the fastest algorithm for the given set of convolution parameters, which results in additional memory consumption which may pose a problem for GPUs with insufficient memory size. In that case, the `NNABLA_CUDNN_WORKSPACE_LIMIT` environment variable can be used to restrict the choice of algorithms to those that fit the given workspace memory limit, expressed in bytes. In some cases it may also be desired to restrict the automatic search to algorithms that produce deterministic (reproducible) results. This can be requested by setting the environment variable `NNABLA_CUDNN_DETERMINISTIC` to a non-zero value.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction.
        channel_last (bool): If True, the last dimension is considered as channel dimension, a.k.a. NHWC order.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        apply_w (function): Lambda, function, or callable object applied to the weights.
        apply_b (function): Lambda, function, or callable object applied to the bias.

    Returns:
        :class:`~nnabla.Variable`: N-D array. See :obj:`~nnabla.functions.convolution` for the output shape.

    """
    if channel_last:
        channels = inp.shape[-1]
        filter_shape = tuple(kernel) + (channels // group,)
    else:
        channels = inp.shape[base_axis]
        filter_shape = (channels // group,) + tuple(kernel)
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(channels, outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps,) + filter_shape,
        w_init, True, not fix_parameters)
    if apply_w is not None:
        w = apply_w(w)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
        if apply_b is not None:
            b = apply_b(b)
    return F.convolution(inp, w, b, base_axis, pad, stride, dilation, group,
                         channel_last)
@parametric_function_api("svd_conv", [
    ('U', 'Decomposed filter weights :math:`{\\mathbf U}`', '(inmaps * r, *kernel)', True),
    ('V', 'Decomposed filter weights :math:`{\\mathbf V}`', '(outmaps, inmaps * r, 1, ...)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def svd_convolution(inp, outmaps, kernel, r, pad=None, stride=None,
                    dilation=None, uv_init=None, b_init=None, base_axis=1,
                    fix_parameters=False, rng=None, with_bias=True):
    """SVD convolution is a low rank approximation of the convolution layer. It can be seen as a depthwise convolution followed by a 1x1 convolution.

    The flattened kernels for the i-th input map are expressed by their low rank approximation. The kernels for the i-th input :math:`{\\mathbf W_i}` are approximated with the singular value decomposition (SVD) and by selecting the :math:`{R}` dominant singular values and the corresponding singular vectors.

    .. math::
        {\\mathbf W_{:,i,:}} \\approx {\\mathbf U_i} {\\mathbf V_i}.

    :math:`{\\mathbf U}` contains the weights of the depthwise convolution with multiplier :math:`{R}` and :math:`{\\mathbf V}` contains the weights of the 1x1 convolution.

    If `uv_init` is a numpy array, :math:`{\\mathbf U}` and :math:`{\\mathbf V}` are computed such that `uv_init` is approximated by :math:`{\\mathbf{UV}}`. If `uv_init` is `None` or an initializer, the product of :math:`{\\mathbf U}` and :math:`{\\mathbf V}` approximates the random initialization.

    If :math:`{\\mathbf U}` and :math:`{\\mathbf V}` exist in the context, they take precedence over `uv_init`.

    Suppose the kernel tensor of the convolution is of :math:`{O \\times I \\times K \\times K}` and the compression rate you want to specify is :math:`{CR}`, then you set :math:`{R}` as

    .. math::
        R = \\left\\lfloor \\frac{(1 - CR)OIK^2}{I(O + K^2)} \\right\\rfloor.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (tuple): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3, 5).
        r (int): Rank of the factorized layer.
        pad (tuple): Padding sizes (`int`) for dimensions.
        stride (tuple): Stride sizes (`int`) for dimensions.
        dilation (tuple): Dilation sizes (`int`) for dimensions.
        uv_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)

    """
    assert r > 0, "svd_convolution: The rank must be larger than zero"
    if uv_init is None:
        uv_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if type(uv_init) is np.ndarray:
        # TODO: Assert that size of uv_init is correct
        # uv is initialized with a numpy array
        uv = uv_init
    else:
        # uv is initialized from the initializer
        uv = uv_init((outmaps, inp.shape[base_axis]) + tuple(kernel))

    # flatten kernels
    uv = uv.reshape((outmaps, inp.shape[base_axis], np.prod(kernel)))

    u = get_parameter('U')
    v = get_parameter('V')

    if (u is None) or (v is None):
        inmaps = inp.shape[base_axis]
        u_low_rank = np.zeros((inmaps, np.prod(kernel), r))
        v_low_rank = np.zeros((inmaps, r, outmaps))
        for i in range(inmaps):
            K = np.transpose(uv[:, i, :])
            u_, s_, v_ = np.linalg.svd(K, full_matrices=False)
            u_low_rank[i, :, :] = np.dot(u_[:, :r], np.diag(s_[:r]))
            v_low_rank[i, :, :] = v_[:r, :]

        # reshape U : (I,K*K,r) -> (I*r,K,K) for depthwise conv
        u = nn.Variable((inmaps * r,) + tuple(kernel), need_grad=True)
        u.d = (np.transpose(u_low_rank, axes=(0, 2, 1))
               .reshape((inmaps * r,) + tuple(kernel)))
        nn.parameter.set_parameter("U", u)

        # reshape V : (I,r,O) -> (O,I*r,1,1) for 1x1 conv
        kernel_one = (1,) * len(kernel)  # 1x1 for 2D convolution
        v = nn.Variable((outmaps, inmaps * r) + kernel_one, need_grad=True)
        v.d = (np.transpose(v_low_rank, axes=(2, 0, 1))
               .reshape((outmaps, inmaps * r) + kernel_one))
        nn.parameter.set_parameter("V", v)

    if fix_parameters == u.need_grad:
        u = u.get_unlinked_variable(need_grad=not fix_parameters)
    if fix_parameters == v.need_grad:
        v = v.get_unlinked_variable(need_grad=not fix_parameters)

    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)

    y = F.depthwise_convolution(inp, u, bias=None, base_axis=base_axis,
                                pad=pad, stride=stride, dilation=dilation,
                                multiplier=r)
    y = F.convolution(y, v, bias=b, base_axis=base_axis, pad=None, stride=None,
                      dilation=None, group=1)
    return y
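
# Illustrative sketch, not part of the original module: choosing the rank r of
# ``svd_convolution`` from a target compression rate CR with the docstring
# formula R = floor((1 - CR) * O * I * K^2 / (I * (O + K^2))). The helper name
# and values are arbitrary.
def _example_svd_convolution_rank():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    inmaps, outmaps, k, cr = 64, 64, 3, 0.5
    r = int((1.0 - cr) * outmaps * inmaps * k * k / (inmaps * (outmaps + k * k)))
    x = nn.Variable((1, inmaps, 16, 16))
    y = PF.svd_convolution(x, outmaps, (k, k), r, pad=(1, 1), name="svd_conv1")
    return r, y.shape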
@parametric_function_api("cpd3_conv", [
    ('I', 'Decomposed filter weights :math:`{\\mathbf I}`', '(r, inmaps, 1, ...)', True),
    ('K', 'Decomposed filter weights :math:`{\\mathbf K}`', '(r, *kernel)', True),
    ('O', 'Decomposed filter weights :math:`{\\mathbf O}`', '(outmaps, r, 1, ...)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def cpd3_convolution(inp, outmaps, kernel, r, pad=None, stride=None,
                     dilation=None, oik_init=None, b_init=None, base_axis=1,
                     fix_parameters=False, rng=None, with_bias=True,
                     max_iter=500, stopping_criterion=1e-5, lambda_reg=0.0):
    """CP convolution is a low rank approximation of a convolution layer. A 3D tensor containing the parameter is built by collapsing the N-D kernels into 1D, then the tensor is decomposed into three matrices. The decomposed layer can be seen as linear combinations of the input feature maps to :math:`{R}` feature maps, followed by a depthwise convolution, followed by linear combinations of the feature maps to compute the output feature maps.

    The CP decomposition allows to approximate the kernel tensor by :math:`{R}` rank-1 tensors of the form:

    .. math::
        \\sum_{r=1}^{R} \\lambda_r {\\mathbf{o}^{(r)} \\otimes \\mathbf{i}^{(r)} \\otimes \\mathbf{k}^{(r)}},

    where :math:`{\\lambda}_r` is the normalization coefficient and :math:`{\\otimes}` is the outer product.

    If `oik_init` is a numpy array, :math:`{\\mathbf O}`, :math:`{\\mathbf I}` and :math:`{\\mathbf K}` are computed such that the array is approximated by their composition. If `oik_init` is `None` or an initializer, the composition approximates the random initialization.

    If `O`, `I` and `K` exist in the context, they take precedence over `oik_init`.

    Suppose the kernel tensor of the convolution is of :math:`{O \\times I \\times K \\times K}` and the compression rate you want to specify is :math:`{CR}`, then you set :math:`{R}` as

    .. math::
        R = \\left\\lfloor \\frac{(1 - CR)OIK^2}{O + I + K^2} \\right\\rfloor.

    References:

        - Lebedev, Vadim, Yaroslav Ganin, Maksim Rakhuba, Ivan Oseledets, and Victor Lempitsky, "Speeding-up convolutional neural networks using fine-tuned cp-decomposition.", arXiv preprint arXiv:1412.6553 (2014).

        - Marcella Astrid, Seung-Ik Lee, "CP-decomposition with Tensor Power Method for Convolutional Neural Networks Compression", BigComp 2017.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        r (int): Rank of the factorized layer.
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        oik_init (numpy array or :obj:`nnabla.initializer.BaseInitializer`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. It is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        max_iter (int): Max iteration of the ALS.
        stopping_criterion (float): Threshold for stopping the ALS. If the value is negative, the convergence check is ignored; in other words, it may reduce the computation time.
        lambda_reg (float): Regularization parameter for the ALS. Larger lambda_reg means larger regularization.

    Returns:
        :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. (:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`)

    """
    if oik_init is None:
        oik_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if type(oik_init) is np.ndarray:
        # TODO: Assert that size of oik_init is correct
        # oik is initialized with a numpy array
        oik = oik_init
    else:
        # oik is initialized from the initializer
        oik = oik_init((outmaps, inp.shape[base_axis]) + tuple(kernel))

    # flatten kernels
    oik = oik.reshape((outmaps, inp.shape[base_axis], np.prod(kernel)))

    o = get_parameter('O')
    i = get_parameter('I')
    k = get_parameter('K')

    if (o is None) or (i is None) or (k is None):
        assert r > 0, "cpd3_convolution: The rank must be larger than zero"
        from nnabla.utils.factorization import cpd
        als = cpd.ALS()
        U, lmbda = als.solve(X=oik, rank=r,
                             max_iter=max_iter,
                             stopping_criterion=stopping_criterion,
                             lambda_reg=lambda_reg,
                             dtype=oik.dtype,
                             rng=rng)
        o_ = U[0] * lmbda
        i_ = U[1]
        k_ = U[2]

        kernel_one = (1,) * len(kernel)  # 1x1 for 2D convolution
        inmaps = inp.shape[base_axis]

        # reshape I : (I,r) -> (r,I,1,1)
        i = nn.Variable((r, inmaps) + kernel_one, need_grad=True)
        i.d = np.transpose(i_).reshape((r, inmaps) + kernel_one)
        nn.parameter.set_parameter("I", i)

        # reshape O : (O,r) -> (O,r,1,1)
        o = nn.Variable((outmaps, r) + kernel_one, need_grad=True)
        o.d = o_.reshape((outmaps, r) + kernel_one)
        nn.parameter.set_parameter("O", o)

        # reshape K : (K*K,r) -> (r,K,K)
        k = nn.Variable((r,) + kernel, need_grad=True)
        k.d = np.transpose(k_).reshape((r,) + kernel)
        nn.parameter.set_parameter("K", k)

    if fix_parameters == o.need_grad:
        o = o.get_unlinked_variable(need_grad=not fix_parameters)
    if fix_parameters == i.need_grad:
        i = i.get_unlinked_variable(need_grad=not fix_parameters)
    if fix_parameters == k.need_grad:
        k = k.get_unlinked_variable(need_grad=not fix_parameters)

    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)

    y = F.convolution(inp, i, bias=None, base_axis=base_axis, pad=None,
                      stride=None, dilation=None, group=1)
    y = F.depthwise_convolution(y, k, bias=None, base_axis=base_axis, pad=pad,
                                stride=stride, dilation=dilation, multiplier=1)
    y = F.convolution(y, o, bias=b, base_axis=base_axis, pad=None, stride=None,
                      dilation=None, group=1)
    return y
@parametric_function_api("bicon_conv", [
    ('W', 'Filter weights in float', '(outmaps, inmaps, *kernel)', True),
    ('Wb', 'Binarized filter weights', '(outmaps, inmaps, *kernel)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def binary_connect_convolution(inp, outmaps, kernel,
                               pad=None, stride=None, dilation=None, group=1,
                               quantize_zero_to=1.0,
                               w_init=None, wb_init=None, b_init=None,
                               base_axis=1, fix_parameters=False, rng=None,
                               with_bias=True):
    """Binary Connect Convolution, multiplier-less inner-product.

    Binary Connect Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows:

    .. math::
        y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}.

    Therefore :math:`sign(w_{n, m, i, j})` is either :math:`1` or :math:`-1`, and the inner product simplifies to addition. This function should be used together with BatchNormalization.

    References:

        M. Courbariaux, Y. Bengio, and J.-P. David. "BinaryConnect: Training Deep Neural Networks with binary weights during propagations." Advances in Neural Information Processing Systems. 2015.

    .. note::

        1) If you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync.

        3) Quantized values are stored as floating point number for `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction.
        quantize_zero_to (float): Input value at zero is quantized to this value.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        wb_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for binary weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if wb_init is None:
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, True, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        wb_init, False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
    return F.binary_connect_convolution(inp, w, wb, b, base_axis, pad, stride,
                                        dilation, group, quantize_zero_to)
@parametric_function_api("bwn_conv", [
    ('W', 'Filter weights in float', '(outmaps, inmaps, *kernel)', True),
    ('Wb', 'Binarized filter weights', '(outmaps, inmaps, *kernel)', False),
    ('alpha', 'Scaling factor :math:`\\alpha`', '(outmaps,)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def binary_weight_convolution(inp, outmaps, kernel,
                              pad=None, stride=None, dilation=None, group=1,
                              quantize_zero_to=1.0,
                              w_init=None, wb_init=None, b_init=None,
                              base_axis=1, fix_parameters=False, rng=None,
                              with_bias=True):
    """Binary Weight Convolution, multiplier-less inner-product with a scale factor.

    Binary Weight Convolution is the convolution function, but the inner product in this function is the following,

    .. math::
        y_{n, a, b} = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}} \sum_{m} \sum_{i} \sum_{j} sign(w_{n, m, i, j}) x_{m, a + i, b + j}.

    Therefore :math:`sign(w_{n, m, i, j})` is either :math:`1` or :math:`-1`, and the inner product simplifies to addition followed by the scaling factor :math:`\\alpha = \\frac{1}{\\|\\mathbf{w}_n\\|_{\\ell_1}}`. The number of :math:`\\alpha` coefficients equals the number of outmaps of the convolution function.

    References:

        Rastegari, Mohammad, et al. "XNOR-Net: ImageNet Classification Using Binary Convolutional Neural Networks." arXiv preprint arXiv:1603.05279 (2016).

    .. note::

        1) If you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the binarized weights (`binary_weight`).

        2) The weights and the binary weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the binary weights will not be in sync.

        3) Quantized values are stored as floating point number for `binary_weight`, since this function is only for simulation purposes.

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction.
        quantize_zero_to (float): Input value at zero is quantized to this value.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        wb_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for binary weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if wb_init is None:
        wb_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, True, not fix_parameters)
    wb = get_parameter_or_create(
        "Wb", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        wb_init, False)
    alpha = get_parameter_or_create(
        "alpha", (outmaps, ), ConstantInitializer(0), False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
    return F.binary_weight_convolution(inp, w, wb, alpha, b, base_axis, pad,
                                       stride, dilation, group, quantize_zero_to)
@parametric_function_api("inq_conv", [
    ('W', 'Filter weights in float', '(outmaps, inmaps, *kernel)', True),
    ('I', 'Binary indicator matrix of fixed weights', '(outmaps, inmaps, *kernel)', False),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def inq_convolution(inp, outmaps, kernel,
                    pad=None, stride=None, dilation=None, group=1,
                    num_bits=4, inq_iterations=(), selection_algorithm='random',
                    seed=-1, w_init=None, i_init=None, b_init=None,
                    base_axis=1, fix_parameters=False, rng=None, with_bias=True):
    """Incremental Network Quantization Convolution Layer

    During training, the weights are sequentially quantized to power-of-two values, which allows the training of a multiplier-less network. Using `inq_iterations`, one can specify after how many forward passes half of the learnable weights are fixed and quantized to powers-of-two. After reaching the last value in `inq_iterations`, all weights are fixed.

    For more details, please refer to the reference.

    Reference:

        Zhou A, Yao A, Guo Y, Xu L, Chen Y. Incremental network quantization: Towards lossless CNNs with low-precision weights. <https://arxiv.org/abs/1702.03044>

    Args:
        inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it was a matrix.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels).
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        num_bits (int): Number of bits per weight. Value has to be larger than 1 as one bit is already used to code the value "0".
        inq_iterations (tuple of int): Tuple of iteration numbers at which we fix half of the weights.
        selection_algorithm (str): Chooses the algorithm that is used to decide which weights are fixed. ("largest_abs" ... fix weights with largest absolute value, "random" ... fix weights randomly)
        seed (int): Random seed for the INQ algorithm.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        i_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the indicators (0 ... learnable, 1 ... fixed). By default, it is initialized with zeros.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the bias. By default, it is initialized with zeros if `with_bias` is `True`.
        fix_parameters (bool): When set to `True`, the weight and bias will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(inp.shape[base_axis], outmaps,
                                    tuple(kernel)), rng=rng)
    if i_init is None:
        i_init = ConstantInitializer()
    if b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        w_init, True, not fix_parameters)
    i = get_parameter_or_create(
        "I", (outmaps, inp.shape[base_axis]) + tuple(kernel),
        i_init, False)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
    return F.inq_convolution(inp, w, i, b, base_axis, pad, stride, dilation,
                             group, num_bits, inq_iterations,
                             selection_algorithm, seed)
@parametric_function_api("deformable_conv", [
    ('W', 'Filter weights', '(outmaps, inmaps // group, *kernel)', True),
    ('b', 'Bias vector', '(outmaps,)', True),
])
def deformable_convolution(inp, outmaps, kernel, offset, mask=None,
                           pad=None, stride=None, dilation=None, group=1,
                           deformable_group=1, channel_last=False,
                           w_init=None, b_init=None,
                           base_axis=1, fix_parameters=False, rng=None,
                           with_bias=True, apply_w=None, apply_b=None):
    """2D Deformable Convolution with a bias term. If a mask is given, this function performs Deformable Convolution v2.

    References:

    - Dai et al., Deformable Convolutional Networks. https://arxiv.org/abs/1703.06211

    - Zhu et al., Deformable ConvNets v2: More Deformable, Better Results. https://arxiv.org/abs/1811.11168

    Args:
        inp (~nnabla.Variable): N-D array.
        outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        offset (~nnabla.Variable): Offsets for deformable convolutions. Shape is fixed to :math:`(N, deformable{\_}group \\times 2 \\times Kh \\times Kw, H, W)`. Offsets must be calculated externally through a separate convolution layer.
        mask (~nnabla.Variable): Normalized mask for deformable convolutions v2. Shape is fixed to :math:`(N, deformable{\_}group \\times Kh \\times Kw, H, W)`. Masks must be calculated externally together with the offsets through a separate convolution layer.
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction.
        deformable_group (int): Number of deformable groups of channels. This makes connections across channels more sparse by grouping connections along map direction.
        channel_last (bool): If True, the last dimension is considered as channel dimension, a.k.a. NHWC order.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.
        apply_w (function): Lambda, function, or callable object applied to the weights.
        apply_b (function): Lambda, function, or callable object applied to the bias.

    Returns:
        :class:`~nnabla.Variable`: N-D array. See :obj:`~nnabla.functions.convolution` for the output shape.

    """
    if channel_last:
        channels = inp.shape[-1]
        filter_shape = tuple(kernel) + (channels // group,)
    else:
        channels = inp.shape[base_axis]
        filter_shape = (channels // group,) + tuple(kernel)
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(channels, outmaps, tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (outmaps,) + filter_shape,
        w_init, True, not fix_parameters)
    if apply_w is not None:
        w = apply_w(w)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (outmaps,), b_init, True, not fix_parameters)
        if apply_b is not None:
            b = apply_b(b)
    return F.deformable_convolution(inp, w, offset, mask, b, base_axis, pad,
                                    stride, dilation, group, deformable_group,
                                    channel_last)
@parametric_function_api("depthwise_conv", [
    ('W', 'Filter weights', '(inmaps * multiplier, *kernel)', True),
    ('b', 'Bias vector', '(inmaps * multiplier,)', True),
])
def depthwise_convolution(inp, kernel,
                          pad=None, stride=None, dilation=None, multiplier=1,
                          w_init=None, b_init=None,
                          base_axis=1, fix_parameters=False, rng=None,
                          with_bias=True):
    """
    N-D Depthwise Convolution with a bias term.

    Reference:

    - F. Chollet. "Xception: Deep Learning with Depthwise Separable Convolutions." https://arxiv.org/abs/1610.02357

    Args:
        inp (~nnabla.Variable): N-D array.
        kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5).
        pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions.
        stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions.
        dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions.
        multiplier (:obj:`int`): Number of output feature maps per input feature map.
        w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`.
        b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`.
        base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions.
        fix_parameters (bool): When set to `True`, the weights and biases will not be updated.
        rng (numpy.random.RandomState): Random generator for Initializer.
        with_bias (bool): Specify whether to include the bias term.

    Returns:
        :class:`~nnabla.Variable`: N-D array. See :obj:`~nnabla.functions.depthwise_convolution` for the output shape.

    """
    if w_init is None:
        w_init = UniformInitializer(
            calc_uniform_lim_glorot(
                inp.shape[base_axis] * multiplier, inp.shape[base_axis],
                tuple(kernel)), rng=rng)
    if with_bias and b_init is None:
        b_init = ConstantInitializer()
    w = get_parameter_or_create(
        "W", (inp.shape[base_axis] * multiplier,) + tuple(kernel),
        w_init, True, not fix_parameters)
    b = None
    if with_bias:
        b = get_parameter_or_create(
            "b", (inp.shape[base_axis] * multiplier,),
            b_init, True, not fix_parameters)
    return F.depthwise_convolution(inp, w, b, base_axis, pad, stride, dilation,
                                   multiplier)
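
# Illustrative sketch, not part of the original module: a depthwise separable
# block built from ``depthwise_convolution`` followed by a 1x1 ``convolution``,
# in the spirit of the Xception reference. The helper name, shapes, and scope
# names are arbitrary.
def _example_depthwise_separable():
    import nnabla as nn
    import nnabla.parametric_functions as PF
    nn.clear_parameters()
    x = nn.Variable((1, 32, 28, 28))
    h = PF.depthwise_convolution(x, (3, 3), pad=(1, 1), name="dw")  # per-channel 3x3
    y = PF.convolution(h, 64, (1, 1), name="pw")                    # pointwise mixing
    return y.shape  # (1, 64, 28, 28)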
[docs]@parametric_function_api("deconv", [ ('W', 'Filter weights', '(inmaps, outmaps // group, *kernel)', True), ('b', 'Bias vector', '(outmaps,)', True), ]) def deconvolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, channel_last=False, output_padding=None, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, apply_w=None, apply_b=None): """ Deconvolution layer. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of deconvolution kernels (which is equal to the number of output channels). For example, to apply deconvolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply deconvolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels sparser by grouping connections along map direction. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. apply_w (function): Lambda, function, or callable object applied to the weights. apply_b (function): Lambda, function, or callable object applied to the bias. Returns: :class:`~nnabla.Variable`: N-D array. See :obj:`~nnabla.functions.deconvolution` for the output shape. """ if channel_last: channels = inp.shape[-1] weights_shape = (channels,) + tuple(kernel) + (outmaps // group,) else: channels = inp.shape[base_axis] weights_shape = (channels, outmaps // group,) + tuple(kernel) if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(outmaps, channels, tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", weights_shape, w_init, True, not fix_parameters) if apply_w is not None: w = apply_w(w) b = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, True, not fix_parameters) if apply_b is not None: b = apply_b(b) return F.deconvolution(inp, w, b, base_axis, pad, stride, dilation, group, channel_last, output_padding)
[docs]@parametric_function_api("depthwise_deconv", [ ('W', 'Filter weights', '(inmaps,) + kernel', True), ('b', 'Bias vector', '(inmaps / divisor,)', True), ]) def depthwise_deconvolution(inp, kernel, pad=None, stride=None, dilation=None, divisor=1, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True): """Depthwise deconvolution computes the transposed depthwise convolution for one-dimensional and two-dimensional input data. Args: inp (~nnabla.Variable): N-D array. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. divisor (:obj:`int`): Number of input feature maps per output feature map. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. Returns: :class:`~nnabla.Variable`: N-D array. See :obj:`~nnabla.functions.depthwise_deconvolution` for the output shape. """ if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot( inp.shape[base_axis], inp.shape[base_axis], tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() w = get_parameter_or_create( "W", (inp.shape[base_axis],) + tuple(kernel), w_init, True, not fix_parameters) b = None if with_bias: b = get_parameter_or_create( "b", (inp.shape[base_axis] // divisor,), b_init, True, not fix_parameters) return F.depthwise_deconvolution(inp, w, b, base_axis, pad, stride, dilation, divisor)
[docs]@parametric_function_api("rnn", [ ('weight_l0', 'Filter weights at 0-th layer', '(D, H, I + H)', True), ('weight', 'Filter weights at 1-st layer and above', '(L-1, D, H, DH + H)', True), ('bias', 'Biases', '(L, D, H)', True), ]) def rnn(x, h, w0_init=None, w_init=None, b_init=None, num_layers=1, nonlinearity='tanh', dropout=0.0, bidirectional=False, training=True, rng=None, with_bias=True, fix_parameters=False): """N-Step RNN (recurrent neural networks). N-Step RNN function implements Elman RNN with nonlinearity to input sequence. N-Step RNN function is defined as following: .. math:: h_t = \\tanh(w_{ih}x_t+b_{ih}+w_{hh}h_{(t-1)}). We use the following notations to describe the inputs and outputs below. :math:`T`: sequcne length, :math:`B`: batch size, :math:`I`: input size, :math:`L`: number of layers, :math:`D`: number of directions, can be either 1 or 2, :math:`H`: hidden size. References: Jeffrey L. Elman. "Finding Structure in Time." Cognitive Science. 1990. Args: x (~nnabla.Variable): Input N-D array with shape :math:`(T, B, I)`. h (~nnabla.Variable): Input N-D array with shape :math:`(L, D, B, H)`. w0_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weight at the first layer. Shape is :math:`(D, H, I + H)`. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weights at the second layer and up. Shape is :math:`(L-1, D, H, D*H + H)`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for bias. Shape is :math:`(L, D, H)`. num_layers (int, optional): Number of layers in the network. If set to 1, only the weights for the first layer will be invoked. Default is 1. nonlinearity (str, optional): Type of nonlinearity applied to input sequcne. Must be either tanh or relu. Default is tanh. dropout (float, optional): Dropout ratio applied to parameters. Default is 0.0. bidirectional (bool, optional): If True, bidirectional computation will be performed in each layer. Default is False. training (bool, optional): Backpropagation will be performed only when it is true. Default is True. with_bias (bool, optional): Specify whether to include the bias term. Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(T, B, D * H)` ~nnabla.Variable: Output :math:`h_n` with shape :math:`(L, D, B, H)` Example: .. 
code-block:: python x = nn.Variable((seq_len, batch_size, input_size)) h = nn.Variable((num_layers, num_directions, batch_size, hidden_size)) y, hn = PF.rnn(x, h) """ input_size = x.shape[2] hidden_size = h.shape[3] num_layers = h.shape[0] num_directions = 2 if bidirectional else 1 if w0_init is None: w0_init_ih = UniformInitializer( calc_uniform_lim_glorot(input_size, hidden_size), rng) w0_init_ih = w0_init_ih((num_directions, hidden_size, input_size)) w0_init_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w0_init_hh = w0_init_hh((num_directions, hidden_size, hidden_size)) w0_init = np.concatenate((w0_init_ih, w0_init_hh), axis=2) if w_init is None: w_init_ih = UniformInitializer(calc_uniform_lim_glorot( num_directions*hidden_size, hidden_size), rng) w_init_ih = w_init_ih( (num_layers - 1, num_directions, hidden_size, num_directions*hidden_size)) w_init_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w_init_hh = w_init_hh( (num_layers - 1, num_directions, hidden_size, hidden_size)) w_init = np.concatenate((w_init_ih, w_init_hh), axis=3) if with_bias and b_init is None: b_init = ConstantInitializer() w0_shape = (num_directions, hidden_size, input_size + hidden_size) w0 = get_parameter_or_create( "weight_l0", w0_shape, w0_init, True, not fix_parameters) w = None if num_layers > 1: w_shape = (num_layers - 1, num_directions, hidden_size, num_directions * hidden_size + hidden_size) w = get_parameter_or_create( "weight", w_shape, w_init, True, not fix_parameters) b = None n_outmaps = (num_layers, num_directions, hidden_size) if with_bias: b = get_parameter_or_create( "bias", n_outmaps, b_init, True, not fix_parameters) return F.rnn(x, h, weight_l0=w0, weight=w, bias=b, num_layers=num_layers, nonlinearity=nonlinearity, dropout=dropout, bidirectional=bidirectional, training=training)
[docs]@parametric_function_api("lstm", [ ('weight_l0', 'Filter weights at 0-th layer', '(D, 4, H, I + H)', True), ('weight', 'Filter weights at 1-st layer and above', '(L-1, D, 4, H, DH + H)', True), ('bias', 'Biases', '(L, D, 4, H)', True), ]) def lstm(x, h, c, w0_init=None, w_init=None, b_init=None, num_layers=1, dropout=0.0, bidirectional=False, training=True, rng=None, with_bias=True, fix_parameters=False): """LSTM (long short-term memory). Long Short-Term Memory, or LSTM, is a building block for recurrent neural networks (RNN) layers. LSTM unit consists of a cell and input, output, forget gates whose functions are defined as following: .. math:: f_t&&=\\sigma(W_fx_t+U_fh_{t-1}+b_f) \\\\ i_t&&=\\sigma(W_ix_t+U_ih_{t-1}+b_i) \\\\ o_t&&=\\sigma(W_ox_t+U_oh_{t-1}+b_o) \\\\ c_t&&=f_t\\odot c_{t-1}+i_t\\odot\\tanh(W_cx_t+U_ch_{t-1}+b_c) \\\\ h_t&&=o_t\\odot\\tanh(c_t). We use the following notations to describe the inputs and outputs below. :math:`T`: sequcne length, :math:`B`: batch size, :math:`I`: input size, :math:`L`: number of layers, :math:`D`: number of directions, can be either 1 or 2, :math:`H`: hidden size. References: S. Hochreiter, and J. Schmidhuber. "Long Short-Term Memory." Neural Computation. 1997. Args: x (~nnabla.Variable): Input N-D array with shape :math:`(T, B, I)`. h (~nnabla.Variable): Input N-D array with shape :math:`(L, D, B, H)`. c (~nnabla.Variable): Input N-D array with shape :math:`(L, D, B, H)` . w0_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weight at the first layer. Shape is :math:`(D, 4, H, I + H)`. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weights at the second layer and up. Shape is :math:`(L-1, D, 4, H, D * H + H)`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for bias. Shape is :math:`(L, D, 4, H)`. num_layers (int, optional): Number of layers in the network. If set to 1, only the weights for the first layer will be invoked. Default is 1. dropout (float, optional): Dropout ratio applied to parameters. Default is 0.0. bidirectional (bool, optional): If True, bidirectional computation will be performed in each layer. Default is False. training (bool, optional): Backpropagation will be performed only when it is true. Default is True. with_bias (bool, optional): Specify whether to include the bias term. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(T, B, D * H)` ~nnabla.Variable: Output :math:`h_n` with shape :math:`(L, D, B, H)` ~nnabla.Variable: Output :math:`c_n` with shape :math:`(L, D, B, H)` Example: .. 
code-block:: python x = nn.Variable((seq_len, batch_size, input_size)) h = nn.Variable((num_layers, num_directions, batch_size, hidden_size)) c = nn.Variable((num_layers, num_directions, batch_size, hidden_size)) y, hn, cn = PF.lstm(x, h, c) """ if type(w0_init) == int: nn.logger.warning( "Arguments passed seem to be for previous LSTM function, which has been renamed to lstm_cell.") raise ValueError input_size = x.shape[2] hidden_size = h.shape[3] num_layers = h.shape[0] num_directions = 2 if bidirectional else 1 w0 = get_parameter('weight_l0') w = get_parameter('weight') b = get_parameter('bias') if w0 is None: if w0_init is None: w0_ih = UniformInitializer( calc_uniform_lim_glorot(input_size, hidden_size), rng) w0_ih = w0_ih((num_directions, 4, hidden_size, input_size)) w0_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w0_hh = w0_hh((num_directions, 4, hidden_size, hidden_size)) w0_init = np.concatenate((w0_ih, w0_hh), axis=3) w0_shape = (num_directions, 4, hidden_size, input_size + hidden_size) w0 = get_parameter_or_create( "weight_l0", w0_shape, w0_init, True, not fix_parameters) if num_layers > 1 and w is None: if w_init is None: w_ih = UniformInitializer(calc_uniform_lim_glorot( num_directions*hidden_size, hidden_size), rng) w_ih = w_ih( (num_layers - 1, num_directions, 4, hidden_size, num_directions*hidden_size)) w_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w_hh = w_hh( (num_layers - 1, num_directions, 4, hidden_size, hidden_size)) w_init = np.concatenate((w_ih, w_hh), axis=4) w_shape = (num_layers - 1, num_directions, 4, hidden_size, num_directions * hidden_size + hidden_size) w = get_parameter_or_create( "weight", w_shape, w_init, True, not fix_parameters) if with_bias and b is None: if b_init is None: b_init = ConstantInitializer() n_outmaps = (num_layers, num_directions, 4, hidden_size) b = get_parameter_or_create( "bias", n_outmaps, b_init, True, not fix_parameters) if w0.shape != (num_directions, 4, hidden_size, input_size+hidden_size): nn.logger.warning( "Parameters seem to have been saved prior to bug fix. It will be converted into the correct shape, but we highly recommend training again to obtain the correct parameters, as we will cease to support these parametetrs in future. We apologize for the inconveinences.") tmp = w0.d w0 = nn.Variable.from_numpy_array(np.reshape( tmp, (num_directions, 4, hidden_size, input_size + hidden_size)), need_grad=True) nn.set_parameter('weight_l0', w0) if num_layers > 1 and w.shape != (num_layers-1, num_directions, 4, hidden_size, num_directions*hidden_size + hidden_size): tmp = w.d ww = nn.Variable.from_numpy_array(np.reshape( tmp, (num_layers - 1, num_directions, 4, hidden_size, num_directions*hidden_size + hidden_size)), need_grad=True) nn.set_parameter('weight', w) w0 = w0.get_unlinked_variable(need_grad=not fix_parameters) if num_layers > 1: w = w.get_unlinked_variable(need_grad=not fix_parameters) if with_bias: b = b.get_unlinked_variable(need_grad=not fix_parameters) return F.lstm(x, h, c, weight_l0=w0, weight=w, bias=b, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, training=training)
[docs]@parametric_function_api("gru", [ ('weight_l0', 'Filter weights at 0-th layer', '(D, 3, H, I + H)', True), ('weight', 'Filter weights at 1-st layer and above', '(L-1, D, 3, H, DH + H)', True), ('bias', 'Biases', '(L, D, 4, H)', True), ]) def gru(x, h, w0_init=None, w_init=None, b_init=None, num_layers=1, dropout=0.0, bidirectional=False, training=True, rng=None, with_bias=True, fix_parameters=False): """GRU (gated recurrent units). GRU is defined as following: .. math:: r_t&&=\\sigma(W_rx_t+U_rh_{t-1}+b_r) \\\\ z_t&&=\\sigma(W_zx_t+U_zh_{t-1}+b_z) \\\\ n_t&&=\\tanh(W_nx_t+b_{in}+r_n \odot (U_nh_{t-1}+b_{hn})) \\\\ h_t&&=(1-z_t) \odot n_t+z_t \odot h_{t-1}. We use the following notations to describe the inputs and outputs below. :math:`T`: sequcne length, :math:`B`: batch size, :math:`I`: input size, :math:`L`: number of layers, :math:`D`: number of directions, can be either 1 or 2, :math:`H`: hidden size. References: K. Cho et al. "Learning Phrase Representations using RNN Encoder--Decoder for Statistical Machine Translation." Empirical Methods in Natural Language Processing. 2014. Args: x (~nnabla.Variable): Input N-D array with shape :math:`(T, B, I)`. h (~nnabla.Variable): Input N-D array with shape :math:`(L, D, B, H)`. w0_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weight at the first layer. Shape is :math:`(D, 3, H, I + H)`. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weights at the second layer and up. Shape is :math:`(L-1, D, 3, H, D * H + H)`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for bias. Shape is :math:`(L, D, 4, H)`. num_layers (int, optional): Number of layers in the network. If set to 1, only the weights for the first layer will be invoked. Default is 1. dropout (float, optional): Dropout ratio applied to parameters. Default is 0.0. bidirectional (bool, optional): If True, bidirectional computation will be performed in each layer. Default is False. training (bool, optional): Backpropagation will be performed only when it is true. Default is True. with_bias (bool, optional): Specify whether to include the bias term. Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(T, B, D * H)` ~nnabla.Variable: Output :math:`h_n` with shape :math:`(L, D, B, H)` Example: .. 
code-block:: python x = nn.Variable((seq_len, batch_size, input_size)) h = nn.Variable((num_layers, num_directions, batch_size, hidden_size)) y, hn = PF.gru(x, h) """ input_size = x.shape[2] hidden_size = h.shape[3] num_layers = h.shape[0] num_directions = 2 if bidirectional else 1 w0 = get_parameter('weight_l0') w = get_parameter('weight') b = get_parameter('bias') if w0 is None: if w0_init is None: w0_ih = UniformInitializer( calc_uniform_lim_glorot(input_size, hidden_size), rng) w0_ih = w0_ih((num_directions, 3, hidden_size, input_size)) w0_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w0_hh = w0_hh((num_directions, 3, hidden_size, hidden_size)) w0_init = np.concatenate((w0_ih, w0_hh), axis=3) w0_shape = (num_directions, 3, hidden_size, input_size + hidden_size) w0 = get_parameter_or_create( "weight_l0", w0_shape, w0_init, True, not fix_parameters) if num_layers > 1 and w is None: if w_init is None: w_ih = UniformInitializer(calc_uniform_lim_glorot( num_directions*hidden_size, hidden_size), rng) w_ih = w_ih( (num_layers - 1, num_directions, 3, hidden_size, num_directions*hidden_size)) w_hh = UniformInitializer( calc_uniform_lim_glorot(hidden_size, hidden_size), rng) w_hh = w_hh( (num_layers - 1, num_directions, 3, hidden_size, hidden_size)) w_init = np.concatenate((w_ih, w_hh), axis=4) w_shape = (num_layers - 1, num_directions, 3, hidden_size, num_directions * hidden_size + hidden_size) w = get_parameter_or_create( "weight", w_shape, w_init, True, not fix_parameters) if with_bias and b is None: if b_init is None: b_init = ConstantInitializer() n_outmaps = (num_layers, num_directions, 4, hidden_size) b = get_parameter_or_create( "bias", n_outmaps, b_init, True, not fix_parameters) if w0.shape != (num_directions, 3, hidden_size, input_size+hidden_size): nn.logger.warning( "Parameters seem to have been saved prior to bug fix. It will be converted into the correct shape, but we highly recommend training again to obtain the correct parameters, as we will cease to support these parametetrs in future. We apologize for the inconveinences.") tmp = w0.d w0 = nn.Variable.from_numpy_array(np.reshape( tmp, (num_directions, 3, hidden_size, input_size + hidden_size)), need_grad=True) nn.set_parameter('weight_l0', w0) if num_layers > 1 and w.shape != (num_layers-1, num_directions, 3, hidden_size, num_directions*hidden_size + hidden_size): tmp = w.d ww = nn.Variable.from_numpy_array(np.reshape( tmp, (num_layers - 1, num_directions, 3, hidden_size, num_directions*hidden_size + hidden_size)), need_grad=True) nn.set_parameter('weight', w) w0 = w0.get_unlinked_variable(need_grad=not fix_parameters) if num_layers > 1: w = w.get_unlinked_variable(need_grad=not fix_parameters) if with_bias: b = b.get_unlinked_variable(need_grad=not fix_parameters) return F.gru(x, h, weight_l0=w0, weight=w, bias=b, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, training=training)
[docs]@parametric_function_api("bn", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True), ('mean', 'Moving average of batch mean', '<see above>', False), ('var', 'Moving average of batch variance', '<see above>', False), ]) def fused_batch_normalization(inp, z=None, axes=[1], decay_rate=0.9, eps=1e-5, batch_stat=True, nonlinearity='relu', output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): """ Batch normalization layer fused with the following add2 operation of a residual input and an nonlinear activation. Args: inp (~nnabla.Variable): N-D array of input. z (~nnabla.Variable, optional): A residual input. By specifying None, the activation function will follow immediately after BN operation. axes (:obj:`tuple` of :obj:`int`): Mean and variance for each element in ``axes`` are calculated using elements on the rest axes. For example, if an input is 4 dimensions, and ``axes`` is ``[1]``, batch mean is calculated as ``np.mean(inp.d, axis=(0, 2, 3), keepdims=True)`` (using numpy expression as an example). decay_rate (float): Decay rate of running mean and variance. eps (float): Tiny value to avoid zero division by std. batch_stat (bool): Use mini-batch statistics rather than running ones. nonlinearity (string): Activation function. The default is 'relu'. output_stat (bool): Output batch mean and variance. fix_parameters (bool): When set to `True`, the beta and gamma will not be updated. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: :class:`~nnabla.Variable`: N-D array. """ from .normalization_functions import _init_beta_gamma shape_stat = [1 for _ in inp.shape] for i in range(len(axes)): shape_stat[axes[i]] = inp.shape[axes[i]] if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) mean_init = param_init.get('mean', ConstantInitializer(0)) var_init = param_init.get('var', ConstantInitializer(1)) mean = get_parameter_or_create( "mean", shape_stat, mean_init, False) var = get_parameter_or_create( "var", shape_stat, var_init, False) return F.fused_batch_normalization(inp, beta, gamma, mean, var, z, axes, decay_rate, eps, batch_stat, nonlinearity, output_stat)
[docs]@parametric_function_api("bn", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True), ('mean', 'Moving average of batch mean', '<see above>', False), ('var', 'Moving average of batch variance', '<see above>', False), ]) def batch_normalization(inp, axes=[1], decay_rate=0.9, eps=1e-5, batch_stat=True, output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): """ Batch normalization layer. .. math:: \\begin{array}{lcl} \\mu &=& \\frac{1}{M} \\sum x_i\\\\ \\sigma^2 &=& \\frac{1}{M} \\sum \\left(x_i - \\mu\\right)^2\\\\ \\hat{x}_i &=& \\frac{x_i - \\mu}{\\sqrt{\\sigma^2 + \\epsilon }}\\\\ y_i &= & \\hat{x}_i \\gamma + \\beta. \\end{array} where :math:`x_i, y_i` are the inputs. In testing, the mean and variance computed by moving average calculated during training are used. Args: inp (~nnabla.Variable): N-D array of input. axes (:obj:`tuple` of :obj:`int`): Mean and variance for each element in ``axes`` are calculated using elements on the rest axes. For example, if an input is 4 dimensions, and ``axes`` is ``[1]``, batch mean is calculated as ``np.mean(inp.d, axis=(0, 2, 3), keepdims=True)`` (using numpy expression as an example). decay_rate (float): Decay rate of running mean and variance. eps (float): Tiny value to avoid zero division by std. batch_stat (bool): Use mini-batch statistics rather than running ones. output_stat (bool): Output batch mean and variance. fix_parameters (bool): When set to `True`, the beta and gamma will not be updated. param_init (dict): Parameter initializers can be set with a dict. A key of the dict must be ``'beta'``, ``'gamma'``, ``'mean'`` or ``'var'``. A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'beta': ConstantInitializer(0), 'gamma': np.ones(gamma_shape) * 2}``. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: :class:`~nnabla.Variable`: N-D array. References: - Ioffe and Szegedy, Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. https://arxiv.org/abs/1502.03167 The shape of parameters has the same number of dimensions with the input data, and the shapes in ``axes`` has the same dimensions with the input, while the rest has ``1``. If an input is 4-dim and ``axes=[1]``, the parameter shape will be ``param_shape = np.mean(inp.d, axis=(0, 2, 3), keepdims=True).shape`` (using numpy expression as an example). """ from .normalization_functions import _init_beta_gamma shape_stat = [1 for _ in inp.shape] for i in range(len(axes)): shape_stat[axes[i]] = inp.shape[axes[i]] if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) mean_init = param_init.get('mean', ConstantInitializer(0)) var_init = param_init.get('var', ConstantInitializer(1)) mean = get_parameter_or_create( "mean", shape_stat, mean_init, False) var = get_parameter_or_create( "var", shape_stat, var_init, False) return F.batch_normalization(inp, beta, gamma, mean, var, axes, decay_rate, eps, batch_stat, output_stat)
[docs]@parametric_function_api("bn", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True), ('mean', 'Moving average of batch mean', '<see above>', False), ('var', 'Moving average of batch variance', '<see above>', False), ]) def sync_batch_normalization(inp, comm, group="world", axes=[1], decay_rate=0.9, eps=1e-5, batch_stat=True, output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): """ Synchronized batch normalization layer. For some tasks (e.g., semantic segmentation), batch size will be too small and BatchNormalization layer might not work well. SyncBatchNorlization layer solves these problems by synchronizing batch stats (mean and var) between multiple processes. .. math:: \\begin{array}{lcl} \\mu &=& \\frac{1}{M} \\sum x_i\\\\ \\sigma^2 &=& \\frac{1}{M} \\left(\\sum x_i - \\mu\\right)^2\\\\ \\hat{x}_i &=& \\frac{x_i - \\mu}{\\sqrt{\\sigma^2 + \\epsilon }}\\\\ y_i &= & \\hat{x}_i \\gamma + \\beta. \\end{array} where :math:`x_i, y_i` are the inputs. Args: inp (~nnabla.Variable): N-D array of input. comm (~nnabla.communicators.Communicator): The communicator group (string): The name of the communicator group axes (:obj:`tuple` of :obj:`int`): Mean and variance for each element in ``axes`` are calculated using elements on the rest axes. For example, if an input is 4 dimensions, and ``axes`` is ``[1]``, batch mean is calculated as ``np.mean(inp.d, axis=(0, 2, 3), keepdims=True)`` (using numpy expression as an example). decay_rate (float): Decay rate of running mean and variance. eps (float): Tiny value to avoid zero division by std. batch_stat (bool): Use mini-batch statistics rather than running ones. output_stat (bool): Output batch mean and variance. fix_parameters (bool): When set to `True`, the beta and gamma will not be updated. param_init (dict): Parameter initializers can be set with a dict. A key of the dict must be ``'beta'``, ``'gamma'``, ``'mean'`` or ``'var'``. A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'beta': ConstantInitializer(0), 'gamma': np.ones(gamma_shape) * 2}``. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: :class:`~nnabla.Variable`: N-D array. References: - Ioffe and Szegedy, Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, https://arxiv.org/abs/1502.03167 - Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi, Amit Agrawal, Context Encoding for Semantic Segmentation, https://arxiv.org/abs/1803.08904 - Implementing Synchronized Multi-GPU Batch Normalization https://hangzhang.org/PyTorch-Encoding/notes/syncbn.html The shape of parameters has the same number of dimensions with the input data, and the shapes in ``axes`` has the same dimensions with the input, while the rest has ``1``. If an input is 4-dim and ``axes=[1]``, the parameter shape will be ``param_shape = np.mean(inp.d, axis=(0, 2, 3), keepdims=True).shape`` (using numpy expression as an example). 
""" from .normalization_functions import _init_beta_gamma shape_stat = [1 for _ in inp.shape] for i in range(len(axes)): shape_stat[axes[i]] = inp.shape[axes[i]] if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) mean_init = param_init.get('mean', ConstantInitializer(0)) var_init = param_init.get('var', ConstantInitializer(1)) mean = get_parameter_or_create( "mean", shape_stat, mean_init, False) var = get_parameter_or_create( "var", shape_stat, var_init, False) return F.sync_batch_normalization(inp, beta, gamma, mean, var, comm, group, axes, decay_rate, eps, batch_stat, output_stat)
[docs]@parametric_function_api("mean_subtraction", [ ('mean', 'Moving average', 'inp.shape[base_axis:]', False), ('t', 'Minibatch counter used in forward pass', '(1,)', False), ]) def mean_subtraction(inp, base_axis=1, update_running_mean=True, fix_parameters=False): """ Mean subtraction layer. It subtracts the mean of the elements of the input array, and normalizes it to :math:`0`. Preprocessing arrays with this function has the effect of improving accuracy in various tasks such as image classification. At training time, this function is defined as .. math:: \\begin{array}{lcl} \\mu &=& \\frac{1}{M} \\sum x_i \\\\ y_i &=& x_i - \\mu \\end{array} At testing time, the mean values used are those that were computed during training by moving average. Note: The backward performs an approximated differentiation that takes into account only the latest mini-batch. Args: inp (~nnabla.Variable): N-D array of input. base_axis (int): Base axis of Mean Subtraction operation. Dimensions up to base_axis is treated as sample dimension. update_running_mean (bool): When set to `True`, the running mean will not be updated. fix_parameters (bool): dummy parameter. This argument dose not affect anything. Returns: ~nnabla.Variable: N-D array. """ assert len(inp.shape) >= base_axis shape = inp.shape[base_axis:] mean = get_parameter_or_create( "mean", shape, ConstantInitializer(0), False) t = get_parameter_or_create( "t", (1, ), ConstantInitializer(0), False) return F.mean_subtraction(inp, mean, t, base_axis=base_axis, update_running_mean=update_running_mean)
[docs]@parametric_function_api("layer_normalization", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True) ]) def layer_normalization(inp, batch_axis=0, eps=1e-05, output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): r""" Applies Layer Normalization over an input variable, which is defined as: .. math:: \begin{eqnarray} \mu^l &=& \frac{1}{H} \sum_{i=1}^{H} x_i^l \\ \sigma^l &=& \sqrt{\frac{1}{H} \sum_{i=1}^{H} \left(x_i^l - \mu^l\right)^2} \\ y &=& \frac{x - \mu^l}{\sigma^l + \epsilon} \gamma + \beta \end{eqnarray} where :math:`x` and :math:`y` are input and output variable, :math:`\mu^l` and :math:`\sigma^l` are the mean and std of each layer along batch axis, and :math:`\alpha` and :math:`\beta` are trainable parameter. .. note:: Unlike other normalization, which applies scalar scale and bias for each entire channel/plane, Layer Normalization applies per-element scale and bias. References: * `Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton, Layer Normalization. <https://arxiv.org/abs/1607.06450>`_ Args: inp (Variable): An input variable. batch_axis (int or repeated int): Axes mean and variance are taken. eps (float): Tiny value to avoid zero division by std. output_stat(bool): It `True`, calculated mean and variance are also returned. fix_parameters (bool): When set to `True`, the beta and gamma will not be updated. param_init (dict): Parameter initializers can be set with a dict. A key of the dict must be ``'gamma'``, ``'beta'``. A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'gamma': np.ones(...) * 2, 'beta': ConstantInitializer(0)}``. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: * :obj:`~nnabla.Variable`: Normalized output variable. * :obj:`~nnabla.Variable`: Mean (if `output_stat=True`). * :obj:`~nnabla.Variable`: Std (if `output_stat=True`) """ from nnabla.normalization_functions import _force_list, _init_beta_gamma batch_axis = _force_list(batch_axis) shape_stat = list(inp.shape) for baxis in batch_axis: shape_stat[baxis] = 1 if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) return F.layer_normalization(inp, beta, gamma, batch_axis=batch_axis, eps=eps, output_stat=output_stat)
[docs]@parametric_function_api("instance_normalization", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True), ]) def instance_normalization(inp, channel_axis=1, batch_axis=0, eps=1e-05, output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): r""" Applies Instance Normalization over an input variable, which is defined as: .. math:: \begin{eqnarray} \mu^i &=& \frac{1}{H} \sum_{i=1}^{H} x_i^i \\ \sigma^i &=& \sqrt{\frac{1}{H} \sum_{i=1}^{H} \left(x_i^i - \mu^i\right)^2} \\ y &=& \frac{x - \mu^i}{\sigma^ + \epsilon} \gamma + \beta \end{eqnarray} where :math:`x` and :math:`y` are input and output variable, :math:`\mu^i` and :math:`\sigma^i` are the mean and std of each instance which is separately calculated for each batch and channel, and :math:`\gamma` and :math:`\beta` are adaptive gains and biases. If the input shape is [B, C, H, W] (= channel_axis=1, batch_axis=0), the shape of calculated mean and std are [B, C, 1, 1] References: * `Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky, Instance Normalization: The Missing Ingredient for Fast Stylization. <https://arxiv.org/abs/1607.08022>`_ Args: inp (Variable): An input variable. channel_axis (int or repeated int): Channel axes. batch_axis (int or repeated int): Batch axes. eps (float): Tiny value to avoid zero division by std. output_stat(bool): It `True`, the batch statistics of mean and variance. fix_parameters (bool): If `True`, the beta and gamma will not be updated. param_init (dict): Parameter initializers can be set with a dict. A key of the dict must be ``'gamma'``, ``'beta'``. A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'gamma': np.ones(...) * 2, 'beta': ConstantInitializer(0)}``. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: * :obj:`~nnabla.Variable`: Normalized output variable. * :obj:`~nnabla.Variable`: Mean (if `output_stat=True`) * :obj:`~nnabla.Variable`: Std (if `output_stat=True`) """ from nnabla.normalization_functions import _init_beta_gamma shape_stat = [1 for _ in range(len(inp.shape))] shape_stat[channel_axis] = inp.shape[channel_axis] if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) return F.instance_normalization(inp, beta, gamma, channel_axis=channel_axis, batch_axis=batch_axis, eps=eps, output_stat=output_stat)
[docs]@parametric_function_api("group_normalization", [ ('beta', 'Trainable bias :math:`\\beta`', '<see above>', True), ('gamma', 'Trainable scaling factor :math:`\\gamma`', '<see above>', True), ]) def group_normalization(inp, num_groups, channel_axis=1, batch_axis=0, eps=1e-05, output_stat=False, fix_parameters=False, param_init=None, no_scale=False, no_bias=False): r""" Applies Group Normalization over an input tensor, which is defined as: .. math:: \begin{eqnarray} \mu^g &=& \frac{1}{H} \sum_{i=1}^{H} x_i^g \\ \sigma^g &=& \sqrt{\frac{1}{H} \sum_{i=1}^{H} \left(x_i^g - \mu^g\right)^2} \\ y &=& \frac{x - \mu^g}{\sigma^g + \epsilon} \gamma + \beta \end{eqnarray} where :math:`x` and :math:`y` are input and output variable, :math:`\mu^g` and :math:`\sigma^g` are the mean and std of each group which contains `num_channels / num_groups` channels, and :math:`\gamma` and :math:`\beta` are adaptive gains and biases. The input channels, specified by :attr:`channel_axis`, are separeted into :attr:`num_groups` groups, and the mean and std are calculated over the each group. For example, if the input shape is [B, C, H, W] (= channel_axis=1, batch_axis=0), an input variable is once reshaped to [B, num_groups, C / num_groups, H, W] and standardize by its mean and std whose shapes are [B, num_groups, C / num_groups, 1, 1]. Before returning, an output variable is reshaped again to the original input shape (= [B, C, H, W] in the case above). References: * `Yuxin Wu, Kaiming He, Group Normalization. <https://arxiv.org/abs/1803.08494>`_ Args: inp (Variable): An input variable. num_groups (int): A number of groups. The channel dim of 'x' must be integer multiple of `num_groups`. channel_axis (int): Channel axis. batch_axis (int or repeated int): Axes mean and variance are taken. eps (float): Tiny value to avoid zero division by std. output_stat(bool): It true, the batch statistics of mean and variance. fix_parameters (bool): When set to `True`, the beta and gamma will not be updated. param_init (dict): Parameter initializers can be set with a dict. A key of the dict must be ``'gamma'``, ``'beta'``. A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'gamma': np.ones(...) * 2, 'beta': ConstantInitializer(0)}``. no_scale (bool): If `True`, the scale term is omitted. no_bias (bool): If `True`, the bias term is omitted. Returns: * :obj:`~nnabla.Variable`: Normalized output variable. * :obj:`~nnabla.Variable`: Mean (if `output_stat=True`) * :obj:`~nnabla.Variable`: Std (if `output_stat=True`) """ from nnabla.normalization_functions import _force_list, _init_beta_gamma batch_axis = _force_list(batch_axis) shape_stat = [1 for _ in range(len(inp.shape))] shape_stat[channel_axis] = inp.shape[channel_axis] if param_init is None: param_init = {} beta, gamma = _init_beta_gamma( shape_stat, fix_parameters, param_init, no_bias, no_scale) # we dont have to broadcast beta and gamma here in this case because adaptive operation in bn is not used. return F.group_normalization(inp, beta, gamma, num_groups=num_groups, channel_axis=channel_axis, batch_axis=batch_axis, eps=eps, output_stat=output_stat)
[docs]@parametric_function_api("embed", [ ('W', 'Embedding matrix', '(n_inputs, n_features)', True), ]) def embed(inp, n_inputs, n_features, initializer=None, fix_parameters=False, apply_w=None): """ Embed. Embed slices a matrix/tensor with indexing array/tensor. Weights are initialized with :obj:`nnabla.initializer.UniformInitializer` within the range of :math:`-\\sqrt{3}` and :math:`\\sqrt{3}`. Args: x(~nnabla.Variable): [Integer] Indices with shape :math:`(I_0, ..., I_N)` n_inputs : number of possible inputs, words or vocabraries n_features : number of embedding features fix_parameters (bool): When set to `True`, the embedding weight matrix will not be updated. apply_w (function): Lambda, function, or callable object applied to the weights. Returns: ~nnabla.Variable: Output with shape :math:`(I_0, ..., I_N, W_1, ..., W_M)` """ if not initializer: initializer = UniformInitializer((-np.sqrt(3.), np.sqrt(3))) w = get_parameter_or_create("W", [n_inputs, n_features], initializer, True, not fix_parameters) if apply_w is not None: w = apply_w(w) return F.embed(inp, w)
[docs]@parametric_function_api("prelu", [ ('slope', 'Negative slope', 'tuple() if shared else (inp.shape[base_axis],)', True), ]) def prelu(inp, base_axis=1, shared=True, fix_parameters=False, slope_init=None): """ Parametrized Rectified Linear Unit function defined as .. math:: y_i = \max(0, x_i) + w_i \min(0, x_i) where negative slope :math:`w` is learned and can vary across channels (an axis specified with base_axis). Weights are initialized with :math:`-1`. Args: x(~nnabla.Variable): N-D array as input base_axis(int): Dimensions up to base_axis is treated as sample dimension. shared(bool): Use shared weight value or not fix_parameters (bool): When set to `True`, the negative slope values will not be updated. slope_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer of negative slopes. By default, they are initialized with `0.25`. Returns: ~nnabla.Variable: N-D array. """ shape = tuple() if shared else (inp.shape[base_axis],) if slope_init is None: slope_init = ConstantInitializer(0.25) w = get_parameter_or_create("slope", shape, slope_init, True, not fix_parameters) return F.prelu(inp, w, base_axis)
[docs]@parametric_function_api("fp_quantized_affine", [ ('W', 'Weight matrix in float', '(inmaps, outmaps)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(inmaps, outmaps)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def fixed_point_quantized_affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, sign_w=True, n_w=8, delta_w=2**-4, ste_fine_grained_w=True, quantize_b=True, sign_b=True, n_b=8, delta_b=2**-4, ste_fine_grained_b=True): """Fixed-Point Quantized Affine. Fixed-Point Quantized Affine is the affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_j = \sum_{i} Q(w_{ji}) x_i, where :math:`Q(w_{ji})` is the fixed-point quantization function. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. sign_w (bool): Use signed quantization if `True`. n_w (int): Bit width used for weight. delta_w (float): Step size for weight. ste_fine_grained_w (bool): STE is fine-grained if `True`. quantize_b (bool): Quantize bias if `True`. n_b (int): Bit width used for bias. delta_w (float): Step size for bias. ste_fine_grained_b (bool): STE is fine-grained if `True`. Returns: :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. 
(:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`) """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: inmaps = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, False) # Link computation graph real_w_q = F.fixed_point_quantize(w, quantize=quantize_w, sign=sign_w, n=n_w, delta=delta_w, ste_fine_grained=ste_fine_grained_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", n_outmaps, b_init, False) # Link computation graph real_b_q = F.fixed_point_quantize(b, quantize=quantize_b, sign=sign_b, n=n_b, delta=delta_b, ste_fine_grained=ste_fine_grained_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.affine(inp, real_w_q, real_b_q, base_axis)
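# Usage sketch (illustrative addition, not part of the original module):
# sizes and quantization settings are assumptions. As noted in the docstring,
# call forward() once before reading 'W_q'/'b_q' so that the float and
# quantized weights are in sync.
def _example_fixed_point_quantized_affine():
    x = nn.Variable((8, 128))
    y = fixed_point_quantized_affine(x, 10, n_w=8, delta_w=2 ** -4,
                                     name='fpq_affine1')
    return y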
[docs]@parametric_function_api("fp_quantized_conv", [ ('W', 'Filter weights in float', '(outmaps, inmaps // group, *kernel)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(outmaps, inmaps // group, *kernel)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def fixed_point_quantized_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, channel_last=False, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, sign_w=True, n_w=8, delta_w=2**-4, ste_fine_grained_w=True, quantize_b=True, sign_b=True, n_b=8, delta_b=2**-4, ste_fine_grained_b=True,): """Fixed-Point Quantized Convolution. Fixed-Point Quantized Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} Q(w_{n, m, i, j}) x_{m, a + i, b + j}, where :math:`Q(w_{n, m, i, j})` is the fixed-point quantization function. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. quantize_bias (bool): Quantize bias if `True`. sign_w (bool): Use signed quantization if `True`. n_w (int): Bit width used for weight. delta_w (float): Step size for weight. ste_fine_grained_w (bool): STE is fine-grained if `True`. 
quantize_b (bool): Quantize bias if `True`. n_b (int): Bit width used for bias. delta_w (float): Step size for bias. ste_fine_grained_b (bool): STE is fine-grained if `True`. Returns: :class:`~nnabla.Variable`: N-D array. """ if channel_last: channels = inp.shape[-1] filter_shape = tuple(kernel) + (channels // group,) else: channels = inp.shape[base_axis] filter_shape = (channels // group,) + tuple(kernel) if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(channels, outmaps, tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel), w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", (outmaps,) + filter_shape, w_init, False) # Link computation graph real_w_q = F.fixed_point_quantize(w, quantize=quantize_w, sign=sign_w, n=n_w, delta=delta_w, ste_fine_grained=ste_fine_grained_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", (outmaps,), b_init, False) # Link computation graph real_b_q = F.fixed_point_quantize(b, quantize=quantize_b, sign=sign_b, n=n_b, delta=delta_b, ste_fine_grained=ste_fine_grained_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.convolution(inp, real_w_q, real_b_q, base_axis, pad, stride, dilation, group)
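# Usage sketch (illustrative addition, not part of the original module):
# shapes and quantization settings are assumptions. The float weight 'W' is
# trained, while 'W_q' holds the simulated fixed-point values.
def _example_fixed_point_quantized_convolution():
    x = nn.Variable((4, 3, 32, 32))
    y = fixed_point_quantized_convolution(x, 16, (3, 3), pad=(1, 1),
                                          n_w=8, delta_w=2 ** -4,
                                          name='fpq_conv1')
    return y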
[docs]@parametric_function_api("pow2_quantized_affine", [ ('W', 'Weight matrix in float', '(inmaps, outmaps)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(inmaps, outmaps)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def pow2_quantized_affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, sign_w=True, with_zero_w=False, n_w=8, m_w=2, ste_fine_grained_w=True, quantize_b=True, sign_b=True, with_zero_b=False, n_b=8, m_b=2, ste_fine_grained_b=True): """Pow2 Quantized Affine. Pow2 Quantized Affine is the affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_j = \sum_{i} Q(w_{ji}) x_i, where :math:`Q(w_{ji})` is the power-of-2 quantization function. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) Quantized values are stored as floating point number for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. sign_w (bool): Use signed quantization if `True`. with_zero_w (bool): Indicate using zero as a quantized value. Default is false. n_w (int): Bit width used for weight. m_w (int): :math:`2^m` is upper bound and :math:`-2^m` is lower bound for weights. Default is 2. ste_fine_grained_w (bool): STE is fine-grained if `True`. quantize_b (bool): Quantize bias if `True`. with_zero_b (bool): Indicate using zero as a quantized value. Default is false. n_b (int): Bit width used for bias. m_b (int): :math:`2^m` is upper bound and :math:`-2^m` is lower bound for bias. Default is 2. ste_fine_grained_b (bool): STE is fine-grained if `True`. Returns: :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. 
(:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`) """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: inmaps = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, False) # Link computation graph real_w_q = F.pow2_quantize(w, quantize=quantize_w, sign=sign_w, with_zero=with_zero_w, n=n_w, m=m_w, ste_fine_grained=ste_fine_grained_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", n_outmaps, b_init, False) real_b_q = F.pow2_quantize(b, quantize=quantize_b, sign=sign_b, with_zero=with_zero_b, n=n_b, m=m_b, ste_fine_grained=ste_fine_grained_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.affine(inp, real_w_q, real_b_q, base_axis)
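# Usage sketch (illustrative addition, not part of the original module):
# sizes and quantization settings are assumptions. Weights are quantized to
# signed powers of two bounded by 2**m_w in the forward pass.
def _example_pow2_quantized_affine():
    x = nn.Variable((8, 128))
    y = pow2_quantized_affine(x, 10, n_w=8, m_w=2, name='p2q_affine1')
    return y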
[docs]@parametric_function_api("pow2_quantized_conv", [ ('W', 'Filter weights in float', '(outmaps, inmaps // group, *kernel)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(outmaps, inmaps // group, *kernel)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def pow2_quantized_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, with_zero_w=False, sign_w=True, n_w=8, m_w=2, ste_fine_grained_w=True, quantize_b=True, with_zero_b=False, sign_b=True, n_b=8, m_b=2, ste_fine_grained_b=True,): """Pow2 Quantized Convolution. Pow2 Quantized Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} Q(w_{n, m, i, j}) x_{m, a + i, b + j}, where :math:`Q(w_{n, m, i, j})` is the power-of-2 quantization function. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) Quantized values are stored as floating point number for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. sign_w (bool): Use signed quantization if `True`. n_w (int): Bit width used for weight. m_w (int): :math:`2^m` is upper bound and :math:`-2^m` is lower bound for weights. Default is 2. ste_fine_grained_w (bool): STE is fine-grained if `True`. quantize_b (bool): Quantize bias if `True`. 
sign_b (bool): Use signed quantization if `True`. n_b (int): Bit width used for bias. m_b (int): :math:`2^m` is upper bound and :math:`-2^m` is lower bound for bias. Default is 2. ste_fine_grained_b (bool): STE is fine-grained if `True`. Returns: :class:`~nnabla.Variable`: N-D array. """ if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(inp.shape[base_axis], outmaps, tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", (outmaps, inp.shape[base_axis] // group) + tuple(kernel), w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", (outmaps, inp.shape[base_axis] // group) + tuple(kernel), w_init, False) # Link computation graph real_w_q = F.pow2_quantize(w, quantize=quantize_w, sign=sign_w, with_zero=with_zero_w, n=n_w, m=m_w, ste_fine_grained=ste_fine_grained_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", (outmaps,), b_init, False) # Link computation graph real_b_q = F.pow2_quantize(b, quantize=quantize_b, sign=sign_b, with_zero=with_zero_b, n=n_b, m=m_b, ste_fine_grained=ste_fine_grained_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.convolution(inp, real_w_q, real_b_q, base_axis, pad, stride, dilation, group)
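As a usage illustration (not part of the library source), the following sketch builds a single power-of-2 quantized convolution layer; the parameter-scope name ``"p2q_conv"`` and the input shape are arbitrary choices. After ``forward()``, the float weights ``W``/``b`` and the quantized copies ``W_q``/``b_q`` are in sync, as described in the note above.

.. code-block:: python

    import numpy as np
    import nnabla as nn
    import nnabla.parametric_functions as PF

    nn.clear_parameters()
    x = nn.Variable.from_numpy_array(
        np.random.randn(4, 3, 32, 32).astype(np.float32))
    # 16 output maps, 3x3 kernel, 8-bit signed pow2 quantization of W and b
    y = PF.pow2_quantized_convolution(x, 16, (3, 3), pad=(1, 1),
                                      n_w=8, m_w=2, name="p2q_conv")
    y.forward()  # also fills W_q / b_q with the quantized values of W / b
    print(list(nn.get_parameters(grad_only=False).keys()))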
[docs]@parametric_function_api("pruned_affine", [ ('W', 'Weight matrix in float', '(inmaps, outmaps)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Qunatized weights', '(inmaps, outmaps)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def pruned_affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True, prune_w=True, rate_w=0.9, prune_b=True, rate_b=0.9): """Pruned Affine. Pruned Affine is the affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_j = \sum_{i} Q(w_{ji}) x_i, where :math:`Q(w_{ji})` is the pruning function, i.e., `F.prune`. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \\times \ldots \\times M_{B-1} \\times D_B \\times \ldots \\times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. prune_w (bool): Quantize weights if `True`. rate_w (float): Pruning rate for weights. prune_b (bool): Quantize bias if `True`. rate_b (float): Pruning rate for bias. Returns: :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. 
(:math:`M_0 \\times \ldots \\times M_{B-1} \\times L`) """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: inmaps = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, True, not fix_parameters) # sparse Weight if prune_w: w_q = get_parameter_or_create( "W_q", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, False) # Link computation graph real_w_q = F.prune(w, rate=rate_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating real_b_q = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, True, not fix_parameters) if prune_b: b_q = get_parameter_or_create( "b_q", n_outmaps, b_init, False) # Link computation graph real_b_q = F.prune(b, rate=rate_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.affine(inp, real_w_q, real_b_q, base_axis)
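A minimal sketch of how this function might be called (the layer name and shapes are arbitrary). With the default ``rate_w=0.9``, roughly the smallest 90% of the weight magnitudes are zeroed in ``W_q`` by ``F.prune`` whenever the graph is executed.

.. code-block:: python

    import numpy as np
    import nnabla as nn
    import nnabla.parametric_functions as PF

    nn.clear_parameters()
    x = nn.Variable.from_numpy_array(
        np.random.randn(8, 100).astype(np.float32))
    y = PF.pruned_affine(x, 10, rate_w=0.9, rate_b=0.5, name="pruned_fc")
    y.forward()  # W_q now holds the sparsified copy of W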
[docs]@parametric_function_api("pruned_conv", [ ('W', 'Filter weights in float', '(outmaps, inmaps // group, *kernel)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Qunatized weights', '(outmaps, inmaps // group, *kernel)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ]) def pruned_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, channel_last=False, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, prune_w=True, rate_w=0.9, prune_b=True, rate_b=0.9): """Pruned Convolution. Pruned Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} Q(w_{n, m, i, j}) x_{m, a + i, b + j}, where :math:`Q(w_{ji})` is the pruning function, i.e., `F.prune`. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. prune_w (bool): Quantize weights if `True`. rate_w (float): Pruning rate for weights. prune_b (bool): Quantize bias if `True`. rate_b (float): Pruning rate for bias. Returns: :class:`~nnabla.Variable`: N-D array. 
""" if channel_last: channels = inp.shape[-1] filter_shape = tuple(kernel) + (channels // group,) else: channels = inp.shape[base_axis] filter_shape = (channels // group,) + tuple(kernel) if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(channels, outmaps, tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() # Floating Weight w = get_parameter_or_create( "W", (outmaps,) + filter_shape, w_init, True, not fix_parameters) # Quantized Weight if prune_w: w_q = get_parameter_or_create( "W_q", (outmaps, inp.shape[base_axis] // group) + tuple(kernel), w_init, False) # Link computation graph real_w_q = F.prune(w, rate=rate_w, outputs=[w_q.data]) real_w_q.persistent = True else: real_w_q = w # Bias # Floating real_b_q = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, True, not fix_parameters) if prune_b: b_q = get_parameter_or_create( "b_q", (outmaps,), b_init, False) # Link computation graph real_b_q = F.prune(b, rate=rate_b, outputs=[b_q.data]) real_b_q.persistent = True else: real_b_q = b return F.convolution(inp, real_w_q, real_b_q, base_axis, pad, stride, dilation, group, channel_last)
[docs]@parametric_function_api("min_max_quantize", [ ('qr_min', 'Minimum quantization range, the exponential movining average of min values of inputs initialized with -6.0 if ema is True', 'ql_min.shape', False), ('qr_max', 'Maximum quantization range, the exponential movining average of max values of inputs initialized with 6.0 if ema is True', 'ql_max.shape', False), ]) def min_max_quantize(x, ql_min=0, ql_max=255, decay=0.999, x_min_max=False, ema=False, ste_fine_grained=True, eps=0.01, qr_min_init=None, qr_max_init=None, fix_parameters=False, outputs=None): r"""Min-max quantization. This function uniformly quantizes values in the range of min and max quantization levels. Min-max quantization is defined as the following equation .. math:: y = round \left(\frac{\min(\max(x, m), M) - m}{scale} \right) \times scale + m, where the :math:`scale` is defined as .. math:: scale = \frac{M - m}{M_q - m_q}, and .. math:: m_q = ql_{min}, \\ M_q = ql_{max}, \\ m = qr_{min}, \\ M = qr_{max}. In the backward pass when using `ste_fine_grained` as false, .. math:: \frac{\partial q_i}{\partial x_i} = 1. In the backward pass when using `ste_fine_grained` as true, .. math:: \frac{\partial q_i}{\partial x_i}= \left\{ \begin{array}{ll} 0 & if \ \ \ x_i > M \\ 1 & if \ \ m \le x_i \le M \\ 0 & if \ \ x_i < m \\ \end{array} \right.. :math:`qr_{min}` and :math:`qr_{max}` are treaded as follows. * `x_min_max` is `True` and `ema` is `True`: Exponential moving average are computed for each :math:`min(x)` and :math:`max(x)` then stored in :math:`qr_{min}` and :math:`qr_{max}`. * `x_min_max` is `True` and `ema` is `False`: :math:`min(x)` and :math:`max(x)` are computed then stored in :math:`qr_{min}` and :math:`qr_{max}`. * `x_min_max` is `False` and `ema` is `True`: Exponential moving average stored in :math:`qr_{min}` and :math:`qr_{max}` are used. * `x_min_max` is `False` and `ema` is `False` Gradients of :math:`qr_{min}` and :math:`qr_{max}` are computed in the backward pass. More precisely, in inference of the min-max quantization, one has to consider *zero-point (zp)* which corresponds to the real value 0, and its data type is an integer. *zero-point* is defined as .. math:: && zp_f = ql_{min} -\frac{qr_{min}}{scale}, \\ && zp = \left\{ \begin{array}{ll} ql_{max} & if \ \ \ zp_f >= ql_{max} \\ round(zp_f) & if \ \ otherwise \\ ql_{min} & if \ \ zp_f <= ql_{min} \\ \end{array} \right.. Accordingly, in order to simulate quantization effect of *zero-point*, during both forward and backward pass, :math:`qr_{min}` and :math:`qr_{max}` are adjusted as follows, .. math:: qr_{min}^{adj} = ql_{min} - zp * scale, \\ qr_{max}^{adj} = ql_{max} - zp * scale. These operations are often called *nudge*. Finally, in the formulas of the min-max quantization, :math:`m` and :math:`M` are replaced by :math:`qr_{min}^{adj}` and :math:`qr_{max}^{adj}` respectively. Args: x (~nnabla.Variable): Input N-D array. ql_min (int, float, or ~nnabla.Variable): Minimum quantization level. Default is 0. ql_max (int, float, or ~nnabla.Variable): Maximum quantization level. Default is 255. decay (float): The decay rate for the exponential moving average. x_min_max (bool): Use the min and max of x to compute quantization ranges. Default is `False`. ema (bool): Use the exponential moving average for the min and max quantization ranges. Default is `False`. ste_fine_grained (bool): If true, STE is not 1, the {0, 1}-mask computed from the min-max is applied to the gradient in the backward; otherwise, STE is 1. 
eps (float): Epsilon, or small value to ensure :math:`qr_{max} - qr_{min}` must be greater than the epsilon for both weights and bias. qr_min_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the minimum quantization range, qr_min. Default is :obj:`nnabla.initializer.ConstantInitializer` (-6.0). qr_max_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the maximum quantization range, qr_max Default is :obj:`nnabla.initializer.ConstantInitializer` (6.0). fix_parameters (bool): When set to `True`, the weights and biases will not be updated. References: Benoit Jacob, Skirmantas Kligys, Bo Chen, Menglong Zhu, Matthew Tang, Andrew Howard, Hartwig Adam, and Dmitry Kalenichenko, "Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877 """ # ql_min and ql_max if isinstance(ql_min, (int, float)): reshape = [1 for _ in range(x.ndim)] ql_min = np.array(ql_min).reshape(reshape) ql_min = get_parameter_or_create( "ql_min", reshape, ql_min, False, False) if isinstance(ql_max, (int, float)): reshape = [1 for _ in range(x.ndim)] ql_max = np.array(ql_max).reshape(reshape) ql_max = get_parameter_or_create( "ql_max", reshape, ql_max, False, False) # qr_min and qr_max qr_min_init = qr_min_init if qr_min_init else ConstantInitializer(-6.0) qr_max_init = qr_max_init if qr_max_init else ConstantInitializer(6.0) shape = ql_min.shape qr_min = get_parameter_or_create( "qr_min", shape, qr_min_init, not (x_min_max and ema), not fix_parameters) qr_max = get_parameter_or_create( "qr_max", shape, qr_max_init, not (x_min_max and ema), not fix_parameters) x_q = F.min_max_quantize(x, qr_min, qr_max, ql_min, ql_max, decay, x_min_max, ema, ste_fine_grained, eps, outputs=outputs) return x_q
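For illustration, a small sketch of typical activation quantization (the name ``"act_quant"`` and shapes are arbitrary): during training the running range is tracked with the exponential moving average, and at inference the stored range is reused by calling the function again under the same name scope.

.. code-block:: python

    import numpy as np
    import nnabla as nn
    import nnabla.parametric_functions as PF

    nn.clear_parameters()
    h = nn.Variable.from_numpy_array(
        np.random.randn(4, 16, 8, 8).astype(np.float32))
    # Training: update qr_min / qr_max from min(x) / max(x) via EMA
    h_q = PF.min_max_quantize(h, ql_min=0, ql_max=255,
                              x_min_max=True, ema=True, name="act_quant")
    h_q.forward()
    # Inference: reuse the stored EMA ranges (same name scope)
    h_q_test = PF.min_max_quantize(h, x_min_max=False, ema=True,
                                   name="act_quant")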
[docs]@parametric_function_api("min_max_quantized_affine", [ ('W', 'Weight matrix in float', '(inmaps, outmaps)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(inmaps, outmaps)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ('qr_min', 'Minimum quantization range. Minimum values of inputs or trainable range.', 'ql_min.shape', False), ('qr_max', 'Maximum quantization range. Maximum values of inputs or trainable range.', 'ql_max.shape', False) ]) def min_max_quantized_affine(inp, n_outmaps, base_axis=1, w_init=None, b_init=None, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, ql_min_w=0, ql_max_w=255, w_min_max=False, qr_min_w_init=None, qr_max_w_init=None, ste_fine_grained_w=True, quantize_b=True, ql_min_b=0, ql_max_b=255, b_min_max=False, qr_min_b_init=None, qr_max_b_init=None, ste_fine_grained_b=True, eps=0.01): r"""Min-max Quantized Affine. Min-max Quantized Affine is the affine function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_j = \sum_{i} Q(w_{ji}) x_i, where :math:`Q(w_{ji})` is the min-max quantization function. In the min_max_quantized affine, the exponential moving average is not used. the min and max quantization ranges are either the min-max of weights and bias or trained. Notice that the min and max values of inputs are always used instead of the exponential moving average. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): Input N-D array with shape (:math:`M_0 \times \ldots \times M_{B-1} \times D_B \times \ldots \times D_N`). Dimensions before and after base_axis are flattened as if it is a matrix. n_outmaps (:obj:`int` or :obj:`tuple` of :obj:`int`): Number of output neurons per data. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. ql_min_w (int, float, or ~nnabla.Variable): Minimum quantization level for weights. Default is 0. ql_max_w (int, float, or ~nnabla.Variable): Maximum quantization level for weights. Default is 255. w_min_max (bool): Use the min and max of weights to compute quantization ranges. Default is `False`. 
qr_min_w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the minimum quantization range, qr_min. Default is :obj:`nnabla.initializer.ConstantInitializer` (-2.0). qr_max_w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the maximum quantization range, qr_max. Default is :obj:`nnabla.initializer.ConstantInitializer` (2.0). ste_fine_grained_w (bool): If true, STE is not 1, the {0, 1}-mask computed from the min-max is applied to the gradient in the backward; otherwise, STE is 1. quantize_b (bool): Quantize bias if `True`. ql_min_b (int, float, or ~nnabla.Variable): Minimum quantization level for bias. Default is 0. ql_max_b (int, float, or ~nnabla.Variable): Maximum quantization level for bias. Default is 255. b_min_max (bool): Use the min and max of bias to compute quantization ranges. Default is `False`. qr_min_b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the minimum quantization range, qr_min. Default is :obj:`nnabla.initializer.ConstantInitializer` (-6.0). qr_max_b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the maximum quantization range, qr_max. Default is :obj:`nnabla.initializer.ConstantInitializer` (6.0). ste_fine_grained_b (bool): If true, STE is not 1, the {0, 1}-mask computed from the min-max is applied to the gradient in the backward; otherwise, STE is 1. eps (float): Epsilon, or small value to ensure :math:`qr_{max} - qr_{min}` must be greater than the epsilon for both weights and bias. Returns: :class:`~nnabla.Variable`: :math:`(B + 1)`-D array. (:math:`M_0 \times \ldots \times M_{B-1} \times L`) """ if not hasattr(n_outmaps, '__iter__'): n_outmaps = [n_outmaps] n_outmaps = list(n_outmaps) n_outmap = int(np.prod(n_outmaps)) if w_init is None: inmaps = np.prod(inp.shape[base_axis:]) w_init = UniformInitializer( calc_uniform_lim_glorot(inmaps, n_outmap), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() if qr_min_w_init is None: qr_min_w_init = ConstantInitializer(-2.0) if qr_max_w_init is None: qr_max_w_init = ConstantInitializer(2.0) if qr_min_b_init is None: qr_min_b_init = ConstantInitializer(-6.0) if qr_max_b_init is None: qr_max_b_init = ConstantInitializer(6.0) # Floating Weight w = get_parameter_or_create( "W", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", [int(np.prod(inp.shape[base_axis:]))] + n_outmaps, w_init, False) # Link computation graph real_w_q = min_max_quantize(w, ql_min_w, ql_max_w, 0.999, w_min_max, False, qr_min_init=qr_min_w_init, qr_max_init=qr_max_w_init, ste_fine_grained=ste_fine_grained_w, eps=eps, fix_parameters=fix_parameters, outputs=[w_q.data], name="min_max_quantize_w") real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", n_outmaps, b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", n_outmaps, b_init, False) # Link computation graph real_b_q = min_max_quantize(b, ql_min_b, ql_max_b, 0.999, b_min_max, False, qr_min_init=qr_min_b_init, qr_max_init=qr_max_b_init, ste_fine_grained=ste_fine_grained_b, eps=eps, fix_parameters=fix_parameters, outputs=[b_q.data], name="min_max_quantize_b") real_b_q.persistent = True else: real_b_q = b return F.affine(inp, real_w_q, real_b_q, base_axis)
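For illustration, a minimal sketch (arbitrary layer name and shapes) where the weight range is taken from the min/max of the weights and the bias range is left trainable:

.. code-block:: python

    import nnabla as nn
    import nnabla.parametric_functions as PF

    x = nn.Variable((8, 64))
    y = PF.min_max_quantized_affine(x, 10,
                                    ql_min_w=0, ql_max_w=255, w_min_max=True,
                                    ql_min_b=0, ql_max_b=255, b_min_max=False,
                                    name="mmq_fc")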
[docs]@parametric_function_api("min_max_quantized_conv", [ ('W', 'Filter weights in float', '(outmaps, inmaps // group, *kernel)', True), ('b', 'Bias vector in float', '(outmaps,)', True), ('W_q', 'Quantized weights', '(outmaps, inmaps // group, *kernel)', False), ('b_q', 'Quantized biases', '(outmaps,)', False), ('qr_min', 'Minimum quantization range. Minimum values of inputs or trainable range.', 'ql_min.shape', False), ('qr_max', 'Maximum quantization range. Maximum values of inputs or trainable range.', 'ql_max.shape', False) ]) def min_max_quantized_convolution(inp, outmaps, kernel, pad=None, stride=None, dilation=None, group=1, channel_last=False, w_init=None, b_init=None, base_axis=1, fix_parameters=False, rng=None, with_bias=True, quantize_w=True, ql_min_w=0, ql_max_w=255, w_min_max=False, qr_min_w_init=None, qr_max_w_init=None, ste_fine_grained_w=True, quantize_b=True, ql_min_b=0, ql_max_b=255, b_min_max=False, qr_min_b_init=None, qr_max_b_init=None, ste_fine_grained_b=True, eps=0.01): r"""Min-max Quantized Convolution. Min-max Quantized Convolution is the convolution function, except the definition of the inner product is modified. The input-output relation of this function is as follows: .. math:: y_{n, a, b} = \sum_{m} \sum_{i} \sum_{j} Q(w_{n, m, i, j}) x_{m, a + i, b + j}, where :math:`Q(w_{n, m, i, j})` is the min-max quantization function. In the min_max_quantized convolution, the exponential moving average is not used. the min and max quantization ranges are either the min-max of weights and bias or trained. Notice that the min and max values of inputs are always used instead of the exponential moving average. .. note:: 1) if you would like to share weights between some layers, please make sure to share the standard, floating value weights (`weight`) and not the quantized weights (`quantized weight`) 2) The weights and the quantized weights become synced only after :func:`~nnabla._variable.Variable.forward` is called, and not after a call to :func:`~nnabla._variable.Variable.backward`. To access the parameters of the network, remember to call :func:`~nnabla._variable.Variable.forward` once before doing so, otherwise the float weights and the quantized weights will not be in sync. 3) CPU and GPU implementations now use float value for `quantized weight`, since this function is only for simulation purposes. Args: inp (~nnabla.Variable): N-D array. outmaps (int): Number of convolution kernels (which is equal to the number of output channels). For example, to apply convolution on an input with 16 types of filters, specify 16. kernel (:obj:`tuple` of :obj:`int`): Convolution kernel size. For example, to apply convolution on an image with a 3 (height) by 5 (width) two-dimensional kernel, specify (3,5). pad (:obj:`tuple` of :obj:`int`): Padding sizes for dimensions. stride (:obj:`tuple` of :obj:`int`): Stride sizes for dimensions. dilation (:obj:`tuple` of :obj:`int`): Dilation sizes for dimensions. group (int): Number of groups of channels. This makes connections across channels more sparse by grouping connections along map direction. channel_last (bool): If True, the last dimension is considered as channel dimension, a.k.a. NHWC order. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. 
b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. base_axis (int): Dimensions up to `base_axis` are treated as the sample dimensions. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. rng (numpy.random.RandomState): Random generator for Initializer. with_bias (bool): Specify whether to include the bias term. quantize_w (bool): Quantize weights if `True`. ql_min_w (int, float, or ~nnabla.Variable): Minimum quantization level for weights. Default is 0. ql_max_w (int, float, or ~nnabla.Variable): Maximum quantization level for weights. Default is 255. w_min_max (bool): Use the min and max of weights to compute quantization ranges. Default is `False`. qr_min_w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the minimum quantization range, qr_min. Default is :obj:`nnabla.initializer.ConstantInitializer` (-2.0). qr_max_w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the maximum quantization range, qr_max Default is :obj:`nnabla.initializer.ConstantInitializer` (2.0). ste_fine_grained_w (bool): If true, STE is not 1, the {0, 1}-mask computed from the min-max is applied to the gradient in the backward; otherwise, STE is 1. quantize_b (bool): Quantize bias if `True`. ql_min_b (int, float, or ~nnabla.Variable): Minimum quantization level for bias. Default is 0. ql_max_b (int, float, or ~nnabla.Variable): Maximum quantization level for bias. Default is 255. b_min_max (bool): Use the min and max of bias to compute quantization ranges. Default is `False`. qr_min_b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the minimum quantization range, qr_min. Default is :obj:`nnabla.initializer.ConstantInitializer` (-6.0). qr_max_b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the maximum quantization range, qr_max Default is :obj:`nnabla.initializer.ConstantInitializer` (6.0). ste_fine_grained_b (bool): If true, STE is not 1, the {0, 1}-mask computed from the min-max is applied to the gradient in the backward; otherwise, STE is 1. eps (float): Epsilon, or small value to ensure :math:`qr_{max} - qr_{min}` must be greater than the epsilon for both weights and bias. Returns: :class:`~nnabla.Variable`: N-D array. 
""" if channel_last: channels = inp.shape[-1] filter_shape = tuple(kernel) + (channels // group,) else: channels = inp.shape[base_axis] filter_shape = (channels // group,) + tuple(kernel) if w_init is None: w_init = UniformInitializer( calc_uniform_lim_glorot(channels, outmaps, tuple(kernel)), rng=rng) if with_bias and b_init is None: b_init = ConstantInitializer() if qr_min_w_init is None: qr_min_w_init = ConstantInitializer(-2.0) if qr_max_w_init is None: qr_max_w_init = ConstantInitializer(2.0) if qr_min_b_init is None: qr_min_b_init = ConstantInitializer(-6.0) if qr_max_b_init is None: qr_max_b_init = ConstantInitializer(6.0) # Floating Weight w = get_parameter_or_create( "W", (outmaps,) + filter_shape, w_init, True, not fix_parameters) # Quantized Weight if quantize_w: w_q = get_parameter_or_create( "W_q", (outmaps,) + filter_shape, w_init, False) # Link computation graph real_w_q = min_max_quantize(w, ql_min_w, ql_max_w, 0.999, w_min_max, False, qr_min_init=qr_min_w_init, qr_max_init=qr_max_w_init, ste_fine_grained=ste_fine_grained_w, eps=eps, fix_parameters=fix_parameters, outputs=[w_q.data], name="min_max_quantize_w") real_w_q.persistent = True else: real_w_q = w # Bias # Floating b = None b_q = None real_b_q = None if with_bias: b = get_parameter_or_create( "b", (outmaps,), b_init, True, not fix_parameters) if quantize_b: b_q = get_parameter_or_create( "b_q", (outmaps,), b_init, False) # Link computation graph real_b_q = min_max_quantize(b, ql_min_b, ql_max_b, 0.999, b_min_max, False, qr_min_init=qr_min_b_init, qr_max_init=qr_max_b_init, ste_fine_grained=ste_fine_grained_b, eps=eps, fix_parameters=fix_parameters, outputs=[b_q.data], name="min_max_quantize_b") real_b_q.persistent = True else: real_b_q = b return F.convolution(inp, real_w_q, real_b_q, base_axis, pad, stride, dilation, group, channel_last)
[docs]@parametric_function_api("lstm", [ ('affine/W', 'Stacked weight matrixes of LSTM block', '(inmaps, 4, state_size)', True), ('affine/b', 'Stacked bias vectors of LSTM block', '(4, state_size,)', True), ]) def lstm_cell(x, h, c, state_size, w_init=None, b_init=None, fix_parameters=False): """Long Short-Term Memory. Long Short-Term Memory, or LSTM, is a building block for recurrent neural networks (RNN) layers. LSTM unit consists of a cell and input, output, forget gates whose functions are defined as following: .. math:: f_t&&=\\sigma(W_fx_t+U_fh_{t-1}+b_f) \\\\ i_t&&=\\sigma(W_ix_t+U_ih_{t-1}+b_i) \\\\ o_t&&=\\sigma(W_ox_t+U_oh_{t-1}+b_o) \\\\ c_t&&=f_t\\odot c_{t-1}+i_t\\odot\\tanh(W_cx_t+U_ch_{t-1}+b_c) \\\\ h_t&&=o_t\\odot\\tanh(c_t). References: S. Hochreiter, and J. Schmidhuber. "Long Short-Term Memory." Neural Computation. 1997. Args: x (~nnabla.Variable): Input N-D array with shape (batch_size, input_size). h (~nnabla.Variable): Input N-D array with shape (batch_size, state_size). c (~nnabla.Variable): Input N-D array with shape (batch_size, state_size). state_size (int): Internal state size is set to `state_size`. w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. Returns: :class:`~nnabla.Variable` """ xh = F.concatenate(*(x, h), axis=1) iofc = affine(xh, (4, state_size), w_init=w_init, b_init=b_init, fix_parameters=fix_parameters) i_t, o_t, f_t, gate = F.split(iofc, axis=1) c_t = F.sigmoid(f_t) * c + F.sigmoid(i_t) * F.tanh(gate) h_t = F.sigmoid(o_t) * F.tanh(c_t) return h_t, c_t
[docs]class LSTMCell: def __init__(self, batch_size, state_size, h=None, c=None, name=None): """ Initializes an LSTM cell. Args: batch_size (int): Internal batch size is set to `batch_size`. state_size (int): Internal state size is set to `state_size`. h (~nnabla.Variable): Input N-D array with shape (batch_size, state_size). If not specified, it is initialized to zero by default. c (~nnabla.Variable): Input N-D array with shape (batch_size, state_size). If not specified, it is initialized to zero by default. name (str): Name for this LSTM Cell. """ self.batch_size = batch_size self.state_size = state_size self.name = name if h: # when user defines h self.h = h else: self.h = nn.Variable((self.batch_size, self.state_size)) self.h.data.zero() if c: # when user defines c self.c = c else: self.c = nn.Variable((self.batch_size, self.state_size)) self.c.data.zero() def reset_state(self): """ Resets states h and c to zero. """ self.h.data.zero() self.c.data.zero()
[docs] def __call__(self, x, w_init=None, b_init=None, fix_parameters=False): """ Updates h and c by calling lstm function. Args: x (~nnabla.Variable): Input N-D array with shape (batch_size, input_size). w_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for weight. By default, it is initialized with :obj:`nnabla.initializer.UniformInitializer` within the range determined by :obj:`nnabla.initializer.calc_uniform_lim_glorot`. b_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`, optional): Initializer for bias. By default, it is initialized with zeros if `with_bias` is `True`. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. """ self.h, self.c = lstm_cell( x, self.h, self.c, self.state_size, w_init, b_init, fix_parameters=fix_parameters, name=self.name) return self.h
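A sketch of unrolling the cell over several time steps (arbitrary names and shapes); because every call runs under the same name scope, the underlying affine parameters are shared across steps.

.. code-block:: python

    import nnabla as nn
    import nnabla.parametric_functions as PF

    cell = PF.LSTMCell(batch_size=4, state_size=32, name="lstm0")
    xs = [nn.Variable((4, 16)) for _ in range(5)]
    hs = [cell(x) for x in xs]  # hs[-1] is the last hidden state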
[docs]@parametric_function_api("spectral-norm", [ ('u', 'singular vector', '(w.shape[dim], )', False), ]) def spectral_norm(w, dim=0, itr=1, eps=1e-12, test=False, u_init=None, fix_parameters=True): """Spectral Normalization. .. math:: W_{sn} = \\frac{W}{\\sigma(W)}. where :math:`W` is the input matrix, and the :math:`\\sigma(W)` is the spectral norm of :math:`W`. The spectral norm is approximately computed by the power iteration. References: Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida, "Spectral Normalization for Generative Adversarial Networks", International Conference on Learning Representations. 2018. Args: W (~nnabla.Variable): Input N-D array with shape. This is normally network parameter. dim (`int`): Output dimension. Default is 0. If the dimension is not 0, then the specified dimension becomes the most-left dimension by transposing. itr (`int`): Number of iterations. Default is 1. eps (`float`): Epsilon for the normalization. Default is 1e-12. test (`bool`): Use test mode. Default is False. Returns: ~nnabla.Variable: Spectrally normalized :math:`W_{sn}` with the same shape as :math:`W`. Example: .. code-block:: python import nnabla as nn import nnabla.parametric_functions as PF b, c, h, w = 4, 64, 32, 32 # Spectrally normalized convolution apply_w = lambda w: PF.spectral_norm(w, dim=0) h = nn.Variable.from_numpy_array(np.random.randn(b, c, h, w)) h = PF.convolution(h, with_bias=False, apply_w=apply_w) # Spectrally normalized affine apply_w = lambda w: PF.spectral_norm(w, dim=1) h = nn.Variable.from_numpy_array(np.random.randn(b, c)) h = PF.affine(h, with_bias=False, apply_w=apply_w) # Spectrally normalized embed apply_w = lambda w: PF.spectral_norm(w, dim=1) h = nn.Variable.from_numpy_array(np.random.randn(b, c)) h = PF.embed(h, c, apply_w=apply_w) """ assert (0 <= dim < len(w.shape) ), "`dim` must be `0 <= dim and dim < len(w.shape)`." if u_init is None: u_init = NormalInitializer() u_shape = (w.shape[dim],) u = get_parameter_or_create("u", u_shape, u_init, False, False) return F.spectral_norm(w, u, dim=dim, itr=itr, eps=eps, test=test)
@parametric_function_api("spectral-norm", [ ('W_sn', 'Spectral Normalized Weight matrix', 'w.shape', False), ('u', 'singular vector', '(w.shape[dim], )', False), ]) def _spectral_norm_v1(w, dim=0, itr=1, eps=1e-12, test=False, u_init=None, fix_parameters=True): """Spectral Normalization. .. math:: W_{sn} = \\frac{W}{\\sigma(W)}. where :math:`W` is the input matrix, and the :math:`\\sigma(W)` is the spectral norm of :math:`W`. The spectral norm is approximately computed by the power iteration. References: Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida, "Spectral Normalization for Generative Adversarial Networks", International Conference on Learning Representations. 2018. Args: W (~nnabla.Variable): Input N-D array with shape. This is normally network parameter. dim (`int`): Output dimension. Default is 0. If the dimension is not 0, then the specified dimension becomes the most-left dimension by transposing. itr (`int`): Number of iterations. Default is 1. eps (`float`): Epsilon for the normalization. Default is 1e-12. test (`bool`): Use test mode. Default is False. Returns: ~nnabla.Variable: Spectrally normalized :math:`W_{sn}` with the same shape as :math:`W`. Example: .. code-block:: python import nnabla as nn import nnabla.parametric_functions as PF b, c, h, w = 4, 64, 32, 32 # Spectrally normalized convolution apply_w = lambda w: PF.spectral_norm(w, dim=0) h = nn.Variable.from_numpy_array(np.random.randn(b, c, h, w)) h = PF.convolution(h, with_bias=False, apply_w=apply_w) # Spectrally normalized affine apply_w = lambda w: PF.spectral_norm(w, dim=1) h = nn.Variable.from_numpy_array(np.random.randn(b, c)) h = PF.affine(h, with_bias=False, apply_w=apply_w) # Spectrally normalized embed apply_w = lambda w: PF.spectral_norm(w, dim=1) h = nn.Variable.from_numpy_array(np.random.randn(b, c)) h = PF.embed(h, c, apply_w=apply_w) """ assert (0 <= dim < len(w.shape) ), "`dim` must be `0 <= dim and dim < len(w.shape)`." assert 0 < itr, "`itr` must be greater than 0." assert 0 < eps, "`eps` must be greater than 0." if dim == len(w.shape) - 1: w_sn = _spectral_norm_outer_most_dim(w, dim=dim, itr=itr, eps=eps, test=test, u_init=u_init, fix_parameters=fix_parameters) else: w_sn = _spectral_norm(w, dim=dim, itr=itr, eps=eps, test=test, u_init=u_init, fix_parameters=fix_parameters) return w_sn def _spectral_norm(w, dim=0, itr=1, eps=1e-12, test=False, u_init=None, fix_parameters=True): # Use the original shape for W_sn w_shape = w.shape W_sn = get_parameter_or_create( "W_sn", w_shape, ConstantInitializer(0), False) # Transpose if the output dimension is not the most-left dimension. if dim != 0: dims_transpose = [dim] + [i for i in range(len(w_shape)) if i != dim] w = F.transpose(w, dims_transpose) w_shape = w.shape d0 = w.shape[0] # Out d1 = np.prod(w.shape[1:]) # In w = F.reshape(w, [d0, d1]) if u_init is None: u_init = NormalInitializer() u0 = get_parameter_or_create("u", [d0], u_init, False, False) u = F.reshape(u0, [1, d0]) # Ensure both parameters (W_sn and u) exist when the test is called fast. 
if test: return W_sn # Power method for _ in range(itr): # v v = F.affine(u, w) v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5) v = F.reshape(v, [d1, 1]) # u u = F.affine(w, v) u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5) u = F.reshape(u, [1, d0]) # Iterate u = F.identity(u, outputs=[u0.data]) u.persistent = True # No grad u.need_grad = False v.need_grad = False # Spectral normalization wv = F.affine(w, v) sigma = F.affine(u, wv) w_sn = w / sigma w_sn = F.reshape(w_sn, w_shape) # Transpose again if the output dimension is not the most-left dimension. if dim != 0: dims_transpose = [i for i in range(1, dim + 1)] \ + [0] + [i for i in range(dim + 1, len(w_shape))] w_sn = F.transpose(w_sn, dims_transpose) w_sn = F.identity(w_sn, outputs=[W_sn.data]) w_sn.persistent = True return w_sn def _spectral_norm_outer_most_dim(w, dim, itr=1, eps=1e-12, test=False, u_init=None, fix_parameters=True): w_shape = w.shape W_sn = get_parameter_or_create( "W_sn", w.shape, ConstantInitializer(0), False, False) d0 = np.prod(w.shape[0:-1]) # In d1 = w.shape[-1] # Out w = F.reshape(w, [d0, d1], inplace=False) if u_init is None: u_init = NormalInitializer() u0 = get_parameter_or_create("u", [d1], u_init, False, False) u = F.reshape(u0, [d1, 1]) # Ensure both parameters (W_sn and u) exist when the test is called fast. if test: return W_sn # Power method for _ in range(itr): # v v = F.affine(w, u) v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5) v = F.reshape(v, [1, d0]) # u u = F.affine(v, w) u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5) u = F.reshape(u, [d1, 1]) # Iterate u = F.identity(u, outputs=[u0.data]) u.persistent = True # No grad u.need_grad = False v.need_grad = False # Spectral normalization wv = F.affine(v, w) sigma = F.affine(wv, u) w_sn = w / sigma w_sn = F.reshape(w_sn, w_shape) w_sn = F.identity(w_sn, outputs=[W_sn.data]) w_sn.persistent = True return w_sn
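To make the power iteration above concrete, here is a plain NumPy reference sketch (not the nnabla implementation) of the same procedure for a 2-D weight with ``dim=0``: one normalized matrix-vector product pair per iteration, followed by the Rayleigh-quotient estimate of the largest singular value.

.. code-block:: python

    import numpy as np

    def spectral_norm_ref(w, itr=1, eps=1e-12, rng=None):
        """Approximate w / sigma(w) for a 2-D array w of shape (d0, d1)."""
        rng = rng or np.random.RandomState(0)
        d0, d1 = w.shape
        u = rng.randn(d0)
        for _ in range(itr):
            v = w.T @ u                           # v <- W^T u
            v = v / np.sqrt((v ** 2).sum() + eps)
            u = w @ v                             # u <- W v
            u = u / np.sqrt((u ** 2).sum() + eps)
        sigma = u @ w @ v                         # ~ largest singular value
        return w / sigma

    w = np.random.randn(64, 128).astype(np.float32)
    w_sn = spectral_norm_ref(w, itr=5)
    print(np.linalg.norm(w_sn, ord=2))  # close to 1.0 after enough iterations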
[docs]@parametric_function_api("wn", [ ('g', 'Weight Normalization adaptive scale scalar.', 'w.shape[dim]', True), ]) def weight_normalization(w, dim=0, eps=1e-12, g_init=None, fix_parameters=False): """Weight Normalization. .. math:: \mathbf{w}_{WN} = g \dfrac{\mathbf{w}}{\|\mathbf{w}\|} where :math:`\mathbf{w}` is the input weights to be normalized, and :math:`g` is learnable multiplication factors each of which is applied to each input weights at `dim`. This function is in general used as callback passed to apply_w for PF.convolution, PF.affine and so on. According to the author`s `original implementation <https://github.com/TimSalimans/weight_norm>`_, :math:`v` should be initialized by :math:`N(0, 0.05)`. To meet this condition, initializer should be passed to convolution which Weight Normalization is applied, like an example below. References: * `Tim Salimans, Diederik P. Kingma, Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks. <https://arxiv.org/abs/1602.07868>`_ Args: W (~nnabla.Variable): Input N-D array with shape. This is normally network parameter. dim (`int`): Output dimension. Default is 0. If the dimension is not 0, then the specified dimension becomes the most-left dimension by transposing. eps (`float`): Epsilon for the normalization. Default is 1e-12. g_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for the scale. By default, L2-norm of weights corresponding to `dim` are used. Returns: ~nnabla.Variable: :math:`W` with the same shape as :math:`v`. Example: .. code-block:: python import nnabla as nn import nnabla.parametric_functions as PF import nnabla.initializer as I # h is nn.Variable. # convolution # according to the original implementation, w should be initialized by N(0, 0.05). h = PF.convolution(h, ..., apply_w=PF.weight_normalization, w_init=I.NormalInitializer(0.05)) # affine h = PF.affine(h, ..., apply_w=lambda w: PF.weight_normalization(w, dim=1), w_init=I.NormalInitializer(0.05)) .. warning:: Up to the version 1.10.0, this had been implemented as the composite functions. """ outmaps = w.shape[dim] if g_init is None: g_init = WeightNormalizationScaleInitializer(w, dim, eps) g = get_parameter_or_create("g", (outmaps,), initializer=g_init, need_grad=True, as_need_grad=not fix_parameters) return F.weight_normalization(w, g, dim, eps)
@parametric_function_api("wn", [ ('g', 'Weight Normalization adaptive scale scalar.', 'w.shape[dim]', True), ]) def _weight_normalization_v1(v, dim=0, eps=1e-12, fix_parameters=False): """Weight Normalization. This functions is of the composite functions. It takes a lots of memories since the intermediate results are stored as a part of the computation graph. .. math:: \mathbf{w} = g \dfrac{\mathbf{v}}{\|\mathbf{v}\|} where :math:`v` is the input matrix, and :math:`g` is learnable multiplication factors each of which is applied to each output map at `dim`. This function is in general used as callback passed to apply_w for PF.convolution, PF.affine and so on. According to the author`s original implementation (https://github.com/TimSalimans/weight_norm), :math:`v` should be initialized by :math:`N(0, 0.05)`. To meet this condition, initializer should be passed to convolution which Weight Normalization is applied, like an example below. References: * `Tim Salimans, Diederik P. Kingma, Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks. <https://arxiv.org/abs/1602.07868>`_ Args: W (~nnabla.Variable): Input N-D array with shape. This is normally network parameter. dim (`int`): Output dimension. Default is 0. If the dimension is not 0, then the specified dimension becomes the most-left dimension by transposing. eps (`float`): Epsilon for the normalization. Default is 1e-12. Returns: ~nnabla.Variable: :math:`W` with the same shape as :math:`v`. Example: .. code-block:: python import nnabla as nn import nnabla.parametric_functions as PF import nnabla.initializer as I # h is nn.Variable. # convolution # according to the original implementation, w should be initialized by N(0, 0.05). h = PF.convolution(h, ..., apply_w=PF.weight_normalization, w_init=I.NormalInitializer(0.05)) # affine h = PF.affine(h, ..., apply_w=lambda w: PF.weight_normalization(w, dim=1), w_init=I.NormalInitializer(0.05)) """ assert - \ len(v.shape) <= dim < len( v.shape), "`dim` must be `-len(w.shape) <= dim < len(w.shape)`." assert 0 < eps, "`eps` must be greater than 0." # consider w as v. outmaps = v.shape[dim] g = get_parameter_or_create("g", (outmaps,), initializer=ConstantInitializer(1.), need_grad=True, as_need_grad=not fix_parameters) sh = tuple([outmaps if i == dim else 1 for i in range(len(v.shape))]) ax = tuple([i for i in range(len(v.shape)) if i != dim]) normalized_v = v / (F.sum(v ** 2, axis=ax, keepdims=True) + eps) ** 0.5 return F.reshape(g, sh) * normalized_v
[docs]@parametric_function_api("multi_head_attention", [ ('q_weight', 'weights for query', '(E, E)', True), ('k_weight', 'weights for key', '(E_k, E)', True), ('v_weight', 'weights for value', '(E_v, E)', True), ('out_weight', 'weigths for out projection', '(E, E)', True), ('q_bias', 'bias for query', '(E, )', True), ('k_bias', 'bias for key', '(E, )', True), ('v_bias', 'bais for value', '(E, )', True), ('out_bias', 'bias for out projection', '(E, )', True), ('attn_bias_k', 'attnetion bias for k', '(E, 1)', True), ('attn_bias_v', 'attnetion bias for v', '(E, 1)', True), ]) def multi_head_attention(query, key, value, num_heads=12, dropout=0.0, k_embed_dim=None, v_embed_dim=None, out_dim=None, rng=None, with_bias=True, add_attn_bias=False, additive_mask=None, key_padding_mask=None, fix_parameters=False, param_init=None): '''MultiHeadAttention. Computes multi-headed attention with query, key, and value. We use the following notations to describe the inputs and outputs below. :math:`L_T`: target sequence length, :math:`L_S`: source sequence length, :math:`B`: batch size, :math:`D`: input dimension, :math:`E`: embedding dimension. References: A. Vaswani et al. "Attention is All You Need." NIPS. 2017. <https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf> Example: .. code-block:: python q = nn.Variable((tgt_len, batch_size, q_input_dim)) k = nn.Variable((src_len, batch_size, k_input_dim)) v = nn.Variable((src_len, batch_size, v_input_dim)) out, w = PF.multi_head_attention(q, k, v) out.forward() Args: query (~nnabla.Variable): Input N-D array with shape :math:`(L_T, B, D_q)`. key (~nnabla.Variable): Input N-D array with shape :math:`(L_S, B, D_k)`. value (~nnabla.Variable): Input N-D array with shape :math:`(L_S, B, D_v)`. num_heads (int, optional): Number of attention heads. Note that embedding dimensoin E must be divisible by the number of heads. Default is 12 which is conventional. dropout (float, optional): Dropout ratio applied to parameters. Default is 0. k_embed_dim (int, optional): Embedding dimension for key. If specified, embedding dimensions for both query and key are set as that value. Otherwise, k_embed_dim is set as the same alue as embedding dimension for query. v_embed_dim (int, optional): Embedding dimension for value. If not specified, it is defaulted as the same value as embedding dimension for query. out_dim (int, optional): Embedding dimension for output weight. If not spefied, it is defaulted as the same value as embedding dimension for value. rng (numpy.random.RandomState, optional): Random generator for Initializer. Default is None. with_bias (bool, optional): Specify whether to include the bias parameters. Default is True. add_attn_bias (bool, optional): Specify whether to add attention bias parameters for key and value. Default is False. additive_mask (~nnabla.Variable, optional): Input N-D array with shape :math:`(L_T, L_S)`. Values will be added to the attention layer to prevent attention to certain positions. key_padding_mask (~nnabla.Variable, optional): Input N-D array with shape :math:`(B, L_S)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. fix_parameters (bool, optional): When set to `True`, the weights and biases will not be updated. Default is False. param_init (dict, optional): Parameter initializers can be set with a dict. Possible keys of the dict include q_weight, k_weight, v_weight, q_bias, k_bias, v_bias, out_weight, out_bias, attn_bias_k, attn_bias_v. 
A value of the dict must be an :obj:`~nnabla.initializer.Initializer` or a :obj:`numpy.ndarray`. E.g. ``{'q_bias': ConstantInitializer(0)}``. Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(L_T, B, E)` ~nnabla.Variable: Output :math:`h_n` with shape :math:`(B, L_T, L_S)` ''' if k_embed_dim is None: q_embed_dim = k_embed_dim = query.shape[2] else: q_embed_dim = k_embed_dim if v_embed_dim is None: v_embed_dim = value.shape[2] if out_dim == None: out_dim = v_embed_dim if param_init is None: param_init = {} q_weight = param_init.get('q_weight', UniformInitializer( calc_uniform_lim_glorot(query.shape[2], q_embed_dim), rng)) k_weight = param_init.get('k_weight', UniformInitializer( calc_uniform_lim_glorot(key.shape[2], k_embed_dim), rng)) v_weight = param_init.get('v_weight', UniformInitializer( calc_uniform_lim_glorot(value.shape[2], v_embed_dim), rng)) qw = get_parameter_or_create( "q_weight", (query.shape[2], q_embed_dim), q_weight, True, not fix_parameters) kw = get_parameter_or_create( "k_weight", (key.shape[2], k_embed_dim), k_weight, True, not fix_parameters) vw = get_parameter_or_create( "v_weight", (value.shape[2], v_embed_dim), v_weight, True, not fix_parameters) out_weight = param_init.get('out_weight', UniformInitializer( calc_uniform_lim_glorot(v_embed_dim, out_dim), rng)) ow = get_parameter_or_create("out_weight", ( v_embed_dim, out_dim), out_weight, True, not fix_parameters) qb = kb = vb = ob = None if with_bias: q_bias = param_init.get('q_bias', ConstantInitializer()) k_bias = param_init.get('k_bias', ConstantInitializer()) v_bias = param_init.get('v_bias', ConstantInitializer()) out_bias = param_init.get('out_bias', ConstantInitializer()) qb = get_parameter_or_create( "q_bias", (q_embed_dim, ), q_bias, True, not fix_parameters) kb = get_parameter_or_create( "k_bias", (k_embed_dim, ), k_bias, True, not fix_parameters) vb = get_parameter_or_create( "v_bias", (v_embed_dim, ), v_bias, True, not fix_parameters) ob = get_parameter_or_create( "out_bias", (out_dim, ), out_bias, True, not fix_parameters) abk = abv = None if add_attn_bias: attn_bias_k = param_init.get('attn_bias_k', UniformInitializer( calc_uniform_lim_glorot(k_embed_dim, 1), rng)) attn_bias_v = param_init.get('attn_bias_v', UniformInitializer( calc_uniform_lim_glorot(v_embed_dim, 1), rng)) abk = get_parameter_or_create( "attn_bias_k", (1, 1, k_embed_dim), attn_bias_k, True, not fix_parameters) abv = get_parameter_or_create( "attn_bias_v", (1, 1, v_embed_dim), attn_bias_v, True, not fix_parameters) return F.multi_head_attention(query, key, value, num_heads, qw, kw, vw, ow, qb, kb, vb, ob, abk, abv, dropout, additive_mask=additive_mask, key_padding_mask=key_padding_mask)
[docs]@parametric_function_api("transformer", [ ('encoder{layer#}', 'parameters for the n\'th encoder layer', 'Refer to transformer_encode for details', True), ('decoder{layer#}', 'parameters for the n\'th decoder layer', 'Refer to transformer_decode for details', True), ]) def transformer(src, tgt, embed_dim=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation=None, src_additive_mask=None, tgt_additive_mask=None, memory_additive_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None, rng=None, add_attn_bias=False, fix_parameters=False): r"""Transformer. We use the following notations to describe the inputs and outputs below. :math:`L_T`: target sequence length, :math:`L_S`: source sequence length, :math:`B`: batch size, :math:`E`: embedding dimension. References: A. Vaswani et al. "Attention is All You Need." NIPS. 2017. <https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf> Examples: .. code-block:: python src = nn.Variable((src_len, batch_size, embed_dim),need_grad=True) tgt = nn.Variable((tgt_len, batch_size, embed_dim),need_grad=True) out = PF.transformer(src, tgt, num_heads=16, num_encoder_layers=12) out.forward() Args: src (~nnabla.Variable): Input source sequence to the encoder with shape:math:`(L_S, B, E)`. tgt (~nnabla.Variable): Input target sequence to the decoder with shape :math:`(L_T, B, E)`. embed_dim (int, optional): Embedding dimension to be used. Default is 512. num_heads (int, optional): Number of attention heads. Default is 12. num_encoder_layers (int, optional): Number of encoder layers to stack. Default is 6. num_decoder_layers (int, optional): Number of decoder layers to stack. Default is 6. dim_feedforward (int, optional): Dimension of the feedforward network model. Default is 2048. dropout (float, optional): Dropout ratio applied. Default is 0.1. activation (function, optional): Non-linear activation function to be used. Default is None, which is set as F.relu in the code. src_additive_mask (~nnabla.Variable, optional): Additive mask for the src sequence (optional). :math:`(L_S, L_S)`. tgt_additive_mask (~nnabla.Variable, optional): Additive mask for the tgt sequence (optional). :math:`(L_T, L_T)`. memory_additive_mask (~nnabla.Variable, optional): Additive mask for the encoder output (optional). :math:`(L_T, L_S)`. src_key_padding_mask (~nnabla.Variable, optional): Key padding mask for src keys per batch (optional). :math:`(B, L_S)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. tgt_key_padding_mask (~nnabla.Variable, optional): Key padding mask for tgt keys per batch (optional). :math:`(B, L_T)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. memory_key_padding_mask (~nnabla.Variable, optional): Key padding mask for memory keys per batch (optional). :math:`(B, L_S)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. rng (numpy.random.RandomState, optional): Random generator for Initializer. Default is None. add_attn_bias (bool, optional): Specify whether to add attention bias parameters for key and value. Default is False. fix_parameters (bool, optional): When set to `True`, the weights and biases will not be updated. Default is False. 
Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(L_T, B, E)` """ assert src.shape[1] == tgt.shape[1], "the batch number of source and target must be equal" assert src.shape[2] == embed_dim and tgt.shape[2] == embed_dim, "the feature dimension of source and target must be equal to embed_dim" memory = src for i in range(num_encoder_layers): memory = transformer_encode( memory, embed_dim=embed_dim, num_heads=num_heads, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, src_additive_mask=src_additive_mask, src_key_padding_mask=src_key_padding_mask, rng=rng, add_attn_bias=add_attn_bias, fix_parameters=fix_parameters, name='encoder{:02d}'.format(i)) output = tgt for i in range(num_decoder_layers): output = transformer_decode(output, memory, embed_dim=embed_dim, num_heads=num_heads, dim_feedforward=dim_feedforward, dropout=dropout, activation=activation, tgt_additive_mask=tgt_additive_mask, memory_additive_mask=memory_additive_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask, rng=rng, add_attn_bias=add_attn_bias, fix_parameters=fix_parameters, name='decoder{:02d}'.format(i)) return output
[docs]@parametric_function_api("transformer_encode", [ ('src_self_attn', 'self-attention parameters for source sequence', 'Refer to multi_head_attention for details', True), ('enc_affine1', 'first affine used in encoder', 'Refer to affine for details', True), ('enc_affine2', 'second affine used in encoder', 'Refer to affine for details', True), ('enc_layer_norm1', 'fist layer normalization used in encoder', 'Refer to layer_normalization for details', True), ('enc_layer_norm2', 'second layer normalization used in encoder', 'Refer to layer_normalization for details', True), ]) def transformer_encode(src, embed_dim, num_heads, dim_feedforward=2048, dropout=0.1, activation=None, src_additive_mask=None, src_key_padding_mask=None, rng=None, add_attn_bias=False, fix_parameters=False): r"""Transformer Encoder. Args: src (~nnabla.Variable): Input sequnce to the encoder layer with shape :math:`(L_S, B, E)`. embed_dim (int): Number of embedding dimension. num_heads (int): Number of attention heads. dim_feedforward (int, optional): Dimension of the feedforward network model. Default is 2048. dropout (float, optional): Dropout ratio. Default is 0.1. activation (function, optional): Non-linear activation function to be used. Default is None, which is set as F.relu in the code. src_additive_mask (~nnabla.Variable, optional): Additive mask for the source sequence with shape :math:`(L_S, L_S)` src_key_padding_mask (~nnabla.Variable, optional): Padding mask for the source sequence with shape :math:`(B, L_S)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. rng (numpy.random.RandomState, optional): Random generator for Initializer. Defalut is None. add_attn_bias (bool, optional): Specify whether to add attention bias parameters for key and value. Default is False. fix_parameters (bool, optional): When set to `True`, the weights and biases will not be updated. Default is False. Returns: ~nnabla.Variable: Output :math:`y` with shape :math:`(L_S, B, E)` """ if activation is None: activation = F.relu src_self_attn = multi_head_attention( src, src, src, num_heads=num_heads, add_attn_bias=add_attn_bias, additive_mask=src_additive_mask, key_padding_mask=src_key_padding_mask, name='src_self_attn')[0] if dropout > 0: src_self_attn = F.dropout(src_self_attn, dropout) src_self_attn = src + src_self_attn src = layer_normalization( src_self_attn, batch_axis=(0, 1), name='enc_layer_norm1') src_affine = activation(affine(src, dim_feedforward, base_axis=2, name='enc_affine1')) if dropout > 0: src_affine = F.dropout(src_affine, dropout) src_affine = affine(src_affine, embed_dim, base_axis=2, name='enc_affine2') if dropout > 0: src_affine = F.dropout(src_affine, dropout) src_affine = src + src_affine src = layer_normalization( src_affine, batch_axis=(0, 1), name='enc_layer_norm2') return src
[docs]@parametric_function_api("transformer_decode", [ ('tgt_self_attn', 'self-attention parameters for target sequence', 'Refer to multi_head_attention for details', True), ('tgt_memory_attn', 'attention parameters for target sequence with output from encoder as key', 'Refer to multi_head_attention for details', True), ('dec_affine1', 'first affine used in decoder', 'Refer to affine for details', True), ('dec_affine2', 'second affine used in decoder', 'Refer to affine for details', True), ('dec_layer_norm1', 'fist layer normalization used in decoder', 'Refer to layer_normalization for details', True), ('dec_layer_norm2', 'second layer normalization used in decoder', 'Refer to layer_normalization for details', True), ('dec_layer_norm3', 'third layer normalization used in decoder', 'Refer to layer_normalization for details', True), ]) def transformer_decode(tgt, memory, embed_dim, num_heads, dim_feedforward=2048, dropout=0.1, activation=None, tgt_additive_mask=None, memory_additive_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None, rng=None, add_attn_bias=False, fix_parameters=False): r"""Transformer Decoder. Args: tgt (~nnabla.Variable): Input sequnce to the decoder layer with shape :math:`(L_T, B, E)`. memory (~nnabla.Variable): Output sequnce from the last layer of the encoder with shape :math:`(L_T, B, E)`. embed_dim (int): Number of embedding dimension. num_heads (int): Number of attention heads. dim_feedforward (int, optional): Dimension of the feedforward network model. Default is 2048. dropout (float, optional): Dropout ratio. Default is 0.1. activation (function, optional): Non-linear activation function to be used. Default is None, which is set as F.relu in the code. tgt_additive_mask (~nnabla.Variable, optional): Additive mask for the target sequence with shape :math:`(L_T, L_T)`. memory_additive_mask (~nnabla.Variable, optional): Additive mask for the memory sequcne with shape :math:`(L_T, L_S)`. tgt_key_padding_mask (~nnabla.Variable, optional): Padding mask for the target sequence with shape :math:`(B, L_T)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. memory_key_padding_mask (~nnabla.Variable, optional): Padding mask for the mask sequence with shape :math:`(B, L_S)`. Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0. rng (numpy.random.RandomState): Random generator for Initializer. Default is None. add_attn_bias (bool, optional): Specify whether to add attention bias parameters for key and value. Default is False. fix_parameters (bool): When set to `True`, the weights and biases will not be updated. Default is False. 
    Returns:
        ~nnabla.Variable: Output :math:`y` with shape :math:`(L_T, B, E)`
    """
    if activation is None:
        activation = F.relu

    # Self-attention sub-layer over the target sequence, with residual connection and layer normalization.
    tgt_self_attn = multi_head_attention(
        tgt, tgt, tgt, num_heads=num_heads, add_attn_bias=add_attn_bias,
        additive_mask=tgt_additive_mask,
        key_padding_mask=tgt_key_padding_mask, name='tgt_self_attn')[0]
    if dropout > 0:
        tgt_self_attn = F.dropout(tgt_self_attn, dropout)
    tgt_self_attn = tgt + tgt_self_attn
    tgt = layer_normalization(
        tgt_self_attn, batch_axis=(0, 1), name='dec_layer_norm1')

    # Cross-attention sub-layer attending to the encoder output (memory).
    tgt_multi_attn = multi_head_attention(
        tgt, memory, memory, num_heads=num_heads, add_attn_bias=add_attn_bias,
        additive_mask=memory_additive_mask,
        key_padding_mask=memory_key_padding_mask, name='tgt_memory_attn')[0]
    if dropout > 0:
        tgt_multi_attn = F.dropout(tgt_multi_attn, dropout)
    tgt_multi_attn = tgt + tgt_multi_attn
    tgt = layer_normalization(
        tgt_multi_attn, batch_axis=(0, 1), name='dec_layer_norm2')

    # Position-wise feedforward sub-layer with residual connection and layer normalization.
    tgt_affine = activation(
        affine(tgt, dim_feedforward, base_axis=2, name='dec_affine1'))
    if dropout > 0:
        tgt_affine = F.dropout(tgt_affine, dropout)
    tgt_affine = affine(tgt_affine, embed_dim, base_axis=2, name='dec_affine2')
    if dropout > 0:
        tgt_affine = F.dropout(tgt_affine, dropout)
    tgt_affine = tgt + tgt_affine
    tgt = layer_normalization(
        tgt_affine, batch_axis=(0, 1), name='dec_layer_norm3')

    return tgt
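# --- Usage sketch (illustrative only, not part of the original module) ---
# Chains one encoder block and one decoder block; the shapes and the
# -1e9 causal target mask are assumptions for illustration.
def _example_transformer_decode():
    L_S, L_T, B, E = 10, 7, 4, 64
    src = nn.Variable.from_numpy_array(
        np.random.randn(L_S, B, E).astype(np.float32))
    tgt = nn.Variable.from_numpy_array(
        np.random.randn(L_T, B, E).astype(np.float32))
    # Encoder output serves as the decoder's memory, shape (L_S, B, E).
    memory = transformer_encode(src, embed_dim=E, num_heads=8,
                                dropout=0.0, name='example_enc')
    # Causal mask so target positions cannot attend to later positions.
    causal = np.triu(np.full((L_T, L_T), -1e9, dtype=np.float32), k=1)
    tgt_mask = nn.Variable.from_numpy_array(causal)
    y = transformer_decode(tgt, memory, embed_dim=E, num_heads=8,
                           dropout=0.0, tgt_additive_mask=tgt_mask,
                           name='example_dec')
    y.forward()
    return y.d  # shape (L_T, B, E)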