# Source code for nnabla.functions

# Copyright 2017,2018,2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
#
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# Unless required by applicable law or agreed to in writing, software
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

from __future__ import absolute_import
from .function_bases import *
from . import dtypes
import nnabla as nn
import numpy as np
from .normalization_functions import *
from .numpy_compat_functions import *

[docs]def sum(x, axis=None, keepdims=False):
"""Reduction along axes with sum operation.

Args:
x (Variable): An input variable.
axis (None, int or tuple of ints): Axis or axes along which the sum is
calculated. Passing the default value None will reduce all dimensions.
keepdims (bool): Flag whether the reduced axes are kept as a dimension with 1 element.

Returns:
~nnabla.Variable: N-D array.
"""
from .function_bases import sum as sum_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
return sum_base(x, axis, keepdims)

[docs]def mean(x, axis=None, keepdims=False):
"""Reduction along axes with mean operation.

Args:
x (Variable): An input variable.
axis (None, int or tuple of ints): Axis or axes along which mean is
calculated. Passing the default value None will reduce all dimensions.
keepdims (bool): Flag whether the reduced axes are kept as a dimension with 1 element.

Returns:
~nnabla.Variable: N-D array.

"""
from .function_bases import mean as mean_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
return mean_base(x, axis, keepdims)

[docs]def max(x, axis=None, keepdims=False, with_index=False, only_index=False):
"""Reduce the input N-D array x along the given axis using the max
operation. The axis argument may be a single integer to reduce
over one axis, a tuple of integers to reduce over multiple axes,
or None to reduce over all axes. If keepdims is True,
the output will keep all reduced dimensions with size 1. If
with_index is True, result is a tuple (sorted, indices) or
only indices if only_index is True. Setting only_index to
True implies that with_index is also True.

.. code-block:: python

import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.random.rand(2, 3, 4))

maxval = F.max(x, axis=1)
assert np.allclose(maxval.d, np.max(x.d, axis=1))

maxval, indices = F.max(x, axis=1, with_index=True)
assert np.allclose(maxval.d, np.max(x.d, axis=1))
assert np.all(indices.d == np.argmax(x.d, axis=1))

indices = F.max(x, axis=1, only_index=True)
assert np.all(indices.d == np.argmax(x.d, axis=1))

Args:
x (Variable): An input variable.
axis (None, int or tuple of ints): Axis or axes along which max is
calculated. The default value None will reduce all dimensions.
keepdims(bool): Keep reduced axes as dimension with 1 element.
with_index(bool): Return tuple of max values and index.
only_index(bool): Return only the index of max values.

Returns:
~nnabla.Variable: N-D array.

"""
from .function_bases import max as max_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
n_outputs = 2 if with_index and not only_index else 1
return max_base(x, axis, keepdims, with_index, only_index, n_outputs)

[docs]def min(x, axis=None, keepdims=False, with_index=False, only_index=False):
"""Reduce the input N-D array x along the given axis using the min
operation. The axis argument may be a single integer to reduce
over one axis, a tuple of integers to reduce over multiple axes,
or None to reduce over all axes. If keepdims is True,
the output will keep all reduced dimensions with size 1. If
with_index is True, result is a tuple (sorted, indices) or
only indices if only_index is True. Setting only_index to
True implies that with_index is also True.

.. code-block:: python

import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.random.rand(2, 3, 4))

minval = F.min(x, axis=1)
assert np.allclose(minval.d, np.min(x.d, axis=1))

minval, indices = F.min(x, axis=1, with_index=True)
assert np.allclose(minval.d, np.min(x.d, axis=1))
assert np.all(indices.d == np.argmin(x.d, axis=1))

indices = F.min(x, axis=1, only_index=True)
assert np.all(indices.d == np.argmin(x.d, axis=1))

Args:
x (Variable): An input variable.
axis (None, int or tuple of ints): Axis or axes along which min is
calculated. The default value None will reduce all dimensions.
keepdims(bool): Keep reduced axes as dimension with 1 element.
with_index(bool): Return tuple of min values and index.
only_index(bool): Return only the index of min values.

Returns:
~nnabla.Variable: N-D array.

"""
from .function_bases import min as min_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
n_outputs = 2 if with_index and not only_index else 1
return min_base(x, axis, keepdims, with_index, only_index, n_outputs)

[docs]def norm(x, p=None, axis=None, keepdims=False):
r"""
Reduction along axes with norm operation.

.. math::
y = \|x\|_p = \left( \sum_i |x_i|^p \right)^{\frac{1}{p}}

Args:
x (Variable): An input variable.
p (float): Order of the norm.
axis (None, int or tuple of ints): Axis or axes along which product is
calculated. Passing the default value None will reduce all dimensions.
keepdims (bool): Flag whether the reduced axes are kept as a dimension with 1 element.

Returns:
~nnabla.Variable: N-D array.

"""
from .function_bases import norm as norm_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
return norm_base(x, p, axis, keepdims)

[docs]def norm_normalization(x, p=None, axes=None, eps=1e-12):
r"""
Norm normalization.

.. math::
y = \frac{x_i}{\|x\|_p}

Args:
x(~nnabla.Variable): N-D array.
p(float): Order of the norm.
[default= 2 ]
axes(repeated int64): Axes to be reduced. If empty list is given, all dimensions are reduced.
[default= range(x.ndim) ]
eps(float): Epsilon for the normalization. This eps is added before taking the p-th root in the norm computation.
[default= 1e-12 ]

Returns:
~nnabla.Variable: N-D array
"""
from .function_bases import norm_normalization as norm_normalization_base
if axes is None:
axes = range(x.ndim)
elif not hasattr(axes, '__iter__'):
axes = [axes]
return norm_normalization_base(x, p, axes, eps)

[docs]def spectral_norm(w, u, dim=0, itr=1, eps=1e-12, test=False, output_u=False):
r"""
Spectral Normalization.

.. math::

W_{sn} = \frac{W}{\sigma(W)}

where :math:W is the input matrix, and the :math:\sigma(W) is the spectral norm of :math:W. The spectral norm is approximately computed by the power iteration.

References:

Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida,
"Spectral Normalization for Generative Adversarial Networks",
International Conference on Learning Representations. 2018.

Args:
w(~nnabla.Variable): N-D array of learnable weights. This is normally network parameter.
u(~nnabla.Variable): 1-D array of singular vector. When test == False, the data region of u will be updated during forward calculation.
dim(int): Output dimension. Default is 0. If the dimension is not 0, then the specified dimension becomes the most-left dimension by transposing.
[default= 0 ]
itr(int): Number of power iterations. Default is 1.
[default= 1 ]
eps(float): Epsilon for the normalization. This eps is added before taking the sqrt in the norm computation.
[default= 1e-12 ]
test(bool): When in True, u will not be updated. Default is False.
[default= False ]
output_u(bool): Output original u or not. u is updated when test == True but you can get original u as output with this option. Default is False.
[default= False ]

Returns:
~nnabla.Variable: Spectrally normalized :math:W_{sn} with the same shape as :math:W.
"""
from .function_bases import spectral_norm as spectral_norm_base
n_outputs = 2 if output_u else 1
return spectral_norm_base(w, u, dim, itr, eps, test, output_u, n_outputs)

[docs]def prod(x, axis=None, keepdims=False):
"""Reduction along axes with product operation.

Args:
x (Variable): An input variable.
axis (None, int or tuple of ints): Axis or axes along which product is
calculated. Passing the default value None will reduce all dimensions.
keepdims (bool): Flag whether the reduced axes are kept as a dimension with 1 element.

Returns:
~nnabla.Variable: N-D array.

Note:
Backward computation is not accurate in a zero value input.

"""
from .function_bases import prod as prod_base
if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
return prod_base(x, axis, keepdims)

def reduce(x, op='sum'):
"""Reduction function with given operation.

Args:
x (Variable): An input.
op (str): 'sum' or 'mean'.

Note:
This is deprecated. Use mean or sum instead.

"""
import warnings
warnings.warn(
"Deprecated API. Use sum or mean instead.", DeprecationWarning)
from .function_bases import reduce_sum, reduce_mean
if op == 'sum':
return reduce_sum(x)
elif op == 'mean':
return reduce_mean(x)
raise ValueError()

[docs]def meshgrid(*x, ij_indexing=False):

from .function_bases import meshgrid as meshgrid_base
return meshgrid_base(*x, ij_indexing=ij_indexing, n_outputs=len(x))

[docs]def split(x, axis=0):
"""
Split arrays at the specified axis.

It returns a number corresponding the size of the given
axis (i.e x.shape[axis]) of :obj:~nnabla.Variable s.

Args:
x(~nnabla.Variable): N-D array
axis(int): Axis

Returns: A :obj:tuple of :obj:~nnabla.Variable s

:func:nnabla.function_bases.split.

"""
from .function_bases import split as split_base
return split_base(x, axis, x.shape[axis])

@function_api
def slice(ctx, x, start=None, stop=None, step=None, n_outputs=-1, outputs=None):
r"""
Slice arrays along specified axis. This function
complies with python slice wherre slice(None, None, -1) and
slice(-1, None, -1) are the special case, which flips the
input array and results in the output array from the end to the beginning
of the input array along the corresponding dimension.

Args:
x(~nnabla.Variable): N-D array
start(repeated int64): Start indices for each axis
[default= (0,) * len(x.shape) ]
stop(repeated int64): Stop indices for each axis
[default= tuple(x.shape) ]
step(repeated int64): Step indices for each axis
[default= (1,) * len(x.shape) ]

Returns:
~nnabla.Variable: Sliced N-D array
"""
start = list(start[:]) if start is not None else len(x.shape) * (0,)
stop = list(stop[:]) if stop is not None else tuple(x.shape)
step = list(step[:]) if step is not None else len(x.shape) * (1,)

for i, (s0, s1, s2) in enumerate(zip(start, stop, step)):
# Passing all logic to c++ side breaks the slice.args which is used in some cases; nn.grad and loading a nnp file, so handle special cases in python side.
if s0 is None:
if s2 is not None and s2 < 0:
start[i] = -1
else:
start[i] = 0
if s1 is None:
if s2 is not None and s2 < 0:
stop[i] = -x.shape[i] - 1
else:
stop[i] = x.shape[i]
if s2 is None:
step[i] = 1

from .function_bases import slice as slice_base
return slice_base(x, start, stop, step, n_outputs, outputs)

[docs]def mean_subtraction(x, mean, t, base_axis=1, update_running_mean=True):
r"""
It subtracts the mean of the elements of the input array,
and normalizes it to :math:0. Preprocessing arrays with this function has the effect of improving accuracy
in various tasks such as image classification.

At training time, this function is defined as

.. math::
\begin{eqnarray}
\mu &=& \frac{1}{M} \sum x_i \\
y_i &=& x_i - \mu
\end{eqnarray}

At testing time, the mean values used are those that were computed during training by moving average.

Note:
The backward performs an approximated differentiation that takes into account only the latest mini-batch.

Args:
x(~nnabla.Variable): N-D array of input.
mean(~nnabla.Variable): N-D array of running mean (modified during forward execution).
t(~nnabla.Variable): Scalar of num of iteration of running mean (modified during forward execution).
base_axis(int): Base axis of Mean Subtraction operation. Dimensions up to base_axis is treated as sample dimension.
[default= 1 ]
update_running_mean(bool): Update running mean during forward execution.
[default= True ]

Returns:
~nnabla.Variable: N-D array.

nnabla.function_bases.mean_subtraction.

"""
from .function_bases import mean_subtraction as mean_subtraction_base
return mean_subtraction_base(x, mean, t,
base_axis=base_axis,
update_running_mean=update_running_mean)

[docs]def fixed_point_quantize(x, sign=True, n=8, delta=2**-4, quantize=True, ste_fine_grained=True, outputs=None):
r"""Fixed Point Quantize.

This function simulates to uniformly quantize values in fixed-point number representation.

Args:
x (Variable): An input variable.
sign (bool): Indicate the signed number or the unsigned number. Default is true.
n (int): Bit width used. Note that sign consumes one bit. :math:n-1 is used for number representation in signed case.
delta (float): Step size.
quantize (bool): If true, quantize input, otherwise not.
ste_fine_grained (bool): If true, STE is not 1.

Returns:
~nnabla.Variable: N-D array.

nnabla.function_bases.fixed_point_quantize.

In the forward pass,

.. math::

q_i= \left\{
\begin{array}{ll}
max & if \ \ \ x_i > max \\
sign(x_i) \times floor(|x_i| \delta^{-1} + 2^{-1}) \times \delta & if \ \ min \le x_i \le max \\
min & if \ \ x_i < min \\
\end{array} \right.,

where :math:\delta is the step size,
:math:(min, max) :=(- (2^{n-1} - 1)\delta, (2^{n-1} - 1)\delta) if :math:sign is true,
:math:(min, max) := (0, (2^n - 1) \delta) otherwise, and
:math:n is the total bit-width used.

In the backward pass when using ste_fine_grained as false,

.. math::

\frac{\partial q_i}{\partial x_i} = 1.

In the backward pass when using ste_fine_grained as true,

.. math::

\frac{\partial q_i}{\partial x_i}= \left\{
\begin{array}{ll}
0 & if \ \ \ x_i > max \\
1 & if \ \ min \le x_i \le max \\
0 & if \ \ x_i < min \\
\end{array} \right..

.. note::

Quantized values are stored as floating point number, since this function is for simulation purposes.

"""
from .function_bases import fixed_point_quantize as fixed_point_quantize_base
if not quantize:
return x
return fixed_point_quantize_base(x, sign, n, delta, ste_fine_grained, outputs=outputs)

[docs]def pow2_quantize(x, sign=True, with_zero=True, n=8, m=1, quantize=True, ste_fine_grained=True, outputs=None):
r"""Pow2 Quantize.

This function simulates to uniformly quantize values in fixed-point number representation.

Args:
x (Variable): An input variable.
sign (bool): Indicate the signed number or the unsigned number. Default is true.
with_zero (bool): Indicate using zero as a quantized value. Default is true. Note that zero consumes one bit.
n (int): Bit width used. Note that sign consumes one bit. :math:n-1 is used for number representation in signed case. Default is 8.
m (int): :math:2^m is the upper bound of the dynamic range and :math:-2^m is the lower bound, :math:m \in \mathcal{Z}. Default is 1.
quantize (bool): If true, quantize input, otherwise not.
ste_fine_grained (bool): If true, STE is not 1.

Returns:
~nnabla.Variable: N-D array.

nnabla.function_bases.pow2_quantize.

In the forward pass of signed case,

.. math::

q_i= \left\{
\begin{array}{ll}
max_{+} & if \ \ \overline{q_i} > max_{+} \\
\overline{q_i} & if \ \ min_{+} \le \overline{q_i} \le max_{+} \\
min_{+} & if \ \ 0 \le \overline{q_i} < min_{+} \\
min_{-} & if \ \ min_{-} < \overline{q_i} < 0 \\
\overline{q_i} & if \ \ max_{-} \le \overline{q_i} \le min_{-}\\
max_{-} & if \ \ \overline{q_i} < max_{-} \\
\end{array} \right.,

where

.. math::

&& max_{+} = 2^{m}, min_{+} = 2^{m - (2^{n-1} - 1)},\\
&& max_{-} = -2^{m}, min_{-} = -2^{m - (2^{n-1} - 1)},\\
&& \overline{q_i} = sign(x_i) \times 2^{round(\log_2 |x_i|)}.

This quantization uses the geometric mean between two power-of-two numbers
as quantization threshold.

In the forward pass of unsigned case,

.. math::

q_i= \left\{
\begin{array}{ll}
max & if \ \ \overline{q_i} > max \\
\overline{q_i} & if \ \ min \le \overline{q_i} \le max \\
min & if \ \ 0 < \overline{q_i} < min \\
\end{array} \right.,

where

.. math::

&& max = 2^{m}, min = 2^{m - (2^{n} - 1)},\\
&& \overline{q_i} = 2^{int(\log_2 |x_i|)}.

When using with_zero as true, a pruning threshold is used to round an input to
0 or :math:min. The pruning threshold is defined in this function as the following,

.. math::

pruning\ threshold = min \times 2^{-\frac{1}{2}}.

If an absolute value of the input is lesser than this value, the input is rounded to 0, otherwise :math:min.

In the backward pass when using ste_fine_grained as false,

.. math::

\frac{\partial q_i}{\partial x_i} = 1.

In the backward pass when using ste_fine_grained as true,

.. math::

\frac{\partial q_i}{\partial x_i}= \left\{
\begin{array}{ll}
0 & if \ \ \overline{q_i} > max_{+} \\
1 & if \ \ otherwise \\
0 & if \ \ \overline{q_i} < max_{-} \\
\end{array} \right..

"""

from .function_bases import pow2_quantize as pow2_quantize_base
if not quantize:
return x
return pow2_quantize_base(x, sign, with_zero, n, m, ste_fine_grained, outputs=outputs)

[docs]def min_max_quantize(x, qr_min, qr_max, ql_min, ql_max, decay=0.999, x_min_max=False, ema=False,
ste_fine_grained=True, eps=0.01, quantize=True, outputs=None):
r"""Min-max quantization.

This function simulates to uniformly quantize values in fixed-point number representation.

Min-max quantization is defined as the following equation

.. math::

y = round \left(\frac{\min(\max(x, m), M) - m}{scale} \right) \times scale + m,

where the :math:scale is defined as

.. math::

scale = \frac{M - m}{M_q - m_q},

and

.. math::

m_q = ql_{min}, \\
M_q = ql_{max}, \\
m = qr_{min}, \\
M = qr_{max}.

In the backward pass when using ste_fine_grained as false,

.. math::

\frac{\partial q_i}{\partial x_i} = 1.

In the backward pass when using ste_fine_grained as true,

.. math::

\frac{\partial q_i}{\partial x_i}= \left\{
\begin{array}{ll}
0 & if \ \ \ x_i > M \\
1 & if \ \ m \le x_i \le M \\
0 & if \ \ x_i < m \\
\end{array} \right..

:math:qr_{min} and :math:qr_{max} are treaded as follows.

* x_min_max is True and ema is True:
Exponential moving average are computed for each :math:min(x) and :math:max(x)
then stored in :math:qr_{min} and :math:qr_{max}.
* x_min_max is True and ema is False:
:math:min(x) and :math:max(x) are computed then stored in :math:qr_{min} and :math:qr_{max}.
* x_min_max is False and ema is True:
Exponential moving average stored in :math:qr_{min} and :math:qr_{max} are used.
* x_min_max is False and ema is False
Gradients of :math:qr_{min} and :math:qr_{max} are computed in the backward pass.

More precisely, in inference of the min-max quantization, one has to consider *zero-point (zp)*
which corresponds
to the real value 0, and its data type is an integer. *zero-point* is defined as

.. math::

&& zp_f = ql_{min} -\frac{qr_{min}}{scale}, \\
&& zp = \left\{
\begin{array}{ll}
ql_{max} & if \ \ \ zp_f >= ql_{max} \\
round(zp_f) & if \ \ otherwise \\
ql_{min}  & if \ \ zp_f <= ql_{min} \\
\end{array} \right..

Accordingly, in order to simulate quantization effect of *zero-point*,
during both forward and backward pass, :math:qr_{min} and :math:qr_{max} are adjusted as follows,

.. math::

qr_{min}^{adj} = ql_{min} - zp * scale, \\
qr_{max}^{adj} = ql_{max} - zp * scale.

These operations are often called *nudge*.

Finally, in the formulas of the min-max quantization, :math:m and :math:M are replaced by
:math:qr_{min}^{adj} and :math:qr_{max}^{adj} respectively.

Args:
x (~nnabla.Variable): Input N-D array.
qr_min (~nnabla.Variable): Minimum quantization range (modified during forward execution).
qr_max (~nnabla.Variable): Maximum quantization range (modified during forward execution).
ql_min (~nnabla.Variable): Minimum quantization level, typically 0.
ql_max (~nnabla.Variable): Maximum quantization level, typically 255.
decay (float): The decay rate for the exponential moving average.
x_min_max (bool): Use the min and max of x to compute quantization ranges. Default is False.
ema (bool): Use the exponential moving average for the min and max quantization ranges.
Default is False.
ste_fine_grained (bool): If True, STE is not 1, the {0, 1}-mask computed from the min-max is
applied to the gradient in the backward; otherwise, STE is 1.
eps (float): Epsilon, or small value to ensure :math:qr_{max} - qr_{min} must be greater
than the epsilon.
quantize (bool): Apply quantization or not.

References:
Benoit Jacob, Skirmantas Kligys, Bo Chen, Menglong Zhu, Matthew Tang, Andrew Howard, Hartwig Adam, and Dmitry Kalenichenko, "Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877

"""

from .function_bases import min_max_quantize as min_max_quantize_base
if not quantize:
return x
return min_max_quantize_base(x, qr_min, qr_max, ql_min, ql_max, decay, x_min_max, ema,
ste_fine_grained, eps, quantize, outputs=outputs)

[docs]def clip_by_value(x, min, max):
r"""Clip inputs by values.

.. math::

y = \begin{cases}
max & (x > max) \\
x & (otherwise) \\
min & (x < min)
\end{cases}.

Args:
x (Variable): An input variable.
min (Variable or float): A min variable or float value by which x is clipped. Note that if Variable is given, its shape must be the same as x's.
max (Variable or float): A max variable or float value by which x is clipped. Note that if Variable is given, its shape must be the same as x's

Returns:
~nnabla.Variable: N-D array.

"""
if np.isscalar(min):
maximum_base = maximum_scalar
elif isinstance(min, (nn.Variable, nn.NdArray)):
maximum_base = maximum2
else:
raise TypeError("min must be Variable, NdArray, or scalar.")

if np.isscalar(max):
minimum_base = minimum_scalar
elif isinstance(max, (nn.Variable, nn.NdArray)):
minimum_base = minimum2
else:
raise TypeError("max must be Variable, NdArray, or scalar.")

return minimum_base(maximum_base(x, min), max)

[docs]def clip_by_norm(x, clip_norm, axis=None):
r"""
Clip inputs by its L2 norm when the L2 norm is larger than the threshold value (defined by clip_norm).
If it is less than the threshold, inputs are not modified. If it is applied, the operation is represented as

.. math::
y = N \times \frac{x}{\|x\|_2}.

where :math:x is the input, :math:y is the output,
and :math:N is clip_norm. this is the case that axes is not set.
When axes is set, the norm is computed over axes.

Args:
x (Variable): An input variable.
clip_norm (Variable or float): An input scalar variable or float value. Must be positive.
axis (None, int or tuple of ints): Axis or axes along which the reduction is performed. Passing the default value None will reduce all dimensions.

Returns:
~nnabla.Variable: N-D array.

"""
from .function_bases import sum as sum_base

if axis is None:
axis = range(x.ndim)
elif not hasattr(axis, '__iter__'):
axis = [axis]
x_norm = pow_scalar(sum_base(x**2.0, axis, True), 0.5)
if isinstance(clip_norm, (nn.Variable, nn.NdArray)):
y = x * clip_norm / maximum2(x_norm, clip_norm)
else:
if clip_norm <= 0:
raise ValueError("clip_norm must be positive.")
y = x * clip_norm / maximum_scalar(x_norm, clip_norm)
return y

[docs]def interpolate(x, scale=None, output_size=None, mode='linear',
align_corners=False, half_pixel=False, half_pixel_for_nn=False,
channel_last=False):
'''
Resize an ND array with interpolation.

Scaling factors for spatial dimensions are determined by either
scale or output_size.

nd = len(scale) or nd = len(output_size) determines the number of
spatial dimensions, and the last nd dimensions of the input x are
considered as the spatial dimensions to be resized.

If scale is given, the output_size is calculated by

.. code-block:: python

output_size[i] = floor(scale[i] * x.shape[i - len(scale)]).

Calculation of the coordinate transformation are as follows.

The input coordinate i_input is computed by the output coordinate i_output,
the input size size_input, and the output size size_output as

.. table::
:align: center
:widths: auto

================= ============== =================================================================
align_corners     half_pixel                            i_input
================= ============== =================================================================
True             True       Not supported.
----------------- -------------- -----------------------------------------------------------------
True             False      i_output * (size_input - 1) / (size_output - 1)
----------------- -------------- -----------------------------------------------------------------
False            True       (i_output + 0.5) * size_input / size_output - 0.5
----------------- -------------- -----------------------------------------------------------------
False            False      i_output * size_input / size_output
================= ============== =================================================================

In the case of the nearest mode and half_pixel_for_nn is True,
the input coordinate i_input is computed by the output coordinate i_output as

.. code-block::

i_input = (i_output + 0.5) * size_input / size_output.

Example:

.. code-block:: python

import numpy as np
import nnabla as nn
import nnabla.functions as F

x_data = np.random.rand(64, 3, 224, 224)
x = nn.Variable.from_numpy_array(x_data)

# Resize by scales
y = F.interpolate(x, scale=(2, 2), mode='linear')
print(y.shape)  # (64, 3, 448, 448)
y.forward()
print(y.d)  # Print output

# Resize to a size
y2 = F.interpolate(x, output_size=(320, 257), mode='linear')
print(y2.shape)  # (64, 3, 320, 257)
y2.forward()
print(y2.d)  # Print output

Args:
x(~nnabla.Variable): N-D array with an arbitrary number of dimensions.
scale(tuple of ints): Scale factors along axes. The default is
None, and if this is omitted, output_size must be specified.
output_size(tuple of ints): The output sizes for axes. If this is
given, the scale factors are determined by the output sizes and the
input sizes. The default is None, and if this is omitted,
scale must be specified.
mode(str): Interpolation mode chosen from ('linear'|'nearest').
The default is 'linear'.
align_corners(bool): If true, the corner pixels of input and output
arrays are aligned, such that the output corner pixels have the
same values with the input corner pixels. Default is False.
half_pixel:
If true, in the coordinate transformation, 0.5 is added to the output coordinate
and 0.5 is subtracted from the input coordinate after scaling. Default is False.
half_pixel_for_nn:
This is a special argument to support the backward-compatibility of the nearest neighbor interpolation.
Default is False. When in True, the implementation of nearest neighbor interpolation
is the old one.
channel_last: Last dimension is the channel (NHWC order) if True.

Returns:
~nnabla.Variable: N-D array.

.. warning::
Up to the version 1.8.0, the default of align_corners was None, and it becomes True
if mode is linear, otherwise False.

.. warning::
Up to the version 1.8.0, the nearest mode interpolation corresponds to
the nearest mode and half_pixel_for_nn = True after the version 1.8.0.

'''
from .function_bases import interpolate as interpolate_base
import math
if scale is None and output_size is None:
raise ValueError('Either scale or output_size must be given')
elif output_size is None:
input_size = x.shape[-len(scale)-1:-1] if channel_last \
else x.shape[-len(scale):]
output_size = [int(math.floor(s * d))
for d, s in zip(input_size, scale)]
return interpolate_base(x, output_size, mode, align_corners, half_pixel, half_pixel_for_nn, channel_last)

[docs]def sort(x, axis=-1, reverse=False, with_index=False, only_index=False):
"""Sorts the elements of x along a given axis in ascending order
by value. A negative axis counts from the last dimension of x,
so the default of -1 sorts along the last dimension. If reverse
is True, then the elements are soreted in descending order.

If with_index is True, result is a tuple (sorted, indices)
or only indices if only_index is True. Setting only_index
to True implies that with_index is also True.

.. code-block:: python

import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.random.rand(2, 3, 4))

sorted = F.sort(x)
assert np.allclose(sorted.d, np.sort(x.d))

sorted, indices = F.sort(x, with_index=True)
assert np.allclose(sorted.d, np.sort(x.d))
assert np.all(indices.d == np.argsort(x.d))

indices = F.sort(x, only_index=True)
assert np.all(indices.d == np.argsort(x.d))

Args:
x(~nnabla.Variable): N-D array
axis(int): Axis along which to sort.
reverse(bool): Sort in descending order.
with_index(bool): Return sorted values and index.
only_index(bool): Return only the sort index.

Returns: ~nnabla.Variable sorted or ~nnabla.Variable indices or (~nnabla.Variable sorted, ~nnabla.Variable indices)

"""
from .function_bases import sort as sort_base
n_outputs = 2 if with_index and not only_index else 1
return sort_base(x, axis, reverse, with_index, only_index, n_outputs)

[docs]def tile(x, reps):
"""Forward x repeated the number of times given by reps. If reps is
a sequence, the output has dimension of d = max(len(reps), x.ndim) and
either x is promoted to be d-dimensional by prepending new axes or reps
is promoted to x.ndim by prepending 1's.

Args:
x(~nnabla.Variable): Input N-D array.
reps(int or sequence of int): Repetitions of x along each axis.

Returns:
~nnabla.Variable: N-D array.

>>> import numpy as np, nnabla as nn, nnabla.functions as F
>>> F.tile(nn.Variable([2, 3]), 3).shape    # reps is promoted to [1, 3]
(2, 9)
>>> F.tile(nn.Variable([3]), [2, 3]).shape  # x is promoted to shape (1, 3)
(2, 9)
>>> nn.set_auto_forward(True)
>>> x = nn.Variable.from_numpy_array(np.array([1, 2, 3]))
>>> print(F.tile(x, 3).d)
[1. 2. 3. 1. 2. 3. 1. 2. 3.]
>>> print(F.tile(x, [2, 3]).d)
[[1. 2. 3. 1. 2. 3. 1. 2. 3.]
[1. 2. 3. 1. 2. 3. 1. 2. 3.]]
>>> x = nn.Variable.from_numpy_array(np.array([[1, 3], [2, 4]]))
>>> print(F.tile(x, 3).d)
[[1. 3. 1. 3. 1. 3.]
[2. 4. 2. 4. 2. 4.]]
>>> print(F.tile(x, [2, 3]).d)
[[1. 3. 1. 3. 1. 3.]
[2. 4. 2. 4. 2. 4.]
[1. 3. 1. 3. 1. 3.]
[2. 4. 2. 4. 2. 4.]]

"""
from .function_bases import tile as tile_base
reps = [reps] if isinstance(reps, int) else reps
return tile_base(x, reps)

[docs]def stft(x, window_size, stride, fft_size, window_type='hanning', center=True, pad_mode='reflect', as_istft_backward=False):
"""Computes the short-time Fourier transform

Args:
x (~nnabla.Variable): Time domain sequence of size batch_size x sample_size.
window_size (int): Size of STFT analysis window.
stride (int): Number of samples that we shift the window, also called hop size.
fft_size (int): Size of the FFT, the output will have fft_size // 2+ 1 frequency bins.
window_type (str): Analysis window, can be either hanning, hamming or rectangular.
For convenience, also window_type=None is supported which is equivalent to window_type='rectangular'.
center (bool): If True, then the signal x is padded by half the FFT size using reflection padding.
pad_mode (str): Padding mode, which can be 'constant' or 'reflect'. 'constant' pads with 0.
as_istft_backward: If True, then forward execution behaves as backward execution of ISTFT,
treating input x as output gradient of ISTFT and outputs y_r and y_i as inputs gradient of ISTFT.
This option is only used in nn.grad operator.

Returns:
Returns real and imaginary parts of STFT result.

* :obj:~nnabla.Variable: Real part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
* :obj:~nnabla.Variable: Imaginary part of STFT of size batch x fft_size//2 + 1 x frame_size.
"""
from .function_bases import stft as stft_base
if window_type is None:
window_type = "rectangular"
return stft_base(x, window_size, stride, fft_size, window_type, center, pad_mode, as_istft_backward)

def _stft_v1(x, window_size, stride, fft_size, window_type='hanning', center=True, pad_mode='reflect'):
"""Computes the short-time Fourier transform

Args:
x (~nnabla.Variable): Time domain sequence of size batch_size x sample_size.
window_size (int): Size of STFT analysis window.
stride (int): Number of samples that we shift the window, also called hop size.
fft_size (int): Size of the FFT, the output will have fft_size // 2+ 1 frequency bins.
window_type (str): Analysis window, can be either hanning, hamming or rectangular.
For convenience, also window_type=None is supported which is equivalent to window_type='rectangular'.
center (bool): If True, then the signal x is padded by half the FFT size using reflection padding.
pad_mode (str): Padding mode, which can be 'constant' or 'reflect'. 'constant' pads with 0.

Returns:
Returns real and imaginary parts of STFT result.

* :obj:~nnabla.Variable: Real part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
* :obj:~nnabla.Variable: Imaginary part of STFT of size batch x fft_size//2 + 1 x frame_size.
"""
from nnabla.parameter import get_parameter, get_parameter_or_create
conv_r = get_parameter('conv_r')
conv_i = get_parameter('conv_i')

if conv_r is None or conv_i is None:
if window_type == 'hanning':
window_func = np.hanning(window_size + 1)[:-1]
elif window_type == 'hamming':
window_func = np.hamming(window_size + 1)[:-1]
elif window_type == 'rectangular' or window_type is None:
window_func = np.ones(window_size)
else:
raise ValueError("Unknown window type {}.".format(window_type))

# pad window if fft_size > window_size
if fft_size > window_size:
diff = fft_size - window_size
window_func, (diff//2, diff - diff//2), mode='constant')
elif fft_size < window_size:
raise ValueError(
"FFT size has to be as least as large as window size.")

# compute STFT filter coefficients
mat_r = np.zeros((fft_size//2 + 1, 1, fft_size))
mat_i = np.zeros((fft_size//2 + 1, 1, fft_size))

for w in range(fft_size//2+1):
for t in range(fft_size):
mat_r[w, 0, t] = np.cos(2. * np.pi * w * t / fft_size)
mat_i[w, 0, t] = -np.sin(2. * np.pi * w * t / fft_size)
mat_r = mat_r * window_func
mat_i = mat_i * window_func

conv_r = get_parameter_or_create(
conv_i = get_parameter_or_create(

if center:

x = reshape(x, (x.shape[0], 1, x.shape[1]))

# compute STFT
y_r = convolution(x, conv_r, stride=(stride,))
y_i = convolution(x, conv_i, stride=(stride,))

return y_r, y_i

[docs]def istft(y_r, y_i, window_size, stride, fft_size, window_type='hanning', center=True, pad_mode='reflect', as_stft_backward=False):
"""Computes the inverse shoft-time Fourier transform

Note: We use a constant square inverse window for the reconstruction
of the time-domain signal, therefore, the first and last
window_size - stride are not perfectly reconstructed.

Args:
y_r (~nnabla.Variable): Real part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
y_i (~nnabla.Variable): Imaginary part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
window_size (int): Size of STFT analysis window.
stride (int): Number of samples that we shift the window, also called hop size.
fft_size (int): Size of the FFT, (STFT has fft_size // 2 + 1 frequency bins).
window_type (str): Analysis window, can be either hanning, hamming or rectangular.
For convenience, also window_type=None is supported which is equivalent to window_type='rectangular'.
center (bool): If True, then it is assumed that the time-domain signal has centered frames.
pad_mode (str): Padding mode corresponding to STFT pad_mode, which can be 'constant' or 'reflect'. 'constant' pads with 0.
This option is ignored for the normal use of ISTFT. You need to set the same pad_mode only when as_stft_backward == True.
as_stft_backward (bool): If True, then forward execution behaves as backward execution of STFT,
treating inputs y_r and y_i as outputs gradient of STFT and output x as input gradient of STFT.
This option is only used in nn.grad operator.

Returns:
~nnabla.Variable: Time domain sequence of size batch_size x sample_size.
"""
from .function_bases import istft as istft_base
if window_type is None:
window_type = "rectangular"
return istft_base(y_r, y_i, window_size, stride, fft_size, window_type, center, pad_mode, as_stft_backward)

def _istft_v1(y_r, y_i, window_size, stride, fft_size, window_type='hanning', center=True):
"""Computes the inverse shoft-time Fourier transform

Note: We use a constant square inverse window for the reconstruction
of the time-domain signal, therefore, the first and last
window_size - stride are not perfectly reconstructed.

Args:
y_r (~nnabla.Variable): Real part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
y_i (~nnabla.Variable): Imaginary part of STFT of size batch_size x fft_size//2 + 1 x frame_size.
window_size (int): Size of STFT analysis window.
stride (int): Number of samples that we shift the window, also called hop size.
fft_size (int): Size of the FFT, (STFT has fft_size // 2 + 1 frequency bins).
window_type (str): Analysis window, can be either hanning, hamming or rectangular.
For convenience, also window_type=None is supported which is equivalent to window_type='rectangular'.
center (bool): If True, then it is assumed that the time-domain signal has centered frames.

Returns:
~nnabla.Variable: Time domain sequence of size batch_size x sample_size.
"""
from nnabla.parameter import get_parameter, get_parameter_or_create
conv_cos = get_parameter('conv_cos')
conv_sin = get_parameter('conv_sin')

if conv_cos is None or conv_sin is None:
if window_type == 'hanning':
window_func = np.hanning(window_size + 1)[:-1]
elif window_type == 'hamming':
window_func = np.hamming(window_size + 1)[:-1]
elif window_type == 'rectangular' or window_type is None:
window_func = np.ones(window_size)
else:
raise ValueError("Unknown window type {}.".format(window_type))

# pad window if fft_size > window_size
if fft_size > window_size:
diff = fft_size - window_size
window_func, (diff//2, diff - diff//2), mode='constant')
elif fft_size < window_size:
raise ValueError(
"FFT size has to be as least as large as window size.")

# compute inverse STFT filter coefficients
if fft_size % stride != 0:
raise ValueError("FFT size needs to be a multiple of stride.")

inv_window_func = np.zeros_like(window_func)
for s in range(0, fft_size, stride):
inv_window_func += np.roll(np.square(window_func), s)

mat_cos = np.zeros((fft_size//2 + 1, 1, fft_size))
mat_sin = np.zeros((fft_size//2 + 1, 1, fft_size))

for w in range(fft_size//2+1):
alpha = 1.0 if w == 0 or w == fft_size//2 else 2.0
alpha /= fft_size
for t in range(fft_size):
mat_cos[w, 0, t] = alpha * \
np.cos(2. * np.pi * w * t / fft_size)
mat_sin[w, 0, t] = alpha * \
np.sin(2. * np.pi * w * t / fft_size)
mat_cos = mat_cos * window_func / inv_window_func
mat_sin = mat_sin * window_func / inv_window_func

conv_cos = get_parameter_or_create(
conv_sin = get_parameter_or_create(

# compute inverse STFT
x_cos = deconvolution(y_r, conv_cos, stride=(stride,))
x_sin = deconvolution(y_i, conv_sin, stride=(stride,))

x = reshape(x_cos - x_sin, (x_cos.shape[0], x_cos.shape[2]))

if center:
x = x[:, fft_size//2:-fft_size//2]

return x

[docs]def gather_nd(data, indices):
"""Gather elements or slices from data according to indices, which must
be at least two-dimensional with the first dimension :math:M being less or
equal to the :math:N dimensions of data. Given data with shape
:math:(X_0, X_1, ..., X_{N-1}) and indices with shape :math:(M, Y_0, ...,
Y_{K-1}) output has shape :math:(Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}).
If :math:M == N, output shape is simply :math:(Y_0, ..., Y_{K-1}).

The forward of :func:~nnabla.functions.gather_nd is equivalent to:

.. code-block:: python

def gather_nd(data, index):
import numpy as np
tmp_index = index.reshape(index.shape[0], -1)
tmp_index = (idx + (Ellipsis,) for idx in zip(*new_index))
out_shape = index.shape[1:] + data.shape[index.shape[0]:]
return np.vstack(data[idx] for idx in tmp_index).reshape(*out_shape)

Examples:

>>> import numpy as np, nnabla as nn, nnabla.functions as F
>>> nn.set_auto_forward(True)
>>> data = F.arange(1, 11).reshape([2, 5])
>>> print(data.d)
[[ 1.  2.  3.  4.  5.]
[ 6.  7.  8.  9. 10.]]
>>> F.gather_nd(data, [[1, 1, 0]]).shape
(3, 5)
>>> F.gather_nd(data, [[1, 1, 0], [0, 1, 0]]).shape
(3,)
>>> print(F.gather_nd(data, [[1, 1, 0], [0, 1, 0]]).d)
[6. 7. 1.]
>>> print(F.gather_nd(data, [[1, 1, 0]]).d)
[[ 6.  7.  8.  9. 10.]
[ 6.  7.  8.  9. 10.]
[ 1.  2.  3.  4.  5.]]

When indices is provided as a :obj:~nnabla.Variable it will be possible
to change the actual index values after function creation. It is important
to note that out-of-bound indices raise errors when running on CPU but are
ignored when using an accelerated computation context.

>>> indices = nn.Variable((2, 1))
>>> indices.d = [[0], [0]]
>>> y = F.gather_nd(data, indices)
>>> print(y.d)
[1.]
>>> indices.d = [[1], [4]]
>>> y.forward()
>>> print(y.d)
[10.]

Args:
data(~nnabla.Variable, ~nnabla.NdArray): input data
indices(list, numpy.ndarray, ~nnabla.Variable, ~nnabla.NdArray): gather indices

Returns: ~nnabla.Variable or ~nnabla.NdArray of gathered elements.
"""
from .function_bases import gather_nd as gather_nd_base
if not isinstance(indices, (nn.Variable, nn.NdArray)):
if not isinstance(indices, np.ndarray):
indices = np.asarray(indices, dtype=np.int)
indices = nn.Variable.from_numpy_array(indices)
return gather_nd_base(data, indices)

[docs]def scatter_nd(data, indices, shape=None, out=None, add=False):
"""Scatter data according to indices into a new array of given shape
or an existing array provided as out. Exactly one of the shape or out
argument must be given. Given output shape, or shape of out array,
:math:(X_0,X_1,\ldots,X_{N-1}) and indices shape
:math:(M,Y_0,\ldots,Y_{K-1}) the input data shape is
:math:(Y_0,\ldots,Y_{K-1},X_M,\ldots,X_{N-1}), where :math:M<=N. If
:math:M==N the data shape is simply :math:(Y_0,\ldots,Y_{K-1}).
Note that indices are treated as integers and potentially converted.

The forward of :func:~nnabla.functions.scatter_nd is equivalent to:

.. code-block:: python

def scatter_nd(data, indices, shape=None, out=None):
assert (shape and not out) or (out and not shape)
if isinstance(indices, numpy.ndarray)
indices = indices.tolist()
result = out if out else numpy.zeros(shape)
result[indices] = data
return result

Examples:

>>> import numpy as np, nnabla as nn, nnabla.functions as F
>>> nn.set_auto_forward(True)
>>> data = nn.Variable.from_numpy_array(np.array([9, 10, 11, 12]))
>>> indices = nn.Variable.from_numpy_array(np.array([[4, 3, 1, 7]]))
>>> scattered = F.scatter_nd(data, indices, shape=(8,))
>>> print(scatterd.d)
[ 0. 11.  0. 10.  9.  0.  0. 12.]
>>> print(F.gather_nd(scattered, indices).d)
[ 9. 10. 11. 12.]

Args:
data(~nnabla.Variable, ~nnabla.NdArray): input data
indices(list, numpy.ndarray, ~nnabla.Variable, ~nnabla.NdArray): scatter indices
shape(tuple, list): shape of new output array
out(~nnabla.Variable, ~nnabla.NdArray): existing output array
add(tool): Add the input data to the same destination specified by the indices.

Returns: ~nnabla.Variable or ~nnabla.NdArray of given shape.

"""
from .function_bases import scatter_nd as scatter_nd_base
if not isinstance(indices, (nn.Variable, nn.NdArray)):
if not isinstance(indices, np.ndarray):
indices = np.asarray(indices, dtype=np.int)
indices = nn.Variable.from_numpy_array(indices)
if shape is None and out is None:
raise TypeError("One of shape or out argument must be supplied.")
if shape and out:
raise TypeError("Only one of shape or out argument may be used.")
if out:
if not isinstance(out, (nn.Variable, nn.NdArray)):
raise TypeError("out argument must be NdArray or Variable type.")
shape = out.shape
elif isinstance(shape, np.ndarray):
shape = shape.tolist()
return scatter_nd_base(data, indices, out, shape, add)

'''Add all values from x1 into the x0 according to index specified by indices.
This function adds x1 into the copy of x0 and outputs the copy.
The original x0 will not be changed.
x0, indices and x1 must have same number of dimensions.

The forward of :func:~nnabla.functions.scatter_add is equivalent to:

.. code-block:: python

# Assuming each input is 3 dimensional
import numpy as np
output = np.copy(x0)
for i in range(indices.shape[0]):
for j in range(indices.shape[1]):
for k in range(indices.shape[2]):
if axis == 0:
output[indices[i][j][k]][j][k] += x1[i][j][k]
elif axis == 1:
output[i][indices[i][j][k]][k] += x1[i][j][k]
elif axis == 2:
output[i][j][indices[i][j][k]] += x1[i][j][k]
return output

Args:
x0(~nnabla.Variable): N-D array which the data is added to its copy.
indices(~nnabla.Variable): N-D array scatter indices.
The size of each dimension must be equal or smaller than that of x0 except for the specified axis.
The value of indices must be smaller than the size of specified axis' dimension of x0.
The size of each dimension must be equal or smaller than that of x1.
Indices must not be negative.
x1(~nnabla.Variable): N-D array which is scattered and added to x0.
axis(int): Axis along which to index. The axis must not exceed the inputs' dimension.
[default= 0 ]

Returns:
~nnabla.Variable: N-D array which contains the result of scatter addition. The shape is same as x0.
'''

Computes multi-headed attention with query, key, and value.
We use the following notations to describe the inputs and outputs below.
:math:L_T: target sequence length, :math:L_S: source sequence length, :math:B: batch size, :math:D: input dimension, :math:E: embedding dimension, :math:H: number of attention heads.

References:

A. Vaswani et al. "Attention is All You Need."
NIPS. 2017.
<https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf>

Args:
query (~nnabla.Variable): Input N-D array with shape :math:(L_T, B, D_q).
key (~nnabla.Variable): Input N-D array with shape :math:(L_S, B, D_k).
value (~nnabla.Variable): Input N-D array with shape :math:(L_S, B, D_v).
num_heads (int): Number of attention heads. Note that embedding dimensoin E must be divisible by the number of heads. Default is 12 which is conventional.
q_weight (~nnabla.Variable): Input N-D array with shape :math:(D_q, E).
k_weight (~nnabla.Variable): Input N-D array with shape :math:(D_k, E).
v_weight (~nnabla.Variable): Input N-D array with shape :math:(D_v, E_v).
out_weight (~nnabla.Variable): Input N-D array with shape :math:(D_v, E_{out}).
q_bias (~nnabla.Variable, optional): Input N-D array with shape :math:(E, ).
k_bias (~nnabla.Variable, optional): Input N-D array with shape :math:(E, ).
v_bias (~nnabla.Variable, optional): Input N-D array with shape :math:(E_v, ).
out_bias (~nnabla.Variable, optional): Input N-D array with shape :math:(E_{out}, ).
attn_bias_k (~nnabla.Variable, optional): Input N-D array with shape :math:(E, ).
attn_bias_v (~nnabla.Variable, optional): Input N-D array with shape :math:(E_v, ).
dropout (float, optional): Dropout ratio applied to parameters. Default is 0.
additive_mask (~nnabla.Variable, optional): Input N-D array with shape :math:(L_T, L_S). Values will be added to the attention layer to prevent attention to certain positions.
key_padding_mask (~nnabla.Variable, optional): Input N-D array with shape :math:(B, L_S). Specified padding elements will be ignored by the attention layer. Values must be either 1 or 0.

Returns:
~nnabla.Variable: Output :math:y with shape :math:(L_T, B, E_{out})
~nnabla.Variable: Output :math:h_n with shape :math:(B, L_T, L_S)
'''

from . import functions as F

tgt_len, batch_size, _ = query.shape
src_len, batch_size, _ = key.shape
q_embed_dim = q_weight.shape[1]
k_embed_dim = k_weight.shape[1]
v_embed_dim = v_weight.shape[1]
out_dim = out_weight.shape[1]
assert src_len == value.shape[0]
assert q_embed_dim == k_embed_dim, "embedding dimensions must be the same for query and key."

# query:(L_T, B, E) --> q:(L_T, B, E)
q = F.affine(query, q_weight, q_bias, base_axis=2)
# key:(L_S, B, D_k) --> k:(L_S, B, E_k)
k = F.affine(key, k_weight, k_bias, base_axis=2)
# value:(L_S, B, D_v) --> v:(L_S, B, E_v)
v = F.affine(value, v_weight, v_bias, base_axis=2)

if attn_bias_k is not None:
attn_bias_k = F.reshape(attn_bias_k, (1, 1, k_embed_dim))
attn_bias_v = F.reshape(attn_bias_v, (1, 1, v_embed_dim))
src_len += 1
assert attn_bias_k is not None
attn_bias_k, (1, batch_size, attn_bias_k.shape[2]))
attn_bias_v, (1, batch_size, attn_bias_v.shape[2]))
k = F.concatenate(k, attn_bias_k, axis=0)
v = F.concatenate(v, attn_bias_v, axis=0)

q = F.transpose(
k = F.transpose(
v = F.transpose(

# attn_output_weights: (B*H, L_T, L_S)
attn_output_weights = F.batch_matmul(q, k, transpose_b=True)
assert list(attn_output_weights.shape) == [

attn_output_weights = F.reshape(
attn_output_weights = F.where(
attn_output_weights.shape),  # Condition
F.constant(val=float('-inf'),
shape=attn_output_weights.shape),  # If true
attn_output_weights)  # If false
attn_output_weights = F.reshape(

attn_output_weights = F.softmax(
attn_output_weights, axis=len(attn_output_weights.shape)-1)
if dropout > 0:
attn_output_weights = F.dropout(
attn_output_weights, p=dropout)

# (B*H, L_T, L_S) x (B*H, L_S, head_vdim) --> (B*H, L_T, head_vdim)
attn_output = F.batch_matmul(attn_output_weights, v)
assert list(attn_output.shape) == [
attn_output = F.reshape(F.transpose(
attn_output, (1, 0, 2)), (tgt_len, batch_size, v_embed_dim))  # attn_output: (L_T, B, E_v)

attn_output = F.affine(attn_output, out_weight, out_bias, base_axis=2)

return attn_output, attn_output_weights

[docs]def patch_correlation(x1, x2, patch=(1, 1), shift=(0, 0), patch_step=(1, 1),
shift_step=(1, 1), padding=(0, 0, 0, 0),
channel_last=False):
r"""
Multiplicative patch-wise comparison between inputs x1 and x2, which
must both be 4-dimensional NCHW (with channel_last=False) or NHWC (with
channel_last=True) arrays (where *N* is the number of samples, *H* and
*W* are the sample height and width and *C* is the number of channels).
The function returns a 5-D array with shape :math:(N, C_y, C_x, H_o, W_o)
where :math:H_o, W_o are determined by the possible patch locations within
the, optionally padded, input image sizeand :math:C_y, C_x are determined
by the optionally shifted patch positions.

Mathematically, the patch correlation is formulated as

.. math::

O(s_y, s_x, h_0, w_0) =
\sum_{c} \sum_{k_h} \sum_{k_w} I_1(c, h + k_h, w + k_w) \times I_2(c, h + k_h + s_h, w + k_w + s_w),

where :math:I_1(c, h, w) and :math:I_2(c, h, w) are the inputs at :math:c-th channel,
:math:h-th height, and :math:w-th width, :math:k_h, k_w indices for the patch size
and :math:s_h, s_w indices for the shifts.

A single correlation value (per sample) is produced if the patch extends
to the image dimensions and all other parameters use the default values.

>>> import numpy as np, nnabla as nn, nnabla.functions as F
>>> N, C, H, W = (1, 2, 3, 4)
>>> x = nn.Variable.from_numpy_array(np.ones([N, C, H, W]))
>>> F.patch_correlation(x, x, patch=(H, W)).d
array([[[[[24.]]]]], dtype=float32)

A patch that is smaller than the image size moves horizontal and vertical
producing a value per position. The patch_step argument may be used to
control the position increments.

>>> F.patch_correlation(x, x, patch=(H-1, W-1)).d
array([[[[[12., 12.],
[12., 12.]]]]], dtype=float32)
>>> F.patch_correlation(x, x, patch=(H-1, W-1), patch_step=(2, 1)).d
array([[[[[12., 12.]]]]], dtype=float32)

Multiple correlations may be performed at each position between the patch
from x1 and patches from x2 at relative offsets striding the maximum
vertical and horizontal distance given by the shift values at increments
of shift_step. The shifted correlation values can be obtained for the
from the second and third output dimension for the vertical and horizontal
shifts.

>>> F.patch_correlation(x, x, (H, 1), shift=(0, 1)).shape
(1, 1, 3, 1, 4)
>>> F.patch_correlation(x, x, (H, 1), shift=(0, 1)).d
array([[[[[0., 6., 6., 6.]],
[[6., 6., 6., 6.]],
[[6., 6., 6., 0.]]]]], dtype=float32)
>>> F.patch_correlation(x, x, (H, 1), shift=(0, 1), shift_step=(1, 2)).d
array([[[[[0., 6., 6., 6.]],
[[6., 6., 6., 0.]]]]], dtype=float32)

Padding with zero values may be applied individually to the top, bottom,
left and right side of the input image.

>>> F.patch_correlation(x, x, patch=(H, W), padding=(0, 1, W, W)).d
array([[[[[ 0.,  6., 12., 18., 24., 18., 12.,  6.,  0.],
[ 0.,  4.,  8., 12., 16., 12.,  8.,  4.,  0.]]]]], dtype=float32)

This function may be used to implement the FlowNetC correlation layer.

>>> N, C, H, W = (1, 256, 44, 60)
>>> x1, x2 = nn.Variable((N, C, H, W)), nn.Variable((N, C, H, W))
>>> F.patch_correlation(x1, x2, shift=20, shift_step=2).shape
(1, 21, 21, 44, 60)

References:

* Fischer et al., FlowNet: Learning Optical Flow with Convolutional
Networks. <https://arxiv.org/abs/1504.06852>_

Args:
x1(~nnabla.Variable): Input N-D array with shape :math:(N, C, H, W)
or :math:(N, H, W, C).
x2(~nnabla.Variable): Input N-D array with shape :math:(N, C, H, W)
or :math:(N, H, W, C).
patch: A tuple with height and width of the correlation patch. A single
integer expands to identical height and width.
shift: A tuple of maximum vertical and horizontal displacement of
patches from x2 that are correlated with a single patch from x1.
A single integer expands to identical vertical and horizontal
displacement.
patch_step: A tuple of vertical and horizontal increments for advancing
the position of the correlation patch within the input image shape.
A single integer expands to identical vertical and horizontal
increments.
shift_step: A tuple of vertical and horizontal increments for advancing
the relative offset position within the shift range. A single
integer expands to identical vertical and horizontal increments.
padding: A tuple of top, bottom, left and right padding extent. A tuple
of two values yields identical top/bottom and left/right padding
from the first and second tuple value. A single integer expands to
indential padding extent for all sides.
channel_last: Last dimension is the channel (NHWC order) if True.

Returns:
~nnabla.Variable: N-D array with shape :math:(N, C_y, C_x, H_o, W_o) or :math:(N, H, W, C_y, C_x) if channel_last=True.

A spatial size of the output is calculated as

.. math::

A channel size of the ouptut is calculated as

.. math::

C_y = \frac{2 \times shift_v}{shift\_step_v} + 1.

:math:W_o and :math:C_x are the same calculation with differenct components.
"""
from .function_bases import patch_correlation as patch_correlation_base

if not len(x1.shape) == len(x2.shape) == 4:
raise ValueError("Both inputs x1 and x2 must have 4 dimensions.")

if not x1.shape == x2.shape:
raise ValueError("Both inputs x1 and x2 must have equal shape.")

if isinstance(patch, int):
patch = 2 * (patch,)
if isinstance(shift, int):
shift = 2 * (shift,)
if isinstance(patch_step, int):
patch_step = 2 * (patch_step,)
if isinstance(shift_step, int):
shift_step = 2 * (shift_step,)

if not channel_last:
if x1.shape[1] == 1:
x1 = reshape(x1, (x1.shape[0], *x1.shape[2:4], 1))
x2 = reshape(x2, (x2.shape[0], *x2.shape[2:4], 1))
else:
x1 = transpose(x1, (0, 2, 3, 1))
x2 = transpose(x2, (0, 2, 3, 1))

y = patch_correlation_base(x1, x2, patch, shift, patch_step, shift_step,

if not channel_last:
y = transpose(y, (0, 3, 4, 1, 2))

return y

def quantize_linear(x, scale, zero_point,
round_mode="HALF_AWAY_FROM_ZERO", narrow_range=False, dtype=np.int8):
r"""

Quantize linearly inputs with the scale and zero point.

.. math::

y = saturate(round(x / scale) + zero_point).

:math:saturate rage is determined by dtype and :math:round mode is selected
by round_mode. :math:zero_point is constrained by the dtype range and its values are
rounded by round_mode.

This function normally aligns with ONNX QuantizeLinear.

Args:
x (Variable): An input variable.
scale (Variable): Scale variable.
zero_point (Variable): Zero point variable.
round_mode (str): Rounding mode. HALF_AWAY_FROM_ZERO or HALF_TO_EVEN.
narrow_range (bool): If true, this function does not use the minimum quantized value. For
example, if dtype is int8 (the range is in [-128, 127]), the output range
is corrected in [-127, 127].
dtype (numpy.dtype): Data type for the output. Currently np.int8 or np.uint8 are supported.
"""
from .function_bases import quantize_linear as quantize_linear_base
int_dtype = dtypes.np_dtpye_to_int[dtype]
y = quantize_linear_base(x, scale, zero_point,
round_mode, narrow_range, int_dtype)
return y

def linspace(start, stop, num):
r"""
Generate a one-dimensional vector/tensor of size num whose values are evenly spaced from start to end, inclusive.

Args:
start(float): Start value.
stop(float): End value.
num(int): Size of the constructed vector/tensor.

Returns:
~nnabla.Variable: 1-D array with the generated values.
"""
from .function_bases import linspace as linspace_base

if not isinstance(num, int):
raise TypeError(
"'{}' object cannot be interpreted as an integer".format(type(num).__name__))
if num < 0:
raise ValueError(
"Number of samples, {}, must be non-negative.".format(num))

return linspace_base(start, stop, num)