# Copyright (c) 2017 Sony Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import OrderedDict
import nnabla as nn
import nnabla.functions as F
from nnabla.function import PythonFunction
from .backward_functions import registry


class GradEndFunction(PythonFunction):
    """This nnabla function is used at the very end of a computational graph in the grad function.

    It feeds an all-ones gradient seed into the gradient graph.
    """
    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False

    def setup_impl(self, inputs, outputs):
        i = inputs[0]
        o = outputs[0]
        o.reset_shape(i.shape, True)
    def forward_impl(self, inputs, outputs):
        # The seed gradient is a constant one, so fill the output with 1.0.
        outputs[0].data.fill(1.0)

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        # Nothing propagates back through the end of the gradient graph.
        inputs[0].grad.fill(0.0)
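
# A minimal sketch (hypothetical variables, not part of the public API) of what
# GradEndFunction does: it mirrors the implicit `grad=1` seed of
# Variable.backward().
#
#   y = some_graph_output            # an nn.Variable
#   dy = GradEndFunction()(y)        # dy has y's shape
#   dy.forward()                     # dy.d is now filled with 1.0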


class Grad(object):

    def __init__(self):
        pass

    def _force_list(self, x):
        if isinstance(x, list):
            return x
        elif hasattr(x, "__iter__"):
            return list(x)
        else:
            return [x]

    def _connect_on_gradient_graph(self, grad_vars, f):
        # 1. Accumulate gradients for variables used more than once; otherwise pass them through.
        vf_vb_map = grad_vars.pop(f)  # {VO_fwd: [VI_bwd]}
        grad_inputs = []
        for o in f.outputs:
            # Address `floating` variables, i.e., outputs which no function takes as input.
            # E.g., when dx, db, dg = dBN(...), (db, dg) are not used afterwards.
            v = vf_vb_map[o] if o in vf_vb_map else [0]
            if len(v) > 1:
                grad_inputs += [sum(v)]
            else:
                grad_inputs += v
        # 2. Look up the backward function.
        f_fwd_name = f.info.type_name
        if f_fwd_name not in registry:
            raise ValueError(
                "{} is not in the backward function registry".format(f_fwd_name))
        backward_func = registry[f_fwd_name]
        # 3. Connect the backward function on the gradient graph.
        grad_inputs = grad_inputs + f.inputs
        ctx = nn.get_current_context()
        with nn.context_scope(ctx):
            grad_outputs = backward_func(grad_inputs, **f.info.args)
        grad_outputs = self._force_list(grad_outputs)
        # 4. Register each grad_output as a grad_input of the corresponding parent function.
        for inp, grad_out in zip(f.inputs, grad_outputs):
            if grad_out is None:
                continue
            if inp.parent not in grad_vars:
                grad_vars[inp.parent] = OrderedDict()
            if inp not in grad_vars[inp.parent]:
                grad_vars[inp.parent][inp] = [grad_out]
            else:
                grad_vars[inp.parent][inp] += [grad_out]
        return grad_outputs
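
    # An illustrative sketch (hypothetical graph) of the grad_vars map handled
    # above. For y = f(x) consumed by both g and h, after the backward
    # functions of g and h are connected, grad_vars holds
    #
    #   {f: {y: [dy_g, dy_h]}}
    #
    # and the two entries are summed when f's own backward is connected.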

    def _get_corresponding_vars_on_graph(self, inputs, outputs):
        """
        This method binds the inputs which may be disconnected from a computation graph
        (e.g., trainable parameters obtained by nn.get_parameters()) to the actual inputs
        connected to a given computation graph traversable from outputs.
        """
        ret = [None for _ in range(len(inputs))]
        inp2id = {x: i for i, x in enumerate(inputs)}
        open = set()

        def dfs(node):
            if node in open or not node.parent:
                return
            for par_var in node.parent.inputs:
                if par_var in open:
                    continue
                id = inp2id.get(par_var, None)
                if id is not None:
                    ret[id] = par_var
                dfs(par_var)
                open.add(par_var)

        for output in outputs:
            dfs(output)
        return ret
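
    # An illustrative sketch (hypothetical names) of the binding above:
    #
    #   x = nn.Variable((2, 2))
    #   y = PF.affine(x, 2)        # creates the parameters affine/W, affine/b
    #   params = list(nn.get_parameters().values())
    #   # _get_corresponding_vars_on_graph(params, [y]) returns the graph
    #   # variables for the parameters reachable from y; inputs not reachable
    #   # from the outputs map to None.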

    def _get_children(self, wrt_inputs):
        """
        Return the set of variables reachable from `wrt_inputs` in the forward graph.
        """
        children = set()

        def dfs(node):
            if node in children:
                return
            for f in node.function_references:
                if f.info.type_name == "GradEndFunction":
                    continue
                for o in f.outputs:
                    if o in children:
                        continue
                    dfs(o)
                    children.add(o)

        for inp in wrt_inputs:
            if inp is None:
                continue
            dfs(inp)
        return children
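
    # An illustrative sketch (hypothetical graph): for x -> conv -> relu -> loss,
    # _get_children([x]) returns {conv_out, relu_out, loss}. Variables created by
    # GradEndFunction are skipped so the search stays on the forward graph.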

    def __call__(self, outputs, inputs, grad_outputs=None,
                 persistent_outputs=[], bind_grad_output=False):
        """
        The logic of this method is almost the same as that of visit_function_backward in the C++ layer.
        """
        # TODO: address that auto_forward is very slow. It may be python overhead, since the diff is small when the batch size is large.
        # TODO: address that auto_forward consumes lots of memory; need to call v.get_unlinked_variable()?
        # TODO: address that auto_forward consumes lots of memory; need to use NdArray as inputs?
        # Check outputs/inputs
        outputs = self._force_list(outputs)
        if not all([isinstance(o, nn.Variable) for o in outputs]):
            raise ValueError("Elements of `outputs` must be `nnabla.Variable`.")
        inputs = self._force_list(inputs)
        if not all([isinstance(i, nn.Variable) for i in inputs]):
            raise ValueError("Elements of `inputs` must be `nnabla.Variable`.")
        # Check grad_outputs
        if grad_outputs is None:
            grad_outputs = [None] * len(outputs)
        elif isinstance(grad_outputs, (int, float, np.ndarray, nn.NdArray)):
            grad_outputs = self._force_list(grad_outputs)
        elif isinstance(grad_outputs, list):
            if len(outputs) != len(grad_outputs):
                raise ValueError(
                    "Length of `grad_outputs` and length of `outputs` must be the same.")
            for i in range(len(outputs)):
                o = outputs[i]
                go = grad_outputs[i]
                if not isinstance(go, (type(None), int, float, np.ndarray, nn.NdArray, nn.Variable)):
                    raise ValueError("Element of `grad_outputs` must be "
                                     "in (`None`, `int`, `float`, `numpy.ndarray`, "
                                     "`nnabla.NdArray`, `nnabla.Variable`) or "
                                     "list of (`None`, `int`, `float`, `numpy.ndarray`, "
                                     "`nnabla.NdArray`, `nnabla.Variable`).\n"
                                     "type(grad_outputs[{}]) = {}".format(i, type(go)))
                elif isinstance(go, (np.ndarray, nn.NdArray, nn.Variable)) and go.shape != o.shape:
                    raise ValueError("Shapes of outputs and grad_outputs must be the same.\n"
                                     "output[{}]({}) != grad_output[{}]({})".format(i, o.shape, i, go.shape))
        # Check persistent_outputs
        if len(persistent_outputs) != 0 and len(outputs) != len(persistent_outputs):
            raise ValueError("Length of `outputs` and `persistent_outputs` "
                             "must be the same, except when "
                             "`persistent_outputs` is empty.")
        # Make the outputs persistent since they are typically losses to be monitored.
        persistent_outputs = [True] * len(outputs) \
            if persistent_outputs == [] else persistent_outputs
        for o, p in zip(outputs, persistent_outputs):
            o.persistent = p
# Open list of next search candidate
ids = {}
def get_id(func):
if func not in ids.keys():
size = len(ids)
ids[func] = size
return size
return ids[func]
open = set()
# Map for grad_variables consumed on the gradient graph.
# End is the special case where d_o = end_f(o) and map[end_f] = {o: [d_o]}
grad_vars = OrderedDict() # {F_fwd: {VO_fwd: [VI_bwd]}}
        # Set grad_outputs
        for i in range(len(outputs)):
            o = outputs[i]
            go = grad_outputs[i]
            if go is None:
                output = o
            elif isinstance(go, (int, float, np.ndarray)):
                go = nn.Variable(o.shape).apply(d=go, need_grad=False)
                output = o * go
            elif isinstance(go, nn.NdArray):
                go = nn.Variable(o.shape).apply(data=go, need_grad=False)
                output = o * go
            elif isinstance(go, nn.Variable):
                output = o * go
            func = output.parent
            open.add((-output.rank, get_id(func), func))
            # Connect the graph and its gradient graph.
            grad_output = GradEndFunction()(output).apply(need_grad=False)
            grad_vars[func] = OrderedDict({output: [grad_output]})
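
        # An illustrative sketch (hypothetical values) of the seeding above: with
        # grad_outputs=2.0 for a scalar loss L, the seed becomes output = L * 2.0,
        # and GradEndFunction feeds d(output) = 1, so every dL/dx is scaled by 2.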
        # Return grads, but replace the input parameters with the variables
        # connected to the given graph.
        wrt_inputs = self._get_corresponding_vars_on_graph(inputs, outputs)
        grads = [None] * len(wrt_inputs)
        child_nodes = self._get_children(wrt_inputs)
        wrt_inputs = [nn.Variable() if x is None else x for x in wrt_inputs]
        # Expand the graph to its gradient graph.
        while len(open) != 0:
            # A Python set is unordered, so emulate a priority queue by sorting;
            # popping index 0 takes the function with the largest rank first.
            open = sorted(open)
            rank_func = open.pop(0)
            open = set(open)
            f = rank_func[2]
            if not f.need_grad:
                continue
            # Connect variables on the gradient graph.
            grad_outputs = self._connect_on_gradient_graph(grad_vars, f)
            # Collect grads w.r.t. the requested inputs.
            for inp, grad_out in zip(f.inputs, grad_outputs):
                if inp not in wrt_inputs or not inp.need_grad or grad_out is None:
                    continue
                idx = wrt_inputs.index(inp)
                if grads[idx] is None:
                    grads[idx] = grad_out
                else:
                    grads[idx] = grads[idx] + grad_out  # accumulate at a leaf
                if bind_grad_output:
                    inp.grad = grads[idx].data
            # Propagate down.
            for inp, grad_out in zip(f.inputs, grad_outputs):
                if inp not in child_nodes or not inp.need_grad or grad_out is None:
                    continue
                p_i = inp.parent
                if not p_i:
                    continue
                open.add((-p_i.rank, get_id(p_i), p_i))
        # Replace any remaining None grad with a zero Variable.
        for i in range(len(grads)):
            if grads[i]:
                continue
            grads[i] = F.constant(0, wrt_inputs[i].shape)
        return grads
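
# A minimal usage sketch (hypothetical variables) of the Grad class; nn.grad
# below is the public entry point that wraps it:
#
#   x = nn.Variable((2, 3), need_grad=True)
#   y = F.sin(x)
#   dx = Grad()([y], [x])[0]   # equivalent to nn.grad([y], [x])[0]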


def grad(outputs, inputs, grad_outputs=None, persistent_outputs=[], bind_grad_output=False):
r"""Gradient function for the outputs with respect to the inputs.
The grad function computes the sum of gradients of the outputs w.r.t. the inputs.
.. math::
g_i = \sum_{j} {\frac{\partial y_j}{\partial x_i}},
:math:`y_j` is each output, :math:`x_i` is each input, and :math:`g_i` is the sum of the gradient of :math:`y_j` w.r.t. :math:`x_i` over all :math:`j`.
Args:
outputs (list of :obj:`~nnabla.Variable` or :obj:`~nnabla.Variable`): Outputs of the differentiable function.
inputs (list of :obj:`~nnabla.Variable` or :obj:`~nnabla.Variable`): Inputs w.r.t. which the gradients of outputs are computed.
grad_outputs (None, scalar, :obj:`numpy.ndarray`, :obj:`nnabla.NdArray`, or list of scalar, :obj:`numpy.ndarray`, or :obj:`nnabla.NdArray`, ): Gradient outputs corresponding to outputs. This is same as the grad argument of :meth:`~nnabla.Variable.backward`. Default is None, so 1 is used as the in-coming gradient at the very beginning of the Variable in the gradient graph.
persistent_outputs (list of `bool`): Outputs become persistent accordingly. If not specified, all outputs become persistent.
bind_grad_output (`bool`): Bind data to grad of input variable. This is useful for the case where one wants to use the gradient graph for training a neural network using the first-order gradients only. Default is False.
Returns
List of :obj:`~nnabla.Variable`.
If the backpropagation does not reach input(s), the corresponding returned value(s) are `zero`
(i.e., the gradients w.r.t. inputs are zero) and not connected as a part of the gradient graph.
    Example (Gradient Penalty):

    .. code-block:: python

        import nnabla as nn
        import nnabla.functions as F
        import nnabla.parametric_functions as PF
        import numpy as np
        from nnabla.ext_utils import get_extension_context

        # Context
        extension_module = "cudnn"
        ctx = get_extension_context(extension_module)
        nn.set_default_context(ctx)

        # Input and label
        x = nn.Variable.from_numpy_array(np.random.randn(4, 3, 32, 32))
        y = nn.Variable.from_numpy_array(np.random.randint(0, 10, 4).reshape(4, 1))

        # Network
        h = PF.convolution(x, 8, (3, 3), (1, 1), name="conv1")
        h = F.relu(h)
        h = F.max_pooling(h, (2, 2))
        h = PF.convolution(h, 16, (3, 3), (1, 1), name="conv2")
        h = F.relu(h)
        h = F.max_pooling(h, (2, 2))
        p = PF.affine(h, 10, name="pred")
        loss = F.mean(F.softmax_cross_entropy(p, y))

        # Grad
        outputs = [loss]
        inputs = nn.get_parameters().values()
        grads = nn.grad(outputs, inputs)  # gradients of the parameters

        # Backward of the outputs w.r.t. the parameters by constraining the gradient norms.
        t = 0  # or 1
        gp = sum([(F.sum(g ** 2) ** 0.5 - t) ** 2 for g in grads])
        loss += gp
        loss.forward()
        loss.backward()
    Example (Higher-order Gradients):

    .. code-block:: python

        import nnabla as nn
        import nnabla.functions as F
        import numpy as np

        x = nn.Variable.from_numpy_array(np.random.randn(2, 2)).apply(need_grad=True)
        x.grad.zero()
        y = F.sin(x)

        def grad(y, x, n=1):
            dx = [y]
            for _ in range(n):
                dx = nn.grad([dx[0]], [x])
            return dx[0]

        dnx = grad(y, x, n=10)
        dnx.forward()
        print(np.allclose(-np.sin(x.d), dnx.d))
        dnx.backward()
        print(np.allclose(-np.cos(x.d), x.g))

        # Show the supported status for each function
        from nnabla.backward_functions import show_registry
        show_registry()
"""
grad_outputs = Grad()(outputs, inputs, grad_outputs=grad_outputs,
persistent_outputs=persistent_outputs,
bind_grad_output=bind_grad_output)
return grad_outputs
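
# A quick numerical cross-check (a sketch, assuming a CPU context): nn.grad
# should agree with Variable.backward for first-order gradients.
#
#   import numpy as np
#   import nnabla as nn
#   import nnabla.functions as F
#   x = nn.Variable.from_numpy_array(np.random.randn(2, 3)).apply(need_grad=True)
#   y = F.sum(F.tanh(x))
#   gx = nn.grad([y], [x])[0]
#   gx.forward()
#   x.grad.zero()
#   y.forward()
#   y.backward()
#   assert np.allclose(gx.d, x.g)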