Source code for nnabla.utils.qnn

# Open question
# TODO: save in a nice way, the save should be of nnp format
# TODO: address the data format (since int8x32 convolution is normally fastest among large shape)
# TODO: Input data quantize, e.g, input= image do not update scale (min/max) in recording, use 1 / 255 and remove F.mul2_scalar(255)


"""
QNN-specific Functions
"""
import nnabla as nn
import nnabla.functions as F
import numpy as np
import nnabla_ext
import nnabla.experimental.graph_converters as GC

from enum import Enum
from nnabla.function import PythonFunction
from nnabla.initializer import ConstantInitializer

__round_methods__ = {
    'CEIL': np.ceil,
    'ROUND': np.round,
    'FLOOR': np.floor,
    'NOTROUND': None
}


def _param_exits(param):
    return True if nn.parameter.get_parameter(param) else False


class MinMaxMinMaxRecorder(PythonFunction):
    """
    MinMaxMinMaxRecorder records the min and max of the batch over the training iterations.
    """

    def __init__(self, ctx, training=True):
        super(MinMaxMinMaxRecorder, self).__init__(ctx)
        self.training = training

    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def setup_impl(self, inputs, outputs):
        assert len(inputs) == 3, "len(inputs) must be 3; data, min, max."
        x = inputs[0]
        m = inputs[1]
        M = inputs[2]
        assert m.ndim == x.ndim and M.ndim == x.ndim, \
            "ndim of min and max must be same as ndim of data."
        assert np.prod(m.shape) == 1 and np.prod(M.shape) == 1, \
            "Any dimenstion of the shape of min and max must be 1."
        y = outputs[0]
        y.reset_shape(x.shape, True)
        # inplace
        # y.data = x.data

    def forward_impl(self, inputs, outputs):
        x = inputs[0].data
        m = inputs[1].data
        M = inputs[2].data
        y = outputs[0].data
        y.copy_from(x)

        if not self.training:
            return
        mb = F.min(x, keepdims=True)
        Mb = F.max(x, keepdims=True)
        F.minimum2(m, mb, outputs=[m])
        F.maximum2(M, Mb, outputs=[M])

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        dx = inputs[0].grad
        dy = outputs[0].grad
        dy.copy_from(dx)

        if propagate_down[0]:
            if accum[0]:
                dx += dy
            else:
                dx.copy_from(dy)

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False


def minmax_minmax_recorder(x, m, M, training=True):
    ctx = nn.get_current_context()
    func = MinMaxMinMaxRecorder(ctx, training)
    return func(x, m, M)


class MinMaxMinMaxRecorderCallback(object):
    def __init__(self):
        self._function = minmax_minmax_recorder

    def name(self):
        n = self.__class__.__name__
        return n[:n.rfind('Callback')]

    def __call__(self, x, axes=[1], training=True, name=''):
        shape = [1] * x.ndim
        m = nn.parameter.get_parameter_or_create('m-{}'.format(name), shape,
                                                 ConstantInitializer())
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape,
                                                 ConstantInitializer())
        y = self._function(x, m, M, training=training)
        return y

    def get_scale_zeropoint(x, axes=[1], narrow_range=False, round_method='NOTROUND', name=''):
        # If recorder is not added before Q/DQ, return neither scale nor zp
        if not _param_exits('m-{}'.format(name)) and not _param_exits('M-{}'.format(name)):
            return None, None

        shape = [1] * x.ndim
        m = nn.parameter.get_parameter_or_create('m-{}'.format(name), shape)
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape)

        n_bits = 8
        im = - 2 ** (n_bits - 1)
        iM = 2 ** (n_bits - 1) - 1
        de = (iM - im) if not narrow_range else (iM - (im + 1))

        # MinMaxMinMax, 1e-24 is a small experience value to avoid zero scale
        scale = np.maximum((M.d - m.d), 1e-24) / de

        # round
        _round = __round_methods__[round_method]
        scale = 2 ** (_round(np.log(scale) / np.log(2))
                      ) if _round else scale  # pow2 scale

        # set zeropoint to zero
        zp = np.round((np.zeros_like(m.d) / scale).astype(np.int8))
        zp = nn.parameter.get_parameter_or_create('zeropoint-{}'.format(name),
                                                  zp.shape, zp, False)

        scale = nn.parameter.get_parameter_or_create('scale-{}'.format(name),
                                                     scale.shape, scale, False)

        return scale, zp


class AbsMaxRecorder(PythonFunction):
    """
    AbsMaxRecorder records the max of the aboslute of the batch over the training iterations.
    """

    def __init__(self, ctx, training=True):
        super(AbsMaxRecorder, self).__init__(ctx)
        self.training = training

    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def setup_impl(self, inputs, outputs):
        assert len(inputs) == 2, "len(inputs) must be 2; data, abs_max."
        x = inputs[0]
        M = inputs[1]
        assert M.ndim == x.ndim, \
            "ndim of abs_max must be same as ndim of data."
        assert np.prod(M.shape) == 1, \
            "Any dimenstion of the shape of min and max must be 1."
        y = outputs[0]
        y.reset_shape(x.shape, True)
        # inplace
        # y.data = x.data

    def forward_impl(self, inputs, outputs):
        x = inputs[0].data
        M = inputs[1].data
        y = outputs[0].data
        y.copy_from(x)

        if not self.training:
            return
        Mb = F.max(F.abs(x), keepdims=True)
        F.maximum2(M, Mb, outputs=[M])

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        dx = inputs[0].grad
        dy = outputs[0].grad
        dy.copy_from(dx)

        if propagate_down[0]:
            if accum[0]:
                dx += dy
            else:
                dx.copy_from(dy)

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False


def abs_max_recorder(x, M, training=True):
    ctx = nn.get_current_context()
    func = AbsMaxRecorder(ctx, training)
    return func(x, M)


class AbsMaxRecorderCallback(object):
    def __init__(self):
        self._function = abs_max_recorder

    def name(self):
        n = self.__class__.__name__
        return n[:n.rfind('Callback')]

    def __call__(self, x, axes=[1], training=True, name=''):
        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape,
                                                 ConstantInitializer())

        y = self._function(x, M, training=training)
        return y

    def get_scale_zeropoint(x, axes=[1], narrow_range=False, round_method='NOTROUND', name=''):
        # If recorder is not added before Q/DQ, return neither scale nor zp
        if not _param_exits('M-{}'.format(name)):
            return None, None

        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape)

        n_bits = 8
        im = - 2 ** (n_bits - 1)
        iM = 2 ** (n_bits - 1) - 1
        de = (iM - im) if not narrow_range else (iM - (im + 1))
        scale = (2 * M.d) / de  # AbsMax

        # round
        _round = __round_methods__[round_method]
        scale = 2 ** (_round(np.log(scale) / np.log(2))
                      ) if _round else scale  # pow2 scale

        # set zeropoint to zero
        zp = np.round((np.zeros_like(M.d) / scale).astype(np.int8))
        zp = nn.parameter.get_parameter_or_create('zeropoint-{}'.format(name),
                                                  zp.shape, zp, False)

        scale = nn.parameter.get_parameter_or_create('scale-{}'.format(name),
                                                     scale.shape, scale, False)

        return scale, zp


class MinMaxMvaRecorder(PythonFunction):
    """
    MinMaxMvaRecorder records the moving average of the min and max of the batch over the training iterations.
    """

    def __init__(self, ctx, decay=0.99, training=True):
        super(MinMaxMvaRecorder, self).__init__(ctx)

        self.decay = decay
        self.training = training

    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def setup_impl(self, inputs, outputs):
        assert len(inputs) == 3, "len(inputs) must be 3; data, min, max."
        x = inputs[0]
        m = inputs[1]
        M = inputs[2]
        assert m.ndim == x.ndim and M.ndim == x.ndim, \
            "ndim of min and max must be same as ndim of data."
        assert np.prod(m.shape) == 1 and np.prod(M.shape) == 1, \
            "Any dimenstion of the shape of min and max must be 1."
        y = outputs[0]
        y.reset_shape(x.shape, True)
        # inplace
        # y.data = x.data

    def forward_impl(self, inputs, outputs):
        x = inputs[0].data
        m = inputs[1].data
        M = inputs[2].data
        y = outputs[0].data
        y.copy_from(x)

        if not self.training:
            return
        mb = F.min(x, keepdims=True)
        Mb = F.max(x, keepdims=True)
        F.identity(self.decay * m + (1 - self.decay) * mb, outputs=[m])
        F.identity(self.decay * M + (1 - self.decay) * Mb, outputs=[M])

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        dx = inputs[0].grad
        dy = outputs[0].grad
        dy.copy_from(dx)

        if propagate_down[0]:
            if accum[0]:
                dx += dy
            else:
                dx.copy_from(dy)

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False


def minmax_mva_recorder(x, m, M, decay=0.99, training=True):
    ctx = nn.get_current_context()
    func = MinMaxMvaRecorder(ctx, decay, training)
    return func(x, m, M)


class MinMaxMvaRecorderCallback(object):
    def __init__(self):
        self._function = minmax_mva_recorder

    def name(self):
        n = self.__class__.__name__
        return n[:n.rfind('Callback')]

    def __call__(self, x, axes=[1], training=True, name=''):
        shape = [1] * x.ndim
        m = nn.parameter.get_parameter_or_create('m-{}'.format(name), shape,
                                                 ConstantInitializer())
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape,
                                                 ConstantInitializer())

        y = self._function(x, m, M, training=training)
        return y

    def get_scale_zeropoint(x, axes=[1], narrow_range=False, round_method='NOTROUND', name=''):
        # If recorder is not added before Q/DQ, return neither scale nor zp
        if not _param_exits('m-{}'.format(name)) and not _param_exits('M-{}'.format(name)):
            return None, None

        shape = [1] * x.ndim
        m = nn.parameter.get_parameter_or_create('m-{}'.format(name), shape)
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape)

        n_bits = 8
        im = - 2 ** (n_bits - 1)
        iM = 2 ** (n_bits - 1) - 1
        de = (iM - im) if not narrow_range else (iM - (im + 1))

        # MinMaxMva, 1e-24 is a small experience value to avoid zero scale
        scale = np.maximum((M.d - m.d), 1e-24) / de

        # round
        _round = __round_methods__[round_method]
        scale = 2 ** (_round(np.log(scale) / np.log(2))
                      ) if _round else scale  # pow2 scale

        # set zeropoint to zero
        zp = np.round((np.zeros_like(m.d) / scale).astype(np.int8))
        zp = nn.parameter.get_parameter_or_create('zeropoint-{}'.format(name),
                                                  zp.shape, zp, False)

        scale = nn.parameter.get_parameter_or_create('scale-{}'.format(name),
                                                     scale.shape, scale, False)

        return scale, zp


class MaxMaxRecorder(PythonFunction):
    """
    MaxMaxRecorder records the max of the batch over the training iterations.
    """

    def __init__(self, ctx, training=True):
        super(MaxMaxRecorder, self).__init__(ctx)
        self.training = training

    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def setup_impl(self, inputs, outputs):
        assert len(inputs) == 2, "len(inputs) must be 2; data, max."
        x = inputs[0]
        M = inputs[1]
        assert M.ndim == x.ndim, \
            "ndim of max must be same as ndim of data."
        assert np.prod(M.shape) == 1, \
            "Any dimenstion of the shape of max must be 1."
        y = outputs[0]
        y.reset_shape(x.shape, True)
        # inplace
        # y.data = x.data

    def forward_impl(self, inputs, outputs):
        x = inputs[0].data
        M = inputs[1].data
        y = outputs[0].data
        y.copy_from(x)

        if not self.training:
            return
        Mb = F.max(x, keepdims=True)
        F.maximum2(M, Mb, outputs=[M])

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        dx = inputs[0].grad
        dy = outputs[0].grad
        dy.copy_from(dx)

        if propagate_down[0]:
            if accum[0]:
                dx += dy
            else:
                dx.copy_from(dy)

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False


def max_max_recorder(x, M, training=True):
    ctx = nn.get_current_context()
    func = MaxMaxRecorder(ctx, training)
    return func(x, M)


class MaxMaxRecorderCallback(object):
    def __init__(self):
        self._function = max_max_recorder

    def name(self):
        n = self.__class__.__name__
        return n[:n.rfind('Callback')]

    def __call__(self, x, axes=[1], training=True, name=''):
        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape,
                                                 ConstantInitializer())

        y = self._function(x, M, training=training)
        return y

    def get_scale_zeropoint(x, axes=[1], narrow_range=False, round_method='NOTROUND', name=''):
        # If recorder is not added before Q/DQ, return neither scale nor zp
        if not _param_exits('M-{}'.format(name)):
            return None, None

        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape)

        n_bits = 8
        im = - 2 ** (n_bits - 1)
        iM = 2 ** (n_bits - 1) - 1
        de = (iM - im) if not narrow_range else (iM - (im + 1))
        scale = (2 * M.d) / de  # MaxMax

        # round
        _round = __round_methods__[round_method]
        scale = 2 ** (_round(np.log(scale) / np.log(2))
                      ) if _round else scale  # pow2 scale

        # set zeropoint to zero
        zp = np.round((np.zeros_like(M.d) / scale).astype(np.int8))
        zp = nn.parameter.get_parameter_or_create('zeropoint-{}'.format(name),
                                                  zp.shape, zp, False)

        scale = nn.parameter.get_parameter_or_create('scale-{}'.format(name),
                                                     scale.shape, scale, False)

        return scale, zp


class MaxMvaRecorder(PythonFunction):
    """
    MaxMvaRecorder records the moving average of the max of the batch over the training iterations.
    """

    def __init__(self, ctx, decay=0.99, training=True):
        super(MaxMvaRecorder, self).__init__(ctx)

        self.decay = decay
        self.training = training

    @property
    def name(self):
        return self.__class__.__name__

    def min_outputs(self):
        return 1

    def setup_impl(self, inputs, outputs):
        assert len(inputs) == 2, "len(inputs) must be 2; data, max."
        x = inputs[0]
        M = inputs[1]
        assert M.ndim == x.ndim, \
            "ndim of max must be same as ndim of data."
        assert np.prod(M.shape) == 1, \
            "Any dimenstion of the shape of max must be 1."
        y = outputs[0]
        y.reset_shape(x.shape, True)
        # inplace
        # y.data = x.data

    def forward_impl(self, inputs, outputs):
        x = inputs[0].data
        M = inputs[1].data
        y = outputs[0].data
        y.copy_from(x)

        if not self.training:
            return
        Mb = F.max(x, keepdims=True)
        F.identity(self.decay * M + (1 - self.decay) * Mb, outputs=[M])

    def backward_impl(self, inputs, outputs, propagate_down, accum):
        dx = inputs[0].grad
        dy = outputs[0].grad
        dy.copy_from(dx)

        if propagate_down[0]:
            if accum[0]:
                dx += dy
            else:
                dx.copy_from(dy)

    def grad_depends_output_data(self, i, o):
        return False

    def grad_depends_input_data(self, i, j):
        return False


def max_mva_recorder(x, M, decay=0.99, training=True):
    ctx = nn.get_current_context()
    func = MaxMvaRecorder(ctx, decay, training)
    return func(x, M)


class MaxMvaRecorderCallback(object):
    def __init__(self):
        self._function = max_mva_recorder

    def name(self):
        n = self.__class__.__name__
        return n[:n.rfind('Callback')]

    def __call__(self, x, axes=[1], training=True, name=''):
        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape,
                                                 ConstantInitializer())

        y = self._function(x, M, training=training)
        return y

    def get_scale_zeropoint(x, axes=[1], narrow_range=False, round_method='NOTROUND', name=''):
        # If recorder is not added before Q/DQ, return neither scale nor zp
        if not _param_exits('M-{}'.format(name)):
            return None, None

        shape = [1] * x.ndim
        M = nn.parameter.get_parameter_or_create('M-{}'.format(name), shape)

        n_bits = 8
        im = - 2 ** (n_bits - 1)
        iM = 2 ** (n_bits - 1) - 1
        de = (iM - im) if not narrow_range else (iM - (im + 1))
        scale = (2 * M.d) / de  # MaxMva

        # round
        _round = __round_methods__[round_method]
        scale = 2 ** (_round(np.log(scale) / np.log(2))
                      ) if _round else scale  # pow2 scale

        # set zeropoint to zero
        zp = np.round((np.zeros_like(M.d) / scale).astype(np.int8))
        zp = nn.parameter.get_parameter_or_create('zeropoint-{}'.format(name),
                                                  zp.shape, zp, False)

        scale = nn.parameter.get_parameter_or_create('scale-{}'.format(name),
                                                     scale.shape, scale, False)

        return scale, zp


class PrecisionMode(Enum):
    # Quantized functions only
    QNN = 0
    # Quantize/Dequantize only
    SIM_QNN = 1
    # Quantized functions as much as possible;
    # otherwise Quantize/Dequantize (simulated quantization) are used
    MIXED_QNN = 2


class QNNState(Enum):
    # Floating network
    NON_QNN = 0
    # Recording min/max at some weights and activations
    RECORDING = 1
    # Quantized network, some layers are quantized, depending on layer configuration
    TRAINING = 2
    # TODO: Deployment network, quantized weights might be fused and saved as int8
    DEPLOYMENT = 3


# TODO : elaborate more


[docs]class QATConfig:
    #: Extension Context. 'cpu', 'cuda' or 'cudnn'
    ext_name = "cudnn"

    #: Use zero-point (asymmetric) or not use (symmetric)
    zero_point = False

    #: Precision
    dtype = np.int8
    precision_mode = PrecisionMode.SIM_QNN

    #: Enable channel last (channel_first is only supported now)
    channel_last = False

    # (TODO: near future, normally used for weights)
    #: Enable channel-wise quantization
    channel_wise = False

    #: Step start to record
    niter_to_recording = 0

    #: Step start to QAT.
    #: The number of steps between recording and training should be greater than the number of steps of one epoch training.
    niter_to_training = -1

    #: Coerce to power-of-2 scale when transiting from the recording graph
[docs]    class RoundingMethod(Enum):
        """
        Round method of scale
        """
        #: round up.
        #: e.g. ceil(9.4) = 10
        CEIL = 'CEIL'  # round up
        #: round.
        #: e.g. round(9.4) = 9, round(9.5) = 10
        ROUND = 'ROUND'
        #: round down.
        #: e.g. floor(9.5) = 9
        FLOOR = 'FLOOR'  # round down
        #: not round
        NOTROUND = 'NOTROUND'

    #: Member of :obj:`nnabla.utils.qnn.QATConfig.RoundingMethod`.
    #: Round the scale to power-of-2.
    #: If you want to deploy the model with tensorrt, please enable this.
    pow2 = RoundingMethod.ROUND

    #: Narrow the lower-bound (e.g., when in int8, -128 -> -127)
    narrow_range = False

    #: Round mode of quantize layer
    round_mode = 'HALF_TO_EVEN'

    # (TODO: decide by experiments for ImageNet classification and super resolution task)

    #: Enable Batch Normalization Folding.
    #: Note that sometimes this can cause the training become unstable.
    bn_folding = False

    #: Enable Batch Normalization Self-Folding.
    #: Note that sometimes this can cause the training become unstable.
    bn_self_folding = False

    #: One of :obj:`nnabla.utils.qnn.MinMaxMinMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.AbsMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MinMaxMvaRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MaxMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MaxMvaRecorderCallback`
    #: Recorder of weight
    recorder_weight = MinMaxMinMaxRecorderCallback

    #: One of :obj:`nnabla.utils.qnn.MinMaxMinMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.AbsMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MinMaxMvaRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MaxMaxRecorderCallback`,
    #: :obj:`nnabla.utils.qnn.MaxMvaRecorderCallback`
    #: Recorder of activation
    recorder_activation = MaxMvaRecorderCallback

    #: list of nnabla function names.
    #: Recording layers.
    #: If empty, add recoders to all layers. Otherwise, only add recoders to functions in record_layers.
    record_layers = []

[docs]    class RecorderPosition(Enum):
        """
        Position to add recorder for function.
        """

        #: Add recoder only before a function
        BEFORE = 0

        #: Add recoder before/after a function
        BOTH = 1

    #: Member of :obj:`nnabla.utils.qnn.QATConfig.RecorderPosition`.
    #: Recorder position
    recorder_position = RecorderPosition.BEFORE

    #: List of nnabla function name.
    #: Skip quantizing inputs layers of network
    skip_inputs_layers = ['Convolution', 'Deconvolution']

    #: List of nnabla function name.
    #: Skip quantizing outputs layers of network
    skip_outputs_layers = ['Affine']

    #: QAT Learning_rate = NonQNN Learning_rate * learning_rate_scale.
    #: Recommend setting it to 0.1 or 0.01
    learning_rate_scale = 0.1

    #: Skip quantizing bias of Affine and bias of the Convolution function family
    skip_bias = False


[docs]class QATTensorRTConfig(QATConfig):
    pow2 = QATConfig.RoundingMethod.ROUND
    bn_folding = True
    bn_self_folding = True
    record_layers = ["Convolution", "Deconvolution",
                     "Affine", "BatchMatmul", "ReLU"]
    record_position = QATConfig.RecorderPosition.BEFORE


qat_default_config = QATTensorRTConfig()


class FunctionsRankRecorder(object):
    def __init__(self, functions_only_for_training=['ImageAugmentation', 'Dropout']):
        self.functions_only_for_training = functions_only_for_training
        self.rank = 0
        self.functions_ranks = {}

    def __call__(self, f):
        if f.info.type_name not in self.functions_only_for_training:
            self.functions_ranks[f] = self.rank
            self.rank += 1


[docs]class QATScheduler:
    """
    Scheduler for quantization aware training.

    Args:
      config (:obj:`QATConfig`): Quantization-Aware-Training Configuration
      solver (:obj:`nnabla.solver.Solver`): Neural Network Solver

    Example:

        .. code-block:: python

            from nnabla.utils.qnn import QATScheduler, QATConfig, PrecisionMode

            # Set configuration
            config = QATConfig()
            config.bn_folding = True
            config.bn_self_folding = True
            config.channel_last = False
            config.precision_mode = PrecisionMode.SIM_QNN
            config.niter_to_recording = 1
            config.niter_to_training = 500

            qat_scheduler = QATScheduler(config=config, solver=solver)

            # convert graph to enable quantization aware training.
            qat_scheduler(pred) # pred is the output variable of training network
            qat_scheduler(vpred, training=False) # vpred is the output variable of evaluation network

            # Training loop
            for i in range(training_step):
                qat_scheduler.step()

                # Your training code here

            # save quantized nnp
            qat_scheduler.save('qnn.np', vimage, deploy=False) # vimage is the input variable of network
    """

    # set default config
    def __init__(self, config=qat_default_config, solver=None):
        """
        Args:
          config (:obj:`QATConfig`):
          solver (:obj: 'nnabla.solvers.Solver'): Neural Network Solver

        """
        self.config = config
        if config.niter_to_training <= 0 or config.niter_to_recording < 0:
            raise ValueError(
                'Please set niter_to_recording and niter_to_training correctly! niter_to_recording should be greater than or equal to 0. niter_to_training should be greater than 0')
        if (config.niter_to_training - config.niter_to_recording) <= 0:
            raise ValueError(
                'Please set niter_to_recording and niter_to_training correctly! The number of steps between recording and training should be greater than the number of steps of one epoch training.')
        self.solver = solver
        self.counter = 0
        self.state = QNNState.NON_QNN
        self.registry = []  # [(nn.Variable, training)]

    def __call__(self, pred, training=True):
        """
        Wrap the network to quantized.

        Args:
          pred (:obj:`nnabla.Variable` or list of :obj:`nnabla.Variable`): Network output; the output of the original computation graph to be quantized.
        """
        # TODO: address list case (e.g., multiple outputs)
        self.registry.append((pred, training))

    def _register_params(self, solver):
        """
        Re-register parameters to `solver`
        """
        if not solver:
            return

        if (not self.config.bn_folding) and (not self.config.bn_self_folding):
            return

        solver.set_parameters(nn.get_parameters(grad_only=True))

    def _set_qat_learning_rate(self):
        self.solver.set_learning_rate(self.solver.learning_rate()
                                      * self.config.learning_rate_scale)

    def _fold_bn(self, pred):
        qpred_prev = pred
        # BN folding & BN self folding
        modifiers = [] if not self.config.bn_folding else [GC.BatchNormalizationFoldingModifier(
                opposite=False, channel_last=self.config.channel_last), GC.BatchNormalizationFoldingModifier(
                opposite=True, channel_last=self.config.channel_last)]
        modifiers = modifiers + \
            [GC.BatchNormalizationSelfFoldingModifier(
            )] if self.config.bn_self_folding else modifiers
        if len(modifiers) > 0:
            # expand fused_batch_normalization if BN folding or BN self folding is enabled.
            modifiers.insert(0, GC.UnfusedBatchNormalizationModifier())
            qpred_without_bn = GC.GraphConverter(
                modifiers).convert(qpred_prev)
            qpred_prev.rewire_on(qpred_without_bn)
        return qpred_prev

    def _clear_memory_cache(self):
        if self.config.ext_name in ["cuda", "cudnn"]:
            nnabla_ext.cuda.clear_memory_cache()

    def _schedule_to_recording(self):
        for i, elm in enumerate(self.registry):
            pred, training = elm
            qpred_prev = pred
            qpred_prev = self._fold_bn(qpred_prev)

            # Collect functions rank
            rank_recorder = FunctionsRankRecorder()
            qpred_prev.visit(rank_recorder)

            qpred_curr = GC.GraphConverter([
                GC.QuantizeNonQNNToRecordingModifier(
                    rank_recorder.functions_ranks, config=self.config, training=training)]).convert(qpred_prev)
            qpred_prev.rewire_on(qpred_curr)
            qpred_prev.need_grad = False
            self._register_params(self.solver)
            self.registry[i] = (qpred_prev, training)
            self.state = QNNState.RECORDING
            print(
                'QNNState.NON_QNN -> QNNState.RECORDING: graph={}'.format(qpred_prev))

    def _schedule_to_training(self):
        for i, elm in enumerate(self.registry):
            pred, training = elm
            qpred_prev = pred

            # Remove recorder
            modifiers = []
            modifiers.append(GC.RemoveFunctionModifier(
                rm_funcs=[self.config.recorder_activation().name(),
                          self.config.recorder_weight().name()]))
            qpred_noqnn = GC.GraphConverter(modifiers).convert(qpred_prev)
            qpred_prev.rewire_on(qpred_noqnn)

            # Collect functions rank
            rank_recorder = FunctionsRankRecorder()
            qpred_prev.visit(rank_recorder)

            # Recording to training
            qpred_curr = GC.GraphConverter([
                GC.QuantizeRecordingToTrainingModifier(rank_recorder.functions_ranks, config=self.config)]).convert(
                qpred_prev)
            qpred_prev.rewire_on(qpred_curr)
            self._register_params(self.solver)
            self._set_qat_learning_rate()
            self.registry[i] = (qpred_prev, training)
            self.state = QNNState.TRAINING
            print(
                'QNNState.RECORDING -> QNNState.TRAINING: graph={}'.format(qpred_prev))

[docs]    def step(self):
        """
        Step in the state of QNN. According to the number of iterations in config.
        """

        # TODO: there is other patterns
        # TODO: address list case (e.g., multiple outputs)
        if self.counter == self.config.niter_to_recording:
            self._clear_memory_cache()
            self._schedule_to_recording()

        elif self.counter == self.config.niter_to_training:
            self._clear_memory_cache()
            self._schedule_to_training()

        self.counter += 1

[docs]    def save(self, fname, inputs, batch_size=1, net_name='net', deploy=False):
        """
        Save QAT network model to NNP file as default.

        Args:
          fname (str): NNP file name.
          inputs (:obj:`nnabla.Variable` or list of :obj:`nnabla.Variable`): Network inputs variables.
          batch_size (int): batch size.
          net_name (str): network name.
          deploy (bool): Whether to apply QNN deployment conversion. deploy=True is not supported yet.

        Returns:
          None
        """

        def _force_list(o):
            if isinstance(o, (tuple)):
                return list(o)
            if not isinstance(o, (list)):
                return [o]
            return o

        for i, elm in enumerate(self.registry):
            pred, training = elm
            if deploy:
                assert self.state == QNNState.training
                # TODO: Convert the training graph to deployment graph
                # TODO: Save as nnp (we have to define nicely)
            else:
                if training:
                    continue

                from collections import defaultdict

                inps = defaultdict(list)
                otps = defaultdict(list)
                ec_data = []
                ec_otps = []

                inputs = _force_list(inputs)
                for i, inp in enumerate(inputs):
                    key = 'x{}'.format(i)
                    inps[key] = inp
                    ec_data.append(key)

                outputs = _force_list(pred)
                for i, otp in enumerate(outputs):
                    key = 'y{}'.format(i)
                    otps[key] = otp
                    ec_otps.append(key)

                contents = {
                    'networks': [
                        {'name': net_name,
                         'batch_size': batch_size,
                         'outputs': otps,
                         'names': inps
                         }],
                    'executors': [
                        {'name': 'runtime',
                         'network': net_name,
                         'data': ec_data,
                         'outputs': ec_otps
                         }]
                }

                from nnabla.utils.save import save
                save(fname, contents)