# Source code for nnabla.models.object_detection.yolov2

# Copyright 2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import nnabla as nn
from nnabla.utils.nnp_graph import NnpNetworkPass

import numpy as np
from .base import ObjectDetection


class YoloV2(ObjectDetection):

    '''
    YOLOv2 object detection network loaded from an ``.nnp`` file.

    Args:
        dataset (str): Selects which network file is loaded. Must be
            ``'voc'`` (default) or ``'coco'``.

    The following is a list of string that can be specified to ``use_up_to`` option in ``__call__`` method;

    * ``'detection'`` (default): The output from the last convolution (detection layer) after post-processing.
    * ``'convdetect'``: The output of last convolution without post-processing.
    * ``'lastconv'``: Network till the convolution layer+relu which comes before detection convolution layer.

    References:
        * `Joseph Redmon et al., YOLO9000: Better, Faster, Stronger.
          <https://arxiv.org/abs/1612.08242>`_

    '''

    # Maps the names accepted by ``use_up_to`` (plus two internal keys used
    # in ``__call__``) to variable names inside the .nnp network.
    _KEY_VARIABLE = {
        'detection': 'y',
        'convdetect': 'Reshape_3_Output',
        'lastconv': 'Reshape_3_Output',
        'Arange': 'Arange_Output',
        'Arange2': 'Arange_2_Output',
    }

    def __init__(self, dataset='voc'):
        '''
        Load the network file that corresponds to ``dataset``.

        Args:
            dataset (str): ``'voc'`` or ``'coco'``.

        Raises:
            AssertionError: If ``dataset`` is not one of the supported names.
        '''
        # Check validity of dataset
        assert dataset in ['voc', 'coco'],\
            'dataset must be chosen from ["voc", "coco"].'
        # Load nnp
        self._dataset_name = dataset
        if self._dataset_name == 'voc':
            self._load_nnp('yolov2-voc.nnp', 'yolov2-voc.nnp')
        elif self._dataset_name == "coco":
            self._load_nnp('yolov2-coco.nnp', 'yolov2-coco.nnp')

    def _input_shape(self):
        '''Return the fixed ``(3, 416, 416)`` input shape reported by this model.'''
        return (3, 416, 416)

    def __call__(self, input_var=None, use_from=None, use_up_to='detection', training=False, returns_net=False, verbose=0):
        '''
        Build the network on top of ``input_var`` and return its output.

        Shape-dependent parameters stored in the .nnp file (reshape, slice,
        arange, scalar multiplications and the anchor biases) are rewritten
        through callbacks so that an input whose spatial size differs from
        the one the file was created with can be used.

        Args:
            input_var: Input variable. It is passed through
                ``self.get_input_var`` and then bound to the network
                variable ``'x'``. Its ``shape`` is indexed as
                ``(batch, _, height, width)``.
            use_from: Must be ``None``; kept for forward compatibility.
            use_up_to (str): One of the keys documented on the class
                (``'detection'``, ``'convdetect'``, ``'lastconv'``).
            training (bool): Passed to
                ``set_batch_normalization_batch_stat_all``. When false,
                ``callback.fix_parameters()`` is also called.
            returns_net (bool): When true, return the object produced by
                ``self.nnp.get_network`` instead of its first output.
            verbose (int): Passed to ``NnpNetworkPass``.

        Returns:
            The first value of ``net.outputs``, or ``net`` itself when
            ``returns_net`` is true.

        Raises:
            AssertionError: If ``use_from`` is not ``None``.
        '''

        assert use_from is None, 'This should not be set because it is for forward compatibility.'
        input_var = self.get_input_var(input_var)
        nnp_input_size = self.get_nnp_input_size()
        callback = NnpNetworkPass(verbose)
        callback.set_variable('x', input_var)
        callback.set_batch_normalization_batch_stat_all(training)
        self.use_up_to(use_up_to, callback)
        if use_up_to != 'detection':
            # Without post-processing, the coordinate-grid branches and the
            # reshape feeding them are cut and their functions dropped.
            self.use_up_to('Arange', callback)
            self.use_up_to('Arange2', callback)
            funcs_to_drop = ('Reshape_3',
                             'Arange',
                             'Arange_2')
            callback.drop_function(*funcs_to_drop)
            if use_up_to == 'lastconv':
                callback.drop_function('Convolution_23')

        # Output dimension of reshape, arange, slice etc functions are taken from .nnp file.
        # These dimensions depend on the input image size with which the nnp file was created.
        # When different input image size is given to the model, these dimensions will change and therefore
        # shape of output from these functions need to be generalized whenever they are generated.
        # The same has been done in below callbacks.

        # First reshape operation for simulating darknet reorg bug
        @callback.on_generate_function_by_name('Reshape')
        def reshape_for_darknet_reorg_bug(f):
            s = f.inputs[0].proto.shape.dim[:]
            stride = 2
            r = f.proto.reshape_param
            # Integer floor division keeps the channel count an int without
            # a round trip through float.
            r.shape.dim[:] = [
                s[0], s[1] // (stride * stride), s[2], stride, s[3], stride]
            return f

        # Second reshape operation for simulating darknet reorg bug
        @callback.on_generate_function_by_name('Reshape_2')
        def reshape_2_for_darknet_reorg_bug(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1]*s[2]*s[3]
                              * s[1]*s[2], s[4]//s[1], s[5]//s[2]]
            return f

        # Reshape operation for output variable of yolov2 function in yolov2_activate.
        @callback.on_generate_function_by_name('Reshape_3')
        def reshape_yolov2_activate(f):
            s = f.inputs[0].proto.shape.dim[:]
            anchors = 5
            r = f.proto.reshape_param
            # The class count is recovered from the shape stored in the
            # .nnp file (dim[2] holds num_class + 5).
            num_class = r.shape.dim[2] - 5
            s_add = (s[0], anchors, num_class+5)+tuple(s[2:])
            r.shape.dim[:] = s_add
            return f

        # Slicing the variable y in yolov2_activate to get t_xy
        @callback.on_generate_function_by_name('Slice')
        def slicing_t_xy(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            s[2] = 2
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Arange operation in range of zero to width of input variable
        @callback.on_generate_function_by_name('Arange')
        def arange_yolov2_image_coordinate_xs(f):
            s = input_var.shape
            r = f.proto.arange_param
            r.stop = s[3]//32
            return f

        # Arange operation in range of zero to height of input variable
        @callback.on_generate_function_by_name('Arange_2')
        def arange_yolov2_image_coordinate_ys(f):
            s = input_var.shape
            r = f.proto.arange_param
            r.stop = s[2]//32
            return f

        # Slicing the variable y in yolov2_activate to get t_wh
        @callback.on_generate_function_by_name('Slice_2')
        def slicing_t_wh(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            s[2] = 4
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Slicing the variable y in yolov2_activate to get t_o
        @callback.on_generate_function_by_name('Slice_3')
        def slicing_t_o(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            s[2] = 5
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Slicing the variable y in yolov2_activate to get t_p
        @callback.on_generate_function_by_name('Slice_4')
        def slicing_t_p(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            r = f.proto.slice_param
            # The stop index is the full input shape: the slice runs to the
            # end of every axis.
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Reshape the output of Arange to get xs
        @callback.on_generate_function_by_name('Reshape_4')
        def reshape_yolov2_image_coordinate_xs(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[3] = s[0]
            return f

        # Reshape operation to get t_x
        @callback.on_generate_function_by_name('Reshape_5')
        def reshape_yolov2_image_coordinate_t_x(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            # A singleton axis is inserted at position 2.
            r.shape.dim[:] = [s[0], s[1], 1, s[2], s[3]]
            return f

        # Reshape the output of Arange_2 to get ys
        @callback.on_generate_function_by_name('Reshape_6')
        def reshape_yolov2_image_coordinate_ys(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[2] = s[0]
            return f

        # Reshape operation to get t_y
        @callback.on_generate_function_by_name('Reshape_7')
        def reshape_yolov2_image_coordinate_t_y(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            # A singleton axis is inserted at position 2.
            r.shape.dim[:] = [s[0], s[1], 1, s[2], s[3]]
            return f

        # Reshape the final variable y
        @callback.on_generate_function_by_name('Reshape_8')
        def reshape_output_variable_y(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1]*s[2]*s[3], s[4]]
            return f

        # Scalar division by width in function Reshape_4 in yolov2_image_coordinate
        @callback.on_generate_function_by_name('MulScalar_2')
        def mul_yolov2_image_coordinate_t_x(f):
            input_arr_shape = list(input_var.shape[2:])
            r = f.proto.mul_scalar_param
            s = f.proto.mul_scalar_param.val
            # Rescale the stored factor by (nnp width / actual width).
            r.val = s*(nnp_input_size[1]/input_arr_shape[1])
            return f

        # Scalar division by height in function Reshape_6 in yolov2_image_coordinate
        @callback.on_generate_function_by_name('MulScalar_3')
        def mul_yolov2_image_coordinate_t_y(f):
            input_arr_shape = list(input_var.shape[2:])
            r = f.proto.mul_scalar_param
            s = f.proto.mul_scalar_param.val
            # Rescale the stored factor by (nnp height / actual height).
            r.val = s*(nnp_input_size[0]/input_arr_shape[0])
            return f

        # Reshape biases and multiply with t_wh to rescale it.
        @callback.on_function_pass_by_name('Mul2')
        def reshape_biases(f, variables, param_scope):
            bias_param_name = f.inputs[1].proto.name
            with nn.parameter_scope('', param_scope):
                biases = nn.parameter.get_parameter(bias_param_name)
                s = list(input_var.shape)
                # k: (width, height) // 32 for the size stored in the .nnp file.
                k = (np.array([nnp_input_size[1]//32,
                               nnp_input_size[0]//32]).reshape(1, 1, 2, 1, 1))
                # m: (width, height) // 32 for the actual input.
                m = (np.array([s[3]//32, s[2]//32]).reshape(1, 1, 2, 1, 1))
                biases = (biases.d)*k
                biases = (biases)/m
                biases = nn.Variable.from_numpy_array(biases)
                # NOTE(review): the result is stored under the literal name
                # 'biases' rather than bias_param_name; presumably they are
                # the same in the shipped .nnp files -- confirm.
                nn.parameter.set_parameter('biases', biases)

        if not training:
            callback.fix_parameters()
        batch_size = input_var.shape[0]
        net = self.nnp.get_network(
            'runtime', batch_size=batch_size, callback=callback)
        if returns_net:
            return net
        return list(net.outputs.values())[0]