Source code for nnabla.models.object_detection.yolov2

# Copyright 2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import nnabla as nn
from nnabla.utils.nnp_graph import NnpNetworkPass

import numpy as np
from .base import ObjectDetection


class YoloV2(ObjectDetection):
    '''
    The following is a list of strings that can be specified to the
    ``use_up_to`` option in the ``__call__`` method:

    * ``'detection'`` (default): The output from the last convolution
      (detection layer) after post-processing.
    * ``'convdetect'``: The output of the last convolution without
      post-processing.
    * ``'lastconv'``: The output of the convolution + relu layer that comes
      immediately before the detection convolution layer.

    References:
        * `Joseph Redmon et al., YOLO9000: Better, Faster, Stronger.
          <https://arxiv.org/abs/1612.08242>`_

    '''

    _KEY_VARIABLE = {
        'detection': 'y',
        'convdetect': 'Reshape_3_Output',
        'lastconv': 'Reshape_3_Output',
        'Arange': 'Arange_Output',
        'Arange2': 'Arange_2_Output',
        }

    def __init__(self, dataset='voc'):
        # Check validity of dataset
        assert dataset in ['voc', 'coco'], \
            'dataset must be chosen from ["voc", "coco"].'

        # Load nnp
        self._dataset_name = dataset
        if self._dataset_name == 'voc':
            self._load_nnp('yolov2-voc.nnp', 'yolov2-voc.nnp')
        elif self._dataset_name == "coco":
            self._load_nnp('yolov2-coco.nnp', 'yolov2-coco.nnp')

    def _input_shape(self):
        return (3, 416, 416)

    def __call__(self, input_var=None, use_from=None, use_up_to='detection',
                 training=False, returns_net=False, verbose=0):
        assert use_from is None, \
            'This should not be set because it is for forward compatibility.'
        input_var = self.get_input_var(input_var)
        nnp_input_size = self.get_nnp_input_size()

        callback = NnpNetworkPass(verbose)
        callback.set_variable('x', input_var)
        callback.set_batch_normalization_batch_stat_all(training)
        self.use_up_to(use_up_to, callback)
        if use_up_to != 'detection':
            self.use_up_to('Arange', callback)
            self.use_up_to('Arange2', callback)
            funcs_to_drop = ('Reshape_3', 'Arange', 'Arange_2')
            callback.drop_function(*funcs_to_drop)
        if use_up_to == 'lastconv':
            callback.drop_function('Convolution_23')

        # Output dimensions of the reshape, arange, slice, etc. functions are
        # taken from the .nnp file. These dimensions depend on the input image
        # size with which the nnp file was created. When a different input
        # image size is given to the model, these dimensions change, so the
        # output shapes of these functions need to be generalized whenever
        # they are generated. This is done in the callbacks below.

        # Reshape operation for simulating the darknet reorg bug
        @callback.on_generate_function_by_name('Reshape')
        def reshape_for_darknet_reorg_bug(f):
            s = f.inputs[0].proto.shape.dim[:]
            stride = 2
            r = f.proto.reshape_param
            r.shape.dim[:] = [
                s[0], int(s[1] / stride / stride), s[2], stride, s[3], stride]
            return f

        # Reshape operation for simulating the darknet reorg bug
        @callback.on_generate_function_by_name('Reshape_2')
        def reshape_2_for_darknet_reorg_bug(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1] * s[2] * s[3] * s[1] * s[2],
                              s[4] // s[1], s[5] // s[2]]
            return f

        # Reshape operation for the output variable of the yolov2 function in
        # yolov2_activate.
        @callback.on_generate_function_by_name('Reshape_3')
        def reshape_yolov2_activate(f):
            s = f.inputs[0].proto.shape.dim[:]
            anchors = 5
            r = f.proto.reshape_param
            num_class = r.shape.dim[2] - 5
            s_add = (s[0], anchors, num_class + 5) + tuple(s[2:])
            r.shape.dim[:] = s_add
            return f

        # Slicing the variable y in yolov2_activate to get t_xy
        @callback.on_generate_function_by_name('Slice')
        def slicing_t_xy(f):
            s = f.inputs[0].proto.shape.dim[:]
            s[2] = 2
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Arange operation in the range of zero to the width of the input variable
        @callback.on_generate_function_by_name('Arange')
        def arange_yolov2_image_coordinate_xs(f):
            s = input_var.shape
            r = f.proto.arange_param
            r.stop = s[3] // 32
            return f

        # Arange operation in the range of zero to the height of the input variable
        @callback.on_generate_function_by_name('Arange_2')
        def arange_yolov2_image_coordinate_ys(f):
            s = input_var.shape
            r = f.proto.arange_param
            r.stop = s[2] // 32
            return f

        # Slicing the variable y in yolov2_activate to get t_wh
        @callback.on_generate_function_by_name('Slice_2')
        def slicing_t_wh(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            s[2] = 4
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Slicing the variable y in yolov2_activate to get t_o
        @callback.on_generate_function_by_name('Slice_3')
        def slicing_t_o(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            s[2] = 5
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Slicing the variable y in yolov2_activate to get t_p
        @callback.on_generate_function_by_name('Slice_4')
        def slicing_t_p(f):
            s = list(f.inputs[0].proto.shape.dim[:])
            r = f.proto.slice_param
            r.stop[:] = [s[0], s[1], s[2], s[3], s[4]]
            return f

        # Reshape the output of Arange to get xs
        @callback.on_generate_function_by_name('Reshape_4')
        def reshape_yolov2_image_coordinate_xs(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[3] = s[0]
            return f

        # Reshape operation to get t_x
        @callback.on_generate_function_by_name('Reshape_5')
        def reshape_yolov2_image_coordinate_t_x(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1], s[0] // s[0], s[2], s[3]]
            return f

        # Reshape the output of Arange_2 to get ys
        @callback.on_generate_function_by_name('Reshape_6')
        def reshape_yolov2_image_coordinate_ys(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[2] = s[0]
            return f

        # Reshape operation to get t_y
        @callback.on_generate_function_by_name('Reshape_7')
        def reshape_yolov2_image_coordinate_t_y(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1], s[0] // s[0], s[2], s[3]]
            return f

        # Reshape the final variable y
        @callback.on_generate_function_by_name('Reshape_8')
        def reshape_output_variable_y(f):
            s = f.inputs[0].proto.shape.dim[:]
            r = f.proto.reshape_param
            r.shape.dim[:] = [s[0], s[1] * s[2] * s[3], s[4]]
            return f

        # Scalar division by width in function Reshape_4 in yolov2_image_coordinate
        @callback.on_generate_function_by_name('MulScalar_2')
        def mul_yolov2_image_coordinate_t_x(f):
            input_arr_shape = list(input_var.shape[2:])
            r = f.proto.mul_scalar_param
            s = f.proto.mul_scalar_param.val
            r.val = s * (nnp_input_size[1] / input_arr_shape[1])
            return f

        # Scalar division by height in function Reshape_6 in yolov2_image_coordinate
        @callback.on_generate_function_by_name('MulScalar_3')
        def mul_yolov2_image_coordinate_t_y(f):
            input_arr_shape = list(input_var.shape[2:])
            r = f.proto.mul_scalar_param
            s = f.proto.mul_scalar_param.val
            r.val = s * (nnp_input_size[0] / input_arr_shape[0])
            return f
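        # Worked example of the rescaling (derived from the two MulScalar
        # callbacks above and the bias rescaling below, assuming the bundled
        # nnp was exported for a 416x416 input): for a 608x608 input the
        # detection grid becomes 608 // 32 = 19 instead of 416 // 32 = 13, so
        # the x/y scale factors are multiplied by 416/608 and the anchor
        # biases are scaled by (416 // 32) / (608 // 32) = 13/19.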
        # Reshape biases and multiply with t_wh to rescale it.
        @callback.on_function_pass_by_name('Mul2')
        def reshape_biases(f, variables, param_scope):
            bias_param_name = f.inputs[1].proto.name
            with nn.parameter_scope('', param_scope):
                biases = nn.parameter.get_parameter(bias_param_name)
                s = list(input_var.shape)
                k = np.array([nnp_input_size[1] // 32,
                              nnp_input_size[0] // 32]).reshape(1, 1, 2, 1, 1)
                m = np.array([s[3] // 32, s[2] // 32]).reshape(1, 1, 2, 1, 1)
                biases = biases.d * k
                biases = biases / m
                biases = nn.Variable.from_numpy_array(biases)
                nn.parameter.set_parameter('biases', biases)

        if not training:
            callback.fix_parameters()
        batch_size = input_var.shape[0]
        net = self.nnp.get_network(
            'runtime', batch_size=batch_size, callback=callback)
        if returns_net:
            return net
        return list(net.outputs.values())[0]
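

# Usage sketch: a minimal, illustrative example, assuming the pretrained
# 'yolov2-voc.nnp' weights can be downloaded and a 416x416 RGB input is used.
# `use_up_to='detection'` returns the post-processed detection output;
# 'convdetect' or 'lastconv' return the intermediate outputs listed in the
# class docstring. The random input is only a stand-in for a real,
# preprocessed (letterboxed and normalized) image.
if __name__ == '__main__':
    model = YoloV2('voc')
    x = nn.Variable((1, 3, 416, 416))   # (batch, channels, height, width)
    y = model(x, use_up_to='detection')
    x.d = np.random.rand(*x.shape)
    y.forward(clear_buffer=True)
    # Expected output: one row per predicted box,
    # (batch, num_boxes, 4 box coords + objectness + class scores).
    print(y.shape)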