# Source code for nnabla.models.semantic_segmentation.deeplabv3plus

# Copyright 2019,2020,2021 Sony Corporation.
# Copyright 2021 Sony Group Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import

from nnabla.utils.nnp_graph import NnpNetworkPass

from .base import SemanticSegmentation


class DeepLabV3plus(SemanticSegmentation):

    '''
    DeepLabV3+.

    Args:
        dataset(str): Specify a training dataset name from 'voc' or 'voc-coco'. Default is 'voc'.
        output_stride(int): DeepLabV3 uses atrous (a.k.a. dilated) convolutions. The atrous rate depends on the output stride. The output stride has
                     to be selected from 8 or 16. Default is 16. If the output_stride is 8 the atrous rate will be [12,24,36]
                     and if the output_stride is 16 the atrous rate will be [6,12,18].

    The following is a list of strings that can be specified to the ``use_up_to`` option in the ``__call__`` method;

    * ``'segmentation'`` (default): The output of the final layer.
    * ``'lastconv'``: The output from last Convolution.
    * ``'lastconv+relu'``: Network up to ``'lastconv'`` followed by ReLU activation.

    References:
        * `Chen et al., Rethinking Atrous Convolution for Semantic Image Segmentation.
          <https://arxiv.org/abs/1706.05587>`_

    '''

    # Maps each ``use_up_to`` key to the name of the network variable that is
    # used as the output for that key.
    _KEY_VARIABLE = {
        'segmentation': 'y',
        'lastconv': 'BatchNormalization_146_Output',
        'lastconv+relu': 'ReLU_82_Output',
    }

    def __init__(self, dataset='voc', output_stride=16):
        '''
        Validate the options and load the matching pretrained ``.nnp`` file.

        Args:
            dataset(str): 'voc' or 'voc-coco'.
            output_stride(int): 16 or 8.

        Raises:
            AssertionError: If ``dataset`` or ``output_stride`` is not one of
                the supported values.
        '''
        # Check validity of dataset and output_stride
        assert dataset in ['voc', 'voc-coco'],\
            'dataset must be chosen from ["voc", "voc-coco"].'
        assert output_stride in [16, 8],\
            'output_stride must be chosen from [16, 8].'
        # Load nnp
        self._dataset_name = dataset
        self._output_stride = output_stride
        # The file name is fully determined by the two (validated) options:
        # DeepLabV3-<dataset>-os-<stride>.nnp
        stride = 16 if self._output_stride == 16 else 8
        nnp_name = 'DeepLabV3-{}-os-{}.nnp'.format(self._dataset_name, stride)
        self._load_nnp(nnp_name, nnp_name)

    def _input_shape(self):
        '''Return the default input shape (without the batch axis).'''
        return (3, 513, 513)

    @staticmethod
    def _halved_size(size, times):
        '''
        Apply ``size -> (size - 1) // 2 + 1`` to ``size`` ``times`` times.

        Args:
            size(int): Initial size along one axis.
            times(int): Number of halving steps; 0 returns ``size`` unchanged.

        Returns:
            int: The size after ``times`` halving steps.
        '''
        for _ in range(times):
            size = (size - 1) // 2 + 1
        return size

    def __call__(self, input_var=None, use_from=None, use_up_to='segmentation', training=False, returns_net=False, verbose=0):
        '''
        Build the network on top of ``input_var`` and return its output.

        Args:
            input_var: Input variable; it is resolved through
                ``self.get_input_var`` before use.
            use_from: Must be ``None``; reserved for forward compatibility.
            use_up_to(str): Key selecting the output of the network. See the
                class documentation for the accepted values.
            training(bool): Passed to the batch-statistics setting of all
                batch normalization layers. When ``False``, parameters are
                additionally fixed.
            returns_net(bool): When ``True``, the network object is returned
                instead of its first output.
            verbose(int): Verbosity passed to ``NnpNetworkPass``.

        Returns:
            The first output of the generated network, or the network object
            itself when ``returns_net`` is ``True``.

        Raises:
            AssertionError: If ``use_from`` is not ``None``.
        '''
        assert use_from is None, 'This should not be set because it is for forward compatibility.'
        input_var = self.get_input_var(input_var)
        callback = NnpNetworkPass(verbose)
        callback.set_variable('x', input_var)

        # The output shapes of the Interpolate and AveragePooling functions
        # depend on the input image size. If these dimensions were taken from
        # the .nnp file, a shape mismatch would be encountered, so they are
        # set according to the actual input shape in the callbacks below.

        # Changing kernel and stride dimensions for AveragePooling so that
        # both equal axes 2 and 3 of the function's input shape.
        @callback.on_generate_function_by_name('AveragePooling')
        def average_pooling_shape(f):
            in_dims = f.inputs[0].proto.shape.dim[:]
            window = [in_dims[2], in_dims[3]]
            pool_param = f.proto.average_pooling_param
            pool_param.kernel.dim[:] = window
            pool_param.stride.dim[:] = window
            return f

        # Changing the output size of all Interpolate functions. Each one is
        # resized to axes 2 and 3 of the input variable's shape after a fixed
        # number of halving steps.
        def make_interpolate_callback(times):
            def set_interpolate_output_size(f):
                in_shape = input_var.shape
                f.proto.interpolate_param.output_size[:] = [
                    self._halved_size(in_shape[2], times),
                    self._halved_size(in_shape[3], times),
                ]
                return f
            return set_interpolate_output_size

        # NOTE(review): the number of halving steps is hard-coded and does not
        # depend on ``output_stride`` - confirm this is intended for the os-8
        # models.
        for function_name, times in (('Interpolate', 4),
                                     ('Interpolate_2', 2),
                                     ('Interpolate_3', 0)):
            callback.on_generate_function_by_name(function_name)(
                make_interpolate_callback(times))

        callback.set_batch_normalization_batch_stat_all(training)
        self.use_up_to(use_up_to, callback)
        if not training:
            callback.fix_parameters()
        batch_size = input_var.shape[0]
        net = self.nnp.get_network(
            'runtime', batch_size=batch_size, callback=callback)
        if returns_net:
            return net
        return list(net.outputs.values())[0]