Commit 5da2e9a

Author: Anna Grebneva

Added MODNet models (openvinotoolkit#3373)

* Added MODNet models
* Added demo support

1 parent 272d2d7 commit 5da2e9a

File tree

20 files changed (+519, -9 lines)


data/dataset_definitions.yml (+11)

```diff
@@ -1493,3 +1493,14 @@ datasets:
       annotation_file: object_detection/streams_1/high/annotations/instances_glb2bcls3.json
     annotation: mscoco_detection_high_3cls.pickle
     dataset_meta: mscoco_detection_high_3cls.json
+
+  - name: HumanMattingDataset
+    data_source: human_matting_dataset/clip_img/1803151818/clip_00000000
+    additional_data_source: human_matting_dataset/matting/1803151818/matting_00000000
+    annotation_conversion:
+      converter: background_matting
+      images_dir: human_matting_dataset/clip_img/1803151818/clip_00000000
+      masks_dir: human_matting_dataset/matting/1803151818/matting_00000000
+      image_postfix: '.jpg'
+    annotation: human_matting.pickle
+    dataset_meta: human_matting.json
```
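
The `background_matting` converter above pairs each clip image with its matting mask through the two directory trees. A hypothetical sketch of how such pairs could be enumerated (the mask extension is not stated in this diff; PNG is assumed):

```python
from pathlib import Path

def enumerate_pairs(images_dir, masks_dir):
    # Match clip images ('.jpg', per image_postfix above) to masks by filename stem.
    images = {p.stem: p for p in Path(images_dir).glob('*.jpg')}
    masks = {p.stem: p for p in Path(masks_dir).glob('*.png')}  # assumed mask extension
    return [(images[s], masks[s]) for s in sorted(images.keys() & masks.keys())]
```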

demos/background_subtraction_demo/python/README.md (+8, -3)

```diff
@@ -29,7 +29,10 @@ The demo application expects an instance segmentation or background matting mode
   * At least two outputs including:
     * `fgr` with normalized in [0, 1] range foreground
     * `pha` with normalized in [0, 1] range alpha
-4. for video background matting models based on RNN architecture:
+4. for image background matting models without trimap (background segmentation):
+  * Single input for input image.
+  * Single output with normalized in [0, 1] range alpha
+5. for video background matting models based on RNN architecture:
   * Five inputs:
     * `src` for input image
     * recurrent inputs: `r1`, `r2`, `r3`, `r4`
@@ -81,10 +84,12 @@ omz_converter --list models.lst

 ### Supported Models

-* instance-segmentation-person-????
-* yolact-resnet50-fpn-pytorch
 * background-matting-mobilenetv2
+* instance-segmentation-person-????
+* modnet-photographic-portrait-matting
+* modnet-webcam-portrait-matting
 * robust-video-matting-mobilenetv3
+* yolact-resnet50-fpn-pytorch

 > **NOTE**: Refer to the tables [Intel's Pre-Trained Models Device Support](../../../models/intel/device_support.md) and [Public Pre-Trained Models Device Support](../../../models/public/device_support.md) for the details on models inference support at different devices.
```
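
The demo selects a wrapper from the model's input/output counts (see the `get_model` change below). To check which of the numbered cases above a given IR matches, one can inspect it with the OpenVINO runtime; a minimal sketch, with `model.xml` as a hypothetical IR path:

```python
from openvino.runtime import Core

model = Core().read_model('model.xml')  # hypothetical path to a converted IR

# Case 4 above corresponds to one image input and one alpha output.
print(len(model.inputs), len(model.outputs))
print(model.inputs[0].partial_shape, model.outputs[0].partial_shape)
```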

demos/background_subtraction_demo/python/background_subtraction_demo.py (+7, -1)

```diff
@@ -26,7 +26,10 @@

 sys.path.append(str(Path(__file__).resolve().parents[2] / 'common/python'))

-from openvino.model_zoo.model_api.models import MaskRCNNModel, OutputTransform, RESIZE_TYPES, YolactModel, ImageMattingWithBackground, VideoBackgroundMatting
+from openvino.model_zoo.model_api.models import (
+    MaskRCNNModel, OutputTransform, RESIZE_TYPES, YolactModel,
+    ImageMattingWithBackground, VideoBackgroundMatting, PortraitBackgroundMatting
+)
 from openvino.model_zoo.model_api.models.utils import load_labels
 from openvino.model_zoo.model_api.performance_metrics import PerformanceMetrics
 from openvino.model_zoo.model_api.pipelines import get_user_config, AsyncPipeline
@@ -123,6 +126,9 @@ def get_model(model_adapter, configuration, args):
         model = ImageMattingWithBackground(model_adapter, configuration)
         need_bgr_input = True
         is_matting_model = True
+    elif len(inputs) == 1 and len(outputs) == 1:
+        model = PortraitBackgroundMatting(model_adapter, configuration)
+        is_matting_model = True
     else:
         model = MaskRCNNModel(model_adapter, configuration)
     if not need_bgr_input and args.background is not None:
```
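
Once a matting wrapper returns a frame and an alpha matte, replacing the background reduces to alpha blending. A minimal sketch of that step (not part of this diff; `frame`, `alpha`, and `background` are assumed to be float arrays in [0, 1], with `alpha` shaped HxWx1):

```python
def blend_background(frame, alpha, background):
    # Keep the foreground where alpha is high; show the new background elsewhere.
    return alpha * frame + (1.0 - alpha) * background
```
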
demos/background_subtraction_demo/python/models.lst (+4, -2)

```diff
@@ -1,5 +1,7 @@
 # This file can be used with the --list option of the model downloader.
-instance-segmentation-person-????
-yolact-resnet50-fpn-pytorch
 background-matting-mobilenetv2
+instance-segmentation-person-????
+modnet-photographic-portrait-matting
+modnet-webcam-portrait-matting
 robust-video-matting-mobilenetv3
+yolact-resnet50-fpn-pytorch
```

demos/common/python/openvino/model_zoo/model_api/README.md (+1, -1)

```diff
@@ -59,7 +59,7 @@ The following tasks can be solved with wrappers usage:

 | Task type | Model API wrappers |
 |----------------------------|--------------------|
-| Background Matting | <ul><li>`VideoBackgroundMatting`</li><li>`ImageMattingWithBackground`</li></ul> |
+| Background Matting | <ul><li>`VideoBackgroundMatting`</li><li>`ImageMattingWithBackground`</li><li>`PortraitBackgroundMatting`</li></ul> |
 | Classification | <ul><li>`Classification`</li></ul> |
 | Deblurring | <ul><li>`Deblurring`</li></ul> |
 | Human Pose Estimation | <ul><li>`HpeAssociativeEmbedding`</li><li>`OpenPose`</li></ul> |
```
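
For illustration, the new wrapper is driven like the existing Model API ones. A minimal sketch, assuming an IR at `modnet.xml` (the path and adapter arguments are assumptions, not taken from this commit):

```python
import cv2
from openvino.model_zoo.model_api.adapters import OpenvinoAdapter, create_core
from openvino.model_zoo.model_api.models import PortraitBackgroundMatting

adapter = OpenvinoAdapter(create_core(), 'modnet.xml', device='CPU')  # hypothetical IR path
model = PortraitBackgroundMatting(adapter, configuration={}, preload=True)

frame = cv2.imread('portrait.jpg')  # hypothetical input image
dict_inputs, meta = model.preprocess(frame)
raw_outputs = adapter.infer_sync(dict_inputs)
image, alpha = model.postprocess(raw_outputs, meta)  # image in [0, 1], alpha of shape HxWx1
```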

demos/common/python/openvino/model_zoo/model_api/models/__init__.py (+2, -1)

```diff
@@ -16,7 +16,7 @@


 from .bert import BertEmbedding, BertNamedEntityRecognition, BertQuestionAnswering
-from .background_matting import ImageMattingWithBackground, VideoBackgroundMatting
+from .background_matting import ImageMattingWithBackground, VideoBackgroundMatting, PortraitBackgroundMatting
 from .centernet import CenterNet
 from .classification import Classification
 from .deblurring import Deblurring
@@ -58,6 +58,7 @@
     'MonoDepthModel',
     'OpenPose',
     'OutputTransform',
+    'PortraitBackgroundMatting',
     'RESIZE_TYPES',
     'RetinaFace',
     'RetinaFacePyTorch',
```

demos/common/python/openvino/model_zoo/model_api/models/background_matting.py (+33)

```diff
@@ -150,3 +150,36 @@ def postprocess(self, outputs, meta):
         fgr = cv2.cvtColor(cv2.resize(fgr, (w, h)), cv2.COLOR_RGB2BGR)
         pha = np.expand_dims(cv2.resize(pha, (w, h)), axis=-1)
         return fgr, pha
+
+
+class PortraitBackgroundMatting(ImageModel):
+    __model__ = 'Portrait-matting'
+
+    def __init__(self, model_adapter, configuration, preload=False):
+        super().__init__(model_adapter, configuration, preload)
+        self._check_io_number(1, 1)
+        self.output_blob_name = self._get_outputs()
+
+    @classmethod
+    def parameters(cls):
+        return super().parameters()
+
+    def _get_outputs(self):
+        output_blob_name = next(iter(self.outputs))
+        output_size = self.outputs[output_blob_name].shape
+        if len(output_size) != 4:
+            self.raise_error("Unexpected output blob shape {}. Only 4D output blob is supported".format(output_size))
+
+        return output_blob_name
+
+    def preprocess(self, inputs):
+        dict_inputs, meta = super().preprocess(inputs)
+        meta.update({"original_image": inputs})
+        return dict_inputs, meta
+
+    def postprocess(self, outputs, meta):
+        output = outputs[self.output_blob_name][0].transpose(1, 2, 0)
+        original_frame = meta['original_image'] / 255.0
+        h, w = meta['original_shape'][:2]
+        res_output = np.expand_dims(cv2.resize(output, (w, h)), -1)
+        return original_frame, res_output
```
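
To make the `postprocess` shape handling above concrete: the raw result is a 1x1xHxW alpha map, which the wrapper converts to HxWx1 at the source resolution. A standalone sketch of the same flow with dummy data (shapes are illustrative):

```python
import cv2
import numpy as np

raw = np.random.rand(1, 1, 512, 512).astype(np.float32)  # N, C, H, W network output
alpha = raw[0].transpose(1, 2, 0)                         # CHW -> HWC: 512 x 512 x 1
alpha = cv2.resize(alpha, (1280, 720))                    # back to frame size; cv2 drops the channel axis
alpha = np.expand_dims(alpha, -1)                         # restore it: 720 x 1280 x 1
```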

demos/tests/cases.py (+3, -1)

```diff
@@ -757,7 +757,9 @@ def single_option_cases(key, *args):
         ModelArg('instance-segmentation-person-0007'),
         ModelArg('robust-video-matting-mobilenetv3'),
         ModelArg('background-matting-mobilenetv2'),
-        ModelArg('yolact-resnet50-fpn-pytorch')),
+        ModelArg('yolact-resnet50-fpn-pytorch'),
+        ModelArg('modnet-photographic-portrait-matting'),
+        ModelArg('modnet-webcam-portrait-matting')),
     )),

     PythonDemo(name='bert_question_answering_demo', device_keys=['-d'], test_cases=combine_cases(
```

models/public/device_support.md (+2)

```diff
@@ -78,6 +78,8 @@
 | mobilenet-v3-large-1.0-224-tf | YES | YES | YES |
 | mobilenet-v3-small-1.0-224-tf | YES | YES | YES |
 | mobilenet-yolo-v4-syg | YES | YES | |
+| modnet-photographic-portrait-matting | YES | YES | YES |
+| modnet-webcam-portrait-matting | YES | YES | YES |
 | mozilla-deepspeech-0.6.1 | YES | | |
 | mozilla-deepspeech-0.8.2 | YES | | |
 | mtcnn-o | YES | YES | |
```

models/public/index.md (+4)

```diff
@@ -33,6 +33,8 @@
    :caption: Background Matting Models

    omz_models_model_background_matting_mobilenetv2
+   omz_models_model_modnet_photographic_portrait_matting
+   omz_models_model_modnet_webcam_portrait_matting
    omz_models_model_robust_video_matting_mobilenetv3

 .. toctree::
@@ -644,6 +646,8 @@ or mixed pixels. This distinguishes background matting from segmentation approac
 | Model Name | Implementation | OMZ Model Name | Accuracy | GFlops | mParams |
 | -------------- | -------------- | ------------------------------------------------------ | -------- | ------- | -------- |
 | background-matting-mobilenetv2 | PyTorch\* | [background-matting-mobilenetv2](./background-matting-mobilenetv2/README.md) | 4.32/1.0/2.48/2.7 | 6.7419 | 5.052 |
+| modnet-photographic-portrait-matting | PyTorch\* | [modnet-photographic-portrait-matting](./modnet-photographic-portrait-matting/README.md) | 5.21/727.95 | 31.1564 | 6.4597 |
+| modnet-webcam-portrait-matting | PyTorch\* | [modnet-webcam-portrait-matting](./modnet-webcam-portrait-matting/README.md) | 5.66/762.52 | 31.1564 | 6.4597 |
 | robust-video-matting-mobilenetv3 | PyTorch\* | [robust-video-matting-mobilenetv3](./robust-video-matting-mobilenetv3/README.md) | 20.8/15.1/4.42/4.05 | 9.3892 | 3.7363 |

 ## See Also
```
models/public/modnet-photographic-portrait-matting/README.md (new file, +99)

````markdown
# modnet-photographic-portrait-matting

## Use Case and High-Level Description

The `modnet-photographic-portrait-matting` model is a lightweight matting objective decomposition network (MODNet) with a MobileNetV2 backbone for real-time photographic portrait matting from a single input image. The model is pre-trained in the PyTorch\* framework and converted to ONNX\* format.

More details are provided in the [paper](https://arxiv.org/abs/2011.11961) and the [repository](https://github.com/ZHKKKe/MODNet).

## Specification

| Metric                          | Value              |
|---------------------------------|--------------------|
| Type                            | Background Matting |
| GFlops                          | 31.1564            |
| MParams                         | 6.4597             |
| Source framework                | PyTorch\*          |

## Accuracy

Accuracy measured on the HumanMatting dataset.

| Metric   | Mean value  | Std value |
| -------- | ----------- | --------- |
| MAD      | 5.21        | 5.13      |
| MSE      | 727.95      | 1196.28   |

* MAD - mean absolute difference
* MSE - mean squared error

## Input

### Original Model

Image, name: `input`, shape: `1, 3, 512, 512`, format: `B, C, H, W`, where:

- `B` - batch size
- `C` - number of channels
- `H` - image height
- `W` - image width

Expected color order: `RGB`.
Mean values - [127.5, 127.5, 127.5], scale value - 127.5.

### Converted Model

Image, name: `input`, shape: `1, 3, 512, 512`, format: `B, C, H, W`, where:

- `B` - batch size
- `C` - number of channels
- `H` - image height
- `W` - image width

Expected color order: `BGR`.

## Output

### Original Model

Alpha matte with values in [0, 1] range. Name: `output`, shape: `1, 1, 512, 512`, format: `B, C, H, W`, where:

- `B` - batch size
- `C` - number of channels
- `H` - image height
- `W` - image width

### Converted Model

Alpha matte with values in [0, 1] range. Name: `output`, shape: `1, 1, 512, 512`, format: `B, C, H, W`, where:

- `B` - batch size
- `C` - number of channels
- `H` - image height
- `W` - image width

## Download a Model and Convert it into OpenVINO™ IR Format

You can download models and, if necessary, convert them into OpenVINO™ IR format using the [Model Downloader and other automation tools](../../../tools/model_tools/README.md), as shown in the examples below.

An example of using the Model Downloader:
```
omz_downloader --name <model_name>
```

An example of using the Model Converter:
```
omz_converter --name <model_name>
```

## Demo usage

The model can be used in the following demos provided by the Open Model Zoo to show its capabilities:

* [Background subtraction Python\* Demo](../../../demos/background_subtraction_demo/python/README.md)

## Legal Information

The original model is distributed under the
[Apache License, Version 2.0](https://raw.githubusercontent.com/ZHKKKe/MODNet/master/LICENSE).
A copy of the license is provided in `<omz_dir>/models/public/licenses/APACHE-2.0.txt`.
````
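
Putting the input specification above into code: a preprocessing sketch for the original ONNX model (OpenCV reads BGR while the original model expects RGB; the image file name is hypothetical):

```python
import cv2
import numpy as np

image = cv2.imread('portrait.jpg')                 # hypothetical input, BGR
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)     # original model expects RGB
image = cv2.resize(image, (512, 512))
blob = (image.astype(np.float32) - 127.5) / 127.5  # mean 127.5, scale 127.5
blob = blob.transpose(2, 0, 1)[np.newaxis]         # HWC -> 1, 3, 512, 512
```
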
models/public/modnet-photographic-portrait-matting/accuracy-check.yml (new file, +27)

```yaml
models:
  - name: modnet-photographic-portrait-matting
    launchers:
      - framework: openvino
        adapter: background_matting
    datasets:
      - name: HumanMattingDataset
        preprocessing:
          - type: resize
            size: 512
        postprocessing:
          - type: resize
            apply_to: annotation
            size: 512
        metrics:
          - type: mae
            name: MAD
            presenter: print_vector
            reference:
              mean: 5.213472
              std: 5.125874
          - type: mse
            name: MSE
            presenter: print_vector
            reference:
              mean: 727.952792
              std: 1196.277498
```
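
The two reference metrics above are plain pixel-wise statistics between the predicted and ground-truth mattes. A sketch of the definitions (assuming both arrays share one intensity scale; judging by the reference magnitudes, 0-255):

```python
import numpy as np

def mad(pred, gt):
    # MAD: mean of absolute difference between predicted and ground-truth alpha.
    return np.abs(pred.astype(np.float64) - gt.astype(np.float64)).mean()

def mse(pred, gt):
    # MSE: mean squared error.
    return ((pred.astype(np.float64) - gt.astype(np.float64)) ** 2).mean()
```
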
models/public/modnet-photographic-portrait-matting/model.py (new file, +26)

```python
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch import load
from modnet_onnx import MODNet


def create_modnet(weights):
    model = MODNet(backbone_pretrained=False)

    checkpoint = load(weights, map_location='cpu')
    ckpt = {k.replace('module.', ''): v for k, v in checkpoint.items()}
    model.load_state_dict(ckpt)

    return model
```
models/public/modnet-photographic-portrait-matting/model.yml (new file, +69)

```yaml
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

description: >-
  The "modnet-photographic-portrait-matting" model is a lightweight matting objective
  decomposition network (MODNet) for photographic portrait matting in real-time with
  a single input image with MobileNetV2 backbone. The model is pre-trained in PyTorch*
  framework and converted to ONNX* format.

  More details provided in the paper <https://arxiv.org/abs/2011.11961> and repository
  <https://github.com/ZHKKKe/MODNet>.
task_type: background_matting
files:
  - name: modnet_onnx.py
    size: 9516
    checksum: 37326740f2756572c639bb8089c70cfee6994f358fb185fe5f1c1f40c061a5d73f62319a8c2aa48d367778360751385e
    source: https://raw.githubusercontent.com/ZHKKKe/MODNet/2938675e4b5c60ab5f5d7a2b2191c68256f99d70/onnx/modnet_onnx.py
  - name: src/models/backbones/__init__.py
    size: 277
    checksum: cdb28b27092889a8293e189e80351087d0cd3135d3f0b56a4a4fd5ae28251bd0142e027d209cf63a24f6b6042255b384
    source: https://raw.githubusercontent.com/ZHKKKe/MODNet/2938675e4b5c60ab5f5d7a2b2191c68256f99d70/src/models/backbones/__init__.py
  - name: src/models/backbones/mobilenetv2.py
    size: 5588
    checksum: c100361a1b06a3751fd5b7720cb4a882153d62362421e51ef8ecccadf75acdcbd40376477739ac35f404a76388b01121
    source: https://raw.githubusercontent.com/ZHKKKe/MODNet/2938675e4b5c60ab5f5d7a2b2191c68256f99d70/src/models/backbones/mobilenetv2.py
  - name: src/models/backbones/wrapper.py
    size: 2610
    checksum: c627e513d6aca544c60fc7875c159e36563542c7d9a746e5b79c1a650118b6c8920561587061979d78993f12be906bbc
    source: https://raw.githubusercontent.com/ZHKKKe/MODNet/2938675e4b5c60ab5f5d7a2b2191c68256f99d70/src/models/backbones/wrapper.py
  - name: modnet_photographic_portrait_matting.ckpt
    size: 26255603
    checksum: 14fb9db68be32bbef0acd42d8925cf4750d2188c1bdd86399442a67f20f4a6f827f9ed1a56a3dc20258baaa004af9980
    original_source:
      $type: google_drive
      id: 1mcr7ALciuAsHCpLnrtG_eop5-EYhbCmz
    source: https://storage.openvinotoolkit.org/repositories/open_model_zoo/public/2022.2/modnet-photographic-portrait-matting/modnet_photographic_portrait_matting.ckpt
conversion_to_onnx_args:
  - --model-path=$dl_dir
  - --model-path=$config_dir
  - --model-name=create_modnet
  - --import-module=model
  - --model-param=weights=r"$dl_dir/modnet_photographic_portrait_matting.ckpt"
  - --input-shape=1,3,512,512
  - --input-names=input
  - --output-names=output
  - --output-file=$conv_dir/modnet_photographic_portrait_matting.onnx
input_info:
  - name: input
    shape: [1, 3, 512, 512]
    layout: NCHW
model_optimizer_args:
  - --input_model=$conv_dir/modnet_photographic_portrait_matting.onnx
  - --scale_values=input[127.5]
  - --mean_values=input[127.5, 127.5, 127.5]
  - --reverse_input_channels
  - --output=output
framework: pytorch
license: https://raw.githubusercontent.com/ZHKKKe/MODNet/master/LICENSE
```
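
The `conversion_to_onnx_args` above drive OMZ's PyTorch-to-ONNX conversion script; in essence they amount to a standard `torch.onnx.export` call. A simplified sketch of the equivalent export (the converter handles this internally; paths are hypothetical):

```python
import torch
from model import create_modnet  # the loader from model.py above

model = create_modnet('modnet_photographic_portrait_matting.ckpt')
model.eval()

dummy_input = torch.randn(1, 3, 512, 512)  # matches --input-shape
torch.onnx.export(model, dummy_input, 'modnet_photographic_portrait_matting.onnx',
                  input_names=['input'], output_names=['output'])
```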
