From 477ec2e6059e5aaa8030cb4d3e1349f0047d3952 Mon Sep 17 00:00:00 2001
From: marsggbo <1435679023@qq>
Date: Tue, 20 Jun 2023 05:45:08 -0700
Subject: [PATCH] add proxylessnas

---
 .../model/network_cfg/proxylessnas.yaml    |   9 +
 hyperbox/networks/proxylessnas/__init__.py |   0
 hyperbox/networks/proxylessnas/network.py  | 140 ++++++++
 hyperbox/networks/proxylessnas/ops.py      | 334 ++++++++++++++++++
 hyperbox/networks/proxylessnas/putils.py   |  67 ++++
 5 files changed, 550 insertions(+)
 create mode 100644 hyperbox/configs/model/network_cfg/proxylessnas.yaml
 create mode 100644 hyperbox/networks/proxylessnas/__init__.py
 create mode 100644 hyperbox/networks/proxylessnas/network.py
 create mode 100644 hyperbox/networks/proxylessnas/ops.py
 create mode 100644 hyperbox/networks/proxylessnas/putils.py

diff --git a/hyperbox/configs/model/network_cfg/proxylessnas.yaml b/hyperbox/configs/model/network_cfg/proxylessnas.yaml
new file mode 100644
index 0000000..a5f8766
--- /dev/null
+++ b/hyperbox/configs/model/network_cfg/proxylessnas.yaml
@@ -0,0 +1,9 @@
+_target_: hyperbox.networks.proxylessnas.network.ProxylessNAS
+width_stages: [24,40,80,96,192,320]
+n_cell_stages: [4,4,4,4,4,1]
+stride_stages: [2,2,2,1,2,1]
+width_mult: 1
+num_classes: 1000
+dropout_rate: 0
+bn_param: [0.1,1e-3]
+mask: null
\ No newline at end of file
diff --git a/hyperbox/networks/proxylessnas/__init__.py b/hyperbox/networks/proxylessnas/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hyperbox/networks/proxylessnas/network.py b/hyperbox/networks/proxylessnas/network.py
new file mode 100644
index 0000000..10c6bc1
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/network.py
@@ -0,0 +1,140 @@
+import torch
+import torch.nn as nn
+import math
+
+from copy import deepcopy
+from hyperbox.networks.proxylessnas import ops, putils
+from hyperbox.mutables.spaces import OperationSpace
+from hyperbox.networks.base_nas_network import BaseNASNetwork
+
+
+class ProxylessNAS(BaseNASNetwork):
+    def __init__(self,
+                 width_stages=[24,40,80,96,192,320],
+                 n_cell_stages=[4,4,4,4,4,1],
+                 stride_stages=[2,2,2,1,2,1],
+                 width_mult=1, num_classes=1000,
+                 dropout_rate=0, bn_param=(0.1, 1e-3),
+                 mask=None):
+        """
+        Parameters
+        ----------
+        width_stages: list of int
+            width (output channels) of each cell stage in the block
+        n_cell_stages: list of int
+            number of cells in each cell stage
+        stride_stages: list of int
+            stride of each cell stage in the block
+        width_mult : int or float
+            the scale factor of width
+        """
+        super(ProxylessNAS, self).__init__(mask=mask)
+
+        input_channel = putils.make_divisible(32 * width_mult, 8)
+        first_cell_width = putils.make_divisible(16 * width_mult, 8)
+        width_stages = deepcopy(width_stages)
+        for i in range(len(width_stages)):
+            width_stages[i] = putils.make_divisible(width_stages[i] * width_mult, 8)
+        # first conv
+        first_conv = ops.ConvLayer(3, input_channel, kernel_size=3, stride=2, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
+        # first block
+        first_block_conv = ops.OPS['3x3_MBConv1'](input_channel, first_cell_width, 1)
+        first_block = first_block_conv
+
+        input_channel = first_cell_width
+
+        blocks = [first_block]
+
+        stage_cnt = 0
+        for width, n_cell, s in zip(width_stages, n_cell_stages, stride_stages):
+            for i in range(n_cell):
+                if i == 0:
+                    stride = s
+                else:
+                    stride = 1
+                op_candidates = [ops.OPS['3x3_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['3x3_MBConv6'](input_channel, width, stride),
+                                 ops.OPS['5x5_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['5x5_MBConv6'](input_channel, width, stride),
+                                 ops.OPS['7x7_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['7x7_MBConv6'](input_channel, width, stride)]
+                if stride == 1 and input_channel == width:
+                    # if it is not the first one
+                    op_candidates += [ops.OPS['Zero'](input_channel, width, stride)]
+                conv_op = OperationSpace(op_candidates, key="s{}_c{}".format(stage_cnt, i), mask=self.mask, return_mask=True)
+                # shortcut
+                if stride == 1 and input_channel == width:
+                    # if not first cell
+                    shortcut = ops.IdentityLayer(input_channel, input_channel)
+                else:
+                    shortcut = None
+                inverted_residual_block = ops.MobileInvertedResidualBlock(conv_op, shortcut, op_candidates)
+                blocks.append(inverted_residual_block)
+                input_channel = width
+            stage_cnt += 1
+
+        # feature mix layer
+        last_channel = putils.make_divisible(1280 * width_mult, 8) if width_mult > 1.0 else 1280
+        feature_mix_layer = ops.ConvLayer(input_channel, last_channel, kernel_size=1, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
+        classifier = ops.LinearLayer(last_channel, num_classes, dropout_rate=dropout_rate)
+
+        self.first_conv = first_conv
+        self.blocks = nn.ModuleList(blocks)
+        self.feature_mix_layer = feature_mix_layer
+        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
+        self.classifier = classifier
+
+        # set bn param
+        self.set_bn_param(momentum=bn_param[0], eps=bn_param[1])
+
+    def forward(self, x):
+        self.features = []
+        x = self.first_conv(x)
+        for block in self.blocks:
+            x = block(x)
+            self.features.append(x.detach())
+        x = self.feature_mix_layer(x)
+        self.features.append(x.detach())
+        x = self.global_avg_pooling(x)
+        x = x.view(x.size(0), -1)
+        x = self.classifier(x)
+        self.features.append(x.detach())
+        return x
+
+    def set_bn_param(self, momentum, eps):
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
+                m.momentum = momentum
+                m.eps = eps
+        return
+
+    def init_model(self, model_init='he_fout', init_div_groups=False):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                if model_init == 'he_fout':
+                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                    if init_div_groups:
+                        n /= m.groups
+                    m.weight.data.normal_(0, math.sqrt(2. / n))
+                elif model_init == 'he_fin':
+                    n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
+                    if init_div_groups:
+                        n /= m.groups
+                    m.weight.data.normal_(0, math.sqrt(2. / n))
+                else:
+                    raise NotImplementedError
+            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                stdv = 1. / math.sqrt(m.weight.size(1))
+                m.weight.data.uniform_(-stdv, stdv)
+                if m.bias is not None:
+                    m.bias.data.zero_()
+
+if __name__ == '__main__':
+    from hyperbox.mutator import RandomMutator
+    net = ProxylessNAS()
+    rm = RandomMutator(net)
+    rm.reset()
+    print(rm._cache, len(rm._cache))
\ No newline at end of file
diff --git a/hyperbox/networks/proxylessnas/ops.py b/hyperbox/networks/proxylessnas/ops.py
new file mode 100644
index 0000000..25c4a6f
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/ops.py
@@ -0,0 +1,334 @@
+from collections import OrderedDict
+import torch
+import torch.nn as nn
+
+from hyperbox.networks.proxylessnas.putils import get_same_padding, build_activation
+
+
+OPS = {
+    'Identity': lambda in_C, out_C, stride: IdentityLayer(in_C, out_C, ops_order='weight_bn_act'),
+    'Zero': lambda in_C, out_C, stride: ZeroLayer(stride=stride),
+    '3x3_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 1),
+    '3x3_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 2),
+    '3x3_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 3),
+    '3x3_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 4),
+    '3x3_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 5),
+    '3x3_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 6),
+    '5x5_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 1),
+    '5x5_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 2),
+    '5x5_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 3),
+    '5x5_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 4),
+    '5x5_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 5),
+    '5x5_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 6),
+    '7x7_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 1),
+    '7x7_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 2),
+    '7x7_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 3),
+    '7x7_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 4),
+    '7x7_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 5),
+    '7x7_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 6)
+}
+
+
+class MobileInvertedResidualBlock(nn.Module):
+
+    def __init__(self, mobile_inverted_conv, shortcut, op_candidates_list):
+        super(MobileInvertedResidualBlock, self).__init__()
+
+        self.mobile_inverted_conv = mobile_inverted_conv
+        self.shortcut = shortcut
+        self.op_candidates_list = op_candidates_list
+
+    def forward(self, x):
+        out, idx = self.mobile_inverted_conv(x)
+        # TODO: unify idx format
+        if not isinstance(idx, int):
+            idx = torch.nonzero(idx == 1)
+        if len(idx)==1 and self.op_candidates_list[idx].is_zero_layer():
+            res = x
+        elif self.shortcut is None:
+            res = out
+        else:
+            conv_x = out
+            skip_x = self.shortcut(x)
+            res = skip_x + conv_x
+        return res
+
+
+class ShuffleLayer(nn.Module):
+    def __init__(self, groups):
+        super(ShuffleLayer, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        batchsize, num_channels, height, width = x.size()
+        channels_per_group = num_channels // self.groups
+        # reshape
+        x = x.view(batchsize, self.groups, channels_per_group, height, width)
+        # noinspection PyUnresolvedReferences
+        x = torch.transpose(x, 1, 2).contiguous()
+        # flatten
+        x = x.view(batchsize, -1, height, width)
+        return x
+
+
+class Base2DLayer(nn.Module):
+
+    def __init__(self, in_channels, out_channels,
+                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
+        super(Base2DLayer, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.use_bn = use_bn
+        self.act_func = act_func
+        self.dropout_rate = dropout_rate
+        self.ops_order = ops_order
+
+        """ modules """
+        modules = {}
+        # batch norm
+        if self.use_bn:
+            if self.bn_before_weight:
+                modules['bn'] = nn.BatchNorm2d(in_channels)
+            else:
+                modules['bn'] = nn.BatchNorm2d(out_channels)
+        else:
+            modules['bn'] = None
+        # activation
+        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
+        # dropout
+        if self.dropout_rate > 0:
+            modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True)
+        else:
+            modules['dropout'] = None
+        # weight
+        modules['weight'] = self.weight_op()
+
+        # add modules
+        for op in self.ops_list:
+            if modules[op] is None:
+                continue
+            elif op == 'weight':
+                if modules['dropout'] is not None:
+                    self.add_module('dropout', modules['dropout'])
+                for key in modules['weight']:
+                    self.add_module(key, modules['weight'][key])
+            else:
+                self.add_module(op, modules[op])
+
+    @property
+    def ops_list(self):
+        return self.ops_order.split('_')
+
+    @property
+    def bn_before_weight(self):
+        for op in self.ops_list:
+            if op == 'bn':
+                return True
+            elif op == 'weight':
+                return False
+        raise ValueError('Invalid ops_order: %s' % self.ops_order)
+
+    def weight_op(self):
+        raise NotImplementedError
+
+    def forward(self, x):
+        for module in self._modules.values():
+            x = module(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class ConvLayer(Base2DLayer):
+
+    def __init__(self, in_channels, out_channels,
+                 kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False,
+                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.bias = bias
+        self.has_shuffle = has_shuffle
+
+        super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)
+
+    def weight_op(self):
+        padding = get_same_padding(self.kernel_size)
+        if isinstance(padding, int):
+            padding *= self.dilation
+        else:
+            padding[0] *= self.dilation
+            padding[1] *= self.dilation
+
+        weight_dict = OrderedDict()
+        weight_dict['conv'] = nn.Conv2d(
+            self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding,
+            dilation=self.dilation, groups=self.groups, bias=self.bias
+        )
+        if self.has_shuffle and self.groups > 1:
+            weight_dict['shuffle'] = ShuffleLayer(self.groups)
+
+        return weight_dict
+
+
+class IdentityLayer(Base2DLayer):
+
+    def __init__(self, in_channels, out_channels,
+                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
+        super(IdentityLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)
+
+    def weight_op(self):
+        return None
+
+
+class LinearLayer(nn.Module):
+
+    def __init__(self, in_features, out_features, bias=True,
+                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
+        super(LinearLayer, self).__init__()
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bias = bias
+
+        self.use_bn = use_bn
+        self.act_func = act_func
+        self.dropout_rate = dropout_rate
+        self.ops_order = ops_order
+
+        """ modules """
+        modules = {}
+        # batch norm
+        if self.use_bn:
+            if self.bn_before_weight:
+                modules['bn'] = nn.BatchNorm1d(in_features)
+            else:
+                modules['bn'] = nn.BatchNorm1d(out_features)
+        else:
+            modules['bn'] = None
+        # activation
+        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
+        # dropout
+        if self.dropout_rate > 0:
+            modules['dropout'] = nn.Dropout(self.dropout_rate, inplace=True)
+        else:
+            modules['dropout'] = None
+        # linear
+        modules['weight'] = {'linear': nn.Linear(self.in_features, self.out_features, self.bias)}
+
+        # add modules
+        for op in self.ops_list:
+            if modules[op] is None:
+                continue
+            elif op == 'weight':
+                if modules['dropout'] is not None:
+                    self.add_module('dropout', modules['dropout'])
+                for key in modules['weight']:
+                    self.add_module(key, modules['weight'][key])
+            else:
+                self.add_module(op, modules[op])
+
+    @property
+    def ops_list(self):
+        return self.ops_order.split('_')
+
+    @property
+    def bn_before_weight(self):
+        for op in self.ops_list:
+            if op == 'bn':
+                return True
+            elif op == 'weight':
+                return False
+        raise ValueError('Invalid ops_order: %s' % self.ops_order)
+
+    def forward(self, x):
+        for module in self._modules.values():
+            x = module(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class MBInvertedConvLayer(nn.Module):
+    """
+    This layer is introduced in Section 4.2 of the paper https://arxiv.org/pdf/1812.00332.pdf
+    """
+    def __init__(self, in_channels, out_channels,
+                 kernel_size=3, stride=1, expand_ratio=6, mid_channels=None):
+        super(MBInvertedConvLayer, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.expand_ratio = expand_ratio
+        self.mid_channels = mid_channels
+
+        if self.mid_channels is None:
+            feature_dim = round(self.in_channels * self.expand_ratio)
+        else:
+            feature_dim = self.mid_channels
+
+        if self.expand_ratio == 1:
+            self.inverted_bottleneck = None
+        else:
+            self.inverted_bottleneck = nn.Sequential(OrderedDict([
+                ('conv', nn.Conv2d(self.in_channels, feature_dim, 1, 1, 0, bias=False)),
+                ('bn', nn.BatchNorm2d(feature_dim)),
+                ('act', nn.ReLU6(inplace=True)),
+            ]))
+
+        pad = get_same_padding(self.kernel_size)
+        self.depth_conv = nn.Sequential(OrderedDict([
+            ('conv', nn.Conv2d(feature_dim, feature_dim, kernel_size, stride, pad, groups=feature_dim, bias=False)),
+            ('bn', nn.BatchNorm2d(feature_dim)),
+            ('act', nn.ReLU6(inplace=True)),
+        ]))
+
+        self.point_linear = nn.Sequential(OrderedDict([
+            ('conv', nn.Conv2d(feature_dim, out_channels, 1, 1, 0, bias=False)),
+            ('bn', nn.BatchNorm2d(out_channels)),
+        ]))
+
+    def forward(self, x):
+        if self.inverted_bottleneck:
+            x = self.inverted_bottleneck(x)
+        x = self.depth_conv(x)
+        x = self.point_linear(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class ZeroLayer(nn.Module):
+
+    def __init__(self, stride=None):
+        super(ZeroLayer, self).__init__()
+        self.stride = stride
+
+    def forward(self, x):
+        '''n, c, h, w = x.size()
+        h //= self.stride
+        w //= self.stride
+        device = x.get_device() if x.is_cuda else torch.device('cpu')
+        # noinspection PyUnresolvedReferences
+        padding = torch.zeros(n, c, h, w, device=device, requires_grad=False)
+        return padding'''
+        if self.stride == 1:
+            return x.mul(0.)
+        if len(x.shape)==4:
+            return x[:,:,::self.stride,::self.stride].mul(0.)
+        elif len(x.shape)==5:
+            return x[:,:,::self.stride,::self.stride,::self.stride].mul(0.)
+
+    @staticmethod
+    def is_zero_layer():
+        return True
diff --git a/hyperbox/networks/proxylessnas/putils.py b/hyperbox/networks/proxylessnas/putils.py
new file mode 100644
index 0000000..c490006
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/putils.py
@@ -0,0 +1,67 @@
+import torch.nn as nn
+
+
+def get_parameters(model, keys=None, mode='include'):
+    if keys is None:
+        for name, param in model.named_parameters():
+            yield param
+    elif mode == 'include':
+        for name, param in model.named_parameters():
+            flag = False
+            for key in keys:
+                if key in name:
+                    flag = True
+                    break
+            if flag:
+                yield param
+    elif mode == 'exclude':
+        for name, param in model.named_parameters():
+            flag = True
+            for key in keys:
+                if key in name:
+                    flag = False
+                    break
+            if flag:
+                yield param
+    else:
+        raise ValueError('do not support: %s' % mode)
+
+
+def get_same_padding(kernel_size):
+    if isinstance(kernel_size, tuple):
+        assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size
+        p1 = get_same_padding(kernel_size[0])
+        p2 = get_same_padding(kernel_size[1])
+        return p1, p2
+    assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`'
+    assert kernel_size % 2 > 0, 'kernel size should be odd number'
+    return kernel_size // 2
+
+
+def build_activation(act_func, inplace=True):
+    if act_func == 'relu':
+        return nn.ReLU(inplace=inplace)
+    elif act_func == 'relu6':
+        return nn.ReLU6(inplace=inplace)
+    elif act_func == 'tanh':
+        return nn.Tanh()
+    elif act_func == 'sigmoid':
+        return nn.Sigmoid()
+    elif act_func is None:
+        return None
+    else:
+        raise ValueError('do not support: %s' % act_func)
+
+
+def make_divisible(v, divisor, min_val=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8.
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_val is None:
+        min_val = divisor
+    new_v = max(min_val, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
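
A minimal usage sketch for the network added by this patch, following the RandomMutator workflow from the __main__ block in network.py; the forward pass on a dummy batch is only an illustration, and the 64x64 resolution and batch size of 2 are arbitrary choices (any resolution that survives the 64x total downsampling works):

    import torch

    from hyperbox.mutator import RandomMutator
    from hyperbox.networks.proxylessnas.network import ProxylessNAS

    net = ProxylessNAS(num_classes=1000)
    mutator = RandomMutator(net)
    mutator.reset()  # sample one sub-network (one candidate op per OperationSpace)

    x = torch.randn(2, 3, 64, 64)  # dummy batch; shape chosen only for illustration
    logits = net(x)
    print(logits.shape)            # expected: torch.Size([2, 1000])

Calling mutator.reset() again re-samples a different sub-network over the same supernet weights, which is how random-search style evaluation is typically driven.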
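The YAML under hyperbox/configs/model/network_cfg/ is just a bag of constructor arguments keyed by _target_. A sketch of how it could be consumed, assuming the standard Hydra/OmegaConf instantiate convention that _target_ implies (the exact entry point hyperbox wires this config into may differ):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("hyperbox/configs/model/network_cfg/proxylessnas.yaml")
    net = instantiate(cfg)         # roughly ProxylessNAS(width_stages=[24, 40, ...], ..., mask=None)
    print(type(net).__name__)      # ProxylessNAS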