From 477ec2e6059e5aaa8030cb4d3e1349f0047d3952 Mon Sep 17 00:00:00 2001
From: marsggbo <1435679023@qq>
Date: Tue, 20 Jun 2023 05:45:08 -0700
Subject: [PATCH] add proxylessnas

---
 .../model/network_cfg/proxylessnas.yaml    |   9 +
 hyperbox/networks/proxylessnas/__init__.py |   0
 hyperbox/networks/proxylessnas/network.py  | 140 ++++++++
 hyperbox/networks/proxylessnas/ops.py      | 334 ++++++++++++++++++
 hyperbox/networks/proxylessnas/putils.py   |  67 ++++
 5 files changed, 550 insertions(+)
 create mode 100644 hyperbox/configs/model/network_cfg/proxylessnas.yaml
 create mode 100644 hyperbox/networks/proxylessnas/__init__.py
 create mode 100644 hyperbox/networks/proxylessnas/network.py
 create mode 100644 hyperbox/networks/proxylessnas/ops.py
 create mode 100644 hyperbox/networks/proxylessnas/putils.py

diff --git a/hyperbox/configs/model/network_cfg/proxylessnas.yaml b/hyperbox/configs/model/network_cfg/proxylessnas.yaml
new file mode 100644
index 0000000..a5f8766
--- /dev/null
+++ b/hyperbox/configs/model/network_cfg/proxylessnas.yaml
@@ -0,0 +1,9 @@
+_target_: hyperbox.networks.proxylessnas.network.ProxylessNAS
+width_stages: [24,40,80,96,192,320]
+n_cell_stages: [4,4,4,4,4,1]
+stride_stages: [2,2,2,1,2,1]
+width_mult: 1
+num_classes: 1000
+dropout_rate: 0
+bn_param: [0.1,1e-3]
+mask: null
\ No newline at end of file
diff --git a/hyperbox/networks/proxylessnas/__init__.py b/hyperbox/networks/proxylessnas/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/hyperbox/networks/proxylessnas/network.py b/hyperbox/networks/proxylessnas/network.py
new file mode 100644
index 0000000..10c6bc1
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/network.py
@@ -0,0 +1,140 @@
+import torch
+import torch.nn as nn
+import math
+
+from copy import deepcopy
+from hyperbox.networks.proxylessnas import ops, putils
+from hyperbox.mutables.spaces import OperationSpace
+from hyperbox.networks.base_nas_network import BaseNASNetwork
+
+
+class ProxylessNAS(BaseNASNetwork):
+    def __init__(self,
+                 width_stages=[24,40,80,96,192,320],
+                 n_cell_stages=[4,4,4,4,4,1],
+                 stride_stages=[2,2,2,1,2,1],
+                 width_mult=1, num_classes=1000,
+                 dropout_rate=0, bn_param=(0.1, 1e-3),
+                 mask=None):
+        """
+        Parameters
+        ----------
+        width_stages: list of int
+            width (output channels) of each cell stage in the block
+        n_cell_stages: list of int
+            number of cells in each cell stage
+        stride_stages: list of int
+            stride of each cell stage in the block
+        width_mult : int or float
+            the scale factor of width
+        """
+        super(ProxylessNAS, self).__init__(mask=mask)
+
+        input_channel = putils.make_divisible(32 * width_mult, 8)
+        first_cell_width = putils.make_divisible(16 * width_mult, 8)
+        width_stages = deepcopy(width_stages)
+        for i in range(len(width_stages)):
+            width_stages[i] = putils.make_divisible(width_stages[i] * width_mult, 8)
+        # first conv
+        first_conv = ops.ConvLayer(3, input_channel, kernel_size=3, stride=2, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
+        # first block
+        first_block_conv = ops.OPS['3x3_MBConv1'](input_channel, first_cell_width, 1)
+        first_block = first_block_conv
+
+        input_channel = first_cell_width
+
+        blocks = [first_block]
+
+        stage_cnt = 0
+        for width, n_cell, s in zip(width_stages, n_cell_stages, stride_stages):
+            for i in range(n_cell):
+                if i == 0:
+                    stride = s
+                else:
+                    stride = 1
+                op_candidates = [ops.OPS['3x3_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['3x3_MBConv6'](input_channel, width, stride),
+                                 ops.OPS['5x5_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['5x5_MBConv6'](input_channel, width, stride),
+                                 ops.OPS['7x7_MBConv3'](input_channel, width, stride),
+                                 ops.OPS['7x7_MBConv6'](input_channel, width, stride)]
+                if stride == 1 and input_channel == width:
+                    # if it is not the first one
+                    op_candidates += [ops.OPS['Zero'](input_channel, width, stride)]
+                conv_op = OperationSpace(op_candidates, key="s{}_c{}".format(stage_cnt, i), mask=self.mask, return_mask=True)
+                # shortcut
+                if stride == 1 and input_channel == width:
+                    # if not first cell
+                    shortcut = ops.IdentityLayer(input_channel, input_channel)
+                else:
+                    shortcut = None
+                inverted_residual_block = ops.MobileInvertedResidualBlock(conv_op, shortcut, op_candidates)
+                blocks.append(inverted_residual_block)
+                input_channel = width
+            stage_cnt += 1
+
+        # feature mix layer
+        last_channel = putils.make_divisible(1280 * width_mult, 8) if width_mult > 1.0 else 1280
+        feature_mix_layer = ops.ConvLayer(input_channel, last_channel, kernel_size=1, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
+        classifier = ops.LinearLayer(last_channel, num_classes, dropout_rate=dropout_rate)
+
+        self.first_conv = first_conv
+        self.blocks = nn.ModuleList(blocks)
+        self.feature_mix_layer = feature_mix_layer
+        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
+        self.classifier = classifier
+
+        # set bn param
+        self.set_bn_param(momentum=bn_param[0], eps=bn_param[1])
+
+    def forward(self, x):
+        self.features = []
+        x = self.first_conv(x)
+        for block in self.blocks:
+            x = block(x)
+            self.features.append(x.detach())
+        x = self.feature_mix_layer(x)
+        self.features.append(x.detach())
+        x = self.global_avg_pooling(x)
+        x = x.view(x.size(0), -1)
+        x = self.classifier(x)
+        self.features.append(x.detach())
+        return x
+
+    def set_bn_param(self, momentum, eps):
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
+                m.momentum = momentum
+                m.eps = eps
+        return
+
+    def init_model(self, model_init='he_fout', init_div_groups=False):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                if model_init == 'he_fout':
+                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                    if init_div_groups:
+                        n /= m.groups
+                    m.weight.data.normal_(0, math.sqrt(2. / n))
+                elif model_init == 'he_fin':
+                    n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
+                    if init_div_groups:
+                        n /= m.groups
+                    m.weight.data.normal_(0, math.sqrt(2. / n))
+                else:
+                    raise NotImplementedError
+            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                stdv = 1. / math.sqrt(m.weight.size(1))
+                m.weight.data.uniform_(-stdv, stdv)
+                if m.bias is not None:
+                    m.bias.data.zero_()
+
+if __name__ == '__main__':
+    from hyperbox.mutator import RandomMutator
+    net = ProxylessNAS()
+    rm = RandomMutator(net)
+    rm.reset()
+    print(rm._cache, len(rm._cache))
\ No newline at end of file
diff --git a/hyperbox/networks/proxylessnas/ops.py b/hyperbox/networks/proxylessnas/ops.py
new file mode 100644
index 0000000..25c4a6f
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/ops.py
@@ -0,0 +1,334 @@
+from collections import OrderedDict
+import torch
+import torch.nn as nn
+
+from hyperbox.networks.proxylessnas.putils import get_same_padding, build_activation
+
+
+OPS = {
+    'Identity': lambda in_C, out_C, stride: IdentityLayer(in_C, out_C, ops_order='weight_bn_act'),
+    'Zero': lambda in_C, out_C, stride: ZeroLayer(stride=stride),
+    '3x3_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 1),
+    '3x3_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 2),
+    '3x3_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 3),
+    '3x3_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 4),
+    '3x3_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 5),
+    '3x3_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 3, stride, 6),
+    '5x5_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 1),
+    '5x5_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 2),
+    '5x5_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 3),
+    '5x5_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 4),
+    '5x5_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 5),
+    '5x5_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 5, stride, 6),
+    '7x7_MBConv1': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 1),
+    '7x7_MBConv2': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 2),
+    '7x7_MBConv3': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 3),
+    '7x7_MBConv4': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 4),
+    '7x7_MBConv5': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 5),
+    '7x7_MBConv6': lambda in_C, out_C, stride: MBInvertedConvLayer(in_C, out_C, 7, stride, 6)
+}
+
+
+class MobileInvertedResidualBlock(nn.Module):
+
+    def __init__(self, mobile_inverted_conv, shortcut, op_candidates_list):
+        super(MobileInvertedResidualBlock, self).__init__()
+
+        self.mobile_inverted_conv = mobile_inverted_conv
+        self.shortcut = shortcut
+        self.op_candidates_list = op_candidates_list
+
+    def forward(self, x):
+        out, idx = self.mobile_inverted_conv(x)
+        # TODO: unify idx format
+        if not isinstance(idx, int):
+            idx = torch.nonzero(idx == 1)
+        if len(idx)==1 and self.op_candidates_list[idx].is_zero_layer():
+            res = x
+        elif self.shortcut is None:
+            res = out
+        else:
+            conv_x = out
+            skip_x = self.shortcut(x)
+            res = skip_x + conv_x
+        return res
+
+
+class ShuffleLayer(nn.Module):
+    def __init__(self, groups):
+        super(ShuffleLayer, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        batchsize, num_channels, height, width = x.size()
+        channels_per_group = num_channels // self.groups
+        # reshape
+        x = x.view(batchsize, self.groups, channels_per_group, height, width)
+        # noinspection PyUnresolvedReferences
+        x = torch.transpose(x, 1, 2).contiguous()
+        # flatten
+        x = x.view(batchsize, -1, height, width)
+        return x
+
+
+class Base2DLayer(nn.Module):
+
+    def __init__(self, in_channels, out_channels,
+                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
+        super(Base2DLayer, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.use_bn = use_bn
+        self.act_func = act_func
+        self.dropout_rate = dropout_rate
+        self.ops_order = ops_order
+
+        """ modules """
+        modules = {}
+        # batch norm
+        if self.use_bn:
+            if self.bn_before_weight:
+                modules['bn'] = nn.BatchNorm2d(in_channels)
+            else:
+                modules['bn'] = nn.BatchNorm2d(out_channels)
+        else:
+            modules['bn'] = None
+        # activation
+        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
+        # dropout
+        if self.dropout_rate > 0:
+            modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True)
+        else:
+            modules['dropout'] = None
+        # weight
+        modules['weight'] = self.weight_op()
+
+        # add modules
+        for op in self.ops_list:
+            if modules[op] is None:
+                continue
+            elif op == 'weight':
+                if modules['dropout'] is not None:
+                    self.add_module('dropout', modules['dropout'])
+                for key in modules['weight']:
+                    self.add_module(key, modules['weight'][key])
+            else:
+                self.add_module(op, modules[op])
+
+    @property
+    def ops_list(self):
+        return self.ops_order.split('_')
+
+    @property
+    def bn_before_weight(self):
+        for op in self.ops_list:
+            if op == 'bn':
+                return True
+            elif op == 'weight':
+                return False
+        raise ValueError('Invalid ops_order: %s' % self.ops_order)
+
+    def weight_op(self):
+        raise NotImplementedError
+
+    def forward(self, x):
+        for module in self._modules.values():
+            x = module(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class ConvLayer(Base2DLayer):
+
+    def __init__(self, in_channels, out_channels,
+                 kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False,
+                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.dilation = dilation
+        self.groups = groups
+        self.bias = bias
+        self.has_shuffle = has_shuffle
+
+        super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)
+
+    def weight_op(self):
+        padding = get_same_padding(self.kernel_size)
+        if isinstance(padding, int):
+            padding *= self.dilation
+        else:
+            padding[0] *= self.dilation
+            padding[1] *= self.dilation
+
+        weight_dict = OrderedDict()
+        weight_dict['conv'] = nn.Conv2d(
+            self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding,
+            dilation=self.dilation, groups=self.groups, bias=self.bias
+        )
+        if self.has_shuffle and self.groups > 1:
+            weight_dict['shuffle'] = ShuffleLayer(self.groups)
+
+        return weight_dict
+
+
+class IdentityLayer(Base2DLayer):
+
+    def __init__(self, in_channels, out_channels,
+                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
+        super(IdentityLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)
+
+    def weight_op(self):
+        return None
+
+
+class LinearLayer(nn.Module):
+
+    def __init__(self, in_features, out_features, bias=True,
+                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
+        super(LinearLayer, self).__init__()
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bias = bias
+
+        self.use_bn = use_bn
+        self.act_func = act_func
+        self.dropout_rate = dropout_rate
+        self.ops_order = ops_order
+
+        """ modules """
+        modules = {}
+        # batch norm
+        if self.use_bn:
+            if self.bn_before_weight:
+                modules['bn'] = nn.BatchNorm1d(in_features)
+            else:
+                modules['bn'] = nn.BatchNorm1d(out_features)
+        else:
+            modules['bn'] = None
+        # activation
+        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
+        # dropout
+        if self.dropout_rate > 0:
+            modules['dropout'] = nn.Dropout(self.dropout_rate, inplace=True)
+        else:
+            modules['dropout'] = None
+        # linear
+        modules['weight'] = {'linear': nn.Linear(self.in_features, self.out_features, self.bias)}
+
+        # add modules
+        for op in self.ops_list:
+            if modules[op] is None:
+                continue
+            elif op == 'weight':
+                if modules['dropout'] is not None:
+                    self.add_module('dropout', modules['dropout'])
+                for key in modules['weight']:
+                    self.add_module(key, modules['weight'][key])
+            else:
+                self.add_module(op, modules[op])
+
+    @property
+    def ops_list(self):
+        return self.ops_order.split('_')
+
+    @property
+    def bn_before_weight(self):
+        for op in self.ops_list:
+            if op == 'bn':
+                return True
+            elif op == 'weight':
+                return False
+        raise ValueError('Invalid ops_order: %s' % self.ops_order)
+
+    def forward(self, x):
+        for module in self._modules.values():
+            x = module(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class MBInvertedConvLayer(nn.Module):
+    """
+    This layer is introduced in Section 4.2 of the paper https://arxiv.org/pdf/1812.00332.pdf
+    """
+    def __init__(self, in_channels, out_channels,
+                 kernel_size=3, stride=1, expand_ratio=6, mid_channels=None):
+        super(MBInvertedConvLayer, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.expand_ratio = expand_ratio
+        self.mid_channels = mid_channels
+
+        if self.mid_channels is None:
+            feature_dim = round(self.in_channels * self.expand_ratio)
+        else:
+            feature_dim = self.mid_channels
+
+        if self.expand_ratio == 1:
+            self.inverted_bottleneck = None
+        else:
+            self.inverted_bottleneck = nn.Sequential(OrderedDict([
+                ('conv', nn.Conv2d(self.in_channels, feature_dim, 1, 1, 0, bias=False)),
+                ('bn', nn.BatchNorm2d(feature_dim)),
+                ('act', nn.ReLU6(inplace=True)),
+            ]))
+
+        pad = get_same_padding(self.kernel_size)
+        self.depth_conv = nn.Sequential(OrderedDict([
+            ('conv', nn.Conv2d(feature_dim, feature_dim, kernel_size, stride, pad, groups=feature_dim, bias=False)),
+            ('bn', nn.BatchNorm2d(feature_dim)),
+            ('act', nn.ReLU6(inplace=True)),
+        ]))
+
+        self.point_linear = nn.Sequential(OrderedDict([
+            ('conv', nn.Conv2d(feature_dim, out_channels, 1, 1, 0, bias=False)),
+            ('bn', nn.BatchNorm2d(out_channels)),
+        ]))
+
+    def forward(self, x):
+        if self.inverted_bottleneck:
+            x = self.inverted_bottleneck(x)
+        x = self.depth_conv(x)
+        x = self.point_linear(x)
+        return x
+
+    @staticmethod
+    def is_zero_layer():
+        return False
+
+
+class ZeroLayer(nn.Module):
+
+    def __init__(self, stride=None):
+        super(ZeroLayer, self).__init__()
+        self.stride = stride
+
+    def forward(self, x):
+        '''n, c, h, w = x.size()
+        h //= self.stride
+        w //= self.stride
+        device = x.get_device() if x.is_cuda else torch.device('cpu')
+        # noinspection PyUnresolvedReferences
+        padding = torch.zeros(n, c, h, w, device=device, requires_grad=False)
+        return padding'''
+        if self.stride == 1:
+            return x.mul(0.)
+        if len(x.shape)==4:
+            return x[:,:,::self.stride,::self.stride].mul(0.)
+        elif len(x.shape)==5:
+            return x[:,:,::self.stride,::self.stride,::self.stride].mul(0.)
+
+    @staticmethod
+    def is_zero_layer():
+        return True
diff --git a/hyperbox/networks/proxylessnas/putils.py b/hyperbox/networks/proxylessnas/putils.py
new file mode 100644
index 0000000..c490006
--- /dev/null
+++ b/hyperbox/networks/proxylessnas/putils.py
@@ -0,0 +1,67 @@
+import torch.nn as nn
+
+
+def get_parameters(model, keys=None, mode='include'):
+    if keys is None:
+        for name, param in model.named_parameters():
+            yield param
+    elif mode == 'include':
+        for name, param in model.named_parameters():
+            flag = False
+            for key in keys:
+                if key in name:
+                    flag = True
+                    break
+            if flag:
+                yield param
+    elif mode == 'exclude':
+        for name, param in model.named_parameters():
+            flag = True
+            for key in keys:
+                if key in name:
+                    flag = False
+                    break
+            if flag:
+                yield param
+    else:
+        raise ValueError('do not support: %s' % mode)
+
+
+def get_same_padding(kernel_size):
+    if isinstance(kernel_size, tuple):
+        assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size
+        p1 = get_same_padding(kernel_size[0])
+        p2 = get_same_padding(kernel_size[1])
+        return p1, p2
+    assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`'
+    assert kernel_size % 2 > 0, 'kernel size should be odd number'
+    return kernel_size // 2
+
+
+def build_activation(act_func, inplace=True):
+    if act_func == 'relu':
+        return nn.ReLU(inplace=inplace)
+    elif act_func == 'relu6':
+        return nn.ReLU6(inplace=inplace)
+    elif act_func == 'tanh':
+        return nn.Tanh()
+    elif act_func == 'sigmoid':
+        return nn.Sigmoid()
+    elif act_func is None:
+        return None
+    else:
+        raise ValueError('do not support: %s' % act_func)
+
+
+def make_divisible(v, divisor, min_val=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8.
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_val is None:
+        min_val = divisor
+    new_v = max(min_val, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
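
A minimal usage sketch for the network added by this patch, following the RandomMutator workflow from the __main__ block in network.py; the forward pass on a dummy batch is only an illustration, and the 64x64 resolution and batch size of 2 are arbitrary choices (any resolution that survives the 64x total downsampling works):

    import torch

    from hyperbox.mutator import RandomMutator
    from hyperbox.networks.proxylessnas.network import ProxylessNAS

    net = ProxylessNAS(num_classes=1000)
    mutator = RandomMutator(net)
    mutator.reset()  # sample one sub-network (one candidate op per OperationSpace)

    x = torch.randn(2, 3, 64, 64)  # dummy batch; shape chosen only for illustration
    logits = net(x)
    print(logits.shape)            # expected: torch.Size([2, 1000])

Calling mutator.reset() again re-samples a different sub-network over the same supernet weights, which is how random-search style evaluation is typically driven.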
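The YAML under hyperbox/configs/model/network_cfg/ is just a bag of constructor arguments keyed by _target_. A sketch of how it could be consumed, assuming the standard Hydra/OmegaConf instantiate convention that _target_ implies (the exact entry point hyperbox wires this config into may differ):

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("hyperbox/configs/model/network_cfg/proxylessnas.yaml")
    net = instantiate(cfg)         # roughly ProxylessNAS(width_stages=[24, 40, ...], ..., mask=None)
    print(type(net).__name__)      # ProxylessNAS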