This post is divided into two sections: Summary and Implementation.
We will take an in-depth look at the paper MobileNetV2: Inverted Residuals and Linear Bottlenecks, which introduces the MobileNetV2 architecture.
The implementation uses PyTorch as its framework. For the full implementation, please refer to this repository.
If you want to read other posts in the "Summary and Implementation" series, feel free to check them out on my blog.
Depthwise convolution: each input channel is convolved with its own single k×k filter (groups = number of channels), so spatial filtering is applied to every channel independently.
Pointwise convolution: a 1×1 convolution that linearly combines the channels at each spatial position, producing the desired number of output channels.
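As a minimal PyTorch sketch (the channel counts and feature-map size below are arbitrary placeholders), both operations map directly onto nn.Conv2d through its groups argument:

import torch
import torch.nn as nn

in_channels, out_channels = 32, 64  # arbitrary example sizes

# Depthwise: groups=in_channels gives each channel its own 3x3 filter.
depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=3,
                      padding=1, groups=in_channels, bias=False)

# Pointwise: a 1x1 convolution mixes channels and sets the output width.
pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)

x = torch.randn(1, in_channels, 56, 56)
print(pointwise(depthwise(x)).shape)  # torch.Size([1, 64, 56, 56])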
The main difference is the number of computations. A depthwise separable convolution is clearly less expensive than a standard convolution: for a k×k kernel and N output channels it costs roughly a fraction 1/N + 1/k² of the standard cost, which for the 3×3 kernels used here amounts to 8 to 9 times fewer operations.
The reason is that, in a standard convolution with, say, 256 output channels, we transform the image 256 times with full-depth k×k filters, whereas in a depthwise separable convolution we transform the image once (depthwise) and then expand it 256 times along the channel axis with cheap 1×1 filters (pointwise).
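To make this concrete, here is a quick back-of-the-envelope check. The dimensions are assumptions chosen for illustration (an 8×8 feature map, 3 input channels, 256 output channels, 3×3 kernels), not numbers from the paper:

# Assumed dimensions, for illustration only.
H, W, C_in, C_out, k = 8, 8, 3, 256, 3

# Standard convolution: every output channel applies a full-depth k x k filter.
standard = H * W * k * k * C_in * C_out   # 442,368 multiplications

# Depthwise separable: one k x k filter per input channel,
# then 1x1 pointwise filters to reach C_out channels.
depthwise = H * W * k * k * C_in          # 1,728
pointwise = H * W * C_in * C_out          # 49,152
separable = depthwise + pointwise         # 50,880

print(separable / standard)  # ~0.115, i.e. roughly 1/C_out + 1/k**2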
MobileNetV2 provides a natural separation between two properties that were tangled together in previous architectures: the capacity of the network (encoded by the bottleneck inputs and outputs) and its expressiveness (encoded by the expansion layers).
The authors point to exploring these two notions separately as an important direction for future research.
import torch
import torch.nn as nn
from collections import OrderedDict


class LambdaLayer(nn.Module):
    # Small utility that wraps an arbitrary function as an nn.Module.
    def __init__(self, lambd):
        super(LambdaLayer, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        return self.lambd(x)
class Bottleneck(nn.Module):
    # Inverted residual block: 1x1 expansion -> 3x3 depthwise -> 1x1 linear projection.
    def __init__(self, in_channels, out_channels, t, stride):
        super(Bottleneck, self).__init__()
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.features = nn.Sequential(OrderedDict([
            # 1x1 pointwise convolution expands the channels by a factor t.
            ('pconv1', nn.Conv2d(in_channels, in_channels * t, kernel_size=1,
                                 stride=1, padding=0, bias=False)),
            ('bn1', nn.BatchNorm2d(in_channels * t)),
            ('act1', nn.ReLU6()),
            # 3x3 depthwise convolution (groups = channels) filters each channel.
            ('dconv', nn.Conv2d(in_channels * t, in_channels * t, kernel_size=3,
                                groups=in_channels * t, stride=stride,
                                padding=1, bias=False)),
            ('bn2', nn.BatchNorm2d(in_channels * t)),
            ('act2', nn.ReLU6()),
            # 1x1 linear bottleneck: no activation after the projection.
            ('pconv3', nn.Conv2d(in_channels * t, out_channels, kernel_size=1,
                                 stride=1, padding=0, bias=False)),
            ('bn3', nn.BatchNorm2d(out_channels))
        ]))

    def forward(self, x):
        out = self.features(x)
        # Residual connection only when spatial size and channels are preserved.
        if self.stride == 1 and self.in_channels == self.out_channels:
            out += x
        return out
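A quick sanity check of the block (shapes chosen arbitrarily): with stride 1 and matching channel counts the input is added back through the residual connection, while a strided block simply returns the transformed features:

block = Bottleneck(in_channels=32, out_channels=32, t=6, stride=1)
x = torch.randn(1, 32, 56, 56)
print(block(x).shape)    # torch.Size([1, 32, 56, 56]) -- residual path active

strided = Bottleneck(in_channels=32, out_channels=64, t=6, stride=2)
print(strided(x).shape)  # torch.Size([1, 64, 28, 28]) -- no residual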
class MobileNet(nn.Module):
    def __init__(self, block_type, bottleneck_settings, width_multiplier, num_classes):
        super(MobileNet, self).__init__()
        self.num_classes = num_classes
        # Apply the width multiplier to every channel count.
        self.b_s = bottleneck_settings
        self.b_s['c'] = [int(elt * width_multiplier) for elt in self.b_s['c']]
        self.in_channels = int(32 * width_multiplier)
        self.out_channels = int(1280 * width_multiplier)

        # Feature extractor: a 3x3 stem convolution with stride 2,
        # followed by seven bottleneck stages.
        self.conv0 = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv2d(3, self.in_channels, 3, stride=2, padding=1, bias=False)),
            ('bn0', nn.BatchNorm2d(self.in_channels)),
            ('act0', nn.ReLU6())
        ]))
        self.bottleneck1 = self.__build_layer(block_type, self.in_channels,
                                              self.b_s['c'][0], self.b_s['t'][0],
                                              self.b_s['s'][0], self.b_s['n'][0])
        self.bottleneck2 = self.__build_layer(block_type, self.b_s['c'][0],
                                              self.b_s['c'][1], self.b_s['t'][1],
                                              self.b_s['s'][1], self.b_s['n'][1])
        self.bottleneck3 = self.__build_layer(block_type, self.b_s['c'][1],
                                              self.b_s['c'][2], self.b_s['t'][2],
                                              self.b_s['s'][2], self.b_s['n'][2])
        self.bottleneck4 = self.__build_layer(block_type, self.b_s['c'][2],
                                              self.b_s['c'][3], self.b_s['t'][3],
                                              self.b_s['s'][3], self.b_s['n'][3])
        self.bottleneck5 = self.__build_layer(block_type, self.b_s['c'][3],
                                              self.b_s['c'][4], self.b_s['t'][4],
                                              self.b_s['s'][4], self.b_s['n'][4])
        self.bottleneck6 = self.__build_layer(block_type, self.b_s['c'][4],
                                              self.b_s['c'][5], self.b_s['t'][5],
                                              self.b_s['s'][5], self.b_s['n'][5])
        self.bottleneck7 = self.__build_layer(block_type, self.b_s['c'][5],
                                              self.b_s['c'][6], self.b_s['t'][6],
                                              self.b_s['s'][6], self.b_s['n'][6])

        # Classifier: 1x1 expansion, global average pooling, 1x1 classification layer.
        self.conv8 = nn.Sequential(OrderedDict([
            ('conv8', nn.Conv2d(self.b_s['c'][6], self.out_channels, 1, bias=False)),
            ('bn8', nn.BatchNorm2d(self.out_channels)),
            ('act8', nn.ReLU6())
        ]))
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.conv9 = nn.Conv2d(self.out_channels, num_classes, 1)

    def __build_layer(self, block_type, in_channels, out_channels, t, s, n):
        # The first of the n blocks uses stride s and changes the channel count;
        # the remaining blocks use stride 1 and keep the channels fixed.
        layers = []
        tmp_channels = in_channels
        for i in range(n):
            if i == 0:
                layers.append(block_type(tmp_channels, out_channels, t, s))
            else:
                layers.append(block_type(tmp_channels, out_channels, t, 1))
            tmp_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv0(x)
        out = self.bottleneck1(out)
        out = self.bottleneck2(out)
        out = self.bottleneck3(out)
        out = self.bottleneck4(out)
        out = self.bottleneck5(out)
        out = self.bottleneck6(out)
        out = self.bottleneck7(out)
        out = self.conv8(out)
        out = self.avgpool(out)
        out = self.conv9(out)
        # Flatten the 1x1 spatial map into a (batch, num_classes) tensor.
        out = out.view(-1, self.num_classes)
        return out
def MobileNetV2():
    # Bottleneck settings from Table 2 of the paper:
    # c = output channels, t = expansion factor,
    # s = stride of the first block in the stage, n = number of blocks.
    bottleneck_settings = {
        'c': [16, 24, 32, 64, 96, 160, 320],
        't': [1, 6, 6, 6, 6, 6, 6],
        's': [1, 2, 2, 2, 1, 2, 1],
        'n': [1, 2, 3, 4, 3, 3, 1]
    }
    return MobileNet(block_type=Bottleneck,
                     bottleneck_settings=bottleneck_settings,
                     width_multiplier=0.5,
                     num_classes=1000)
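As a quick usage check of the assembled network (assuming the standard 224×224 ImageNet input resolution):

model = MobileNetV2().eval()
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # torch.Size([1, 1000])
print(sum(p.numel() for p in model.parameters()), 'parameters')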
train_costs, val_costs = train_model()
[Epoch 1/15]: train-loss = 1.641850 | train-acc = 0.397 | val-loss = 0.035919 | val-acc = 0.528
[Epoch 2/15]: train-loss = 1.192912 | train-acc = 0.569 | val-loss = 0.035297 | val-acc = 0.612
[Epoch 3/15]: train-loss = 0.997534 | train-acc = 0.642 | val-loss = 0.033239 | val-acc = 0.654
[Epoch 4/15]: train-loss = 0.868714 | train-acc = 0.692 | val-loss = 0.028015 | val-acc = 0.696
[Epoch 5/15]: train-loss = 0.760505 | train-acc = 0.733 | val-loss = 0.021540 | val-acc = 0.721
[Epoch 6/15]: train-loss = 0.681773 | train-acc = 0.763 | val-loss = 0.013692 | val-acc = 0.752
[Epoch 7/15]: train-loss = 0.617383 | train-acc = 0.786 | val-loss = 0.023159 | val-acc = 0.749
[Epoch 8/15]: train-loss = 0.568520 | train-acc = 0.802 | val-loss = 0.017240 | val-acc = 0.749
[Epoch 9/15]: train-loss = 0.527746 | train-acc = 0.816 | val-loss = 0.017224 | val-acc = 0.763
[Epoch 10/15]: train-loss = 0.491380 | train-acc = 0.828 | val-loss = 0.019151 | val-acc = 0.778
[Epoch 11/15]: train-loss = 0.469383 | train-acc = 0.837 | val-loss = 0.019102 | val-acc = 0.785
[Epoch 12/15]: train-loss = 0.432044 | train-acc = 0.849 | val-loss = 0.022228 | val-acc = 0.785
[Epoch 13/15]: train-loss = 0.404078 | train-acc = 0.860 | val-loss = 0.015088 | val-acc = 0.790
[Epoch 14/15]: train-loss = 0.385579 | train-acc = 0.865 | val-loss = 0.016302 | val-acc = 0.793
[Epoch 15/15]: train-loss = 0.363964 | train-acc = 0.872 | val-loss = 0.017327 | val-acc = 0.794
nb_test_examples = 10000
correct = 0

model.eval().cuda()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Make predictions.
        prediction = model(inputs)
        # Retrieve the index of the highest-scoring class.
        _, predicted_class = torch.max(prediction, 1)
        # Accumulate the number of correct predictions.
        correct += (predicted_class == labels).float().sum().item()

test_accuracy = correct / nb_test_examples
print('Test accuracy: {}'.format(test_accuracy))
Test accuracy: 0.8007