# Homework #2
###### tags: `Deep Learning for Computer Vision`
# Face image generation
In this task, I applied **DC-GAN** to implement face image generation.

## DC-GAN
<center>
<img style="border-radius: 0.3125em;
box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);margin: 2%;"
src="https://i.imgur.com/sUsGjcs.png">
<br>
<div style="color:orange; border-bottom: 1px solid #d9d9d9;
display: inline-block;
color: #999;
padding: 2px;">DCGAN</div>
</center>
### Generator
``` python
# https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)
```
### Discriminator
``` python
class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)
```
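The referenced DCGAN tutorial also applies a custom weight initialization (conv weights from N(0, 0.02), BatchNorm weights from N(1, 0.02) with zero bias); the report does not state whether this step was reused here, but for completeness a sketch of it is:
``` python
# Weight initialization as done in the referenced DCGAN tutorial.
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

netG = Generator(ngpu).to(device)
netG.apply(weights_init)
netD = Discriminator(ngpu).to(device)
netD.apply(weights_init)
```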
### Hyperparameters :
* Batch size : 64
* Number of epochs : 100
* Image size : 64*64
* Learning rate : 0.0002
* Latent vector : 100*1
* Learning rate scheduler : multiply the learning rate by 0.8 every 10 epochs (see the sketch below)
* Optimizer : Adam(betas=(0.5, 0.999))
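
A minimal sketch of how these hyperparameters map to PyTorch (assuming `netG` and `netD` are the generator and discriminator instances defined above):
``` python
import torch.optim as optim

optimizerG = optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizerD = optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))
# decay: multiply the learning rate by 0.8 every 10 epochs
schedulerG = optim.lr_scheduler.StepLR(optimizerG, step_size=10, gamma=0.8)
schedulerD = optim.lr_scheduler.StepLR(optimizerD, step_size=10, gamma=0.8)
# call schedulerG.step() and schedulerD.step() once at the end of each epoch
```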
### Model Ensemble :
I took the generator checkpoints saved in the last 10 epochs and averaged all of their parameters.
``` python
import torch as t
model1 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_100.pth')
model2 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_99.pth')
model3 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_98.pth')
model4 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_97.pth')
model5 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_96.pth')
model6 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_95.pth')
model7 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_94.pth')
model8 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_93.pth')
model9 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_92.pth')
model10 = t.load('/content/drive/MyDrive/HW2/model/dcgan/dcgan_91.pth')
for key, value in model1.items():
    model1[key] = (value + model2[key] + model3[key] + model4[key] + model5[key]
                   + model6[key] + model7[key] + model8[key] + model9[key] + model10[key]) / 10
ensemble = Generator(ngpu).to(device)
ensemble.load_state_dict(model1)
t.save(ensemble.state_dict(), '/content/drive/MyDrive/HW2/model/dcgan/dcgan_ensemble.pth')
```
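A short usage sketch of the averaged generator (assuming `nz = 100` as listed above and the DCGAN input convention of a `(batch, nz, 1, 1)` noise tensor):
``` python
# Generate a batch of faces with the averaged generator.
ensemble.eval()
with torch.no_grad():
    noise = torch.randn(64, nz, 1, 1, device=device)  # latent vectors
    fake_images = ensemble(noise)                      # (64, nc, 64, 64), values in [-1, 1]
```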
### Example Results

### Fréchet inception distance (FID)
``` python
# https://github.com/mseitzer/pytorch-fid
FID: 22.067
```
### Inception score (IS)
``` python
# https://github.com/sbarratt/inception-score-pytorch
IS: 2.045
```
### Remark
When using a GAN to generate images, we must pay attention to the quality of the input data. For example, if the training images are rotated by 45 degrees during data augmentation, the generated images will also come out rotated by 45 degrees.
# Conditional image synthesis and Feature Disentanglement
In this task, I applied **AC-GAN** to implement conditional image generation.

## AC-GAN

### Generator
``` python
# https://github.com/eriklindernoren/PyTorch-GAN/tree/master/implementations
class Generator(nn.Module):
    def __init__(self, latent_dim, n_classes, img_size):
        super(Generator, self).__init__()
        self.channel = 3
        self.label_emb = nn.Embedding(n_classes, latent_dim)
        self.init_size = img_size // 4  # Initial size before upsampling
        self.l1 = nn.Sequential(nn.Linear(latent_dim, 128 * self.init_size ** 2))
        self.conv_blocks = nn.Sequential(
            nn.BatchNorm2d(128),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.BatchNorm2d(128, 0.8),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Upsample(scale_factor=2),
            nn.Conv2d(128, 64, 3, stride=1, padding=1),
            nn.BatchNorm2d(64, 0.8),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, self.channel, 3, stride=1, padding=1),
            nn.Tanh(),
        )

    def forward(self, noise, labels):
        gen_input = torch.mul(self.label_emb(labels), noise)
        out = self.l1(gen_input)
        out = out.view(out.shape[0], 128, self.init_size, self.init_size)
        img = self.conv_blocks(out)
        return img
```
### Discriminator
``` python
# https://github.com/eriklindernoren/PyTorch-GAN/tree/master/implementations
class Discriminator(nn.Module):
    def __init__(self, n_classes, img_size):
        super(Discriminator, self).__init__()
        self.channel = 3
        self.n_classes = n_classes
        self.init_size = img_size

        def discriminator_block(in_filters, out_filters, bn=True):
            """Returns layers of each discriminator block"""
            block = [nn.Conv2d(in_filters, out_filters, 3, 2, 1), nn.LeakyReLU(0.2, inplace=True), nn.Dropout2d(0.25)]
            if bn:
                block.append(nn.BatchNorm2d(out_filters, 0.8))
            return block

        self.conv_blocks = nn.Sequential(
            *discriminator_block(self.channel, 16, bn=True),
            *discriminator_block(16, 32),
            *discriminator_block(32, 64),
            *discriminator_block(64, 128),
        )
        # The height and width of the downsampled image
        ds_size = self.init_size // 2 ** 4
        # Output layers
        self.adv_layer = nn.Sequential(nn.Linear(128 * ds_size ** 2, 1), nn.Sigmoid())
        self.aux_layer = nn.Sequential(nn.Linear(128 * ds_size ** 2, self.n_classes), nn.Softmax(dim=1))

    def forward(self, img):
        out = self.conv_blocks(img)
        out = out.view(out.shape[0], -1)
        validity = self.adv_layer(out)
        label = self.aux_layer(out)
        return validity, label
```
### Hyperparameters :
* Batch size : 50
* Number of epochs : 100
* Image size : 32*32
* Learning rate : 0.0002
* Latent vector : 100*1
* Optimizer : Adam(betas=(0.5, 0.999))
### Training detail :
When training the AC-GAN, we must feed the class label and the random noise into the generator together.
First, we sample random labels, map each label to an embedding vector, and merge it with the random noise (element-wise multiplication):
``` python
# the generator holds a label-embedding table: self.label_emb = nn.Embedding(n_classes, latent_dim)
# sample a random class label for each generated image
gen_labels = Variable(LongTensor(np.random.randint(0, num_classes, batch_size)))
# inside the generator's forward pass, the label embedding is merged with the noise
gen_input = torch.mul(self.label_emb(gen_labels), noise)
```
Next, we feed the generated images into the discriminator and obtain two losses: the adversarial (real/fake) loss and the auxiliary (class) loss.
``` python
validity, pred_label = discriminator(gen_imgs)
g_loss = 0.5 * (adversarial_loss(validity, valid) + auxiliary_loss(pred_label, gen_labels))
g_loss.backward()
```
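For completeness, the discriminator update follows the same pattern; this is a sketch assuming `adversarial_loss = nn.BCELoss()` and `auxiliary_loss = nn.CrossEntropyLoss()`, as in the referenced implementation:
``` python
# The discriminator is trained on real and fake images with both losses.
real_pred, real_aux = discriminator(real_imgs)
d_real_loss = 0.5 * (adversarial_loss(real_pred, valid) + auxiliary_loss(real_aux, labels))

fake_pred, fake_aux = discriminator(gen_imgs.detach())  # detach so the generator is not updated here
d_fake_loss = 0.5 * (adversarial_loss(fake_pred, fake) + auxiliary_loss(fake_aux, gen_labels))

d_loss = 0.5 * (d_real_loss + d_fake_loss)
d_loss.backward()
```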
### Model Ensemble :
I took the generator checkpoints saved in the last 10 epochs and averaged all of their parameters.
``` python
import torch as t
model1 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_90.pth')
model2 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_99.pth')
model3 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_98.pth')
model4 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_97.pth')
model5 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_96.pth')
model6 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_95.pth')
model7 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_94.pth')
model8 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_93.pth')
model9 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_92.pth')
model10 = t.load('/content/drive/MyDrive/HW2/model/acgan/acgan_91.pth')
for key, value in model1.items():
    model1[key] = (value + model2[key] + model3[key] + model4[key] + model5[key]
                   + model6[key] + model7[key] + model8[key] + model9[key] + model10[key]) / 10
generator = Generator(nz, num_classes, image_size).cuda()
generator.load_state_dict(model1)
t.save(generator.state_dict(), '/content/drive/MyDrive/HW2/model/acgan/acgan_ensemble.pth')
```
### Example Results

### Accuracy
We load a pre-trained classifier to predict the class of each image generated by the AC-GAN.
``` python
Accuracy: 0.991
```
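A sketch of how such a check can be done (the names below are illustrative, not the actual grading script): generate images for known labels and compare the classifier's predictions against them.
``` python
# Generate 100 images per class and measure how often the pre-trained
# classifier recovers the intended label (illustrative code, not the grader).
generator.eval()
classifier.eval()
with torch.no_grad():
    labels = torch.arange(num_classes).repeat_interleave(100).cuda()
    z = torch.randn(labels.size(0), nz).cuda()
    gen_imgs = generator(z, labels)
    preds = classifier(gen_imgs).argmax(dim=1)
    accuracy = (preds == labels).float().mean().item()
print(f'Accuracy: {accuracy:.3f}')
```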
# Domain-Adversarial Neural Network (DANN)
In this task, I applied **DANN** to implement domain adaptation.

## Scenario
* Source domain : SVHN
* Target domain : MNIST-M
### Training on source domain only (Lower bound)
Use the source images and labels in the training folder for training, and the target images and labels in the testing folder to compute the accuracy.
``` python
Avg Accuracy = 0.4925
```
### Training on source and target domain (domain adaptation)
Use the source images and labels in the training folder plus the unlabeled target images in the training folder for training, and the target images and labels in the testing folder to compute the accuracy.
``` python
Avg Accuracy = 0.5683
```
### Training on target domain only (Upper bound)
Use the target images and labels in the training folder for training, and the target images and labels in the testing folder to compute the accuracy.
``` python
Avg Accuracy = 0.9798
```
## Result Matrix
| | MNIST-M → USPS | SVHN → MNIST-M | USPS → SVHN |
| -------- | -------- | -------- | -------- |
| Trained on source| 0.7429 | 0.4925 | 0.1526 |
| Adaptation | 0.7693 | 0.5683 | 0.3126 |
| Trained on target| 0.9626 | 0.9788 | 0.9155 |
## Visualization



## Remark
* In addition to the classification loss on the source domain, the loss function also includes the domain losses on the source and the target (i.e., loss = source_loss_class + source_loss_domain + target_loss_domain); a minimal training-step sketch is given below.
* During training, we expect the accuracy of the domain classifier to approach 0.5, which means the features of the source and target domains have become indistinguishable.
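
A minimal sketch of one such training step (assuming the model returns `(class_logits, domain_logits)` and applies a gradient-reversal layer with strength `alpha` in front of the domain discriminator, as in the referenced pytorch-dann code):
``` python
criterion = nn.CrossEntropyLoss()

# source batch: class loss + domain loss (domain label 0)
class_out, domain_out_src = dann(src_imgs, alpha)
src_domain_labels = torch.zeros(src_imgs.size(0), dtype=torch.long, device=device)
source_loss_class = criterion(class_out, src_labels)
source_loss_domain = criterion(domain_out_src, src_domain_labels)

# target batch: domain loss only (domain label 1); no target class labels are used
_, domain_out_tgt = dann(tgt_imgs, alpha)
tgt_domain_labels = torch.ones(tgt_imgs.size(0), dtype=torch.long, device=device)
target_loss_domain = criterion(domain_out_tgt, tgt_domain_labels)

loss = source_loss_class + source_loss_domain + target_loss_domain
optimizer.zero_grad()
loss.backward()
optimizer.step()
```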
## Improved UDA model
### Scenario 1
* Source domain : SVHN
* Target domain : MNIST-M
#### Original
``` python
# https://github.com/wogong/pytorch-dann
SVHNmodel(
  (feature): Sequential(
    (0): Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 64, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (6): ReLU(inplace=True)
    (7): Conv2d(64, 128, kernel_size=(4, 4), stride=(1, 1))
  )
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=1024, out_features=256, bias=True)
    (4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=256, out_features=10, bias=True)
  )
  (discriminator): Sequential(
    (0): Linear(in_features=128, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=1024, out_features=256, bias=True)
    (4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=256, out_features=2, bias=True)
  )
)
```
#### Improved
I added batch normalization (and a dropout layer) to the feature extractor; one way to construct it is sketched after the printout below.
``` python
Sequential(
  (0): Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1))
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (4): Conv2d(64, 64, kernel_size=(5, 5), stride=(1, 1))
  (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): Dropout2d(p=0.5, inplace=False)
  (7): ReLU(inplace=True)
  (8): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (9): ReLU(inplace=True)
  (10): Conv2d(64, 128, kernel_size=(4, 4), stride=(1, 1))
)
```
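One way to build this improved feature extractor (a sketch; the actual edit may have been done differently, e.g. by index assignment as in Scenario 3 below):
``` python
dann.feature = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=5),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(64, 64, kernel_size=5),
    nn.BatchNorm2d(64),
    nn.Dropout2d(p=0.5),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.ReLU(inplace=True),
    nn.Conv2d(64, 128, kernel_size=4),
)
```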
### Scenario 2
* Source domain : MNIST-M
* Target domain : USPS
#### Original
``` python
# https://github.com/wogong/pytorch-dann
MNISTMmodel(
  (feature): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 48, kernel_size=(5, 5), stride=(1, 1))
    (5): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout2d(p=0.8, inplace=False)
    (7): ReLU(inplace=True)
    (8): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=300, bias=True)
    (1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=300, out_features=100, bias=True)
    (4): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=100, out_features=10, bias=True)
  )
  (discriminator): Sequential(
    (0): Linear(in_features=768, out_features=300, bias=True)
    (1): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=300, out_features=2, bias=True)
  )
)
```
#### Improved
I reduced the number of layers in the classifier and the discriminator, and lowered the dropout rate in the feature extractor from 0.8 to 0.5.
``` python
MNISTMmodel(
  (feature): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 48, kernel_size=(5, 5), stride=(1, 1))
    (5): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout2d(p=0.5, inplace=False)
    (7): ReLU(inplace=True)
    (8): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=10, bias=True)
  )
  (discriminator): Sequential(
    (0): Linear(in_features=768, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=2, bias=True)
  )
)
```
### Scenario 3
* Source domain : USPS
* Target domain : SVHN
#### Original
``` python
# https://github.com/wogong/pytorch-dann
USPSMmodel(
  (feature): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 24, kernel_size=(5, 5), stride=(1, 1))
    (5): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Dropout2d(p=0.8, inplace=False)
    (7): ReLU(inplace=True)
    (8): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=384, out_features=24, bias=True)
    (1): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=24, out_features=10, bias=True)
  )
  (discriminator): Sequential(
    (0): Linear(in_features=384, out_features=24, bias=True)
    (1): BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Linear(in_features=24, out_features=2, bias=True)
  )
)
```
#### Improved
1. Data Preprocessing
``` python
transforms.Compose([
    transforms.CenterCrop((28, 20)),
    transforms.Resize((28, 28)),
    transforms.Grayscale(num_output_channels=3),
])
```
<img src="https://i.imgur.com/OqlsJND.png" width="300"/>
2. Model Architecture
``` python
dann.feature[4] = nn.Conv2d(32, 24, kernel_size=(5, 5), stride=(1, 1))
dann.feature[5] = nn.BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
dann.feature[6] = nn.Dropout2d(p=0.5, inplace=False)
dann.classifier[0] = nn.Linear(in_features=24*4*4, out_features=24, bias=True)
dann.classifier[1] = nn.BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
dann.classifier[2] = nn.Sigmoid()
dann.classifier[3] = nn.Linear(in_features=24, out_features=10, bias=True)
dann.discriminator[0] = nn.Linear(in_features=24*4*4, out_features=24, bias=True)
dann.discriminator[1] = nn.BatchNorm1d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
dann.discriminator[2] = nn.Sigmoid()
dann.discriminator[3] = nn.Linear(in_features=24, out_features=2, bias=True)
```
### Improved Result
| | MNIST-M → USPS | SVHN → MNIST-M | USPS → SVHN |
| -------- | -------- | -------- | -------- |
| Original model | 0.7693 | 0.5683 | 0.3126 |
| Improved model | 0.7972 | 0.5849 | 0.3726 |
### Remark
* We can transform the source-domain data to look more like the target domain during data preprocessing.
* An architecture with fewer layers and fewer neurons performs better when the source-domain data (USPS, 1 channel) carries less color information than the target-domain data (SVHN, 3 channels).