# 基於深度學習進行貓與狗的辨認(Pytorch)

這次會介紹我在Kaggle上進行的貓狗辨認程式,我最終的測試結果的準確率可以高達90%,此部份會使用Pytorch這個深度學習框架。

## 1. 準備資料

在進行訓練前需要先準備好資料,我們先從Kaggle上下載圖片下來,再做資料的分配,這邊使用貓2000張及狗2000張當作訓練資料,使用貓500張及狗500張當作驗證資料,最後使用貓500張及狗500張當作測試資料,我不使用Kaggle給的所有照片,因為我的電腦要跑很久,所以這邊取比較少張照片做訓練。

```python=
import os
import shutil

# Root of the dataset; every split lives under this directory.
base_dir = "/home/chisc/workspace/wuzhenrong"

# Create train/validation/test, each with a cat and a dog subfolder.
# os.makedirs(..., exist_ok=True) also builds intermediate directories
# and is a no-op when the directory already exists.
for split in ("train", "validation", "test"):
    for label in ("cat", "dog"):
        os.makedirs(os.path.join(base_dir, split, label), exist_ok=True)

# Index ranges of the source images assigned to each split:
# 2000 per class for training, 500 for validation, 500 for testing.
split_ranges = [
    ("train", 0, 2000),
    ("validation", 2000, 2500),
    ("test", 2500, 3000),
]

for label in ("cat", "dog"):
    for split, start, end in split_ranges:
        for i in range(start, end):
            src = os.path.join(base_dir, "train_all", f"{label}.{i}.jpg")
            # BUG FIX: test dog images were previously copied into
            # test/cat, which mislabelled half of the test set.
            dst = os.path.join(base_dir, split, label, f"{label}.{i}.jpg")
            shutil.copyfile(src, dst)
```

首先會先建立所需的目錄(os.makedirs搭配exist_ok=True,目錄已存在時不會出錯),再來就是分配資料,這邊使用shutil.copyfile()來複製資料,第一個參數是資料來源地,第二個參數是目的地。

## 2. 引入函式庫

```python=
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from PIL import Image
from torchvision.datasets import DatasetFolder
import torchvision
from tqdm.notebook import tqdm as tqdm
```

為了畫函數圖形,所以引入matplotlib,接著numpy和pandas是常用的工具,所以提前引入以備不時之需,torch.nn裡包含很多神經網路的類別。再來是torchvision,引入後可以做資料的提取及準備。由於pytorch沒有訓練進度條,所以引入tqdm可以顯示進度條。

## 3. 
引入資料

```python=
train_trans = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation((-30, 30)),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
# Validation and test share one deterministic pipeline: no augmentation,
# but the SAME Normalize as training.  (The previous val_trans omitted
# Normalize and was never actually used -- dead code that hid an
# inconsistency between the pipelines.)
val_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
test_trans = val_trans

batch_size = 32
train_data = ImageFolder(train_path, transform = train_trans)
val_data = ImageFolder(val_path, transform = val_trans)
test_data = ImageFolder(test_path, transform = test_trans)
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True, num_workers = 2, pin_memory = True)
val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = True, num_workers = 2, pin_memory = True)
test_loader = DataLoader(test_data, shuffle = True)
```

接著就是讀入資料,transforms.Compose可以放入data augmentation的資訊,ImageFolder是從目錄裡讀取資料,會依據不同資料夾來當作不同label,而DataLoader會彙整剛剛兩個的資訊。

## 4. 看圖片

```python=
images, labels = next(iter(train_loader))

# After Normalize: values fall outside [0, 1], so the colours look wrong.
for i in range(3):
    plt.figure(i)
    plt.imshow(images[i].permute(1, 2, 0))
# plt.show()

# Before Normalize: the data were normalized as x' = (x - mean) / std, so
# applying Normalize(-mean/std, 1/std) maps x' back to the original x.
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
for i in range(3):
    plt.figure(i)
    tmp = transforms.Normalize(-mean/std, 1/std)(images[i])  # denormalize
    plt.imshow(tmp.permute(1, 2, 0))
plt.show()
```

這邊的程式可以看到貓與狗的圖片,由於我們的train data有做normalize,所以要做denormalize,才能看到原圖。

![](https://i.imgur.com/XRtcid5.jpg)

![](https://i.imgur.com/Qdnk6P2.jpg)

## 5. 
CNN架構

```python=
class CatDpg(nn.Module):
    """VGG-style CNN for binary cat/dog classification on 224x224 RGB input."""

    def __init__(self):
        super().__init__()
        # Five conv blocks; every MaxPool halves the spatial size, so the
        # 224x224 input ends up as a (512, 7, 7) feature map.
        self.cnn = nn.Sequential(
            ## CNN1
            nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 3, stride = 1, padding = 1),  # padding = (kernel_size - 1) // 2 keeps H x W
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (64, 112, 112)
            ## CNN2
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (128, 56, 56)
            ## CNN3
            nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (256, 28, 28)
            ## CNN4
            nn.Conv2d(in_channels = 256, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (512, 14, 14)
            ## CNN5
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)  ## (512, 7, 7)
        )
        self.fc = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),  # Fully-connected layer
            nn.Dropout(0.4),  # Avoid overfitting
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, 2)  # two logits: cat (0) / dog (1)
        )

    # forward propagation
    def forward(self, x):
        x = self.cnn(x)
        x = x.flatten(1)  # (N, 512, 7, 7) -> (N, 512 * 7 * 7) for the FC head
        x = self.fc(x)
        return x
```

CNN的架構如下:
1. Input layer
2. Convolutional layer
3. ReLU layer
4. Pooling layer
5. Fully-connected layer

![](https://i.imgur.com/bFv7sa5.png)

我們首先先建立卷積層,再一層激勵函數,然後再來一個池化層,記住padding等於(kernel_size - 1) / 2,這樣做5層即可,然後在forward裡需要加入flatten(),這樣才能做fully-connected。

## 6. 
開始訓練

```python=
device = "cuda" if train_on_gpu else "cpu"

model = CatDpg().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_func = nn.CrossEntropyLoss()

n_epochs = 30
train_loss_record = []
train_acc_record = []
val_loss_record = []
val_acc_record = []

for epoch in range(n_epochs):
    train_loss = 0.0
    val_loss = 0.0
    train_acc = 0.0
    val_acc = 0.0

    # ---- training phase ----
    model.train()
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        prediction = model(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = (prediction.argmax(dim = 1) == y).float().mean()
        # .item() detaches the scalar from the autograd graph; summing the
        # raw tensors would keep every batch's graph alive (memory leak).
        train_acc += acc.item() / len(train_loader)
        train_loss += loss.item() / len(train_loader)
    print(f"[ Train | {epoch+1}/{n_epochs} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    train_loss_record.append(train_loss)
    train_acc_record.append(train_acc)

    # ---- validation phase ----
    model.eval()               # disable dropout during evaluation
    with torch.no_grad():      # no gradients on validation data; the old
                               # loss.backward() here was a bug
        for x, y in tqdm(val_loader):
            x, y = x.to(device), y.to(device)
            prediction = model(x)
            loss = loss_func(prediction, y)
            acc = (prediction.argmax(dim = 1) == y).float().mean()
            val_acc += acc.item() / len(val_loader)
            val_loss += loss.item() / len(val_loader)
    print(f"[ Validation | {epoch+1}/{n_epochs} ] loss = {val_loss:.5f}, acc = {val_acc:.5f}")
    val_loss_record.append(val_loss)
    val_acc_record.append(val_acc)

torch.save(model, 'catvsdog.pkl')
```

首先要判斷是否有CUDA,如果有就使用CUDA訓練,如果沒有,就用CPU訓練,我們這邊使用Adam當作optimizer,Adam相對SGD收斂較快也較穩定,loss function是使用cross entropy,接著進入訓練,記得訓練的地方需要加入model.train(),驗證的地方則要加入model.eval()並包在torch.no_grad()裡。

## 7. 
查看模型效能

```python=
# The records may hold tensors (possibly on the GPU), which matplotlib
# cannot plot directly -- convert them to plain floats first.
train_loss_hist = [float(v) for v in train_loss_record]
val_loss_hist = [float(v) for v in val_loss_record]
train_acc_hist = [float(v) for v in train_acc_record]
val_acc_hist = [float(v) for v in val_acc_record]

plt.figure(1)
plt.title('Training and Validation Loss')
train_l, = plt.plot(train_loss_hist, color = 'red')
val_l, = plt.plot(val_loss_hist, color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(handles = [train_l, val_l], labels = ['Training', 'Validation'], loc = 'best')
plt.show()

plt.figure(2)
plt.title('Training and Validation Accuracy')
train_a, = plt.plot(train_acc_hist, color = 'red')
val_a, = plt.plot(val_acc_hist, color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(handles = [train_a, val_a], labels = ['Training', 'Validation'], loc = 'best')
plt.show()

model.eval()   # disable dropout for inference
correct = 0
total = 0
i = 0
with torch.no_grad():
    for x, y in test_loader:
        i += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
        output = model(x)
        out = output.argmax(dim = 1)
        # accumulate test accuracy (the old loop only displayed images)
        correct += (out == y).sum().item()
        total += y.size(0)
        out = out.to('cpu').numpy()
        # only display every 10th test image
        if i % 10 == 0:
            plt.figure(i)
            if out[0] == 0:
                plt.title('Predict: cat')
            else:
                plt.title('Predict: dog')
            mean = torch.tensor([0.485, 0.456, 0.406])
            std = torch.tensor([0.229, 0.224, 0.225])
            # move to CPU BEFORE denormalizing: mean/std live on the CPU,
            # so transforming a CUDA tensor would raise a device mismatch
            tmp = transforms.Normalize(-mean/std, 1/std)(x.squeeze().to('cpu'))  # denormalize
            plt.imshow(tmp.permute(1, 2, 0))
plt.show()
print(f"Test accuracy: {correct / total:.4f}")
```

![](https://i.imgur.com/Uj6bBtg.png)

![](https://i.imgur.com/DiKI8Jk.png)

![](https://i.imgur.com/yKqs3HF.png)

![](https://i.imgur.com/Xe81UzD.png)

最後把圖形輸出就完成了,而測試的最高準確率可以達到91%,而平均測試準確率是88%。

## 總程式碼

```python=
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from PIL import Image
from torchvision.datasets import DatasetFolder
import torchvision
from tqdm.notebook import tqdm as tqdm

train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('CUDA is not available.')
else:
    print('CUDA is available!')

train_path = '/home/chisc/workspace/wuzhenrong/train'
val_path = '/home/chisc/workspace/wuzhenrong/validation/'
test_path = '/home/chisc/workspace/wuzhenrong/test/'

train_trans = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation((-30, 30)),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
# Validation and test share one deterministic pipeline with the SAME
# Normalize as training (the old un-normalized val_trans was dead code).
val_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
test_trans = val_trans

batch_size = 32
train_data = ImageFolder(train_path, transform = train_trans)
val_data = ImageFolder(val_path, transform = val_trans)
test_data = ImageFolder(test_path, transform = test_trans)
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True, num_workers = 2, pin_memory = True)
val_loader = DataLoader(val_data, batch_size = batch_size, shuffle = True, num_workers = 2, pin_memory = True)
test_loader = DataLoader(test_data, shuffle = True)
print(train_loader)

images, labels = next(iter(train_loader))

# After Normalize: values fall outside [0, 1], so the colours look wrong.
for i in range(3):
    plt.figure(i)
    plt.imshow(images[i].permute(1, 2, 0))
# plt.show()

# Before Normalize: the data were normalized as x' = (x - mean) / std, so
# applying Normalize(-mean/std, 1/std) maps x' back to the original x.
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])
for i in range(3):
    plt.figure(i)
    tmp = transforms.Normalize(-mean/std, 1/std)(images[i])  # denormalize
    plt.imshow(tmp.permute(1, 2, 0))
plt.show()

# CNN architecture:
# 1. Input layer
# 2. Convolutional layer
# 3. ReLU layer
# 4. Pooling layer
# 5. Fully-connected layer

class CatDpg(nn.Module):
    """VGG-style CNN for binary cat/dog classification on 224x224 RGB input."""

    def __init__(self):
        super().__init__()
        # Five conv blocks; every MaxPool halves the spatial size, so the
        # 224x224 input ends up as a (512, 7, 7) feature map.
        self.cnn = nn.Sequential(
            ## CNN1
            nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 3, stride = 1, padding = 1),  # padding = (kernel_size - 1) // 2 keeps H x W
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (64, 112, 112)
            ## CNN2
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (128, 56, 56)
            ## CNN3
            nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (256, 28, 28)
            ## CNN4
            nn.Conv2d(in_channels = 256, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2),  ## (512, 14, 14)
            ## CNN5
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)  ## (512, 7, 7)
        )
        self.fc = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),  # Fully-connected layer
            nn.Dropout(0.4),  # Avoid overfitting
            nn.ReLU(),
            nn.Linear(1024, 1024),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(1024, 2)  # two logits: cat (0) / dog (1)
        )

    # forward propagation
    def forward(self, x):
        x = self.cnn(x)
        x = x.flatten(1)  # (N, 512, 7, 7) -> (N, 512 * 7 * 7) for the FC head
        x = self.fc(x)
        return x

device = "cuda" if train_on_gpu else "cpu"

model = CatDpg().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_func = nn.CrossEntropyLoss()

n_epochs = 30
train_loss_record = []
train_acc_record = []
val_loss_record = []
val_acc_record = []

for epoch in range(n_epochs):
    train_loss = 0.0
    val_loss = 0.0
    train_acc = 0.0
    val_acc = 0.0

    # ---- training phase ----
    model.train()
    for x, y in tqdm(train_loader):
        x, y = x.to(device), y.to(device)
        prediction = model(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc = (prediction.argmax(dim = 1) == y).float().mean()
        # .item() detaches the scalar from the autograd graph; summing the
        # raw tensors would keep every batch's graph alive (memory leak).
        train_acc += acc.item() / len(train_loader)
        train_loss += loss.item() / len(train_loader)
    print(f"[ Train | {epoch+1}/{n_epochs} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    train_loss_record.append(train_loss)
    train_acc_record.append(train_acc)

    # ---- validation phase ----
    model.eval()               # disable dropout during evaluation
    with torch.no_grad():      # no gradients on validation data; the old
                               # loss.backward() here was a bug
        for x, y in tqdm(val_loader):
            x, y = x.to(device), y.to(device)
            prediction = model(x)
            loss = loss_func(prediction, y)
            acc = (prediction.argmax(dim = 1) == y).float().mean()
            val_acc += acc.item() / len(val_loader)
            val_loss += loss.item() / len(val_loader)
    print(f"[ Validation | {epoch+1}/{n_epochs} ] loss = {val_loss:.5f}, acc = {val_acc:.5f}")
    val_loss_record.append(val_loss)
    val_acc_record.append(val_acc)

torch.save(model, 'catvsdog.pkl')

# The records now hold plain floats; float() also tolerates tensors.
train_loss_hist = [float(v) for v in train_loss_record]
val_loss_hist = [float(v) for v in val_loss_record]
train_acc_hist = [float(v) for v in train_acc_record]
val_acc_hist = [float(v) for v in val_acc_record]

plt.figure(1)
plt.title('Training and Validation Loss')
train_l, = plt.plot(train_loss_hist, color = 'red')
val_l, = plt.plot(val_loss_hist, color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(handles = [train_l, val_l], labels = ['Training', 'Validation'], loc = 'best')
plt.show()

plt.figure(2)
plt.title('Training and Validation Accuracy')
train_a, = plt.plot(train_acc_hist, color = 'red')
val_a, = plt.plot(val_acc_hist, color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(handles = [train_a, val_a], labels = ['Training', 'Validation'], loc = 'best')
plt.show()

model.eval()   # disable dropout for inference
correct = 0
total = 0
i = 0
with torch.no_grad():
    for x, y in test_loader:
        i += 1
        if train_on_gpu:
            x, y = x.cuda(), y.cuda()
        output = model(x)
        out = output.argmax(dim = 1)
        # accumulate test accuracy (the old loop only displayed images)
        correct += (out == y).sum().item()
        total += y.size(0)
        out = out.to('cpu').numpy()
        # only display every 10th test image
        if i % 10 == 0:
            plt.figure(i)
            if out[0] == 0:
                plt.title('Predict: cat')
            else:
                plt.title('Predict: dog')
            mean = torch.tensor([0.485, 0.456, 0.406])
            std = torch.tensor([0.229, 0.224, 0.225])
            # move to CPU BEFORE denormalizing: mean/std live on the CPU,
            # so transforming a CUDA tensor would raise a device mismatch
            tmp = transforms.Normalize(-mean/std, 1/std)(x.squeeze().to('cpu'))  # denormalize
            plt.imshow(tmp.permute(1, 2, 0))
plt.show()
print(f"Test accuracy: {correct / total:.4f}")
```