---
title: 'Learning 8-bit parity checking problem with MLP'
disqus: hackmd
---

# Learning 8-bit parity checking problem with MLP

# Table of Contents
[TOC]

# 1. Generating training data
---

## 1.1 Creating the function

```python=
def g_train(now, s):                  # now tracks the bit position being filled
    global count                      # count is a global that counts the 1s in the list
    count = 0
    if now < 0:                       # base case: all 8 bits have been assigned
        train_data.append(np.array(s))
        for i in s:
            if i == 1.0:
                count += 1
        if count % 2 == 0:            # even number of 1s -> label 0
            label.append(np.array([0, ]))
        else:                         # odd number of 1s -> label 1
            label.append(np.array([1, ]))
    else:
        for i in range(2):            # each position can take 0 or 1
            s[now] = i
            g_train(now-1, s)
```

* `g_train` is a recursive function that generates every possible 8-bit pattern together with its corresponding parity label.

## 1.2 Run the function

```python=
import numpy as np

bits = np.zeros(8)     # working buffer (avoid the name `list`, which shadows the built-in)
train_data = []
label = []
g_train(7, bits)
train_data_np = np.array(train_data)  # convert to ndarray
label_np = np.array(label)
```

Output:
```python
train_data_np
array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 1., 1., 1.],
       [0., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])
```
```python
label_np
array([[0],[1],[1],[0],[1],[0],[0],[1],
       [1],[0],[0],[1],[0],[1],[1],[0],
       [1],[0],[0],[1],[0],[1],[1],[0],
       ...,
       [0],[1],[1],[0],[1],[0],[0],[1],
       [0],[1],[1],[0],[1],[0],[0],[1],
       [1],[0],[0],[1],[0],[1],[1],[0]])
```
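As a cross-check on `g_train`, the same dataset can also be built without recursion. The snippet below is an addition (not part of the original post); `check_data` and `check_label` are names introduced here:

```python=
# Non-recursive cross-check on g_train (an added sketch, not in the original
# post). itertools.product enumerates the same 256 bit patterns; the parity
# label is just the bit sum modulo 2. Note the row order differs from
# g_train's output, because product varies the LAST bit fastest.
import itertools

check_data = np.array(list(itertools.product([0.0, 1.0], repeat=8)))   # (256, 8)
check_label = (check_data.sum(axis=1) % 2).astype(int).reshape(-1, 1)  # (256, 1)
```

Summing the bits and taking the result modulo 2 is exactly the parity rule that `g_train` implements with its `count` loop.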
# 2. Add a new activation function $\tanh(x)$
---

```python=
class TanH:
    def __init__(self):
        pass
    def forward(self, x):
        out = (2/(1+np.exp(-2*x)))-1
        self.o = out                      # cache the output for the backward pass
        return out
    def backward(self, dout):
        dx = dout*(1 - self.o*self.o)     # tanh'(x) = 1 - tanh^2(x)
        return dx
```

* Illustration from the web: ![](https://i.imgur.com/xCs9EDy.png)
* $\tanh(x) = \dfrac{e^{x}-e^{-x}}{e^{x}+e^{-x}} = \dfrac{2}{1+e^{-2x}}-1$
* $\dfrac{d}{dx}\tanh(x) = \dfrac{(e^{x}+e^{-x})(e^{x}+e^{-x})-(e^{x}-e^{-x})(e^{x}-e^{-x})}{(e^{x}+e^{-x})^2} = 1-\dfrac{(e^{x}-e^{-x})^2}{(e^{x}+e^{-x})^2} = 1-\tanh^2(x)$
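As a quick numerical check (an addition, not from the original write-up; it assumes the `TanH` class and the `numpy` import above), `forward` can be compared against `np.tanh` and `backward` against a central finite-difference derivative:

```python=
# Sanity check for the TanH class (an added sketch, not in the original post):
# forward() should match np.tanh, and backward() with dout = 1 should match
# a central finite-difference estimate of the derivative.
x = np.linspace(-3, 3, 7)
act = TanH()
print(np.allclose(act.forward(x), np.tanh(x)))            # expect True

eps = 1e-6
fd = (np.tanh(x+eps) - np.tanh(x-eps)) / (2*eps)          # numerical derivative
print(np.allclose(act.backward(np.ones_like(x)), fd))     # expect True
```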
# 3. Constructing the network
---

## 3.1 One-layer network

```python=
class Linear:
    def __init__(self, m, n):
        self.W, self.b = np.random.randn(m, n), np.random.randn(1, n)
        self.dW, self.db = None, None
    def forward(self, x):
        self.x = x                        # cache the input for the backward pass
        out = np.dot(x, self.W)+self.b
        return out
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

class ReLu:
    def __init__(self):
        pass
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()                    # copy so the caller's array is not modified
        out[self.mask] = 0
        return out
    def backward(self, dout):
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

class Sigmoid:
    def __init__(self):
        pass
    def forward(self, x):
        out = 1/(1+np.exp(-x))
        self.o = out
        return out
    def backward(self, dout):
        dx = dout*self.o*(1-self.o)
        return dx

class TanH:                               # the newly added activation class
    def __init__(self):
        pass
    def forward(self, x):
        out = (2/(1+np.exp(-2*x)))-1
        self.o = out
        return out
    def backward(self, dout):
        dx = dout*(1 - self.o*self.o)
        return dx

class Loss:                               # sum-of-squared-errors loss
    def __init__(self):
        pass
    def forward(self, label_np, ybar):
        self.label, self.ybar = label_np, ybar   # cache both for backward
        return np.sum((label_np-ybar)**2)
    def backward(self, dout):
        dy = -(2*(self.label-self.ybar))*dout
        return dy

class OneLayer:
    def __init__(self, m, n):
        self.linear = Linear(m, n)
        self.sigmoid = Sigmoid()
        self.loss = Loss()
        self.last_dW, self.last_db = 0, 0
    def forward(self, x):
        x = self.linear.forward(x)
        self.ybar = self.sigmoid.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.sigmoid.backward(g)
        g = self.linear.backward(g)
    def update(self, eta, alpha):
        # momentum: reuse the previous update step, scaled by alpha
        self.linear.W = self.linear.W - eta*self.linear.dW + alpha*self.last_dW
        self.linear.b = self.linear.b - eta*self.linear.db + alpha*self.last_db
        self.last_dW = -eta*self.linear.dW + alpha*self.last_dW
        self.last_db = -eta*self.linear.db + alpha*self.last_db
```

## 3.2 Two-layer network

```python=
class TwoLayer:
    def __init__(self, m, n, o):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        self.ybar = self.act2.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # momentum: reuse each layer's previous update step, scaled by alpha
        self.linear1.W = self.linear1.W - eta*self.linear1.dW + alpha*self.last_dW1
        self.linear1.b = self.linear1.b - eta*self.linear1.db + alpha*self.last_db1
        self.last_dW1 = -eta*self.linear1.dW + alpha*self.last_dW1
        self.last_db1 = -eta*self.linear1.db + alpha*self.last_db1
        self.linear2.W = self.linear2.W - eta*self.linear2.dW + alpha*self.last_dW2
        self.linear2.b = self.linear2.b - eta*self.linear2.db + alpha*self.last_db2
        self.last_dW2 = -eta*self.linear2.dW + alpha*self.last_dW2
        self.last_db2 = -eta*self.linear2.db + alpha*self.last_db2
```

## 3.3 Three-layer network

```python=
class ThreeLayer:
    def __init__(self, m, n, o, p):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = TanH()
        self.linear3 = Linear(o, p)
        self.act3 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
        self.last_dW3, self.last_db3 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        x = self.act2.forward(x)
        x = self.linear3.forward(x)
        self.ybar = self.act3.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act3.backward(g)
        g = self.linear3.backward(g)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # momentum: reuse each layer's previous update step, scaled by alpha
        self.linear1.W = self.linear1.W - eta*self.linear1.dW + alpha*self.last_dW1
        self.linear1.b = self.linear1.b - eta*self.linear1.db + alpha*self.last_db1
        self.last_dW1 = -eta*self.linear1.dW + alpha*self.last_dW1
        self.last_db1 = -eta*self.linear1.db + alpha*self.last_db1
        self.linear2.W = self.linear2.W - eta*self.linear2.dW + alpha*self.last_dW2
        self.linear2.b = self.linear2.b - eta*self.linear2.db + alpha*self.last_db2
        self.last_dW2 = -eta*self.linear2.dW + alpha*self.last_dW2
        self.last_db2 = -eta*self.linear2.db + alpha*self.last_db2
        self.linear3.W = self.linear3.W - eta*self.linear3.dW + alpha*self.last_dW3
        self.linear3.b = self.linear3.b - eta*self.linear3.db + alpha*self.last_db3
        self.last_dW3 = -eta*self.linear3.dW + alpha*self.last_dW3
        self.last_db3 = -eta*self.linear3.db + alpha*self.last_db3
```

## 3.4 Four-layer network

```python=
class FourLayer:
    def __init__(self, m, n, o, p, q):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = Sigmoid()
        self.linear3 = Linear(o, p)
        self.act3 = ReLu()
        self.linear4 = Linear(p, q)
        self.act4 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
        self.last_dW3, self.last_db3 = 0, 0
        self.last_dW4, self.last_db4 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        x = self.act2.forward(x)
        x = self.linear3.forward(x)
        x = self.act3.forward(x)
        x = self.linear4.forward(x)
        self.ybar = self.act4.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act4.backward(g)
        g = self.linear4.backward(g)
        g = self.act3.backward(g)
        g = self.linear3.backward(g)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # momentum: reuse each layer's previous update step, scaled by alpha
        self.linear1.W = self.linear1.W - eta*self.linear1.dW + alpha*self.last_dW1
        self.linear1.b = self.linear1.b - eta*self.linear1.db + alpha*self.last_db1
        self.last_dW1 = -eta*self.linear1.dW + alpha*self.last_dW1
        self.last_db1 = -eta*self.linear1.db + alpha*self.last_db1
        self.linear2.W = self.linear2.W - eta*self.linear2.dW + alpha*self.last_dW2
        self.linear2.b = self.linear2.b - eta*self.linear2.db + alpha*self.last_db2
        self.last_dW2 = -eta*self.linear2.dW + alpha*self.last_dW2
        self.last_db2 = -eta*self.linear2.db + alpha*self.last_db2
        self.linear3.W = self.linear3.W - eta*self.linear3.dW + alpha*self.last_dW3
        self.linear3.b = self.linear3.b - eta*self.linear3.db + alpha*self.last_db3
        self.last_dW3 = -eta*self.linear3.dW + alpha*self.last_dW3
        self.last_db3 = -eta*self.linear3.db + alpha*self.last_db3
        self.linear4.W = self.linear4.W - eta*self.linear4.dW + alpha*self.last_dW4
        self.linear4.b = self.linear4.b - eta*self.linear4.db + alpha*self.last_db4
        self.last_dW4 = -eta*self.linear4.dW + alpha*self.last_dW4
        self.last_db4 = -eta*self.linear4.db + alpha*self.last_db4
```

* The only parts that change between the `XXXLayer` classes are the dimensions passed to `__init__` and the choice of activation function for each layer.
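With the layer classes in place, the backward passes can be validated before any long training run. The following numerical gradient check is an addition (not part of the original post); it assumes the `TwoLayer`, `train_data_np`, and `label_np` defined above, and the seed and the chosen weight index are arbitrary:

```python=
# Numerical gradient check (an added sketch, not in the original post).
# Perturb one weight of linear1, measure the change in the loss with central
# differences, and compare that slope to the analytic dW from backward().
np.random.seed(0)                      # arbitrary seed, for reproducibility only
model = TwoLayer(8, 6, 1)

model.forward(train_data_np)
model.backward(label_np)               # fills model.linear1.dW
analytic = model.linear1.dW[0, 0]

eps = 1e-5
model.linear1.W[0, 0] += eps
L_plus = np.sum((label_np - model.forward(train_data_np))**2)
model.linear1.W[0, 0] -= 2*eps
L_minus = np.sum((label_np - model.forward(train_data_np))**2)
model.linear1.W[0, 0] += eps           # restore the original weight

numeric = (L_plus - L_minus) / (2*eps)
print(numeric, analytic)               # the two values should nearly coincide
```

Central differences have an error of order $\varepsilon^2$, so the two printed values should agree to several decimal places unless the perturbed weight happens to sit exactly at a ReLU kink.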
# 4. Run the code

## 4.1 Train the model

### Two layer
* activation function:
    * Layer 1 : ReLu
    * Layer 2 : Sigmoid
* input_dim:
    * Layer 1 : 8
    * Layer 2 : 6
* hyperparameters:
    * eta : 0.01
    * alpha : 0.9
* epoch:
    * max_epochs : 200000

```python=
model = TwoLayer(8, 6, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.9
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e+1) % ch_epochs == 0:        # print the loss every 100 epochs
        print(e+1, model.L)
```

### Three layer
* activation function:
    * Layer 1 : ReLu
    * Layer 2 : Tanh
    * Layer 3 : Sigmoid
* input_dim:
    * Layer 1 : 8
    * Layer 2 : 7
    * Layer 3 : 5
* hyperparameters:
    * eta : 0.01
    * alpha : 0.7
* epoch:
    * max_epochs : 200000

```python=
model = ThreeLayer(8, 7, 5, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e+1) % ch_epochs == 0:
        print(e+1, model.L)
```

### Four layer
* activation function (as defined in the `FourLayer` class above):
    * Layer 1 : ReLu
    * Layer 2 : Sigmoid
    * Layer 3 : ReLu
    * Layer 4 : Sigmoid
* input_dim:
    * Layer 1 : 8
    * Layer 2 : 7
    * Layer 3 : 7
    * Layer 4 : 5
* hyperparameters:
    * eta : 0.01
    * alpha : 0.7
* epoch:
    * max_epochs : 200000

```python=
model = FourLayer(8, 7, 7, 5, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e+1) % ch_epochs == 0:
        print(e+1, model.L)
```

## 4.2 Plot the loss
* Add a `loss_draw` list to record the loss at every epoch

```python=
loss_draw = []                        # new list collecting the per-epoch loss
model = FourLayer(8, 7, 7, 5, 1)      # five dims match FourLayer, not TwoLayer
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    loss_draw.append(model.L)         # append this epoch's loss
    if (e+1) % ch_epochs == 0:
        print(e+1, model.L)
```

```python=
import matplotlib.pyplot as plt

x = np.arange(max_epochs)
y = np.array(loss_draw)
plt.title("Layer")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.plot(x, y)
plt.show()
```

## 4.3 Comparison

### Two layer
* ![](https://i.imgur.com/HcZr9Id.jpg)
* The loss in this plot ends around 46.
* Across runs, the two-layer final loss falls roughly between 80 and 46.

### Three layer
* ![](https://i.imgur.com/i2Iz54v.jpg)
* The loss in this plot ends around 6.
* Across runs, the three-layer final loss falls roughly between 20 and 5.

### Four layer
* ![](https://i.imgur.com/fnjxux1.jpg)
* The loss in this plot ends around 0.8.
* Across runs, the four-layer final loss falls roughly between 10 and 0.9.
* Overfitting can occur with the four-layer network.
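Since the loss alone does not show how many of the 256 patterns are classified correctly, a simple accuracy check can be run after any of the training loops above. This snippet is an addition (not part of the original post); thresholding the sigmoid output at 0.5 is an assumed cutoff, and `model` is whichever network was just trained:

```python=
# Accuracy check (an added sketch, not in the original post). The sigmoid
# output is thresholded at 0.5 to obtain hard 0/1 predictions, which are
# then compared against the parity labels for all 256 patterns.
ybar = model.forward(train_data_np)       # shape (256, 1), values in (0, 1)
pred = (ybar >= 0.5).astype(int)
correct = int(np.sum(pred == label_np))
print(f"{correct}/256 patterns correct ({correct/256:.2%})")
```

A loss near zero should correspond to all 256 patterns being classified correctly.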