---
title: 'Learning the 8-bit parity checking problem with an MLP'
disqus: hackmd
---
# Learning the 8-bit parity checking problem with an MLP
# Table of Contents
[TOC]
# 1. Generating training data
---
## 1.1 Creating the function
```python=
def g_train(now, s):                    # now tracks the current bit position
    if now < 0:                         # base case: all 8 bits are set
        train_data.append(np.array(s))  # np.array(s) stores a copy of the pattern
        count = 0                       # number of 1s in the pattern
        for i in s:
            if i == 1.0:
                count += 1
        if count % 2 == 0:              # even number of 1s -> label 0
            label.append(np.array([0, ]))
        else:                           # odd number of 1s -> label 1
            label.append(np.array([1, ]))
    else:
        for i in range(2):              # each bit can be 0 or 1
            s[now] = i
            g_train(now - 1, s)
```
* `g_train` is a recursive function that enumerates all $2^8 = 256$ possible 8-bit patterns and appends the matching parity label for each one; a non-recursive equivalent is sketched below.
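For comparison, the same dataset can be built without recursion. This is only a minimal sketch, assuming NumPy and the standard library's `itertools`; the names `patterns` and `parities` are illustrative, and the row order differs from `g_train`'s output (which varies the first bit fastest):
```python=
import itertools
import numpy as np

# All 256 eight-bit patterns; the parity label is the bit sum modulo 2.
patterns = np.array(list(itertools.product([0.0, 1.0], repeat=8)))
parities = (patterns.sum(axis=1).astype(int) % 2).reshape(-1, 1)
```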
## 1.2 Run the function
```python=
import numpy as np

bits = np.zeros(8)     # working buffer for one 8-bit pattern
train_data = []
label = []
g_train(7, bits)
train_data_np = np.array(train_data)   # convert the lists to numpy arrays
label_np = np.array(label)
```
Output:
```python
train_data_np
array([[0., 0., 0., ..., 0., 0., 0.],
[1., 0., 0., ..., 0., 0., 0.],
[0., 1., 0., ..., 0., 0., 0.],
...,
[1., 0., 1., ..., 1., 1., 1.],
[0., 1., 1., ..., 1., 1., 1.],
[1., 1., 1., ..., 1., 1., 1.]])
```
```python
label_np
array([[0],[1],[1],[0],[1],[0],[0],[1],
[1],[0],[0],[1],[0],[1],[1],[0],
[1],[0],[0],[1],[0],[1],[1],[0],
...,
[0],[1],[1],[0],[1],[0],[0],[1],
[0],[1],[1],[0],[1],[0],[0],[1],
[1],[0],[0],[1],[0],[1],[1],[0]])
```
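A quick sanity check on the generated arrays (a sketch, assuming the variables built above):
```python=
assert train_data_np.shape == (256, 8)
assert label_np.shape == (256, 1)
# Every label should equal the pattern's bit sum modulo 2.
assert np.array_equal(label_np[:, 0],
                      train_data_np.sum(axis=1).astype(int) % 2)
```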
# 2. Adding a new activation function $\tanh(x)$
---
```python=
class TanH:
    def __init__(self):
        pass
    def forward(self, x):
        out = (2 / (1 + np.exp(-2 * x))) - 1
        self.o = out                        # cache the output for the backward pass
        return out
    def backward(self, dout):
        dx = dout * (1 - self.o * self.o)   # d/dx tanh(x) = 1 - tanh^2(x)
        return dx
```
* Illustration of $\tanh$ (image from the web): ![](https://i.imgur.com/xCs9EDy.png)
* $\tanh(x) = \frac{e^{x}-e^{-x}}{e^{x}+e^{-x}} = \frac{2}{1+e^{-2x}}-1$
* $\frac{d}{dx}\tanh(x) = \frac{(e^{x}+e^{-x})(e^{x}+e^{-x})-(e^{x}-e^{-x})(e^{x}-e^{-x})}{(e^{x}+e^{-x})^2}$
$= 1-\frac{(e^{x}-e^{-x})^2}{(e^{x}+e^{-x})^2} = 1-\tanh^{2}(x)$
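As a quick numeric check (a sketch, assuming NumPy), the class should agree with `np.tanh` and with the derivative identity above:
```python=
tanh = TanH()
x = np.linspace(-3, 3, 7)
assert np.allclose(tanh.forward(x), np.tanh(x))
# backward(1) should return 1 - tanh(x)^2
assert np.allclose(tanh.backward(np.ones_like(x)), 1 - np.tanh(x) ** 2)
```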
# 3. Constructing the networks
---
## 3.1 One layer network
```python=
class Linear:
    def __init__(self, m, n):
        self.W, self.b = np.random.randn(m, n), np.random.randn(1, n)
        self.dW, self.db = None, None
    def forward(self, x):
        self.x = x
        out = np.dot(x, self.W) + self.b
        return out
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

class ReLu:
    def __init__(self):
        pass
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()        # copy so the input array is not modified in place
        out[self.mask] = 0
        return out
    def backward(self, dout):
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

class Sigmoid:
    def __init__(self):
        pass
    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.o = out
        return out
    def backward(self, dout):
        dx = dout * self.o * (1 - self.o)
        return dx

class TanH:                   # the newly added activation class
    def __init__(self):
        pass
    def forward(self, x):
        out = (2 / (1 + np.exp(-2 * x))) - 1
        self.o = out
        return out
    def backward(self, dout):
        dx = dout * (1 - self.o * self.o)
        return dx

class Loss:                   # sum-of-squares error
    def __init__(self):
        pass
    def forward(self, label_np, ybar):
        self.label, self.ybar = label_np, ybar   # cache both for the backward pass
        return np.sum((label_np - ybar) ** 2)
    def backward(self, dout):
        dy = dout * 2 * (self.ybar - self.label)
        return dy

class OneLayer:
    def __init__(self, m, n):
        self.linear = Linear(m, n)
        self.sigmoid = Sigmoid()
        self.loss = Loss()
        self.last_dW, self.last_db = 0, 0
    def forward(self, x):
        x = self.linear.forward(x)
        self.ybar = self.sigmoid.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.sigmoid.backward(g)
        g = self.linear.backward(g)
    def update(self, eta, alpha):
        # Momentum update: v = alpha*v - eta*grad, then W += v
        self.last_dW = alpha * self.last_dW - eta * self.linear.dW
        self.last_db = alpha * self.last_db - eta * self.linear.db
        self.linear.W = self.linear.W + self.last_dW
        self.linear.b = self.linear.b + self.last_db
```
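The `update` methods above implement classic momentum, $v \leftarrow \alpha v - \eta \nabla_W L$ followed by $W \leftarrow W + v$. Hand-written backward passes are easy to get subtly wrong, so a finite-difference check is worthwhile. The sketch below (the helper name `grad_check` is illustrative, not part of the original code) compares the analytic gradient of one weight of a `Linear` layer under the squared-error `Loss` with a central-difference estimate:
```python=
def grad_check(eps=1e-5):
    np.random.seed(0)
    lin, loss = Linear(8, 1), Loss()
    x, y = np.random.randn(4, 8), np.random.randn(4, 1)
    # Analytic gradient of the loss w.r.t. W[0, 0] via the backward pass.
    loss.forward(y, lin.forward(x))
    lin.backward(loss.backward(1))
    analytic = lin.dW[0, 0]
    # Central-difference estimate of the same gradient.
    lin.W[0, 0] += eps
    l_plus = loss.forward(y, lin.forward(x))
    lin.W[0, 0] -= 2 * eps
    l_minus = loss.forward(y, lin.forward(x))
    lin.W[0, 0] += eps                   # restore the weight
    numeric = (l_plus - l_minus) / (2 * eps)
    print(abs(numeric - analytic))       # should be close to zero (~1e-6 or less)

grad_check()
```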
## 3.2 Two layer network
```python=
class TwoLayer:
    def __init__(self, m, n, o):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        self.ybar = self.act2.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # Momentum update for each layer: v = alpha*v - eta*grad, then W += v
        self.last_dW1 = alpha * self.last_dW1 - eta * self.linear1.dW
        self.last_db1 = alpha * self.last_db1 - eta * self.linear1.db
        self.linear1.W = self.linear1.W + self.last_dW1
        self.linear1.b = self.linear1.b + self.last_db1
        self.last_dW2 = alpha * self.last_dW2 - eta * self.linear2.dW
        self.last_db2 = alpha * self.last_db2 - eta * self.linear2.db
        self.linear2.W = self.linear2.W + self.last_dW2
        self.linear2.b = self.linear2.b + self.last_db2
```
## 3.3 Three layer network
```python=
class ThreeLayer:
    def __init__(self, m, n, o, p):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = TanH()
        self.linear3 = Linear(o, p)
        self.act3 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
        self.last_dW3, self.last_db3 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        x = self.act2.forward(x)
        x = self.linear3.forward(x)
        self.ybar = self.act3.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act3.backward(g)
        g = self.linear3.backward(g)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # Momentum update for each layer: v = alpha*v - eta*grad, then W += v
        self.last_dW1 = alpha * self.last_dW1 - eta * self.linear1.dW
        self.last_db1 = alpha * self.last_db1 - eta * self.linear1.db
        self.linear1.W = self.linear1.W + self.last_dW1
        self.linear1.b = self.linear1.b + self.last_db1
        self.last_dW2 = alpha * self.last_dW2 - eta * self.linear2.dW
        self.last_db2 = alpha * self.last_db2 - eta * self.linear2.db
        self.linear2.W = self.linear2.W + self.last_dW2
        self.linear2.b = self.linear2.b + self.last_db2
        self.last_dW3 = alpha * self.last_dW3 - eta * self.linear3.dW
        self.last_db3 = alpha * self.last_db3 - eta * self.linear3.db
        self.linear3.W = self.linear3.W + self.last_dW3
        self.linear3.b = self.linear3.b + self.last_db3
```
## 3.4 Four layer network
```python=
class FourLayer:
    def __init__(self, m, n, o, p, q):
        self.linear1 = Linear(m, n)
        self.act1 = ReLu()
        self.linear2 = Linear(n, o)
        self.act2 = Sigmoid()
        self.linear3 = Linear(o, p)
        self.act3 = ReLu()
        self.linear4 = Linear(p, q)
        self.act4 = Sigmoid()
        self.loss = Loss()
        self.last_dW1, self.last_db1 = 0, 0
        self.last_dW2, self.last_db2 = 0, 0
        self.last_dW3, self.last_db3 = 0, 0
        self.last_dW4, self.last_db4 = 0, 0
    def forward(self, x):
        x = self.linear1.forward(x)
        x = self.act1.forward(x)
        x = self.linear2.forward(x)
        x = self.act2.forward(x)
        x = self.linear3.forward(x)
        x = self.act3.forward(x)
        x = self.linear4.forward(x)
        self.ybar = self.act4.forward(x)
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        g = self.act4.backward(g)
        g = self.linear4.backward(g)
        g = self.act3.backward(g)
        g = self.linear3.backward(g)
        g = self.act2.backward(g)
        g = self.linear2.backward(g)
        g = self.act1.backward(g)
        g = self.linear1.backward(g)
    def update(self, eta, alpha):
        # Momentum update for each layer: v = alpha*v - eta*grad, then W += v
        self.last_dW1 = alpha * self.last_dW1 - eta * self.linear1.dW
        self.last_db1 = alpha * self.last_db1 - eta * self.linear1.db
        self.linear1.W = self.linear1.W + self.last_dW1
        self.linear1.b = self.linear1.b + self.last_db1
        self.last_dW2 = alpha * self.last_dW2 - eta * self.linear2.dW
        self.last_db2 = alpha * self.last_db2 - eta * self.linear2.db
        self.linear2.W = self.linear2.W + self.last_dW2
        self.linear2.b = self.linear2.b + self.last_db2
        self.last_dW3 = alpha * self.last_dW3 - eta * self.linear3.dW
        self.last_db3 = alpha * self.last_db3 - eta * self.linear3.db
        self.linear3.W = self.linear3.W + self.last_dW3
        self.linear3.b = self.linear3.b + self.last_db3
        self.last_dW4 = alpha * self.last_dW4 - eta * self.linear4.dW
        self.last_db4 = alpha * self.last_db4 - eta * self.linear4.db
        self.linear4.W = self.linear4.W + self.last_dW4
        self.linear4.b = self.linear4.b + self.last_db4
```
* The only parts that change across the `XXXLayer` classes are the number of dimensions passed to `__init__` and the choice of activation function per layer; a generic version is sketched below.
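Since the layer classes differ only in depth, they can be collapsed into one generic class. A minimal sketch under that observation (`MultiLayer` is illustrative and not part of the original code; it reuses the `Linear`, activation, and `Loss` classes from 3.1):
```python=
class MultiLayer:
    """Generic stack, e.g. MultiLayer([Linear(8, 6), ReLu(), Linear(6, 1), Sigmoid()])."""
    def __init__(self, layers):
        self.layers = layers
        self.loss = Loss()
    def forward(self, x):
        for layer in self.layers:            # apply each layer in order
            x = layer.forward(x)
        self.ybar = x
        return self.ybar
    def backward(self, label_np):
        self.L = self.loss.forward(label_np, self.ybar)
        g = self.loss.backward(1)
        for layer in reversed(self.layers):  # propagate gradients in reverse order
            g = layer.backward(g)
```
The `update` step generalizes the same way by keeping one velocity pair per `Linear` layer; it is left out here to keep the sketch short.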
# 4. Running the code
## 4.1 Train the model
### Two layer
* activation functions:
    * Layer 1: ReLu
    * Layer 2: Sigmoid
* input_dim:
    * Layer 1: 8
    * Layer 2: 6
* hyperparameters:
    * eta (learning rate): 0.01
    * alpha (momentum): 0.9
* epochs:
    * max_epochs: 200000
```python=
model = TwoLayer(8, 6, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.9
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e + 1) % ch_epochs == 0:    # print the loss every ch_epochs epochs
        print(e + 1, model.L)
```
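Once training finishes, the fit can be verified by thresholding the network output at 0.5. A small sketch, assuming the `model` trained above (the same check applies to the deeper models below):
```python=
pred = (model.forward(train_data_np) > 0.5).astype(int)  # hard 0/1 predictions
print("training accuracy:", (pred == label_np).mean())
```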
### Three layer
* activation functions:
    * Layer 1: ReLu
    * Layer 2: Tanh
    * Layer 3: Sigmoid
* input_dim:
    * Layer 1: 8
    * Layer 2: 7
    * Layer 3: 5
* hyperparameters:
    * eta (learning rate): 0.01
    * alpha (momentum): 0.7
* epochs:
    * max_epochs: 200000
```python=
model = ThreeLayer(8, 7, 5, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e + 1) % ch_epochs == 0:
        print(e + 1, model.L)
```
### Four layer
* activation functions (matching the `FourLayer` class above):
    * Layer 1: ReLu
    * Layer 2: Sigmoid
    * Layer 3: ReLu
    * Layer 4: Sigmoid
* input_dim:
    * Layer 1: 8
    * Layer 2: 7
    * Layer 3: 7
    * Layer 4: 5
* hyperparameters:
    * eta (learning rate): 0.01
    * alpha (momentum): 0.7
* epochs:
    * max_epochs: 200000
```python=
model = FourLayer(8, 7, 7, 5, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    if (e + 1) % ch_epochs == 0:
        print(e + 1, model.L)
```
## 4.2 Plot the loss
* Add a `loss_draw` list to record the loss of every epoch.
```python=
loss_draw = []                     # records the loss of every epoch
model = FourLayer(8, 7, 7, 5, 1)
max_epochs, ch_epochs = 200000, 100
eta, alpha = 0.01, 0.7
for e in range(max_epochs):
    model.forward(train_data_np)
    model.backward(label_np)
    model.update(eta, alpha)
    loss_draw.append(model.L)      # append this epoch's loss to the list
    if (e + 1) % ch_epochs == 0:
        print(e + 1, model.L)
```
```python=
import matplotlib.pyplot as plt
x = np.arange(0, 200000)
y = np.array(loss_draw)
plt.title("Layer")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.plot(x,y)
plt.show()
```
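Since the final losses of the different depths differ by orders of magnitude (see 4.3), a logarithmic y-axis makes the curves easier to compare. An optional variant of the plot above, assuming the `x` and `y` arrays from the previous block:
```python=
plt.semilogy(x, y)                 # same data on a logarithmic loss axis
plt.xlabel("epoch")
plt.ylabel("loss (log scale)")
plt.show()
```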
## 4.3 Comparison
### Two layer
* ![](https://i.imgur.com/HcZr9Id.jpg)
* The final loss in the figure is 46.
* The two-layer loss generally ends up in the range of about 46 to 80.
### Three layer
* ![](https://i.imgur.com/i2Iz54v.jpg)
* The final loss in the figure is 6.
* The three-layer loss generally ends up in the range of about 5 to 20.
### Four layer
* ![](https://i.imgur.com/fnjxux1.jpg)
* The final loss in the figure is 0.8.
* The four-layer loss generally ends up in the range of about 0.9 to 10.
* Overfitting can occur with the four-layer network.