# CNN Theory
### The Data Structure of an Image
CNNs usually work on grayscale images (brightness values 0~255)

https://ming-lian.github.io/2019/05/30/Mathmatic-Principle-in-CNN/
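A minimal sketch of reading those brightness values in Go (the file name `input.png` is just a placeholder):
```go=
package main

import (
	"fmt"
	"image"
	"image/color"
	_ "image/png" // register the PNG decoder
	"os"
)

func main() {
	f, err := os.Open("input.png") // placeholder file name
	if err != nil {
		panic(err)
	}
	defer f.Close()
	img, _, err := image.Decode(f)
	if err != nil {
		panic(err)
	}
	// convert the top-left pixel to grayscale; Y is its 0~255 brightness
	gray := color.GrayModel.Convert(img.At(0, 0)).(color.Gray)
	fmt.Println("brightness:", gray.Y)
}
```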
### Convolution
Convolution kernel (operator mask):
An N×N matrix of weights that is slid across the original image to convolve it.
e.g. a 3×3 kernel:
$\begin{bmatrix} 0 & 0 & 0 \\ 1 & 1 & 1 \\ 0 & 0 & 0 \end{bmatrix}$
Filtered image:
kernel \* original image (where \* denotes the convolution operation)
During convolution, at every position the kernel's entries are multiplied point-by-point with the image pixels they cover, and the products are summed, as shown below:


The resulting image shrinks from $M \times M$ to $(M-N+1) \times (M-N+1)$.
Note: the above assumes a stride of 1; the stride (how far the kernel moves at each step) can also be some other value.
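More generally, with stride $s$ the output side length is $\lfloor (M-N)/s \rfloor + 1$; for example, a $5 \times 5$ image convolved with a $3 \times 3$ kernel at stride 1 yields a $3 \times 3$ feature map.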
https://chih-sheng-huang821.medium.com/%E5%8D%B7%E7%A9%8D%E7%A5%9E%E7%B6%93%E7%B6%B2%E8%B7%AF-convolutional-neural-network-cnn-%E5%8D%B7%E7%A9%8D%E9%81%8B%E7%AE%97-%E6%B1%A0%E5%8C%96%E9%81%8B%E7%AE%97-856330c2b703
```go=
// convolve applies every kernel in allKernels to img and returns one
// feature map per kernel (the border is zero-padded, so each map keeps
// the image's size; Bounds().Min is assumed to be (0,0))
func convolve(img image.Image, allKernels [][][]int) [][][]int {
	// get the image bounds
	maxImgX := img.Bounds().Max.X
	maxImgY := img.Bounds().Max.Y
	// allocate one feature map per kernel
	featureMap := make([][][]int, len(allKernels))
	for i := range featureMap {
		featureMap[i] = make([][]int, maxImgY)
		for y := range featureMap[i] {
			featureMap[i][y] = make([]int, maxImgX)
		}
	}
	// visit every pixel
	for y := img.Bounds().Min.Y; y < maxImgY; y++ {
		for x := img.Bounds().Min.X; x < maxImgX; x++ {
			// apply every kernel at this pixel
			for i, subKernel := range allKernels {
				featureMap[i][y][x] = applyKernel(x, y, img, subKernel)
			}
		}
	}
	return featureMap
}

// applyKernel centres subKernel on pixel (x, y) of img and returns the
// weighted sum of the grayscale values it covers
func applyKernel(x int, y int, img image.Image, subKernel [][]int) int {
	var v int // accumulated weighted sum
	// walk over the kernel
	for kernelY := -len(subKernel) / 2; kernelY <= len(subKernel)/2; kernelY++ {
		for kernelX := -len(subKernel[0]) / 2; kernelX <= len(subKernel[0])/2; kernelX++ {
			// skip positions outside the image (treated as zero)
			if x+kernelX < 0 || x+kernelX > img.Bounds().Max.X-1 ||
				y+kernelY < 0 || y+kernelY > img.Bounds().Max.Y-1 {
				continue
			}
			// convert the covered pixel to grayscale and multiply by the kernel weight
			v += int(color.GrayModel.Convert(img.At(x+kernelX, y+kernelY)).(color.Gray).Y) *
				subKernel[kernelY+len(subKernel)/2][kernelX+len(subKernel[0])/2]
		}
	}
	return v
}
```
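A usage sketch tying the pieces above together (the driver function and kernel choice are assumptions for illustration, not part of the original):
```go=
// a hypothetical driver: convolve img with the 3x3 kernel from the example
func buildFeatureMaps(img image.Image) [][][]int {
	allKernels := [][][]int{
		{
			{0, 0, 0},
			{1, 1, 1},
			{0, 0, 0},
		},
	}
	return convolve(img, allKernels) // one feature map per kernel
}
```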
---
### Pooling
Pooling is a dimensionality-reduction step that simplifies the data without losing the important information; the most common variants are max pooling and average pooling.
Max pooling is shown below:

https://chih-sheng-huang821.medium.com/%E5%8D%B7%E7%A9%8D%E7%A5%9E%E7%B6%93%E7%B6%B2%E8%B7%AF-convolutional-neural-network-cnn-%E5%8D%B7%E7%A9%8D%E9%81%8B%E7%AE%97-%E6%B1%A0%E5%8C%96%E9%81%8B%E7%AE%97-856330c2b703
Average pooling is shown below:

https://blog.csdn.net/u013289254/article/details/99080916
```go=
// max pooling: 2x2 windows with stride 2, halving width and height
func pooling(featureMap [][][]int) [][][]int {
	poolmap := make([][][]int, len(featureMap))
	for i, subFeatureMap := range featureMap {
		rows := len(subFeatureMap) / 2
		cols := len(subFeatureMap[0]) / 2
		poolmap[i] = make([][]int, rows)
		for y := 0; y < rows; y++ {
			poolmap[i][y] = make([]int, cols)
			for x := 0; x < cols; x++ {
				// keep the largest value in each 2x2 window
				poolmap[i][y][x] = max(
					max(subFeatureMap[2*y][2*x:2*x+2]...),
					max(subFeatureMap[2*y+1][2*x:2*x+2]...))
			}
		}
	}
	return poolmap
}

// max returns the largest of its arguments
func max(input ...int) int {
	m := input[0] // start from the first value so negative inputs work too
	for _, number := range input[1:] {
		if number > m {
			m = number
		}
	}
	return m
}
```
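Continuing the hypothetical driver from the convolution section, pooling plugs directly into the convolution output:
```go=
pooled := pooling(buildFeatureMaps(img)) // each map's width and height are halved
```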
### Fully Connected
Step 1: Flatten the pooled data, i.e. spread it from 2-D into 1-D (if the data sits in one contiguous buffer, this can be done in O(1) time by reinterpreting it through a pointer instead of copying; a copy-based sketch follows below).
Step 2: Feed the flattened data into an ANN (artificial neural network) for training.
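A minimal Go sketch of step 1 (an assumption for illustration: Go's nested slices are not contiguous, so this version copies; the O(1) pointer reinterpretation mentioned above only applies to a contiguous row-major buffer):
```go=
// flatten spreads the pooled 3-D data into a single 1-D slice for the ANN
func flatten(poolMap [][][]int) []int {
	var flat []int
	for _, featureMap := range poolMap {
		for _, row := range featureMap {
			flat = append(flat, row...)
		}
	}
	return flat
}
```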
```cpp=
#include <helper_cuda.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#define precision float

struct nerve
{
	std::vector<precision> weight;
};

struct layer
{
	int indegree = 0;
	int outdegree = 0;
	std::vector<nerve> nerves;
};

// weighted sum of one neuron's inputs against its weights; all per-neuron
// calculation, including the activation function, should be done here
__device__ precision calculate_nerve(const precision* g_a, const precision* g_w, int n) {
	precision reduce_a = 0;
	for (int i = 0; i < n; i++)
		reduce_a += g_a[i] * g_w[i];
	return reduce_a;
}

// build a layer of outdegree neurons, each with indegree zeroed weights
layer init_layer(int indegree, int outdegree) {
	layer content;
	nerve initnerve;
	content.indegree = indegree;
	content.outdegree = outdegree;
	initnerve.weight.assign(indegree, 0);
	content.nerves.assign(outdegree, initnerve);
	return content;
}

// test kernel (from NVIDIA's cudaOpenMP sample): add a constant to every element
__global__ void kernelAddConstant(int* g_a, const int b) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	g_a[idx] += b;
}

int main() {
	// the layer code above is still a work in progress;
	// remember to buffer data before executing
	int num_gpus = 0; // number of CUDA GPUs
	cudaGetDeviceCount(&num_gpus);
	if (num_gpus < 1)
		return 1;
	omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices

	// sample host data (sizes are placeholder values)
	int n = num_gpus * 8192;
	unsigned int nbytes = n * sizeof(int);
	int b = 3; // constant added by the test kernel
	int* a = (int*)malloc(nbytes);
	memset(a, 0, nbytes);

#pragma omp parallel
	{
		unsigned int cpu_thread_id = omp_get_thread_num();
		unsigned int num_cpu_threads = omp_get_num_threads();
		// set and check the CUDA device for this CPU thread;
		// "% num_gpus" allows more CPU threads than GPU devices
		int gpu_id = -1;
		checkCudaErrors(cudaSetDevice(cpu_thread_id % num_gpus));
		checkCudaErrors(cudaGetDevice(&gpu_id));
		printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
		int* d_a = 0; // pointer to memory on the device associated with this CPU thread
		int* sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
		unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
		dim3 gpu_threads(128); // 128 threads per block
		dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
		checkCudaErrors(cudaMalloc((void**)&d_a, nbytes_per_kernel));
		checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
		checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
		kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
		checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
		checkCudaErrors(cudaFree(d_a));
	}
	free(a);
	return 0;
}
```
### Overall Flow
$\text{Convolution} \rightarrow \text{Pooling} \rightarrow \text{Fully Connected} \rightarrow \text{feed into an ANN}$

https://medium.com/jameslearningnote/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E7%AC%AC5-1%E8%AC%9B-%E5%8D%B7%E7%A9%8D%E7%A5%9E%E7%B6%93%E7%B6%B2%E7%B5%A1%E4%BB%8B%E7%B4%B9-convolutional-neural-network-4f8249d65d4f