--- tags: 進階電腦系統理論與實作, NCKU Linux Kernel Internals, 作業系統 --- # ARGB To BW contributed by < `RusselCK` > ###### tags: `RusselCK` 【目標】 - [ ] [2020q3 第 10 週測驗題 - 測驗 1](https://hackmd.io/@sysprog/2020-quiz10#%E6%B8%AC%E9%A9%97-1) | `rgba.png` | | `bw.png` | |:------------------------------------:|:---:|:------------------------------------:| | ![](https://i.imgur.com/OfyUITV.png) | $\Rightarrow$ | ![](https://i.imgur.com/0EORL68.png) | 【實驗環境】 * [Raspberry Pi 4 Model B](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) (4GB RAM) 【GitHub】 * [RusselCK/sysprog2020/rgbaToBw](https://github.com/RusselCK/sysprog2020/tree/master/rgbaToBw) ## libattopng 製作彩色照片 - [ ] [libattopng](https://github.com/misc0110/libattopng) ```c= #include "libattopng.h" #define W 2048 #define H 1024 #define RGBA(r, g, b, a) ((r) | ((g) << 8) | ((b) << 16) | ((a) << 24)) libattopng_t* createRGBApng() { libattopng_t *png = libattopng_new(W, H, PNG_RGBA); int x, y; for (y = 0; y < H; y++) { for (x = 0; x < W; x++) { libattopng_set_pixel(png, x, y, RGBA(x & 255, y & 255, 128, (255 - ((x / 2) & 255)))); } } libattopng_save(png, "rgba.png"); return png; } libattopng_destroy(png); ``` ```shell $ gcc -c libattopng.c ``` ## v0: Original ```c= void rgba_to_bw(uint32_t *bitmap, int width, int height) { int row, col; uint32_t pixel, r, g, b, a, bw; for (row = 0; row < height; row++) { for (col = 0; col < width; col++) { pixel = bitmap[col + row * width]; a = (pixel >> 24) & 0xff; b = (pixel >> 16) & 0xff; g = (pixel >> 8) & 0xff; r = pixel & 0xff; bw = (uint32_t) (r * 0.299 + g * 0.587 + b * 0.114); bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw); } } } ``` ### 編譯執行 ```shell $ gcc -c v0_Original.c $ gcc -o v0_Original v0_Original.o libattopng.o $ ./v0_Original ``` ```shell Execution time of rgbaToBw() : 0.097144 // -O3 Execution time of rgbaToBw() : 0.015616 ``` ### 組合語言 & perf ```shell $ gcc -S -fverbose-asm v0_Original.c $ perf record ./v0_Original $ perf annotate $ perf stat 
./v0_Original ``` ``` @ v0_Original.c:22: bw = (uint32_t) (r * 0.299 + g * 0.587 + b * 0.114); Percent 9.22 ldr r3, [fp, #-32] @ tmp151, r vmov s15, r3 @ int @ tmp151, tmp151 vcvt.f64.u32 d7, s15 @ _8, tmp151 0.01 vldr.64 d6, .L6 @ tmp152, vmul.f64 d6, d7, d6 @ _9, _8, tmp152 2.02 ldr r3, [fp, #-28] @ tmp153, g vmov s15, r3 @ int @ tmp153, tmp153 vcvt.f64.u32 d7, s15 @ _10, tmp153 0.02 vldr.64 d5, .L6+8 @ tmp154, vmul.f64 d7, d7, d5 @ _11, _10, tmp154 vadd.f64 d6, d6, d7 @ _12, _9, _11 1.84 ldr r3, [fp, #-24] @ tmp155, b vmov s15, r3 @ int @ tmp155, tmp155 vcvt.f64.u32 d7, s15 @ _13, tmp155 1.77 vldr.64 d5, .L6+16 @ tmp156, vmul.f64 d7, d7, d5 @ _14, _13, tmp156 vadd.f64 d7, d6, d7 @ _15, _12, _14 vcvt.u32.f64 s15, d7 @ tmp157, _15 vmov r3, s15 @ int @ tmp157, tmp157 1.91 str r3, [fp, #-36] @ tmp157, bw @ v0_Original.c:24: bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw); 5.86 ldr r3, [fp, #-20] @ tmp158, a lsl r2, r3, #24 @ _16, tmp158, 50.81 ldr r3, [fp, #-36] @ tmp159, bw lsl r3, r3, #16 @ _17, tmp159, add r2, r2, r3 @ _18, _16, _17 1.87 ldr r3, [fp, #-36] @ tmp160, bw lsl r3, r3, #8 @ _19, tmp160, add r1, r2, r3 @ _20, _18, _19 ldr r3, [fp, #-8] @ tmp161, row 1.84 ldr r2, [fp, #-44] @ tmp162, width mul r2, r2, r3 @ _21, tmp162, tmp161 ldr r3, [fp, #-12] @ tmp163, col add r3, r2, r3 @ _22, _21, tmp163 lsl r3, r3, #2 @ _24, _23, 1.90 ldr r2, [fp, #-40] @ tmp164, bitmap add r3, r2, r3 @ _25, tmp164, _24 ldr r2, [fp, #-36] @ tmp165, bw add r2, r1, r2 @ _26, _20, tmp165 1.93 str r2, [r3] @ _26, *_25 ``` ```shell Performance counter stats for './v0_Original': 9,107.01 msec task-clock # 0.996 CPUs utilized 236 context-switches # 25.914 /sec 2 cpu-migrations # 0.220 /sec 6,199 page-faults # 680.684 /sec <not supported> cycles <not supported> instructions <not supported> branches <not supported> branch-misses 9.144416980 seconds time elapsed 8.918296000 seconds user 0.190819000 seconds sys ``` ![](https://i.imgur.com/O9pFGN4.png) ## v1: RGB Table 
```c= #define TABLE_SIZE 256 float table_R[TABLE_SIZE]; float table_G[TABLE_SIZE]; float table_B[TABLE_SIZE]; void generateRGBTable(){ int i; for(i = 0; i <= 0xff; i++){ table_R[i] = i * 0.299; table_G[i] = i * 0.587; table_B[i] = i * 0.114; } } void rgba_to_bw(uint32_t *bitmap, int width, int height) { int row, col; uint32_t pixel, r, g, b, a, bw; for (row = 0; row < height; row++) { for (col = 0; col < width; col++) { pixel = bitmap[col + row * width]; a = (pixel >> 24) & 0xff; b = (pixel >> 16) & 0xff; g = (pixel >> 8) & 0xff; r = pixel & 0xff; bw = (uint32_t) (table_R[r] + table_G[g] + table_B[b]); bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw); } } } ``` ```shell $ gcc -c v1_RGB_Table.c $ gcc -o v1_RGB_Table v1_RGB_Table.o libattopng.o $ ./v1_RGB_Table ``` ```shell Execution time of rgbaToBw() : 0.080772 // -O3 Execution time of rgbaToBw() : 0.014121 ``` ### 組合語言 & perf ```shell $ gcc -S -fverbose-asm v1_RGB_Table.c $ perf record ./v1_RGB_Table $ perf annotate $ perf stat ./v1_RGB_Table ``` ``` @ v1_RGB_Table.c:36: bw = (uint32_t) (table_R[r] + table_G[g] + table_B[b]); Percent 1.68 ldr r2, .L11 @ tmp148, 5.35 ldr r3, [fp, #-32] @ tmp149, r lsl r3, r3, #2 @ tmp150, tmp149, add r3, r2, r3 @ tmp151, tmp148, tmp150 10.86 vldr.32 s14, [r3] @ _8, table_R ldr r2, .L11+4 @ tmp152, 1.79 ldr r3, [fp, #-28] @ tmp153, g lsl r3, r3, #2 @ tmp154, tmp153, add r3, r2, r3 @ tmp155, tmp152, tmp154 2.00 vldr.32 s15, [r3] @ _9, table_G vadd.f32 s14, s14, s15 @ _10, _8, _9 1.27 ldr r2, .L11+8 @ tmp156, 0.49 ldr r3, [fp, #-24] @ tmp157, b lsl r3, r3, #2 @ tmp158, tmp157, add r3, r2, r3 @ tmp159, tmp156, tmp158 4.01 vldr.32 s15, [r3] @ _11, table_B vadd.f32 s15, s14, s15 @ _12, _10, _11 vcvt.u32.f32 s15, s15 @ tmp160, _12 vmov r3, s15 @ int @ tmp160, tmp160 0.01 str r3, [fp, #-36] @ tmp160, bw @ v1_RGB_Table.c:38: bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw); 1.71 ldr r3, [fp, #-20] @ tmp161, a lsl r2, r3, #24 @ _13, tmp161, 40.24 ldr 
r3, [fp, #-36] @ tmp162, bw lsl r3, r3, #16 @ _14, tmp162, add r2, r2, r3 @ _15, _13, _14 1.77 ldr r3, [fp, #-36] @ tmp163, bw lsl r3, r3, #8 @ _16, tmp163, add r1, r2, r3 @ _17, _15, _16 ldr r3, [fp, #-8] @ tmp164, row 1.73 ldr r2, [fp, #-44] @ tmp165, width mul r2, r2, r3 @ _18, tmp165, tmp164 ldr r3, [fp, #-12] @ tmp166, col add r3, r2, r3 @ _19, _18, tmp166 lsl r3, r3, #2 @ _21, _20, 1.77 ldr r2, [fp, #-40] @ tmp167, bitmap add r3, r2, r3 @ _22, tmp167, _21 ldr r2, [fp, #-36] @ tmp168, bw add r2, r1, r2 @ _23, _17, tmp168 1.86 str r2, [r3] @ _23, *_22 ``` ```shell Performance counter stats for './v1_RGB_Table': 9,659.24 msec task-clock # 0.991 CPUs utilized 393 context-switches # 40.686 /sec 4 cpu-migrations # 0.414 /sec 6,199 page-faults # 641.769 /sec <not supported> cycles <not supported> instructions <not supported> branches <not supported> branch-misses 9.742049997 seconds time elapsed 9.443774000 seconds user 0.210306000 seconds sys ``` ![](https://i.imgur.com/oSIIVEn.png) ![](https://i.imgur.com/armiDfC.png) ## v2: Pointer Offset ```c= void rgba_to_bw(uint32_t *bitmap, int width, int height) { int row, col; uint32_t *pixel = bitmap; uint8_t *r, *g, *b; uint8_t bw; for (row = 0; row < height; row++) { for (col = 0; col < width; col++) { b = (uint8_t*) pixel + 2; g = (uint8_t*) pixel + 1; r = (uint8_t*) pixel; bw = ((*r) * 0.299) + ((*g) * 0.587) + ((*b) * 0.114); *r = *g = *b = bw; pixel++; } } } ``` ```shell $ gcc -c v2_PointerOffset.c $ gcc -o v2_PointerOffset v2_PointerOffset.o libattopng.o $ ./v2_PointerOffset ``` ```shell Execution time of rgbaToBw() : 0.069611 // -O3 Execution time of rgbaToBw() : 0.015959 ``` ## v3: RGB Table + Pointer Offset ```c= #define TABLE_SIZE 256 float table_R[TABLE_SIZE]; float table_G[TABLE_SIZE]; float table_B[TABLE_SIZE]; void generateRGBTable(){ int i; for(i = 0; i <= 0xff; i++){ table_R[i] = i * 0.299; table_G[i] = i * 0.587; table_B[i] = i * 0.114; } } void rgba_to_bw(uint32_t *bitmap, int width, int height) { int 
row, col; uint32_t *pixel = bitmap; uint8_t *r, *g, *b; uint8_t bw; for (row = 0; row < height; row++) { for (col = 0; col < width; col++) { b = (uint8_t*) pixel + 2; g = (uint8_t*) pixel + 1; r = (uint8_t*) pixel; bw = table_R[*r] + table_G[*g] + table_B[*b]; *r = *g = *b = bw; pixel++; } } } ``` ```shell $ gcc -c v3_PointerOffset_RGBtable.c $ gcc -o v3_PointerOffset_RGBtable v3_PointerOffset_RGBtable.o libattopng.o $ ./v3_PointerOffset_RGBtable ``` ```shell Execution time of rgbaToBw() : 0.064737 // -O3 Execution time of rgbaToBw() : 0.015587 ``` ## v4: NEON ```c= #include <arm_neon.h> #define BW_SHIFT 8 /* Converts a width*height bitmap of 32-bit pixels to greyscale in place. Bytes 0, 1 and 2 of every pixel are replaced by (byte0*77 + byte1*151 + byte2*28) >> 8; byte 3 is written back unchanged. Eight pixels are processed per NEON iteration; the remaining (width*height) & 7 pixels go through the scalar tail loop. */ void rgba_to_bw(uint32_t *bitmap, int width, int height) { uint8_t *pixel = (uint8_t*) bitmap; /* Integer weights; 77 + 151 + 28 == 256, hence the final shift by BW_SHIFT. */ const uint8_t R2BW = 77, G2BW = 151, B2BW = 28; uint8x8_t _R2BW = vdup_n_u8(R2BW); uint8x8_t _G2BW = vdup_n_u8(G2BW); uint8x8_t _B2BW = vdup_n_u8(B2BW); int capacity = width * height; int round = capacity >> 3; for (int i = 0; i < round; ++i) { uint8x8x4_t _pixel = vld4_u8(pixel); uint16x8_t bw16; bw16 = vmull_u8( _pixel.val[2], _B2BW); bw16 = vmlal_u8(bw16, _pixel.val[1], _G2BW); bw16 = vmlal_u8(bw16, _pixel.val[0], _R2BW); _pixel.val[0] = vshrn_n_u16(bw16, BW_SHIFT); _pixel.val[1] = _pixel.val[0]; _pixel.val[2] = _pixel.val[1]; vst4_u8(pixel, _pixel); pixel += (4*8); } int remain = capacity & 0x7; uint8_t *r, *g, *b; uint8_t bw; /* Scalar tail: use the same byte-to-weight mapping as the vector loop above (byte 0 -> R2BW, byte 1 -> G2BW, byte 2 -> B2BW). */ for (int i = 0; i < remain; ++i) { r = pixel; g = pixel + 1; b = pixel + 2; bw = ((*r)*R2BW + (*g)*G2BW + (*b)*B2BW) >> BW_SHIFT; *r = *g = *b = bw; pixel += 4; } } ``` ```shell $ gcc -c -mfpu=neon v4_NEON.c $ gcc -o v4_NEON v4_NEON.o libattopng.o $ ./v4_NEON ``` ```shell Execution time of rgbaToBw() : 0.025005 // -O3 Execution time of rgbaToBw() : 0.008220 ``` ### Neon Intrinsics ```c // vdup -> ri = a; uint8x8_t vdup_n_u8 (uint8_t __a); // vld4 -> loads 4 vectors from memory. uint8x8x4_t vld4_u8 (const uint8_t * __a); // vst4 -> stores 4 vectors into memory. 
void vst4_u8 (uint8_t * __a, uint8x8x4_t __b); // vmull -> ri = ai * bi; uint16x8_t vmull_u8 (uint8x8_t __a, uint8x8_t __b); // vmlal -> ri = ai + bi * ci; uint16x8_t vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c); // vshrn -> ri = ai >> b; uint8x8_t vshrn_n_u16 (uint16x8_t __a, const int __b); ``` :::info - [ ] [Neon Intrinsics各函数介绍](https://blog.csdn.net/fengbingchun/article/details/38085781) - [ ] [NEON Programmer's Guide](https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics-Reference/Load-and-store/VLD4) ![](https://i.imgur.com/iMgT9Ag.png) ::: ## v5: NEON (Prefetch) ```c= #include <arm_neon.h> #define BW_SHIFT 8 /* Converts a width*height bitmap of 32-bit pixels to greyscale in place. Bytes 0, 1 and 2 of every pixel are replaced by (byte0*77 + byte1*151 + byte2*28) >> 8; byte 3 is written back unchanged. Eight pixels are processed per NEON iteration, with a prefetch issued for the following group; the remaining (width*height) & 7 pixels go through the scalar tail loop. */ void rgba_to_bw(uint32_t *bitmap, int width, int height) { uint8_t *pixel = (uint8_t*) bitmap; /* Integer weights; 77 + 151 + 28 == 256, hence the final shift by BW_SHIFT. */ const uint8_t R2BW = 77, G2BW = 151, B2BW = 28; uint8x8_t _R2BW = vdup_n_u8(R2BW); uint8x8_t _G2BW = vdup_n_u8(G2BW); uint8x8_t _B2BW = vdup_n_u8(B2BW); int capacity = width * height; int round = capacity >> 3; //#pragma unroll for (int i = 0; i < round; ++i) { /* Prefetch the next 8-pixel group of the bitmap. The earlier code prefetched &_pixel.val[n], i.e. the local copy that vld4_u8 had already loaded, not upcoming bitmap data. NOTE(review): the timings quoted below predate this change; re-measure. */ __builtin_prefetch(pixel + (4*8)); uint8x8x4_t _pixel = vld4_u8(pixel); uint16x8_t bw16; bw16 = vmull_u8( _pixel.val[2], _B2BW); bw16 = vmlal_u8(bw16, _pixel.val[1], _G2BW); bw16 = vmlal_u8(bw16, _pixel.val[0], _R2BW); _pixel.val[0] = vshrn_n_u16(bw16, BW_SHIFT); _pixel.val[1] = _pixel.val[0]; _pixel.val[2] = _pixel.val[1]; vst4_u8(pixel, _pixel); pixel += (4*8); } int remain = capacity & 0x7; uint8_t *r, *g, *b; uint8_t bw; /* Scalar tail: use the same byte-to-weight mapping as the vector loop above (byte 0 -> R2BW, byte 1 -> G2BW, byte 2 -> B2BW). */ for (int i = 0; i < remain; ++i) { r = pixel; g = pixel + 1; b = pixel + 2; bw = ((*r)*R2BW + (*g)*G2BW + (*b)*B2BW) >> BW_SHIFT; *r = *g = *b = bw; pixel += 4; } } ``` ```shell $ gcc -c -mfpu=neon v5_NEON_Unroll_PLD.c $ gcc -o v5_NEON_Unroll_PLD v5_NEON_Unroll_PLD.o libattopng.o $ ./v5_NEON_Unroll_PLD ``` ```shell Execution time of rgbaToBw() : 0.023558 // -O3 Execution time of rgbaToBw() : 0.007971 ``` :::info - [ ] [RealView Compilation Tools NEON Vectorizing Compiler Guide - `#pragma 
unroll [(n)]`](https://developer.arm.com/documentation/dui0350/a/CIHJHADD) - [ ] [数据预取 __builtin_prefetch()](https://www.cnblogs.com/dongzhiquan/p/3694858.html) - [ ] [6.53 Other built-in functions provided by GCC](https://gcc.gnu.org/onlinedocs/gcc-4.6.4/gcc/Other-Builtins.html#index-g_t_005f_005fbuiltin_005fprefetch-3145) ::: ## 效能評比 ![](https://i.imgur.com/BlvPBjx.png)