---
tags: 進階電腦系統理論與實作, NCKU Linux Kernel Internals, 作業系統
---
# ARGB To BW
contributed by < `RusselCK` >
###### tags: `RusselCK`
【目標】
- [ ] [2020q3 第 10 週測驗題 - 測驗 1](https://hackmd.io/@sysprog/2020-quiz10#%E6%B8%AC%E9%A9%97-1)
| `rgba.png` | | `bw.png` |
|:------------------------------------:|:---:|:------------------------------------:|
|  | $\Rightarrow$ |  |
【實驗環境】
* [Raspberry Pi 4 Model B](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) (4GB RAM)
【GitHub】
* [RusselCK/sysprog2020/rgbaToBw](https://github.com/RusselCK/sysprog2020/tree/master/rgbaToBw)
## libattopng 製作彩色照片
- [ ] [libattopng](https://github.com/misc0110/libattopng)
```c=
#include "libattopng.h"
#define W 2048
#define H 1024
/*
 * Pack four 8-bit channels into one 32-bit pixel: r in bits 0-7, g in bits
 * 8-15, b in bits 16-23 and a in bits 24-31.
 *
 * Every channel is widened to uint32_t before it is shifted, so an alpha of
 * 128 or more is never shifted into the sign bit of an int.
 */
#define RGBA(r, g, b, a) \
    ((uint32_t) (r) | ((uint32_t) (g) << 8) | ((uint32_t) (b) << 16) | ((uint32_t) (a) << 24))
/*
 * Build the W x H RGBA test image and write it to "rgba.png".
 *
 * Per pixel: red is x & 255, green is y & 255, blue is fixed at 128 and
 * alpha is 255 - ((x / 2) & 255).
 *
 * Returns the image, or a null pointer when libattopng_new() could not
 * create it (nothing is written in that case). The caller owns the returned
 * image and releases it with libattopng_destroy(png) once it is done.
 */
libattopng_t *createRGBApng()
{
    libattopng_t *png = libattopng_new(W, H, PNG_RGBA);

    if (png) {
        /* Unsigned coordinates keep every channel shift in RGBA() unsigned. */
        for (uint32_t y = 0; y < H; y++) {
            for (uint32_t x = 0; x < W; x++) {
                libattopng_set_pixel(png, x, y,
                                     RGBA(x & 255, y & 255, 128, (255 - ((x / 2) & 255))));
            }
        }
        libattopng_save(png, "rgba.png");
    }
    return png;
}
/* Once the caller has finished with the image: libattopng_destroy(png); */
```
```shell
$ gcc -c libattopng.c
```
## v0: Original
```c=
/*
 * Convert a width x height RGBA bitmap to grayscale, in place.
 *
 * Each 32-bit pixel is read as 0xAABBGGRR. The alpha byte is kept; the red,
 * green and blue bytes are all replaced by 0.299 R + 0.587 G + 0.114 B,
 * truncated to an integer. Nothing is touched when width or height is not
 * positive.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            const uint32_t pixel = bitmap[col + row * width];

            /* Split the word into its four channels, lowest byte first. */
            const uint32_t r = pixel & 0xff;
            const uint32_t g = (pixel >> 8) & 0xff;
            const uint32_t b = (pixel >> 16) & 0xff;
            const uint32_t a = (pixel >> 24) & 0xff;

            /* The weighted sum is evaluated in double precision. */
            const uint32_t bw = (uint32_t) (r * 0.299 + g * 0.587 + b * 0.114);

            /* Same gray value in R, G and B; alpha goes back unchanged. */
            bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw);
        }
    }
}
```
### 編譯執行
```shell
$ gcc -c v0_Original.c
$ gcc -o v0_Original v0_Original.o libattopng.o
$ ./v0_Original
```
```shell
Execution time of rgbaToBw() : 0.097144
// -O3
Execution time of rgbaToBw() : 0.015616
```
### 組合語言 & perf
```shell
$ gcc -S -fverbose-asm v0_Original.c
$ perf record ./v0_Original
$ perf annotate
$ perf stat ./v0_Original
```
```
@ v0_Original.c:22: bw = (uint32_t) (r * 0.299 + g * 0.587 + b * 0.114);
Percent
9.22 ldr r3, [fp, #-32] @ tmp151, r
vmov s15, r3 @ int @ tmp151, tmp151
vcvt.f64.u32 d7, s15 @ _8, tmp151
0.01 vldr.64 d6, .L6 @ tmp152,
vmul.f64 d6, d7, d6 @ _9, _8, tmp152
2.02 ldr r3, [fp, #-28] @ tmp153, g
vmov s15, r3 @ int @ tmp153, tmp153
vcvt.f64.u32 d7, s15 @ _10, tmp153
0.02 vldr.64 d5, .L6+8 @ tmp154,
vmul.f64 d7, d7, d5 @ _11, _10, tmp154
vadd.f64 d6, d6, d7 @ _12, _9, _11
1.84 ldr r3, [fp, #-24] @ tmp155, b
vmov s15, r3 @ int @ tmp155, tmp155
vcvt.f64.u32 d7, s15 @ _13, tmp155
1.77 vldr.64 d5, .L6+16 @ tmp156,
vmul.f64 d7, d7, d5 @ _14, _13, tmp156
vadd.f64 d7, d6, d7 @ _15, _12, _14
vcvt.u32.f64 s15, d7 @ tmp157, _15
vmov r3, s15 @ int @ tmp157, tmp157
1.91 str r3, [fp, #-36] @ tmp157, bw
@ v0_Original.c:24: bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw);
5.86 ldr r3, [fp, #-20] @ tmp158, a
lsl r2, r3, #24 @ _16, tmp158,
50.81 ldr r3, [fp, #-36] @ tmp159, bw
lsl r3, r3, #16 @ _17, tmp159,
add r2, r2, r3 @ _18, _16, _17
1.87 ldr r3, [fp, #-36] @ tmp160, bw
lsl r3, r3, #8 @ _19, tmp160,
add r1, r2, r3 @ _20, _18, _19
ldr r3, [fp, #-8] @ tmp161, row
1.84 ldr r2, [fp, #-44] @ tmp162, width
mul r2, r2, r3 @ _21, tmp162, tmp161
ldr r3, [fp, #-12] @ tmp163, col
add r3, r2, r3 @ _22, _21, tmp163
lsl r3, r3, #2 @ _24, _23,
1.90 ldr r2, [fp, #-40] @ tmp164, bitmap
add r3, r2, r3 @ _25, tmp164, _24
ldr r2, [fp, #-36] @ tmp165, bw
add r2, r1, r2 @ _26, _20, tmp165
1.93 str r2, [r3] @ _26, *_25
```
```shell
Performance counter stats for './v0_Original':
9,107.01 msec task-clock # 0.996 CPUs utilized
236 context-switches # 25.914 /sec
2 cpu-migrations # 0.220 /sec
6,199 page-faults # 680.684 /sec
<not supported> cycles
<not supported> instructions
<not supported> branches
<not supported> branch-misses
9.144416980 seconds time elapsed
8.918296000 seconds user
0.190819000 seconds sys
```

## v1: RGB Table
```c=
/* One table entry for every possible 8-bit channel value. */
#define TABLE_SIZE 256

/* Pre-multiplied luma contributions, indexed by the channel value. */
float table_R[TABLE_SIZE];
float table_G[TABLE_SIZE];
float table_B[TABLE_SIZE];

/*
 * Fill the three lookup tables: entry v holds v * 0.299 (red),
 * v * 0.587 (green) and v * 0.114 (blue). Each product is computed in
 * double precision and then stored as float.
 */
void generateRGBTable()
{
    for (int value = 0; value < TABLE_SIZE; ++value) {
        table_R[value] = value * 0.299;
        table_G[value] = value * 0.587;
        table_B[value] = value * 0.114;
    }
}
/*
 * Convert a width x height RGBA bitmap to grayscale, in place, using the
 * table_R / table_G / table_B lookup tables instead of multiplying.
 *
 * Each 32-bit pixel is read as 0xAABBGGRR. The alpha byte is kept; the red,
 * green and blue bytes are all replaced by the truncated sum of the three
 * table entries. The tables must have been filled (generateRGBTable) before
 * this is called; while they are still zero every pixel comes out black.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            const uint32_t pixel = bitmap[col + row * width];

            /* Split the word into its four channels, lowest byte first. */
            const uint32_t r = pixel & 0xff;
            const uint32_t g = (pixel >> 8) & 0xff;
            const uint32_t b = (pixel >> 16) & 0xff;
            const uint32_t a = (pixel >> 24) & 0xff;

            /* Three float lookups are summed, then truncated to an integer. */
            const uint32_t bw = (uint32_t) (table_R[r] + table_G[g] + table_B[b]);

            /* Same gray value in R, G and B; alpha goes back unchanged. */
            bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw);
        }
    }
}
```
```shell
$ gcc -c v1_RGB_Table.c
$ gcc -o v1_RGB_Table v1_RGB_Table.o libattopng.o
$ ./v1_RGB_Table
```
```shell
Execution time of rgbaToBw() : 0.080772
// -O3
Execution time of rgbaToBw() : 0.014121
```
### 組合語言 & perf
```shell
$ gcc -S -fverbose-asm v1_RGB_Table.c
$ perf record ./v1_RGB_Table
$ perf annotate
$ perf stat ./v1_RGB_Table
```
```
@ v1_RGB_Table.c:36: bw = (uint32_t) (table_R[r] + table_G[g] + table_B[b]);
Percent
1.68 ldr r2, .L11 @ tmp148,
5.35 ldr r3, [fp, #-32] @ tmp149, r
lsl r3, r3, #2 @ tmp150, tmp149,
add r3, r2, r3 @ tmp151, tmp148, tmp150
10.86 vldr.32 s14, [r3] @ _8, table_R
ldr r2, .L11+4 @ tmp152,
1.79 ldr r3, [fp, #-28] @ tmp153, g
lsl r3, r3, #2 @ tmp154, tmp153,
add r3, r2, r3 @ tmp155, tmp152, tmp154
2.00 vldr.32 s15, [r3] @ _9, table_G
vadd.f32 s14, s14, s15 @ _10, _8, _9
1.27 ldr r2, .L11+8 @ tmp156,
0.49 ldr r3, [fp, #-24] @ tmp157, b
lsl r3, r3, #2 @ tmp158, tmp157,
add r3, r2, r3 @ tmp159, tmp156, tmp158
4.01 vldr.32 s15, [r3] @ _11, table_B
vadd.f32 s15, s14, s15 @ _12, _10, _11
vcvt.u32.f32 s15, s15 @ tmp160, _12
vmov r3, s15 @ int @ tmp160, tmp160
0.01 str r3, [fp, #-36] @ tmp160, bw
@ v1_RGB_Table.c:38: bitmap[col + row * width] = (a << 24) + (bw << 16) + (bw << 8) + (bw);
1.71 ldr r3, [fp, #-20] @ tmp161, a
lsl r2, r3, #24 @ _13, tmp161,
40.24 ldr r3, [fp, #-36] @ tmp162, bw
lsl r3, r3, #16 @ _14, tmp162,
add r2, r2, r3 @ _15, _13, _14
1.77 ldr r3, [fp, #-36] @ tmp163, bw
lsl r3, r3, #8 @ _16, tmp163,
add r1, r2, r3 @ _17, _15, _16
ldr r3, [fp, #-8] @ tmp164, row
1.73 ldr r2, [fp, #-44] @ tmp165, width
mul r2, r2, r3 @ _18, tmp165, tmp164
ldr r3, [fp, #-12] @ tmp166, col
add r3, r2, r3 @ _19, _18, tmp166
lsl r3, r3, #2 @ _21, _20,
1.77 ldr r2, [fp, #-40] @ tmp167, bitmap
add r3, r2, r3 @ _22, tmp167, _21
ldr r2, [fp, #-36] @ tmp168, bw
add r2, r1, r2 @ _23, _17, tmp168
1.86 str r2, [r3] @ _23, *_22
```
```shell
Performance counter stats for './v1_RGB_Table':
9,659.24 msec task-clock # 0.991 CPUs utilized
393 context-switches # 40.686 /sec
4 cpu-migrations # 0.414 /sec
6,199 page-faults # 641.769 /sec
<not supported> cycles
<not supported> instructions
<not supported> branches
<not supported> branch-misses
9.742049997 seconds time elapsed
9.443774000 seconds user
0.210306000 seconds sys
```


## v2: Pointer Offset
```c=
/*
 * Convert a width x height RGBA bitmap to grayscale, in place, by walking
 * the pixel data byte by byte instead of shifting and masking whole words.
 *
 * Within each 4-byte pixel the first byte in memory is taken as red, the
 * second as green and the third as blue; the fourth byte is never written.
 * All three colour bytes receive 0.299 R + 0.587 G + 0.114 B, computed in
 * double precision and converted to an 8-bit value.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    uint8_t *px = (uint8_t *) bitmap;

    for (int y = 0; y < height; ++y) {
        /* Pixels are contiguous, so the cursor simply advances 4 bytes each step. */
        for (int x = 0; x < width; ++x, px += 4) {
            const uint8_t gray = (px[0] * 0.299) + (px[1] * 0.587) + (px[2] * 0.114);

            px[2] = gray;
            px[1] = gray;
            px[0] = gray;
        }
    }
}
```
```shell
$ gcc -c v2_PointerOffset.c
$ gcc -o v2_PointerOffset v2_PointerOffset.o libattopng.o
$ ./v2_PointerOffset
```
```shell
Execution time of rgbaToBw() : 0.069611
// -O3
Execution time of rgbaToBw() : 0.015959
```
## v3: RGB Table + Pointer Offset
```c=
/* Number of distinct values an 8-bit colour channel can take. */
#define TABLE_SIZE 256

/* Per-channel luma contribution for each possible channel value. */
float table_R[TABLE_SIZE];
float table_G[TABLE_SIZE];
float table_B[TABLE_SIZE];

/*
 * Precompute the weighted channel values: for every level in 0..255 store
 * level * 0.299, level * 0.587 and level * 0.114 in the red, green and blue
 * table respectively (double product, stored as float).
 */
void generateRGBTable()
{
    int level = 0;

    while (level < TABLE_SIZE) {
        table_R[level] = level * 0.299;
        table_G[level] = level * 0.587;
        table_B[level] = level * 0.114;
        ++level;
    }
}
/*
 * Convert a width x height RGBA bitmap to grayscale, in place, combining
 * byte-wise pixel access with the table_R / table_G / table_B lookups.
 *
 * Within each 4-byte pixel the first byte in memory indexes table_R, the
 * second table_G and the third table_B; the float sum is converted to an
 * 8-bit value and written to all three colour bytes. The fourth byte is
 * never written. The tables must have been filled (generateRGBTable)
 * before this is called.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    uint8_t *px = (uint8_t *) bitmap;

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            const uint8_t gray = table_R[px[0]] + table_G[px[1]] + table_B[px[2]];

            px[2] = gray;
            px[1] = gray;
            px[0] = gray;

            /* Step to the next 4-byte pixel. */
            px += 4;
        }
    }
}
```
```shell
$ gcc -c v3_PointerOffset_RGBtable.c
$ gcc -o v3_PointerOffset_RGBtable v3_PointerOffset_RGBtable.o libattopng.o
$ ./v3_PointerOffset_RGBtable
```
```shell
Execution time of rgbaToBw() : 0.064737
// -O3
Execution time of rgbaToBw() : 0.015587
```
## v4: NEON
```c=
#include <arm_neon.h>

/* The channel weights are scaled by 2^BW_SHIFT: 77 + 151 + 28 == 256. */
#define BW_SHIFT 8

/*
 * Convert a width x height RGBA bitmap to grayscale, in place, with NEON.
 *
 * Within each 4-byte pixel the first byte in memory is red, the second
 * green, the third blue and the fourth alpha. The gray value is
 * (77 R + 151 G + 28 B) >> 8, an 8-bit fixed-point form of
 * 0.299 R + 0.587 G + 0.114 B, written to all three colour bytes; alpha is
 * stored back unchanged.
 *
 * Eight pixels are handled per vector iteration; up to seven leftover
 * pixels go through a scalar loop that uses the same byte order and weights.
 * Nothing is done when bitmap is null or a dimension is not positive.
 * Assumes width * height fits in an int.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    const uint8_t R2BW = 77, G2BW = 151, B2BW = 28;

    if (!bitmap || width <= 0 || height <= 0)
        return;

    uint8_t *pixel = (uint8_t *) bitmap;
    const uint8x8_t r_weight = vdup_n_u8(R2BW);
    const uint8x8_t g_weight = vdup_n_u8(G2BW);
    const uint8x8_t b_weight = vdup_n_u8(B2BW);

    const int capacity = width * height;
    const int blocks = capacity >> 3;

    for (int i = 0; i < blocks; ++i) {
        /* De-interleave 8 pixels: val[0] = R, val[1] = G, val[2] = B, val[3] = A. */
        uint8x8x4_t px = vld4_u8(pixel);

        /* The weights sum to 256, so the 16-bit accumulator cannot overflow. */
        uint16x8_t bw16 = vmull_u8(px.val[2], b_weight);
        bw16 = vmlal_u8(bw16, px.val[1], g_weight);
        bw16 = vmlal_u8(bw16, px.val[0], r_weight);

        /* Drop the fixed-point scale and narrow back to 8 bits per lane. */
        px.val[0] = vshrn_n_u16(bw16, BW_SHIFT);
        px.val[1] = px.val[0];
        px.val[2] = px.val[0];

        vst4_u8(pixel, px);
        pixel += (4 * 8);
    }

    /* Scalar tail: must read R, G, B from the same bytes as the vector loop. */
    const int remain = capacity & 0x7;
    for (int i = 0; i < remain; ++i) {
        uint8_t *r = pixel;
        uint8_t *g = pixel + 1;
        uint8_t *b = pixel + 2;
        const uint8_t bw = (uint8_t) (((*r) * R2BW + (*g) * G2BW + (*b) * B2BW) >> BW_SHIFT);

        *r = *g = *b = bw;
        pixel += 4;
    }
}
```
```shell
$ gcc -c -mfpu=neon v4_NEON.c
$ gcc -o v4_NEON v4_NEON.o libattopng.o
$ ./v4_NEON
```
```shell
Execution time of rgbaToBw() : 0.025005
// -O3
Execution time of rgbaToBw() : 0.008220
```
### Neon Intrinsics
```c
// vdup_n: copy one scalar into every lane of the vector -> ri = a;
uint8x8_t vdup_n_u8 (uint8_t __a);
// vld4: load 4 vectors from memory (returned together as a uint8x8x4_t).
uint8x8x4_t vld4_u8 (const uint8_t * __a);
// vst4: store 4 vectors (a uint8x8x4_t) into memory.
void vst4_u8 (uint8_t * __a, uint8x8x4_t __b);
// vmull: widening multiply, 8-bit lanes in, 16-bit lanes out -> ri = ai * bi;
uint16x8_t vmull_u8 (uint8x8_t __a, uint8x8_t __b);
// vmlal: widening multiply-accumulate into 16-bit lanes -> ri = ai + bi * ci;
uint16x8_t vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c);
// vshrn_n: shift right by a constant and narrow 16-bit lanes to 8-bit -> ri = ai >> b;
uint8x8_t vshrn_n_u16 (uint16x8_t __a, const int __b);
```
:::info
- [ ] [Neon Intrinsics各函数介绍](https://blog.csdn.net/fengbingchun/article/details/38085781)
- [ ] [NEON Programmer's Guide](https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics-Reference/Load-and-store/VLD4)

:::
## v5: NEON (Prefetch)
```c=
#include <arm_neon.h>

/* The channel weights are scaled by 2^BW_SHIFT: 77 + 151 + 28 == 256. */
#define BW_SHIFT 8

/* How many 8-pixel blocks ahead of the current one to prefetch (tunable). */
#define PREFETCH_BLOCKS 8

/*
 * Convert a width x height RGBA bitmap to grayscale, in place, with NEON
 * and a software prefetch of upcoming pixel data.
 *
 * Within each 4-byte pixel the first byte in memory is red, the second
 * green, the third blue and the fourth alpha. The gray value is
 * (77 R + 151 G + 28 B) >> 8, an 8-bit fixed-point form of
 * 0.299 R + 0.587 G + 0.114 B, written to all three colour bytes; alpha is
 * stored back unchanged.
 *
 * Eight pixels are handled per vector iteration; up to seven leftover
 * pixels go through a scalar loop that uses the same byte order and weights.
 * Nothing is done when bitmap is null or a dimension is not positive.
 * Assumes width * height fits in an int.
 */
void rgba_to_bw(uint32_t *bitmap, int width, int height)
{
    const uint8_t R2BW = 77, G2BW = 151, B2BW = 28;

    if (!bitmap || width <= 0 || height <= 0)
        return;

    uint8_t *pixel = (uint8_t *) bitmap;
    const uint8x8_t r_weight = vdup_n_u8(R2BW);
    const uint8x8_t g_weight = vdup_n_u8(G2BW);
    const uint8x8_t b_weight = vdup_n_u8(B2BW);

    const int capacity = width * height;
    const int blocks = capacity >> 3;

    //#pragma unroll
    for (int i = 0; i < blocks; ++i) {
        /*
         * A prefetch has to name memory that has not been loaded yet, so it
         * targets a later block of the bitmap rather than a local vector.
         * The bound check keeps the address inside the image.
         */
        if (i + PREFETCH_BLOCKS < blocks)
            __builtin_prefetch(pixel + PREFETCH_BLOCKS * (4 * 8));

        /* De-interleave 8 pixels: val[0] = R, val[1] = G, val[2] = B, val[3] = A. */
        uint8x8x4_t px = vld4_u8(pixel);

        /* The weights sum to 256, so the 16-bit accumulator cannot overflow. */
        uint16x8_t bw16 = vmull_u8(px.val[2], b_weight);
        bw16 = vmlal_u8(bw16, px.val[1], g_weight);
        bw16 = vmlal_u8(bw16, px.val[0], r_weight);

        /* Drop the fixed-point scale and narrow back to 8 bits per lane. */
        px.val[0] = vshrn_n_u16(bw16, BW_SHIFT);
        px.val[1] = px.val[0];
        px.val[2] = px.val[0];

        vst4_u8(pixel, px);
        pixel += (4 * 8);
    }

    /* Scalar tail: must read R, G, B from the same bytes as the vector loop. */
    const int remain = capacity & 0x7;
    for (int i = 0; i < remain; ++i) {
        uint8_t *r = pixel;
        uint8_t *g = pixel + 1;
        uint8_t *b = pixel + 2;
        const uint8_t bw = (uint8_t) (((*r) * R2BW + (*g) * G2BW + (*b) * B2BW) >> BW_SHIFT);

        *r = *g = *b = bw;
        pixel += 4;
    }
}
```
```shell
$ gcc -c -mfpu=neon v5_NEON_Unroll_PLD.c
$ gcc -o v5_NEON_Unroll_PLD v5_NEON_Unroll_PLD.o libattopng.o
$ ./v5_NEON_Unroll_PLD
```
```shell
Execution time of rgbaToBw() : 0.023558
// -O3
Execution time of rgbaToBw() : 0.007971
```
:::info
- [ ] [RealView Compilation Tools NEON Vectorizing Compiler Guide - `#pragma unroll [(n)]`](https://developer.arm.com/documentation/dui0350/a/CIHJHADD)
- [ ] [数据预取 __builtin_prefetch()](https://www.cnblogs.com/dongzhiquan/p/3694858.html)
- [ ] [6.53 Other built-in functions provided by GCC](https://gcc.gnu.org/onlinedocs/gcc-4.6.4/gcc/Other-Builtins.html#index-g_t_005f_005fbuiltin_005fprefetch-3145)
:::
## 效能評比
