# CSAPP Exercise ## 2.58 ```c= /****************************************** *2.58 * * *******************************************/ #include<stdio.h> int is_little_endian(){ int test_num = 0xff; char *ptr = (char*)&test_num; if(ptr[0] = 0xff) return 1; else return 0; } int main(){ int endian = is_little_endian(); return 0; } ``` TODO: use uintptr_t, 0xff ## 2.66 ```c= /****************************************** *2.66 * * *******************************************/ #include<stdio.h> int left_most(unsigned x){ //set bits left to the Most Significant Bit to 1 x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; //set all bits to 0 except the Most Significant Bit return x & (~x >> 1); } int main(){ unsigned test_num = 0xff; int mask = left_most(test_num) } ``` ## 2.92 ```c= /****************************************** * 2.92 * Compute -f. If f is NaN, then return f. * *******************************************/ #include<stdio.h> typedef unsinged float_bits float_bits float_negate(float_bits f){ unsinged sign = f >> 31; unsinged exp = (f >> 23) & 0xff; unsinged frac = f & 0x7fffff; //detect NaN if(exp == 0xff && frac != 0){ return f; } return (~sign << 31) | (f & 0x7fffffff); } ``` TODO: double ## 4.47 ```c= /****************************************** * 4.47 * org: /* Bubble sort: Array version */ void bubble_a(long *data, long count){ long i, last; for(last = count-1; last > 0; last--){ for(i = 0; i < last; i++){ if(data[i+1] < data[i]){ /* Swap adjacent elements */ long t = data[i+1]; data[i+1] = data[i]; data[i] = t; } } } } * *******************************************/ #include<stdio.h> /* Bubble sort: Pointer version */ void bubble_a(long *data, long count) { long i, last; for (last = count - 1; last > 0; last--) { for (i = 0; i < last; i++) { if (*(data + i + 1) < *(data + i)) { /* Swap adjacent elements */ long t = *(data + i + 1); *(data + i + 1) = *(data + i); *(data + i) = t; } } } } int main() { long test_array[7] = { 15, 7, 8, 4, 6, 2, 9 }; bubble_a(test_array, 7); int i; for (i = 0; i<7; i++) printf("%ld ", *(test_array + i)); return 0; } ``` ## 6.37 > 待補 | 函數 | N=64 | n=60 | |-----|------|------| |sumA | 25% | | |sumB | 100% | 100% | |sumC | | | 每次循環有 16/4 = 4 個不命中 64*64/16 = 256 個循環 ## 6.45 想法: 將矩陣分成數個小矩陣,再分別做轉置,可降低 Cache miss,並減少執行時間 ```c= /****************************************** * 6.45 * * *******************************************/ #include<stdio.h> //origin void transpose(int *dst, int *src, int dim){ int i, j; for(i = 0; i < dim; i++){ for(j = 0; j < dim; j++){ dst[j*dim + i] = src[i*dim + j]; } } } //effictive void transpose_2(int *dst, int *src, int dim){ int i, j, a, b; int block = 2; for(i = 0; i <= (dim - block); i += block){ for(j = 0; j <= (dim - block); j += block){ for(a = 0; a < (i+block); a++){ for(b = 0; b < (j+block); b++){ dst[b*dim + a] = src[a*dim + b]; } } } } } void printMatrix(int matrix[], int dim){ int i, j; for(i = 0; i < dim; i++){ for(j = 0; j < dim; j++){ printf("%d ",matrix[i*dim + j]); } printf("\n"); } } void run(){ int dim = 4; int test_matrix[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16}; int transpose_matrix[16] = {0}; printMatrix(test_matrix, dim); //transpose(transpose_matrix, test_matrix, dim); transpose_2(transpose_matrix, test_matrix, dim); printMatrix(transpose_matrix, dim); } int main(){ int i; for(i = 0; i < 1000; ++i) run(); return 0; } ``` time(執行 100 次): origin: real 0m0.080s user 0m0.008s sys 0m0.024s effective: real 0m0.031s user 0m0.016s sys 0m0.012s 以 perf 測試效能(執行 1000 次): * origin: Performance counter stats for './645' (10 runs): 5,771 cache-misses # 2.530 % of all cache refs ( +- 6.04% ) (64.43%) 228,135 cache-references ( +- 4.37% ) (72.49%) 410,200 L1-dcache-load-misses ( +- 2.56% ) (71.55%) 38,842 L1-dcache-store-misses ( +- 4.59% ) (71.40%) 13,642 L1-dcache-prefetch-misses ( +- 8.27% ) (68.02%) 2,519,930 L1-icache-load-misses ( +- 3.63% ) (60.08%) 0.044088132 seconds time elapsed ( +- 5.69% ) * effective: Performance counter stats for './645' (10 runs): 4,994 cache-misses # 3.228 % of all cache refs ( +- 14.34% ) (65.81%) 154,688 cache-references ( +- 14.68% ) (69.87%) 301,310 L1-dcache-load-misses ( +- 5.61% ) (70.85%) 33,559 L1-dcache-store-misses ( +- 6.88% ) (72.73%) 10,072 L1-dcache-prefetch-misses ( +- 20.29% ) (67.46%) 2,126,683 L1-icache-load-misses ( +- 7.58% ) (62.12%) 0.041932704 seconds time elapsed ( +- 6.70% ) 可以看到,,cache miss明顯減少,執行時間為原本的一半左右,效能有顯著提升。 ## 6.46 想法: 與上題類似 ```c= /****************************************** * 6.46 * * *******************************************/ #include<stdio.h> void printMatrix(int matrix[], int dim); //origin version void col_convert(int *G, int dim){ int i, j; for(i = 0; i < dim; i++){ for(j = 0; j < dim; j++){ G[j*dim + i] = G[j*dim + i] || G[i*dim + j]; } } } //effective version void col_convert_2(int *G, int dim){ int i, j, a, b; int block = 2; for(i = 0; i < (dim-block); i += block){ for(j = 0; j < (dim-block); j += block){ for(a = 0; a < (j +block); a++){ for(b = 0; b < (j + block); b++){ G[b*dim + a] = G[b*dim + a] || G[b*dim + a]; } } } } } void printMatrix(int matrix[], int dim){ int i, j; for(i = 0; i < dim; i++){ for(j = 0; j < dim; j++){ printf("%d " ,matrix[i*dim + j]); } printf("\n"); } } void run(){ int dim = 4; int test_G[16] = {1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1}; printMatrix(test_G, dim); col_convert(test_G, dim); //col_convert_2(test_G, dim); printMatrix(test_G, dim); } int main(){ int i; for(i = 0; i < 100; i++) run(); return 0; } ``` time(執行 100 次): origin: real 0m0.009s user 0m0.000s sys 0m0.004s effective: real 0m0.004s user 0m0.000s sys 0m0.000s 以 perf 測試效能(執行 1000 次): * origin: Performance counter stats for './646' (10 runs): 2,679 cache-misses # 9.948 % of all cache refs ( +- 36.41% ) (0.00%) 26,933 cache-references ( +- 7.47% ) 39,035 L1-dcache-load-misses ( +- 6.51% ) 5,246 L1-dcache-store-misses ( +- 4.28% ) 0 L1-dcache-prefetch-misses (58.88%) <not counted> L1-icache-load-misses ( +-100.00% ) (0.00%) 0.006124963 seconds time elapsed ( +- 43.83% ) * effective: Performance counter stats for './646' (10 runs): 1,542 cache-misses # 4.843 % of all cache refs ( +- 31.26% ) (0.00%) 31,842 cache-references ( +- 6.76% ) 43,680 L1-dcache-load-misses ( +- 3.20% ) 6,229 L1-dcache-store-misses ( +- 3.63% ) 0 L1-dcache-prefetch-misses (48.34%) <not counted> L1-icache-load-misses (0.00%) 0.003396345 seconds time elapsed ( +- 5.30% ) 可以看到,cache miss減少約四成,執行時間為原本的一半,效能有顯著提升。