# CSAPP Exercise
## 2.58
```c=
/******************************************
*2.58
*
*
*******************************************/
#include<stdio.h>
int is_little_endian(){
int test_num = 0xff;
char *ptr = (char*)&test_num;
if(ptr[0] = 0xff)
return 1;
else
return 0;
}
int main(){
int endian = is_little_endian();
return 0;
}
```
TODO: use uintptr_t, 0xff
## 2.66
```c=
/******************************************
*2.66
*
*
*******************************************/
#include<stdio.h>
int left_most(unsigned x){
//set bits left to the Most Significant Bit to 1
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
//set all bits to 0 except the Most Significant Bit
return x & (~x >> 1);
}
int main(){
unsigned test_num = 0xff;
int mask = left_most(test_num)
}
```
## 2.92
```c=
/******************************************
* 2.92
* Compute -f. If f is NaN, then return f.
*
*******************************************/
#include<stdio.h>
typedef unsinged float_bits
float_bits float_negate(float_bits f){
unsinged sign = f >> 31;
unsinged exp = (f >> 23) & 0xff;
unsinged frac = f & 0x7fffff;
//detect NaN
if(exp == 0xff && frac != 0){
return f;
}
return (~sign << 31) | (f & 0x7fffffff);
}
```
TODO: double
## 4.47
```c=
/******************************************
* 4.47
* org:
/* Bubble sort: Array version */
void bubble_a(long *data, long count){
long i, last;
for(last = count-1; last > 0; last--){
for(i = 0; i < last; i++){
if(data[i+1] < data[i]){
/* Swap adjacent elements */
long t = data[i+1];
data[i+1] = data[i];
data[i] = t;
}
}
}
}
*
*******************************************/
#include<stdio.h>
/* Bubble sort: Pointer version */
void bubble_a(long *data, long count) {
long i, last;
for (last = count - 1; last > 0; last--) {
for (i = 0; i < last; i++) {
if (*(data + i + 1) < *(data + i)) {
/* Swap adjacent elements */
long t = *(data + i + 1);
*(data + i + 1) = *(data + i);
*(data + i) = t;
}
}
}
}
int main() {
long test_array[7] = { 15, 7, 8, 4, 6, 2, 9 };
bubble_a(test_array, 7);
int i;
for (i = 0; i<7; i++)
printf("%ld ", *(test_array + i));
return 0;
}
```
## 6.37
> 待補
| 函數 | N=64 | n=60 |
|-----|------|------|
|sumA | 25% | |
|sumB | 100% | 100% |
|sumC | | |
每次循環有 16/4 = 4 個不命中
64*64/16 = 256 個循環
## 6.45
想法:
將矩陣分成數個小矩陣,再分別做轉置,可降低 Cache miss,並減少執行時間
```c=
/******************************************
* 6.45
*
*
*******************************************/
#include<stdio.h>
//origin
void transpose(int *dst, int *src, int dim){
int i, j;
for(i = 0; i < dim; i++){
for(j = 0; j < dim; j++){
dst[j*dim + i] = src[i*dim + j];
}
}
}
//effictive
void transpose_2(int *dst, int *src, int dim){
int i, j, a, b;
int block = 2;
for(i = 0; i <= (dim - block); i += block){
for(j = 0; j <= (dim - block); j += block){
for(a = 0; a < (i+block); a++){
for(b = 0; b < (j+block); b++){
dst[b*dim + a] = src[a*dim + b];
}
}
}
}
}
void printMatrix(int matrix[], int dim){
int i, j;
for(i = 0; i < dim; i++){
for(j = 0; j < dim; j++){
printf("%d ",matrix[i*dim + j]);
}
printf("\n");
}
}
void run(){
int dim = 4;
int test_matrix[16] = {1, 2, 3, 4,
5, 6, 7, 8,
9,10,11,12,
13,14,15,16};
int transpose_matrix[16] = {0};
printMatrix(test_matrix, dim);
//transpose(transpose_matrix, test_matrix, dim);
transpose_2(transpose_matrix, test_matrix, dim);
printMatrix(transpose_matrix, dim);
}
int main(){
int i;
for(i = 0; i < 1000; ++i)
run();
return 0;
}
```
time(執行 100 次):
origin:
real 0m0.080s
user 0m0.008s
sys 0m0.024s
effective:
real 0m0.031s
user 0m0.016s
sys 0m0.012s
以 perf 測試效能(執行 1000 次):
* origin:
Performance counter stats for './645' (10 runs):
5,771 cache-misses # 2.530 % of all cache refs ( +- 6.04% ) (64.43%)
228,135 cache-references ( +- 4.37% ) (72.49%)
410,200 L1-dcache-load-misses ( +- 2.56% ) (71.55%)
38,842 L1-dcache-store-misses ( +- 4.59% ) (71.40%)
13,642 L1-dcache-prefetch-misses ( +- 8.27% ) (68.02%)
2,519,930 L1-icache-load-misses ( +- 3.63% ) (60.08%)
0.044088132 seconds time elapsed ( +- 5.69% )
* effective:
Performance counter stats for './645' (10 runs):
4,994 cache-misses # 3.228 % of all cache refs ( +- 14.34% ) (65.81%)
154,688 cache-references ( +- 14.68% ) (69.87%)
301,310 L1-dcache-load-misses ( +- 5.61% ) (70.85%)
33,559 L1-dcache-store-misses ( +- 6.88% ) (72.73%)
10,072 L1-dcache-prefetch-misses ( +- 20.29% ) (67.46%)
2,126,683 L1-icache-load-misses ( +- 7.58% ) (62.12%)
0.041932704 seconds time elapsed ( +- 6.70% )
可以看到,,cache miss明顯減少,執行時間為原本的一半左右,效能有顯著提升。
## 6.46
想法:
與上題類似
```c=
/******************************************
* 6.46
*
*
*******************************************/
#include<stdio.h>
void printMatrix(int matrix[], int dim);
//origin version
void col_convert(int *G, int dim){
int i, j;
for(i = 0; i < dim; i++){
for(j = 0; j < dim; j++){
G[j*dim + i] = G[j*dim + i] || G[i*dim + j];
}
}
}
//effective version
void col_convert_2(int *G, int dim){
int i, j, a, b;
int block = 2;
for(i = 0; i < (dim-block); i += block){
for(j = 0; j < (dim-block); j += block){
for(a = 0; a < (j +block); a++){
for(b = 0; b < (j + block); b++){
G[b*dim + a] = G[b*dim + a] || G[b*dim + a];
}
}
}
}
}
void printMatrix(int matrix[], int dim){
int i, j;
for(i = 0; i < dim; i++){
for(j = 0; j < dim; j++){
printf("%d " ,matrix[i*dim + j]);
}
printf("\n");
}
}
void run(){
int dim = 4;
int test_G[16] = {1, 0, 1, 1,
0, 1, 0, 0,
0, 0, 1, 1,
0, 0, 0, 1};
printMatrix(test_G, dim);
col_convert(test_G, dim);
//col_convert_2(test_G, dim);
printMatrix(test_G, dim);
}
int main(){
int i;
for(i = 0; i < 100; i++)
run();
return 0;
}
```
time(執行 100 次):
origin:
real 0m0.009s
user 0m0.000s
sys 0m0.004s
effective:
real 0m0.004s
user 0m0.000s
sys 0m0.000s
以 perf 測試效能(執行 1000 次):
* origin:
Performance counter stats for './646' (10 runs):
2,679 cache-misses # 9.948 % of all cache refs ( +- 36.41% ) (0.00%)
26,933 cache-references ( +- 7.47% )
39,035 L1-dcache-load-misses ( +- 6.51% )
5,246 L1-dcache-store-misses ( +- 4.28% )
0 L1-dcache-prefetch-misses (58.88%)
<not counted> L1-icache-load-misses ( +-100.00% ) (0.00%)
0.006124963 seconds time elapsed ( +- 43.83% )
* effective:
Performance counter stats for './646' (10 runs):
1,542 cache-misses # 4.843 % of all cache refs ( +- 31.26% ) (0.00%)
31,842 cache-references ( +- 6.76% )
43,680 L1-dcache-load-misses ( +- 3.20% )
6,229 L1-dcache-store-misses ( +- 3.63% )
0 L1-dcache-prefetch-misses (48.34%)
<not counted> L1-icache-load-misses (0.00%)
0.003396345 seconds time elapsed ( +- 5.30% )
可以看到,cache miss減少約四成,執行時間為原本的一半,效能有顯著提升。