# 2018q3 Homework6 contributed by <`datuiji`> ## 題目 - CSAPP 5.17 庫函數 *memset 的原型如下: void *memset(void *s, int c, size_t n): 這個函數將從以 s 開始的 n 個字節的內存區域都填充為 c 的低位字節。例如,通過將參數 c 設置為 0 ,可以用這個函數來對一個內存區域清零,不過用其他值也是可以的。下面是 memset 最直接的實現: ```clike /* Basic implementtation of memset*/ void *basic_memset(void *s, int c, size_t n) { size_t cnt = 0; unsigned char *schar = s; while (cnt < n){ *schar++ = (unsigned char) c; cnt++; } return s; } ``` 實現該函數一個更有效的版本,使用數據類行為 unsigned long 的字來裝下8個 c ,然後用字級的寫遍歷目標內存區域。你可能發現增加額外的循環展開會有所幫助。在我們的參考機上,能夠把 CPE 從直接實現的1.00降到0.127。即,程序每個週期可以寫8個字節。 這裡是一些額外的指導原則。在此,假設 K 表示你運行程序的機器上的 sizeof(unsigned long) 的值。 * 你不可以調用任何庫函數 * 你的代碼應該對任意 n 的值都能工作,包括當它不是 K 的倍數的時候。你可以用類似於循環展開時完成最後幾次迭代的方法作到這一點。 * 你寫的代碼應該無論 K 的值是多少,都能正確編譯和運行。使用操作 sizeof 來做到這一點。 * 在某些機器上,未對齊的寫可能比對其的寫慢很多。(在某些非 x86 機器上,未對齊的寫甚至可能會導致段錯誤。)寫出這樣的代碼,開始時直到目的地址是 K 的倍數時,使用字節級的寫,然後進行字級的寫,(如果需要)最後採用用字節級的寫。 * 注意 cnt 足夠小以至於一些循環上界變成負數的情況。對於涉及 sizeof 運算符的表達式,可以用無符號運算來執行測試。(參見 2.2.8節和家庭作業2.72) ## basicMemset.c CPE 測試 ```clike= #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include "clock.h" void *basic_memset(void *s, int c, size_t n,double *cyc) { size_t cnt = 0; unsigned char *schar = s; start_comp_counter(); while (cnt < n){ *schar++ = (unsigned char) c; cnt++; } *cyc = get_comp_counter(); return s; } int main(int argc, char const *argv[]) { size_t space = sizeof(char) * 65537; void* basic_space = malloc(space); int basic_fill = 0xff; double cyc; basic_memset(basic_space, basic_fill, space, &cyc); float CPE = cyc / 65537; printf("CPE = %f\n", CPE); return 0; } ``` ```clike= $ gcc -o basicMemset basicMemset.c -L. -lclock $ ./basicMemset CPE = 8.508232 ``` - 一開始利用`gcc -g -Wall -o basicMemset basicMemset.c` 會報以下錯誤訊息 ```clike= /tmp/ccLgqXzP.o: 於函式 main: /home/datuiji/下載/CS-APP3e-labs-master/performancelab/basicMemset.c:25: 未定義參考到「start_comp_counter」 /home/datuiji/下載/CS-APP3e-labs-master/performancelab/basicMemset.c:27: 未定義參考到「get_comp_counter」 collect2: error: ld returned 1 exit status ``` 後來參考了[鳥哥的 Linux 私房菜-原始碼與 Tarball 套件管理員](http://linux.vbird.org/linux_basic/0520source/0520source_code_and_tarball-fc4.php#intro_library)、[使用 gcc 自製 C/C++ 靜態、共享與動態載入函式庫教學](https://blog.gtwang.org/programming/howto-create-library-using-gcc/)和[我使用过的Linux命令之ar - 创建静态库.a文件](http://codingstandards.iteye.com/blog/1142358)解決了這個問題。 ```clike= $ gcc -c -o clock.o clock.c $ ar -rcs libclock.a clock.o $ gcc basicMemset.c -L. -lclock -o basicMemset_static $ gcc basicMemset.c libclock.a -o basicMemset_static $ ldd basicMemset_static ``` - 編譯時使用`gcc -o basicMemset basicMemset.c -L. -lclock`就可以成功編譯。 ## effectiveMemset.c CPE 測試 - [DreamAndDead](https://github.com/DreamAndDead/CSAPP-3e-Solutions/blob/master/chapter5/code/5.17.c) 此為在 github 上找到的版本。 ```clike= #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include "clock.h" void* effective_memset(void *s, unsigned long cs, size_t n,double *cyc) { /* align to K */ size_t K = sizeof(unsigned long); size_t cnt = 0; unsigned char *schar = s; start_comp_counter(); while (cnt < n) { if ((size_t)schar % K == 0) { break; } *schar++ = (unsigned char)cs; cnt++; } /* set K chars one time */ unsigned long *slong = (unsigned long *)schar; size_t rest = n - cnt; size_t loop = rest / K; size_t tail = rest % K; for (size_t i = 0; i < loop; i++) { *slong++ = cs; } /* pad the tail part */ schar = (unsigned char *)slong; for (size_t i = 0; i < tail; i++) { *schar++ = (unsigned char)cs; } *cyc = get_comp_counter(); return s; } int main(int argc, char const *argv[]) { size_t space = sizeof(char) * 65537; void* effective_space = malloc(sizeof(char)*65537); unsigned long effective_fill = ~0; double cyc ; effective_memset(effective_space, effective_fill, space, &cyc); float CPE = cyc / 65537; printf("CPE = %f\n", CPE); return 0; } ``` ```clike= $ gcc -o effectiveMemset effectiveMemset.c -L. -lclock $ ./effectiveMemset CPE = 1.746708 ``` ## myMemset.c CPE 測試 ```clike= #include <stdio.h> #include <stdint.h> #include <inttypes.h> #include <stdlib.h> #include <string.h> #include "clock.h" void *my_memset(void *s, int cs, size_t n,double *cyc) { uint8_t *schar = (uint8_t *) s; uint64_t check = 0x07; start_comp_counter(); while (n-- && ((uint64_t) schar & check)) { *(schar++) = (uint8_t) cs; } uint64_t *slong = (uint64_t *)((void *) schar); while((n/8) > 0){ *slong++ = (uint64_t) cs; n -= 8; } schar = (uint8_t *)((void *) slong); while(n--){ *schar++ = (uint8_t) cs; } *cyc = get_comp_counter(); return s; } int main(int argc, char const *argv[]) { size_t space = sizeof(char) * 65537; void* my_space = malloc(sizeof(char)*65537); unsigned long my_fill = ~0; double cyc ; my_memset(my_space, my_fill, space, &cyc); double CPE = cyc / 65537; printf("CPE = %f\n", CPE); free(my_space); return 0; } ``` ```clike= $ gcc -o myMemset -g myMemset.c -L. -lclock $ ./myMemset CPE = 1.372446 ``` ## 問題討論 ### 為何 memset 第二個傳入參數型態為 int 而不是直接用 char? - 1.為了兼容用字符常量對字符串或是字符數組的初始化(字符常量(如“A”)在C語言中被認為成int類型) - 在 c 語言規格書(6.4.4.4)中提到 - >An **integer character constant** has type **int**. The value of an integer character constant containing a single character that maps to a single-byte execution character is the numerical value of the representation of the mapped character interpreted as an integer. The value of an integer character constant containing more than one character (e.g., 'ab'), or containing a character or escape sequence that does not map to a single-byte execution character, is implementation-defined. If an integer character constant contains a single character or escape sequence, its value is the one that results when an object with type char whose value is that of the single character or escape sequence is converted to type int. - 2.為了照顧較舊的代碼。 ```clike= #include <stdio.h> void test(char a) { printf("%d\t", sizeof(a)); } void main(void) { test('a'); printf("%d\t", sizeof(char) ); printf("%d\t", sizeof('x') ); printf("%d\n", sizeof(int) ); } ``` 輸出為:1 1 4 4 在C89標準出來之前,C的代碼中並不強制函數原型的宣告,如果一個函數的呼叫出現在了它的宣告之前,編譯器會去假設一個聲明比如: ```clike= void bar() { int a = foo(5); } int foo(int x) { return x + 1; } ``` - 在這段程式碼中,foo在bar中被呼叫,但是宣告卻在其之後,現在的編譯器是會給出編譯錯誤的,但是在C89之前,編譯器會根據函數呼叫的語句,int a = FOO(5)來猜出FOO的函數原型,比如傳入的值是5,就是一個int,返回值也是一個int。在一些較舊的程式碼中,memset的呼叫可以發生在它被宣告之前。沒有先宣告函數的原型,則不能向函數傳遞char類型的參數,如果這麼做了,那麼char會被向上轉換成int類型,函數的返回值也會是int類型。但在C89之後,函數聲明變成了必須的,於是memset就一定要被先宣告出來,但是為了照顧已有的比較舊的程式碼,將memse第二個參數定義成int型。 - 參考資料:[浅谈 memset 函数的第二个参数为什么是 int 而不是 char](https://blog.csdn.net/keypeople/article/details/51279250) ### 為何記憶體要對齊? - 電腦的記憶體通常會設計成以word-sized進行存取會最有效率。word是存取記憶體最自然的單位。word的大小是電腦架構所定義,一般現代的系統通常word不是4bytes(32bit)就是8bytes(64bit)。早期的電腦記憶體只能以word為單位進行存取,因此所有的記憶體存取都要落在以word為倍數的邊界上。但現代的電腦架構通常能夠存取小至單獨的byte,大至多個word。 - 參考資料:[關於記憶體對齊(Alignment)](http://opass.logdown.com/posts/743054-about-memory-alignment)、[你所不知道的C語言:記憶體管理、對齊及硬體特性(data alignment)](https://hackmd.io/s/BkuMDQ9K7#data-alignment) ## performancelab - 參考 [team15](https://hackmd.io/s/H1bmDn5am#performancelab) 的共筆筆記,用了 [zxc479773533](https://github.com/zxc479773533) 的 [performacelab](https://github.com/zxc479773533/CS-APP3e-labs/tree/master/performancelab) 評估程式的 CPE - 以下為 clock.c 的程式碼 ```clike= #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/times.h> #include "clock.h" /* * Routines for using the cycle counter */ /* Detect whether running on Alpha */ #ifdef __alpha #define IS_ALPHA 1 #else #define IS_ALPHA 0 #endif /* Detect whether running on x86 */ #ifdef __i386__ #define IS_x86 1 #else #define IS_x86 0 #endif //#if IS_x86 /* $begin x86cyclecounter */ /* Initialize the cycle counter */ static unsigned cyc_hi = 0; static unsigned cyc_lo = 0; /* Set *hi and *lo to the high and low order bits of the cycle counter. Implementation requires assembly code to use the rdtsc instruction. */ void access_counter(unsigned *hi, unsigned *lo) { asm("rdtsc; movl %%edx,%0; movl %%eax,%1" /* Read cycle counter */ : "=r" (*hi), "=r" (*lo) /* and move results to */ : /* No input */ /* the two outputs */ : "%edx", "%eax"); } /* Record the current value of the cycle counter. */ void start_counter() { access_counter(&cyc_hi, &cyc_lo); } /* Return the number of cycles since the last call to start_counter. */ double get_counter() { unsigned ncyc_hi, ncyc_lo; unsigned hi, lo, borrow; double result; /* Get cycle counter */ access_counter(&ncyc_hi, &ncyc_lo); /* Do double precision subtraction */ lo = ncyc_lo - cyc_lo; borrow = lo > ncyc_lo; hi = ncyc_hi - cyc_hi - borrow; result = (double) hi * (1 << 30) * 4 + lo; if (result < 0) { fprintf(stderr, "Error: counter returns neg value: %.0f\n", result); } return result; } /* $end x86cyclecounter */ //#endif /* x86 */ double ovhd() { /* Do it twice to eliminate cache effects */ int i; double result; for (i = 0; i < 2; i++) { start_counter(); result = get_counter(); } return result; } /* $begin mhz */ /* Estimate the clock rate by measuring the cycles that elapse */ /* while sleeping for sleeptime seconds */ double mhz_full(int verbose, int sleeptime) { double rate; start_counter(); sleep(sleeptime); rate = get_counter() / (1e6*sleeptime); if (verbose) printf("Processor clock rate ~= %.1f MHz\n", rate); return rate; } /* $end mhz */ /* Version using a default sleeptime */ double mhz(int verbose) { return mhz_full(verbose, 2); } /** Special counters that compensate for timer interrupt overhead */ static double cyc_per_tick = 0.0; #define NEVENT 100 #define THRESHOLD 1000 #define RECORDTHRESH 3000 /* Attempt to see how much time is used by timer interrupt */ static void callibrate(int verbose) { double oldt; struct tms t; clock_t oldc; int e = 0; times(&t); oldc = t.tms_utime; start_counter(); oldt = get_counter(); while (e <NEVENT) { double newt = get_counter(); if (newt-oldt >= THRESHOLD) { clock_t newc; times(&t); newc = t.tms_utime; if (newc > oldc) { double cpt = (newt-oldt)/(newc-oldc); if ((cyc_per_tick == 0.0 || cyc_per_tick > cpt) && cpt > RECORDTHRESH) cyc_per_tick = cpt; /* if (verbose) printf("Saw event lasting %.0f cycles and %d ticks. Ratio = %f\n", newt-oldt, (int) (newc-oldc), cpt); */ e++; oldc = newc; } oldt = newt; } } /* ifdef added by Sanjit - 10/2001 */ #ifdef DEBUG if (verbose) printf("Setting cyc_per_tick to %f\n", cyc_per_tick); #endif } static clock_t start_tick = 0; void start_comp_counter() { struct tms t; if (cyc_per_tick == 0.0) callibrate(1); times(&t); start_tick = t.tms_utime; start_counter(); } double get_comp_counter() { double time = get_counter(); double ctime; struct tms t; clock_t ticks; times(&t); ticks = t.tms_utime - start_tick; ctime = time - ticks*cyc_per_tick; /* printf("Measured %.0f cycles. Ticks = %d. Corrected %.0f cycles\n", time, (int) ticks, ctime); */ return ctime; } ``` - start_counter - 透過access_counter 去得到 x86 的 cycle counter - get_counter - 去計算 start_counter 到 get_counter 之間經過了幾個 cycle - result = (double) hi * (1 << 30) * 4 + lo 相當於 result = hi << 32 + lo 形成一個 64-bits 的結果 ## 參考資料 - [memset 實現](https://www.youtube.com/watch?v=-i9epX-M-Tw&ab_channel=%E7%B6%B2%E8%B7%AF%E6%95%99%E5%AD%B8%E5%BD%B1%E7%89%87%E6%94%B6%E9%9B%86) - [性能杀手:”潜伏”的memset](https://yq.aliyun.com/articles/2539) - [透彻分析C/C++中memset函数](https://blog.csdn.net/dan15188387481/article/details/49621447) - [memset()的效率以及源码分析](https://blog.csdn.net/Hackbuteer1/article/details/7343189) - [memset详解](https://blog.csdn.net/leonharetd/article/details/8666384)