contributed by <vic85821
>
jkrvivian
不小心就把windows洗掉了…vic85821
一併列出硬體規格,像是 CPU processor, cache memory 等資訊,這樣日後實驗重現才有依據 jserv
因為我的硬體設備比較特別,linux抓不到我的無線網卡,因此要額外安裝驅動程式
sudo lshw -C network
查詢網卡產品名稱 : MT7630ecd
到該檔案所在的目錄$ sudo chmod +x install
$ sudo ./install
$ perf list
List of pre-defined events (to be used in -e):
branch-instructions OR branches [Hardware event]
branch-misses [Hardware event]
bus-cycles [Hardware event]
cache-misses [Hardware event]
cache-references [Hardware event]
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
ref-cycles [Hardware event]
stalled-cycles-frontend OR idle-cycles-frontend [Hardware event]
alignment-faults [Software event]
bpf-output [Software event]
context-switches OR cs [Software event]
cpu-clock [Software event]
cpu-migrations OR migrations [Software event]
dummy [Software event]
emulation-faults [Software event]
major-faults [Software event]
minor-faults [Software event]
page-faults OR faults [Software event]
範例效能評析測試
第一次使用perf,還不是很熟悉 試了很多次
perf top -p $pid
才成功!! vic8521Sat, Sep 24, 2016 5:17 PM「程式輸出的文字訊息」請勿使用圖片表示,請改由文字貼上課程助教
之前大二在程式語言,曾經對matrix multiplication,以不同的方法執行(暴力解, 施特拉森演算法, Concurrency),並對效能進行分析
用來檢查coding style是否符合
astyle --style=kr --indent=spaces=4 --indent-switches --suffix=none *.[ch]
更改程式碼以降低cache-miss的比例
struct
typedef struct __PHONE_BOOK_ENTRY {
char lastName[MAX_LAST_NAME_SIZE];
char firstName[16];
char email[16];
char phone[10];
char cell[10];
char addr1[16];
char addr2[16];
char city[16];
char state[2];
char zip[5];
struct __PHONE_BOOK_ENTRY *pNext;
} entry;
findName
entry *findName(char lastname[], entry *pHead)
{
while (pHead != NULL) {
if (strcasecmp(lastname, pHead->lastName) == 0)
return pHead;
pHead = pHead->pNext;
}
return NULL;
}
append
entry *append(char lastName[], entry *e)
{
/* allocate memory for the new entry and put lastName */
e->pNext = (entry *) malloc(sizeof(entry));
e = e->pNext;
strcpy(e->lastName, lastName);
e->pNext = NULL;
return e;
}
make cache-test
size of entry : 136 bytes
execution time of append() : 0.043322 sec
execution time of findName() : 0.005194 sec
Performance counter stats for './phonebook_orig' (100 runs):
2,037,607 cache-misses # 94.069 % of all cache refs ( +- 0.38% )
2,166,081 cache-references ( +- 0.32% )
261,123,688 instructions # 1.36 insns per cycle ( +- 0.02% )
191,871,256 cycles ( +- 0.85% )
0.070512191 seconds time elapsed ( +- 1.71% )
main.c的一段程式碼
e = pHead;
/* the givn last name to find */
char input[MAX_LAST_NAME_SIZE] = "zyxel";
e = pHead;
assert(findName(input, e) &&
"Did you implement findName() in " IMPL "?");
assert(0 == strcmp(findName(input, e)->lastName, "zyxel"));
可以發現都是去找"zyxel",並且findname()都是根據lastname去搜尋
1.資料結構優化
typedef struct __PHONE_BOOK_ENTRY {
char lastName[MAX_LAST_NAME_SIZE];
union {
char firstName[16];
char email[16];
char phone[10];
char cell[10];
char addr1[16];
char addr2[16];
char city[16];
char state[2];
char zip[5];
};
struct __PHONE_BOOK_ENTRY *pNext;
} entry;
執行結果
make cache-test
size of entry : 40 bytes
execution time of append() : 0.072287 sec
execution time of findName() : 0.003676 sec
Performance counter stats for './phonebook_opt' (100 runs):
389,500 cache-misses # 66.630 % of all cache refs ( +- 0.33% )
584,572 cache-references ( +- 1.12% )
243,952,419 instructions # 1.72 insns per cycle ( +- 0.02% )
141,447,587 cycles ( +- 0.83% )
0.050986039 seconds time elapsed ( +- 1.70% )
cache-miss 原本 94.069%
-> 優化後 66.630%
老師上課也有說過,單純的簡化資料結構,也可以達到最佳化!!是真的!!
"example" 的簡寫是 e.g.,不是 "ex",後者是拿來描述前女友、前東家等詞彙
這段程式碼已經改變預期行為了,作最佳化不能改變正確性。 jserv
typedef struct _PHONE_BOOK_DETAIL {
char firstName[16];
char email[16];
char phone[10];
char cell[10];
char addr1[16];
char addr2[16];
char city[16];
char state[2];
char zip[5];
} detail;
typedef struct _PHONE_BOOK_ENTRY {
char lastName[MAX_LAST_NAME_SIZE];
detail* detailDate;
struct _PHONE_BOOK_ENTRY *pNext;
} entry;
執行結果
perf stat -r 100 -e cache-misses,cache-references,L1-dcache-load-misses,L1-dcache-store-misses,L1-dcache-prefetch-misses,L1-icache-load-misses ./phonebook_opt
size of entry : 24 bytes
execution time of append() : 0.070298 sec
execution time of findName() : 0.003760 sec
Performance counter stats for './phonebook_opt' (100 runs):
252,673 cache-misses # 67.435 % of all cache refs ( +- 0.44% ) (64.12%)
374,694 cache-references ( +- 0.82% ) (64.44%)
713,048 L1-dcache-load-misses ( +- 3.34% ) (65.51%)
240,697 L1-dcache-store-misses ( +- 1.05% ) (70.62%)
522,278 L1-dcache-prefetch-misses ( +- 3.61% ) (73.19%)
87,401 L1-icache-load-misses ( +- 3.72% ) (67.82%)
0.046339100 seconds time elapsed ( +- 1.24% )
備註:這部份都尚未修改findname() append(),而是使用phonebook_orig.c 裡的function
2.更改搜尋方法
+-----+
|node |
+----+ +----+ +-------+-----+
+----+ |node| |node| | |
|node| +---> +-------+ +--------+ +--v-+ +-v--+
+----+ | +----> | | +---> |node| |node|
+--v--+ | | +-----------+ +----+
|node | +--v-+ +--v-+ | |
+-----+ |node| |node| +-v--+ +-v--+
+----+ +----+ |node| |node|
+----+ +----+
對資料結構稍做更動,增加 bst
#define OPT 1
#define BST 1
typedef struct _PHONE_BOOK_DETAIL {
char firstName[16];
char email[16];
char phone[10];
char cell[10];
char addr1[16];
char addr2[16];
char city[16];
char state[2];
char zip[5];
} detail;
typedef struct _PHONE_BOOK_ENTRY {
char lastName[MAX_LAST_NAME_SIZE];
detail *detailDate;
struct _PHONE_BOOK_ENTRY *pNext;
} entry;
typedef struct _PHONE_BOOK_BST {
struct _PHONE_BOOK_ENTRY *data;
struct _PHONE_BOOK_BST *left;
struct _PHONE_BOOK_BST *right;
} bst;
bst *findName(char lastname[], bst *root);
entry *append(char lastName[], entry *e);
bst *build_bst(entry** pHead,int num);
#endif
建立binary search tree雖然findname()時間變快許多,但是一開始還要額外花時間在建樹
//build binary search
bst *build_bst(entry** pHead,int num)
{
if(num == 0)
return NULL;
bst *root,*left;
//recursive build left leaf
left = build_bst(pHead, num>>1);
// build root(center node)
root = (bst *) malloc(sizeof(bst));
root -> data = (entry *)malloc(sizeof(entry));
strcpy((root->data)->lastName, (*pHead)->lastName);
root->left = left;
// build right left
entry *tmp = *pHead;
*pHead = (*pHead)->pNext;
free(tmp);
root->right = build_bst(pHead, num-(num>>1)-1);
return root;
}
執行結果
perf stat -r 100 -e cache-misses,cache-references,L1-dcache-load-misses,L1-dcache-store-misses,L1-dcache-prefetch-misses,L1-icache-load-misses ./phonebook_opt
size of entry : 32 bytes
execution time of append() : 0.080784 sec # 包含建樹,可以看出用了許多時間
execution time of findName() : 0.000001 sec
Performance counter stats for './phonebook_opt' (100 runs):
555,542 cache-misses # 85.642 % of all cache refs ( +- 0.51% ) (65.22%)
648,682 cache-references ( +- 0.47% ) (66.16%)
1,059,861 L1-dcache-load-misses ( +- 0.47% ) (67.19%)
772,671 L1-dcache-store-misses ( +- 0.35% ) (68.97%)
314,794 L1-dcache-prefetch-misses ( +- 0.66% ) (68.96%)
89,415 L1-icache-load-misses ( +- 1.78% ) (66.29%)
0.100239647 seconds time elapsed ( +- 2.05% )
bst *left,*right
,在建樹的時候使用遞迴,造成大量cache miss之前很少用到makefile,如果有也只是簡單打幾行compile的指令,第一次看到如同這次的Makefile
CC ?= gcc
CFLAGS_common ?= -Wall -std=gnu99
CFLAGS_orig = -O0
CFLAGS_opt = -O0
EXEC = phonebook_orig phonebook_opt
all: $(EXEC)
SRCS_common = main.c
phonebook_orig: $(SRCS_common) phonebook_orig.c phonebook_orig.h
$(CC) $(CFLAGS_common) $(CFLAGS_orig) \
-DIMPL="\"$@.h\"" -o $@ \
$(SRCS_common) $@.c
phonebook_opt: $(SRCS_common) phonebook_opt.c phonebook_opt.h
$(CC) $(CFLAGS_common) $(CFLAGS_opt) \
-DIMPL="\"$@.h\"" -o $@ \
$(SRCS_common) $@.c
run: $(EXEC)
echo 3 | sudo tee /proc/sys/vm/drop_caches
watch -d -t "./phonebook_orig && echo 3 | sudo tee /proc/sys/vm/drop_caches"
cache-test: $(EXEC)
perf stat --repeat 100 \
-e cache-misses,cache-references,instructions,cycles \
./phonebook_orig
perf stat --repeat 100 \
-e cache-misses,cache-references,instructions,cycles \
./phonebook_opt
output.txt: cache-test calculate
./calculate
plot: output.txt
gnuplot scripts/runtime.gp
calculate: calculate.c
$(CC) $(CFLAGS_common) $^ -o $@
.PHONY: clean
clean:
$(RM) $(EXEC) *.o perf.* \
calculate orig.txt opt.txt output.txt runtime.png