# System Call states

## Hello.c

```c=
//Hello.c
#include <stdio.h>

int main(void){
    printf("Hello linux kernel\n");
    return 0;
}
```

利用gcc編譯成執行檔hello之後,使用strace追蹤它所發出的system call,並用-o選項把追蹤結果寫入hello.log(strace的選項必須放在要追蹤的程式之前)

```c=
strace -o hello.log ./hello
```

會得到 ```cmake= execve("./hello", ["./hello"], 0x7ffec1847c90 /* 62 vars */) = 0 brk(NULL) = 0x555ff5ee0000 access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory) access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=77067, ...}) = 0 mmap(NULL, 77067, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f3ff20ff000 close(3) = 0 access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory) openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\240\35\2\0\0\0\0\0"..., 832) = 832 fstat(3, {st_mode=S_IFREG|0755, st_size=2030928, ...}) = 0 mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f3ff20fd000 mmap(NULL, 4131552, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f3ff1af8000 mprotect(0x7f3ff1cdf000, 2097152, PROT_NONE) = 0 mmap(0x7f3ff1edf000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1e7000) = 0x7f3ff1edf000 mmap(0x7f3ff1ee5000, 15072, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f3ff1ee5000 close(3) = 0 arch_prctl(ARCH_SET_FS, 0x7f3ff20fe4c0) = 0 mprotect(0x7f3ff1edf000, 16384, PROT_READ) = 0 mprotect(0x555ff5c25000, 4096, PROT_READ) = 0 mprotect(0x7f3ff2112000, 4096, PROT_READ) = 0 munmap(0x7f3ff20ff000, 77067) = 0 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0 brk(NULL) = 0x555ff5ee0000 brk(0x555ff5f01000) = 0x555ff5f01000 write(1, "Hello linux kernel\n", 19) = 19 exit_group(0) = ? 
+++ exited with 0 +++ ```

可以看到程式是透過write這個system call把字串輸出。若改用python寫同樣的程式來測試,追蹤結果會變成很長一串,代表發出了更多的system call。

## systemcall的包裝函數(Wrapper Function)

一般來說無法直接用C等高階語言發出system call,必須借助assembly language(MOV、ADD等instruction)才可以。所以標準函式庫提供了Wrapper function,由它在內部代為發出system call,我們只要在高階語言中呼叫它就可以了。以下以getppid作為範例

```c=
//loop.c
#include <sys/types.h>
#include <unistd.h>

int main(void){
    for(;;)
        getppid();
}
```

getppid()會回傳父行程的pid。可以使用sar命令來得知各個CPU的使用狀況

```c=
sar -P ALL 1
```

並且用下面的指令在背景執行loop,shell會顯示它的pid

```
./loop &
```

```
廿二時四分十九秒 CPU %user %nice %system %iowait %steal %idle
廿二時四分廿秒 all 9.58 0.00 12.85 0.00 0.00 77.57
廿二時四分廿秒 0 0.00 0.00 0.00 0.00 0.00 100.00
廿二時四分廿秒 1 41.30 0.00 58.70 0.00 0.00 0.00
廿二時四分廿秒 2 2.68 0.00 0.89 0.00 0.00 96.43
廿二時四分廿秒 3 0.00 0.00 0.00 0.00 0.00 100.00

廿二時四分廿秒 CPU %user %nice %system %iowait %steal %idle
廿二時四分廿一秒 all 10.08 0.00 12.34 0.00 0.00 77.58
廿二時四分廿一秒 0 0.00 0.00 0.95 0.00 0.00 99.05
廿二時四分廿一秒 1 42.35 0.00 57.65 0.00 0.00 0.00
廿二時四分廿一秒 2 2.91 0.00 0.00 0.00 0.00 97.09
廿二時四分廿一秒 3 0.00 0.00 0.00 0.00 0.00 100.00

廿二時四分廿一秒 CPU %user %nice %system %iowait %steal %idle
廿二時四分廿二秒 all 9.62 0.00 13.67 0.00 0.00 76.71
廿二時四分廿二秒 0 0.00 0.00 0.00 0.00 0.00 100.00
廿二時四分廿二秒 1 40.00 0.00 60.00 0.00 0.00 0.00
廿二時四分廿二秒 2 1.96 0.00 0.00 0.00 0.00 98.04
廿二時四分廿二秒 3 0.00 0.00 0.00 0.00 0.00 100.00

廿二時四分廿二秒 CPU %user %nice %system %iowait %steal %idle
廿二時四分廿三秒 all 9.61 0.00 12.99 0.00 0.00 77.40
廿二時四分廿三秒 0 0.00 0.00 0.00 0.00 0.00 100.00
廿二時四分廿三秒 1 42.35 0.00 57.65 0.00 0.00 0.00
廿二時四分廿三秒 2 2.04 0.00 0.00 0.00 0.00 97.96
廿二時四分廿三秒 3 0.00 0.00 0.00 0.00 0.00 100.00

廿二時四分廿三秒 CPU %user %nice %system %iowait %steal %idle
廿二時四分廿四秒 all 11.81 0.00 16.83 0.00 0.00 71.36
廿二時四分廿四秒 0 0.00 0.00 0.00 0.00 0.00 100.00
廿二時四分廿四秒 1 38.78 0.00 61.22 0.00 0.00 0.00
廿二時四分廿四秒 2 8.00 0.00 6.00 0.00 0.00 86.00
廿二時四分廿四秒 3 0.00 0.00 2.04 0.00 0.00 97.96
```

## 標準C函式庫

可以使用ldd來查看程式與何種函式庫進行連結 ```c= $ ldd /bin/echo linux-vdso.so.1 (0x00007ffd25943000) libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f9c95a5b000) /lib64/ld-linux-x86-64.so.2 
(0x00007f9c96055000) ```

以及剛剛使用的loop檔

```c=
$ ldd loop
linux-vdso.so.1 (0x00007ffc0d92e000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f833a263000)
/lib64/ld-linux-x86-64.so.2 (0x00007f833a856000)
```

也可以從這邊得知python直譯器本身也連結了不少C語言的函式庫(包含libc),所以才會說python底層還是基於C來跑。

```c=
$ ldd /usr/bin/python3
linux-vdso.so.1 (0x00007ffc49924000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f4640eb4000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f4640c95000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f4640a91000)
libutil.so.1 => /lib/x86_64-linux-gnu/libutil.so.1 (0x00007f464088e000)
libexpat.so.1 => /lib/x86_64-linux-gnu/libexpat.so.1 (0x00007f464065c000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007f464043f000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f46400a1000)
/lib64/ld-linux-x86-64.so.2 (0x00007f46412a5000)
```

## OS提供的指令

OS所提供的程式可以簡單分成下方幾類:

* 系統初始化 init
* 變更OS的運作設定 sysctl,nice,sync
* 檔案操作 touch,mkdir
* 文字資料處理 grep,sort,uniq
* 效能量測 sar,iostat
* 編譯器 gcc
* 腳本執行環境 perl,python,ruby
* shell bash,sh
* 視窗系統 X11

# 行程Process

## fork()

在系統中,PID(process identifier)0代表的是kernel本身(用來做SWAP交換分頁),PID 1則是init,其他的行程都是由既有的行程透過fork()產生出來的。例如Web server就可以fork出兩個child process,產生出web process和database process。

```c=
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <err.h>

static void child(){
    printf("I'm child,my pid is %d\n",getpid());
    exit(EXIT_SUCCESS);
}

static void parent(pid_t pid_c){
    printf("I'm parent,my pid is %d.\n And my child pid is %d.\n",getpid(),pid_c);
    exit(EXIT_SUCCESS);
}

int main(void){
    pid_t process;
    process = fork();
    if(process==-1){
        err(EXIT_FAILURE,"fork() failed");
    }
    else if(process == 0){
        child(); //fork函數會把子行程的pid返回給母行程、將0返回給子行程。利用這個來讓母行程與子行程處理分支
    }
    else{
        parent(process); //母行程會輸出本身的pid與子行程的pid後結束,子行程會在輸出本身的pid後結束
    }
    err(EXIT_FAILURE,"shouldn't reach here");
}
```

輸出結果如下 ```c= $ ./fork I'm parent,my pid is 7774. And my child pid is 7775. 
I'm child,my pid is 7775 ```

## execve()

當我們想在執行另一個程式之後依然保有控制權,也就是要新增一個行程來執行別的程式時,就會先在母行程fork()出子行程,再由子行程呼叫exec(),這就是所謂的fork and exec的流程。下面的範例中,子行程會用execve()執行/bin/echo,母行程則會在輸出自身pid及子行程pid之後結束。

```c=
//ForkandExec.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <err.h>

static void child(){
    // argv[0] should name the file being executed; the list must end with NULL.
    char *args[] = {"/bin/echo","Hi Linux",NULL};
    printf("I'm child,my pid is %d\n",getpid());
    fflush(stdout); // force the buffered text out before the process image is replaced
    execve("/bin/echo",args,NULL);
    // execve() only returns on failure.
    err(EXIT_FAILURE,"exec() failed");
}

static void parent(pid_t pid_c){
    printf("I'm parent,my pid is %d.\n And my child pid is %d.\n",getpid(),pid_c);
    exit(EXIT_SUCCESS);
}

int main(void){
    pid_t process;
    process = fork();
    if(process==-1){
        err(EXIT_FAILURE,"fork() failed");
    }
    else if(process == 0){
        child();
    }
    else{
        parent(process);
    }
    err(EXIT_FAILURE,"shouldn't reach here");
}
```

執行結果

```c=
$ ./ForkandExec
I'm parent,my pid is 8656.
I'm child,my pid is 8657
And my child pid is 8657.
$ Hi Linux
```

![](https://hackmd.io/_uploads/BJH0l5-c3.png)

## 排程器

行程會在各核心上執行。下面的程式會先估算要跑幾次迴圈才會耗費一毫秒,再讓各個子行程依此執行負載並記錄進度。實驗中n(nproc)代表要同時運作的行程數,total代表要讓程式運作的合計時間,resol代表統計資訊的採集間隔。 並使用taskset指定在某個CPU上執行。

```c=
taskset -c 0 ./sheduler 1 100 1 > core-process.txt
```

scheduler.c程式碼如下

```c=
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>

#define NLOOP_FOR_ESTIMATION 10000000000UL
#define NSECS_PER_MSEC 1000000UL
#define NSECS_PER_SEC 1000000000UL

// Nanoseconds elapsed from `before` to `after`.
// `inline` is only a hint (the compiler may ignore it); it suits short,
// frequently called helpers like this one.
static inline long diff_nsec(struct timespec before,struct timespec after){
    return ((after.tv_sec*NSECS_PER_SEC + after.tv_nsec)
            -(before.tv_sec*NSECS_PER_SEC+before.tv_nsec));
}

// Estimate how many empty-loop iterations take one millisecond by timing
// NLOOP_FOR_ESTIMATION iterations with struct timespec / clock_gettime().
// NOTE: the empty loops are the workload, so build without optimization;
// an optimizing compiler is allowed to remove them.
static unsigned long loops_per_msec(){
    struct timespec before,after;
    clock_gettime(CLOCK_MONOTONIC,&before);
    unsigned long i;
    for(i=0;i<NLOOP_FOR_ESTIMATION;i++)
        ;
    clock_gettime(CLOCK_MONOTONIC,&after);
    return NLOOP_FOR_ESTIMATION*NSECS_PER_MSEC/diff_nsec(before,after);
}

// Busy-loop for nloop iterations.
static inline void load(unsigned long nloop){
    unsigned long i;
    for(i=0;i<nloop;i++)
        ;
}

// Child body: run nrecord load slices, timestamp the end of each one, then
// print "<id>\t<elapsed ms since start>\t<progress %>" per slice and exit.
static void child_fn(int id,struct timespec *buf,int nrecord,unsigned long nloop_per_resol,struct timespec start){
    int i;
    for(i=0;i<nrecord;i++){
        struct timespec ts;
        load(nloop_per_resol);
        clock_gettime(CLOCK_MONOTONIC, &ts);
        buf[i] = ts;
    }
    for(i=0;i<nrecord;i++){
        printf("%d\t%ld\t%d\n",id,diff_nsec(start,buf[i])/NSECS_PER_MSEC,(i+1)*100/nrecord);
    }
    exit(EXIT_SUCCESS);
}

// Wait for nproc children (not called by main below).
static void parent_fn(int nproc){
    int i;
    for(i=0;i<nproc;i++)
        wait(NULL);
}

static pid_t *pids;

int main(int argc, char *argv[]){
    int ret = EXIT_FAILURE;
    if(argc<4){
        fprintf(stderr,"usage: %s <nproc> <total[ms]> <resolution [ms]>\n",argv[0]);
        exit(EXIT_FAILURE);
    }
    int nproc = atoi(argv[1]);
    int total = atoi(argv[2]);
    int resol = atoi(argv[3]);
    if(nproc<1){
        fprintf(stderr,"<nproc>(%d) should be >= 1\n", nproc);
        exit(EXIT_FAILURE);
    }
    if(total<1){
        fprintf(stderr,"<total>(%d) should be >= 1\n", total);
        exit(EXIT_FAILURE);
    }
    if(resol<1){
        fprintf(stderr,"<resol>(%d) should be >= 1\n", resol);
        exit(EXIT_FAILURE);
    }
    if(total%resol){
        fprintf(stderr,"<total>(%d) should be multiple of <resolution>(%d)\n", total,resol);
        exit(EXIT_FAILURE);
    }
    int nrecord = total/resol;
    struct timespec *logbuf = malloc (nrecord * sizeof(struct timespec));
    if(!logbuf)
        err(EXIT_FAILURE,"malloc(logbuf) failed");
    puts("estimation workload which takes just one millisecond");
    unsigned long nloop_per_resol = loops_per_msec() * resol;
    puts("end estimation");
    fflush(stdout); // flush before fork() so children do not inherit buffered text
    pids = malloc(nproc * sizeof(pid_t));
    if(pids==NULL){
        warn("malloc(pids) failed");
        goto free_logbuf;
    }
    struct timespec start;
    clock_gettime(CLOCK_MONOTONIC,&start);
    int i,ncreated;
    for(i=0,ncreated=0;i<nproc;i++,ncreated++){
        pids[i] = fork();
        if(pids[i]<0){
            goto wait_children;
        }else if (pids[i]==0){
            child_fn(i,logbuf,nrecord,nloop_per_resol,start);
        }
    }
    ret = EXIT_SUCCESS;
wait_children:
    // fork() failed part-way: interrupt the children that were already
    // started, and warn only when kill() itself fails.
    if(ret == EXIT_FAILURE)
        for(i=0;i<ncreated;i++)
            if(kill(pids[i],SIGINT)<0)
                warn("kill(%d) failed",pids[i]);
    for(i=0;i<ncreated;i++)
        if(wait(NULL)<0)
            warn("wait() failed.");
free_pids:
    free(pids);
free_logbuf:
    free(logbuf);
    exit(ret);
}
```

可以藉由此程式進行不同行程數的結果比較,例如:1、2、4。

## Taskset指令

可以使用作業系統所提供的taskset的命令,並使用-c引數來指定在某個邏輯CPU上運作。

```c=
taskset -c 0 ./sheduler 1 100 1 > core-process.txt
```

上面的指令表示行程數為1、程式運作總時間100毫秒、統計資訊採樣間隔1毫秒,並把結果輸出到core-process.txt上去看結果,之後可以使用繪圖軟體將結果繪製成圖片。

## 行程狀態及轉換

| 狀態 | 意思 |
| -------- | -------- |
| 執行狀態 | 目前正在使用邏輯CPU |
| 待命狀態 | 等待CPU時間分配 |
| 休眠狀態 | 等待某事件的發生。事件發生前是不會使用到CPU時間 |
| 殭屍狀態 | 在行程結束後等待母行程接受結束狀態 |

在Linux中可以使用ps ax命令列出系統中的行程來供確認,例如以下指令可以看到目前有多少行程存在。

```c=
$ ps ax| wc -l
247
```

![](https://hackmd.io/_uploads/Syg7yiqcn.png)

## 吞吐量與延遲(Throughput & Latency)

吞吐量代表單位時間的總工作量。越高越好。公式是已完成的行程數量/經過時間。

延遲代表各處理開始到結束為止經過的時間。越短越好。公式是處理結束時刻-處理開始時刻。

簡單來說,邏輯CPU的運算資源使用得越多(也就是CPU閒置的比率越低),吞吐量就越高。

我們可以透過以下的實驗來進行一下簡單測試,使用loop程式來進行實驗。

首先可以看到runq-sz欄位顯示0,代表沒有執行中或待命中的行程,表示閒置中

```c
$ sar -q 1 1
Linux 5.4.0-150-generic (LinuxKernel18) 廿廿三年七月廿三日 _x86_64_ (4 CPU)

廿三時十九分廿一秒 runq-sz plist-sz ldavg-1 ldavg-5 ldavg-15 blocked
廿三時十九分廿二秒 0 614 0.00 0.00 0.00 0
Average: 0 614 0.00 0.00 0.00 0
```

再來執行loop

```c
$ taskset -c 0 ./loop &
[1] 25127
```

再查看一次,發現runq-sz變成1,出現了一個行程

```c
$ sar -q 1 1
Linux 5.4.0-150-generic (LinuxKernel18) 廿廿三年七月廿三日 _x86_64_ (4 CPU)

廿三時十九分57秒 runq-sz plist-sz ldavg-1 ldavg-5 ldavg-15 blocked
廿三時十九分58秒 1 615 0.15 0.03 0.01 0
Average: 1 615 0.15 0.03 0.01 0
```

再執行第二個程式loop

```c
$ taskset -c 0 ./loop &
[2] 25131
```

可以發現runq-sz變成2,有兩個行程

```c
$ sar -q 1 1
Linux 5.4.0-150-generic (LinuxKernel18) 廿廿三年七月廿三日 _x86_64_ (4 CPU)

廿三時廿分廿七秒 runq-sz plist-sz ldavg-1 ldavg-5 ldavg-15 blocked
廿三時廿分廿八秒 2 616 0.57 0.14 0.05 0
Average: 2 616 0.57 0.14 0.05 0
```

最後殺死兩個行程,完成實驗

```c
$ kill 25127 25131
```

總結來說,在一個邏輯CPU上同一時間進行處理的行程只有一個;在有複數行程可以執行的情況下,就是將各行程分配到適當的time slot,在CPU上依順序輪流處理。

所以在多核心的CPU環境下,若不同時運作複數行程,吞吐量就不會升高;而且就算行程數增加到比邏輯CPU數還多,吞吐量也不會再升高。

## 經過時間與使用時間
在Linux底下可以使用time命令來量測一個程式從開始到結束的時間,結果分為經過時間與使用時間兩種。經過時間是從行程開始之後到結束為止的經過時間,就像是使用碼表去量測開始到結束的時間。使用時間則是代表行程實際使用到邏輯cpu的時間。

![](https://hackmd.io/_uploads/HJpjcJhch.png)

我們使用total(處理的所需時間)10秒及resol(進度顯示)10秒來作為scheduler程式的測試

```c
$ time taskset -c 0 ./sheduler 1 10000 10000
estimation workload which takes just one millisecond
end estimation
0 16293 100

real 0m26.901s
user 0m26.895s
sys 0m0.004s
```

real的值代表經過時間,將user加上sys則可以得到使用時間。user值代表行程在user space執行時使用到的CPU時間,sys值則代表行程從user space委託kernel執行systemcall所使用的CPU時間。這個例子中行程幾乎一直在使用CPU,所以real與user+sys幾乎相同。

行程數為2的執行結果

```c
$ time taskset -c 0 ./sheduler 2 10000 10000
estimation workload which takes just one millisecond
end estimation
0 33275 100
1 33283 100

real 0m43.875s
user 0m43.852s
sys 0m0.004s
```

那如果是兩個CPU一個行程的情況會得到與一個CPU一個行程差不多的結果

```c
$ time taskset -c 0,1 ./sheduler 1 10000 10000
estimation workload which takes just one millisecond
end estimation
0 16719 100

real 0m27.329s
user 0m27.328s
sys 0m0.000s
```

那如果是兩個CPU兩個行程的情況,兩個行程可以同時在不同的CPU上執行,所以經過時間(real)會與一個行程時差不多,而使用時間(user)則與一個CPU兩個行程時差不多,會比經過時間還要長

```c
$ time taskset -c 0,1 ./sheduler 2 10000 10000
estimation workload which takes just one millisecond
end estimation
1 16773 100
0 16812 100

real 0m27.479s
user 0m44.204s
sys 0m0.029s
```

sleep命令會休眠指定的秒數之後結束,休眠期間幾乎不會使用CPU,所以經過時間約為10秒,使用時間卻接近0

```c
$ time sleep 10

real 0m10.003s
user 0m0.001s
sys 0m0.000s
```

實際的行程會以非常複雜的方式轉換各種狀態,這邊也可以使用ps -eo命令並指定etime欄位跟time欄位,將每個行程的pid、命令名、經過時間、使用時間呈現出來看看 ```c $ ps -eo pid,comm,etime,time PID COMMAND ELAPSED TIME 1 systemd 9-00:40:11 00:00:13 2 kthreadd 9-00:40:11 00:00:00 3 rcu_gp 9-00:40:11 00:00:00 4 rcu_par_gp 9-00:40:11 00:00:00 6 kworker/0:0H 9-00:40:11 00:00:00 8 mm_percpu_wq 9-00:40:11 00:00:00 9 ksoftirqd/0 9-00:40:11 00:00:01 10 rcu_sched 9-00:40:11 00:01:22 11 migration/0 9-00:40:11 00:00:12 12 idle_inject/0 9-00:40:11 00:00:00 14 cpuhp/0 9-00:40:11 00:00:00 15 cpuhp/1 9-00:40:11 00:00:00 16 idle_inject/1 9-00:40:11 00:00:00 17 migration/1 9-00:40:11 00:00:12 18 ksoftirqd/1 9-00:40:11 00:00:00 20 kworker/1:0H-kb 9-00:40:11 00:00:00 21 cpuhp/2 9-00:40:11 00:00:00 22 idle_inject/2 9-00:40:11 00:00:00 23 migration/2 9-00:40:11 00:00:11 24 
ksoftirqd/2 9-00:40:11 00:00:00 26 kworker/2:0H 9-00:40:11 00:00:00 27 cpuhp/3 9-00:40:11 00:00:00 28 idle_inject/3 9-00:40:11 00:00:00 29 migration/3 9-00:40:11 00:00:12 30 ksoftirqd/3 9-00:40:11 00:00:05 32 kworker/3:0H-kb 9-00:40:11 00:00:00 33 kdevtmpfs 9-00:40:11 00:00:00 34 netns 9-00:40:11 00:00:00 35 rcu_tasks_kthre 9-00:40:11 00:00:00 36 kauditd 9-00:40:11 00:00:00 37 khungtaskd 9-00:40:11 00:00:01 38 oom_reaper 9-00:40:11 00:00:00 39 writeback 9-00:40:11 00:00:00 40 kcompactd0 9-00:40:11 00:00:00 41 ksmd 9-00:40:11 00:00:00 42 khugepaged 9-00:40:11 00:00:03 89 kintegrityd 9-00:40:11 00:00:00 90 kblockd 9-00:40:11 00:00:00 91 blkcg_punt_bio 9-00:40:11 00:00:00 92 tpm_dev_wq 9-00:40:11 00:00:00 93 ata_sff 9-00:40:11 00:00:00 94 md 9-00:40:11 00:00:00 95 edac-poller 9-00:40:11 00:00:00 96 devfreq_wq 9-00:40:11 00:00:00 97 watchdogd 9-00:40:11 00:00:00 102 kswapd0 9-00:40:10 00:00:00 103 ecryptfs-kthrea 9-00:40:10 00:00:00 105 kthrotld 9-00:40:10 00:00:00 106 acpi_thermal_pm 9-00:40:10 00:00:00 107 scsi_eh_0 9-00:40:10 00:00:00 108 scsi_tmf_0 9-00:40:10 00:00:00 109 scsi_eh_1 9-00:40:10 00:00:00 110 scsi_tmf_1 9-00:40:10 00:00:00 112 vfio-irqfd-clea 9-00:40:10 00:00:00 114 ipv6_addrconf 9-00:40:10 00:00:00 123 kstrp 9-00:40:10 00:00:00 126 kworker/u9:0 9-00:40:10 00:00:00 139 charger_manager 9-00:40:10 00:00:00 182 scsi_eh_2 9-00:40:10 00:00:00 183 scsi_tmf_2 9-00:40:10 00:00:00 185 kworker/0:1H-kb 9-00:40:10 00:00:05 206 kworker/3:1H-kb 9-00:40:09 00:00:05 208 jbd2/sda1-8 9-00:40:09 00:00:03 209 ext4-rsv-conver 9-00:40:09 00:00:00 223 kworker/1:1H-kb 9-00:40:09 00:00:08 239 kworker/2:1H-kb 9-00:40:09 00:00:08 244 systemd-journal 9-00:40:09 00:00:04 271 systemd-udevd 9-00:40:09 00:00:01 281 loop0 9-00:40:09 00:00:00 297 loop1 9-00:40:09 00:00:00 298 loop2 9-00:40:09 00:00:00 319 systemd-resolve 9-00:40:09 00:00:03 322 loop3 9-00:40:09 00:00:00 348 loop4 9-00:40:09 00:00:00 349 loop5 9-00:40:09 00:00:00 350 loop6 9-00:40:09 00:00:00 377 loop7 9-00:40:09 
00:00:00 383 iprt-VBoxWQueue 9-00:40:09 00:00:00 398 irq/18-vmwgfx 9-00:40:09 00:00:01 402 ttm_swap 9-00:40:09 00:00:00 412 cryptd 9-00:40:08 00:00:00 415 loop9 9-00:40:08 00:00:00 522 dbus-daemon 9-00:40:08 00:00:29 573 cron 9-00:40:08 00:00:01 574 systemd-logind 9-00:40:08 00:00:01 575 acpid 9-00:40:08 00:00:00 576 ModemManager 9-00:40:08 00:00:00 577 networkd-dispat 9-00:40:08 00:00:00 579 wpa_supplicant 9-00:40:08 00:00:05 580 avahi-daemon 9-00:40:08 00:00:00 583 irqbalance 9-00:40:08 00:00:15 587 avahi-daemon 9-00:40:08 00:00:00 591 udisksd 9-00:40:08 00:00:01 595 rsyslogd 9-00:40:08 00:00:00 603 NetworkManager 9-00:40:08 00:00:20 604 accounts-daemon 9-00:40:08 00:00:15 636 polkitd 9-00:40:08 00:00:00 737 unattended-upgr 9-00:40:08 00:00:00 758 gdm3 9-00:40:08 00:00:00 763 VBoxDRMClient 9-00:40:08 00:04:23 766 VBoxService 9-00:40:08 00:01:34 806 gdm-session-wor 9-00:40:07 00:00:00 884 systemd 9-00:40:07 00:00:00 885 (sd-pam) 9-00:40:07 00:00:00 904 gdm-wayland-ses 9-00:40:07 00:00:00 906 dbus-daemon 9-00:40:07 00:00:00 916 gnome-session-b 9-00:40:07 00:00:00 933 gnome-shell 9-00:40:07 00:01:51 941 upowerd 9-00:40:07 00:00:00 959 Xwayland 9-00:40:06 00:00:00 970 at-spi-bus-laun 9-00:40:06 00:00:00 975 dbus-daemon 9-00:40:06 00:00:00 977 at-spi2-registr 9-00:40:06 00:00:00 981 pulseaudio 9-00:40:06 00:00:00 982 rtkit-daemon 9-00:40:06 00:00:14 997 ibus-daemon 9-00:40:06 00:00:00 1000 ibus-dconf 9-00:40:06 00:00:00 1003 ibus-x11 9-00:40:06 00:00:00 1007 ibus-portal 9-00:40:06 00:00:00 1013 xdg-permission- 9-00:40:06 00:00:00 1027 dhclient 9-00:40:05 00:00:00 1041 whoopsie 9-00:40:05 00:00:00 1046 kerneloops 9-00:40:05 00:00:05 1048 kerneloops 9-00:40:05 00:00:05 1100 boltd 9-00:40:05 00:00:00 1104 packagekitd 9-00:40:05 00:00:16 1105 gsd-xsettings 9-00:40:05 00:00:00 1108 gsd-a11y-settin 9-00:40:05 00:00:00 1111 gsd-clipboard 9-00:40:05 00:00:00 1115 gsd-color 9-00:40:05 00:04:32 1116 gsd-datetime 9-00:40:05 00:00:00 1117 gsd-housekeepin 9-00:40:05 00:00:00 1119 
gsd-keyboard 9-00:40:05 00:00:00 1123 gsd-media-keys 9-00:40:05 00:00:00 1129 gsd-mouse 9-00:40:05 00:00:00 1131 gsd-power 9-00:40:05 00:00:00 1134 gsd-print-notif 9-00:40:05 00:00:00 1135 gsd-rfkill 9-00:40:05 00:00:00 1137 gsd-screensaver 9-00:40:05 00:00:00 1147 gsd-sharing 9-00:40:05 00:00:00 1150 gsd-smartcard 9-00:40:05 00:00:00 1152 gsd-sound 9-00:40:05 00:00:00 1162 gsd-wacom 9-00:40:05 00:00:00 1176 colord 9-00:40:05 00:00:00 1199 ibus-engine-sim 9-00:40:05 00:00:00 1221 gdm-session-wor 9-00:39:19 00:00:00 1225 systemd 9-00:39:16 00:00:00 1226 (sd-pam) 9-00:39:16 00:00:00 1239 gnome-keyring-d 9-00:39:16 00:00:00 1243 gdm-x-session 9-00:39:16 00:00:00 1245 Xorg 9-00:39:16 00:04:50 1257 dbus-daemon 9-00:39:16 00:00:00 1260 gnome-session-b 9-00:39:16 00:00:00 1359 VBoxClient 9-00:39:15 00:00:00 1360 VBoxClient 9-00:39:15 00:00:00 1374 VBoxClient 9-00:39:15 00:00:00 1375 VBoxClient 9-00:39:15 00:05:37 1382 VBoxClient 9-00:39:15 00:00:00 1383 VBoxClient 9-00:39:15 00:39:03 1387 VBoxClient 9-00:39:15 00:00:00 1388 VBoxClient 9-00:39:15 00:03:06 1398 ssh-agent 9-00:39:15 00:00:05 1408 at-spi-bus-laun 9-00:39:15 00:00:00 1413 dbus-daemon 9-00:39:15 00:00:00 1415 at-spi2-registr 9-00:39:15 00:00:23 1450 gnome-shell 9-00:39:15 00:09:09 1464 gvfsd 9-00:39:15 00:00:00 1469 gvfsd-fuse 9-00:39:14 00:00:00 1480 pulseaudio 9-00:39:14 00:00:04 1493 ibus-daemon 9-00:39:14 00:00:10 1497 ibus-dconf 9-00:39:14 00:00:00 1499 xdg-permission- 9-00:39:14 00:00:00 1501 ibus-x11 9-00:39:14 00:00:00 1507 ibus-portal 9-00:39:14 00:00:00 1516 gnome-shell-cal 9-00:39:14 00:00:00 1526 evolution-sourc 9-00:39:13 00:00:00 1528 gvfs-udisks2-vo 9-00:39:13 00:00:00 1535 gvfs-mtp-volume 9-00:39:13 00:00:00 1539 gvfs-afc-volume 9-00:39:13 00:00:00 1544 gvfs-gphoto2-vo 9-00:39:13 00:00:00 1547 goa-daemon 9-00:39:13 00:00:00 1551 gvfs-goa-volume 9-00:39:13 00:00:00 1562 goa-identity-se 9-00:39:13 00:00:00 1567 gsd-power 9-00:39:13 00:00:00 1568 gsd-print-notif 9-00:39:13 00:00:00 1571 gsd-rfkill 
9-00:39:13 00:00:00 1573 gsd-screensaver 9-00:39:13 00:00:00 1578 gsd-sharing 9-00:39:13 00:00:00 1580 gsd-smartcard 9-00:39:13 00:00:00 1585 gsd-xsettings 9-00:39:13 00:00:00 1590 gsd-wacom 9-00:39:13 00:00:00 1593 gsd-sound 9-00:39:13 00:00:00 1602 gsd-a11y-settin 9-00:39:13 00:00:00 1605 gsd-color 9-00:39:13 00:04:27 1608 gsd-clipboard 9-00:39:13 00:00:00 1611 gsd-housekeepin 9-00:39:13 00:00:07 1613 gsd-datetime 9-00:39:13 00:00:00 1616 gsd-media-keys 9-00:39:13 00:00:00 1618 gsd-keyboard 9-00:39:13 00:00:00 1625 gsd-mouse 9-00:39:13 00:00:00 1647 gsd-printer 9-00:39:12 00:00:00 1665 nautilus-deskto 9-00:39:12 00:00:43 1667 gsd-disk-utilit 9-00:39:12 00:00:00 1712 evolution-calen 9-00:39:12 00:00:00 1715 gvfsd-trash 9-00:39:12 00:00:00 1731 ibus-engine-sim 9-00:39:12 00:00:03 1742 dconf-service 9-00:39:11 00:00:00 1753 evolution-calen 9-00:39:11 00:00:00 1772 evolution-addre 9-00:39:11 00:00:00 1785 evolution-addre 9-00:39:11 00:00:00 1804 gnome-terminal- 9-00:39:08 00:00:35 1813 bash 9-00:39:07 00:00:00 1831 update-notifier 9-00:38:12 00:00:19 1833 gnome-software 9-00:38:12 00:00:37 1900 deja-dup-monito 9-00:37:12 00:00:00 2001 gvfsd-metadata 9-00:23:26 00:00:00 3390 bash 9-00:21:58 00:00:00 4281 loop10 8-16:49:23 00:00:00 4305 snapd 8-16:49:21 00:02:15 4787 loop11 8-16:47:59 00:00:00 4939 loop12 8-16:47:54 00:00:00 5032 loop13 8-16:47:53 00:00:00 5244 loop14 8-16:47:52 00:00:00 5271 loop15 8-16:47:52 00:00:00 5415 loop16 8-16:47:50 00:00:00 5547 loop17 8-16:47:24 00:00:00 5679 loop18 8-16:47:00 00:00:00 5817 loop19 8-16:45:05 00:00:00 5943 kworker/2:7-eve 8-16:41:34 00:01:03 5948 loop20 8-16:41:34 00:00:00 6071 kworker/1:7-eve 8-16:41:27 00:01:03 6076 loop21 8-16:41:27 00:00:00 6351 kworker/3:5-eve 8-16:39:54 00:01:29 6357 loop22 8-16:39:54 00:00:00 12855 kworker/0:2-eve 6-12:15:22 00:02:43 25281 cupsd 22:14:22 00:00:00 25282 cups-browsed 22:14:22 00:00:00 26240 kworker/2:1-cgr 07:04:16 00:00:00 27079 kworker/u8:1-ev 03:14:09 00:00:00 27090 kworker/u8:0-ev 
03:02:40 00:00:00 27128 kworker/0:1-cgr 02:18:18 00:00:00 27188 kworker/1:0-cgr 01:19:43 00:00:00 27232 kworker/u8:2-ev 23:41 00:00:00 27234 kworker/3:0-cgr 20:18 00:00:00 27254 ps 00:00 00:00:00 ```

可以從上述知道大多數的行程從開始已經過了8-9天左右,也可以看到他們分別的邏輯CPU使用時間。許多程式大部分的時間都處於休眠狀態,是因為它們只有在被用到時才會運作,平常都在等待來自使用者的操作等事件。

若行程不斷使用CPU時間進行處理運算,而且沒有其他運作中的行程存在,使用時間就會接近經過時間與並行執行數相乘的結果。

## 優先權的變更

在System call中有個nice()可以進行優先權的變更,nice值的範圍在[-20,19]之間,預設為0,數值越小優先權越高。高優先權的行程會拿到比較多的CPU時間,低優先權的行程只能獲得較少的CPU時間。任何人都可以調低優先權(調高nice值),但是只有ROOT(sudo)可以調高優先權(調低nice值)。

接著我們改變一下scheduler程式的要求

* 運作的行程數固定為2個
* 將引數1指定為total,引數2指定為resol
* 兩個行程中將其中一個設定為priority 0。另一個為priority 5。

以下為scheduler_nice.c程式

```c
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>

#define NLOOP_FOR_ESTIMATION 10000000000UL
#define NSECS_PER_MSEC 1000000UL
#define NSECS_PER_SEC 1000000000UL

// Nanoseconds elapsed from `before` to `after`.
// `inline` is only a hint (the compiler may ignore it); it suits short,
// frequently called helpers like this one.
static inline long diff_nsec(struct timespec before,struct timespec after){
    return ((after.tv_sec*NSECS_PER_SEC + after.tv_nsec)
            -(before.tv_sec*NSECS_PER_SEC+before.tv_nsec));
}

// Estimate how many empty-loop iterations take one millisecond by timing
// NLOOP_FOR_ESTIMATION iterations with struct timespec / clock_gettime().
static unsigned long loops_per_msec(){
    struct timespec before,after;
    clock_gettime(CLOCK_MONOTONIC,&before);
    unsigned long i;
    for(i=0;i<NLOOP_FOR_ESTIMATION;i++)
        ;
    clock_gettime(CLOCK_MONOTONIC,&after);
    return NLOOP_FOR_ESTIMATION*NSECS_PER_MSEC/diff_nsec(before,after);
}

// Busy-loop for nloop iterations.
static inline void load(unsigned long nloop){
    unsigned long i;
    for(i=0;i<nloop;i++)
        ;
}

// Child body: run nrecord load slices, timestamp the end of each one, then
// print "<id>\t<elapsed ms since start>\t<progress %>" per slice and exit.
static void child_fn(int id,struct timespec *buf,int nrecord,unsigned long nloop_per_resol,struct timespec start){
    int i;
    for(i=0;i<nrecord;i++){
        struct timespec ts;
        load(nloop_per_resol);
        clock_gettime(CLOCK_MONOTONIC, &ts);
        buf[i] = ts;
    }
    for(i=0;i<nrecord;i++){
        printf("%d\t%ld\t%d\n",id,diff_nsec(start,buf[i])/NSECS_PER_MSEC,(i+1)*100/nrecord);
    }
    exit(EXIT_SUCCESS);
}

// Wait for nproc children (not called by main below).
static void parent_fn(int nproc){
    int i;
    for(i=0;i<nproc;i++)
        wait(NULL);
}

static pid_t *pids;

int main(int argc, char *argv[]){
    int ret = EXIT_FAILURE;
    if(argc<3){
        fprintf(stderr,"usage: %s <total[ms]> <resolution[ms]>\n",argv[0]);
        exit(EXIT_FAILURE);
    }
    int nproc = 2; // this variant always runs exactly two children
    int total = atoi(argv[1]);
    int resol = atoi(argv[2]);
    if(total<1){
        fprintf(stderr,"<total>(%d) should be >= 1\n", total);
        exit(EXIT_FAILURE);
    }
    if(resol<1){
        fprintf(stderr,"<resol>(%d) should be >= 1\n", resol);
        exit(EXIT_FAILURE);
    }
    if(total % resol){
        fprintf(stderr,"<total>(%d) should be multiple of <resolution>(%d)\n",total, resol);
        exit(EXIT_FAILURE);
    }
    int nrecord = total/resol;
    struct timespec *logbuf = malloc (nrecord * sizeof(struct timespec));
    if(!logbuf)
        err(EXIT_FAILURE,"malloc(logbuf) failed");
    unsigned long nloop_per_resol = loops_per_msec() * resol;
    pids = malloc(nproc * sizeof(pid_t));
    if(pids==NULL){
        warn("malloc(pids) failed");
        goto free_logbuf;
    }
    struct timespec start;
    clock_gettime(CLOCK_MONOTONIC,&start);
    int i,ncreated;
    for(i=0,ncreated=0;i<nproc;i++,ncreated++){
        pids[i] = fork();
        if(pids[i]<0){
            goto wait_children;
        }else if (pids[i]==0){
            // Child 0 keeps the default nice value 0; child 1 lowers its own
            // priority to nice 5 before starting the workload.
            if(i==1){
                errno = 0; // nice() can legitimately return -1, so check errno
                if(nice(5)==-1 && errno!=0)
                    warn("nice(5) failed");
            }
            child_fn(i,logbuf,nrecord,nloop_per_resol,start);
        }
    }
    ret = EXIT_SUCCESS;
wait_children:
    // fork() failed part-way: interrupt the children that were already started.
    if(ret == EXIT_FAILURE)
        for(i=0;i<ncreated;i++)
            if(kill(pids[i],SIGINT)<0)
                warn("kill(%d) failed",pids[i]);
    for(i=0;i<ncreated;i++)
        if(wait(NULL)<0)
            warn("wait() failed.");
free_pids:
    free(pids);
free_logbuf:
    free(logbuf);
    exit(ret);
}
```

為了要凸顯效果,執行時會讓兩個行程都在邏輯CPU 0上執行 ```c $ taskset -c 0 ./sheduler_nice 100 1 1 1 1 1 2 2 1 4 3 1 6 4 1 8 5 1 9 6 1 11 7 1 25 8 1 27 9 1 29 10 1 30 11 1 32 12 1 33 13 1 35 14 1 51 15 1 53 16 1 55 17 1 56 18 1 58 19 1 60 20 1 61 21 1 63 22 1 77 23 1 79 24 1 80 25 1 82 26 1 84 27 1 85 28 1 100 29 1 102 30 1 103 31 1 105 32 1 107 33 1 108 34 1 110 35 1 112 36 1 125 37 1 126 38 1 128 39 1 129 40 1 131 41 1 132 42 1 133 43 1 148 44 1 149 45 1 151 46 1 152 47 1 154 48 1 156 49 1 157 50 1 159 51 1 173 52 1 174 53 1 176 54 1 178 55 1 179 56 1 181 57 1 182 58 1 199 59 1 200 60 1 202 61 1 203 62 1 205 63 1 206 64 1 208 65 1 210 66 1 211 67 1 
213 68 1 215 69 1 216 70 1 218 71 1 231 72 1 232 73 1 234 74 1 236 75 1 237 76 1 239 77 1 251 78 1 252 79 1 254 80 1 256 81 1 257 82 1 259 83 1 272 84 1 274 85 1 275 86 1 277 87 1 279 88 1 280 89 1 282 90 1 283 91 1 304 92 1 305 93 1 307 94 1 309 95 1 310 96 1 312 97 1 314 98 1 316 99 1 317 100 0 13 1 0 15 2 0 17 3 0 18 4 0 20 5 0 22 6 0 23 7 0 37 8 0 39 9 0 41 10 0 42 11 0 44 12 0 45 13 0 47 14 0 49 15 0 50 16 0 64 17 0 65 18 0 67 19 0 69 20 0 71 21 0 72 22 0 74 23 0 87 24 0 88 25 0 90 26 0 91 27 0 93 28 0 94 29 0 96 30 0 98 31 0 112 32 0 113 33 0 115 34 0 116 35 0 118 36 0 120 37 0 122 38 0 123 39 0 135 40 0 137 41 0 138 42 0 140 43 0 142 44 0 143 45 0 144 46 0 146 47 0 160 48 0 162 49 0 163 50 0 165 51 0 167 52 0 168 53 0 170 54 0 184 55 0 185 56 0 187 57 0 188 58 0 190 59 0 191 60 0 193 61 0 194 62 0 196 63 0 218 64 0 220 65 0 222 66 0 223 67 0 225 68 0 226 69 0 228 70 0 240 71 0 242 72 0 243 73 0 245 74 0 247 75 0 248 76 0 260 77 0 262 78 0 264 79 0 265 80 0 267 81 0 269 82 0 270 83 0 286 84 0 287 85 0 289 86 0 290 87 0 292 88 0 294 89 0 295 90 0 297 91 0 298 92 0 300 93 0 302 94 0 303 95 0 319 96 0 321 97 0 322 98 0 323 99 0 325 100 ```

輸出的三個欄位依序是行程編號、從開始起算的經過時間(毫秒)、進度(%)。優先權較高(nice值0)的行程0與優先權低的(nice值5)的行程1相較之下,應該會獲得比較多的CPU時間,所以行程0會較早完成,行程1則要等行程0結束之後才能獨占CPU。

(注意:上方列出的輸出中,兩個行程的進度幾乎相同,看起來nice值並沒有生效;若要重現優先權的差異,請確認行程1在開始運算之前有呼叫nice(5)。)

優先權設定也可以透過nice指令,並使用-n來指定nice值。

也可以用sar的輸出結果來觀察:%nice欄位表示以變更過nice值的行程在user mode下所使用的CPU時間比例,而%user則是以預設nice值執行的行程所使用的比例。