# linux kernel
## HW1
更改kernel 版本(同major但是最新的版本)
source: https://www.kernel.org/
$ git clone
$ cd /usr/src/linux
$ tar cvf xxxxxx
## week3
上課內容 Linux Kernel Development 3rd-CH3
(他大概講到p33.fork()那邊,後半段就隨便講講)
老師花了半節課在介紹工具 lxr, cscope, git
* lxr 一個線上工具,可以看什麼目錄有什麼東西(可選不同的版本),可以尋找程式碼在哪邊
* example:
/linux/include/linux/sched.h
/usr/src/linux2.xx/arch/x86/include(.h)/asm(組語)
* cscope 類似lxr 在local端
$ sudo apt-get install -y cscope # 安裝cscope
$ cd /usr/src/linux-4.14.305
$ find . -type f -name "*.[chxsS]" > cscope.files
$ cscope -b -q -k
$ cscope
#以後就要到/usr/src/linux-4.14.305 打cscope使用
* 或是用 git log看
如果跟課本或老師的版本不一樣的話,東西找不到,可以用這個看程式碼被搬到哪裡去以後再改路徑
(前提是,kernel要是git clone下來的才有紀錄,我忘了我的kernel是怎麼下載的)
## HW2
這周是要來寫kernel module
內容大致參考 Linux Device Drivers, Third Edition-CH2
寫一個module
* 需要 .c 和 makefile檔
* makefile路徑老師給的example設好了,唯一要改的是 obj-m := (跟要編譯的.c同名).o

$lsmod | grep hello #此時module還沒有load,所以搜不到
$insmod hello.ko #載入hello 這個module (要sudo)
$lsmod | grep hello
$rmmod hello #卸載hello
*************************************************************************************************
作業繳交項目
1.執行作業中指令的畫面(截圖)
2.節錄dmesg中所新增的內容(截圖或文字檔)
* 可開另一個terminal來監控syslog,從這裡截圖
* $tail -f /var/log/syslog
* 或是 cat syslog 到 另一個file裡 再去截圖
* $cat /var/log/syslog > dmesg.files
3.hellop3.c 與 hellop3.ko
```C=
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * hellop3: on load, print the string c exactly a*b times.
 * All three values can be overridden at insmod time, e.g.
 *   insmod hellop3.ko a=3 b=2 c=module_text
 */
static char *c = "world"; /* string to print (default "world") */
static int a = 1;         /* repeat factor (default 1) */
static int b = 1;         /* repeat factor (default 1) */
/* S_IRUGO: world-readable under /sys/module/hellop3/parameters */
module_param(a, int, S_IRUGO);
module_param(b, int, S_IRUGO);
module_param(c, charp, S_IRUGO);
/*
 * Module entry point: log "(i) <c>" a*b times.
 * __init lets the kernel free this code once loading completes.
 */
static int __init hello_init(void)
{
	int i;

	for (i = 0; i < a * b; i++)
		printk(KERN_ALERT "(%d) %s\n", i, c);
	return 0;
}
/* Module exit point, run on rmmod. */
static void __exit hello_exit(void)
{
	printk(KERN_ALERT "Goodbye, cruel world\n");
}
module_init(hello_init);
module_exit(hello_exit);
```
```
當 $insmod hellop3.ko a=3 b=2 c=module_text //passed in parameter
當 $insmod hellop3.ko //default
```
## HW3
* 列出每一個process的pid,uid,comm
繳交內容
1.module輸出
* process的uid
* 一個process的uid: 實際資料結構上就是一個task_struct裡的cred結構的uid結構裡的val變數(__kernel_uint32也就是unsign int)
* 或是可以使用get_task_cred() $\quad //$不過我用不了,include看起來也沒有錯但就是一直undefined,不懂
```c=
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
//#include <linux/cred.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * HW3: on load, log the PID, UID and command name of every process
 * currently on the system task list.
 */
static int __init process_init(void)
{
	struct task_struct *task;

	printk(KERN_INFO "LOADING MODULE\n");
	/*
	 * for_each_process() walks the RCU-protected task list, and each
	 * task's credentials are RCU-managed too, so hold rcu_read_lock()
	 * for the whole walk. task_uid() dereferences the cred under RCU
	 * instead of reading task->cred directly.
	 */
	rcu_read_lock();
	for_each_process(task) {
		printk(KERN_INFO "PID:%d, UID:%u, PROCESS: %s \n",
		       task->pid, task_uid(task).val, task->comm);
	}
	rcu_read_unlock();
	return 0;
}
/* Module exit point, run on rmmod. */
static void __exit process_exit(void)
{
	printk(KERN_INFO "REMOVING MODULE\n");
}
module_init(process_init);
module_exit(process_exit);
```
2. ps -ax -opid,uid,comm的輸出
3. module source code
可以善用https://elixir.bootlin.com/linux/v4.14.305/source/include 去找kernel source code
## HW4 (Due 5/1)
### 題目說明與筆記
```
請參考myproc的寫入
撰寫一個mykpid的module讓他達成
echo *PID* > /proc/mykpid
echo INT > /proc/mykpid
執行完上兩行後
PID 會收到 SIGINT 的 signal
接收signal的程式可以參考signal.c
請繳交module source與編譯好的ko檔與執行過程的截圖(請包含uname -a的訊息)
```
可以用以下來做測試 signal.c 是否可以收到 Signal
```bash=
kill -INT 3351 # 這樣不會關掉程式,但是上面的程式會收到 signal
kill 3351 # 會直接 kill 掉
```
注意:
* signal.c 是run 在 user space裡面
* signal.c 要先記得用g++或gcc編譯,再執行xxxx.o檔
* ps -aux | grep xxxx.o 找到pid編號
* mykpid.c
* 因為要做kernel 和 user space的資料交換,所以要用raw_copy_from_user() 和 raw_copy_to_user()
* 第47行,條件判斷(是不是輸入INT)我寫的很粗暴
* 第48-49行 多了這兩行就一定要加MODULE_LICENSE("GPL");不然會無法insmod,因為沒有透過Module_LICENSE聚集指定授權方式的話,在載入驅動程式的時候就會顯示警告訊息(網路上是寫說會跳警告,但是我會直接跳 "module license 'unspecified' taints kernel"不給我insmod)
```
find_get_pid()的宣告為 extern struct pid *find_get_pid(int nr);
kill_pid()的宣告為 extern int kill_pid(struct pid *pid, int sig, int priv);
```
```c=
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/pid.h>
MODULE_LICENSE("GPL");
/* HW4: /proc/mykpid -- the first write stores a PID; a later write of
 * "INT" sends that PID a signal (see mykpid_write below). */
static struct proc_dir_entry *proc_entry;
static ssize_t mykpid_write(struct file *file, const char __user *buff, size_t len, loff_t *pos);
static ssize_t mykpid_read(struct file *filp,char __user *buff,size_t len, loff_t *pos);
/* msg: last string written from user space (NUL-terminated). */
static char msg[255];
/* kpid: target PID captured from the first write; -1 means "not set yet". */
static int kpid = -1;
static struct file_operations my_ops={
.read=mykpid_read,
.write=mykpid_write,
.owner=THIS_MODULE,
};
/*
 * Read handler for /proc/mykpid: return the slice of msg selected by the
 * caller's file offset (standard partial-read semantics).
 */
static ssize_t mykpid_read(struct file *filp, char __user *buff, size_t count, loff_t *pos)
{
	unsigned int len = strlen(msg);
	unsigned long p = *pos;

	if (p >= len)
		return 0;	/* past end of data: EOF */
	if (count > len - p)
		count = len - p;
	/*
	 * copy_to_user() (not raw_copy_to_user()) also validates the user
	 * buffer with access_ok() before copying.
	 */
	if (copy_to_user(buff + p, msg + p, count))
		return -EFAULT;
	*pos += count;
	printk("kpid=%d", kpid);
	/* %lu/%zu match unsigned long / size_t (the old %ld did not) */
	printk("read pos:%lu,count:%zu\n", p, count);
	return count;
}
/*
 * Write handler for /proc/mykpid.
 *   1st write: a decimal PID, remembered in kpid.
 *   2nd write: the string "INT" -> send SIGINT to that PID.
 * Anything else (once a PID is set) is rejected with -EFAULT, matching
 * the original behaviour.
 */
static ssize_t mykpid_write(struct file *file, const char __user *buff, size_t len, loff_t *pos)
{
	unsigned long len2 = len;
	struct pid *tmp;

	if (len2 >= sizeof(msg))
		len2 = sizeof(msg) - 1;
	if (copy_from_user(msg, buff, len2))
		return -EFAULT;
	/* Terminate BEFORE parsing: sscanf and the char tests below need a
	 * proper C string (the original terminated after parsing). */
	msg[len2] = '\0';
	if (kpid == -1) {
		sscanf(msg, "%d", &kpid);
	} else if ((msg[0] == 'I') && (msg[1] == 'N') && (msg[2] == 'T')) {
		tmp = find_get_pid(kpid);	/* takes a reference */
		if (!tmp)
			return -ESRCH;		/* no such process */
		/* HW4 asks for SIGINT (cf. `kill -INT <pid>`), not SIGTERM */
		kill_pid(tmp, SIGINT, 1);
		put_pid(tmp);			/* drop the reference */
		printk("ok!");
	} else {
		return -EFAULT;
	}
	return len;
}
/* Create /proc/mykpid (world read/write) at module load. */
static int __init mykpid_init(void)
{
	proc_entry = proc_create("mykpid", 0666, NULL, &my_ops);
	if (!proc_entry) {
		printk(KERN_ERR "Can't create /proc/mykpid\n");
		return -ENOMEM;	/* real errno instead of a bare -1 */
	}
	return 0;
}
/* Remove /proc/mykpid on module unload. */
static void __exit mykpid_exit(void)
{
remove_proc_entry("mykpid", NULL);
}
module_init(mykpid_init);
module_exit(mykpid_exit);
```
* 執行結果截圖:

reference:
https://www.coolcou.com/linux-kernel/linux-process-management-kernel-api/the-linux-kernel-find-get-pid.html
https://devarea.com/linux-kernel-development-creating-a-proc-file-and-interfacing-with-user-space/#.ZEeQSXZBxPZ
## midterm exam
### CH 1, 2
* Linux is a Unix-like system following POSIX(Unix API)
* linux **kernel** property
* no libc or Standard Headers (not linked to the standard C, because of speed and size)
* printf() 不可以用
* all the codes are coded in GNU C
* 使用的組合語言也都是 inline assembly,所以也都算是GNU C
* cannot easily execute floating-point operations (ex. divide zero, overflow), but still can use like in C (need to do some exception handling)
* a small per-process fixed-sized stack
* all the process are preemptive (including kernel process)
* no memory protection (you can read what you want in memory)
* 但是user space就會有記憶體保護
* use-space 如果存取到 kernel-space 的記憶體,OS會給他 SIGSEGV (segmentation fault)
* linux是非商業化產品(kernel本身) ,但是可以被當作商業產品
* 可攜帶性portable
* 所有都當檔案處理(in UNIX) read write
* 會用 lseek, read, write 來進行存取
* Linux supports symmetrical multiprocessing (SMP)
* 將每個處理器地位視成一樣,所以同個resource can be concurrently access by more processors
* inline function
* GNU C support
* eliminate the overhead of function invocation and return and allow greater optimization.
* 像是可以使用branch annotation在條件判斷之前,比較常發生/不發生 (likely/unlikely),可以加註解讓compiler去讀並優化,執行會更有效率
* 大部分的情況下都會使用 inline function (p.18)
inline assembly
* Synchronization and Concurrency (p.21)
* Linux is a preemptive multitasking operating system
* Linux supports symmetrical multiprocessing (SMP)
* Interrupts occur asynchronously
* The Linux kernel is preemptive
### CH 3 Process Management
* a program itself is not a process; a process is an active program.
* 每一個Process 或 Thread 都有自己的 program counter, process stack, and set of processor registers
* init process whose PID is one 是所有process的祖先
* The kernel stores the list of processes in a circular doubly linked list called the task list.
* each element in the task list is a process descriptor of the type struct task_struct
* fork() is copy on write
* 建立所呼叫的clone()所傳進去的flag不會包含
* 不會call vfork()
* 呼叫 exit()
* parent 要呼叫 wait4() 來結束掉
* 如果沒有,就會變成 zombie state
* Processor 是由 task_struct 的 linked list 串起來的
* Process Kernel Stack* 的安排方式 -> p.4
* PID 最大值為 32,768 (short int 的最大值)
* /proc/sys/kernel/pid_max -> 可以取得最多可以使用的 PID
* 狀態圖 (p.6)
* TASK_INTERRUPTIBLE -> 比較常見,等中斷
* TASK_UNINTERRUPTIBLE -> 完全睡著
* set_task_state(task, state);
* 可以設定目前 task 的 state
* 在 thread 大部分的記憶體是共用的
* exit_files(), exit_fs()
* 會檢查程式的檔案取用的數量
* 跟退出USB說還有程式還沒結束一樣
### 課本筆記
* The Process
* To Linux, It does not differentiate between threads and processes, a thread is just a special kind of process.
* threads
* unique program counter
* process stack
* set of processor registers
* processes provide two virtualizations
* virtualized processor
* virtual memory
* threads **share the virtual memory abstraction**, whereas each receives its **own virtualized processor**.
* The process that calls fork() is the parent, whereas the new process is the child
* The fork() system call returns from the kernel twice: once in the parent process and again in the newborn child.
* fork() 是用 clone() 實作
* 常常fork完直接呼叫 exec()
* creates a new address space
* loads a new program
* exit() -> 程式離開
* Storing the Process Descriptor
* PID
* pid_t: int
* maximum value is only 32,768 (short int)
* maximum number of processes that may exist concurrently on the system
* `current`: useful to be able to quickly look up the process descriptor of the currently executing task
* x86
* current is calculated by masking out the 13 least-significant bits of the stack pointer
* 8KB -> 8192
* 4KB -> 4096
* Process State
* 
* Process Context
* When a program executes a system call, or triggers an exception, it enters kernel-space.
* At this point, the kernel is said to be “executing on behalf of the process”(代表 Process 執行) and is in process context.
* The Process Family Tree
* All processes are descendants of the init process (PID=1)
* `parent` 指向 process parent, `child` 指向 list of children
* Process Creation
* fork()
* creates a child process that is a copy of the current task.
* exec()
* loads a new executable into the address space and begins executing it
* Copy-on-Write
* to delay or altogether prevent copying of the data
* Rather than duplicate the process address space, the parent and the child can share a single copy
* 如果寫入:marked in such a way that if it is written to
* **delays the copying of each page in the address space until it is actually written to**
* for example, if exec() is called immediately after fork()—they never need to be copied.
* Forking
* via the clone() system call
* The clone() system call, in turn, calls **do_fork()**
* 然後她 call copy_process()
* takes a series of flags that specify which resources, if any, the parent and child process should share.
* The interesting work is done by copy_process():
* It calls dup_task_struct():creates a new kernel stack, thread_info structure, and task_struct for the new process
* 檢查 child 不會超過資源限制與process 上限
* 一些process descriptor裡的成員資料會清除或設定為初始值
* child state is set to TASK_UNINTERRUPTIBLE
* call copy_flags() to update the flags member in task_struct
* PF_SUPERPRIV: used super- user privileges
* PF_FORKNOEXEC: process that has not called exec()
* alloc_pid()
* 根據帶入的flag,copy_process() either duplicates or shares open files, filesystem information, signal handlers, process address space, and namespace
* shared between threads,通常是 threads 在使用
* copy_process() cleans up and returns to the caller a pointer to the new child.
* 如果執行成功,通常會先 run child process
* 通常 child 都先呼叫 exec(),就不用處理 COW
* vfork()
* 跟 fork()很像,except that the page table entries of the parent process are not copied
* the only benefit to vfork() is not copying the parent page tables entries.
* 通常不會 call vfork()
* Kernel Threads
* kernel threads do not have an address space.
* Process Termination 之後請參考課本
## CH 4 Process scheduling
### 課本筆記
* Linux 2.5 使用 O(1) Scheduler
* Linux 2.6.23 以後使用 Completely Fair Scheduler (CFS) (插入節點時為 O(logN))
* I/O-bound or processor-bound
* I/O-bound
* spends much of its time submitting and waiting on I/O requests
* such a process is runnable for **only short durations**
* e.g. GPU applications
* Processor-bound
* They tend to run until they are preempted because they do not block on I/O requests very often
* tends to run such processes **less frequently** but for **longer durations**
* e.g. MATLAB, ssh-keygen
* 也有同時為I/O-bound跟Processor-bound的
* e.g. X Windows
* Scheduling policy 要滿足以下兩個相互衝突的目標:
* fast process response time (low latency)
* maximal system utilization (high throughput)
* Process Priority
* The Linux kernel implements 2 separate priority ranges
* Nice value
* -20 ~ +19
* Nice 越小 Priority 越高,獲得更大的 proportion of the system’s processor。(比起大 Nice 的 Process)
* Real Time priority
* 0 ~ 99
* Real-time 更高,Priority 更高。
* Timeslice
* 定義:The timeslice is the numeric value that represents how long a task can run until it is preempted
* **I/O-bound processes** do not need **longer timeslices** (although they do like to run often), whereas **processor-bound** processes crave **long timeslices** (to keep their caches hot).
* Linux’s CFS scheduler 不會直接 Assign timeslices 給 Process
* CFS assigns processes a **proportion of the processor**
* Nice 會變成 Weights 的概念
* 更高的 Nice Value 會有更低的 Priority,會得到更小的 proportion of the processor
* 更低的 Nice Value 會有更高的 Priority,會得到更高的 proportion of the processor
* CFS 的 Preemtive 標準
* 根據新 runnable process 的 proportion of the processor 來判斷
* **若新的 Process 的 proportion of the processor 比現在正在運行的 Process 還小,那就 Preemtive 現在執行的 Process。** (ch.4, p.5)
* Scheduling Policy in Action
* 兩個 Process,Text-editor (I/O Bound) 與 Video encoder (processor-bound)
* Linux 作法
* It guarantees the text editor a specific proportion of the processor
* 假設只有兩個 Process 在跑
* 各 50 % 的 Proportion
* Because the text editor spends most of its time blocked, waiting for user key presses, **it does not use anywhere near 50% of the processor**. Conversely, the video encoder is free to **use more than its allotted 50%**, enabling it to finish the encoding quickly.
* CFS 知道 text-editor 用的比 50% 還少,知道他用的比 Video Encoder 更少時間
* CFS 訴求:give all processes a fair share of the processor
* 因此,it then preempts the video encoder and enables the text editor to run
* 總結:CFS 會讓 Text-edior 在需要跑的時候跑,然後剩下的時間給 Video Encoder
* Linux Scheduler Classes
* CFS -> `SCHED_NORMAL`
* Fair Scheduler
* Model process scheduling as if the system had an ideal, perfectly multitasking processor.
* Each process would receive 1/n of the processor’s time, where n is the number of runnable processes
* CFS
* CFS will run each process for some amount of time, round-robin, selecting next the process that has run the least.
* **不會 Assign Timeslice 給 Process,CFS calculates how long a process should run as a function of the total number of runnable processes**
* Nice 值越高 (lower priority),Proportion 越低,反之亦然
* Targeted Latency:
* CFS sets a target for its approximation of the “infinitely small” scheduling duration in perfect multitasking. This target is called the targeted latency.
* 翻譯:CFS为其在完美多任务中的 "无限小 "调度持续时间的近似设定了一个目标。这个目标被称为目标延时。
* Minimum granularity
* Note as the number of runnable tasks approaches infinity, the proportion of allotted processor and the assigned timeslice approaches zero
* Task 數量逼近無限大 -> Timeslice 比例約等於 0
* CFS imposes a floor on the timeslice assigned to each process.This floor is called the Minimum granularity -> 有一個最小的 timeslice 給每一個 process
* 預設是 1 millisecond
* 來防止一直做 context switch
* **The Linux Scheduling Implementation (重要,應該必考)**
* Time Accounting
* **CFS does not have the notion of a timeslice**
* but it must **still keep account for the time that each process runs**, because it needs to ensure that each process runs only for its fair share of the processor
* Scheduler Entity Structure
* 在 process descriptor,`struct task_stuct` 的 `se` 裡面
* **Virtual Runtime**(重要)
* `vruntime`
* which is the actual runtime (the amount of time spent running) normalized (or weighted) by the number of runnable processes
* 單位:nanoseconds (ns)
* CFS uses vruntime to account for **how long a process has run** and thus **how much longer it ought to run**.
* Code
* ` update_curr()`
* calculates the execution time of the current process and stores that value in delta_exec.
* It then passes that runtime to `__update_curr()`
* **invoked periodically by the system timer and also whenever a process becomes runnable or blocks, becoming unrunnable**
* `__update_curr()`
* which weights the time by the number of runnable
* In this manner, `vruntime` is an accurate measure of the runtime of a given process and an indicator of **what process should run next**.
* 使用 vruntime 來判斷下一個 proecess 該輪到誰來跑
* Process Selection:
* When CFS is deciding what process to run next, it picks the process with the smallest `vruntime`. -> 選最小的 vruntime 的 process 排程為下一個執行的 process
* The core of CFS’s scheduling algorithm: **Pick the task with the smallest vruntime.**
* CFS uses a **red-black** tree to manage the list of runnable processes and efficiently find the process with the smallest vruntime.
* key for each node -> runnable process’s virtual runtime
* CFS wants to run next, which is the process with the smallest vruntime, is the **leftmost node in the tree**.
* 如果你往左樹找最左的 leaf,他就是最小 runtime 的 process
* Picking the Next Task:
* 結論:**Run the process represented by the leftmost node in the rbtree**
* code
* `__pick_next_entity()`
* No need actually traverse the tree to find the left- most node, **because the value is cached by rb_leftmost.**
* 如果回傳 NULL -> 代表沒有leftnode -> 代表沒 node 在 rbtree -> 沒有 process 可以 run -> CFS schedules the idle task
* Adding Processes to the Tree:
* Would occur When:
* Process becomes runnable (wakes up)
* First created via fork()
* Code
* `enqueue_entity():`
* `__enqueue_entity()`
* perform the actual heavy lifting of inserting the entry into the red-black tree
* (ch.4, p.115)
* Removing Processes from the Tree:
* Would occur When:
* A process blocks (becomes unrunnable)
* Terminates (ceases to exist):
* Code
* `dequeue_entity()`
* the real work is performed by a helper function, `__dequeue_entity()`
* The Scheduler Entry Point
* The main entry point into the process schedule is the function `schedule()`
* This is the function that the rest of the kernel uses to invoke the process scheduler, deciding which process to run and then running it
* Code
* `schedule()`
* 呼叫 `pick_next_task()` 去選擇下一個 task
* goes through each scheduler class, starting with the highest priority, and selects the highest priority process in the highest priority class
* (ch.4, p.18)
* Sleeping and Waking Up:
* Tasks that are sleeping (blocked) are in a special nonrunnable state.
* Task 需要 Sleep 的幾個原因:
* 基本上,always while it is waiting for some event
* more data from a file I/O
* another hardware event
* Kernel 的行為相同
* **步驟: (重要)**
* The task **marks itself as sleeping**
* Puts itself on a **wait queue**
* **Removes itself from the red-black tree** of runnable
* **Calls `schedule()`** to select a new process to execute
* Two states are associated with sleeping:
* TASK_INTERRUPTIBLE
* Wake up prematurely
* Respond to a signal if one is issued
* TASK_UNINTERRUPTIBLE
* 忽略 Signals
* Sleeping Task 都會進入 wait queue,等待 event 發生,並且不可執行
* Wait Queues
* Processes put themselves on a wait queue and mark themselves not runnable.
* Wait Queues 新增的方式
* `DECLARE_WAITQUEUE()` 或 dynamically via `init_waitqueue_head()`
* 
* Task 把自己新增到 wait queue 的步驟:
* 用 `DEFINE_WAIT()` 建立「wait queue entry」
* 用 `add_wait_queue()` 把自己加入到 wait queue
* 如果 Condition 達成,Wait Queue 會喚醒 Task
* 需要其他程式碼呼叫 `wake_up()`
* 用 `prepare_to_wait()` 來改變 Process State 為 `TASK_INTERRUPTIBLE` 或是 `TASK_UNINTERRUPTIBLE`
* 如果是 `TASK_INTERRUPTIBLE`,signal 會叫醒 Process -> `spurious wake up` (a wake-up not caused by the occurrence of the event), so check and handle signals.
* When the task awakens, it again checks whether the condition is true. If it is, it exits the loop. Otherwise, it again calls `schedule()` and repeats.
* Now that the condition is true, the task sets itself to `TASK_RUNNING` and removes itself from the wait queue via `finish_wait()`.
* Wake Up
* `wake_up()`, which wakes up all the tasks waiting on the given wait queue
* It calls `try_to_wake_up()`,他會:
* sets the task’s state to `TASK_RUNNING`
* calls enqueue_task() to add the task to the red-black tree
* sets need_resched if the awakened task’s priority is higher than the priority of the current task
* 重要的是,可能會有**虛假喚醒(spurious wake-ups)**
* Just because a task is awakened does **not mean that the event for which the task is waiting has occurred**
* 
* Preemption and Context Switching
* Context switching, the switching from one runnable task to another, is handled by the `context_switch()`
* It is called by `schedule()` when a new process has been selected to run. It does two basic jobs:
* Calls `switch_mm()`
* switch the **virtual memory mapping** from the previous process’s to that of the new process.
* Calls `switch_to()`
* Switch the **processor state** from the previous process’s to the current’s
* 儲存或恢復 **stack information, processor register, any other architecture-specific state**
* The kernel, however, **must know when to call schedule()**
* Kernel provides the need_resched flag to signify whether a reschedule should be performed
* This flag is set by scheduler_tick() when a process should be preempted.
* try_to_wake_up() when a process that has a higher priority than the currently running process is awakened
* **The flag is a message to the kernel that the scheduler should be invoked as soon as possible because another process deserves to run.**
* User Preemption:
* User preemption occurs when the **kernel is about to return to user-space**
* need_resched is set, scheduler is invoked
* Whenever the kernel is preparing to return to user-space either on return from an interrupt or after a system call, **the value of need_resched is checked.**
* If it is set, the scheduler is invoked to select a new (more fit) process to execute.
* 發生時機:
* When returning to user-space from a system call
* When returning to user-space from an interrupt handler
* Kernel Preemption
* Linux kernel -> fully preemptive kernel
* Safe? -> The kernel can preempt a task running in the kernel so long as it does not hold a lock
* preemption counter: `preempt_count` (in the process’s `thread_info`)
* counter begins at zero
* increments once for each lock that is acquired.
* decrements once for each lock that is released.
* **When the counter is zero, the kernel is preemptible**
* 舉例:return from interrupt, if returning to kernel-space, the kernel checks the values of need_resched and preempt_count
* If need_resched is set and preempt_count is zero
* it is safe to preempt -> scheduler is invoked
* If preempt_count is nonzero, a lock is held
* unsafe to reschedule
* interrupt returns as usual to the **currently executing task**
* When all the locks that the current task is holding are released, preempt_count returns to zero.
* unlock code checks whether need_resched is set. If so -> scheduler is invoked
* 發生時機:
* When an interrupt handler exits, before returning to kernel-space
* When kernel code becomes preemptible again
* If a task in the kernel explicitly calls schedule()
* If a task in the kernel blocks (which results in a call to schedule())
* Real-Time Scheduling Policies
* SCHED_FIFO
* first-in, first-out scheduling algorithm without timeslices
* When a SCHED_FIFO task becomes runnable, it continues to run until it blocks or explicitly yields the processor;
* it has no timeslice and can run indefinitely.
* Only a higher priority SCHED_FIFO or SCHED_RR task can preempt a SCHED_FIFO task.
* Two or more SCHED_FIFO tasks at the same priority run round-robin, but again only yielding the processor when they explicitly choose to do so.
* If a SCHED_FIFO task is runnable, all tasks at a lower priority cannot run until it becomes unrunnable.
* SCHED_RR
* identical to SCHED_FIFO except that **each process can run only until it exhausts a predetermined timeslice**
* **SCHED_RR is SCHED_FIFO with timeslices**
* When a SCHED_RR task exhausts its timeslice, any other real-time processes at its priority are scheduled round-robin
* As with SCHED_FIFO (跟 FIFO 一樣) , a higher-priority process always immediately preempts a lower-priority one, and a lower- priority process can never preempt a SCHED_RR task, **even if its timeslice is exhausted.**
* Real time scheduling Policy
* soft real-time
* refers to the notion that the kernel tries to schedule applications within timing deadlines, but the kernel does not promise to always achieve these goals.
* hard real-time
* systems are guaranteed to meet any scheduling requirements within certain limits
* **Linux makes no guarantees on the capability to schedule real-time tasks**
* Real-time priorities range
* zero to MAX_RT_PRIO minus 1 (By default, MAX_RT_PRIO is 100) -> zero to 99
* priority space is shared with the nice values
* They use the space from MAX_RT_PRIO to (MAX_RT_PRIO + 40)
* The –20 to +19 nice range maps directly onto the priority space from 100 to 139.
* Yielding Processor Time
* `sched_yield()` for a process to explicitly yield the processor to other waiting processes
* It works by
* removing the process from the active array
* putting it at the end of its priority
* inserting it into the expired array.
* 保證不會執行一陣子
* Real-time
* moved to the end of their priority list (and **not inserted into the expired array**)
### 考古題
* 下面哪一個程式是processor-bound
* SSH key keygen
* matlab
* machine learning
* 哪一個是linux kernel schedule會考慮的
* 極小的time slice (x) -> 錯,應該是適當的 timeslice
* The timeslice is the numeric value that represents how long a task can run until it is preempted.
* process的反應時間短latency (o) -> 是對的,每一個 Processor 對外界的反應時間可以越短會越好
* 最大系統使用率 (o)
* scheduler 複雜度越低越好(o)
* scheduler有哪些strategy
* CFS 分為schedule other/normal (o)
* real time schedul 分成 RR FIFO
* NICE可以被使用者自行調整 (-20~+19)
* Nice value越低越優先
* 那些是scheduler會做(實現)的事情
* time accounting (會去數 Procesor 做了多久)是實現的部分
* processor selection (o)
* interupt 不是實現的部分(因為無法被scheduler安排)
* sleeping and wake up 是 scheduler 會實現的
* four components of CFS 時間複雜度O(1)
* Time Accounting
* vruntime variable stores the virtual runtime of a process
* normalized the actual runtime
* function: update_curr() 更新update vruntime
* Process selection
* pick the process with the smallest vruntime
* using a red-black tree to manage the list of runnable process
* removing process from red black tree, when a process blocks(unrunnable) or terminate(exit).
* The Scheduler Entry Point
* function: schedule() -> pick_next_task()
* Sleeping and Waking Up
* tasks that are sleeping(blocked), two states:
* TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE
* sleeping is handled via wait queues
* Context Switching
* schedule()-> select a new process ->call context_switch() -> switch_mm(), switch_to()
* switch_mm(), which is declared in <asm/mmu_context.h>, to switch the virtual memory mapping from the previous process’s to that of the new process.
* switch_to(), declared in <asm/system.h>, to switch the processor state (including stack information, the processor register, and other architecture-specific state) from the previous process’s to the current’s.
* user preemption
* occurs when the kernel is about to return to user-space
* when returning to user-spaace from a system call
* when returning to user-space from an interrupt handler.
* kernel preemption
* occur
* When an interrupt handler exits, before returning to kernel-space
* When kernel code becomes preemptible again
* If a task in the kernel explicitly calls schedule()
* If a task in the kernel blocks (which results in a call to schedule())
* kernel preemption不會發生在什麼時候
* 當system call返回 user space
### CH 5 system call
* user space的processor 不可以直接存取kernel memory 但 可以透過system call 來存取 kernel space
* copy from user
* copy to user 可以防止user space的程式存取到不該存取的記憶體
* Denoting the Correct System Call
* On x86, the **syscall number** is fed to the kernel via the **eax register**
* checks the validity of the given s**ystem call number by comparing it to NR_syscalls**
* \>\= NR_syscalls,噴 ENOSYS 錯誤
* The parameters are stored in registers, On x86-32
* **ebx, ecx, edx, esi, edi**
* 若大於 6 個參數
* a single register is used to hold a pointer to user-space -> 存所有的 parameters
* The return value -> **eax register**
* Verifying the Parameters
* copy_to_user(): writing into user-space
* 1st parms: destination memory address in the process’s address space (user space)
* 2nd parms: source pointer in kernel-space
* 3rd parms: size in bytes of the data to copy
* copy_from_user(): reading from user-space
* 大致與 to user 相同,差別:
* reads from the second parameter into the first parameter the number of bytes specified in the third parameter.
* 第二個是 user_space,第一個是 kernel_space
* Both copy_to_user() and copy_from_user() may block
* page containing the user data is not in physical memory but is swapped to disk
* process sleeps until the page fault handler can bring the page from the swap file on disk into physical memory.
* capable()
* valid capabilities flag returns nonzero if the caller holds the specified capability and zero otherwise.
* By default, the superuser possesses all capabilities and nonroot possesses none
*
### CH 6
* 有提供哪些資料結構
* linked list
* stack
* queue
* tree (有binary tree)
* graph (x)
* Red Black Tree 貌似有
* Map
-----------
### 還沒整理
* system call 是透過暫存器 傳參數 (o)
* process不會有哪種狀態
* task block
* kernel system call
* 精簡
* 具有目的
* 都在kernel space運行
* user spcace可以呼叫
* 通常不會改變規格
* linux運行中的程式不可以透過什麼方式來跟kernel溝通
* system call(o)
* 讀寫一個kern device (o)
* 讀寫一個proc / moudle (o)
* 直接存取 memory (x)
* 關於linux kernel哪個是正確的?
* 是模組化的
* 支援dynamic loadable module
* 支援preemptive 模式
* 可以同時運行多個kernel (x理論上是不行)
make config
make oldconfig
task_struct 把所有的process列出來
pid_t 用途
process discriont適用linked list
怎麼設計kernel device?
可以個別調整每個燈的亮暗
有一個硬體led燈
32個 led
0xC0000 32bit
0 0 0 0 .............. 0
unsigned int val-isa_read1(addr)
bolean ret=isa_writel(val, addr)
proc/myproc
read/write
echo ???? > /proc/myproc
--------------------------------
## HW5
我個人感覺在寫這個作業之前要先去讀一下CH13或是至少要知道個大概,下面[CH13筆記](#CH13-The-Virtial-Filesystem)大概寫了一點
作業內容: 寫個簡單的filesystem
繳交內容:
* A.myfs.c
* B.myfs.ko
* C.執行截圖
* C.1
#insmod myfs.ko
* C.2
#lsmod
// 需看到myfs.ko
* C.3
#mount -t myfs /dev/loop0 /mnt
#mount
// 需看到 /mnt的掛載資訊如 /dev/loop0 on /mnt type myfs (rw)
* C.4
#echo 3 >/mnt/input/a
#cat /mnt/input/a
#echo 2 > /mnt/input/b
#cat /mnt/input/b
#cat /mnt/output/add
#cat /mnt/output/sub
#cat /mnt/output/add
#cat /mnt/output/sub
**請根據以下程式修改,達成以下功能:**
1. 將檔案系統的根目錄改為下列結構
於static void myfs_create_files()中修改
```
/--+ input (dir)
| |
| +-- a (file)
| +-- b (file)
|
+ output (dir)
|
+-- add (file)
+-- sub (file)
```
2. 可以透過 echo 數字 > /input/a 和 echo 數字 >/input/b 來設定a和b的值,數值大小0~255之間
3. 可以透過 cat /output/add取得a+b的值,透過cat /output/sub取得a-b的值
要修改static ssize_t myfs_read_file()
```c=
#include <asm/atomic.h>
#include <linux/fs.h> /* This is where libfs stuff is declared */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pagemap.h> /* PAGE_CACHE_SIZE */
#include <linux/string.h>
#include <linux/uaccess.h> /* copy_to_user */
MODULE_LICENSE("GPL");
#define MYFS_MAGIC 0x20210607
/* Pointers to the integers behind /input/a and /input/b.  The output
 * files (add/sub) get a pointer to this struct as their private data so
 * their read handler can reach both values. */
static struct dataAb {
int *a;
int *b;
} abData;
static struct inode *myfs_make_inode(struct super_block *sb, int mode) {
struct inode *ret = new_inode(sb);
if (ret) {
ret->i_mode = mode;
ret->i_uid.val = 0;
ret->i_gid.val = 0;
ret->i_blocks = 0;
ret->i_atime = current_kernel_time();
ret->i_mtime = current_kernel_time();
ret->i_ctime = current_kernel_time();
}
return ret;
}
/* open(): stash the inode's private pointer (an int * for a/b, or the
 * abData struct for add/sub) where the read/write handlers can reach it. */
static int myfs_open(struct inode *inode, struct file *filp) {
filp->private_data = inode->i_private;
return 0;
}
#define TMPSIZE 20
/*
 * Read handler shared by every myfs file.  The value reported depends on
 * the file's name (taken from its dentry):
 *   "a"/"b" -> the int behind the file (private_data is an int *)
 *   "add"   -> *a + *b   (private_data is a struct dataAb *)
 *   "sub"   -> *a - *b
 * The number is formatted as decimal text and served with the usual
 * offset/count partial-read semantics.
 */
static ssize_t myfs_read_file(struct file *filp, char *buf, size_t count, loff_t *offset) {
	struct dataAb *dataStog;
	struct dentry *dentry = filp->f_path.dentry;
	const char *name = dentry->d_name.name;
	int v = 0;	/* value to report; stays 0 for an unknown name.  The
			 * original left v uninitialised when *offset > 0 or no
			 * name matched, which is undefined behaviour. */
	int len;
	char tmp[TMPSIZE];

	/* Compute v on every call, not only when *offset == 0, so that a
	 * follow-up read at a nonzero offset sees consistent data. */
	if (strcmp(name, "a") == 0 || strcmp(name, "b") == 0) {
		v = *(int *)filp->private_data;
	} else if (strcmp(name, "add") == 0) {
		dataStog = (struct dataAb *)filp->private_data;
		v = *(dataStog->a) + *(dataStog->b);
	} else if (strcmp(name, "sub") == 0) {
		dataStog = (struct dataAb *)filp->private_data;
		v = *(dataStog->a) - *(dataStog->b);
	}

	len = snprintf(tmp, TMPSIZE, "%d\n", v);
	if (*offset > len)
		return 0;
	if (count > len - *offset)
		count = len - *offset;
	if (copy_to_user(buf, tmp + *offset, count))
		return -EFAULT;
	*offset += count;
	return count;
}
/*
 * write() handler: parse a decimal number from user space and store it in
 * the int that backs input/a or input/b.
 *
 * Fixes vs. original:
 *  - Only input/a and input/b are writable. For output/add and output/sub,
 *    private_data is a struct dataAb*, and the old code blindly cast it to
 *    int* and wrote through it — clobbering the abData.a pointer and
 *    corrupting the module on any write to an output file. Reject those
 *    writes with -EACCES instead.
 *  - Enforce the assignment's 0~255 value range with -EINVAL.
 */
static ssize_t myfs_write_file(struct file *filp, const char *buf, size_t count, loff_t *offset) {
    const char *name = filp->f_path.dentry->d_name.name;
    int *counter;
    long val;
    char tmp[TMPSIZE];

    if (strcmp(name, "a") != 0 && strcmp(name, "b") != 0)
        return -EACCES; /* add/sub are read-only computed files */
    if (*offset != 0)
        return -EINVAL; /* only whole-value writes from offset 0 */
    if (count >= TMPSIZE)
        return -EINVAL;
    memset(tmp, 0, TMPSIZE);
    if (copy_from_user(tmp, buf, count))
        return -EFAULT;
    val = simple_strtol(tmp, NULL, 10);
    if (val < 0 || val > 255)
        return -EINVAL; /* spec: value must be within 0~255 */
    counter = (int *)filp->private_data;
    *counter = (int)val;
    return count;
}
/* One set of file operations serves all four regular files; the handlers
 * dispatch on the dentry name / i_private payload. */
static struct file_operations myfs_file_ops = {
.open = myfs_open,
.read = myfs_read_file,
.write = myfs_write_file,
};
static struct dentry *myfs_create_file(struct super_block *sb, struct dentry *dir, const char *name, int *counter) {
struct dentry *dentry;
struct inode *inode;
struct qstr qname;
qname.name = name;
qname.len = strlen(name);
qname.hash = full_name_hash(dir, name, qname.len);
dentry = d_alloc(dir, &qname);
if (!dentry)
goto out;
inode = myfs_make_inode(sb, S_IFREG | 0644);
if (!inode)
goto out_dput;
inode->i_fop = &myfs_file_ops;
inode->i_private = counter;
d_add(dentry, inode);
return dentry;
out_dput:
dput(dentry);
out:
return 0;
}
static struct dentry *myfs_create_func_file(struct super_block *sb, struct dentry *dir, const char *name, struct dataAb *counter) {
struct dentry *dentry;
struct inode *inode;
struct qstr qname;
qname.name = name;
qname.len = strlen(name);
qname.hash = full_name_hash(dir, name, qname.len);
dentry = d_alloc(dir, &qname);
if (!dentry)
goto out;
inode = myfs_make_inode(sb, S_IFREG | 0644);
if (!inode)
goto out_dput;
inode->i_fop = &myfs_file_ops;
inode->i_private = counter;
d_add(dentry, inode);
return dentry;
out_dput:
dput(dentry);
out:
return 0;
}
static struct dentry *myfs_create_dir(struct super_block *sb, struct dentry *parent, const char *name) {
struct dentry *dentry;
struct inode *inode;
struct qstr qname;
qname.name = name;
qname.len = strlen(name);
qname.hash = full_name_hash(parent, name, qname.len);
dentry = d_alloc(parent, &qname);
if (!dentry)
goto out;
inode = myfs_make_inode(sb, S_IFDIR | 0644);
if (!inode)
goto out_dput;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
d_add(dentry, inode);
return dentry;
out_dput:
dput(dentry);
out:
return 0;
}
// static atomic_t counter, subcounter;
/* Backing storage for input/a and input/b (zero-initialized as statics). */
static int a, b;
/*
 * Build the required tree under the filesystem root:
 *   /input/a, /input/b   — writable ints (i_private = &a / &b)
 *   /output/add, /output/sub — computed values (i_private = &abData)
 */
static void myfs_create_files(struct super_block *sb, struct dentry *root) {
struct dentry *inputdir;
struct dentry *outputdir;
inputdir = myfs_create_dir(sb, root, "input");
outputdir = myfs_create_dir(sb, root, "output");
// atomic_set(&a, 0);
// atomic_set(&b, 0);
if (inputdir) {
myfs_create_file(sb, inputdir, "a", &a);
myfs_create_file(sb, inputdir, "b", &b);
}
/* Point the shared operand struct at the two ints before creating the
 * computed files that will read through it. */
abData.a = &a;
abData.b = &b;
if (outputdir) {
myfs_create_func_file(sb, outputdir, "add", &abData);
myfs_create_func_file(sb, outputdir, "sub", &abData);
}
}
/* Generic libfs superblock ops: statfs stub + delete inodes on last ref. */
static struct super_operations myfs_s_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
};
/*
 * Fill in the superblock at mount time: basic block-size/magic/ops fields,
 * then a root directory inode + dentry, then the input/output tree.
 * Returns 0 on success, -ENOMEM on any allocation failure (the root inode
 * is iput() if d_make_root() fails).
 */
static int myfs_fill_super(struct super_block *sb, void *data, int silent) {
struct inode *root;
struct dentry *root_dentry;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = MYFS_MAGIC;
sb->s_op = &myfs_s_ops;
root = myfs_make_inode(sb, S_IFDIR | 0755);
if (!root)
goto out;
root->i_op = &simple_dir_inode_operations;
root->i_fop = &simple_dir_operations;
root_dentry = d_make_root(root);
if (!root_dentry)
goto out_iput;
sb->s_root = root_dentry;
myfs_create_files(sb, root_dentry);
return 0;
out_iput:
iput(root);
out:
return -ENOMEM;
}
/* mount() hook: block-device-backed mount (mount -t myfs /dev/loop0 ...). */
static struct dentry *myfs_get_super(struct file_system_type *fst, int flags, const char *devname, void *data) { return mount_bdev(fst, flags, devname, data, myfs_fill_super); }
/*
 * NOTE(review): mount_bdev() is normally paired with kill_block_super();
 * kill_litter_super() is for no-device filesystems (mount_nodev) and may
 * leak the block-device reference here — confirm against the target kernel.
 */
static struct file_system_type myfs_type = {
.owner = THIS_MODULE,
.name = "myfs",
.mount = myfs_get_super,
.kill_sb = kill_litter_super,
};
/* Module entry/exit: register/unregister the filesystem type. */
static int __init myfs_init(void) { return register_filesystem(&myfs_type); }
static void __exit myfs_exit(void) { unregister_filesystem(&myfs_type); }
module_init(myfs_init);
module_exit(myfs_exit);
```
atomic_t 版本
```c=
#include <asm/atomic.h>
#include <linux/fs.h> /* This is where libfs stuff is declared */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pagemap.h> /* PAGE_CACHE_SIZE */
#include <linux/string.h>
#include <linux/uaccess.h> /* copy_to_user */
MODULE_LICENSE("GPL");
#define MYFS_MAGIC 0x20210607
/*
 * atomic_t variant: shared handle for the two input values; output/add and
 * output/sub receive a pointer to this struct via inode->i_private.
 */
static struct dataAb {
atomic_t *a;
atomic_t *b;
} abData;
/*
 * Allocate and minimally initialize an inode (root-owned, timestamps = now).
 * Returns NULL if new_inode() fails.
 * NOTE(review): current_kernel_time() was removed around v4.20 — confirm
 * the target kernel version.
 */
static struct inode *myfs_make_inode(struct super_block *sb, int mode) {
struct inode *ret = new_inode(sb);
if (ret) {
ret->i_mode = mode;
ret->i_uid.val = 0;
ret->i_gid.val = 0;
ret->i_blocks = 0;
ret->i_atime = current_kernel_time();
ret->i_mtime = current_kernel_time();
ret->i_ctime = current_kernel_time();
}
return ret;
}
/* open(): expose i_private (atomic_t* or struct dataAb*) to the handlers. */
static int myfs_open(struct inode *inode, struct file *filp) {
filp->private_data = inode->i_private;
return 0;
}
#define TMPSIZE 20
/*
 * read() handler shared by all four files (atomic_t variant). Dispatches on
 * the dentry name: a/b report their stored value; add/sub report a+b / a-b.
 *
 * Fixes vs. original:
 *  - v was uninitialized when *offset > 0 or the name matched nothing, then
 *    fed to snprintf()/copy_to_user() (undefined behaviour). It is now
 *    always computed (default 0).
 *  - `cat output/add` used atomic_add(b, a), permanently accumulating b
 *    into a on every read (likewise atomic_sub for sub) — reading an output
 *    file destroyed the stored value. The sum/difference is now computed
 *    without mutating a.
 */
static ssize_t myfs_read_file(struct file *filp, char *buf, size_t count, loff_t *offset) {
    struct dataAb *dataStog;
    const char *name = filp->f_path.dentry->d_name.name;
    int v = 0; /* default for unrecognized names */
    int len;
    char tmp[TMPSIZE];

    if (strcmp(name, "a") == 0 || strcmp(name, "b") == 0) {
        /* i_private points straight at the atomic_t backing a (or b) */
        v = atomic_read((atomic_t *)filp->private_data);
    } else if (strcmp(name, "add") == 0) {
        dataStog = (struct dataAb *)filp->private_data;
        v = atomic_read(dataStog->a) + atomic_read(dataStog->b);
        printk("a+b = %d", v);
    } else if (strcmp(name, "sub") == 0) {
        dataStog = (struct dataAb *)filp->private_data;
        v = atomic_read(dataStog->a) - atomic_read(dataStog->b);
        printk("a-b = %d", v);
    }

    len = snprintf(tmp, TMPSIZE, "%d\n", v);
    if (*offset > len)
        return 0; /* past EOF */
    if (count > len - *offset)
        count = len - *offset;
    if (copy_to_user(buf, tmp + *offset, count))
        return -EFAULT;
    *offset += count;
    return count;
}
/*
 * write() handler (atomic_t variant): parse a decimal number and store it in
 * the atomic_t backing input/a or input/b.
 *
 * Fixes vs. original:
 *  - For output/add and output/sub, private_data is a struct dataAb*; the
 *    old code cast it to atomic_t* and atomic_set() through it, corrupting
 *    the abData.a pointer on any write to an output file. Reject with
 *    -EACCES instead.
 *  - Enforce the assignment's 0~255 value range with -EINVAL.
 */
static ssize_t myfs_write_file(struct file *filp, const char *buf, size_t count, loff_t *offset) {
    const char *name = filp->f_path.dentry->d_name.name;
    atomic_t *counter;
    long val;
    char tmp[TMPSIZE];

    if (strcmp(name, "a") != 0 && strcmp(name, "b") != 0)
        return -EACCES; /* add/sub are read-only computed files */
    if (*offset != 0)
        return -EINVAL;
    if (count >= TMPSIZE)
        return -EINVAL;
    memset(tmp, 0, TMPSIZE);
    if (copy_from_user(tmp, buf, count))
        return -EFAULT;
    val = simple_strtol(tmp, NULL, 10);
    if (val < 0 || val > 255)
        return -EINVAL; /* spec: value must be within 0~255 */
    counter = (atomic_t *)filp->private_data;
    atomic_set(counter, (int)val);
    return count;
}
/* One set of file operations for all four files; handlers dispatch on the
 * dentry name / i_private payload. */
static struct file_operations myfs_file_ops = {
.open = myfs_open,
.read = myfs_read_file,
.write = myfs_write_file,
};
/*
 * Create a regular file named `name` under `dir`, backed by the atomic_t
 * pointed to by `counter` (stored in inode->i_private). Returns the new
 * dentry or NULL on allocation failure.
 */
static struct dentry *myfs_create_file(struct super_block *sb, struct dentry *dir, const char *name, atomic_t *counter) {
    struct dentry *dentry;
    struct inode *inode;
    struct qstr qname;

    qname.name = name;
    qname.len = strlen(name);
    qname.hash = full_name_hash(dir, name, qname.len);
    dentry = d_alloc(dir, &qname);
    if (!dentry)
        goto out;
    inode = myfs_make_inode(sb, S_IFREG | 0644);
    if (!inode)
        goto out_dput;
    inode->i_fop = &myfs_file_ops;
    inode->i_private = counter; /* read/write handlers see this via open() */
    d_add(dentry, inode);
    return dentry;
out_dput:
    dput(dentry);
out:
    return NULL; /* was `return 0;` — NULL is the idiomatic null pointer */
}
static struct dentry *myfs_create_func_file(struct super_block *sb, struct dentry *dir, const char *name, struct dataAb *counter) {
struct dentry *dentry;
struct inode *inode;
struct qstr qname;
qname.name = name;
qname.len = strlen(name);
qname.hash = full_name_hash(dir, name, qname.len);
dentry = d_alloc(dir, &qname);
if (!dentry)
goto out;
inode = myfs_make_inode(sb, S_IFREG | 0644);
if (!inode)
goto out_dput;
inode->i_fop = &myfs_file_ops;
inode->i_private = counter;
d_add(dentry, inode);
return dentry;
out_dput:
dput(dentry);
out:
return 0;
}
static struct dentry *myfs_create_dir(struct super_block *sb, struct dentry *parent, const char *name) {
struct dentry *dentry;
struct inode *inode;
struct qstr qname;
qname.name = name;
qname.len = strlen(name);
qname.hash = full_name_hash(parent, name, qname.len);
dentry = d_alloc(parent, &qname);
if (!dentry)
goto out;
inode = myfs_make_inode(sb, S_IFDIR | 0644);
if (!inode)
goto out_dput;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
d_add(dentry, inode);
return dentry;
out_dput:
dput(dentry);
out:
return 0;
}
/* Backing storage for input/a and input/b. */
static atomic_t a, b;
/*
 * Build the required tree under the filesystem root:
 *   /input/a, /input/b       — writable values (i_private = &a / &b)
 *   /output/add, /output/sub — computed values (i_private = &abData)
 */
static void myfs_create_files(struct super_block *sb, struct dentry *root) {
struct dentry *inputdir;
struct dentry *outputdir;
inputdir = myfs_create_dir(sb, root, "input");
outputdir = myfs_create_dir(sb, root, "output");
atomic_set(&a, 0);
atomic_set(&b, 0);
if (inputdir) {
myfs_create_file(sb, inputdir, "a", &a);
myfs_create_file(sb, inputdir, "b", &b);
}
/* Point the shared operand struct at the two atomics before creating the
 * computed files that will read through it. */
abData.a = &a;
abData.b = &b;
printk("creating files");
if (outputdir) {
myfs_create_func_file(sb, outputdir, "add", &abData);
myfs_create_func_file(sb, outputdir, "sub", &abData);
}
}
/* Generic libfs superblock ops: statfs stub + delete inodes on last ref. */
static struct super_operations myfs_s_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
};
/*
 * Fill in the superblock at mount time: block-size/magic/ops fields, root
 * directory inode + dentry, then the input/output tree. Returns 0 on
 * success, -ENOMEM on any allocation failure (root inode iput() if
 * d_make_root() fails).
 */
static int myfs_fill_super(struct super_block *sb, void *data, int silent) {
struct inode *root;
struct dentry *root_dentry;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = MYFS_MAGIC;
sb->s_op = &myfs_s_ops;
root = myfs_make_inode(sb, S_IFDIR | 0755);
if (!root)
goto out;
root->i_op = &simple_dir_inode_operations;
root->i_fop = &simple_dir_operations;
root_dentry = d_make_root(root);
if (!root_dentry)
goto out_iput;
sb->s_root = root_dentry;
myfs_create_files(sb, root_dentry);
return 0;
out_iput:
iput(root);
out:
return -ENOMEM;
}
/* mount() hook: block-device-backed mount (mount -t myfs /dev/loop0 ...). */
static struct dentry *myfs_get_super(struct file_system_type *fst, int flags, const char *devname, void *data) { return mount_bdev(fst, flags, devname, data, myfs_fill_super); }
/*
 * NOTE(review): mount_bdev() is normally paired with kill_block_super();
 * kill_litter_super() is for no-device filesystems (mount_nodev) and may
 * leak the block-device reference here — confirm against the target kernel.
 */
static struct file_system_type myfs_type = {
.owner = THIS_MODULE,
.name = "myfs",
.mount = myfs_get_super,
.kill_sb = kill_litter_super,
};
/* Module entry/exit: register/unregister the filesystem type. */
static int __init myfs_init(void) { return register_filesystem(&myfs_type); }
static void __exit myfs_exit(void) { unregister_filesystem(&myfs_type); }
module_init(myfs_init);
module_exit(myfs_exit);
```
## Final exam 考古題
* file link (hard link) 和 symbolic link 是不是都要有一個inode的資料結構來儲存?
* Yes
* 在linux上面,我們是透過VFS對各種類型的檔案進行操作,包括我們的type(?)或是hardlink還有一般的目錄,這些都是透過VFS的介面來操作
* 如果我們的kernel要配置一塊記憶體(可能會要超過stack的大小),需要動態配置在MMU單元
* 可以用哪個function來配置
* kmalloc()
* vmalloc()
* 配置以後,利用VMA,透過memory struct對映回實際上的physical page
* MMU對於physical 記憶體處理時,是以page為單位(要是2的次方數量),所以不可以7個
*
* 時間的部分(**jiffies**)
* jiffies: 計算時間單位,跟HZ有關。用來判斷時間先後順序
* 時間順序比較
* 不能直接用減的,然後看看是正負號來比較哪個時間是比較先的,因為有可能會overflow
* 要用 time_before(), time_after() time_before_eq(), time_after_eq()
* 發生一次ticks (1/HZ) jiffies時間就會加1
* 要等多久?
```
HZ = 1000 //HZ是多少根本不重要
delay=jiffies + 2*HZ
while(time_before(jiffies,delay));
```
* 2秒, 因為**2***HZ
* jiffies是用來記錄系統從開機到現在的ticks數?
* Yes
* jiffies/HZ 表示什麼?
* 系統到現在的開機時間(sec)
* jiffies*HZ 表示什麼?
* 沒意義
* With a tick rate of 100, a 32-bit jiffies variable would overflow in about 497 days.
* 反正就是 秒數 * tick rate(HZ) >= $2^{32}$
秒數再換成天數,就是jiffies大概什麼時候會overflow
* tick rate越高
* system calls such as poll() and select() 時間會更準 (o)
* timer越精準 (o)
* process preemption 發生的更準 (o)
* 可以減輕系統排程負擔(X)
* 負擔會變高(o)
*
* **DeadLock**
* 下面例子有可能發生 deadlock
* 會給例子,要會判斷會不會發生deadlock
*
* **Critical region**
* lock 是為了要避免在critical region發生race condition
* critical region範圍 從lock到unlock之間
* 如果兩個threads在執行各自的critical region,所執行的邏輯跟實際想要執行的邏輯不一樣,就稱說這兩個region會發生race condition
*
* **softirq**
* 有幾個softirq就會有幾顆cpu
* tasklet 是一種softirq
* softirq可以註冊的最大數量不可以改變(定值,因為是unsigned int)
* 範例
* 先執行
```
disable_irq(7)
....
disable_irq(5)
....
disable_irq(7)
....
enable_irq(5)
enable_irq(7)
```
* 這時候,發生中斷 irq(7),會不會被執行?
* 不會,因為irq7被disable兩次,所以也要被enable2次才會消掉
* cat interrupts

* timer 是cpu0在處理,因為cpu0處理過29次,其他cpu都沒有處理過0
* 會考收到網路封包後,將網路卡緩衝區的資料複製到記憶體中的中斷行為,是誰在處理
* ehci_hcd:usb1, eth0那個,所以是CPU0
* 將資料複製到記憶體是**上半部(top havles)**的中斷在處理
* 因為有critical-time(就是比較急)
* 不能被中斷的 (top 的性質)
* 跟硬體有直接相關的 (top的性質)
* 封包重組 是在bottom havles
* 比較不急
* 每一個中斷都會對應到一個interrupt handle,隨時都會被執行(o)
* 每一個中斷請求(irq)都會有個號碼
* irq 0 timer中斷
* irq 1 是鍵盤中斷(i8042)
* i8042 鍵盤控制器
* 對於驅動來說,和按鍵盤相關的最重要的硬件是兩個芯片。一個是intel 8042芯片,位於主板上,CPU通過IO端口直接連接和這個芯片通信,獲得按鍵的掃描碼或發送各種按鍵盤指令。另外一個是intel 8048芯片或者其他芯片,位於按鍵盤中,這個芯片主要作用是從按鍵盤的硬件中得到被按下的按鍵所產生的掃描碼,與i8042通信,控制按鍵盤本體。
* 註冊 (register) 中斷處理function要注意
* 有哪些**flag?** (這邊要了解用途)
* **IRQF_DISABLE**
* When set, this flag instructs the kernel to disable all interrupts when executing this interrupt handler.
翻譯: 在執行此中斷處理程序時,禁用其他的irq
* **IRQF_SAMPLE_RANDOM**
* This flag specifies that interrupts generated by this device should contribute to the kernel entropy pool.
* 亂數產生器(/dev/random)
* 會需要用到kernel entropy pool:
亂數產生器有一個容納噪聲資料的entropy pool,在讀取時,/dev/random裝置會返回小於entropy pool中噪聲總數的隨機位元組。/dev/random可生成高隨機性的公鑰或一次性密碼本。若entropy pool空了,對/dev/random的讀操作將會被阻塞,直到收集到了足夠的環境噪聲為止。這樣的設計使得/dev/random是真正的亂數發生器,提供了最大可能的亂數據熵
* **IRQF_TIMER**
* This flag specifies that this handler processes interrupts for the system time
* 翻譯: 這個flag就是用來標註 專門用來處理時間的handler
* **IRQF_SHARED**
* 只說此interrupt line可以共用,被多個interrupt handler來使用
* 但是此interrupt並不會同時發生
* 我們在哪個程式段落是不會去檢查或執行還沒被執行的softirq的?
* 從硬體中斷返回程式碼 (會檢查)
* kernel thread testlet irq(會檢查)
* 任何想要一齊(?)要檢查有沒有擱置的softirq的程式碼 (會檢查)
* 舉例來說,網路的subsystem裡面,當沒事的時候,其實會去看softirq裡面有沒有還沒做完的後半段的東西
* 中斷程式在top裡面 (X 不會檢查)
* 後半段的部分在kernel2.5以後 BH 不會被用
* 當cpu遇到程式導致的指令錯誤(除0, page fault),會發生exception,會透過哪些方式來處理?
* 會產生硬體中斷 (X)
* 使用softirq處理 (O)
* 會不會發生kernel ti(?)(X)
* 會延後指令執行 (X)
* VFS
* vfsmount structure is defined in <linux/mount.h>
* 這個結構可以指定什麼?
* **可以**指定
* **不可以**指定 *不可以read*
* 因為檔案只要mount上來,就至少是read-only
* VFS裡面的cache有那些東西?
* 裡面只有inode (inode是透過cache去實作的),所以struct inode裡面有什麼就有什麼
* 使用中目錄的linked list (dentry) (o)
* 每個檔案的hash(o)
* 檔案裡面的內容(x)
* 在 block IO才會,不會在VFS
* VFS 裡面的 主要的 data structure (4種)
* superblock
* mount filesystem
* inode
* 所有東西的information都要用inode存起來
* dentry
* 建目錄建檔案路徑
* file
*
* slab 的設計考慮的是那些項目
* 如果記憶體 被頻繁分配和釋放
* 就是使用cache的方式
* 避免記憶體破碎的情況
* 提高 free list 的使用效率
* 整體來說,**並不會**節省記憶體,**但是有可能會**使得記憶體配置更有效率更快
* Zones
* 在 linux分區上 (zones) 某些硬體只能存取特定的記憶體區域
* ex. DMA
* 在某些架構下,實體記憶體與虛擬記憶體是沒辦法直接map到kernel address,所以要透過DMA, virtual memory area的方式(透過pgd, pmd, pte)再對應到phsical memory。
* high memory(ZONE_HIGHMEM)的 page**不能永久**對應到kernel上
* 需要做matching
* ZONE_NORMAL 透過mapping記憶體空間去
* 記憶體配置時,取得連續的page一定要是2的n次方
* 不能3, 7
* tasklet_schedule() 的順序
1. **Check whether the tasklet’s state is TASKLET_STATE_SCHED**. If it is, the tasklet is already scheduled to run and the function can immediately return.
2. **Call tasklet_schedule()**.
3. Save the state of the interrupt system, and then **disable local interrupts**.This ensures that nothing on this processor will mess with the tasklet code while **tasklet_schedule() is manipulating the tasklets.**
4. **Add the tasklet to be scheduled to the head of the tasklet_vec or tasklet_hi_vec linked list**, which is unique to each processor in the system.
5. **Raise the TASKLET_SOFTIRQ or HI_SOFTIRQ softirq, so do_softirq()** executes this tasklet in the near future.
6. **Restore** interrupts to their **previous state and return**
* kernel synchronization 的方法 (挑幾個寫出來)
* spin lock
* semaphores
* reader-writer seamphores
* mutexs
* completion variable
* BKL (the big kernel lock)
* sequential lock
* preemption disable
* Ordering and Barries
## CH7 Interrupts and interrupt handlers
* hardware就是比較慢,而硬體又要由kernel去hadle,所以要如何有效的使得整體processor work的performace不被拖垮,就有了兩種處理方式polling和這張的重點interrupt。
* 不同的device可以有的不同的interrupts,透過unique value來對應不同的interrupts。不同的value可以使OS區分開這個interrupt是哪個device的,並做出相應的服務。
* These interrupt values are often called **interrupt request(IRQ)**.
* The function the kernel runs in response to a specfific interrupt is called an interrupt handler or interrupt service routine(ISR).
* The processing of interrupts 被分成兩個部分(two halves)
* top half
* 比較具有即時性time-critical
* 跟硬體直接相關
* 不可睡覺不可被中斷
* bottom half (CH8)
* 比較不急,可以晚點做
* register/free an interrupt handler
* 使用 request_irq() (in <linux/interrupt.h>)
* 
* 第一個參數irq: interrupt number to allocate
* 第二個handler: function pointer to the actual interrupt handler
* 第三個flags
* 第四個\*name: 就名子,像是the keyboard interrupt on as PC is key-board
* 第五個\*dev: 這個是為了shared interrupts lines的關係用的,下面會講
* Note that **request_irq() can sleep** and therefore **cannot** be called from interrupt context or other situations where code cannot block.
* interrupt 是可以共享的,有IRQF_SHARED 這個flag可設
* 在free_irq()時
* if the specified interrupt line is not shared, this function removes the handler and disables the line.
* If the interrupt line is shared, the handler identified via dev is removed, but the interrupt line is disable only when the last handler is removed.
* 所以為什麼 a unique dev is important,因為 dev provides a unique cookie to enable the removal of only the desired interrupt handler from the interrupt line. Without this parameter, it would be impossible for the kernel to know which handler to remove on a given interrupt line.
* 但如果irq 沒有shared的話,dev 傳null就可以了
## CH8 Bottom Halves and Deferring work
* The job of bottom halves is to perform any interrupt-related work not performed by the interrupt handler.
* 哪些job要放在interrupt哪些放在bottom halves是寫device driver的人自己決定的,沒有標準答案, 但是課本建議
* If the work is time sensitive, perform it in the interrupt handler.
* If the work is related to the hardware, perform it in the interrupt handler.
* If the work needs to ensure that another interrupt (particularly the same interrupt) does not interrupt it, perform it in the interrupt handler.
* For everything else, consider performing the work in the bottom half.
* 
* BH 和 Task queues已經成為遺跡了在kernel 2.5後
* softirq
* 比較底層比較少用,比較critical,用在networking
* Softirqs are a set of statically defined bottom halves that can run simultaneously on any processor; **even two of the same type can run concurrently**.
* Tasklets
* flexible, dynamically created bottom halves built on top of softirqs.
* 比較常用
* Two different tasklets can run concurrently on different processors, but two of the same type of tasklet **cannot** run simultaneously
* Work queue
## CH9, 10 Kernel Synchronization
* If multiple threads of execution access and manipulate the same data at the same time, the threads may overwrite each other' s change.
* the Linux kernel is preemptive, so protection is essential.
* **critical sections and race condition**
* Code paths that access and manipulate shared data are called critical regions (also called critical sections).
翻譯:訪問和操作共享數據的那塊程式路徑稱為臨界區(也稱為臨界區)。
* It is a bug if it is possible for two threads of execution to be simultaneously executing within the same critical region. When this does occur, we call it a **race condition**
* 也就是說,這樣就會發生預期結果與實際結果不一
所以critical sections一次只能有一個thread進入
* **Locking**
* A lock provides such a mechanism; it works much like a lock on a door.
* When a thread enters the room, it locks the door behind it.When the thread is finished manipulating the shared data, it leaves the room and unlocks the door.
* If another thread reaches the door while it is locked, it must wait for the thread inside to exit the room and unlock the door before it can enter.Threads hold locks; locks protect data.
* 實際執行的指令(硬體instruction)
* an atomic **test and set** instruction
* 或是 **compare and exchange**
* On the popular x86 architecture
* 其實有很多種不同的lock,下面會講
* 像是 spin lock, semaphores, reader-writer spin lock, ......之類的
* **Deadlock**
* A deadlock is a condition involving one or more threads of execution and one or more resources, such that each thread waits for one of the resources, but all the resources are already held.
大致上就是: deadlock就是有一群(2個以上)thread在等待對方手中握有的資源。
* Atomic Operations
* Atomic operations provide instructions that execute atomically—without interruption.
* Atomic Integer Operation
* 在舊的系統中,雖然atomic_t 是 32bit,但是他是採用SPARC 機制,所以其實只有24 usable bits可以用,另外8個是用來lock的
* 在新的系統,提供 a fully usable 32-bit atomic_t (限制不在了)
* Atomic Bitwise Operation
* **Spin Locks**
* The most common lock in the Linux kernel is the spin lock.
* thread在等待unlock的時候,其實cpu是站著的空轉(不會sleep),所以cpu utilization會受影響,只是若是去sleep則需要做context switch(增加負擔),所以要取trade-off
* code defined in <asm/spinlock.h>. The actual usable interfaces are defined in <linux/spinlock.h>
* 
* 其實有提供各種不同的spin locks

* 另外,在執行bottom halves時,可以使用 spin_lock_bh(), spin_unlock_bh()
* As discussed in Chapter 8,“Bottom Halves and Deferring Work,”
在使用下半部時必須採取某些鎖定預防措施。函數 spin_lock_bh()獲得給定的鎖並禁用所有下半部分。函數 spin_unlock_bh()執行相反的操作
* Reader-Writer Spin Locks
* 又稱shared/exclusive 或 concurrent/exclusive locks
* Reader-writer spin locks provide separate reader and writer variants of the lock.
* One or more readers can concurrently hold the reader lock.The writer lock, conversely, can be held by at most one writer with no concurrent readers.
* **semaphores**
* Semaphores in Linux are sleeping locks.
* When a task attempts to acquire a semaphore that is unavailable, the semaphore places the task onto a wait queue and puts the task to sleep.
* defined in <asm/semaphore.h>.
* down_interruptible() attempts to acquire the given semaphore
* down_trylock() to try to acquire the given semaphore without blocking
* To release a given semaphore, call up()
* **Reader-Writer Semaphores**
* 功能跟 reader-writer spin lock差不多,但是只是用semaphores實作
* All reader-writer semaphores are mutexes—that is, their usage count is one
* down_read(), up_read()
* down_write(), up_write()
* **Mutex**
* a sleeping version of the spin lock
* Most users of semaphores instantiated a semaphore with a count of one and treated them as a mutual exclusion lock.
大概就是說,其實mutex就是利用semaphores但是count都設置成1,所以一次就只能有1個process進到critical region
* spin lock vs. semaphores

* Completion Variable
* BKL: The Big Kernel Lock
* can sleep while holding the BKL
* 所以interrupt context裡面不能用,一般的process contex可以
* The BKL is a recursive lock
* Sequential Locks
* The sequential lock, generally shortened to seq lock, is a newer type of lock introduced in the 2.6 kernel.
* Preemption Disabling
* Ordering and Barriers

## CH11 Timers and Time Management
* The Tick Rate: HZ
* The frequency of the system timer (the tick rate) is programmed on system boot based on a static preprocessor define, HZ.
* **Jiffies**
* The global variable jiffies holds the number of ticks that have occurred since the system booted.
* On boot, the kernel initializes the variable to zero, and it is incremented by one during each timer interrupt.Thus, because there are HZ timer interrupts in a second, there are HZ jiffies in a second.
* The system uptime is therefore jiffies/HZ seconds.
* 但有可能會overflow
## CH12 Memory Management
* Unlike user-space, the kernel is not always afforded the capability to easily allocate memory.
* Memory allocation inside the kernel is not easy as outside the kernel
* **Page**
* The kernel treats physical pages as the basic unit of memory management.
* the memory management unit (**MMU**, the hardware that **manages memory** and **performs virtual to physical address translations**) typically deals in pages.
* The kernel represents every physical page on the system with a struct page structure.
* page structure is associated with physical pages, not virtual pages.
* struct page{} 中有某些field:
* unsigned long flags;
* 是Bit flags,其中有bit會代表是否dirty和locked與否
* atomic_t _count;
* stores the usage count of the page
* void *virtual;
* page’s virtual address
* Some memory (called high memory) is not permanently mapped in the kernel’s address space. In that case, this field is NULL, and the page must be dynamically mapped if needed.
* **Zones**
* 因為受硬體的限制,kernel無法將每個page都是為一樣的。一些page它們在memory中的physical address,不能用於某些任務。
* the kernel divides pages into different zones.
* The kernel uses the zones to group pages of similar properties.
* In particular, Linux has to deal with two shortcomings of hardware with respect to memory addressing:


* kmalloc()
* The kmalloc() function’s operation is similar to that of user-space’s familiar malloc().
* The kmalloc() function guarantees that the pages are physically contiguous (and virtually contiguous).
* However, in user-space’s malloc() can not allocate necessarily pyhsical contiguous.
* **vmalloc()**
* The vmalloc() function works in a similar fashion to kmalloc(), except it allocates memory that is only virtually contiguous and not necessarily physically contiguous.
* 也是因為不能有實際的連續,所以從虛擬地址轉實際地址時,無法直接mapping,如此,就需要更大的TLB去mapping,因此通常不太用,除非真的真的需要allocate很大的memory.
* **Slab Layer**
* 分配和釋放數據結構是最常見的kernel操作之一,為了方便數據的頻繁分配和釋放,progammers會引進 **free list**.
* A free list contains a block of available, already allocated, data structures.
* free list 就可以當作是暫存區,如此一來就不用一直allocate和free了。
* 然而,kernel中free list的主要問題之一是不存在 global control
* 所以, Linux kernel 提供了 slab layer (also called the slab allocator).
* The slab layer divides different objects into groups called caches
* each of cache stores a different type of object.There is one cache per object type.
* For example, one cache is for process descriptors (a free list of task_struct structures), whereas another cache is for inode objects (struct inode).
* The caches are then divided into slabs (hence the name of this subsystem).
* The slabs are composed of one or more physically contiguous pages.Typically, slabs are composed of only a single page. Each cache may consist of multiple slabs.
* Each slab contains some number of objects.
*

## CH13 The Virtial Filesystem
* 基礎概念可以直接看這幾個網站,有點概念再去看課本會比較清楚
資料: https://zhuanlan.zhihu.com/p/354100369, https://blog.51cto.com/liangchaoxi/5422262, 課本
* VFS, which provides the abstraction allowing myriad filesystems to behave as one.
* 大意: Virtual Filesystem(VFS)提供一個抽象層(介面?)使得不同的filesystems(像是硬碟、隨身碟、磁碟等等亂七八糟的)都能夠用一套標準來使用
* VFS is object-oriented. ther are four primary object type
* superblock: represents a specific mounted filesystem
* 記錄本文件系統的整體信息,包括inode/block的總量、使用量、剩餘量、文件系統的格式及相關信息等。
* inode: represnets a specfific file.
* 記錄file文件的屬性訊息(像是文件大、用戶標示符號、文件讀取...等/ 也被稱為metadata),也就是,一個文件佔用一個inode,記錄這個文件的數據所在的塊號;
* dentry: represents a directory entry, which is a single componenet of a path.
* directory entry: (課本).A path example is /home/wolfman/butter—the root directory /, the directories home and wolfman, and the file butter are all directory entries, called dentries. In Unix, directories are actually normal files that simply list the files contained therein. Because a directory is a file to the VFS, the same operations performed on files can be performed on directories.
* 翻譯: 一個路徑例子是/home/wolfman/butter——根目錄/,目錄home和wolfman,以及文件butter都是目錄條目,稱為dentries。在Unix,目錄實際上是普通文件,只是簡單地列出其中包含的文件。因為目錄是 VFS 的文件,所以對文件執行的相同操作可以對目錄執行。
* file: represents an open file as associated with a process.
* 然後每種object都會有自己的operations object
* super_operations
* inode_operations
* dentry_operations
* file_operations
* superblock
* 我個人是感覺有點像是"preocess descriptor"的概念,但是它是"filesystm的descriptor",一樣都是使用雙向linked list串在一起
* 結構為struct super_block (in <linux/fs.h>)
* 其中s_op為其super_operations
* When a filesystem needs to perform an operation on its superblock, it follows the pointers from its superblock object to the desired method.
* for a example, write as:
```
sb->s_op->write_super(sb);
```
* 像是物件導向的寫法,但C又不是真的物件導向語言,所以要將本身(parent)傳入當作參數才行
* inode
* The inode object represents all the information needed by the kernel to manipulate a file or directory.
* 在Unix系統上, inode(記錄著file infroamation或是 file controls)和file data是分開來儲存的
* 也有inode_operations
* 我個人覺得這個就是把它當成"file的descriptor"理解就好了
* Dentry (not physically stored on the disk)
* VFS treats directories as a type of file.
* 在課本的例子中 "/bin/vi", bin 和 vi 都被視為file文件——bin 是特殊目錄文件,vi 是常規文件,而這些file 都有個對應的inode object.
* 所以其實不要有dentry也是可以找的到文件的只是,因為VFS 經常需要執行特定於目錄的操作,例如路徑名查找。
* Path name lookup involves translating each component of a path, ensuring it is valid, and following it to the next component
* 翻譯: 路徑名查找涉及翻譯路徑的每個組件,確保它是有效的,並遵循它到下一個組件。
* 為了促進這一點,VFS 採用了目錄條目 (dentry) 的概念。
* struct dentry defined in <linux/dcache.h>
* file
* The file object is used to represent a file opened by a process.
## CH14 The Block I/O Layer
* 有兩種device
* Block devices
* 塊設備是硬件設備,通過隨機訪問固定大小的block來進行操作(not necessarily sequential)
* 像是 disk, blu-ray raders, flash memort,...等
* 通常用於掛載文件系統
* 要如何管理block device以及其請求,就是 block I/O layer.
* character device
* 是以連續的數據流式進行訪問的設備
* 像是 serial ports, keyboard,....等
* 兩者差在:
* 設備是否能隨機訪問數據,即能否從一個位置跳轉到另一個位置。
* 管理Block devices比管理character device需要更多的注意和準備工作。Block devices需要能夠足夠在媒體的任意位置進行後續導航。
* Block devices的性能對於系統來說非常重要,因此內核為Block devices提供了專門的子系統來管理它。Block devices的複合性提供了許許多多性能優化的空間。
* block I/O layer
* The basic container for block I/O within the kernel is the bio structure
* I/O vectors
* The bi_io_vec field points to an array of bio_vec structures.

* I/O Schedulers
* An I/O scheduler aims to enhance the overall performance of the system by optimizing the order and timing of requests in the queue.
* An I/O scheduler works by managing a block device’s request queue.
* I/O schedulers perform two primary actions to minimize seeks: **merging** and **sorting**
* Merging: the coalescing of two or more requests into one.
* 如果對列中已經存在一個請求要讀取disk上相關的區域(例如,同一個文件的較早的block),這兩個請求可以合併為一個請求,操作一個或多個相對的disk片區。通過並請求,I/O scheduler將多個請求的開銷減少到一個請求中。
## CH15 The Process Address Space
* **Address Space**
* managing its own memory, the kernel also has to manage the memory of user-space processes.This memory is called the process address space, which is the representation of memory given to each user-space process on the system.
* The process address space consists of the virtual memory addressable by a process and the addresses within the virtual memory that the process is allowed to use.
* The Memory Descriptor
* This structure contains all the information related to the process address space. The memory descriptor is represented by **struct mm_struct** and defined in <linux/mm_types.h>

* **Virtual Memory Areas**
* The memory area structure, **vm_area_struct**, represents memory areas. It is defined in <linux/mm_types.h>. In the Linux kernel, memory areas are often called **virtual memory areas (VMAs)**
* The vm_area_struct structure describes a single memory area over a contiguous interval in a given address space.
* The kernel treats each memory area as a unique memory object. Each memory area possesses certain properties, such as permissions and a set of associated operations.
* each VMA structure can represent different types of memory areas
* for example, memory-mapped files or the process’s user-space stack.
* 
## CH16 The Page Cache and Page Writeback
我累了,不想讀