###### tags: `Linux` `ARM` `底層` # AArch64 EL1 IRQ handling in Linux 主要想追蹤一下 Linux 是怎麼做 irq handling 的。 本篇追蹤的 Linux 版本為 5.17.1 # head.S [arch/arm64/kernel/head.S](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/head.S) 先找 vector table 的配置 ## primary_entry ```c /* * The following callee saved general purpose registers are used on the * primary lowlevel boot path: * * Register Scope Purpose * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset * x28 __create_page_tables() callee preserved temp register * x19/x20 __primary_switch() callee preserved temp registers * x24 __primary_switch() .. relocate_kernel() current RELR displacement */ SYM_CODE_START(primary_entry) bl preserve_boot_args bl init_kernel_el // w0=cpu_boot_mode ... b __primary_switch SYM_CODE_END(primary_entry) /* * Preserve the arguments passed by the bootloader in x0 .. x3 */ SYM_CODE_START_LOCAL(preserve_boot_args) mov x21, x0 // x21=FDT ... b dcache_inval_poc // tail call SYM_CODE_END(preserve_boot_args) ``` * `preserve_boot_args` 將 fdt 的位址存放到 x21,並另外做一些事情 * `init_kernel_el` 將 Exception Level 從 EL2 降級到 EL1 ## init_kernel_el ```c /* * Starting from EL2 or EL1, configure the CPU to execute at the highest * reachable EL supported by the kernel in a chosen default state. If dropping * from EL2 to EL1, configure EL2 before configuring EL1. * * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. 
*/ SYM_FUNC_START(init_kernel_el) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq init_el2 SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 isb mov_q x0, INIT_PSTATE_EL1 msr spsr_el1, x0 msr elr_el1, lr mov w0, #BOOT_CPU_MODE_EL1 eret SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) mov_q x0, HCR_HOST_NVHE_FLAGS msr hcr_el2, x0 isb init_el2_state /* Hypervisor stub */ adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 isb /* * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, * making it impossible to start in nVHE mode. Is that * compliant with the architecture? Absolutely not! */ mrs x0, hcr_el2 and x0, x0, #HCR_E2H cbz x0, 1f /* Switching to VHE requires a sane SCTLR_EL1 as a start */ mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr_s SYS_SCTLR_EL12, x0 /* * Force an eret into a helper "function", and let it return * to our original caller... This makes sure that we have * initialised the basic PSTATE state. */ mov x0, #INIT_PSTATE_EL2 msr spsr_el1, x0 adr x0, __cpu_stick_to_vhe msr elr_el1, x0 eret 1: mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 msr elr_el2, lr mov w0, #BOOT_CPU_MODE_EL2 eret __cpu_stick_to_vhe: mov x0, #HVC_VHE_RESTART hvc #0 mov x0, #BOOT_CPU_MODE_EL2 ret SYM_FUNC_END(init_kernel_el) ``` * 設置 EL2 vector table 為 `__hyp_stub_vectors` * 一般情況(`HCR_EL2.E2H` 為 0)會跳到 `1:`,將 `elr_el2` 設為 `lr` 後以 `eret` 降級到 EL1 並返回呼叫者 * 只有當 `HCR_EL2.E2H` 被固定為 1(無法以 nVHE 模式啟動)時,才會 `eret` 到 `__cpu_stick_to_vhe`,此時仍停留在 EL2 ## __primary_switch ```c SYM_FUNC_START_LOCAL(__primary_switch) ... adrp x1, init_pg_dir bl __enable_mmu ... ldr x8, =__primary_switched adrp x0, __PHYS_OFFSET br x8 SYM_FUNC_END(__primary_switch) ``` * 呼叫 `__primary_switched` ## __primary_switched ```c /* * The following fragment of code is executed with the MMU enabled. * * x0 = __PHYS_OFFSET */ SYM_FUNC_START_LOCAL(__primary_switched) adr_l x4, init_task init_cpu_task x4, x5, x6 adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb stp x29, x30, [sp, #-16]! 
mov x29, sp str_l x21, __fdt_pointer, x5 // Save FDT pointer ldr_l x4, kimage_vaddr // Save the offset between sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings // Clear BSS adr_l x0, __bss_start mov x1, xzr adr_l x2, __bss_stop sub x2, x2, x0 bl __pi_memset dsb ishst // Make zero page visible to PTW ... mov x0, x21 // pass FDT address in x0 bl early_fdt_map // Try mapping the FDT early bl init_feature_override // Parse cpu feature overrides ... bl switch_to_vhe // Prefer VHE if possible ldp x29, x30, [sp], #16 bl start_kernel ASM_BUG() SYM_FUNC_END(__primary_switched) ``` * 可以看到 clear bss,傳遞 fdt address 給 `early_fdt_map` * 設置 EL1 vector table 為 `vectors`,這就是我感興趣的部分! # entry.S [arch/arm64/kernel/entry.S](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/entry.S) ## vectors ```c /* * Exception vectors. */ .pushsection ".entry.text", "ax" .align 11 SYM_CODE_START(vectors) kernel_ventry 1, t, 64, sync // Synchronous EL1t kernel_ventry 1, t, 64, irq // IRQ EL1t kernel_ventry 1, t, 64, fiq // FIQ EL1h kernel_ventry 1, t, 64, error // Error EL1t kernel_ventry 1, h, 64, sync // Synchronous EL1h kernel_ventry 1, h, 64, irq // IRQ EL1h kernel_ventry 1, h, 64, fiq // FIQ EL1h kernel_ventry 1, h, 64, error // Error EL1h kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0 kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0 kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0 kernel_ventry 0, t, 64, error // Error 64-bit EL0 kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0 kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0 kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0 kernel_ventry 0, t, 32, error // Error 32-bit EL0 SYM_CODE_END(vectors) ``` 而 `kernel_ventry` 定義如下: ```c .macro kernel_ventry, el:req, ht:req, regsize:req, label:req .align 7 .Lventry_start\@: .if \el == 0 ... .endif sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. 
* Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT) * should always be zero. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp b el\el\ht\()_\regsize\()_\label 0: /* * Either we've just detected an overflow, or we've taken an exception * while on the overflow stack. Either way, we won't return to * userspace, and can clobber EL0 registers to free up GPRs. */ /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ sub x0, sp, x0 msr tpidrro_el0, x0 /* Switch to the overflow stack */ adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 /* * Check whether we were already on the overflow stack. This may happen * after panic() re-enables interrupts. */ mrs x0, tpidr_el0 // sp of interrupted context sub x0, sp, x0 // delta with top of overflow stack tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? b.ne __bad_stack // no? -> bad stack pointer /* We were already on the overflow stack. Restore sp/x0 and carry on. */ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif b el\el\ht\()_\regsize\()_\label .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? 
.endm ``` 目前重點先關注 IRQ EL1h 的部分,經過展開後會變成 `el1h_64_irq` * `\()` 用來區隔 macro 參數名稱,[參考手冊](https://sourceware.org/binutils/docs/as/Macro.html) 後面建立了各 exception handler: ```c /* * Early exception handlers */ entry_handler 1, t, 64, sync entry_handler 1, t, 64, irq entry_handler 1, t, 64, fiq entry_handler 1, t, 64, error entry_handler 1, h, 64, sync entry_handler 1, h, 64, irq entry_handler 1, h, 64, fiq entry_handler 1, h, 64, error entry_handler 0, t, 64, sync entry_handler 0, t, 64, irq entry_handler 0, t, 64, fiq entry_handler 0, t, 64, error entry_handler 0, t, 32, sync entry_handler 0, t, 32, irq entry_handler 0, t, 32, fiq entry_handler 0, t, 32, error ``` `entry_handler` macro 定義如下: ```c .macro entry_handler el:req, ht:req, regsize:req, label:req SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) kernel_entry \el, \regsize mov x0, sp bl el\el\ht\()_\regsize\()_\label\()_handler .if \el == 0 b ret_to_user .else b ret_to_kernel .endif SYM_CODE_END(el\el\ht\()_\regsize\()_\label) .endm ``` 以 `entry_handler 1, h, 64, irq` 來說,其會展開成以下: ```c SYM_CODE_START_LOCAL(el1h_64_irq) kernel_entry 1, 64 mov x0, sp bl el1h_64_irq_handler b ret_to_kernel SYM_CODE_END(el1h_64_irq) ``` # exception.h [arch/arm64/include/asm/exception.h](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/include/asm/exception.h) 包含了 `el1h_64_irq_handler` 的宣告: ```c asmlinkage void el1t_64_sync_handler(struct pt_regs *regs); asmlinkage void el1t_64_irq_handler(struct pt_regs *regs); asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs); asmlinkage void el1t_64_error_handler(struct pt_regs *regs); asmlinkage void el1h_64_sync_handler(struct pt_regs *regs); asmlinkage void el1h_64_irq_handler(struct pt_regs *regs); asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs); asmlinkage void el1h_64_error_handler(struct pt_regs *regs); asmlinkage void el0t_64_sync_handler(struct pt_regs *regs); asmlinkage void el0t_64_irq_handler(struct pt_regs *regs); asmlinkage void el0t_64_fiq_handler(struct 
pt_regs *regs); asmlinkage void el0t_64_error_handler(struct pt_regs *regs); asmlinkage void el0t_32_sync_handler(struct pt_regs *regs); asmlinkage void el0t_32_irq_handler(struct pt_regs *regs); asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs); asmlinkage void el0t_32_error_handler(struct pt_regs *regs); ``` # entry-common.c [arch/arm64/kernel/entry-common.c](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/entry-common.c) ## el1h_64_irq_handler ```c asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs) { el1_interrupt(regs, handle_arch_irq); } ``` ## el1_interrupt ```c static void noinstr el1_interrupt(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { /* Mask {I, F} bits */ write_sysreg(DAIF_PROCCTX_NOIRQ, daif); if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) __el1_pnmi(regs, handler); else __el1_irq(regs, handler); } ``` ## __el1_irq ```c static __always_inline void __el1_irq(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { enter_from_kernel_mode(regs); irq_enter_rcu(); do_interrupt_handler(regs, handler); irq_exit_rcu(); /* * Note: thread_info::preempt_count includes both thread_info::count * and thread_info::need_resched, and is not equivalent to * preempt_count(). 
*/ if (IS_ENABLED(CONFIG_PREEMPTION) && READ_ONCE(current_thread_info()->preempt_count) == 0) arm64_preempt_schedule_irq(); exit_to_kernel_mode(regs); } ``` ## do_interrupt_handler ```c static void do_interrupt_handler(struct pt_regs *regs, void (*handler)(struct pt_regs *)) { struct pt_regs *old_regs = set_irq_regs(regs); if (on_thread_stack()) call_on_irq_stack(regs, handler); else handler(regs); set_irq_regs(old_regs); } ``` 追到底就是呼叫 handler,也就是 `handle_arch_irq` # irq.c [arch/arm64/kernel/irq.c](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/irq.c) ## handle_arch_irq ```c static void default_handle_irq(struct pt_regs *regs) { panic("IRQ taken without a root IRQ handler\n"); } void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq; int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) { if (handle_arch_irq != default_handle_irq) return -EBUSY; handle_arch_irq = handle_irq; pr_info("Root IRQ handler: %ps\n", handle_irq); return 0; } ``` * 可以看到 `handle_arch_irq` 是一個全域變數,預設為 `default_handle_irq`,kernel 需要在初始化階段呼叫 `set_handle_irq` 來配置 `handle_arch_irq` * `drivers/irqchip/` 底下的 interrupt controller drivers 會呼叫 `set_handle_irq` * 這邊追蹤 `drivers/irqchip/irq-bcm2836.c` # irq-bcm2836.c [drivers/irqchip/irq-bcm2836.c](https://elixir.bootlin.com/linux/v5.17.1/source/drivers/irqchip/irq-bcm2836.c) ## bcm2836_arm_irqchip_l1_intc_of_init ```c static int __init bcm2836_arm_irqchip_l1_intc_of_init(struct device_node *node, struct device_node *parent) { intc.base = of_iomap(node, 0); if (!intc.base) { panic("%pOF: unable to map local interrupt registers\n", node); } bcm2835_init_local_timer_frequency(); intc.domain = irq_domain_add_linear(node, LAST_IRQ + 1, &bcm2836_arm_irqchip_intc_ops, NULL); if (!intc.domain) panic("%pOF: unable to create IRQ domain\n", node); irq_domain_update_bus_token(intc.domain, DOMAIN_BUS_WIRED); bcm2836_arm_irqchip_smp_init(); set_handle_irq(bcm2836_arm_irqchip_handle_irq); return 0; } ``` * 這邊註冊 irq 
handler 為 `bcm2836_arm_irqchip_handle_irq` ## bcm2836_arm_irqchip_handle_irq ```c static void __exception_irq_entry bcm2836_arm_irqchip_handle_irq(struct pt_regs *regs) { int cpu = smp_processor_id(); u32 stat; stat = readl_relaxed(intc.base + LOCAL_IRQ_PENDING0 + 4 * cpu); if (stat) { u32 hwirq = ffs(stat) - 1; generic_handle_domain_irq(intc.domain, hwirq); } } ``` * 最終呼叫 `generic_handle_domain_irq` * TODO: 搞懂 `intc`、`hwirq` 意義,目前看起來 `intc` 會從 fdt 取得資訊,`hwirq` 應該是對應裝置的數字,但不是很確定