###### tags: `Linux` `ARM` `底層`
# AArch64 EL1 IRQ handling in Linux
主要想追蹤一下 Linux 是怎麼做 irq handling 的。
本篇追蹤的 Linux 版本為 5.17.1
# head.S
[arch/arm64/kernel/head.S](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/head.S)
先找 vector table 的配置
## primary_entry
```c
/*
* The following callee saved general purpose registers are used on the
* primary lowlevel boot path:
*
* Register Scope Purpose
* x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0
* x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset
* x28 __create_page_tables() callee preserved temp register
* x19/x20 __primary_switch() callee preserved temp registers
* x24 __primary_switch() .. relocate_kernel() current RELR displacement
*/
SYM_CODE_START(primary_entry)
bl preserve_boot_args
bl init_kernel_el // w0=cpu_boot_mode
...
b __primary_switch
SYM_CODE_END(primary_entry)
/*
* Preserve the arguments passed by the bootloader in x0 .. x3
*/
SYM_CODE_START_LOCAL(preserve_boot_args)
mov x21, x0 // x21=FDT
...
b dcache_inval_poc // tail call
SYM_CODE_END(preserve_boot_args)
```
* `preserve_boot_args` 將 bootloader 透過 x0 傳入的 fdt 位址保存到 x21,最後以 tail call 的方式跳到 `dcache_inval_poc`(中間省略的部分此處不展開)
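* `init_kernel_el` 依照開機時所在的 Exception Level 做初始化:若從 EL2 開機,會先設定 EL2 再設定 EL1(一般情況下降級到 EL1);若從 EL1 開機則直接設定 EL1。回傳值 w0 為 `BOOT_CPU_MODE_EL1` 或 `BOOT_CPU_MODE_EL2`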
## init_kernel_el
```c
/*
* Starting from EL2 or EL1, configure the CPU to execute at the highest
* reachable EL supported by the kernel in a chosen default state. If dropping
* from EL2 to EL1, configure EL2 before configuring EL1.
*
* Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if
* SCTLR_ELx.EOS is clear), we place an ISB prior to ERET.
*
* Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if
* booted in EL1 or EL2 respectively.
*
* Control flow of this routine:
*   - CurrentEL != EL2: fall through to init_el1, eret back to the caller (lr).
*   - CurrentEL == EL2 and HCR_EL2.E2H reads back clear: label 1, eret to lr.
*   - CurrentEL == EL2 and HCR_EL2.E2H reads back set: eret into
*     __cpu_stick_to_vhe, which issues an HVC and then rets to the caller.
*/
SYM_FUNC_START(init_kernel_el)
// Dispatch on the exception level we were booted at
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
b.eq init_el2
SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
// Booted at EL1: load SCTLR_EL1 with its initial MMU-off value
mov_q x0, INIT_SCTLR_EL1_MMU_OFF
msr sctlr_el1, x0
isb
// Program SPSR_EL1/ELR_EL1 so that the eret resumes at the caller (lr)
mov_q x0, INIT_PSTATE_EL1
msr spsr_el1, x0
msr elr_el1, lr
mov w0, #BOOT_CPU_MODE_EL1
eret
SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
// Booted at EL2: start by loading HCR_EL2 with the nVHE host flags
mov_q x0, HCR_HOST_NVHE_FLAGS
msr hcr_el2, x0
isb
// init_el2_state is a macro defined outside this excerpt; presumably it
// performs the remaining EL2 register setup -- TODO confirm
init_el2_state
/* Hypervisor stub */
// Install __hyp_stub_vectors as the EL2 vector table (VBAR_EL2)
adr_l x0, __hyp_stub_vectors
msr vbar_el2, x0
isb
/*
* Fruity CPUs seem to have HCR_EL2.E2H set to RES1,
* making it impossible to start in nVHE mode. Is that
* compliant with the architecture? Absolutely not!
*/
// Read HCR_EL2 back: if E2H is clear, take the regular path at label 1
mrs x0, hcr_el2
and x0, x0, #HCR_E2H
cbz x0, 1f
/* Switching to VHE requires a sane SCTLR_EL1 as a start */
mov_q x0, INIT_SCTLR_EL1_MMU_OFF
msr_s SYS_SCTLR_EL12, x0
/*
* Force an eret into a helper "function", and let it return
* to our original caller... This makes sure that we have
* initialised the basic PSTATE state.
*/
// NOTE(review): the *_el1 register names are used while running at EL2 with
// E2H set; presumably these accesses are redirected to the EL2 registers in
// that configuration -- confirm against the Arm ARM
mov x0, #INIT_PSTATE_EL2
msr spsr_el1, x0
adr x0, __cpu_stick_to_vhe
msr elr_el1, x0
eret
1:
// E2H clear: initialise SCTLR_EL1, then eret to the caller via ELR_EL2.
// SPSR_EL2 is not written in this excerpt (presumably set up by
// init_el2_state -- TODO confirm)
mov_q x0, INIT_SCTLR_EL1_MMU_OFF
msr sctlr_el1, x0
msr elr_el2, lr
mov w0, #BOOT_CPU_MODE_EL2
eret
__cpu_stick_to_vhe:
// Issue the HVC_VHE_RESTART hypercall, then return BOOT_CPU_MODE_EL2 to the
// original caller of init_kernel_el (lr is not modified on this path)
mov x0, #HVC_VHE_RESTART
hvc #0
mov x0, #BOOT_CPU_MODE_EL2
ret
SYM_FUNC_END(init_kernel_el)
```
* 設置 EL2 vector table 為 `__hyp_stub_vectors`
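* 接著讀回 `HCR_EL2.E2H`:若為 0(一般情況),走 `1:` 路徑,設定 `sctlr_el1` 後 `eret` 回到呼叫者並降級到 EL1;若為 1(硬體強制 VHE、無法以 nVHE 啟動),才會 `eret` 到 `__cpu_stick_to_vhe`,由它發出 `hvc`(`HVC_VHE_RESTART`)後再 `ret` 回到原本的呼叫者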
## __primary_switch
```c
SYM_FUNC_START_LOCAL(__primary_switch)
...
adrp x1, init_pg_dir
bl __enable_mmu
...
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8
SYM_FUNC_END(__primary_switch)
```
* 透過 `__enable_mmu` 開啟 MMU 後,以 `br x8` 跳轉到 `__primary_switched`(是跳轉而非函式呼叫,不會返回),此時 x0 為 `__PHYS_OFFSET`
## __primary_switched
```c
/*
* The following fragment of code is executed with the MMU enabled.
*
* x0 = __PHYS_OFFSET
*/
SYM_FUNC_START_LOCAL(__primary_switched)
adr_l x4, init_task
init_cpu_task x4, x5, x6
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb
stp x29, x30, [sp, #-16]!
mov x29, sp
str_l x21, __fdt_pointer, x5 // Save FDT pointer
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset
dsb ishst // Make zero page visible to PTW
...
mov x0, x21 // pass FDT address in x0
bl early_fdt_map // Try mapping the FDT early
bl init_feature_override // Parse cpu feature overrides
...
bl switch_to_vhe // Prefer VHE if possible
ldp x29, x30, [sp], #16
bl start_kernel
ASM_BUG()
SYM_FUNC_END(__primary_switched)
```
* 可以看到 clear bss,傳遞 fdt address 給 `early_fdt_map`
* 設置 EL1 vector table 為 `vectors`,這就是我感興趣的部分!
# entry.S
[arch/arm64/kernel/entry.S](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/entry.S)
## vectors
```c
/*
* Exception vectors.
*
* 16 entries in four groups of {sync, irq, fiq, error}: exceptions taken
* from EL1t, from EL1h, from 64-bit EL0 and from 32-bit EL0. Each
* kernel_ventry invocation fills one slot and ends by branching to the
* matching el<el><ht>_<regsize>_<label> stub.
*/
.pushsection ".entry.text", "ax"
// 2^11 = 2048-byte alignment: 16 slots of 128 bytes each (kernel_ventry
// aligns every slot with .align 7 and checks the 128-byte limit with .org)
.align 11
SYM_CODE_START(vectors)
kernel_ventry 1, t, 64, sync // Synchronous EL1t
kernel_ventry 1, t, 64, irq // IRQ EL1t
kernel_ventry 1, t, 64, fiq // FIQ EL1t
kernel_ventry 1, t, 64, error // Error EL1t
kernel_ventry 1, h, 64, sync // Synchronous EL1h
kernel_ventry 1, h, 64, irq // IRQ EL1h
kernel_ventry 1, h, 64, fiq // FIQ EL1h
kernel_ventry 1, h, 64, error // Error EL1h
kernel_ventry 0, t, 64, sync // Synchronous 64-bit EL0
kernel_ventry 0, t, 64, irq // IRQ 64-bit EL0
kernel_ventry 0, t, 64, fiq // FIQ 64-bit EL0
kernel_ventry 0, t, 64, error // Error 64-bit EL0
kernel_ventry 0, t, 32, sync // Synchronous 32-bit EL0
kernel_ventry 0, t, 32, irq // IRQ 32-bit EL0
kernel_ventry 0, t, 32, fiq // FIQ 32-bit EL0
kernel_ventry 0, t, 32, error // Error 32-bit EL0
SYM_CODE_END(vectors)
```
而 `kernel_ventry` 定義如下:
```c
.macro kernel_ventry, el:req, ht:req, regsize:req, label:req
.align 7
.Lventry_start\@:
.if \el == 0
...
.endif
sub sp, sp, #PT_REGS_SIZE
#ifdef CONFIG_VMAP_STACK
/*
* Test whether the SP has overflowed, without corrupting a GPR.
* Task and IRQ stacks are aligned so that SP & (1 << THREAD_SHIFT)
* should always be zero.
*/
add sp, sp, x0 // sp' = sp + x0
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
tbnz x0, #THREAD_SHIFT, 0f
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
b el\el\ht\()_\regsize\()_\label
0:
/*
* Either we've just detected an overflow, or we've taken an exception
* while on the overflow stack. Either way, we won't return to
* userspace, and can clobber EL0 registers to free up GPRs.
*/
/* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */
msr tpidr_el0, x0
/* Recover the original x0 value and stash it in tpidrro_el0 */
sub x0, sp, x0
msr tpidrro_el0, x0
/* Switch to the overflow stack */
adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
/*
* Check whether we were already on the overflow stack. This may happen
* after panic() re-enables interrupts.
*/
mrs x0, tpidr_el0 // sp of interrupted context
sub x0, sp, x0 // delta with top of overflow stack
tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range?
b.ne __bad_stack // no? -> bad stack pointer
/* We were already on the overflow stack. Restore sp/x0 and carry on. */
sub sp, sp, x0
mrs x0, tpidrro_el0
#endif
b el\el\ht\()_\regsize\()_\label
.org .Lventry_start\@ + 128 // Did we overflow the ventry slot?
.endm
```
目前重點先關注 IRQ EL1h 的部分:`kernel_ventry 1, h, 64, irq` 結尾的 `b el\el\ht\()_\regsize\()_\label` 經過展開後會變成 `b el1h_64_irq`
* `\()` 用來區隔 macro 參數名稱,[參考手冊](https://sourceware.org/binutils/docs/as/Macro.html)
後面建立了各 exception handler:
```c
/*
* Early exception handlers
*
* One entry_handler invocation per vector slot, in the same order as the
* kernel_ventry list in `vectors`. Each invocation defines the local symbol
* el<el><ht>_<regsize>_<label> (for example el1h_64_irq), which is the
* branch target of the kernel_ventry invoked with the same arguments.
*/
entry_handler 1, t, 64, sync
entry_handler 1, t, 64, irq
entry_handler 1, t, 64, fiq
entry_handler 1, t, 64, error
entry_handler 1, h, 64, sync
entry_handler 1, h, 64, irq
entry_handler 1, h, 64, fiq
entry_handler 1, h, 64, error
entry_handler 0, t, 64, sync
entry_handler 0, t, 64, irq
entry_handler 0, t, 64, fiq
entry_handler 0, t, 64, error
entry_handler 0, t, 32, sync
entry_handler 0, t, 32, irq
entry_handler 0, t, 32, fiq
entry_handler 0, t, 32, error
```
`entry_handler` macro 定義如下:
```c
/*
* entry_handler - emit the out-of-line stub for one exception vector.
*
* Arguments (all required): el, ht, regsize, label -- the same four values
* passed to the corresponding kernel_ventry.
*
* The generated local symbol el<el><ht>_<regsize>_<label>:
*   1. invokes kernel_entry with \el and \regsize (macro defined elsewhere),
*   2. copies sp into x0 as the first argument of the C handler,
*   3. calls el<el><ht>_<regsize>_<label>_handler,
*   4. branches to ret_to_user when \el == 0, otherwise to ret_to_kernel.
*/
.macro entry_handler el:req, ht:req, regsize:req, label:req
SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label)
kernel_entry \el, \regsize
// x0 = sp: the C handlers are declared as taking a struct pt_regs *
mov x0, sp
bl el\el\ht\()_\regsize\()_\label\()_handler
// Pick the return path based on the exception level we came from
.if \el == 0
b ret_to_user
.else
b ret_to_kernel
.endif
SYM_CODE_END(el\el\ht\()_\regsize\()_\label)
.endm
```
以 `entry_handler 1, h, 64, irq` 來說,其會展開成以下:
```c
// Expansion of `entry_handler 1, h, 64, irq`
// (el = 1, ht = h, regsize = 64, label = irq)
SYM_CODE_START_LOCAL(el1h_64_irq)
kernel_entry 1, 64
// x0 = sp: argument for el1h_64_irq_handler(struct pt_regs *regs)
mov x0, sp
bl el1h_64_irq_handler
// el is 1, so the .else arm of the macro is the one emitted
b ret_to_kernel
SYM_CODE_END(el1h_64_irq)
```
# exception.h
[arch/arm64/include/asm/exception.h](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/include/asm/exception.h)
包含了 `el1h_64_irq_handler` 的宣告:
```c
asmlinkage void el1t_64_sync_handler(struct pt_regs *regs);
asmlinkage void el1t_64_irq_handler(struct pt_regs *regs);
asmlinkage void el1t_64_fiq_handler(struct pt_regs *regs);
asmlinkage void el1t_64_error_handler(struct pt_regs *regs);
asmlinkage void el1h_64_sync_handler(struct pt_regs *regs);
asmlinkage void el1h_64_irq_handler(struct pt_regs *regs);
asmlinkage void el1h_64_fiq_handler(struct pt_regs *regs);
asmlinkage void el1h_64_error_handler(struct pt_regs *regs);
asmlinkage void el0t_64_sync_handler(struct pt_regs *regs);
asmlinkage void el0t_64_irq_handler(struct pt_regs *regs);
asmlinkage void el0t_64_fiq_handler(struct pt_regs *regs);
asmlinkage void el0t_64_error_handler(struct pt_regs *regs);
asmlinkage void el0t_32_sync_handler(struct pt_regs *regs);
asmlinkage void el0t_32_irq_handler(struct pt_regs *regs);
asmlinkage void el0t_32_fiq_handler(struct pt_regs *regs);
asmlinkage void el0t_32_error_handler(struct pt_regs *regs);
```
# entry-common.c
[arch/arm64/kernel/entry-common.c](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/entry-common.c)
## el1h_64_irq_handler
```c
/*
* el1h_64_irq_handler - C entry point for an IRQ taken from EL1h.
* @regs: register frame passed in x0 by the el1h_64_irq assembly stub
*
* Forwards to el1_interrupt() with the root IRQ handler pointer
* handle_arch_irq as the handler to run.
*/
asmlinkage void noinstr el1h_64_irq_handler(struct pt_regs *regs)
{
el1_interrupt(regs, handle_arch_irq);
}
```
## el1_interrupt
```c
/*
* el1_interrupt - common handling for an interrupt taken while at EL1.
* @regs: register frame of the interrupted context
* @handler: root interrupt handler to invoke
*
* Writes DAIF_PROCCTX_NOIRQ to the daif system register, then selects the
* pseudo-NMI path or the regular IRQ path.
*/
static void noinstr el1_interrupt(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
/* Mask {I, F} bits */
write_sysreg(DAIF_PROCCTX_NOIRQ, daif);
/*
* With CONFIG_ARM64_PSEUDO_NMI built in and interrupts reported as not
* enabled in the interrupted context (per regs), take the pseudo-NMI
* path; presumably only an NMI can be delivered in that state -- TODO
* confirm. Otherwise handle it as a normal IRQ.
*/
if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs))
__el1_pnmi(regs, handler);
else
__el1_irq(regs, handler);
}
```
## __el1_irq
```c
/*
* __el1_irq - regular (non-NMI) IRQ path for interrupts taken at EL1.
* @regs: register frame of the interrupted context
* @handler: root interrupt handler to invoke
*
* Brackets the handler call with enter_from_kernel_mode() /
* exit_to_kernel_mode() and irq_enter_rcu() / irq_exit_rcu(), and calls
* arm64_preempt_schedule_irq() before exiting when CONFIG_PREEMPTION is
* enabled and the raw thread_info preempt_count field reads zero.
*/
static __always_inline void __el1_irq(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
enter_from_kernel_mode(regs);
irq_enter_rcu();
/* Run the root handler (see do_interrupt_handler for stack selection) */
do_interrupt_handler(regs, handler);
irq_exit_rcu();
/*
* Note: thread_info::preempt_count includes both thread_info::count
* and thread_info::need_resched, and is not equivalent to
* preempt_count().
*/
if (IS_ENABLED(CONFIG_PREEMPTION) &&
READ_ONCE(current_thread_info()->preempt_count) == 0)
arm64_preempt_schedule_irq();
exit_to_kernel_mode(regs);
}
```
## do_interrupt_handler
```c
/*
* do_interrupt_handler - invoke the root interrupt handler.
* @regs: register frame of the interrupted context
* @handler: root interrupt handler to invoke
*
* Publishes @regs through set_irq_regs() for the duration of the call and
* restores the previous value afterwards.
*/
static void do_interrupt_handler(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
{
/* Remember the previous irq regs pointer so it can be restored below */
struct pt_regs *old_regs = set_irq_regs(regs);
/*
* When currently on the thread stack, go through call_on_irq_stack()
* (presumably it switches to a dedicated IRQ stack before calling
* handler -- TODO confirm); otherwise call the handler directly.
*/
if (on_thread_stack())
call_on_irq_stack(regs, handler);
else
handler(regs);
set_irq_regs(old_regs);
}
```
追到底就是呼叫 handler,也就是 `handle_arch_irq`
# irq.c
[arch/arm64/kernel/irq.c](https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/irq.c)
## handle_arch_irq
```c
/*
* default_handle_irq - placeholder root IRQ handler.
* @regs: register frame of the interrupted context (unused)
*
* Initial value of handle_arch_irq; panics because an IRQ arrived before
* any root handler was registered through set_handle_irq().
*/
static void default_handle_irq(struct pt_regs *regs)
{
panic("IRQ taken without a root IRQ handler\n");
}
/*
* Root IRQ handler pointer, passed as the handler by el1h_64_irq_handler().
* Starts out as default_handle_irq and is replaced by set_handle_irq().
* __ro_after_init: per its name, intended to become read-only once init
* has completed.
*/
void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq;
/*
* set_handle_irq - register the root IRQ handler.
* @handle_irq: handler to install into handle_arch_irq
*
* Only the first registration succeeds: if handle_arch_irq no longer holds
* default_handle_irq, the pointer is left untouched.
*
* Return: 0 on success, -EBUSY if a root handler was already registered.
*/
int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
{
/* A handler other than the default is already installed */
if (handle_arch_irq != default_handle_irq)
return -EBUSY;
handle_arch_irq = handle_irq;
pr_info("Root IRQ handler: %ps\n", handle_irq);
return 0;
}
```
* 可以看到 `handle_arch_irq` 是一個全域變數,預設為 `default_handle_irq`,kernel 需要在初始化階段呼叫 `set_handle_irq` 來配置 `handle_arch_irq`
* `drivers/irqchip/` 底下的 interrupt controller drivers 會呼叫 `set_handle_irq`
* 這邊追蹤 `drivers/irqchip/irq-bcm2836.c`
# irq-bcm2836.c
[drivers/irqchip/irq-bcm2836.c](https://elixir.bootlin.com/linux/v5.17.1/source/drivers/irqchip/irq-bcm2836.c)
## bcm2836_arm_irqchip_l1_intc_of_init
```c
/*
* bcm2836_arm_irqchip_l1_intc_of_init - set up the bcm2836 local interrupt
* controller from its device tree node.
* @node: device tree node of the interrupt controller
* @parent: parent node (not used in this function body)
*
* Maps the controller registers, creates a linear IRQ domain with
* LAST_IRQ + 1 entries and registers bcm2836_arm_irqchip_handle_irq as the
* root IRQ handler. Panics if the mapping or the domain creation fails.
*
* Return: always 0.
*/
static int __init bcm2836_arm_irqchip_l1_intc_of_init(struct device_node *node,
struct device_node *parent)
{
/* Map register region 0 of the node; intc.base is the MMIO base used later */
intc.base = of_iomap(node, 0);
if (!intc.base) {
panic("%pOF: unable to map local interrupt registers\n", node);
}
bcm2835_init_local_timer_frequency();
/* hwirq numbers 0..LAST_IRQ are translated through this domain */
intc.domain = irq_domain_add_linear(node, LAST_IRQ + 1,
&bcm2836_arm_irqchip_intc_ops,
NULL);
if (!intc.domain)
panic("%pOF: unable to create IRQ domain\n", node);
irq_domain_update_bus_token(intc.domain, DOMAIN_BUS_WIRED);
bcm2836_arm_irqchip_smp_init();
/*
* NOTE(review): the return value of set_handle_irq() is ignored here, so
* an -EBUSY result (a root handler already registered) goes unnoticed.
*/
set_handle_irq(bcm2836_arm_irqchip_handle_irq);
return 0;
}
```
* 這邊註冊 irq handler 為 `bcm2836_arm_irqchip_handle_irq`
## bcm2836_arm_irqchip_handle_irq
```c
/*
* bcm2836_arm_irqchip_handle_irq - root IRQ handler for the bcm2836 local
* interrupt controller.
* @regs: register frame of the interrupted context (not used directly here)
*
* Reads the pending register of the current CPU and, if any bit is set,
* dispatches the lowest-numbered pending hwirq through intc.domain. At most
* one interrupt is dispatched per invocation.
*/
static void
__exception_irq_entry bcm2836_arm_irqchip_handle_irq(struct pt_regs *regs)
{
int cpu = smp_processor_id();
u32 stat;
/* One 32-bit pending register per CPU, 4 bytes apart from LOCAL_IRQ_PENDING0 */
stat = readl_relaxed(intc.base + LOCAL_IRQ_PENDING0 + 4 * cpu);
if (stat) {
/* ffs() is 1-based, so subtract 1 to get the index of the lowest set bit */
u32 hwirq = ffs(stat) - 1;
generic_handle_domain_irq(intc.domain, hwirq);
}
}
```
* 最終呼叫 `generic_handle_domain_irq`
* TODO: 搞懂 `intc`、`hwirq` 意義,目前看起來 `intc` 會從 fdt 取得資訊,`hwirq` 應該是對應裝置的數字,但不是很確定