--- title: Translate TCG-IR to host ISA tags: QEMU, compiler, toolchain, ISA, DBT --- # Translate TCG-IR to host ISA(X86) ### TODO - [x] Direct branch instruction - [x] Indirect branch instruction - [ ] Register allocation - [x] Emulation - [x] Prologue / Epilogue - [x] Block chaining - [x] System Call - [x] Interrupt Handler - [x] Code Execution ### RISC-V Register ![](https://i.imgur.com/vnDnhPT.png) ```cpp=1 struct CPURISCVState { target_ulong gpr[32]; uint64_t fpr[32]; /* assume both F and D extensions */ target_ulong pc; target_ulong load_res; target_ulong load_val; target_ulong frm; target_ulong badaddr; target_ulong user_ver; target_ulong priv_ver; target_ulong misa; uint32_t features; ... ... ... ``` ### TCG $\rightarrow$ IR Translation and States Synchronization QEMU通常在翻譯instruction的時候,會分成幾個點去synchronize architecture states。首先,QEMU並沒有做register mapping,他在每個instruction 都會去 synchronize,比方說 `addi a5, a4, 32`,那麼 Qemu其實不會做register mapping,會直接把 a4 + 32 的值寫到 a5 的 states裡面。以下面的 RISC-V $\rightarrow$ TCG $\rightarrow$ X86 為例子: - **RISC-V** ```asm=1 IN: __libc_setup_tls 0x0001079e: addi a5,a5,32 ``` - **TCG-IR** ```bash=1 OP: ---- prologue ----- ld_i32 tmp0,env,$0xffffffffffffffec movi_i32 tmp1,$0x0 brcond_i32 tmp0,tmp1,lt,$L0 ---- 0001079e mov_i32 tmp0,a5 movi_i32 tmp1,$0x20 add_i32 tmp0,tmp0,tmp1 mov_i32 a5 ,tmp0 ----- epilogue movi_i32 pc,$0x107a2 exit_tb $0x0 set_label $L0 exit_tb $0x55e7ec638ec3 ``` `addi` 會變成上面的四道TCG-IR,a5這個並不是一個virtual register,而是architecture states的a5,看接下來的 x86 assembly code - **x86** ```asm=1 # ---- Prologue ------- movl -0x14(%r14), %ebp testl %ebp, %ebp jl 0x55e7ec638f67 # ---- 0001079e ------- movl 0x3c(%r14), %ebp addl $0x20, %ebp movl %ebp, 0x3c(%r14) # ---- Epilogue ------- movl $0x107a2, 0x180(%r14) jmp 0x55e7ec628016 leaq -0xab(%rip), %rax jmp 0x55e7ec628018 ``` `mov_i32 tmp0,a5` 的 a5 會被load到一個tmp0所對應到的register,a5則是從 architecture states **(0x3c(%r14))** load 到 %ebp,然後做完 `addl` 再放回 `0x3c(%r14)`。換句話說,Qemu並不會以一個 basic block 為單位做 register 
mapping,而是一個 guest instruction 為單位。 那如果是 Load/Store instruction 呢? 下面舉例: - **RISC-V** ```cpp=1 0x00010798: lw a4,0(a5) 0x0001079a: beq a4,a2,260 ``` - **TCG-IR** ```bash=1 OP: ld_i32 tmp0,env,$0xffffffffffffffec movi_i32 tmp1,$0x0 brcond_i32 tmp0,tmp1,lt,$L0 ---- 00010798 mov_i32 tmp0,a5 qemu_ld_i32 tmp1,tmp0,leul,0 mov_i32 a4 ,tmp1 ---- 0001079a mov_i32 tmp0,a4 mov_i32 tmp1,a2 brcond_i32 tmp0,tmp1,eq,$L1 goto_tb $0x1 movi_i32 pc,$0x1079e exit_tb $0x55ad9f9cb501 set_label $L1 goto_tb $0x0 movi_i32 pc,$0x1089e exit_tb $0x55ad9f9cb500 set_label $L0 exit_tb $0x55ad9f9cb503 ``` `lw` instruction 會用 `qemu_ld_i32` 來翻譯,如果是 store 的話就會用 `qemu_st_i32` 來翻譯 - **x86** ```cpp=1 movl -0x14(%r14), %ebp testl %ebp, %ebp jl 0x55ad9f9cb5df // ---- 00010798 movl 0x3c(%r14) /*a5*/, %ebp /*tmp0*/ movl %gs:0(%ebp /*tmp0*/), %ebp /*tmp1*/ movl %ebp /*tmp1*/, 0x38(%r14) /*a4*/ // ---- 0001079a movl 0x30(%r14), %ebx cmpl %ebx, %ebp je 0x55ad9f9cb5c3 nop jmp 0x55ad9f9cb5ac movl $0x1079e, 0x180(%r14) leaq -0xbd(%rip), %rax jmp 0x55ad9f9c6018 jmp 0x55ad9f9cb5c8 movl $0x1089e, 0x180(%r14) leaq -0xda(%rip), %rax jmp 0x55ad9f9c6018 leaq -0xe3(%rip), %rax jmp 0x55ad9f9c6018 ``` #### Store Instruction - RISC-V ```cpp=1 IN: sysmalloc 0x0001c7fe: sw s3,1156(s2) ``` - TCG-IR ```cpp=1 ---- 0001c7fe mov_i32 tmp0,s2 movi_i32 tmp2,$0x484 add_i32 tmp0,tmp0,tmp2 mov_i32 tmp1,s3 qemu_st_i32 tmp1,tmp0,leul,0 ``` - x86 ```cpp=1 movl 0x48(%r14)/*s2*/, %ebp /*tmp0*/ addl $0x484, %ebp/*tmp0*/ movl 0x4c(%r14)/*s3*/, %ebx/*tmp1*/ movl %ebx/*tmp1*/, %gs:0(%ebp/*tmp0*/) ``` ### Branch instructions translation 最主要會講到怎麼處理branch的問題,branch基於condition code決定要跳哪裡,那麼QEMU TCG-IR 是怎麼產生這種code: - Conditional jump 會分成兩種: - Direct branch - **RISC-V** ```cpp=1 IN: __libc_setup_tls 0x00010798: lw a4,0(a5) 0x0001079a: beq a4,a2,260 # 0x1089e ``` - **TCG-IR** ```cpp=1 OP: ld_i32 tmp0,env,$0xffffffffffffffec movi_i32 tmp1,$0x0 brcond_i32 tmp0,tmp1,lt,$L0 ---- 00010798 mov_i32 tmp0,a5 qemu_ld_i32 tmp1,tmp0,leul,0 mov_i32 a4 ,tmp1 ---- 
0001079a mov_i32 tmp0,a4 mov_i32 tmp1,a2 brcond_i32 tmp0,tmp1,eq,$L1 // If equal, goto L1 goto_tb $0x1 // Patch point movi_i32 pc,$0x1079e // Save PC back to CPUState exit_tb $0x5578dbc76741 // Return to QEMU to find NON-TAKEN set_label $L1 goto_tb $0x0 // Patch point movi_i32 pc,$0x1089e // Save PC back to CPUState exit_tb $0x5578dbc76740 // Return to QEMU to find TAKEN -------- Exit // If interrupt happens, jump set_label $L0 exit_tb $0x5578dbc76743 ``` - **x86** ```cpp=1 OUT: [size=107] movl -0x14(%r14), %ebp testl %ebp, %ebp jl L0 ---- 00010798 movl 0x3c(%r14), %ebp movl %gs:0(%ebp), %ebp movl %ebp, 0x38(%r14) ---- 0001079a movl 0x38(%r14), %ebp movl 0x30(%r14), %ebx cmpl %ebx, %ebp je L1 nop jmp 0x5578dbc767ec // Patch Point 0x5578dbc767ec: movl $0x1079e, 0x180(%r14) leaq -0xbd(%rip), %rax jmp 0x5578dbc71018 L1: jmp 0x5578dbc76808 // Patch Point 0x5578dbc76808: movl $0x1089e, 0x180(%r14) leaq -0xda(%rip), %rax jmp 0x5578dbc71018 // QEMU L0: leaq -0xe3(%rip), %rax jmp 0x5578dbc71018 // QEMU ``` - Indirect branch - 呼應先前提到的 code location problem, indirect branch 大概有這幾種: - Switch-case control flow - Indirect function call (e.g. 
call *eax) - return instruction 下面會以function return當作是example code - **RISC-V** ```cpp=1 0x000103b4: lw s0,28(sp) 0x000103b6: addi sp,sp,32 0x000103b8: ret ``` - **TCG-IR** ```cpp=1 ---- 000103b4 mov_i32 tmp0,sp movi_i32 tmp2,$0x1c add_i32 tmp0,tmp0,tmp2 qemu_ld_i32 tmp1,tmp0,leul,0 mov_i32 s0 ,tmp1 ---- 000103b6 mov_i32 tmp0,sp movi_i32 tmp1,$0x20 add_i32 tmp0,tmp0,tmp1 mov_i32 sp ,tmp0 ---- 000103b8 mov_i32 pc,ra // Move ra to program counter movi_i32 tmp1,$0xfffffffffffffffe // create mask and_i32 pc,pc,tmp1 // Filter exit_tb $0x0 // Go back to QEMU set_label $L0 exit_tb $0x555555bb7043 ``` - **x86** ```cpp=1 ---- 000103b8 movl 4(%r14)/*ra*/, %ebp andl $0xfffffffffffffffe, %ebp /*tmp1*/ movl %ebp, 0x180(%r14)/*pc*/ jmp 0x555555b8d016 // Jump back to qemu, and clear %eax register (ret=0) ``` 由於在indirect branch這邊把 $ra$ 的 register value 塞進 $pc$ 裡面,跳回qemu的時候會去看說他有沒有在cache hash table, 如果有被cache起來就直接return tb回去,沒有的話再去查表,拿到tb之後又可以繼續執行下一個translation block。像是下面的 `ret = tcg_qemu_tb_exec()` 的 ret 會拿到 0 ```cpp 163 #endif 164 qemu_log_unlock(); 165 } 166 #endif /* DEBUG_DISAS */ 167 168 cpu->can_do_io = !use_icount; 169 ret = tcg_qemu_tb_exec(env, tb_ptr); 170 cpu->can_do_io = 1; 171 last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK); 172 tb_exit = ret & TB_EXIT_MASK; 173 trace_exec_tb_exit(last_tb, tb_exit); ``` - Unconditional jump: 跟direct jump一樣,只是不用處理taken/non-taken的問題,範例如下: - Input Assembly ```cpp=1 IN: _dl_aux_init 0x000229c6: lw s9,4(a0) 0x000229ca: addi s6,zero,1 0x000229cc: j -316 # 0x22890 ``` - TCG-IR ```cpp=1 ---- 000229cc goto_tb $0x0 movi_i32 pc,$0x22890 exit_tb $0x55c9819eff40 ``` 在 goto_tb 這邊基本上是block-chaining的patch point - Output Assembly ```asm=1 0x55c9819effe7: jmp 0x55c9819effec 0x55c9819effec: movl $0x22890, 0x180(%r14) 0x55c9819efff4: 0x55c9819efff7: leaq -0xbe(%rip), %rax 0x55c9819efffe: jmp 0x55c9819ef018 ``` - Function call: 在Qemu翻譯RISC-V instructions的時候應該已經全部換成branch了,因此在TCG-IR$\rightarrow$x86 這邊 call instruction 不會變成 push,而是會變成 jump + 各種 mov instruction。 - 
如果branch target 還沒被翻譯,這時候又會跳回Qemu翻譯。這時候Qemu會在code cache這邊保留原來TCG-IR `goto_tb` 底下的部分(patch point) - Binary patch: 由於翻譯的時候不確定之後會跳到哪裡去(因為target BB還沒翻譯),所以會在code cache保留兩個部分(taken/non-taken)之後讓QEMU Patch - 如果還沒翻譯,synchronize architecture states 跳回QEMU - 如果已經翻譯好,在QEMU裡面做block chaining,patch剛剛保留的部分 ### Special instruction translation 為了解決host machine上不一定都有支援的功能(e.g. hardware floating point),通常Qemu會使用helper_function 用軟體來模擬它的行為,在這裡會用 fadd, fmul 舉例,然後怎麼把在helper_function 的 TCG-IR 轉成 x86 的 code。 由於 helper_function 是在 QEMU 的 compile time 的時候所編譯出來的,~~因此QEMU看到 helper_function 會產生state synchronization code~~ + call instruction 跳到在QEMU裡面描述的helper function。 通常Helper function也可以幫助我們對guest binary 做 instrumentation,所以如果想要蒐集guest program 的 runtime behavior,也可以直接用QEMU來幫忙插code,或許可以做一個簡單的demo。 - Floating point with double precision instructions ### Helper function call - Qemu為了要re-targetable,通常會假設host machine 沒有 hardware fp,因此通常都會用helper_function 來實作。~~在呼叫helper function 之前都會先把 architecture states 寫回 CPURISCVState 這個 structure 裡面~~ (這裡是HQEMU的實作,因為我們有做register mapping,所以如果call helper_function 才需要把 states 全部寫回 CPUState) - 而QEMU每一道instruction都會去synchronize architecture states,所以不用擔心call helper function 之後 states 會壞掉,因此在呼叫helper function call 的時候就直接把 parameter 傳進去。這裡用fadd.d 簡單解釋: - RISC-V source binary ```asm=1 0x00010408: fadd.d dyn,fa5,fa4,fa5 ``` - TCG-IR ```asm=1 ---- 00010408 movi_i32 tmp0,$0x7 call set_rounding_mode,$0x20,$0,env,tmp0 call fadd_d,$0x10,$1,fa5 ,env,fa4 ,fa5 ``` - x86 binary (call fadd_d 那個TCG-IR) ```asm=1 movq %r14, %rdi movq %rbx, %rsi movq %r12, %rdx callq 0x55e86662b7fd movq %rax, 0xf8(%r14) ``` helper function call 一開始會根據 calling convention 把我們想要傳進去的參數放到特定的register,比方說我們想要傳 `env`、`fa4`、`fa5` 三個參數,就會依序放到 %rdi、%rsi、%rdx。這裡簡單敘述一下,通常 %r14 會放 CPURISCVState 的 pointer address (看你的host是什麼,如果是ARMv8 就會放在 r19),helper用法就是把return value(%rax) 存進 `fa5` architecture states(0xf8(%r14))。 ```cpp=250 uint64_t helper_fadd_d(CPURISCVState *env, uint64_t frs1, uint64_t frs2) { return float64_add(frs1, frs2, &env->fp_status); } ``` ```cpp=751 float64 
__attribute__((flatten)) float64_add(float64 a, float64 b, float_status *status) { FloatParts pa = float64_unpack_canonical(a, status); FloatParts pb = float64_unpack_canonical(b, status); FloatParts pr = addsub_floats(pa, pb, false, status); return float64_round_pack_canonical(pr, status); } ``` ```cpp=531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) { return canonicalize(float64_unpack_raw(f), &float64_params, s); } ``` ```cpp=325 /* Canonicalize EXP and FRAC, setting CLS. */ static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, float_status *status) { if (part.exp == parm->exp_max) { if (part.frac == 0) { part.cls = float_class_inf; } else { #ifdef NO_SIGNALING_NANS part.cls = float_class_qnan; #else int64_t msb = part.frac << (parm->frac_shift + 2); if ((msb < 0) == status->snan_bit_is_one) { part.cls = float_class_snan; } else { part.cls = float_class_qnan; } #endif } } else if (part.exp == 0) { if (likely(part.frac == 0)) { part.cls = float_class_zero; } else if (status->flush_inputs_to_zero) { float_raise(float_flag_input_denormal, status); part.cls = float_class_zero; part.frac = 0; } else { int shift = clz64(part.frac) - 1; part.cls = float_class_normal; part.exp = parm->frac_shift - parm->exp_bias - shift + 1; part.frac <<= shift; } } else { part.cls = float_class_normal; part.exp -= parm->exp_bias; part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); } return part; ``` ### system call translation 我想如果知道binary的ABI,做system call translation應該不是太大的問題(?) 
這裡先簡單講一下,如果遇到RISC-V的software trap instruction (ecall),QEMU會先翻譯出 call raise_exception (這個有點像是 helper_function),然後跳過去執行,可是跳過去之後就不會回來了,直接回到執行 `trapnr = cpu_exec()` 的那個位置。由於這裡比較複雜,有點不太想講太detail的東西(high-level idea first) #### RISC-V ```asm=1 0x0003bcee: mv a5,a0 0x0003bcf0: addi a7,zero,214 0x0003bcf4: ecall ``` #### TCG-IR ```asm=1 OP: ld_i32 tmp0,env,$0xffffffffffffffec movi_i32 tmp1,$0x0 brcond_i32 tmp0,tmp1,lt,$L0 ---- 0003bcee movi_i32 tmp0,$0x0 mov_i32 tmp1,a0 add_i32 tmp0,tmp0,tmp1 mov_i32 a5 ,tmp0 ---- 0003bcf0 movi_i32 tmp0,$0x0 movi_i32 tmp1,$0xd6 add_i32 tmp0,tmp0,tmp1 mov_i32 a7 ,tmp0 ---- 0003bcf4 movi_i32 tmp0,$0x0 movi_i32 pc,$0x3bcf4 movi_i32 tmp3,$0x0 movi_i32 tmp1,$0x0 movi_i32 pc,$0x3bcf4 movi_i32 tmp5,$0x8 call raise_exception,$0x0,$0,env,tmp5 exit_tb $0x0 set_label $L0 exit_tb $0x555555b92d03 ``` #### x86: ```cpp=1 OUT: [size=69] 0x555555b92d80: movl -0x14(%r14), %ebp 0x555555b92d84: testl %ebp, %ebp 0x555555b92d86: jl 0x555555b92db9 0x555555b92d8c: movl 0x28(%r14), %ebp 0x555555b92d90: movl %ebp, 0x3c(%r14) 0x555555b92d94: movl $0xd6, 0x44(%r14) 0x555555b92d9c: movl $0x3bcf4, 0x180(%r14) 0x555555b92da4: 0x555555b92da7: movq %r14, %rdi 0x555555b92daa: movl $8, %esi 0x555555b92daf: callq 0x555555635cf1 0x555555b92db4: jmp 0x555555b8d016 0x555555b92db9: leaq -0xbd(%rip), %rax 0x555555b92dc0: jmp 0x555555b8d018 ``` `callq 0x555555635cf1` 會呼叫 raise_exception 這個helper function # Emulation ### System call emulation raise_exception 接下來會做下面這些事情 ```bash #0 cpu_loop_exit (cpu=0x555557baa3f0) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec-common.c:68 #1 0x00005555555ffaca in cpu_loop_exit_restore (cpu=0x555557baa3f0, pc=0) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec-common.c:76 #2 0x0000555555635cf1 in do_raise_exception_err (env=0x555557bb2690, exception=8, pc=0) at /home/chihmin/qemu-riscv/target/riscv/op_helper.c:67 #3 0x0000555555635d16 in helper_raise_exception (env=0x555557bb2690, exception=8) at /home/chihmin/qemu-riscv/target/riscv/op_helper.c:72 #4 
0x0000555555b92db4 in static_code_gen_buffer () #5 0x00005555555feb32 in cpu_tb_exec (cpu=0x555557baa3f0, itb=0x555555b92d00 <static_code_gen_buffer+25216>) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec.c:169 #6 0x00005555555ff744 in cpu_loop_exec_tb (cpu=0x555557baa3f0, tb=0x555555b92d00 <static_code_gen_buffer+25216>, last_tb=0x7fffffffd7e8, tb_exit=0x7fffffffd7e0) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec.c:626 #7 0x00005555555ff9b7 in cpu_exec (cpu=0x555557baa3f0) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec.c:734 #8 0x0000555555606080 in cpu_loop (env=0x555557bb2690) at /home/chihmin/qemu-riscv/linux-user/main.c:3579 #9 0x00005555556079be in main (argc=8, argv=0x7fffffffdfc8, envp=0x7fffffffe010) at /home/chihmin/qemu-riscv/linux-user/main.c:5147 ``` 之後 `cpu_loop_exit` 會呼叫在 `roms/ipxe/src/arch/x86_64/core/setjmp.S` 的 `long_jump` (assembly code): 之後在long_jump function裡面,會根據不同architecture 的function call convention,把 return value 傳回 `sigsetjmp`,之後跳到`qemu-riscv/accel/tcg/cpu-exec.c:694`: ```c=693 /* prepare setjmp context for exception handling */ if (sigsetjmp(cpu->jmp_env, 0) != 0) { #if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6) /* Some compilers wrongly smash all local variables after * siglongjmp. There were bug reports for gcc 4.5.0 and clang. * Reload essential local variables here for those compilers. * Newer versions of gcc would complain about this code (-Wclobbered). */ cpu = current_cpu; cc = CPU_GET_CLASS(cpu); #else /* buggy compiler */ /* Assert that the compiler does not smash local variables. 
*/ g_assert(cpu == current_cpu); g_assert(cc == CPU_GET_CLASS(cpu)); #endif /* buggy compiler */ tb_lock_reset(); if (qemu_mutex_iothread_locked()) { qemu_mutex_unlock_iothread(); } } /* if an exception is pending, we execute it here */ while (!cpu_handle_exception(cpu, &ret)) { TranslationBlock *last_tb = NULL; int tb_exit = 0; while (!cpu_handle_interrupt(cpu, &last_tb)) { uint32_t cflags = cpu->cflags_next_tb; TranslationBlock *tb; /* When requested, use an exact setting for cflags for the next execution. This is used for icount, precise smc, and stop- after-access watchpoints. Since this request should never have CF_INVALID set, -1 is a convenient invalid value that does not require tcg headers for cpu_common_reset. */ if (cflags == -1) { cflags = curr_cflags(); } else { cpu->cflags_next_tb = -1; } tb = tb_find(cpu, last_tb, tb_exit, cflags); cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit); /* Try to align the host and virtual clocks if the guest is in advance */ align_clocks(&sc, cpu); } } ``` 如果沒有exception,會跳進while loop裡面繼續執行下一個Translation block。反之,如果有exception,在cpu_loop那邊用host OS的system call處理掉,並且把trap number傳給 `qemu-riscv/linux-user/main.c:3579` 的 `trapnr` ,然後在 `qemu-riscv/linux-user/main.c:3614` 會寫回 architecture states: ```c=3570 void cpu_loop(CPURISCVState *env) { CPUState *cs = CPU(riscv_env_get_cpu(env)); int trapnr, signum, sigcode; target_ulong sigaddr; target_ulong ret; for (;;) { cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); process_queued_cpu_work(cs); signum = 0; sigcode = 0; sigaddr = 0; switch (trapnr) { case EXCP_INTERRUPT: /* just indicate that signals should be handled asap */ break; case EXCP_ATOMIC: cpu_exec_step_atomic(cs); break; case RISCV_EXCP_U_ECALL: env->pc += 4; if (env->gpr[xA7] == TARGET_NR_arch_specific_syscall + 15) { /* riscv_flush_icache_syscall is a no-op in QEMU as self-modifying code is automatically detected */ ret = 0; } else { ret = do_syscall(env, env->gpr[xA7], env->gpr[xA0], env->gpr[xA1], env->gpr[xA2], 
env->gpr[xA3], env->gpr[xA4], env->gpr[xA5], 0, 0); } if (ret == -TARGET_ERESTARTSYS) { env->pc -= 4; } else if (ret != -TARGET_QEMU_ESIGRETURN) { env->gpr[xA0] = ret; } if (cs->singlestep_enabled) { goto gdbstep; } break; ``` 拿到trapnr之後,會把該做system call 的事情做完,之後會把ret寫回call的return value。如果是 x86,基本上會寫到 eax 這個 register。如果是riscv,他會寫到 A0 這個 return register。 ### Handle Signal 這裡的實驗實在是有點難做,這裡簡單敘述幾個追這邊的方法,有時間再繼續往下追XD 先把handle signal handler的結果dump出來: ```bash=1 #0 host_to_target_siginfo_noswap (tinfo=0x7fffffffc720, info=0x7fffffffc8f0) at /home/chihmin/qemu-riscv/linux-user/signal.c:303 #1 0x0000555555628d4b in host_signal_handler (host_signum=14, info=0x7fffffffc8f0, puc=0x7fffffffc7c0) at /home/chihmin/qemu-riscv/linux-user/signal.c:658 #2 <signal handler called> #3 is_nan (c=float_class_zero) at /home/chihmin/qemu-riscv/fpu/softfloat.c:553 #4 0x00005555555e2bb1 in addsub_floats (a=..., b=..., subtract=false, s=0x555557ba7644) at /home/chihmin/qemu-riscv/fpu/softfloat.c:711 #5 0x00005555555e2ddf in float64_add (a=0, b=0, status=0x555557ba7644) at /home/chihmin/qemu-riscv/fpu/softfloat.c:756 #6 0x000055555563782f in helper_fadd_d (env=0x555557ba74a0, frs1=0, frs2=0) at /home/chihmin/qemu-riscv/target/riscv/fpu_helper.c:252 #7 0x0000555555bb6f09 in static_code_gen_buffer () #8 0x00005555555feb32 in cpu_tb_exec (cpu=0x555557b9f200, itb=0x555555bb6c00 <static_code_gen_buffer+172416>) at /home/chihmin/qemu-riscv/accel/tcg/cpu-exec.c:169 ``` `host_signal_handler` 是 QEMU 註冊的 signal handler, 相關的context會存在`host_signal_handler`的 `puc` 這個參數裡面。這有點久遠了,我沒記錯的話好像是 (ucontext_t*)puc 就可以拿到register的所有的states(%rip這種跟program counter 相關的 register),我想只要這樣應該就可以反查接下來要回去執行code cache的哪一道instruction。發生interrupt之後QEMU會採取 "延遲 Handle Exception"。如果當前的Translation Block 發生exception,那麼就會等到下一個Translation Block 才跳回QEMU,而每一個Translation Block 的 Prologue 都會去 compare 到底要不要執行當前的 Translation Block。 ### Run benchmark Spec2006 我只有編 int 的 test & train 
input,大程式測這幾支的效果還不錯 - 操作方法 1. 90.89 的 `/home/chihmin/CPU2006` 底下你應該會看到所有的benchmark,先選定其中一支,這裡我們選 401.bzip2 2. 這時候你會看到有三個資料夾,跳進去 `run/` 這個資料夾裡面有三個資料夾,比較小的input可以看 `run_base_test_riscv32.0000/`,比較大的 input 可以看 `run_base_train_riscv32.0000` 3. 跳進去這兩個資料夾其中一個之後,`speccmds.cmd` 把一些command的參數放在這裡,我以 `401.bzip2/run/run_base_test_riscv32.0000` 這個資料夾的 .cmd 舉例: ```bash=1 $ cat speccmds.cmd -C /home/chihmin/spec2006/riscv32/benchspec/CPU2006/401.bzip2/run/run_base_test_riscv32.0000 -o input.program.out -e input.program.err qemu-riscv32 -L /home/chihmin/riscv/sysroot/ ../run_base_test_riscv32.0000/bzip2_base.riscv32 input.program 5 -o dryer.jpg.out -e dryer.jpg.err qemu-riscv32 -L /home/chihmin/riscv/sysroot/ ../run_base_test_riscv32.0000/bzip2_base.riscv32 dryer.jpg 2 ``` **參數說明**: - `-C` 執行檔的位置,因為我一開始是在別台電腦compile所以絕對路徑有點小問題 - `-o` standard output - `-e` error output - `../run_base_test_riscv32.0000/bzip2_base.riscv32` 以後都是你要執行的 command 參數,比方說 `../run_base_test_riscv32.0000/bzip2_base.riscv32 input.program 5` - `-i`: 如果有這個flag,代表他是standard input,直接在command 的結尾加上 `..... < testcase.in` 之類的 不同行代表的是不同的testcase,一般我們都會直接把他們的時間加起來計算 - **Test Input** ```bash=1 Estimated Estimated Base Base Base Peak Peak Peak Benchmarks Ref. Run Time Ratio Ref. 
Run Time Ratio -------------- ------ --------- --------- ------ --------- --------- 400.perlbench NR 401.bzip2 -- 12.1 -- S 403.gcc NR 429.mcf -- 3.45 -- S 445.gobmk -- 76.0 -- S 456.hmmer NR 458.sjeng -- 24.5 -- S 462.libquantum -- 0.227 -- S 464.h264ref NR 471.omnetpp -- 18.1 -- S 473.astar -- 124 -- S 483.xalancbmk -- 15.7 -- S ``` 可以動的就只有標記 'S' 的那幾個benchmark,跑那幾支benchmark就好了 - **Train Input** 我目前還沒有跑 Train input 所以不曉得會不會動,你可以看上面哪幾支benchmark會動就測那幾支就可以了 ### Entry point from QEMU to code cache ```cpp=168 cpu->can_do_io = !use_icount; ret = tcg_qemu_tb_exec(env, tb_ptr); cpu->can_do_io = 1; last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK); tb_exit = ret & TB_EXIT_MASK; ``` ```cpp=1 mov %rdx,%rsi mov %rax,%rdi callq *%rcx ``` ```cpp=1 0x555555b8d000 push %rbp 0x555555b8d001 push %rbx 0x555555b8d002 push %r12 0x555555b8d004 push %r13 0x555555b8d006 push %r14 0x555555b8d008 push %r15 0x555555b8d00a mov %rdi,%r14 0x555555b8d00d add $0xfffffffffffffb78,%rsp 0x555555b8d014 jmpq *%rsi ``` ```cpp=1 0x555555b8d016 xor %eax,%eax 0x555555b8d018 add $0x488,%rsp 0x555555b8d01f vzeroupper 0x555555b8d022 pop %r15 0x555555b8d024 pop %r14 0x555555b8d026 pop %r13 0x555555b8d028 pop %r12 0x555555b8d02a pop %rbx 0x555555b8d02b pop %rbp 0x555555b8d02c retq ```