wasm3 Strategy

clang compile

這邊要用 clang-14 才支援 musttail。

mkdir build
cd build
cmake -GNinja -DCLANG_SUFFIX="-14" ..
ninja

Tail Call

透過 gef 追蹤得知，程式主要由 main -> repl_call -> m3_CallArgv->op_Call->op_Compile->Compile_funciton->…，function dispatch 主要透過 op_Call 這個 function 完成，op_Call 這個 function 是由 d_m3Op (Call) 所展開，而 d_m3Op (Call) 最原始是由 RunCode 所展開，由程式碼可以發現 RunCode 的寫法符合 tail-call 的定義。然而，參考這篇 Parsing Protobuf at 2+GB/s: How I Learned To Love Tail Calls in C 發現 __attribute__((musttail)) 這個 attribute 目前移植性並不高，當我直接定義 M3_MUSTTAIL 為__attribute__((musttail)) 時，我的電腦並不支援。

#define d_m3BaseOpSig  pc_t _pc, m3stack_t _sp, M3MemoryHeader * _mem, m3reg_t _r0

#if M3_COMPILER_HAS_ATTRIBUTE(musttail)
    #define M3_MUSTTAIL __attribute__((musttail))
#define nextOpDirect()  M3_MUSTTAIL return nextOpImpl()

d_m3RetSig  RunCode  (d_m3OpSig)
{
    nextOpDirect();
}

d_m3Op  (Call)
{
    pc_t callPC                 = immediate (pc_t);
    i32 stackOffset             = immediate (i32);
    IM3Memory memory            = m3MemInfo (_mem);

    m3stack_t sp = _sp + stackOffset;

    m3ret_t r = Call (callPC, sp, _mem, d_m3OpDefaultArgs);
    _mem = memory->mallocated;

    if (M3_LIKELY(not r))
        nextOp ();
    else
    {
        pushBacktraceFrame ();
        forwardTrap (r);
    }
}

d_m3RetSig  Call  (d_m3OpSig)
{
    m3ret_t possible_trap = m3_Yield ();
    if (M3_UNLIKELY(possible_trap)) return possible_trap;

    nextOpDirect();
}

Yield

M3Result m3_Yield ()
{
    return m3Err_none; // NULL
}

透過 disassemble 展開也可發現，編譯器並沒有使用 tail-call optimization 來最佳化op_Call 這個 function， function 還是頻繁的操作 stack。

另外，猜測是沒有利用 tail-call optimization 的關係，參數並沒有直接 mapping 到這些暫存器上，而是先透過 r12~15 來存取。

M3 Register	x86 Register
program counter (pc)	rdi
stack pointer (sp)	rsi
linear memory (mem)	rdx
integer register (r0)	rcx
floating-point register (fp0)	xmm0

000000000000104d0 <op_Call>:
   104d0:	55                   	push   %rbp
   104d1:	41 57                	push   %r15
   104d3:	41 56                	push   %r14
   104d5:	41 55                	push   %r13
   104d7:	41 54                	push   %r12
   104d9:	53                   	push   %rbx
   104da:	48 83 ec 18          	sub    $0x18,%rsp
   104de:	c5 fb 11 44 24 10    	vmovsd %xmm0,0x10(%rsp)
   104e4:	48 89 4c 24 08       	mov    %rcx,0x8(%rsp)
   104e9:	49 89 d4             	mov    %rdx,%r12
   104ec:	49 89 f7             	mov    %rsi,%r15
   104ef:	48 89 fb             	mov    %rdi,%rbx
   104f2:	48 8b 2f             	mov    (%rdi),%rbp
   104f5:	4c 63 77 08          	movslq 0x8(%rdi),%r14
   104f9:	4c 8b 2a             	mov    (%rdx),%r13
   104fc:	e8 1f b0 00 00       	callq  1b520 <m3_Yield>
   10501:	48 85 c0             	test   %rax,%rax
   10504:	75 49                	jne    1054f <op_Call+0x7f>
   10506:	4b 8d 34 b7          	lea    (%r15,%r14,4),%rsi
   1050a:	48 8d 7d 08          	lea    0x8(%rbp),%rdi
   1050e:	c5 f8 57 c0          	vxorps %xmm0,%xmm0,%xmm0
   10512:	4c 89 e2             	mov    %r12,%rdx
   10515:	31 c9                	xor    %ecx,%ecx
   10517:	ff 55 00             	callq  *0x0(%rbp)
   1051a:	48 85 c0             	test   %rax,%rax
   1051d:	75 30                	jne    1054f <op_Call+0x7f>
   1051f:	49 8b 95 b8 29 00 00 	mov    0x29b8(%r13),%rdx
   10526:	48 8b 43 10          	mov    0x10(%rbx),%rax
   1052a:	48 83 c3 18          	add    $0x18,%rbx
   1052e:	48 89 df             	mov    %rbx,%rdi
   10531:	4c 89 fe             	mov    %r15,%rsi
   10534:	48 8b 4c 24 08       	mov    0x8(%rsp),%rcx
   10539:	c5 fb 10 44 24 10    	vmovsd 0x10(%rsp),%xmm0
   1053f:	48 83 c4 18          	add    $0x18,%rsp
   10543:	5b                   	pop    %rbx
   10544:	41 5c                	pop    %r12
   10546:	41 5d                	pop    %r13
   10548:	41 5e                	pop    %r14
   1054a:	41 5f                	pop    %r15
   1054c:	5d                   	pop    %rbp
   1054d:	ff e0                	jmpq   *%rax
   1054f:	48 83 c4 18          	add    $0x18,%rsp
   10553:	5b                   	pop    %rbx
   10554:	41 5c                	pop    %r12
   10556:	41 5d                	pop    %r13
   10558:	41 5e                	pop    %r14
   1055a:	41 5f                	pop    %r15
   1055c:	5d                   	pop    %rbp
   1055d:	c3                   	retq   
   1055e:	66 90                	xchg   %ax,%ax

issue Report

https://github.com/wasm3/wasm3/issues/411

Coremark 測試

wasm3-0.5.0 (clang 16) 2901.478358
wasm3-0.5.0 (clang 14) 2953.427087
native clang           33154.666519
wavm (2022-05-14)      27219.358985
WebAssembly.sh v0.1.0  23652.217670
emscripten             23466.833542

with and without `attribute((musttail))` in O0

int factorial(int n, int sum) {
    if (n == 0)
        return sum;
    sum *= n;
    // without `__attribute__((musttail))`
    return factorial(n - 1, sum);
    // with `__attribute__((musttail))`
    __attribute__((musttail)) return factorial(n - 1, sum);
}

比較兩份指令可發現利用 __attribute__((musttail))，它能幫助作到的事情只有不去對 %rsp 去做增加或減少，然後將 callq 改成 jmpq，還是要利用到 function stack，但是用到的都是同一塊。

without __attribute__((musttail))

0000000000001130 <factorial>:
    1130:	55                   	push   %rbp
    1131:	48 89 e5             	mov    %rsp,%rbp
    1134:	48 83 ec 10          	sub    $0x10,%rsp
    1138:	89 7d f8             	mov    %edi,-0x8(%rbp)
    113b:	89 75 f4             	mov    %esi,-0xc(%rbp)
    113e:	83 7d f8 00          	cmpl   $0x0,-0x8(%rbp)
    1142:	0f 85 0b 00 00 00    	jne    1153 <factorial+0x23>
    1148:	8b 45 f4             	mov    -0xc(%rbp),%eax
    114b:	89 45 fc             	mov    %eax,-0x4(%rbp)
    114e:	e9 1b 00 00 00       	jmpq   116e <factorial+0x3e>
    1153:	8b 45 f8             	mov    -0x8(%rbp),%eax
    1156:	0f af 45 f4          	imul   -0xc(%rbp),%eax
    115a:	89 45 f4             	mov    %eax,-0xc(%rbp)
    115d:	8b 7d f8             	mov    -0x8(%rbp),%edi
    1160:	83 ef 01             	sub    $0x1,%edi
    1163:	8b 75 f4             	mov    -0xc(%rbp),%esi
    1166:	e8 c5 ff ff ff       	callq  1130 <factorial>
    116b:	89 45 fc             	mov    %eax,-0x4(%rbp)
    116e:	8b 45 fc             	mov    -0x4(%rbp),%eax
    1171:	48 83 c4 10          	add    $0x10,%rsp
    1175:	5d                   	pop    %rbp
    1176:	c3                   	retq   
    1177:	66 0f 1f 84 00 00 00 	nopw   0x0(%rax,%rax,1)
    117e:	00 00

with __attribute__((musttail))

0000000000001130 <factorial>:
    1130:	55                   	push   %rbp
    1131:	48 89 e5             	mov    %rsp,%rbp
    1134:	89 7d f8             	mov    %edi,-0x8(%rbp)
    1137:	89 75 f4             	mov    %esi,-0xc(%rbp)
    113a:	83 7d f8 00          	cmpl   $0x0,-0x8(%rbp)
    113e:	0f 85 0b 00 00 00    	jne    114f <factorial+0x1f>
    1144:	8b 45 f4             	mov    -0xc(%rbp),%eax
    1147:	89 45 fc             	mov    %eax,-0x4(%rbp)
    114a:	e9 1a 00 00 00       	jmpq   1169 <factorial+0x39>
    114f:	8b 4d f8             	mov    -0x8(%rbp),%ecx
    1152:	8b 45 f4             	mov    -0xc(%rbp),%eax
    1155:	0f af c1             	imul   %ecx,%eax
    1158:	89 45 f4             	mov    %eax,-0xc(%rbp)
    115b:	8b 7d f8             	mov    -0x8(%rbp),%edi
    115e:	ff cf                	dec    %edi
    1160:	8b 75 f4             	mov    -0xc(%rbp),%esi
    1163:	5d                   	pop    %rbp
    1164:	e9 c7 ff ff ff       	jmpq   1130 <factorial>
    1169:	8b 45 fc             	mov    -0x4(%rbp),%eax
    116c:	5d                   	pop    %rbp
    116d:	c3                   	retq   
    116e:	66 90                	xchg   %ax,%ax

原本懷疑可能是這篇提到的要將 slow path 也包成一個 function，因此做改寫

int fall_back(int n, int sum) {
    return sum;
}
int factorial(int n, int sum) {
    if (n == 0)
        __attribute__((musttail)) return fall_back(n, sum);
    sum *= n;
    __attribute__((musttail)) return factorial(n - 1, sum);
}

然而，只是多了要跳過去 fall_back funciton 而已，還是要利用到 function stack

0000000000001130 <fall_back>:
    1130:	55                   	push   %rbp
    1131:	48 89 e5             	mov    %rsp,%rbp
    1134:	89 7d fc             	mov    %edi,-0x4(%rbp)
    1137:	89 75 f8             	mov    %esi,-0x8(%rbp)
    113a:	8b 45 f8             	mov    -0x8(%rbp),%eax
    113d:	5d                   	pop    %rbp
    113e:	c3                   	retq   
    113f:	90                   	nop

0000000000001140 <factorial>:
    1140:	55                   	push   %rbp
    1141:	48 89 e5             	mov    %rsp,%rbp
    1144:	89 7d f8             	mov    %edi,-0x8(%rbp)
    1147:	89 75 f4             	mov    %esi,-0xc(%rbp)
    114a:	83 7d f8 00          	cmpl   $0x0,-0x8(%rbp)
    114e:	0f 85 0c 00 00 00    	jne    1160 <factorial+0x20>
    1154:	8b 7d f8             	mov    -0x8(%rbp),%edi
    1157:	8b 75 f4             	mov    -0xc(%rbp),%esi
    115a:	5d                   	pop    %rbp
    115b:	e9 d0 ff ff ff       	jmpq   1130 <fall_back>
    1160:	8b 4d f8             	mov    -0x8(%rbp),%ecx
    1163:	8b 45 f4             	mov    -0xc(%rbp),%eax
    1166:	0f af c1             	imul   %ecx,%eax
    1169:	89 45 f4             	mov    %eax,-0xc(%rbp)
    116c:	8b 7d f8             	mov    -0x8(%rbp),%edi
    116f:	ff cf                	dec    %edi
    1171:	8b 75 f4             	mov    -0xc(%rbp),%esi
    1174:	5d                   	pop    %rbp
    1175:	e9 c6 ff ff ff       	jmpq   1140 <factorial>
    117a:	66 0f 1f 44 00 00    	nopw   0x0(%rax,%rax,1)

但是用 O3 編譯的話，就可以避免掉建立 function stack，但用 O3 來編譯的話，有沒有加 __attribute__((musttail)) 結果是一樣的。

參考這個 commit 發現其他 op_XXX 的 function 也會被 tail-call optimization 優化，但不知道為何只有之前分析的 op_Call 沒有。

找其中一個確定有優化的 function 來測試，先用 O0 來編譯，可以發現它需要用到 function stack，有沒有加 __attribute__((musttail)) 的差異也跟上面的實驗結果一樣。

00000000000216c0 <op_u64_Or_rs>:
   216c0:	48 83 ec 38          	sub    $0x38,%rsp
   216c4:	48 89 7c 24 30       	mov    %rdi,0x30(%rsp)
   216c9:	48 89 74 24 28       	mov    %rsi,0x28(%rsp)
   216ce:	48 89 54 24 20       	mov    %rdx,0x20(%rsp)
   216d3:	48 89 4c 24 18       	mov    %rcx,0x18(%rsp)
   216d8:	c5 fb 11 44 24 10    	vmovsd %xmm0,0x10(%rsp)
   216de:	48 8b 44 24 28       	mov    0x28(%rsp),%rax
   216e3:	48 8b 4c 24 30       	mov    0x30(%rsp),%rcx
   216e8:	48 89 ca             	mov    %rcx,%rdx
   216eb:	48 83 c2 08          	add    $0x8,%rdx
   216ef:	48 89 54 24 30       	mov    %rdx,0x30(%rsp)
   216f4:	48 63 09             	movslq (%rcx),%rcx
   216f7:	48 8b 04 88          	mov    (%rax,%rcx,4),%rax
   216fb:	48 89 44 24 08       	mov    %rax,0x8(%rsp)
   21700:	48 8b 44 24 08       	mov    0x8(%rsp),%rax
   21705:	48 0b 44 24 18       	or     0x18(%rsp),%rax
   2170a:	48 89 44 24 18       	mov    %rax,0x18(%rsp)
   2170f:	48 8b 44 24 30       	mov    0x30(%rsp),%rax
   21714:	48 8b 00             	mov    (%rax),%rax
   21717:	48 8b 7c 24 30       	mov    0x30(%rsp),%rdi
   2171c:	48 83 c7 08          	add    $0x8,%rdi
   21720:	48 8b 74 24 28       	mov    0x28(%rsp),%rsi
   21725:	48 8b 54 24 20       	mov    0x20(%rsp),%rdx
   2172a:	48 8b 4c 24 18       	mov    0x18(%rsp),%rcx
   2172f:	c5 fb 10 44 24 10    	vmovsd 0x10(%rsp),%xmm0
   21735:	ff d0                	callq  *%rax
   21737:	48 83 c4 38          	add    $0x38,%rsp
   2173b:	c3                   	retq   
   2173c:	0f 1f 40 00          	nopl   0x0(%rax)

0000000000020770 <op_u64_Or_rs>:
   20770:	48 89 7c 24 f8       	mov    %rdi,-0x8(%rsp)
   20775:	48 89 74 24 f0       	mov    %rsi,-0x10(%rsp)
   2077a:	48 89 54 24 e8       	mov    %rdx,-0x18(%rsp)
   2077f:	48 89 4c 24 e0       	mov    %rcx,-0x20(%rsp)
   20784:	c5 fb 11 44 24 d8    	vmovsd %xmm0,-0x28(%rsp)
   2078a:	48 8b 44 24 f0       	mov    -0x10(%rsp),%rax
   2078f:	48 8b 4c 24 f8       	mov    -0x8(%rsp),%rcx
   20794:	48 89 ca             	mov    %rcx,%rdx
   20797:	48 83 c2 08          	add    $0x8,%rdx
   2079b:	48 89 54 24 f8       	mov    %rdx,-0x8(%rsp)
   207a0:	48 63 09             	movslq (%rcx),%rcx
   207a3:	48 8b 04 88          	mov    (%rax,%rcx,4),%rax
   207a7:	48 89 44 24 d0       	mov    %rax,-0x30(%rsp)
   207ac:	48 8b 44 24 d0       	mov    -0x30(%rsp),%rax
   207b1:	48 8b 4c 24 e0       	mov    -0x20(%rsp),%rcx
   207b6:	48 09 c8             	or     %rcx,%rax
   207b9:	48 89 44 24 e0       	mov    %rax,-0x20(%rsp)
   207be:	48 8b 7c 24 f8       	mov    -0x8(%rsp),%rdi
   207c3:	48 8b 07             	mov    (%rdi),%rax
   207c6:	48 83 c7 08          	add    $0x8,%rdi
   207ca:	48 8b 74 24 f0       	mov    -0x10(%rsp),%rsi
   207cf:	48 8b 54 24 e8       	mov    -0x18(%rsp),%rdx
   207d4:	48 8b 4c 24 e0       	mov    -0x20(%rsp),%rcx
   207d9:	c5 fb 10 44 24 d8    	vmovsd -0x28(%rsp),%xmm0
   207df:	ff e0                	jmpq   *%rax
   207e1:	66 66 66 66 66 66 2e 	data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
   207e8:	0f 1f 84 00 00 00 00 
   207ef:	00

接下用 O3 去編譯，可發現不需要到 function stack，

0000000000014630 <op_u64_Or_rs>:
   14630:	48 63 07             	movslq (%rdi),%rax
   14633:	48 0b 0c 86          	or     (%rsi,%rax,4),%rcx
   14637:	48 8b 47 08          	mov    0x8(%rdi),%rax
   1463b:	48 83 c7 10          	add    $0x10,%rdi
   1463f:	ff e0                	jmpq   *%rax

用 gcc 編出來的

0000000000011810 <op_u64_Or_rs>:
   11810:	f3 0f 1e fa          	endbr64 
   11814:	48 63 07             	movslq (%rdi),%rax
   11817:	4c 8d 47 10          	lea    0x10(%rdi),%r8
   1181b:	48 0b 0c 86          	or     (%rsi,%rax,4),%rcx
   1181f:	48 8b 47 08          	mov    0x8(%rdi),%rax
   11823:	4c 89 c7             	mov    %r8,%rdi
   11826:	ff e0                	jmpq   *%rax
   11828:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
   1182f:	00

sqrt

有 callq 的 funciton 就會用到 function stack

`attribute((musttail))` 限制

function 參數宣告要一樣
slow path 要包成一個 function (似乎沒影響)
不能放在函式宣告處

error: 'musttail' attribute cannot be applied to a declaration
__attribute__((musttail)) int factorial(int n, int sum) {
               ^              ~~~~~~~~~
1 error generated.

wasm3 porfiler

將 m3_config.h 中的 d_m3EnableOpProfiling 設成 1，跑 coremark 時就會印出以下資訊

   1324428702  op_SetSlot_i32
    540959457  op_i32_Add_ss
    418617774  op_CopySlot_32
    318460691  op_i32_Load_i32_s
    228893365  op_u32_And_rs
    179737778  op_i32_Add_rs
    155645868  op_BranchIf_r
    129274454  op_u32_Xor_rs
    126925034  op_SetRegister_i32
    113785674  op_i32_Multiply_rs
    100791878  op_i32_Store_i32_ss
     94615534  op_u32_ShiftRight_sr
     89468905  op_u32_ShiftRight_ss
     89339933  op_u32_And_ss
     87556818  op_i32_Load_i16_s
     81719750  op_i32_Load_u16_s
     80991816  op_i32_Equal_ss
     75518225  op_Select_i32_rss
     64609494  op_If_r
     61167788  op_i32_Load_u8_r
     60755923  op_i32_Equal_rs
     57889024  op_i32_Load_u8_s
     53672190  op_i32_NotEqual_ss
     49348860  op_i32_EqualToZero_r
     47495786  op_i32_Store_i32_rs
     39187964  op_PreserveSetSlot_i32
     37928724  op_i32_GreaterThan_ss
     35663950  op_i32_Load_u16_r
     28192083  op_u32_LessThanOrEqual_sr
     21272720  op_i32_EqualToZero_s
     21145878  op_Call
     18940351  op_ContinueLoopIf
     18782683  op_u32_ShiftLeft_ss
     17971255  op_u32_Or_ss
     16252015  op_i32_NotEqual_rs
     15694560  op_Select_i32_srs
     14192986  op_u32_Xor_ss
     12013411  op_u32_LessThan_sr
      8283274  op_i32_Store_i32_sr
      8186576  op_u32_ShiftLeft_sr
      8186565  op_i32_Store_i16_rs
      7266221  op_u32_GreaterThan_sr
      6154402  op_i32_Store_u8_rs
      5207319  op_i32_ShiftRight_sr
      3947862  op_i32_Subtract_sr
      2519636  op_i32_LessThanOrEqual_rs
      2494769  op_i32_LessThan_sr
      2470467  op_i32_GreaterThan_sr
      2386041  op_i32_Load_i32_r
      1979450  op_i32_LessThan_ss
      1380540  op_i32_Load_i16_r
      1344938  op_u32_LessThanOrEqual_rs
      1211117  op_u32_LessThan_ss
      1017538  op_u32_Or_rs
       986275  op_BranchIf_s
       872075  op_i32_Multiply_ss
       363370  op_i32_Store_i16_ss
       218039  op_i32_GreaterThanOrEqual_ss
       193813  op_i64_Store_i64_ss
       193805  op_i64_Store_i64_sr
        97373  op_i32_Subtract_ss
        97036  op_SetGlobal_i32
        48536  op_If_s
        48518  op_GetGlobal_s32
        48499  op_u32_GreaterThan_ss
        24238  op_i32_LessThanOrEqual_sr
          218  op_i32_GreaterThan_rs
          212  op_Const32
          204  op_i32_Subtract_rs
          100  op_i32_Store_u8_sr
           90  op_i32_Load_i8_s
           81  op_i32_Remainder_sr
           74  op_u32_GreaterThanOrEqual_ss
           68  op_u32_GreaterThanOrEqual_sr
           61  op_SetSlot_i64
           58  op_u32_Divide_ss
           49  op_Select_i32_ssr
           44  op_i64_Load_i64_s
           42  op_Const64
           33  op_BranchIfPrologue_r
           32  op_CopySlot_64
           32  op_SetSlot_f64
           27  op_i32_Load_i8_r
           27  op_i32_Wrap_i64_s
           23  op_Select_i32_sss
           20  op_u64_ShiftRight_ss
           20  op_i64_NotEqual_rs
           16  op_i64_Store_i64_rs
           15  op_i32_Store_u8_ss
           15  op_f64_NotEqual_rs
           13  op_f64_LessThan_ss
           13  op_u32_Trunc_f64_r_s
           13  op_f64_GreaterThanOrEqual_ss
           13  op_u32_LessThanOrEqual_ss
           12  op_f64_Subtract_rs
           12  op_f64_Convert_u32_r_s
           12  op_i32_LessThanOrEqual_ss
           12  op_i64_EqualToZero_s
           12  op_f64_Multiply_rs
           10  op_u32_GreaterThanOrEqual_rs
            8  op_i64_Load_u32_s
            7  op_u64_LessThan_ss
            7  op_f64_LessThan_sr
            7  op_i64_LessThanOrEqual_sr
            6  op_i64_Reinterpret_f64_r_s
            6  op_f64_Divide_sr
            6  op_f64_Add_ss
            5  op_u32_GreaterThan_rs
            5  op_i64_EqualToZero_r
            4  op_i64_Load_i32_s
            4  op_i32_Wrap_i64_r
            4  op_i64_Subtract_rs
            3  op_f64_Multiply_ss
            3  op_u64_And_ss
            3  op_f64_Store_f64_rs
            3  op_Select_f64_rsr
            3  op_f64_Load_f64_s
            3  op_u64_Or_rs
            3  op_f64_Convert_i64_r_s
            3  op_Select_f64_rss
            3  op_u32_Divide_sr
            3  op_i32_Divide_sr
            3  op_f64_Abs_s
            3  op_i32_LessThan_rs
            3  op_f64_Equal_rs
            3  op_u64_ShiftRight_sr
            3  op_f64_Reinterpret_i64_r_r
            3  op_f64_Convert_i64_r_r
            3  op_Select_f64_rrs
            2  op_u32_ShiftRight_rs
            2  op_f64_Convert_u32_r_r
            2  op_f64_Convert_i64_s_s
            2  op_u32_Divide_rs
            2  op_f64_Divide_ss
            2  op_PreserveCopySlot_32
            2  op_f64_Divide_rs
            1  op_MemSize
            1  op_i64_Multiply_rs
            1  op_f64_GreaterThan_sr
            1  op_i64_Store_i32_ss
            1  op_i64_Extend_u32_s

wasm3 Strategy

clang compile

Tail Call

issue Report

Coremark 測試

with and without __attribute__((musttail)) in O0

sqrt

__attribute__((musttail)) 限制

wasm3 porfiler

Read more

實作 Thread package

Cache Coherency & I/O ordering

CO Final

New EBB Strategy

with and without `attribute((musttail))` in O0

`attribute((musttail))` 限制