contributed by < `jimmylu890303` >
| Type | Sign bit | Exponent bits | Significand bits |
|---|---|---|---|
| Float32 | 1 | 8 | 23 |
| BFloat16 | 1 | 8 | 7 |
Disadvantage: As the table shows, BFloat16 has only 7 significand bits, which results in lower precision than Float32.
Advantage: BFloat16 halves the storage requirement and speeds up machine-learning calculations, while its 8 exponent bits preserve the same dynamic range as Float32.
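To make the size trade-off concrete, here is a minimal C sketch of a Float32-to-BFloat16 conversion by truncation (the actual `fp32_to_bf16` routine mentioned later may round rather than truncate; this is only an assumption for illustration):

```c
#include <stdint.h>
#include <string.h>

/* Minimal sketch: convert a Float32 to BFloat16 by truncation.
 * BFloat16 is simply the upper 16 bits of an IEEE 754 single:
 * same sign bit and 8-bit exponent, significand cut to 7 bits. */
static uint16_t fp32_to_bf16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);  /* reinterpret the float's bit pattern */
    return (uint16_t)(bits >> 16);   /* keep sign, exponent, top 7 mantissa bits */
}
```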
The simple algorithm for IEEE 754 32-bit multiplication involves the following steps:
1. Extract the sign, exponent, and mantissa fields of the two operands A and B.
2. Compute the result sign: `signC = signA XOR signB`.
3. Add the exponents and subtract the bias: `expC = expA + expB - 127`.
4. Restore the implicit leading 1 of each mantissa (the `1.M` form).
5. Multiply the mantissas: `Cmantissa = Amantissa * Bmantissa`.
6. Normalize the product, adjust the exponent accordingly, and round.
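As a reference, here is a hedged C sketch of these steps (the helper name `fp32_mul` is invented; zeros, subnormals, infinities, NaNs, and proper rounding are deliberately ignored):

```c
#include <stdint.h>
#include <string.h>

/* Sketch of IEEE 754 single-precision multiplication following the
 * steps above. Special values and correct rounding are omitted. */
static float fp32_mul(float a, float b) {
    uint32_t ua, ub;
    memcpy(&ua, &a, 4);
    memcpy(&ub, &b, 4);

    /* Step 1: extract fields. */
    uint32_t signA = ua >> 31,           signB = ub >> 31;
    int32_t  expA  = (ua >> 23) & 0xFF;  int32_t expB = (ub >> 23) & 0xFF;
    uint32_t manA  = ua & 0x7FFFFF,      manB  = ub & 0x7FFFFF;

    /* Step 2: result sign. */
    uint32_t signC = signA ^ signB;

    /* Step 3: add exponents, remove the double-counted bias. */
    int32_t expC = expA + expB - 127;

    /* Step 4: restore the implicit leading 1 (24-bit significands). */
    manA |= 1u << 23;
    manB |= 1u << 23;

    /* Step 5: multiply the 24-bit mantissas -> up to a 48-bit product. */
    uint64_t prod = (uint64_t)manA * (uint64_t)manB;

    /* Step 6: normalize (product lies in [2^46, 2^48)) and truncate. */
    if (prod & (1ull << 47)) {   /* product >= 2.0: shift and bump exponent */
        prod >>= 1;
        expC += 1;
    }
    uint32_t manC = (uint32_t)((prod >> 23) & 0x7FFFFF); /* drop leading 1 */

    uint32_t uc = (signC << 31) | ((uint32_t)expC << 23) | manC;
    float c;
    memcpy(&c, &uc, 4);
    return c;
}
```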
When multiplying two Float32 numbers, step 5 (computing `Cmantissa`) requires two 32-bit registers to hold the result of `Amantissa * Bmantissa`, because the 24-bit × 24-bit product (1 implicit integer bit plus 23 mantissa bits per operand) can be up to 48 bits wide. The shift-and-add multiplication must also iterate 24 times.
With BFloat16, however, step 5 needs only one 32-bit register to hold the result of `Amantissa * Bmantissa`, since the 8-bit × 8-bit product (1 implicit integer bit plus 7 mantissa bits per operand) fits in 16 bits, and the multiplication iterates only 8 times.
In conclusion, BFloat16 is more hardware-friendly (one register instead of two) and more time-efficient (8 iterations instead of 24) when performing multiplication.
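The following C sketch stands in for the assembly listing in the repository and shows the shift-and-add idea the next paragraph analyzes (the function name is made up):

```c
#include <stdint.h>

/* Shift-and-add product of two 24-bit mantissas.
 * The result can be 48 bits wide, so a 64-bit accumulator
 * (two 32-bit registers on RV32) is required. */
static uint64_t mul_mantissa24(uint32_t a, uint32_t b) {
    uint64_t acc = 0;
    uint64_t multiplicand = a;
    for (int i = 0; i < 24; i++) {   /* 24 iterations */
        if (b & 1)
            acc += multiplicand;     /* add the shifted multiplicand */
        multiplicand <<= 1;
        b >>= 1;
    }
    return acc;                      /* up to 48 significant bits */
}
```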
From the code above, we can see that a 64-bit accumulator (two 32-bit registers) is needed to store the product of two 24-bit mantissas, and that the loop iterates 24 times to complete the multiplication.
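And the corresponding BFloat16 version, again as a sketch in place of the original assembly:

```c
#include <stdint.h>

/* Shift-and-add product of two 8-bit mantissas.
 * The result is at most 16 bits, so a single 32-bit
 * register is enough to hold it. */
static uint32_t mul_mantissa8(uint32_t a, uint32_t b) {
    uint32_t acc = 0;
    for (int i = 0; i < 8; i++) {   /* only 8 iterations */
        if (b & 1)
            acc += a;
        a <<= 1;
        b >>= 1;
    }
    return acc;                     /* up to 16 significant bits */
}
```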
From the code above, we can see that a single 32-bit register is enough to store the product of two 8-bit mantissas, and that the loop iterates only 8 times.
We can clearly see that BFloat16 multiplication is significantly faster. Although some computational precision is sacrificed, the trade-off is worthwhile for the benefits gained.
Follow my GitHub to check out the full implementation.
The code was tested using the Ripes simulator.
Here are the first few lines of the disassembled executable code:
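For the walkthrough below, the entry that matters is the first one, shown here in approximate form (reconstructed from the address, machine word, and mnemonic discussed next):

```
0x00000000:   170000ef   jal x1 368 <test>
```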
Let's take a look at how the `jal x1 368 <test>` instruction, located at address `0x0`, works during execution.

- **IF stage:** The PC is `0x0`, meaning that we are going to fetch the instruction at `0x0`, which is `jal x1 368 <test>` (machine code `0x170000ef`). The PC+4 adder computes `0x4` as the fall-through address for the next cycle.
- **ID stage:** `0x170000ef` is decoded by the Decoder. The register indices `0x00` (`zero`) and `0x10` (`a6`) are sent to the Register circuit to read their values. Since the instruction is a `jal`, the offset of the label `test` is decoded, and its value is `0x170`.
- **EX stage:** The ALU takes `0x0` (the PC) and `0x170`, and sums them to obtain the new address `0x170` (label: `test`). The value of `a6` and the destination register `ra` are sent to the next stage.
- **MEM stage:** `jal` needs no memory access, so `0x4` (the return address) and `0x1` (the `ra` index) are simply sent to the next stage.
- **WB stage:** The processor writes `0x4` (the return address) into register `0x1` (the `ra` register). By this point, the fetch stage has already moved on to the instruction at the address `test`+4.
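To double-check these numbers, here is a small C sketch (written for this walkthrough) that decodes the `jal` fields out of the machine word `0x170000ef`:

```c
#include <stdio.h>
#include <stdint.h>

int main(void) {
    uint32_t insn = 0x170000EF;                 /* jal x1 368 <test> */
    uint32_t rd = (insn >> 7) & 0x1F;           /* bits 11:7 -> destination register */
    /* J-immediate layout: imm[20|10:1|11|19:12]; bit 0 is always 0.
     * Sign extension is omitted since imm[20] is 0 for this word. */
    uint32_t imm = (((insn >> 31) & 0x1)   << 20)
                 | (((insn >> 12) & 0xFF)  << 12)
                 | (((insn >> 20) & 0x1)   << 11)
                 | (((insn >> 21) & 0x3FF) << 1);
    printf("rd = x%u, offset = %u\n", rd, imm); /* prints: rd = x1, offset = 368 */
    return 0;
}
```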
Here I compare the performance of the optimized code against the previous version, which converts the two Float32 inputs to BFloat16 (by invoking the `fp32_to_bf16` function twice) and then performs the BFloat16 multiplication.
The complete optimized code is in `main_optimized.s`.
We can observe that there is no longer any need to flush the two instructions behind `jal ra, checkZero`, resulting in a reduction in the total cycle count.
We can also observe that the number of flushed instructions is reduced when loop unrolling is employed.
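To illustrate why unrolling reduces flushes on a pipeline that resolves branches late (each taken branch or jump on the Ripes 5-stage processor costs two flushed instructions, as seen above), here is a hedged C sketch; the function names and loop body are invented for the example:

```c
/* Rolled loop: one backward branch per element. On a 5-stage
 * pipeline that resolves branches in EX, every taken branch
 * flushes the two instructions fetched after it. */
int sum_rolled(const int *a, int n) {
    int s = 0;
    for (int i = 0; i < n; i++)
        s += a[i];
    return s;
}

/* Unrolled by 4: one backward branch per four elements, so
 * roughly a quarter of the branch-related flushes. Assumes n
 * is a multiple of 4 to keep the sketch short. */
int sum_unrolled(const int *a, int n) {
    int s = 0;
    for (int i = 0; i < n; i += 4)
        s += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
    return s;
}
```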