
Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by < Hotmercury >
The instruction reference and the full code can be found on GitHub.

Find the zero bit closest to the LSB, then clear that bit and every bit to its left, keeping only the run of trailing ones.
We can use this mask to compute how far the carry propagates when incrementing.

Example:
mask = 1010 0111
1010 0111 -> 0000 0111
Then ((mask << 1) | 1) ^ mask isolates the carry bit:
-> 0000 1000

uint64_t mask_lowest_zero(uint64_t x)
{
    uint64_t mask = x;
    mask &= (mask << 1) | 0x1;
    mask &= (mask << 2) | 0x3;
    mask &= (mask << 4) | 0xF;
    mask &= (mask << 8) | 0xFF;
    mask &= (mask << 16) | 0xFFFF;
    mask &= (mask << 32) | 0xFFFFFFFF;
    return mask;
}
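
A quick check of the example above (the test main below is my own addition, not part of the assignment code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 1010 0111 (0xA7) -> expect 0000 0111 (0x07) */
    printf("%llx\n", (unsigned long long) mask_lowest_zero(0xA7));
    /* all ones: there is no zero bit, so the mask stays all ones */
    printf("%llx\n", (unsigned long long) mask_lowest_zero(~0ULL));
    return 0;
}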

Note that the `ori` immediate field is only 12 bits wide and sign-extended.


A mask of 2^11 (2048) or larger therefore does not fit in the immediate and must first be loaded into a register, e.g. with `li`, which expands to `lui` + `addi`.
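
For illustration only, here is a rough sketch (my own, not what the assembler actually emits) of how such a constant can be split into an upper 20-bit part for `lui` and a sign-extended 12-bit part for `addi`, which is essentially what `li` does:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t c = 0xFFFF;          /* too large for a single 12-bit immediate */
    int32_t lo = c & 0xFFF;
    if (lo >= 0x800)              /* the 12-bit immediate is sign-extended */
        lo -= 0x1000;
    uint32_t hi = (c - (uint32_t) lo) >> 12;
    printf("lui  x5, 0x%x\n", hi);    /* prints: lui  x5, 0x10   */
    printf("addi x5, x5, %d\n", lo);  /* prints: addi x5, x5, -1 */
    return 0;
}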

.data
    # dword -> 8 bytes
    test_1: .dword 0x10FFFFFFFFFF3333

.text
main:
    la t0, test_1     # pseudo-instruction: load the address of test_1 (lui/auipc + addi)
    lw a0, 0(t0)      # low 32 bits (little-endian)
    lw a1, 4(t0)      # high 32 bits
    jal mask_lowest_zero
    li a7, 10         # exit
    ecall

mask_lowest_zero:
    # a0 low , a1 high 
    # mask &= (mask << 1) | 1;
    # a1,a0 = 64 bits parameter 
    slli t1, a1, 1
    srli t0, a0, 31
    or t1, t1, t0     # t1 = high word of (mask << 1)
    slli t0, a0, 1    # t0 = a0 << 1
    ori t0, t0, 1     # x = x | 1

    and a0, a0, t0
    and a1, a1, t1

    # mask &= (mask << 2) | 0x3;
    slli t1, a1, 2
    srli t0, a0, 30
    or t1, t1, t0     # left  32 bits
    slli t0, a0, 2    # t0 = a0 << 2
    ori t0, t0, 3     # x = x | 3

    and a0, a0, t0
    and a1, a1, t1

    # mask &= (mask << 4) | 0xF;
    slli t1, a1, 4
    srli t0, a0, 28
    or t1, t1, t0     # left  32 bits
    slli t0, a0, 4    # t0 = a0 << 4
    ori t0, t0, 0xF   # x = x | 0xF

    and a0, a0, t0
    and a1, a1, t1

    # mask &= (mask << 8) | 0xFF;
    slli t1, a1, 8
    srli t0, a0, 24
    or t1, t1, t0     # left  32 bits
    slli t0, a0, 8    # t0 = a0 << 8
    ori t0, t0, 0xFF  # x = x | 0xFF

    and a0, a0, t0
    and a1, a1, t1

    # mask &= (mask << 16) | 0xFFFF;
    li t3, 0xFFFF  # expands to lui + addi
    slli t1, a1, 16
    srli t0, a0, 16
    or t1, t1, t0     # left  32 bits
    slli t0, a0, 16   # t0 = a0 << 16
    or t0, t0, t3    # x = x | 0xFFFF

    and a0, a0, t0
    and a1, a1, t1

    # mask &= (mask << 32) | 0xFFFFFFFF;
    and a1, a1, a0    # high word &= low word; low word is unchanged
    ret

x = x + 1

Flow:

  1. Check whether x is all ones (incrementing would overflow to 0).
  2. Build the carry mask with mask_lowest_zero.
  3. Clear the trailing ones, set the carry bit, and merge with the untouched upper bits.
int64_t inc(int64_t x)
{
    // x = all one bits will overflow to 0
    if (~x == 0)
        return 0;
    // set carry flag
    int64_t mask = mask_lowest_zero(x);
    // 0011 -> 0100
    int64_t z1 = mask ^ ((mask << 1) | 1);
    return (x & ~mask) | z1;
}
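
A quick sanity check of inc (the main below is my own addition):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    printf("%lld\n", (long long) inc(3));   /* 0011 -> 0100: prints 4 */
    printf("%lld\n", (long long) inc(7));   /* 0111 -> 1000: prints 8 */
    printf("%lld\n", (long long) inc(-1));  /* all ones wraps to 0    */
    return 0;
}
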
inc:
    addi sp, sp, -12
    sw ra, 0(sp)
    sw a0, 4(sp)
    sw a1, 8(sp)     # save parameters

    jal mask_lowest_zero
    mv t0, a0
    mv t1, a1        # t1,t0 mask
    
    lw a0, 4(sp)
    lw a1, 8(sp)     # restore parameters
    
    slli t3, t1, 1
    srli t2, t0, 31
    or t3, t3, t2
    slli t2, t0, 1    # a1,a0 << 1
    ori t2, t2, 1      # a1,a0 | 1

    xor t2, t2, t0
    xor t3, t3, t1    #  t2,t3  z1

    not t1, t1
    not t0, t0         # ~mask       

    and a1, a1, t1
    and a0, a0, t0    # a1,a0 & ~mask

    or a1, a1, t3
    or a0, a0, t2     # a1,a0 | z1

    lw ra, 0(sp)
    addi sp, sp, 12
    ret

Find the value of the nth bit.

static inline int64_t getbit(int64_t value, int n)
{
    return (value >> n) & 1;
}
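
For example (values chosen by me to hit a bit in each 32-bit half, which matters for the RV32 version below):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int64_t v = 0x100000005LL;                    /* bits 0, 2 and 32 set */
    printf("%lld\n", (long long) getbit(v, 0));   /* 1 (low word)  */
    printf("%lld\n", (long long) getbit(v, 1));   /* 0             */
    printf("%lld\n", (long long) getbit(v, 32));  /* 1 (high word) */
    return 0;
}
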
getbit:
    addi sp, sp, -8
    sw ra, 0(sp)
    sw s0, 4(sp)
    li s0, 32

    bge a2, s0, getbit_l        # if (n >= 32), take the bit from the high word
    srl a0, a0, a2
    andi a0, a0, 1
    j getbit_end
getbit_l:
    sub s0, a2, s0              # n - 32
    srl a1, a1, s0
    andi a1, a1, 1
    mv a0, a1
getbit_end:
    lw ra, 0(sp)
    lw s0, 4(sp)
    addi sp, sp, 8
    ret

multiplicand × multiplier = result

Flow:

  1. Get the i-th bit of b.
  2. If it is 1, add (a << i) to the result.
int64_t imul32(int32_t a, int32_t b)
{
    int64_t r = 0, a64 = (int64_t) a, b64 = (int64_t) b;
    for (int i = 0; i < 32; i++) {
        if (getbit(b64, i))
            r += a64 << i;
    }
    return r;
}
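
A host-side check of imul32 against a native 64-bit multiply (my own harness; it sticks to non-negative multipliers, since the loop only looks at the low 32 bits of b, and that is the case fmul32 needs):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t tests[][2] = { {3, 5}, {-7, 9}, {0x800000, 0x800000}, {123456, 654321} };
    for (int i = 0; i < 4; i++) {
        int32_t a = tests[i][0], b = tests[i][1];
        int64_t got  = imul32(a, b);
        int64_t want = (int64_t) a * b;
        printf("%d * %d -> %s\n", a, b, got == want ? "ok" : "mismatch");
    }
    return 0;
}
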
imul32:
    addi sp, sp, -28
    sw ra,0(sp)
    sw s0,4(sp)
    sw s1,8(sp)
    sw s2,12(sp)
    sw s3,16(sp)
    sw s4,20(sp)
    sw s5,24(sp)
    mv s0, a0   # a
    mv s1, a1   # b
    li s2, 0   
    li s3, 0    # result s3,s2 
    li s4, 0    # i counter
    li s5, 32   # loop bound
imul32_loop:
    beq s4, s5, imul32_end
    mv a0, a1   # a0 = b
                # a1 dont care
    mv a2, s4   # a2 = i
    jal getbit
    beq a0, zero, imul_skip # if (getbit(b, i))
    sub t2, s5, s4  # t2 = 32 - i 
    mv t0, s0       # t0 = a
    srl t1, t0, t2  # t1 = the bits of (a << i) that spill into the high word
    sll t0, t0, s4  # t0 = low word of (a << i)

    slli t3, s2, 31 # overflow check
    slli t4, t0, 31 # overflow check
    and t5, t3, t4  # overflow check
    beq t5, zero, 8   # if (overflow)
    addi s3, s3, 1  # result++
    add s2, s2, t0  
    add s3, s3, t1  # r += a << i
imul_skip:
    addi s4, s4, 1  # i++
    j imul32_loop

imul32_end:
    mv a0, s2
    mv a1, s3
    lw ra,0(sp)
    lw s0,4(sp)
    lw s1,8(sp)
    lw s2,12(sp)
    lw s3,16(sp)
    lw s4,20(sp)
    lw s5,24(sp)
    addi sp,sp,28
    ret

Merge the result and b into one 64-bit value, result(32) | b(32): the running result sits in the upper 32 bits and the remaining multiplier bits in the lower 32 bits.
Each step adds a to the upper half and shifts the whole value right by one, so the cross-word carry handling needed above disappears when implementing the 64-bit arithmetic on RV32.

// improve int32 multiply
int64_t imul32_improve(int32_t a, int32_t b)
{
    int64_t r = (int64_t)b;
    int64_t a64 = (int64_t)a;
    for (int i = 0; i < 32; i++){
        if(r & 1){
            r = r + (a64 << 32);
        }
        r = r >> 1;
    }
    return r;
}
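
A quick comparison of the two versions (my own harness, again restricted to the non-negative, mantissa-sized operands fmul32 passes in):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t a = 0x812345, b = 0x9ABCDE;   /* two 24-bit mantissa-like values */
    int64_t r0 = imul32(a, b);
    int64_t r1 = imul32_improve(a, b);
    printf("%llx %llx %s\n",
           (unsigned long long) r0, (unsigned long long) r1,
           r0 == r1 ? "match" : "mismatch");
    return 0;
}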

float32 multiply
Flow:

  1. Reinterpret the floats as their IEEE 754 bit patterns.
  2. Compute the sign bit.
  3. Extract the mantissas and add back the implicit leading 1 (23 + 1 = 24 bits).
  4. Add the exponents and subtract the bias (127).
  5. Multiply the two 24-bit mantissas with imul32.
  6. A 24 x 24 multiplication is at most 48 bits; after dropping the low 23 bits up to 25 significant bits remain, but we only keep 24, so int mshift = getbit(mrtmp, 24); checks whether bit 24 (the 25th bit from the LSB) is set and an extra shift is needed.
  7. int32_t er = mshift ? inc(ertmp) : ertmp; if we shifted, add one to the exponent.
  8. Pack the new IEEE 754 value.
float fmul32(float a, float b)
{
    /* TODO: Special values like NaN and INF */
    int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b;

    /* sign */
    int sa = ia >> 31;
    int sb = ib >> 31;

    /* mantissa */
    int32_t ma = (ia & 0x7FFFFF) | 0x800000;
    int32_t mb = (ib & 0x7FFFFF) | 0x800000;

    /* exponent */
    int32_t ea = ((ia >> 23) & 0xFF);
    int32_t eb = ((ib >> 23) & 0xFF);

    /* 'r' = result */
    int64_t mrtmp = imul32(ma, mb) >> 23;
    int mshift = getbit(mrtmp, 24);

    int64_t mr = mrtmp >> mshift;
    int32_t ertmp = ea + eb - 127;
    int32_t er = mshift ? inc(ertmp) : ertmp;
    /* TODO: Overflow ^ */
    int sr = sa ^ sb;
    int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF);
    return *(float *) &r;
}
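
A small comparison against the hardware float multiply (my own harness; the operands are chosen so the products are exactly representable, because fmul32 truncates instead of rounding and leaves NaN/INF handling as a TODO):

#include <stdio.h>

int main(void)
{
    float pairs[][2] = { {1.5f, 2.0f}, {0.375f, 12.0f}, {-2.5f, 4.0f} };
    for (int i = 0; i < 3; i++) {
        float a = pairs[i][0], b = pairs[i][1];
        float got = fmul32(a, b), want = a * b;
        printf("%g * %g = %g (%s)\n", a, b, got, got == want ? "ok" : "differ");
    }
    return 0;
}
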
fmul32:
    addi sp, sp, -24
    sw ra, 0(sp)
    sw s0, 4(sp)
    sw s1, 8(sp)
    sw s2, 12(sp)
    sw s3, 16(sp)
    sw s4, 20(sp)

    srli s0, a0, 31
    srli s1, a1, 31
    xor s0, s0, s1  # s0 = sign_a ^ sign_b

    li t0, 0x7FFFFF
    li t1, 0x800000
    and s1, a0, t0
    or s1, s1, t1   # s1 = mantissa_a  
    and s2, a1, t0  
    or s2, s2, t1   # s2 = mantissa_b

    srli s3, a0, 23
    andi s3, s3, 0xFF # s3 = exp_a
    srli s4, a1, 23
    andi s4, s4, 0xFF # s4 = exp_b

    mv a0, s1
    mv a1, s2
    jal imul32
    mv s1, a0
    mv s2, a1        # s1,s2 = mantissa_a * mantissa_b
    srli s1, s1, 23
    slli s2, s2, 9
    or s1, s1, s2   # s1 = mantissa_a * mantissa_b >> 23
                     # s2 dont care
    mv a0, s1        # a0 = mantissa_a * mantissa_b >> 23
    li a2, 24        # a2 = 24 (the bit to test)
    jal getbit
    srl s1, s1, a0   # s1 = mantissa_a * mantissa_b >> 23 >> getbit

    add s3, s3, s4   # s3 = exp_a + exp_b
    addi s3, s3, -127
                     # s4 dont care
    # int32_t er = mshift ? inc(ertmp) : ertmp;
    # skip inc
    add s3, s3, a0   # s3 = er

    slli s0, s0, 31 # s0 = sr << 31
    andi s3, s3, 0xFF # s3 = er
    slli s3, s3, 23 # s3 = er << 23
    li t0, 0x7FFFFF
    and s1, s1, t0  # s1 = mr
    or s0, s0, s3   # s0 = (sr << 31) | (er << 23)
    or s0, s0, s1   # s0 = (sr << 31) | (er << 23) | mr
    mv a0, s0

    lw ra, 0(sp)
    lw s0, 4(sp)
    lw s1, 8(sp)
    lw s2, 12(sp)
    lw s3, 16(sp)
    lw s4, 20(sp)
    addi sp, sp, 24
    ret

Analyze

  • What is CPI?
    Cycles Per Instruction: the average number of clock cycles per executed instruction, so a lower CPI is better (a worked example follows after this list).

    $\text{CPI} = \dfrac{\sum_i \text{IC}_i \times \text{CC}_i}{\text{IC}}$

    where $\text{IC}_i$ is the number of instructions of class $i$, $\text{CC}_i$ the cycles that class takes, and $\text{IC}$ the total instruction count.

  • What is IPC?
    Instructions Per Cycle, the reciprocal of CPI; higher is better.
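
As a hypothetical worked example (the instruction mix is invented for illustration, not measured in the simulator): suppose a program executes 1000 instructions, of which 700 are ALU instructions taking 1 cycle, 200 are loads taking 2 cycles, and 100 are branches taking 3 cycles. Then

$\text{CPI} = \dfrac{700 \times 1 + 200 \times 2 + 100 \times 3}{1000} = 1.4, \qquad \text{IPC} = \dfrac{1}{1.4} \approx 0.71$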

When calculating floating-point multiplication, we need to compute the sign bit, exponent, and mantissa. This calculation involves multiplying the mantissas of two floating-point numbers, which means using two 23-bit segments from the IEEE representation, denoted as |1|8|23|. Upon observing the actual operation using a simulator, we identified that the performance bottleneck primarily lies in the mantissa calculation. The two diagrams below show the original and improved data, demonstrating a significant reduction of nearly 50% in cycles.

difference
original code

imul32_loop:
    beq s4, s5, imul32_end
    mv a0, a1   # a0 = b
    mv a2, s4   # a2 = i
    jal getbit
    beq a0, zero, imul_skip # if (getbit(b, i))
    sub t2, s5, s4  # t2 = 32 - i 
    mv t0, s0       # t0 = a
    srl t1, t0, t2  # t1 = the bits of (a << i) that spill into the high word
    sll t0, t0, s4  # t0 = low word of (a << i)

    slli t3, s2, 31 # overflow check
    slli t4, t0, 31 # overflow check
    and t5, t3, t4  # overflow check
    beq t5, zero, 8   # if (overflow)
    addi s3, s3, 1  # result++
    add s2, s2, t0  
    add s3, s3, t1  # r += a << i
imul_skip:
    addi s4, s4, 1  # i++
    j imul32_loop
  1. Calling the getbit function with jal getbit costs a significant number of cycles. Observing the pipeline, we can also see that each jal introduces two extra NOP bubbles.



    A pipeline clear (flush) happens as well.

  2. When getbit returns true, we need to perform r += a64 << i: shift a64 left by i and add it to r. This operation introduces a potential issue: a carry between the two registers.
    In our simulation we use the RV32 architecture, which works on 32-bit values, but a 32-bit multiplication can produce a 64-bit result, so two registers are needed to hold it. Adding the lower 32 bits may produce a carry that effectively increments the value held in the upper register. This is why the RISC-V code above contains the various overflow checks.
    These checks make sure potential carries are handled so the result stays correct.

Example:

Assume a 4-bit processor.
When adding 1001(a1) 1001(a0) + 0100(t1) 1000(t0),
the low-word addition a0 + t0 = 1001 + 1000 produces a carry, so the high word must be computed as a1 + t1 + carry. A small C sketch of this carry handling follows.
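
Below is a minimal C sketch of that carry handling (the helper name add64 and the test values are my own). After adding the two low words, a carry has occurred exactly when the unsigned sum is smaller than one of the operands, which is the comparison sltu gives us in assembly:

#include <stdint.h>
#include <stdio.h>

/* Add two 64-bit values given as (high, low) 32-bit halves, as RV32 code must. */
static void add64(uint32_t ah, uint32_t al, uint32_t bh, uint32_t bl,
                  uint32_t *rh, uint32_t *rl)
{
    uint32_t lo = al + bl;
    uint32_t carry = lo < al;      /* 1 if the low-word addition wrapped around */
    *rl = lo;
    *rh = ah + bh + carry;
}

int main(void)
{
    uint32_t rh, rl;
    add64(0x00000001, 0xFFFFFFFF, 0x00000000, 0x00000001, &rh, &rl);
    printf("%08x %08x\n", rh, rl);   /* expect: 00000002 00000000 */
    return 0;
}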

  1. Reordering the instructions
    For example, in the following code the instructions have been deliberately reordered. This can be an effective way to avoid read-after-write hazards. However, observation shows that it makes no difference here, because data forwarding already resolves the dependency: the gray selectors differ, indicating that the value of t3 is forwarded from the previous pipeline stage.

order

    srli s1, s1, 1  # result >> 1
    slli t3, s2, 31  # result << (32 - 1)
    or s1, s1, t3   # result | (result << (32 - 1))
    srli s2, s2, 1  # result >> 1


reorder

    srli s1, s1, 1  # result >> 1
    slli t3, s2, 31  # result << (32 - 1)
    srli s2, s2, 1  # result >> 1
    or s1, s1, t3   # result | (result << (32 - 1))


improve

imul32_loop:
    beq t0, t1, imul32_end
    andi t2, s1, 1 # if (result & 1)
    beq t2, zero, imul32_skip
    add s2, s2, s0  # result += a

imul32_skip:
    srli s1, s1, 1  # result >> 1
    slli t3, s2, 31  # result << (32 - 1)
    srli s2, s2, 1  # result >> 1
    or s1, s1, t3   # result | (result << (32 - 1))
    addi t0, t0, 1  # i++
    j imul32_loop

Hazard

Examples of the various hazard types:

  • Structural hazard
    • Two instructions need the same hardware resource (for example, the register-file write port) in the same cycle.
  • Data hazard
    • An instruction needs a result that a previous instruction has not produced yet.
  • Control hazard
    • The next instruction to fetch depends on a branch that has not been resolved yet.

Problem

Branches happen often and each one comes with two NOPs. Can we avoid this?