Try โ€‚โ€‰HackMD

Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by <shhung>

Implement transformation from integet to float

This assignment implements the conversion from a 64-bit integer to a 64 bit floating-point data type.

Implementation

C code

// naive implementation union DoubleConverter { unsigned long long intValue; double doubleValue; }; double ll_to_double(unsigned long long significand) { // Only support 0 < significand < 1 << 53. if (significand == 0 || significand >= 1ULL << 53) return -1.0; int shifts = 0; while ((significand & (1ULL << 52)) == 0) { significand <<= 1; shifts++; } unsigned long long exponent = 1023 + 52 - shifts; unsigned long long merged = (exponent << 52) | (significand & 0xFFFFFFFFFFFFF); // Use union for type conversion union DoubleConverter converter; converter.intValue = merged; return converter.doubleValue; }

Instead of accumulating shifts through iteration, using clz is more efficient

-int shifts = 0; -while ((significand & (1ULL << 52)) == 0) -{ - significand <<= 1; - shifts++; -} +int shifts = __builtin_clzll(significand) - 11; +significand <<= shifts;

Assembly code

.data
num:        .dword        0xBBFFFFFFFF, 0x84f2, 0x811111111
mask:       .word        0xFFFFF
maskclz:    .word        0x55555555, 0x33333333, 0x0f0f0f0f
.text
    la t0, num
    lw a0, 12(t0)
    lw a1, 8(t0)
    jal itof
    jal exit

# ====== subroutines ======
# cast int64 to double
# input uint64[a0, a1] 
# output double[a0, a1]
itof:
    addi sp, sp, -4
    sw ra, 0(sp)
    bnez a0, inrange
    bnez a1, inrange
    li t0, 1
    slli t0, t0, 21
    blt a0, t0, inrange
# overrange, set msb 1
    li t0, 1
    slli t0, t0, 31
    or a0, a0, t0
    ret
inrange:
    addi sp, sp, -8
    sw a0, 0(sp)
    sw a1, 4(sp)
    call clz
    addi a2, a0, -11
    lw a0, 0(sp)
    lw a1, 4(sp)
    addi sp, sp 8
    li a3, 32
    bge a2, a3, ge32
# lt32
    sub a3, a3, a2
    sll a0, a0, a2
    srl t0, a1, a3
    sll a1, a1, a2
    or a0, a0, t0
    j merged
ge32:
    sub a3, a2, a3
    sll a1, a1, a3
    mv a0, a1
    li a1, 0
# exponent = 1023 + 52 - shifts
merged:
    li a3, 1075
    sub a3, a3, a2
    slli a3, a3, 20
    la t0, mask
    lw t0, 0(t0)
    and a0, a0, t0
    or a0, a0, a3
    lw ra, 0(sp)
    addi sp, sp, 4
    ret
    
clz:
# input int64[a0, a1]
# output int32[a0]
# x |= (x >> {1, 2, 4, 8, 16})
    addi sp, sp, -4
    sw ra, 0(sp)
    
    addi t1, x0, 0x1
Loop1:
    addi t2, x0, 32
    srl t0, a0, t1
    or a0, a0, t0
    srl t0, a1, t1
    or a1, a1, t0
    sub t2, t2, t1
    sll t0, a0, t2
    or a1, a1, t0
    slli t1, t1, 1
    addi t2, x0, 32
    bgt t2, t1, Loop1
# x |= (x >> 32)
    or a1, a1, a0
# x -= ((x >> 1) & 0x5555555555555555);
    la t6, maskclz    
    srli t0, a0, 1
    lw t5, 0(t6)
    and t0, t0, t5 # t0 = (a0 >> 1) & 0x55555555
    srli t1, a1, 1
    slli t2, a0, 31
    or t1, t1, t2
    and t1, t1, t5 # t1 = (a1 >> 1) & 0x55555555
    sub a0, a0, t0
    sltu t3, a1, t1
    bne t3, x0, Borrow
    sub a1, a1, t1
    j Done
Borrow:
    addi a0, a0, -1
    sub a1, t1, a1
    addi t3, x0, -1
    sub a1, t3, a1
Done:
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
    lw t5, 4(t6)
    srli t0, a0, 2
    srli t1, a1, 2
    slli t2, a0, 30
    or t1, t1, t2
# [t0, t1] = x >> 2
    and t0, t0, t5
    and t1, t1, t5
    and a0, a0, t5
    and a1, a1, t5
    mv a2, t0
    mv a3, t1
    jal ra, Add64
# ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f
    srli t0, a0, 4
    srli t1, a1, 4
    slli t2, a0, 28
    or t1, t1, t2
    mv a2, t0
    mv a3, t1
    jal ra, Add64
    lw t5, 8(t6)
    and a0, a0 ,t5
    and a1, a1 ,t5
# x += (x >> 8)
    srli t0, a0, 8
    srli t1, a1, 8
    slli t2, a0, 24
    or t1, t1, t2
    mv a2, t0
    mv a3, t1
    jal ra, Add64
# x += (x >> 16)
    srli t0, a0, 16
    srli t1, a1, 16
    slli t2, a0, 16
    or t1, t1, t2
    mv a2, t0
    mv a3, t1
    jal ra, Add64 
# x += (x >> 32)
    mv a2, x0
    mv a3, a0
    jal ra, Add64
# return (64 - (x & 0x7f))
    andi a0, a1, 0x7f
    addi a1, x0, 64
    sub a0, a1, a0
    lw ra, 0(sp)
    addi sp, sp, 4
    ret
Add64:
    add a0, a0, a2
    add a1, a1, a3
    sltu s0, a1, a3
    bne s0, x0, Carry
    ret
Carry:
    addi a0, a0, 1
    ret

exit:
    # Exit program
    li a7, 10
    ecall

Explanation

Data section

.data
num:        .dword       0x2187654321
mask:       .word        0xFFFFF
maskclz:    .word        0x55555555, 0x33333333, 0x0f0f0f0f

64 bits value is represented using dword, num represents test data.
mask is used to filter the mantisa of integer. Although under the IEEE 754 standard, Double-precision floating-point has a 53-bit mantissa(52 explicitly stored), in the context of rv32, two registers are used to store the value. Therefore, processing only needs to be done on the upper half of the bits.
maskclz is used to implement popCount in clz

Subroutines

  • itof
    • Input: uint64 [a0, a1]
    • Output: double[a0, a1]
      Convert the input integer to the IEEE 754 double format
  • clz
    • Input: uint64 [a0, a1]
    • Output: uint32 [a0]
      Count leading zero of the input value

Program flow

The program stores a 64-bit value in two registers, with the upper half in a0 and the lower half in a1. itof first checks if it exceeds the range that double storage allows, and if it does, the conversion process begins. It starts by determining the shifts to calculate the exponent, and then aligns and retrieves the mantissa based on the shifts value. Finally, it combines these components to obtain the IEEE 754 double format value.

64 bits operations

Since two registers are used to store the value, operations such as addition, subtraction, bit shifting, etc., need to take into account the crossing of bits between the two registers.

  • Addition
    For addition, it's relatively straightforwardโ€”just need to check whether there will be a carry from the lower half of the registers to the upper half. If the result after addition is less than either of the operands, it means there's a need for carrying over

    โ€‹โ€‹โ€‹โ€‹# [a0, a1] + [a2, a3]
    โ€‹โ€‹โ€‹โ€‹Add64:
    โ€‹โ€‹โ€‹โ€‹    add a0, a0, a2
    โ€‹โ€‹โ€‹โ€‹    add a1, a1, a3
    โ€‹โ€‹โ€‹โ€‹    sltu s0, a1, a3
    โ€‹โ€‹โ€‹โ€‹    bne s0, x0, Carry
    โ€‹โ€‹โ€‹โ€‹    ret
    โ€‹โ€‹โ€‹โ€‹Carry:
    โ€‹โ€‹โ€‹โ€‹    addi a0, a0, 1
    โ€‹โ€‹โ€‹โ€‹    ret
    
  • Subtraction
    For subtraction, in this assignment, the minuend is guaranteed to be greater than the subtrahend. Therefore, it's only necessary to check whether borrowing is needed from the upper half of the registers to the lower half. If borrowing is needed, then swap the minuend and subtrahend, and subtract the difference with the borrow.

    โ€‹โ€‹โ€‹โ€‹# suppose a - b
    โ€‹โ€‹โ€‹โ€‹    sub a0, a0, t0
    โ€‹โ€‹โ€‹โ€‹    sltu t3, a1, t1
    โ€‹โ€‹โ€‹โ€‹# if a < b, Borrow
    โ€‹โ€‹โ€‹โ€‹    bne t3, x0, Borrow
    โ€‹โ€‹โ€‹โ€‹    sub a1, a1, t1
    โ€‹โ€‹โ€‹โ€‹    j Done
    โ€‹โ€‹โ€‹โ€‹Borrow:
    โ€‹โ€‹โ€‹โ€‹# t = b - a
    โ€‹โ€‹โ€‹โ€‹# res = borrow - t
    โ€‹โ€‹โ€‹โ€‹    addi a0, a0, -1
    โ€‹โ€‹โ€‹โ€‹    sub a1, t1, a1
    โ€‹โ€‹โ€‹โ€‹# The borrow value is stored in t3, and since the immediate value has limitations, it is assigned through t3 = 0 - 1
    โ€‹โ€‹โ€‹โ€‹    addi t3, x0, -1
    โ€‹โ€‹โ€‹โ€‹    sub a1, t3, a1
    โ€‹โ€‹โ€‹โ€‹Done:
    
  • Shifting
    Shifting, whether left or right, will always encounter cases where the upper half crosses over to the lower half of the registers, or vice versa.
    In the case of the left shift used in this assignment, if the shift is less than 32, it means the bits will move to the lower bits of the upper half of the registers. Therefore, reverse shifting is performed to the correct position, and then the or operation is used to set the bits in the upper half of the registers.
    When the shift is greater than or equal to 32, it means the bits will move to the higher bits of the upper half of the registers. In this case, the shift direction aligns with the original operation, but it needs to subtract 32 to obtain the correct bits. These corrected bits are then directly assigned to the upper half of the registers, and the lower half is set to zero.

    โ€‹โ€‹โ€‹โ€‹    li a3, 32
    โ€‹โ€‹โ€‹โ€‹    bge a2, a3, ge32
    โ€‹โ€‹โ€‹โ€‹# lt32
    โ€‹โ€‹โ€‹โ€‹    sub a3, a3, a2
    โ€‹โ€‹โ€‹โ€‹    sll a0, a0, a2
    โ€‹โ€‹โ€‹โ€‹    srl t0, a1, a3
    โ€‹โ€‹โ€‹โ€‹    sll a1, a1, a2
    โ€‹โ€‹โ€‹โ€‹    or a0, a0, t0
    โ€‹โ€‹โ€‹โ€‹    j merged
    โ€‹โ€‹โ€‹โ€‹ge32:
    โ€‹โ€‹โ€‹โ€‹    sub a3, a2, a3
    โ€‹โ€‹โ€‹โ€‹    sll a1, a1, a3
    โ€‹โ€‹โ€‹โ€‹    mv a0, a1
    โ€‹โ€‹โ€‹โ€‹    li a1, 0
    

Result

Since ripes doesn't provide output for 64-bit integers and 64-bit floating-point numbers, it's necessary to inspect the values in registers to verify the results.
The results are stored in two registers, with a0 holding the upper half and a1 holding the lower half.
Here are two tools for quick format conversion. The Hexadecimal to Decimal converter is used to obtain the decimal representation, while FractionConvert provides the IEEE 754 standard representation.

To have a clear comparison of results, I've stored the input values in s0 and s1 registers.

# test case
0xBBFFFFFFFF = 0x42677FFFFFFFE000 (IEEE 754)
0x84f2 = 0x40E09E4000000000 (IEEE 754)
0x11 = 4031000000000000 (IEEE 754)
  • num1
    test1
  • num2
    test2
  • num3
    test3

Analysis

Comparing the execution results of different test cases, it can be observed that for larger values, the performance of conversion based on clz is inferior to the naive approach. However, for smaller values, it outperforms the naive conversion, and the performance difference becomes more pronounced as the values decrease.
The main difference lies in the fact that the clz method can obtain the shifts value in a fixed number of cycles, while the naive approach involves iterative calculations, leading to an increase in iterations as the values decrease.

  • 0xBBFFFFFFFF

    itof_clz itof
    cycles 234 156
    Insrs: retired 182 111
    CPI 1.29 1.41
    IPC 0.774 0.712
  • 0x84f2

    itof_clz itof
    cycles 234 342
    Insrs: retired 181 253
    CPI 1.29 1.35
    IPC 0.774 0.74
  • 0x11

    itof_clz itof
    cycles 234 430
    Insrs: retired 181 319
    CPI 1.29 1.35
    IPC 0.774 0.742

Summarize

Thanks to population count, we were able to achieve constant time complexity for CLZ.
In this assignment, we leveraged this optimization to improve the conversion from integers to floating-point numbers.