Assignment1: Implement count leading zero and popcount to solve LeetCode problem 1342.
contributed by < KAILIAO
>
Full source code here
Quiz 1_C
I chose problem C from Quiz 1 and translated it from C code into a complete RISC-V assembly program. The implementation includes the fabsf
function and the fp16_to_fp32
function, which converts a 16-bit floating-point number in IEEE half-precision format (bit representation) to a 32-bit floating-point number in IEEE single-precision format (bit representation). This implementation avoids using any floating-point arithmetic and utilizes the clz function
Fabsf function
C code
The program output will be like
Test number: fabsf(test_value) = result (Expected:expect_value) -> Correct/Incorrect
Assembly
.data
test_values:
.word 0x4048F5C3, 0xC048F5C3, 0x00000000, 0x80000000, 0x3F800000, 0xBF800000, 0x47F7C000, 0xA7E7C3D1
expected_values:
.word 0x4048F5C3, 0x4048F5C3, 0x00000000, 0x00000000, 0x3F800000, 0x3F800000, 0x47F7C000, 0x27E7C3D1
correct_msg:
.asciz " -> Correct\n"
incorrect_msg:
.asciz " -> Incorrect\n"
input_msg:
.asciz "Input: "
output_msg:
.asciz " Output: "
ans_msg:
.asciz ", Expected: "
.text
main:
la s0, test_values # Load the address of test values
la s1, expected_values # Load the address of expected results
li t2, 8 # Number of test cases
li t3, 0 # Initialize index to 0
loop:
lw a1, 0(s0) # Load floating point number (as integer) from test_values
# Print input value
la a0, input_msg # Load input string
li a7, 4
ecall
mv a0, a1 # Set the value to print as a1 (input value)
li a7, 2 # ecall 34: Print hexadecimal integer
ecall
call fabsf # Call fabsf function
# Print output value
la a0, output_msg # Load output string
li a7, 4
ecall
mv a0, a1 # Set the value to print as a1 (output value)
li a7, 2 # ecall 34: Print hexadecimal integer
ecall
lw a2, 0(s1) # Load expected result from expected_results
# Print expected value
la a0, ans_msg # Load expected value string
li a7, 4
ecall
mv a0, a2 # Set the value to print as a2 (expected value)
li a7, 2 # ecall 34: Print hexadecimal integer
ecall
# Compare output with expected value
beq a1, a2, result_correct # If equal, jump to result_correct
j print_incorrect # If not equal, jump to print_incorrect
result_correct:
call print_correct # Print "Result: Correct"
next_test:
addi s0, s0, 4 # Point to the next test value (each floating point is 4 bytes)
addi s1, s1, 4 # Point to the next expected result
addi t3, t3, 1 # Increment test index
blt t3, t2, loop # If not all values tested, continue loop
li a0, 0 # ecall 93: Program exit
li a7, 93
ecall
fabsf:
li t0, 0x7FFFFFFF # Load 0x7FFFFFFF to clear the sign bit
and a1, a1, t0 # Clear the sign bit
ret
print_correct:
la a0, correct_msg
li a7, 4 # ecall 4: Print string
ecall
ret
print_incorrect:
la a0, incorrect_msg
li a7, 4 # ecall 4: Print string
ecall
j next_test
The program output will be like
Input : test_value Output: result Expected:expect_value -> Correct/Incorrect
Execution info |
|
Cycles |
549 |
Instr. retired |
337 |
CPI |
1.63 |
IPC |
0.614 |
Fp16_to_fp32
C code
The program output will be like
16-bit: test_value[i] -> 32-bit: result expected_value= expected_values[i] result:Correct/Incorrect
Assembly
.data
test_values:
.half 0x3C00, 0xC000, 0x7BFF, 0xFC00, 0x7E00, 0x0000, 0x8000, 0x3555, 0x0400, 0x3000, 0x7800
ans_values:
.word 0x3F800000, 0xC0000000, 0x477FE000, 0xFF800000, 0x7FC00000, 0x00000000, 0x80000000, 0x3EAAA000, 0x38800000, 0x3e000000, 0x47000000
input_msg: .asciz "Input ->"
output_msg: .asciz " Output -> "
ans_msg: .asciz " expected_value -> "
correct_msg: .asciz " Result: Correct\n"
false_msg: .asciz " Result: False\n"
format: .asciz "%08X\n" # Format string for printing hexadecimal numbers
.text
# Main function
main:
la s0, test_values # Load the address of the test data
la s1, ans_values
li t0, 11 # Array size
li t1, 0 # Initialize index i to 0
test_loop:
lhu a1, 0(s0) # Load the current 16-bit number (load halfword)
la a0, input_msg
li a7, 4
ecall
mv a0, a1
li a7, 34
ecall
call fp16_to_fp32 # Call the fp16_to_fp32 conversion
lw a2, 0(s1)
call print_fp32 # Print the conversion result
# Verify if the conversion result is correct
beq a1, a2, result_correct # If the conversion result matches the expected result, jump to result_correct
call print_false # Otherwise, print "Result: False"
j next_iteration
result_correct:
call print_correct # Print "Result: Correct"
next_iteration:
addi s0, s0, 2 # Move to the next test value (advance by 2 bytes each time)
addi s1, s1, 4 # Move to the next expected value (advance by 4 bytes each time)
addi t1, t1, 1 # i++
blt t1, t0, test_loop # If i < 11, continue the loop
# Use ecall to exit the program
li a0, 0
li a7, 93 # ecall 93: Exit the program
ecall
# Print "Result: Correct"
print_correct:
la a0, correct_msg
li a7, 4 # ecall 4: Print string
ecall
ret
# Print "Result: False"
print_false:
la a0, false_msg
li a7, 4 # ecall 4: Print string
ecall
ret
# fp16_to_fp32 function: Convert a 16-bit floating-point number to a 32-bit floating-point number
fp16_to_fp32:
addi sp, sp, -16 # Allocate stack space for saved registers
sw ra, 12(sp) # Save return address
sw t0, 8(sp) # Save register t0
sw t1, 4(sp) # Save register t1
sw t2, 0(sp) # Save register t2
slli a1, a1, 16 # w = h << 16
li t0, 0x80000000 # Sign mask
and t1, a1, t0 # sign = w & 0x80000000
li t0, 0x7FFFFFFF # Nonsign mask
and t2, a1, t0 # nonsign = w & 0x7FFFFFFF
# Calculate the number of leading zeros
mv a1, t2 # Prepare argument
call my_clz # renorm_shift = my_clz(nonsign)
mv t3, a1 # Save renorm_shift
# Handle cases where renorm_shift is greater than 5
li t0, 5
bgt t3, t0, renorm_sub # If renorm_shift > 5, jump to subtraction
li t3, 0 # Otherwise, renorm_shift = 0
j renorm_done
renorm_sub:
sub t3, t3, t0 # renorm_shift -= 5
renorm_done:
li t0, 0x04000000 # Calculate inf_nan_mask
add t5, t2, t0
srai t5, t5, 8
li t0, 0x7F800000
and t5, t5, t0 # inf_nan_mask done
addi t0, t2, -1 # Calculate zero_mask
srai t0, t0, 31 # zero_mask done
sll t2, t2, t3 # nonsign << renorm_shift
srli t2, t2, 3 # >> 3
li t4, 0x70
sub t4, t4, t3
slli t4, t4, 23 # (0x70 - renorm_shift) << 23
add t2, t2, t4 # (nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)
or t2, t2, t5 # Add inf_nan_mask
not t0, t0 # ~zero_mask
and t2, t2, t0 # Handle zero_mask
or a1, t1, t2 # sign | nonsign
lw ra, 12(sp) # Restore return address
lw t0, 8(sp) # Restore register t0
lw t1, 4(sp) # Restore register t1
lw t2, 0(sp) # Restore register t2
addi sp, sp, 16 # Release stack space
ret # Return 32-bit result
# Use ecall to print the 32-bit floating-point number
print_fp32:
la a0, output_msg # Load "Output -> " string
li a7, 4 # ecall 4: Print string
ecall
# Print the hexadecimal representation of the 32-bit floating-point number
mv a0, a1
li a7, 34 # ecall 34: Print integer, hexadecimal format
ecall
print_ans:
la a0, ans_msg # Load ", Ans -> " string
li a7, 4 # ecall 4: Print string
ecall
mv a0, a2 # Output 32-bit number
li a7, 34 # ecall 34: Print integer, hexadecimal format
ecall
ret
# my_clz function: Count the number of leading zeros
my_clz:
addi sp, sp, -16 # Allocate stack space for saved registers
sw t2, 12(sp)
sw t1, 8(sp)
sw ra, 4(sp) # Save return address
sw t0, 0(sp) # Save register t0
li t0, 31 # Initialize i = 31
li t1, 0 # count = 0
li t2, 1
clz_loop:
beqz a1, all_zeros # If x is 0, jump to all_zeros
sll s3, t2, t0 # t0 = 1 << i
and s3, a1, s3 # t0 = x & (1 << i)
bne s3, zero, end_loop # If t0 != 0, jump to end_loop
addi t1, t1, 1 # count++
addi t0, t0, -1 # i--
bge t0, zero, clz_loop # If i >= 0, continue looping
all_zeros:
li t1, 32 # If x is 0, count = 32
end_loop:
mv a1, t1 # Return count
lw t2, 12(sp)
lw t1, 8(sp)
lw ra, 4(sp) # Restore return address
lw t0, 0(sp) # Restore register t0
addi sp, sp, 16 # Release stack space
ret
Execution info |
|
Cycles |
1680 |
Instr. retired |
1226 |
CPI |
1.37 |
IPC |
0.73 |
Selected Problems
My question is implement count leading zero and popcount in C and use them to solve LeetCode problem 1342.Number of Steps to Reduce a Number to Zero
Given an integer num, return the number of steps to reduce it to zero.
Count leading zero can be used for quick solving task selection problem in OS.
In one step, if the current number is even, you have to divide it by 2, otherwise, you have to subtract 1 from it.
Input: num = 14
Output: 6
Explanation:
Step 1) 14 is even; divide by 2 and obtain 7.
Step 2) 7 is odd; subtract 1 and obtain 6.
Step 3) 6 is even; divide by 2 and obtain 3.
Step 4) 3 is odd; subtract 1 and obtain 2.
Step 5) 2 is even; divide by 2 and obtain 1.
Step 6) 1 is odd; subtract 1 and obtain 0.
Constraints:
0 <= num <= 106
The concept is from Quiz 1, question C, involving my_clz function,which calculates how many leading zeros are in the binary representation of a 32-bit unsigned integer (uint32_t).Popcount function calculates the number of 1 bits in the binary representation of an integer.
Based on these function. I would like to implement count leading zero and popcount in C and use them to solve LeetCode problem 1342 to reduce the number of clock cycles.
my_clz
The my_clz function counts the number of leading zeros in a 32-bit unsigned integer x. It initializes a counter called count to zero and uses a for loop to iterate from the most significant bit (MSB) to the least significant bit (LSB). The expression x & (1U << i) checks if the i-th bit of x is 1; if it finds a 1, the loop breaks. If the current bit is 0, the counter increments. This continues until a 1 is found or all bits have been checked, and the function returns the total count of leading zeros.
Is it better? Can you prove?
Thanks, This indeed guides me toward another direction of thought.
The my_clz function uses a loop to count leading zeros by checking each bit from the MSB to LSB. It’s simple and predictable, especially when the input has many leading zeros. However, it has several downsides:
- Performance: In the worst case, the loop runs 32 iterations. This adds unnecessary overhead compared to more efficient methods.
- Branching: Each iteration introduces a conditional check, which may cause pipeline stalls and branch prediction failures, especially on architectures like RISC-V.
- Inefficiency: Bitwise AND and shifting by i are costlier compared to other approaches that progressively reduce the bit range(for example,byte shift clz).
But when using the optimized clz implementations that use byte shifts strategy, it can outperform the original loop version. Those versions check large blocks of bits at once, reducing instructions and improving performance.
The corresponding RISC-V assembly are showing below.
Solution
My thinking path
Approaching it in an intuitive way, actually,it is a very easy problem.LeetCode 1342 asks for the number of operations required to reduce a given integer 𝑛 to zero. The operations are defined as follows:
- If n is even, perform n/2.
- If n is odd, perform n−1.
We can use a loop to implement this logic. Here is a solution written in C language:
Original C code (loop version)
However, if solving it using bit manipulation, it involves many interesting derived knowledge points.
Since dividing a positive integer by 2 is equivalent to shifting it right by 1 bit, the total number of divisions required is equal to the number of times the most significant bit (MSB) needs to be shifted to the least significant position. Shifting the MSB to the least significant position takes 31 - __builtin_clz(num) steps, so the number of divisions required is also the same.
Additionally, when the binary representation of num contains k ones, it means a total of k subtractions of 1 are needed, as these ones will eventually be shifted to the least significant position. When the least significant bit is 1, it indicates that the number is odd and requires a subtraction of 1, so in total, popcount(num) subtractions of 1 are needed.
C code(with clz_loop and popcount)
C code(with clz_binary and popcount)
Does the above make sense to RV32I? Prove and discuss.
Yes, the code is suited for RV32I. This approach to bit manipulations for clz and popcnt matches well with the capabilities of the RV32I instruction set.
- Shifts: num >> X → SRLI instruction
- Bitwise AND: & → AND instruction
- Additions: + → ADDI instruction
- Loops and conditionals: if, for → BNE, BEQ for branching
- Function calls: → JAL for function jumps, and JALR for returns.
This version of the clz function counts the number of leading zeros in a 32-bit unsigned integer (x). Here's the step-by-step breakdown of its logic:
- Initial Check for Zero:
The function first checks if the input x is 0. If x == 0, the function immediately returns 32, since all bits are zero, meaning there are 32 leading zeros.
- Checking and Shifting in Blocks:
The function proceeds to check the leading zeros by analyzing blocks of bits, starting from the most significant ones (leftmost).
Step 1: (x >> 16) == 0
It checks if the upper 16 bits of x are all zeros. If so, it adds 16 to n (since all 16 bits are zeros) and shifts x 16 bits to the left, discarding the leading zeros.
Step 2: (x >> 24) == 0
After the shift, it checks if the upper 8 bits of the remaining 16-bit value are all zeros. If so, it adds 8 to n and shifts x left by 8 bits.
Step 3: (x >> 28) == 0
Now, it checks if the upper 4 bits of the remaining 8-bit value are all zeros. If so, it adds 4 to n and shifts x left by 4 bits.
Step 4: (x >> 30) == 0
It then checks if the upper 2 bits of the remaining 4-bit value are all zeros. If so, it adds 2 to n and shifts x left by 2 bits.
Step 5: (x >> 31) == 0
Finally, it checks if the most significant bit (MSB) of the remaining 2-bit value is zero. If so, it adds 1 to n. And n is the count of leading zeros.
I included another method for comparison, and both versions use the same basic strategy of bit shifts and bit checking.
Byte shift clz
second version(My version)
- Number of conditionals (if):
First version: Reduces one if by handling the 31st bit with the final line n = n - (x >> 31). This avoids one conditional branch but introduces an additional subtraction operation.
Second version: Contains an extra conditional branch for the 31st bit (x >> 31). This conditional statement causes RISC-V to execute one more comparison and branch.
For RISC-V, conditional branches may lead to potential branch prediction failures and pipeline stalls, which increase CPU cycle costs.
- Code simplicity and ease of translation to RISC-V I:
The first version is slightly simpler in logic because it avoids extra conditional checks, especially when handling the highest bit. For RISC-V’s pipeline and branch prediction mechanisms, reducing branch jumps can decrease potential performance bottlenecks.
The second version contains an extra branch, which requires more branch instruction checks in RISC-V I (e.g., BNE, BEQ). Although the impact is not significant, this slightly increases processing time.
Thus, the first version might have a slight advantage in terms of fewer conditional branches.
If there are any errors or gaps in the response, I would appreciate it if you could point them out, thanks.
RISC-V assembly code
Loop version 1
.data
nums:
.word 14, 8, 123 # Test data nums = 14, 8, 123
input_str:
.string "Input: num = " # Defines the string "Input: num = "
output_str:
.string " Output: " # Defines the string "Output: "
.text
main:
# Initialize pointers and array size
la s0, nums # Load nums array start address
li s1, 3 # Number of elements in nums (optimized: hardcoded)
li t0, 0 # i = 0
loop_outer:
beq t0, s1, done_outer # Exit if i == size
lw t3, 0(s0) # Load nums[i] into t3
addi s0, s0, 4 # Move nums pointer
addi t0, t0, 1 # i++
mv a1, t3 # Save original num
jal ra, numberOfSteps # Call numberOfSteps, result in a1
# Print "Input: num = ", num, " Output: ", and step count in one go
la a0, input_str # Load "Input: num = " string
li a7, 4 # Syscall 4: print string
ecall
mv a0, a1 # Move original num into a0
li a7, 1 # Syscall 1: print integer
ecall
la a0, output_str # Load " Output: " string
li a7, 4 # Syscall 4: print string
ecall
mv a0, a2 # Move step count into a0
li a7, 1 # Syscall 1: print integer
ecall
li a0, 0x0A # Print newline
li a7, 11 # Syscall 11: print char
ecall
j loop_outer # Repeat for next element
done_outer:
li a7, 10 # Syscall 10: exit program
ecall
numberOfSteps:
li a2, 0 # count = 0
loop_inner:
beqz t3, done_inner # Exit if num == 0
andi t5, t3, 1 # Check if num is odd
beqz t5, even # If even, jump to even
addi t3, t3, -1 # If odd, subtract 1
j update_count # Jump to update count
even:
srli t3, t3, 1 # Divide by 2 if even
update_count:
addi a2, a2, 1 # Increment count
j loop_inner # Loop until num is 0
done_inner:
ret
Execution info |
|
Cycles |
380 |
Instr. retired |
230 |
CPI |
1.65 |
IPC |
0.605 |
Shorten the above, using fewer instructions.
Loop version 2 (Using fewer instructions.)
I revised the code in loop_inner
section.
- Branch elimination: Using sub and add combined with condition flags avoids the need for two branch conditions for odd and even numbers, simplifying the control flow.
- t5 stores the state of whether num is odd.
- When num is odd, sub t3, t3, t5 essentially subtracts 1 (i.e., turning the odd number into an even one), and add a2, a2, t5 adds 1 to a2, which represents the step count for this operation.
- Unified processing: The operation srli t3, t3, 1, which divides num by 2, is placed afterward, unifying the operation flow for both cases.
.data
nums:
.word 14, 8, 123, # Test data nums = 14, 8, 123
input_str:
.string "Input: num = " # Defines the string "Input: num = "
output_str:
.string " Output: " # Defines the string "Output: "
.text
main:
# Initialize pointers and array size
la s0, nums # Load nums array start address
li s1, 3 # Number of elements in nums (optimized: hardcoded)
li t0, 0 # i = 0
loop_outer:
beq t0, s1, done_outer # Exit if i == size
lw t3, 0(s0) # Load nums[i] into t3
addi s0, s0, 4 # Move nums pointer
addi t0, t0, 1 # i++
mv a1, t3 # Save original num
jal ra, numberOfSteps # Call numberOfSteps, result in a1
# Print "Input: num = ", num, " Output: ", and step count in one go
la a0, input_str # Load "Input: num = " string
li a7, 4 # Syscall 4: print string
ecall
mv a0, a1 # Move original num into a0
li a7, 1 # Syscall 1: print integer
ecall
la a0, output_str # Load " Output: " string
li a7, 4 # Syscall 4: print string
ecall
mv a0, a2 # Move step count into a0
li a7, 1 # Syscall 1: print integer
ecall
li a0, 0x0A # Print newline
li a7, 11 # Syscall 11: print char
ecall
j loop_outer # Repeat for next element
done_outer:
li a7, 10 # Syscall 10: exit program
ecall
numberOfSteps:
li a2, 0 # count = 0
loop_inner:
andi t5, t3, 1 # Check if num is odd
sub t3, t3, t5 # If odd, subtract 1
add a2, a2, t5 # If odd, count + 1(t5),else, count+0(5)
srli t3, t3, 1
beqz t3, done_inner # Exit if num == 0
addi a2, a2, 1 # Increment count
j loop_inner # Jump to update count
done_inner:
ret
Execution info |
|
Cycles |
270 |
Instr. retired |
184 |
CPI |
1.47 |
IPC |
0.681 |
Using clz_loop and popcount
.data
nums:
.word 14, 8, 123 # Test data nums
input_str:
.string "Input: num = "
output_str:
.string " Output: "
.text
main:
la t0, nums # Load nums array start address
li t6, 3 # Number of elements in nums (optimized: hardcoded)
li t2, 0 # i = 0
loop_outer:
beq t2, t6, done_outer # If i == size, exit the outer loop
addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1 # i++
lw t3, -4(t0) # Get nums[i], load it into t3 (num), using -4 offset
mv t4, t3 # Save the original num into t4 for later display
jal ra, my_popcnt # Call my_popcnt, store the step count in a1
# Display string "Input: num = "
la a0, input_str # Load "Input: num = "
li a7, 4 # System call number 4 (print string)
ecall # Execute system call
# Display the original num
mv a0, t4 # Pass the original num to a0 (syscall 1: print integer)
li a7, 1 # System call number 1 (print integer)
ecall # Display the number
# Display string " Output: "
la a0, output_str # Load " Output: "
li a7, 4 # System call number 4 (print string)
ecall # Execute system call
# Display the step count
mv a0, a1 # Pass the step count (a1) to a0
li a7, 1 # System call number 1 (print integer)
ecall # Display the step count
# New line
li a0, 0x0A # New line (0x0A is the newline character)
li a7, 11 # System call number 11 (print character)
ecall
# Move to the next test data
j loop_outer # Return to the outer loop
done_outer:
# End program
li a7, 10 # System call number 10 (exit)
ecall
my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
li s0, 0x55555555 # Load mask 0x55555555
and s1, s0, t3 # s1 = u & 0x55555555
srli s2, t3, 1 # s2 = u >> 1
and s2, s2, s0 # s2 = (u >> 1) & 0x55555555
add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)
# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
li s0, 0x33333333 # Load mask 0x33333333
and s1, s0, a1 # s1 = u & 0x33333333
srli s2, a1, 2 # s2 = u >> 2
and s2, s2, s0 # s2 = (u >> 2) & 0x33333333
add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)
# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F
and s1, s0, a1 # s1 = u & 0x0F0F0F0F
srli s2, a1, 4 # s2 = u >> 4
and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F
add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
li s0, 0x00FF00FF # Load mask 0x00FF00FF
and s1, s0, a1 # s1 = u & 0x00FF00FF
srli s2, a1, 8 # s2= u >> 8
and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF
add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
li s0, 0x0000FFFF # Load mask 0x0000FFFF
and s1, s0, a1 # s1 = u & 0x0000FFFF
srli s2, a1, 16 # s2= u >> 16
and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF
add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
length:
li s1, 0 # s1 = count = 0
li s0, 31 # s0 = i = 31
li s2, 1 # s2 = 1
beqz t3, all_zeros # If x is 0, jump to all_zeros
clz_loop:
sll s3, s2, s0 # s3 = 1 << i
and s3, t3, s3 # s3 = x & (1 << i)
bne s3, zero, end_loop # If s3 != 0, jump to end_loop
addi s1, s1, 1 # count++
addi s0, s0, -1 # i--
bge s0, zero, clz_loop # If i >= 0, continue loop
all_zeros:
li s1, 32 # If x is 0, count = 32
end_loop:
li s2, 31 # s2 = 31
sub a0, s2, s1 # a0 = 31 - clz (return value)
add a1, a0, a1
ret # Return to main program
Execution info |
|
Cycles |
915 |
Instr. retired |
688 |
CPI |
1.33 |
IPC |
0.752 |
Using clz_binary and popcount
.data
nums:
.word 14, 8, 123 # Test data nums
input_str:
.string "Input: num = "
output_str:
.string " Output: "
.text
main:
la t0, nums # Load nums array start address
li t6, 3 # Number of elements in nums (optimized: hardcoded)
li t2, 0 # i = 0
loop_outer:
beq t2, t6, done_outer # If i == size, exit the outer loop
addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1 # Increment i
lw t3, -4(t0) # Load nums[i] into t3 (num), note the -4 offset
mv t4, t3 # Save the original num into t4 for later display
jal ra, my_popcnt
# Display the string "Input: num = "
la a0, input_str
li a7, 4 # Syscall number 4 (print string)
ecall
# Display the original num
mv a0, t4
li a7, 1 # Syscall number 1 (print integer)
ecall
# Display the string " Output: "
la a0, output_str
li a7, 4 # Syscall number 4 (print string)
ecall
# Display the step count
mv a0, a1
li a7, 1 # Syscall number 1 (print integer)
ecall
# Print a newline
li a0, 0x0A # Newline (0x0A is the newline character)
li a7, 11 # Syscall number 11 (print char)
ecall
# Move to the next test data
j loop_outer
done_outer:
# Exit the program
li a7, 10 # Syscall number 10 (exit)
ecall
my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
li s0, 0x55555555 # Load mask 0x55555555
and s1, s0, t3 # s1 = u & 0x55555555
srli s2, t3, 1 # s2 = u >> 1
and s2, s2, s0 # s2 = (u >> 1) & 0x55555555
add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)
# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
li s0, 0x33333333 # Load mask 0x33333333
and s1, s0, a1 # s1 = u & 0x33333333
srli s2, a1, 2 # s2 = u >> 2
and s2, s2, s0 # s2 = (u >> 2) & 0x33333333
add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)
# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F
and s1, s0, a1 # s1 = u & 0x0F0F0F0F
srli s2, a1, 4 # s2 = u >> 4
and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F
add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
li s0, 0x00FF00FF # Load mask 0x00FF00FF
and s1, s0, a1 # s1 = u & 0x00FF00FF
srli s2, a1, 8 # s2 = u >> 8
and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF
add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
li s0, 0x0000FFFF # Load mask 0x0000FFFF
and s1, s0, a1 # s1 = u & 0x0000FFFF
srli s2, a1, 16 # s2 = u >> 16
and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF
add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
my_clz:
li s0, 0 # s0 = clz = 0
# if ((num >> 16) == 0)
srli s1, t3, 16 # s1 = num >> 16
bnez s1, check_24 # If num >> 16 is not 0, jump to check_24
addi s0, s0, 16 # clz += 16
slli t3, t3, 16 # num <<= 16
check_24:
# if ((num >> 24) == 0)
srli s1, t3, 24 # s1 = num >> 24
bnez s1, check_28 # If num >> 24 is not 0, jump to check_28
addi s0, s0, 8 # clz += 8
slli t3, t3, 8 # num <<= 8
check_28:
# if ((num >> 28) == 0)
srli s1, t3, 28 # s1 = num >> 28
bnez s1, check_30 # If num >> 28 is not 0, jump to check_30
addi s0, s0, 4 # clz += 4
slli t3, t3, 4 # num <<= 4
check_30:
# if ((num >> 30) == 0)
srli s1, t3, 30 # s1 = num >> 30
bnez s1, check_31 # If num >> 30 is not 0, jump to check_31
addi s0, s0, 2 # clz += 2
slli t3, t3, 2 # num <<= 2
check_31:
# if ((num >> 31) == 0)
srli s1, t3, 31 # s1 = num >> 31
bnez s1, done # If num >> 31 is not 0, jump to done
addi s0, s0, 1 # clz += 1
done:
li s1, 31
sub a0, s1, s0 # a0 = 31 - clz (return value)
add a1, a0, a1 # Add clz result to popcnt result
ret # Return to main program
Execution info |
|
Cycles |
302 |
Instr. retired |
231 |
CPI |
1.31 |
IPC |
0.765 |
Analysis
When I first got the result, I was a bit confused. It’s not surprising that the loop version has fewer cycles than the clz_loop version because clz_loop iterates up to 30 times for each data point(it iterates more times due to the value of test datas are small), while the loop version iterates approximately O(log2(106)=19.931) times at most for each data point. However, it seems reasonable that the clz_binary version should have even fewer cycles. So, I tested again using 106 as the test data and got the following result(I only used one test data)
Loop version 2 (Using fewer instructions.)
Execution info |
|
Cycles |
235 |
Instr. retired |
171 |
CPI |
1.37 |
IPC |
0.728 |
Using clz_binary and popcount
Execution info |
|
Cycles |
111 |
Instr. retired |
80 |
CPI |
1.39 |
IPC |
0.721 |
Comparison Analysis
- Cycle Count:
clz_binary and popcount is significantly faster, using only 111 cycles compared to 235 cycles for Loop Version 2. This indicates that the clz_binary approach is much more efficient in terms of the number of cycles required to execute the program.
- Instructions Retired:
The clz_binary and popcount method has 80 instructions retired, while Loop Version 2 has 171 instructions retired. This suggests that clz_binary is not only executing fewer cycles but also utilizing fewer instructions to achieve its result.
- CPI (Cycles Per Instruction):
The CPI for Loop Version 2 is 1.37, while for clz_binary, it is slightly higher at 1.39. A lower CPI generally indicates a more efficient program. In this case, despite having a slightly higher CPI, the clz_binary method compensates for this by executing far fewer instructions overall, resulting in fewer total cycles.
- IPC (Instructions Per Cycle):
Loop Version 2 has an IPC of 0.728, whereas clz_binary has an IPC of 0.721. A higher IPC is generally better, as it indicates more instructions are being executed per cycle. In this case, Loop Version 2 performs slightly better in IPC, but again, this is overshadowed by the overall lower cycle count of clz_binary.
Brief summary
Efficiency: The clz_binary and popcount method outperforms Loop Version 2 in terms of execution cycles and instructions retired. It shows a more efficient implementation for this task.
Performance Trade-off: While Loop Version 2 has a slightly better IPC and CPI, the significantly lower cycle count and instructions make the clz_binary approach more favorable overall.
Optimization Considerations: The reduction in both cycles and instructions in the clz_binary and popcount implementation suggests that optimizing the algorithm for counting bits (like using bitwise operation) can lead to substantial performance gains, particularly in scenarios involving larger numbers or datasets.
manual hazard detection
A 5-stage pipeline processor with hazard detection and forwarding functionality can produce correct results for each test data. I would like to attempt manual hazard detection. When using a processor without hazard detection, the simulator shows the message "Unknown system call in register 'a7': 0."
It's clear that there is a hazard around the "ecall" instruction. Furthermore,when the processor has a forwarding unit, most data hazards can be resolved, but a load-use
data hazard still requires stalling for one cycle to prevent data loss. Below, I will discuss in detail and demonstrate the various situations and solutions I encountered during this process.
Three type of Data Hazard
-
Structure Hazard
Insufficient hardware resources. (e.g. Assuming there is no difference between IM and DM, IF and MEM both have memory access, and problems will occur when they overlap, resulting in the inability to execute multiple instructions at the same time)
-
Data Hazard
Data Hazards occur when an instruction depends on the result of previous instruction and that result of instruction has not yet been computed.
There are four types of data dependencies: Read after Write (RAW), Write after Read (WAR), Write after Write (WAW), and Read after Read (RAR). These are explained as follows below.
- Read after Write (RAW) :
It is also known as True dependency or Flow dependency. It occurs when the value produced by an instruction is required by a subsequent instruction. For example,
ADD R1, --, --; SUB --, R1, --;
- Write after Read (WAR) :
It is also known as anti dependency. These hazards occur when the output register of an instruction is used right after read by a previous instruction. For example,
ADD --, R1, --; SUB R1, --, --;
- Write after Write (WAW) :
It is also known as output dependency. These hazards occur when the output register of an instruction is used for write after written by previous instruction. For example,
ADD R1, --, --; SUB R1, --, --;
- Read after Read (RAR) :
It occurs when the instruction both read from the same register. For example,
ADD --, R1, --; SUB --, R1, --;
Since reading a register value does not change the register value, these Read after Read (RAR) dependencies don’t cause a problem for the processor.
-
Control Hazard
Also known as branch hazards, occur when the pipeline makes wrong decisions on branch prediction, leading to the fetching of incorrect instructions. This typically happens with conditional branches and jumps.
Load-Use Data Hazard
Consider the following sequence of instructions:
This hazard cannot be resolved by simple forwarding.
The value lw
writes into $t2
is not available until lw completes the MEM stage
, but and
needs that value when it enters the EX stage
, which is when lw enters the MEM stage.
|
c1 |
c2 |
c3 |
c4 |
c5 |
c6 |
lw t1) |
IF |
ID |
EX |
MEM |
WB |
|
and $t4, $t2, $t5 |
|
IF |
ID |
EX |
MEM |
WB |
Handling Data Hazards :
These are various methods we use to handle hazards:
Forwarding
:
It adds special circuitry to the pipeline. This method works because it takes less time for the required values to travel through a wire than it does for a pipeline segment to compute its result.
Code reordering
:
We need a special type of software to reorder code. We call this type of software a hardware-dependent compiler.
Stall Insertion
:
it inserts one or more stall (no-op instructions) into the pipeline, which delays the execution of the current instruction until the required operand is written to the register file, but this method decreases pipeline efficiency and throughput.
A load-use hazard requires delaying the execution of the using instruction until the result from the loading instruction can be made available to the using instruction
If we can stall the execution of the using instruction for one cycle:
|
c1 |
c2 |
c3 |
c4 |
c5 |
c6 |
c7 |
lw t1) |
IF |
ID |
EX |
MEM |
WB |
|
|
NOP |
|
IF |
ID |
EX |
MEM |
WB |
|
and $t4, $t2, $t5 |
|
|
IF |
ID |
EX |
MEM |
WB |
- value to be loaded to
$t2
will be available in the MEM/WB buffer when the using instruction moves from ID
to EX
- that value can be forwarded to the using instruction as the using instruction enters the EX stage
Implement
I chose the version with clz_binary and popcount to implement the assembly code for a processor without a hazard detection unit.
First, I ran the original code and encountered the error Unknown system call in register 'a7': 0
. After referring to some previous notes for solutions, I learned that three NOPs need to be inserted before the ecall instruction for the system call to work.
After fixing all the system call instructions, I ran the code again and found that the output was empty.

I summarized the data hazards of branch instructions and divided them into three scenarios for discussion.
- If the branch instruction's register has a data dependency with the destination register of the second or third preceding ALU instruction, forwarding can resolve the issue.
- If the branch instruction's register has a data dependency with the destination register of the immediately preceding ALU instruction or the second preceding load instruction, the pipeline must stall for one clock cycle.
- If the branch instruction's register has a data dependency with the destination register of the immediately preceding load instruction, the pipeline must stall for two clock cycles.
Based on this principle, I corrected my code and rearranged the la t0 nums instruction to be placed after the li t2 0 instruction without affecting the original program, allowing me to reduce one NOP.
I used this code as an example.
When it starts executing, it will be translated into the following machine code.
First, li s0, 0x55555555 will be translated into two instructions: lui x8, 0x55555, followed by addi x8, x8, 1365 to load the mask 0x55555555 into x8. When and s1, s0, t3 reaches the EX stage, since the previous instruction is addi x8, x8, 1365 and there is no load-use hazard, it can execute normally without adding a NOP. (When the li instruction is translated from pseudo code to machine code, it should inherently include a stall.)
Next, I used this code as an example.I will delve into the internal circuit behavior.
When it starts executing, it will be translated into the following machine code.
The first image shows that the lw t3, -4(t0) instruction retrieves the data 0x0000000e in the MEM stage but has not yet been stored in the Mem/WB pipeline register. Therefore, even with forwarding, the correct value cannot be passed back to the EX stage at this time.

The second image shows that, through forwarding, even if the addi x29 x28 0 instruction reads an incorrect value from x28, it can be corrected to the right value via forwarding.

The issue here confuses me a bit. In my opinion, a circuit with full forwarding capability when dealing with a data hazard, it should only require stalling for one cycle. However, it seems that the branch cannot receive values from two-stage forwarding.
I initially thought that if there was a data hazard with an ALU instruction before a branch, the result could be forwarded from the EX stage to the ID stage to determine whether the branch should be taken, requiring only one clock cycle. However, after observing the circuit behavior, I found that the branch after the srli instruction needed two NOPs.I still need to conduct more testing and research to clarify this part.
But for now, I have already modified a version without hazard detection. Below is my complete source code.
.data
nums:
.word 14, 8, 123 # Test data nums = 14, 8, 123
input_str:
.string "Input: num = " # Defines the string "Input: num = "
output_str:
.string " Output: " # Defines the string "Output: "
.text
main:
# Initialize pointers and array size
li t6, 3 # Number of elements in nums (optimized: hardcoded)
li t2, 0 # i = 0
la t0, nums # Load nums array start address
loop_outer:
beq t2, t6, done_outer # If i == size, exit the outer loop
addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1 # Increment i
lw t3, -4(t0) # Load nums[i] into t3 (num), note the -4 offset
addi x0,x0,0 #NOP
mv t4, t3 # Save the original num into t4 for later display
jal ra, my_popcnt # Call my_popcnt, step count is stored in a1
# Display the string "Input: num = "
la a0, input_str # Load the address of "Input: num = "
li a7, 4 # Syscall number 4 (print string)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall # Execute syscall
# Display the original num
mv a0, t4 # Move the original num into a0 (syscall 1: print integer)
li a7, 1 # Syscall number 1 (print integer)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall # Display the number
# Display the string " Output: "
la a0, output_str # Load the address of " Output: "
li a7, 4 # Syscall number 4 (print string)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall # Execute syscall
# Display the step count
mv a0, a1 # Move the step count (a1) into a0
li a7, 1 # Syscall number 1 (print integer)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall # Display the step count
# Print a newline
li a0, 0x0A # Newline (0x0A is the newline character)
li a7, 11 # Syscall number 11 (print char)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall
# Move to the next test data
j loop_outer # Jump back to the outer loop
done_outer:
# Exit the program
li a7, 10 # Syscall number 10 (exit)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall
my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
li s0, 0x55555555 # Load mask 0x55555555
and s1, s0, t3 # s1 = u & 0x55555555
srli s2, t3, 1 # s2 = u >> 1
and s2, s2, s0 # s2 = (u >> 1) & 0x55555555
add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)
# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
li s0, 0x33333333 # Load mask 0x33333333
and s1, s0, a1 # s1 = u & 0x33333333
srli s2, a1, 2 # s2 = u >> 2
and s2, s2, s0 # s2 = (u >> 2) & 0x33333333
add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)
# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F
and s1, s0, a1 # s1 = u & 0x0F0F0F0F
srli s2, a1, 4 # s2 = u >> 4
and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F
add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
li s0, 0x00FF00FF # Load mask 0x00FF00FF
and s1, s0, a1 # s1 = u & 0x00FF00FF
srli s2, a1, 8 # s2 = u >> 8
and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF
add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
li s0, 0x0000FFFF # Load mask 0x0000FFFF
and s1, s0, a1 # s1 = u & 0x0000FFFF
srli s2, a1, 16 # s2 = u >> 16
and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF
add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
my_clz:
li s0, 0 # s0 = clz = 0
# if ((num >> 16) == 0)
srli s1, t3, 16 # s1 = num >> 16
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_24 # If num >> 16 is not 0, jump to check_24
addi s0, s0, 16 # clz += 16
slli t3, t3, 16 # num <<= 16
check_24:
# if ((num >> 24) == 0)
srli s1, t3, 24 # s1 = num >> 24
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_28 # If num >> 24 is not 0, jump to check_28
addi s0, s0, 8 # clz += 8
slli t3, t3, 8 # num <<= 8
check_28:
# if ((num >> 28) == 0)
srli s1, t3, 28 # s1 = num >> 28
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_30 # If num >> 28 is not 0, jump to check_30
addi s0, s0, 4 # clz += 4
slli t3, t3, 4 # num <<= 4
check_30:
# if ((num >> 30) == 0)
srli s1, t3, 30 # s1 = num >> 30
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_31 # If num >> 30 is not 0, jump to check_31
addi s0, s0, 2 # clz += 2
slli t3, t3, 2 # num <<= 2
check_31:
# if ((num >> 31) == 0)
srli s1, t3, 31 # s1 = num >> 31
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, done # If num >> 31 is not 0, jump to done
addi s0, s0, 1 # clz += 1
done:
li s1, 31 # s1 = 31
addi x0,x0,0 # NOP
sub a0, s1, s0 # a0 = 31 - clz (return value)
add a1, a0, a1 # Add clz result to popcnt result
ret # Return to main program
Clz_binary_popcnt (Without hazard detection)
Execution info |
|
Cycles |
383 |
Instr. retired |
315 |
CPI |
1.22 |
IPC |
0.822 |
參考資料
Quiz1 of Computer Architecture (2024 Fall)
Leetcode 1342.Number of Steps to Reduce a Number to Zero
The RISC-V Instruction Set Manual
Always refer to primary sources, such as official RISC-V documentation.