Assignment1: Implement count leading zero and popcount to solve LeetCode problem 1342.

contributed by < KAILIAO>
Full source code here

Quiz 1_C

I chose problem C from Quiz 1 and translated it from C code into a complete RISC-V assembly program. The implementation includes the fabsf function and the fp16_to_fp32 function, which converts a 16-bit floating-point number in IEEE half-precision format (bit representation) to a 32-bit floating-point number in IEEE single-precision format (bit representation). This implementation avoids using any floating-point arithmetic and utilizes the clz function

Fabsf function

C code

#include <stdio.h>
#include <stdint.h>

// Function to return the absolute value of a floating-point number
static inline float fabsf(float x) {
    uint32_t i = *(uint32_t *)&x;  // Read the bits of the float into an integer
    i &= 0x7FFFFFFF;               // Clear the sign bit to get the absolute value
    x = *(float *)&i;              // Write the modified bits back into the float
    return x;
}

int main() {
    // Test values
    float test_values[] = {
        3.14f, -3.14f, 0.0f, -0.0f, 1.0f, -1.0f, 123456.789f, -123456.789f
    };

    // Expected results for absolute values
    float expected_results[] = {
        3.14f, 3.14f, 0.0f, 0.0f, 1.0f, 1.0f, 123456.789f, 123456.789f
    };

    // Number of test cases
    int num_tests = sizeof(test_values) / sizeof(test_values[0]);

    // Run tests
    for (int i = 0; i < num_tests; i++) {
        float result = fabsf(test_values[i]);
        printf("Test %d: fabsf(%.6f) = %.6f (Expected: %.6f) -> %s\n",
               i + 1, test_values[i], result, expected_results[i],
               result == expected_results[i] ? "Correct" : "Incorrect");
    }

    return 0;
}

The program output will be like
Test number: fabsf(test_value) = result (Expected:expect_value) -> Correct/Incorrect

Assembly

  .data
test_values:
    .word   0x4048F5C3, 0xC048F5C3, 0x00000000, 0x80000000, 0x3F800000, 0xBF800000, 0x47F7C000, 0xA7E7C3D1
expected_values:
    .word   0x4048F5C3, 0x4048F5C3, 0x00000000, 0x00000000, 0x3F800000, 0x3F800000, 0x47F7C000, 0x27E7C3D1
correct_msg:
    .asciz  "    -> Correct\n"
incorrect_msg: 
    .asciz  "    -> Incorrect\n"
input_msg:
    .asciz  "Input: "
output_msg:
    .asciz  "    Output: "
ans_msg:
    .asciz  ",    Expected: "

.text    
main:
    la      s0, test_values          # Load the address of test values
    la      s1, expected_values     # Load the address of expected results
    li      t2, 8                    # Number of test cases
    li      t3, 0                    # Initialize index to 0

loop:
    lw      a1, 0(s0)                # Load floating point number (as integer) from test_values  
    # Print input value
    la      a0, input_msg            # Load input string
    li      a7, 4
    ecall
    mv      a0, a1                   # Set the value to print as a1 (input value)
    li      a7, 2                    # ecall 34: Print hexadecimal integer
    ecall
    call    fabsf                    # Call fabsf function

    # Print output value
    la      a0, output_msg           # Load output string
    li      a7, 4
    ecall
    mv      a0, a1                   # Set the value to print as a1 (output value)
    li      a7, 2                    # ecall 34: Print hexadecimal integer
    ecall

    lw      a2, 0(s1)                # Load expected result from expected_results

    # Print expected value
    la      a0, ans_msg              # Load expected value string
    li      a7, 4
    ecall
    mv      a0, a2                   # Set the value to print as a2 (expected value)
    li      a7, 2                    # ecall 34: Print hexadecimal integer
    ecall

    # Compare output with expected value
    beq     a1, a2, result_correct   # If equal, jump to result_correct
    j       print_incorrect          # If not equal, jump to print_incorrect
result_correct:
    call    print_correct            # Print "Result: Correct"

next_test:
    addi    s0, s0, 4                # Point to the next test value (each floating point is 4 bytes)
    addi    s1, s1, 4                # Point to the next expected result
    addi    t3, t3, 1                # Increment test index
    blt     t3, t2, loop             # If not all values tested, continue loop

    li      a0, 0                    # ecall 93: Program exit
    li      a7, 93                   
    ecall

fabsf:
    li      t0, 0x7FFFFFFF           # Load 0x7FFFFFFF to clear the sign bit
    and     a1, a1, t0               # Clear the sign bit
    ret

print_correct:
    la      a0, correct_msg
    li      a7, 4                    # ecall 4: Print string
    ecall
    ret

print_incorrect: 
    la      a0, incorrect_msg
    li      a7, 4                    # ecall 4: Print string
    ecall
    j       next_test

The program output will be like
Input : test_value Output: result Expected:expect_value -> Correct/Incorrect

Execution info
Cycles	549
Instr. retired	337
CPI	1.63
IPC	0.614

Fp16_to_fp32

C code

#include <stdio.h>
#include <stdint.h>

// Custom my_clz function to count the number of leading zeros in a 32-bit integer
static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

// Function to convert a 16-bit floating-point number to a 32-bit floating-point number
static inline uint32_t fp16_to_fp32(uint16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);

    uint32_t renorm_shift = my_clz(nonsign);
    renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;

    const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
    const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;

    return sign | ((((nonsign << renorm_shift >> 3) +
            ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}

// Function to output the bit pattern of a 32-bit floating-point number in hexadecimal format
void print_fp32(uint32_t fp32) {
    printf("0x%08X", fp32);
}

// Main function: tests the fp16_to_fp32 function
int main() {
    // Test data (16-bit half-precision floating-point numbers)
    uint16_t test_values[] = {
        0x3C00,  // +1.0 (16-bit)
        0xC000,  // -1.0 (16-bit)
        0x7BFF,  // +Inf (16-bit)
        0xFC00,  // -Inf (16-bit)
        0x7E00,  // NaN (16-bit)
        0x0000,  // +0.0 (16-bit)
        0x8000,  // -0.0 (16-bit)
        0x3555,  // Arbitrary small number (16-bit)
    };
    uint32_t expected_values[] = {
        0x3F800000, 
        0xC0000000, 
        0x477FE000, 
        0xFF800000, 
        0x7FC00000, 
        0x00000000, 
        0x80000000, 
        0x3EAAA000,
    };
    // Convert each test value and print the result
    for (int i = 0; i < sizeof(test_values) / sizeof(test_values[0]); ++i) {
        uint32_t result = fp16_to_fp32(test_values[i]);
            printf("16-bit: 0x%04X -> 32-bit: ", test_values[i]);
            print_fp32(result);
            printf("    expected_value= 0x%08X", expected_values[i]);
            if (result == expected_values[i])
                printf("    result: correct\n");
            else
                printf("    result: false\n");
        
    }

    return 0;
}

The program output will be like
16-bit: test_value[i] -> 32-bit: result expected_value= expected_values[i] result:Correct/Incorrect

Assembly

.data 
test_values:
    .half  0x3C00, 0xC000,  0x7BFF,  0xFC00,  0x7E00,  0x0000,  0x8000, 0x3555, 0x0400, 0x3000, 0x7800
ans_values:     
    .word  0x3F800000, 0xC0000000, 0x477FE000, 0xFF800000, 0x7FC00000, 0x00000000, 0x80000000, 0x3EAAA000, 0x38800000, 0x3e000000, 0x47000000
input_msg: .asciz "Input ->"
output_msg: .asciz "    Output -> "
ans_msg: .asciz "    expected_value -> "
correct_msg: .asciz "    Result: Correct\n"
false_msg:   .asciz "    Result: False\n"
format:     .asciz "%08X\n"          # Format string for printing hexadecimal numbers

.text
# Main function
main:
    la      s0, test_values     # Load the address of the test data
    la      s1, ans_values
    li      t0, 11             # Array size
    li      t1, 0               # Initialize index i to 0

test_loop:
    lhu      a1, 0(s0)           # Load the current 16-bit number (load halfword)
    la      a0, input_msg
    li      a7, 4
    ecall
    mv      a0, a1
    li      a7, 34
    ecall    
    call    fp16_to_fp32        # Call the fp16_to_fp32 conversion
    lw      a2, 0(s1)
    call    print_fp32          # Print the conversion result
# Verify if the conversion result is correct
    beq     a1, a2, result_correct   # If the conversion result matches the expected result, jump to result_correct
    call    print_false              # Otherwise, print "Result: False"
    j       next_iteration

result_correct:
    call    print_correct            # Print "Result: Correct"

next_iteration:
    addi    s0, s0, 2           # Move to the next test value (advance by 2 bytes each time)
    addi    s1, s1, 4           # Move to the next expected value (advance by 4 bytes each time)
    addi    t1, t1, 1           # i++
    blt     t1, t0, test_loop   # If i < 11, continue the loop
    # Use ecall to exit the program
    li      a0, 0
    li      a7, 93              # ecall 93: Exit the program
    ecall
# Print "Result: Correct"
print_correct:
    la      a0, correct_msg
    li      a7, 4              # ecall 4: Print string
    ecall
    ret

# Print "Result: False"
print_false:
    la      a0, false_msg
    li      a7, 4              # ecall 4: Print string
    ecall
    ret

# fp16_to_fp32 function: Convert a 16-bit floating-point number to a 32-bit floating-point number
fp16_to_fp32:
    addi    sp, sp, -16         # Allocate stack space for saved registers
    sw      ra, 12(sp)          # Save return address
    sw      t0, 8(sp)           # Save register t0
    sw      t1, 4(sp)           # Save register t1
    sw      t2, 0(sp)           # Save register t2

    slli    a1, a1, 16          # w = h << 16
    li      t0, 0x80000000      # Sign mask
    and     t1, a1, t0          # sign = w & 0x80000000
    li      t0, 0x7FFFFFFF      # Nonsign mask
    and     t2, a1, t0          # nonsign = w & 0x7FFFFFFF

    # Calculate the number of leading zeros
    mv      a1, t2              # Prepare argument
    call    my_clz              # renorm_shift = my_clz(nonsign)
    mv      t3, a1              # Save renorm_shift

    # Handle cases where renorm_shift is greater than 5
    li      t0, 5               
    bgt     t3, t0, renorm_sub  # If renorm_shift > 5, jump to subtraction
    li      t3, 0               # Otherwise, renorm_shift = 0
    j       renorm_done

renorm_sub:
    sub     t3, t3, t0          # renorm_shift -= 5

renorm_done:
    li      t0, 0x04000000      # Calculate inf_nan_mask
    add     t5, t2, t0
    srai    t5, t5, 8
    li      t0, 0x7F800000
    and     t5, t5, t0          # inf_nan_mask done

    addi    t0, t2, -1          # Calculate zero_mask
    srai    t0, t0, 31          # zero_mask done

    sll     t2, t2, t3          # nonsign << renorm_shift
    srli    t2, t2, 3           # >> 3
    li      t4, 0x70
    sub     t4, t4, t3
    slli    t4, t4, 23          # (0x70 - renorm_shift) << 23
    add     t2, t2, t4          # (nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)
    or      t2, t2, t5          # Add inf_nan_mask
    not     t0, t0              # ~zero_mask
    and     t2, t2, t0          # Handle zero_mask
    or      a1, t1, t2          # sign | nonsign

    lw      ra, 12(sp)          # Restore return address
    lw      t0, 8(sp)           # Restore register t0
    lw      t1, 4(sp)           # Restore register t1
    lw      t2, 0(sp)           # Restore register t2
    addi    sp, sp, 16          # Release stack space
    ret                         # Return 32-bit result

# Use ecall to print the 32-bit floating-point number
print_fp32:
    la      a0, output_msg      # Load "Output -> " string
    li      a7, 4               # ecall 4: Print string
    ecall
    # Print the hexadecimal representation of the 32-bit floating-point number
    mv      a0, a1
    li      a7, 34             # ecall 34: Print integer, hexadecimal format
    ecall

print_ans:
    la      a0, ans_msg         # Load ", Ans -> " string
    li      a7, 4               # ecall 4: Print string
    ecall

    mv      a0, a2              # Output 32-bit number
    li      a7, 34              # ecall 34: Print integer, hexadecimal format
    ecall

    ret

# my_clz function: Count the number of leading zeros
my_clz:
    addi    sp, sp, -16          # Allocate stack space for saved registers
    sw      t2, 12(sp)
    sw      t1, 8(sp)
    sw      ra, 4(sp)           # Save return address
    sw      t0, 0(sp)           # Save register t0


    li      t0, 31              # Initialize i = 31
    li      t1, 0               # count = 0
    li      t2, 1

clz_loop:
    beqz a1, all_zeros     # If x is 0, jump to all_zeros
    sll s3, t2, t0         # t0 = 1 << i
    and s3, a1, s3         # t0 = x & (1 << i)
    bne s3, zero, end_loop # If t0 != 0, jump to end_loop

    addi t1, t1, 1          # count++
    addi t0, t0, -1         # i--

    bge t0, zero, clz_loop  # If i >= 0, continue looping

all_zeros:
    li t1, 32               # If x is 0, count = 32

end_loop:
    mv      a1, t1               # Return count
    lw      t2, 12(sp)
    lw      t1, 8(sp)
    lw      ra, 4(sp)           # Restore return address
    lw      t0, 0(sp)           # Restore register t0
    addi    sp, sp, 16           # Release stack space
    ret

Execution info
Cycles	1680
Instr. retired	1226
CPI	1.37
IPC	0.73

Selected Problems

My question is implement count leading zero and popcount in C and use them to solve LeetCode problem 1342.Number of Steps to Reduce a Number to Zero
Given an integer num, return the number of steps to reduce it to zero.
Count leading zero can be used for quick solving task selection problem in OS.

In one step, if the current number is even, you have to divide it by 2, otherwise, you have to subtract 1 from it.

Example :

Input: num = 14
Output: 6
Explanation:
Step 1) 14 is even; divide by 2 and obtain 7.
Step 2) 7 is odd; subtract 1 and obtain 6.
Step 3) 6 is even; divide by 2 and obtain 3.
Step 4) 3 is odd; subtract 1 and obtain 2.
Step 5) 2 is even; divide by 2 and obtain 1.
Step 6) 1 is odd; subtract 1 and obtain 0.
Constraints:
0 <= num <= 10⁶

The concept is from Quiz 1, question C, involving my_clz function,which calculates how many leading zeros are in the binary representation of a 32-bit unsigned integer (uint32_t).Popcount function calculates the number of 1 bits in the binary representation of an integer.
Based on these function. I would like to implement count leading zero and popcount in C and use them to solve LeetCode problem 1342 to reduce the number of clock cycles.

my_clz

The my_clz function counts the number of leading zeros in a 32-bit unsigned integer x. It initializes a counter called count to zero and uses a for loop to iterate from the most significant bit (MSB) to the least significant bit (LSB). The expression x & (1U << i) checks if the i-th bit of x is 1; if it finds a 1, the loop breaks. If the current bit is 0, the counter increments. This continues until a 1 is found or all bits have been checked, and the function returns the total count of leading zeros.

Is it better? Can you prove?

Thanks, This indeed guides me toward another direction of thought.
The my_clz function uses a loop to count leading zeros by checking each bit from the MSB to LSB. It’s simple and predictable, especially when the input has many leading zeros. However, it has several downsides:

Performance: In the worst case, the loop runs 32 iterations. This adds unnecessary overhead compared to more efficient methods.
Branching: Each iteration introduces a conditional check, which may cause pipeline stalls and branch prediction failures, especially on architectures like RISC-V.
Inefficiency: Bitwise AND and shifting by i are costlier compared to other approaches that progressively reduce the bit range(for example,byte shift clz).

But when using the optimized clz implementations that use byte shifts strategy, it can outperform the original loop version. Those versions check large blocks of bits at once, reducing instructions and improving performance.

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

The corresponding RISC-V assembly are showing below.

my_clz:
    li s1, 0              # Initialize s1 (count) to 0
    li s0, 31             # Initialize s0 (bit position) to 31
    li s2, 1              # Set s2 to 1, used for bit shifting

clz_loop:
    beqz t3, all_zeros     # If t3 (input value) is 0, jump to all_zeros
    sll s3, s2, s0         # Shift 1 left by s0 and store in s3
    and s3, t3, s3         # Bitwise AND between t3 and s3
    bne s3, zero, end_loop # If the result is not 0, jump to end_loop

    addi s1, s1, 1         # Increment count (s1)
    addi s0, s0, -1        # Decrement bit position (s0)

    bge s0, zero, clz_loop # If bit position (s0) >= 0, repeat the loop

all_zeros:
    li s1, 32              # If the input is 0, set count to 32

end_loop:
    mv a0, s1              # Move count (s1) to a0 (return value)

Solution

My thinking path

Approaching it in an intuitive way, actually,it is a very easy problem.LeetCode 1342 asks for the number of operations required to reduce a given integer 𝑛 to zero. The operations are defined as follows:

If n is even, perform n/2.
If n is odd, perform n−1.

We can use a loop to implement this logic. Here is a solution written in C language:

Original C code (loop version)

int numberOfSteps(int num){
    int stepCounter = 0;
	
    while(num != 0){
        if(num % 2 == 0){
            num = num /2;
        }
        else{
            num--;
        }
        stepCounter++;
    }
    return stepCounter;
}

However, if solving it using bit manipulation, it involves many interesting derived knowledge points.
Since dividing a positive integer by 2 is equivalent to shifting it right by 1 bit, the total number of divisions required is equal to the number of times the most significant bit (MSB) needs to be shifted to the least significant position. Shifting the MSB to the least significant position takes 31 - __builtin_clz(num) steps, so the number of divisions required is also the same.
Additionally, when the binary representation of num contains k ones, it means a total of k subtractions of 1 are needed, as these ones will eventually be shifted to the least significant position. When the least significant bit is 1, it indicates that the number is odd and requires a subtraction of 1, so in total, popcount(num) subtractions of 1 are needed.

C code(with clz_loop and popcount)

#include <stdio.h>
#include <stdint.h>
int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return 31-count;
}
uint32_t my_popcnt (uint32_t u)
 {
      u = (u & 0x55555555) + ((u >> 1) & 0x55555555);
      u = (u & 0x33333333) + ((u >> 2) & 0x33333333);
      u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F);
      u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF);
      u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF);
      return u;
 }
int main() {
    int test_data[] = {14, 8, 123};
    
    for (int i = 0; i < 3; i++) {
        int result = test_data[i] ? my_popcnt(test_data[i]) + my_clz(test_data[i]) : 0;
        printf("Input num = %d Output = %d\n",test_data[i],result);
    }

    return 0;
}

The program output 
Input num = 14 Output = 6
Input num = 8 Output = 4
Input num = 123 Output = 12

C code(with clz_binary and popcount)

#include <stdio.h>
#include <stdint.h>
int length(uint32_t num) {
    uint32_t clz = 0;
    if ((num >> 16) == 0) {
        clz += 16;
        num <<= 16;
    }
    if ((num >> 24) == 0) {
        clz += 8;
        num <<= 8;
    }
    if ((num >> 28) == 0) {
        clz += 4;
        num <<= 4;
    }
    if ((num >> 30) == 0) {
        clz += 2;
        num <<= 2;
    }
    if ((num >> 31) == 0) {
        clz += 1;
    }
    return 31 - clz;
}

uint32_t my_popcnt (uint32_t u)
 {
      u = (u & 0x55555555) + ((u >> 1) & 0x55555555);
      u = (u & 0x33333333) + ((u >> 2) & 0x33333333);
      u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F);
      u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF);
      u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF);
      return u;
 }
int main() {
    int test_data[] = {14, 8, 123};
    
    for (int i = 0; i < 3; i++) {
        int result = test_data[i] ? my_popcnt(test_data[i]) + length(test_data[i]) : 0;
        printf("Input num = %d Output = %d\n",test_data[i],result);
    }

    return 0;
}

Does the above make sense to RV32I? Prove and discuss.

Yes, the code is suited for RV32I. This approach to bit manipulations for clz and popcnt matches well with the capabilities of the RV32I instruction set.

Shifts: num >> X → SRLI instruction
Bitwise AND: & → AND instruction
Additions: + → ADDI instruction
Loops and conditionals: if, for → BNE, BEQ for branching
Function calls: → JAL for function jumps, and JALR for returns.

This version of the clz function counts the number of leading zeros in a 32-bit unsigned integer (x). Here's the step-by-step breakdown of its logic:

Initial Check for Zero:
The function first checks if the input x is 0. If x == 0, the function immediately returns 32, since all bits are zero, meaning there are 32 leading zeros.
Checking and Shifting in Blocks:
The function proceeds to check the leading zeros by analyzing blocks of bits, starting from the most significant ones (leftmost).
Step 1: (x >> 16) == 0
It checks if the upper 16 bits of x are all zeros. If so, it adds 16 to n (since all 16 bits are zeros) and shifts x 16 bits to the left, discarding the leading zeros.
Step 2: (x >> 24) == 0
After the shift, it checks if the upper 8 bits of the remaining 16-bit value are all zeros. If so, it adds 8 to n and shifts x left by 8 bits.
Step 3: (x >> 28) == 0
Now, it checks if the upper 4 bits of the remaining 8-bit value are all zeros. If so, it adds 4 to n and shifts x left by 4 bits.
Step 4: (x >> 30) == 0
It then checks if the upper 2 bits of the remaining 4-bit value are all zeros. If so, it adds 2 to n and shifts x left by 2 bits.
Step 5: (x >> 31) == 0
Finally, it checks if the most significant bit (MSB) of the remaining 2-bit value is zero. If so, it adds 1 to n. And n is the count of leading zeros.
I included another method for comparison, and both versions use the same basic strategy of bit shifts and bit checking.

Byte shift clz

int clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 1;
    if ((x >> 16) == 0) { n += 16; x <<= 16; }
    if ((x >> 24) == 0) { n += 8; x <<= 8; }
    if ((x >> 28) == 0) { n += 4; x <<= 4; }
    if ((x >> 30) == 0) { n += 2; x <<= 2; }
    n = n - (x >> 31);
    return n;
}

second version(My version)

int clz(uint32_t x) {
    if (x == 0) return 32;
    int n = 0;
    if ((x >> 16) == 0) { n += 16; x <<= 16; }
    if ((x >> 24) == 0) { n += 8; x <<= 8; }
    if ((x >> 28) == 0) { n += 4; x <<= 4; }
    if ((x >> 30) == 0) { n += 2; x <<= 2; }
    if ((x >> 31) == 0) { n += 1; }
    return n;
}

Number of conditionals (if):
First version: Reduces one if by handling the 31st bit with the final line n = n - (x >> 31). This avoids one conditional branch but introduces an additional subtraction operation.
Second version: Contains an extra conditional branch for the 31st bit (x >> 31). This conditional statement causes RISC-V to execute one more comparison and branch.
For RISC-V, conditional branches may lead to potential branch prediction failures and pipeline stalls, which increase CPU cycle costs.
Code simplicity and ease of translation to RISC-V I:
The first version is slightly simpler in logic because it avoids extra conditional checks, especially when handling the highest bit. For RISC-V’s pipeline and branch prediction mechanisms, reducing branch jumps can decrease potential performance bottlenecks.
The second version contains an extra branch, which requires more branch instruction checks in RISC-V I (e.g., BNE, BEQ). Although the impact is not significant, this slightly increases processing time.
Thus, the first version might have a slight advantage in terms of fewer conditional branches.

If there are any errors or gaps in the response, I would appreciate it if you could point them out, thanks.

RISC-V assembly code

Loop version 1

.data
nums:
.word 14, 8, 123  # Test data nums = 14, 8, 123
input_str:
.string "Input: num = "  # Defines the string "Input: num = "
output_str:
.string " Output: "  # Defines the string "Output: "

.text
main:
# Initialize pointers and array size
la s0, nums  # Load nums array start address
li s1, 3     # Number of elements in nums (optimized: hardcoded)
li t0, 0  # i = 0

loop_outer:
beq t0, s1, done_outer  # Exit if i == size
lw t3, 0(s0)  # Load nums[i] into t3
addi s0, s0, 4  # Move nums pointer
addi t0, t0, 1  # i++
mv a1, t3  # Save original num
jal ra, numberOfSteps  # Call numberOfSteps, result in a1

# Print "Input: num = ", num, " Output: ", and step count in one go
la a0, input_str  # Load "Input: num = " string
li a7, 4  # Syscall 4: print string
ecall
mv a0, a1  # Move original num into a0
li a7, 1  # Syscall 1: print integer
ecall
la a0, output_str  # Load " Output: " string
li a7, 4  # Syscall 4: print string
ecall
mv a0, a2  # Move step count into a0
li a7, 1  # Syscall 1: print integer
ecall
li a0, 0x0A  # Print newline
li a7, 11  # Syscall 11: print char
ecall

j loop_outer  # Repeat for next element

done_outer:
li a7, 10  # Syscall 10: exit program
ecall

numberOfSteps:
li a2, 0  # count = 0

loop_inner:
beqz t3, done_inner  # Exit if num == 0
andi t5, t3, 1  # Check if num is odd
beqz t5, even  # If even, jump to even
addi t3, t3, -1  # If odd, subtract 1
j update_count  # Jump to update count

even:
srli t3, t3, 1  # Divide by 2 if even

update_count:
addi a2, a2, 1  # Increment count
j loop_inner  # Loop until num is 0

done_inner:
ret

The program output 
Input num = 14 Output = 6
Input num = 8 Output = 4
Input num = 123 Output = 12

Execution info
Cycles	380
Instr. retired	230
CPI	1.65
IPC	0.605

Shorten the above, using fewer instructions.

Loop version 2 (Using fewer instructions.)

I revised the code in loop_inner section.

Branch elimination: Using sub and add combined with condition flags avoids the need for two branch conditions for odd and even numbers, simplifying the control flow.
t5 stores the state of whether num is odd.
When num is odd, sub t3, t3, t5 essentially subtracts 1 (i.e., turning the odd number into an even one), and add a2, a2, t5 adds 1 to a2, which represents the step count for this operation.
Unified processing: The operation srli t3, t3, 1, which divides num by 2, is placed afterward, unifying the operation flow for both cases.

.data
nums:
.word 14, 8, 123,  # Test data nums = 14, 8, 123
input_str:
.string "Input: num = "  # Defines the string "Input: num = "
output_str:
.string " Output: "  # Defines the string "Output: "

.text
main:
# Initialize pointers and array size
la s0, nums  # Load nums array start address
li s1, 3     # Number of elements in nums (optimized: hardcoded)
li t0, 0  # i = 0

loop_outer:
beq t0, s1, done_outer  # Exit if i == size
lw t3, 0(s0)  # Load nums[i] into t3
addi s0, s0, 4  # Move nums pointer
addi t0, t0, 1  # i++
mv a1, t3  # Save original num
jal ra, numberOfSteps  # Call numberOfSteps, result in a1

# Print "Input: num = ", num, " Output: ", and step count in one go
la a0, input_str  # Load "Input: num = " string
li a7, 4  # Syscall 4: print string
ecall
mv a0, a1  # Move original num into a0
li a7, 1  # Syscall 1: print integer
ecall
la a0, output_str  # Load " Output: " string
li a7, 4  # Syscall 4: print string
ecall
mv a0, a2  # Move step count into a0
li a7, 1  # Syscall 1: print integer
ecall
li a0, 0x0A  # Print newline
li a7, 11  # Syscall 11: print char
ecall

j loop_outer  # Repeat for next element

done_outer:
li a7, 10  # Syscall 10: exit program
ecall

numberOfSteps:
li a2, 0  # count = 0

loop_inner:
andi t5, t3, 1  # Check if num is odd
sub t3, t3, t5  # If odd, subtract 1
add a2, a2, t5  # If odd, count + 1(t5),else, count+0(5)
srli t3, t3, 1
beqz t3, done_inner  # Exit if num == 0
addi a2, a2, 1  # Increment count
j loop_inner  # Jump to update count

done_inner:
ret

Execution info
Cycles	270
Instr. retired	184
CPI	1.47
IPC	0.681

Using clz_loop and popcount

.data
nums:
.word 14, 8, 123  # Test data nums
input_str:
.string "Input: num = "  
output_str:
.string " Output: "  

.text
main:
la t0, nums  # Load nums array start address
li t6, 3     # Number of elements in nums (optimized: hardcoded)
li t2, 0  # i = 0
    
loop_outer:
    beq t2, t6, done_outer # If i == size, exit the outer loop

    addi t0, t0, 4        # Move the nums pointer forward by 4 bytes (size of an int)
    addi t2, t2, 1        # i++

    lw t3, -4(t0)         # Get nums[i], load it into t3 (num), using -4 offset
    mv t4, t3             # Save the original num into t4 for later display
    jal ra, my_popcnt      # Call my_popcnt, store the step count in a1

# Display string "Input: num = "
    la a0, input_str      # Load "Input: num = "
    li a7, 4              # System call number 4 (print string)
    ecall                 # Execute system call

# Display the original num
    mv a0, t4             # Pass the original num to a0 (syscall 1: print integer)
    li a7, 1              # System call number 1 (print integer)
    ecall                 # Display the number

# Display string " Output: "
    la a0, output_str     # Load " Output: "
    li a7, 4              # System call number 4 (print string)
    ecall                 # Execute system call

# Display the step count
    mv a0, a1             # Pass the step count (a1) to a0
    li a7, 1              # System call number 1 (print integer)
    ecall                 # Display the step count

# New line
    li a0, 0x0A           # New line (0x0A is the newline character)
    li a7, 11             # System call number 11 (print character)
    ecall

# Move to the next test data
    j loop_outer          # Return to the outer loop

done_outer:
# End program
    li a7, 10             # System call number 10 (exit)
    ecall

my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
    li s0, 0x55555555     # Load mask 0x55555555
    and s1, s0, t3        # s1 = u & 0x55555555
    srli s2, t3, 1        # s2 = u >> 1
    and s2, s2, s0        # s2 = (u >> 1) & 0x55555555
    add a1, s2, s1        # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)

# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
    li s0, 0x33333333     # Load mask 0x33333333
    and s1, s0, a1        # s1 = u & 0x33333333
    srli s2, a1, 2        # s2 = u >> 2 
    and s2, s2, s0        # s2 = (u >> 2) & 0x33333333
    add a1, s2, s1        # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)

# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
    li s0, 0x0F0F0F0F     # Load mask 0x0F0F0F0F
    and s1, s0, a1        # s1 = u & 0x0F0F0F0F
    srli s2, a1, 4        # s2 = u >> 4 
    and s2, s2, s0        # s2 = (u >> 4) & 0x0F0F0F0F
    add a1, s2, s1        # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)

# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
    li s0, 0x00FF00FF     # Load mask 0x00FF00FF
    and s1, s0, a1        # s1 = u & 0x00FF00FF
    srli s2, a1, 8        # s2= u >> 8 
    and s2, s2, s0        # s2 = (u >> 8) & 0x00FF00FF
    add a1, s2, s1        # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)

# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
    li s0, 0x0000FFFF     # Load mask 0x0000FFFF
    and s1, s0, a1        # s1 = u & 0x0000FFFF
    srli s2, a1, 16       # s2= u >> 16 
    and s2, s2, s0        # s2 = (u >> 16) & 0x0000FFFF
    add a1, s2, s1        # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)

length:
    li s1, 0              # s1 = count = 0
    li s0, 31             # s0 = i = 31
    li s2, 1              # s2 = 1
    beqz t3, all_zeros     # If x is 0, jump to all_zeros
    
clz_loop:    
    sll s3, s2, s0         # s3 = 1 << i 
    and s3, t3, s3         # s3 = x & (1 << i) 
    bne s3, zero, end_loop # If s3 != 0, jump to end_loop 

    addi s1, s1, 1          # count++
    addi s0, s0, -1         # i--
    bge s0, zero, clz_loop  # If i >= 0, continue loop

all_zeros:
    li s1, 32               # If x is 0, count = 32

end_loop:
    li s2, 31               # s2 = 31 
    sub a0, s2, s1          # a0 = 31 - clz (return value)
    add a1, a0, a1

ret                         # Return to main program

Execution info
Cycles	915
Instr. retired	688
CPI	1.33
IPC	0.752

Using clz_binary and popcount

.data
nums:
.word 14, 8, 123  # Test data nums
input_str:
.string "Input: num = "  
output_str:
.string " Output: "  

.text
main:
la t0, nums  # Load nums array start address
li t6, 3     # Number of elements in nums (optimized: hardcoded)
li t2, 0  # i = 0

loop_outer:
beq t2, t6, done_outer  # If i == size, exit the outer loop

addi t0, t0, 4  # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1  # Increment i

lw t3, -4(t0)  # Load nums[i] into t3 (num), note the -4 offset
mv t4, t3  # Save the original num into t4 for later display
jal ra, my_popcnt  

# Display the string "Input: num = "
la a0, input_str  
li a7, 4  # Syscall number 4 (print string)
ecall  

# Display the original num
mv a0, t4  
li a7, 1  # Syscall number 1 (print integer)
ecall 
# Display the string " Output: "
la a0, output_str   
li a7, 4  # Syscall number 4 (print string)
ecall 

# Display the step count
mv a0, a1  
li a7, 1  # Syscall number 1 (print integer)
ecall  
# Print a newline
li a0, 0x0A  # Newline (0x0A is the newline character)
li a7, 11  # Syscall number 11 (print char)
ecall

# Move to the next test data
j loop_outer 

done_outer:
# Exit the program
li a7, 10  # Syscall number 10 (exit)
ecall

my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
li s0, 0x55555555        # Load mask 0x55555555
and s1, s0, t3           # s1 = u & 0x55555555
srli s2, t3, 1           # s2 = u >> 1
and s2, s2, s0           # s2 = (u >> 1) & 0x55555555
add a1, s2, s1           # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)

# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
li s0, 0x33333333        # Load mask 0x33333333
and s1, s0, a1           # s1 = u & 0x33333333
srli s2, a1, 2           # s2 = u >> 2
and s2, s2, s0           # s2 = (u >> 2) & 0x33333333
add a1, s2, s1           # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)

# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
li s0, 0x0F0F0F0F        # Load mask 0x0F0F0F0F
and s1, s0, a1           # s1 = u & 0x0F0F0F0F
srli s2, a1, 4           # s2 = u >> 4
and s2, s2, s0           # s2 = (u >> 4) & 0x0F0F0F0F
add a1, s2, s1           # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)

# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
li s0, 0x00FF00FF        # Load mask 0x00FF00FF
and s1, s0, a1           # s1 = u & 0x00FF00FF
srli s2, a1, 8           # s2 = u >> 8
and s2, s2, s0           # s2 = (u >> 8) & 0x00FF00FF
add a1, s2, s1           # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)

# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
li s0, 0x0000FFFF        # Load mask 0x0000FFFF
and s1, s0, a1           # s1 = u & 0x0000FFFF
srli s2, a1, 16          # s2 = u >> 16
and s2, s2, s0           # s2 = (u >> 16) & 0x0000FFFF
add a1, s2, s1           # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)

my_clz:
li s0, 0  # s0 = clz = 0

# if ((num >> 16) == 0)
srli s1, t3, 16  # s1 = num >> 16
bnez s1, check_24  # If num >> 16 is not 0, jump to check_24
addi s0, s0, 16  # clz += 16
slli t3, t3, 16  # num <<= 16

check_24:
# if ((num >> 24) == 0)
srli s1, t3, 24  # s1 = num >> 24
bnez s1, check_28  # If num >> 24 is not 0, jump to check_28
addi s0, s0, 8  # clz += 8
slli t3, t3, 8  # num <<= 8

check_28:
# if ((num >> 28) == 0)
srli s1, t3, 28  # s1 = num >> 28
bnez s1, check_30  # If num >> 28 is not 0, jump to check_30
addi s0, s0, 4  # clz += 4
slli t3, t3, 4  # num <<= 4

check_30:
# if ((num >> 30) == 0)
srli s1, t3, 30  # s1 = num >> 30
bnez s1, check_31  # If num >> 30 is not 0, jump to check_31
addi s0, s0, 2  # clz += 2
slli t3, t3, 2  # num <<= 2

check_31:
# if ((num >> 31) == 0)
srli s1, t3, 31  # s1 = num >> 31
bnez s1, done  # If num >> 31 is not 0, jump to done
addi s0, s0, 1  # clz += 1

done:
li s1, 31  
sub a0, s1, s0  # a0 = 31 - clz (return value)
add a1, a0, a1  # Add clz result to popcnt result

ret  # Return to main program

Execution info
Cycles	302
Instr. retired	231
CPI	1.31
IPC	0.765

Analysis

When I first got the result, I was a bit confused. It’s not surprising that the loop version has fewer cycles than the clz_loop version because clz_loop iterates up to 30 times for each data point(it iterates more times due to the value of test datas are small), while the loop version iterates approximately O(log₂(10⁶)=19.931) times at most for each data point. However, it seems reasonable that the clz_binary version should have even fewer cycles. So, I tested again using 10⁶ as the test data and got the following result(I only used one test data)

Loop version 2 (Using fewer instructions.)

Execution info
Cycles	235
Instr. retired	171
CPI	1.37
IPC	0.728

Using clz_binary and popcount

Execution info
Cycles	111
Instr. retired	80
CPI	1.39
IPC	0.721

Comparison Analysis

Cycle Count:
clz_binary and popcount is significantly faster, using only 111 cycles compared to 235 cycles for Loop Version 2. This indicates that the clz_binary approach is much more efficient in terms of the number of cycles required to execute the program.
Instructions Retired:
The clz_binary and popcount method has 80 instructions retired, while Loop Version 2 has 171 instructions retired. This suggests that clz_binary is not only executing fewer cycles but also utilizing fewer instructions to achieve its result.
CPI (Cycles Per Instruction):
The CPI for Loop Version 2 is 1.37, while for clz_binary, it is slightly higher at 1.39. A lower CPI generally indicates a more efficient program. In this case, despite having a slightly higher CPI, the clz_binary method compensates for this by executing far fewer instructions overall, resulting in fewer total cycles.
IPC (Instructions Per Cycle):
Loop Version 2 has an IPC of 0.728, whereas clz_binary has an IPC of 0.721. A higher IPC is generally better, as it indicates more instructions are being executed per cycle. In this case, Loop Version 2 performs slightly better in IPC, but again, this is overshadowed by the overall lower cycle count of clz_binary.

Brief summary

Efficiency: The clz_binary and popcount method outperforms Loop Version 2 in terms of execution cycles and instructions retired. It shows a more efficient implementation for this task.
Performance Trade-off: While Loop Version 2 has a slightly better IPC and CPI, the significantly lower cycle count and instructions make the clz_binary approach more favorable overall.
Optimization Considerations: The reduction in both cycles and instructions in the clz_binary and popcount implementation suggests that optimizing the algorithm for counting bits (like using bitwise operation) can lead to substantial performance gains, particularly in scenarios involving larger numbers or datasets.

manual hazard detection

A 5-stage pipeline processor with hazard detection and forwarding functionality can produce correct results for each test data. I would like to attempt manual hazard detection. When using a processor without hazard detection, the simulator shows the message "Unknown system call in register 'a7': 0." It's clear that there is a hazard around the "ecall" instruction. Furthermore,when the processor has a forwarding unit, most data hazards can be resolved, but a load-use data hazard still requires stalling for one cycle to prevent data loss. Below, I will discuss in detail and demonstrate the various situations and solutions I encountered during this process.

Three type of Data Hazard

Structure Hazard

Insufficient hardware resources. (e.g. Assuming there is no difference between IM and DM, IF and MEM both have memory access, and problems will occur when they overlap, resulting in the inability to execute multiple instructions at the same time)
Data Hazard

Data Hazards occur when an instruction depends on the result of previous instruction and that result of instruction has not yet been computed.
There are four types of data dependencies: Read after Write (RAW), Write after Read (WAR), Write after Write (WAW), and Read after Read (RAR). These are explained as follows below.
- Read after Write (RAW) :
  It is also known as True dependency or Flow dependency. It occurs when the value produced by an instruction is required by a subsequent instruction. For example,
  ADD R1, --, --; SUB --, R1, --;
- Write after Read (WAR) :
  It is also known as anti dependency. These hazards occur when the output register of an instruction is used right after read by a previous instruction. For example,
  ADD --, R1, --; SUB R1, --, --;
- Write after Write (WAW) :
  It is also known as output dependency. These hazards occur when the output register of an instruction is used for write after written by previous instruction. For example,
  ADD R1, --, --; SUB R1, --, --;
- Read after Read (RAR) :
  It occurs when the instruction both read from the same register. For example,
  ADD --, R1, --; SUB --, R1, --;
  Since reading a register value does not change the register value, these Read after Read (RAR) dependencies don’t cause a problem for the processor.
Control Hazard

Also known as branch hazards, occur when the pipeline makes wrong decisions on branch prediction, leading to the fetching of incorrect instructions. This typically happens with conditional branches and jumps.

Load-Use Data Hazard

Consider the following sequence of instructions:

lw $t2, 20($t1)  # a load-use hazard
and $t4, $t2, $t5

This hazard cannot be resolved by simple forwarding.
The value lw writes into $t2 is not available until lw completes the MEM stage, but and needs that value when it enters the EX stage, which is when lw enters the MEM stage.

	c1	c2	c3	c4	c5	c6
lw $t 2, 20 ($ t1)	IF	ID	EX	MEM	WB
and $t4, $t2, $t5		IF	ID	EX	MEM	WB

Handling Data Hazards :

These are various methods we use to handle hazards:

Forwarding :
It adds special circuitry to the pipeline. This method works because it takes less time for the required values to travel through a wire than it does for a pipeline segment to compute its result.
Code reordering :
We need a special type of software to reorder code. We call this type of software a hardware-dependent compiler.
Stall Insertion :
it inserts one or more stall (no-op instructions) into the pipeline, which delays the execution of the current instruction until the required operand is written to the register file, but this method decreases pipeline efficiency and throughput.
A load-use hazard requires delaying the execution of the using instruction until the result from the loading instruction can be made available to the using instruction
If we can stall the execution of the using instruction for one cycle:

	c1	c2	c3	c4	c5	c6	c7
lw $t 2, 20 ($ t1)	IF	ID	EX	MEM	WB
NOP		IF	ID	EX	MEM	WB
and $t4, $t2, $t5			IF	ID	EX	MEM	WB

value to be loaded to $t2 will be available in the MEM/WB buffer when the using instruction moves from ID to EX
that value can be forwarded to the using instruction as the using instruction enters the EX stage

Implement

I chose the version with clz_binary and popcount to implement the assembly code for a processor without a hazard detection unit.
First, I ran the original code and encountered the error Unknown system call in register 'a7': 0. After referring to some previous notes for solutions, I learned that three NOPs need to be inserted before the ecall instruction for the system call to work.

done_outer:
# Exit the program
li a7, 10  # Syscall number 10 (exit)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall

After fixing all the system call instructions, I ran the code again and found that the output was empty.

I summarized the data hazards of branch instructions and divided them into three scenarios for discussion.

If the branch instruction's register has a data dependency with the destination register of the second or third preceding ALU instruction, forwarding can resolve the issue.
If the branch instruction's register has a data dependency with the destination register of the immediately preceding ALU instruction or the second preceding load instruction, the pipeline must stall for one clock cycle.
If the branch instruction's register has a data dependency with the destination register of the immediately preceding load instruction, the pipeline must stall for two clock cycles.

Based on this principle, I corrected my code and rearranged the la t0 nums instruction to be placed after the li t2 0 instruction without affecting the original program, allowing me to reduce one NOP.

main:
# Initialize pointers and array size
li t6, 3     # Number of elements in nums
li t2, 0  # i = 0
la t0, nums  # Load nums array start address

loop_outer:
beq t2, t6, done_outer  # If i == size, exit the outer loop

I used this code as an example.

li s0, 0x55555555        # Load mask 0x55555555
and s1, s0, t3           # s1 = u & 0x55555555
srli s2, t3, 1           # s2 = u >> 1
and s2, s2, s0           # s2 = (u >> 1) & 0x55555555
add a1, s2, s1           # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)

When it starts executing, it will be translated into the following machine code.

    c8:        55555437        lui x8 0x55555
    cc:        55540413        addi x8 x8 1365
    d0:        01c474b3        and x9 x8 x28
    d4:        001e5913        srli x18 x28 1
    d8:        00897933        and x18 x18 x8
    dc:        009905b3        add x11 x18 x9

First, li s0, 0x55555555 will be translated into two instructions: lui x8, 0x55555, followed by addi x8, x8, 1365 to load the mask 0x55555555 into x8. When and s1, s0, t3 reaches the EX stage, since the previous instruction is addi x8, x8, 1365 and there is no load-use hazard, it can execute normally without adding a NOP. (When the li instruction is translated from pseudo code to machine code, it should inherently include a stall.)

Next, I used this code as an example.I will delve into the internal circuit behavior.

loop_outer:
beq t2, t6, done_outer  # If i == size, exit the outer loop
addi t0, t0, 4  # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1  # Increment i
lw t3, -4(t0)  # Load nums[i] into t3 (num), note the -4 offset
addi x0,x0,0 #NOP
mv t4, t3  # Save the original num into t4 for later display
jal ra, my_popcnt  # Call my_popcnt, step count is stored in a1

When it starts executing, it will be translated into the following machine code.

    10:        0bf38063        beq x7 x31 160 <done_outer>
    14:        00428293        addi x5 x5 4
    18:        00138393        addi x7 x7 1
    1c:        ffc2ae03        lw x28 -4 x5
    20:        00000013        addi x0 x0 0
    24:        000e0e93        addi x29 x28 0
    28:        09c000ef        jal x1 156 <my_popcnt>

The first image shows that the lw t3, -4(t0) instruction retrieves the data 0x0000000e in the MEM stage but has not yet been stored in the Mem/WB pipeline register. Therefore, even with forwarding, the correct value cannot be passed back to the EX stage at this time.

The second image shows that, through forwarding, even if the addi x29 x28 0 instruction reads an incorrect value from x28, it can be corrected to the right value via forwarding.

The issue here confuses me a bit. In my opinion, a circuit with full forwarding capability when dealing with a data hazard, it should only require stalling for one cycle. However, it seems that the branch cannot receive values from two-stage forwarding.

I initially thought that if there was a data hazard with an ALU instruction before a branch, the result could be forwarded from the EX stage to the ID stage to determine whether the branch should be taken, requiring only one clock cycle. However, after observing the circuit behavior, I found that the branch after the srli instruction needed two NOPs.I still need to conduct more testing and research to clarify this part.

But for now, I have already modified a version without hazard detection. Below is my complete source code.

.data
nums:
.word 14, 8, 123  # Test data nums = 14, 8, 123
input_str:
.string "Input: num = "  # Defines the string "Input: num = "
output_str:
.string " Output: "  # Defines the string "Output: "

.text
main:
# Initialize pointers and array size
li t6, 3     # Number of elements in nums (optimized: hardcoded)
li t2, 0  # i = 0
la t0, nums  # Load nums array start address

loop_outer:
beq t2, t6, done_outer  # If i == size, exit the outer loop
addi t0, t0, 4  # Move the nums pointer forward by 4 bytes (size of an int)
addi t2, t2, 1  # Increment i
lw t3, -4(t0)  # Load nums[i] into t3 (num), note the -4 offset
addi x0,x0,0 #NOP
mv t4, t3  # Save the original num into t4 for later display
jal ra, my_popcnt  # Call my_popcnt, step count is stored in a1

# Display the string "Input: num = "
la a0, input_str  # Load the address of "Input: num = "
li a7, 4  # Syscall number 4 (print string)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall  # Execute syscall

# Display the original num
mv a0, t4  # Move the original num into a0 (syscall 1: print integer)
li a7, 1  # Syscall number 1 (print integer)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall  # Display the number

# Display the string " Output: "
la a0, output_str  # Load the address of " Output: "
li a7, 4  # Syscall number 4 (print string)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall  # Execute syscall

# Display the step count
mv a0, a1  # Move the step count (a1) into a0
li a7, 1  # Syscall number 1 (print integer)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall  # Display the step count

# Print a newline
li a0, 0x0A  # Newline (0x0A is the newline character)
li a7, 11  # Syscall number 11 (print char)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall

# Move to the next test data
j loop_outer  # Jump back to the outer loop

done_outer:
# Exit the program
li a7, 10  # Syscall number 10 (exit)
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
addi x0,x0,0 #NOP
ecall

my_popcnt:
# u = (u & 0x55555555) + ((u >> 1) & 0x55555555)
li s0, 0x55555555        # Load mask 0x55555555
and s1, s0, t3           # s1 = u & 0x55555555
srli s2, t3, 1           # s2 = u >> 1
and s2, s2, s0           # s2 = (u >> 1) & 0x55555555
add a1, s2, s1           # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555)

# u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
li s0, 0x33333333        # Load mask 0x33333333
and s1, s0, a1           # s1 = u & 0x33333333
srli s2, a1, 2           # s2 = u >> 2
and s2, s2, s0           # s2 = (u >> 2) & 0x33333333
add a1, s2, s1           # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333)

# u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)
li s0, 0x0F0F0F0F        # Load mask 0x0F0F0F0F
and s1, s0, a1           # s1 = u & 0x0F0F0F0F
srli s2, a1, 4           # s2 = u >> 4
and s2, s2, s0           # s2 = (u >> 4) & 0x0F0F0F0F
add a1, s2, s1           # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F)

# u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)
li s0, 0x00FF00FF        # Load mask 0x00FF00FF
and s1, s0, a1           # s1 = u & 0x00FF00FF
srli s2, a1, 8           # s2 = u >> 8
and s2, s2, s0           # s2 = (u >> 8) & 0x00FF00FF
add a1, s2, s1           # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF)

# u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)
li s0, 0x0000FFFF        # Load mask 0x0000FFFF
and s1, s0, a1           # s1 = u & 0x0000FFFF
srli s2, a1, 16          # s2 = u >> 16
and s2, s2, s0           # s2 = (u >> 16) & 0x0000FFFF
add a1, s2, s1           # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF)

my_clz:
li s0, 0  # s0 = clz = 0

# if ((num >> 16) == 0)
srli s1, t3, 16  # s1 = num >> 16
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_24  # If num >> 16 is not 0, jump to check_24
addi s0, s0, 16  # clz += 16
slli t3, t3, 16  # num <<= 16

check_24:
# if ((num >> 24) == 0)
srli s1, t3, 24  # s1 = num >> 24
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_28  # If num >> 24 is not 0, jump to check_28
addi s0, s0, 8  # clz += 8
slli t3, t3, 8  # num <<= 8

check_28:
# if ((num >> 28) == 0)
srli s1, t3, 28  # s1 = num >> 28
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_30  # If num >> 28 is not 0, jump to check_30
addi s0, s0, 4  # clz += 4
slli t3, t3, 4  # num <<= 4

check_30:
# if ((num >> 30) == 0)
srli s1, t3, 30  # s1 = num >> 30
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, check_31  # If num >> 30 is not 0, jump to check_31
addi s0, s0, 2  # clz += 2
slli t3, t3, 2  # num <<= 2

check_31:
# if ((num >> 31) == 0)
srli s1, t3, 31  # s1 = num >> 31
addi x0,x0,0 # NOP
addi x0,x0,0 # NOP
bnez s1, done  # If num >> 31 is not 0, jump to done
addi s0, s0, 1  # clz += 1

done:
li s1, 31  # s1 = 31
addi x0,x0,0 # NOP
sub a0, s1, s0  # a0 = 31 - clz (return value)
add a1, a0, a1  # Add clz result to popcnt result

ret  # Return to main program

Clz_binary_popcnt (Without hazard detection)

Execution info
Cycles	383
Instr. retired	315
CPI	1.22
IPC	0.822

參考資料

Quiz1 of Computer Architecture (2024 Fall)
Leetcode 1342.Number of Steps to Reduce a Number to Zero
The RISC-V Instruction Set Manual

Always refer to primary sources, such as official RISC-V documentation.

Assignment1: Implement count leading zero and popcount to solve LeetCode problem 1342.

Quiz 1_C

Fabsf function

C code

Assembly

Fp16_to_fp32

C code

Assembly

Selected Problems

my_clz

Solution

My thinking path

Original C code (loop version)

C code(with clz_loop and popcount)

C code(with clz_binary and popcount)

RISC-V assembly code

Loop version 1

Loop version 2 (Using fewer instructions.)

Using clz_loop and popcount

Using clz_binary and popcount

Analysis

Loop version 2 (Using fewer instructions.)

Using clz_binary and popcount

Comparison Analysis

Brief summary

manual hazard detection

Three type of Data Hazard

Structure Hazard

Data Hazard

Control Hazard

Load-Use Data Hazard

Handling Data Hazards :

Implement

Clz_binary_popcnt (Without hazard detection)

參考資料

Read more

linux2025-ideas

lab0-c