# Assignment1: Implement count leading zero and popcount to solve LeetCode problem 1342. contributed by < [`KAILIAO`](https://github.com/eason9999)> Full source code [here](https://github.com/eason9999/Computer_Architecture_Hw1) ## Quiz 1_C I chose problem C from Quiz 1 and translated it from C code into a complete RISC-V assembly program. The implementation includes the ```fabsf``` function and the ```fp16_to_fp32``` function, which converts a 16-bit floating-point number in IEEE half-precision format (bit representation) to a 32-bit floating-point number in IEEE single-precision format (bit representation). This implementation avoids using any floating-point arithmetic and utilizes the clz function ## Fabsf function ### C code ```c #include <stdio.h> #include <stdint.h> // Function to return the absolute value of a floating-point number static inline float fabsf(float x) { uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value x = *(float *)&i; // Write the modified bits back into the float return x; } int main() { // Test values float test_values[] = { 3.14f, -3.14f, 0.0f, -0.0f, 1.0f, -1.0f, 123456.789f, -123456.789f }; // Expected results for absolute values float expected_results[] = { 3.14f, 3.14f, 0.0f, 0.0f, 1.0f, 1.0f, 123456.789f, 123456.789f }; // Number of test cases int num_tests = sizeof(test_values) / sizeof(test_values[0]); // Run tests for (int i = 0; i < num_tests; i++) { float result = fabsf(test_values[i]); printf("Test %d: fabsf(%.6f) = %.6f (Expected: %.6f) -> %s\n", i + 1, test_values[i], result, expected_results[i], result == expected_results[i] ? "Correct" : "Incorrect"); } return 0; } ``` >The program output will be like Test number: fabsf(==test_value==) = ==result== (Expected:==expect_value==) -> ==Correct==/==Incorrect== ### Assembly ```c .data test_values: .word 0x4048F5C3, 0xC048F5C3, 0x00000000, 0x80000000, 0x3F800000, 0xBF800000, 0x47F7C000, 0xA7E7C3D1 expected_values: .word 0x4048F5C3, 0x4048F5C3, 0x00000000, 0x00000000, 0x3F800000, 0x3F800000, 0x47F7C000, 0x27E7C3D1 correct_msg: .asciz " -> Correct\n" incorrect_msg: .asciz " -> Incorrect\n" input_msg: .asciz "Input: " output_msg: .asciz " Output: " ans_msg: .asciz ", Expected: " .text main: la s0, test_values # Load the address of test values la s1, expected_values # Load the address of expected results li t2, 8 # Number of test cases li t3, 0 # Initialize index to 0 loop: lw a1, 0(s0) # Load floating point number (as integer) from test_values # Print input value la a0, input_msg # Load input string li a7, 4 ecall mv a0, a1 # Set the value to print as a1 (input value) li a7, 2 # ecall 34: Print hexadecimal integer ecall call fabsf # Call fabsf function # Print output value la a0, output_msg # Load output string li a7, 4 ecall mv a0, a1 # Set the value to print as a1 (output value) li a7, 2 # ecall 34: Print hexadecimal integer ecall lw a2, 0(s1) # Load expected result from expected_results # Print expected value la a0, ans_msg # Load expected value string li a7, 4 ecall mv a0, a2 # Set the value to print as a2 (expected value) li a7, 2 # ecall 34: Print hexadecimal integer ecall # Compare output with expected value beq a1, a2, result_correct # If equal, jump to result_correct j print_incorrect # If not equal, jump to print_incorrect result_correct: call print_correct # Print "Result: Correct" next_test: addi s0, s0, 4 # Point to the next test value (each floating point is 4 bytes) addi s1, s1, 4 # Point to the next expected result addi t3, t3, 1 # Increment test index blt t3, t2, loop # If not all values tested, continue loop li a0, 0 # ecall 93: Program exit li a7, 93 ecall fabsf: li t0, 0x7FFFFFFF # Load 0x7FFFFFFF to clear the sign bit and a1, a1, t0 # Clear the sign bit ret print_correct: la a0, correct_msg li a7, 4 # ecall 4: Print string ecall ret print_incorrect: la a0, incorrect_msg li a7, 4 # ecall 4: Print string ecall j next_test ``` >The program output will be like >Input : ==test_value== Output: ==result== Expected:==expect_value== -> ==Correct==/==Incorrect== | Execution info | | |:-------------- |:----- | | Cycles | 549 | | Instr. retired | 337 | | CPI | 1.63 | | IPC | 0.614 | ## Fp16_to_fp32 ### C code ```c #include <stdio.h> #include <stdint.h> // Custom my_clz function to count the number of leading zeros in a 32-bit integer static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } // Function to convert a 16-bit floating-point number to a 32-bit floating-point number static inline uint32_t fp16_to_fp32(uint16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } // Function to output the bit pattern of a 32-bit floating-point number in hexadecimal format void print_fp32(uint32_t fp32) { printf("0x%08X", fp32); } // Main function: tests the fp16_to_fp32 function int main() { // Test data (16-bit half-precision floating-point numbers) uint16_t test_values[] = { 0x3C00, // +1.0 (16-bit) 0xC000, // -1.0 (16-bit) 0x7BFF, // +Inf (16-bit) 0xFC00, // -Inf (16-bit) 0x7E00, // NaN (16-bit) 0x0000, // +0.0 (16-bit) 0x8000, // -0.0 (16-bit) 0x3555, // Arbitrary small number (16-bit) }; uint32_t expected_values[] = { 0x3F800000, 0xC0000000, 0x477FE000, 0xFF800000, 0x7FC00000, 0x00000000, 0x80000000, 0x3EAAA000, }; // Convert each test value and print the result for (int i = 0; i < sizeof(test_values) / sizeof(test_values[0]); ++i) { uint32_t result = fp16_to_fp32(test_values[i]); printf("16-bit: 0x%04X -> 32-bit: ", test_values[i]); print_fp32(result); printf(" expected_value= 0x%08X", expected_values[i]); if (result == expected_values[i]) printf(" result: correct\n"); else printf(" result: false\n"); } return 0; } ``` >The program output will be like >16-bit: ==test_value[i]== -> 32-bit: ==result== expected_value= ==expected_values[i]== result:==Correct==/==Incorrect== ### Assembly ```c .data test_values: .half 0x3C00, 0xC000, 0x7BFF, 0xFC00, 0x7E00, 0x0000, 0x8000, 0x3555, 0x0400, 0x3000, 0x7800 ans_values: .word 0x3F800000, 0xC0000000, 0x477FE000, 0xFF800000, 0x7FC00000, 0x00000000, 0x80000000, 0x3EAAA000, 0x38800000, 0x3e000000, 0x47000000 input_msg: .asciz "Input ->" output_msg: .asciz " Output -> " ans_msg: .asciz " expected_value -> " correct_msg: .asciz " Result: Correct\n" false_msg: .asciz " Result: False\n" format: .asciz "%08X\n" # Format string for printing hexadecimal numbers .text # Main function main: la s0, test_values # Load the address of the test data la s1, ans_values li t0, 11 # Array size li t1, 0 # Initialize index i to 0 test_loop: lhu a1, 0(s0) # Load the current 16-bit number (load halfword) la a0, input_msg li a7, 4 ecall mv a0, a1 li a7, 34 ecall call fp16_to_fp32 # Call the fp16_to_fp32 conversion lw a2, 0(s1) call print_fp32 # Print the conversion result # Verify if the conversion result is correct beq a1, a2, result_correct # If the conversion result matches the expected result, jump to result_correct call print_false # Otherwise, print "Result: False" j next_iteration result_correct: call print_correct # Print "Result: Correct" next_iteration: addi s0, s0, 2 # Move to the next test value (advance by 2 bytes each time) addi s1, s1, 4 # Move to the next expected value (advance by 4 bytes each time) addi t1, t1, 1 # i++ blt t1, t0, test_loop # If i < 11, continue the loop # Use ecall to exit the program li a0, 0 li a7, 93 # ecall 93: Exit the program ecall # Print "Result: Correct" print_correct: la a0, correct_msg li a7, 4 # ecall 4: Print string ecall ret # Print "Result: False" print_false: la a0, false_msg li a7, 4 # ecall 4: Print string ecall ret # fp16_to_fp32 function: Convert a 16-bit floating-point number to a 32-bit floating-point number fp16_to_fp32: addi sp, sp, -16 # Allocate stack space for saved registers sw ra, 12(sp) # Save return address sw t0, 8(sp) # Save register t0 sw t1, 4(sp) # Save register t1 sw t2, 0(sp) # Save register t2 slli a1, a1, 16 # w = h << 16 li t0, 0x80000000 # Sign mask and t1, a1, t0 # sign = w & 0x80000000 li t0, 0x7FFFFFFF # Nonsign mask and t2, a1, t0 # nonsign = w & 0x7FFFFFFF # Calculate the number of leading zeros mv a1, t2 # Prepare argument call my_clz # renorm_shift = my_clz(nonsign) mv t3, a1 # Save renorm_shift # Handle cases where renorm_shift is greater than 5 li t0, 5 bgt t3, t0, renorm_sub # If renorm_shift > 5, jump to subtraction li t3, 0 # Otherwise, renorm_shift = 0 j renorm_done renorm_sub: sub t3, t3, t0 # renorm_shift -= 5 renorm_done: li t0, 0x04000000 # Calculate inf_nan_mask add t5, t2, t0 srai t5, t5, 8 li t0, 0x7F800000 and t5, t5, t0 # inf_nan_mask done addi t0, t2, -1 # Calculate zero_mask srai t0, t0, 31 # zero_mask done sll t2, t2, t3 # nonsign << renorm_shift srli t2, t2, 3 # >> 3 li t4, 0x70 sub t4, t4, t3 slli t4, t4, 23 # (0x70 - renorm_shift) << 23 add t2, t2, t4 # (nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23) or t2, t2, t5 # Add inf_nan_mask not t0, t0 # ~zero_mask and t2, t2, t0 # Handle zero_mask or a1, t1, t2 # sign | nonsign lw ra, 12(sp) # Restore return address lw t0, 8(sp) # Restore register t0 lw t1, 4(sp) # Restore register t1 lw t2, 0(sp) # Restore register t2 addi sp, sp, 16 # Release stack space ret # Return 32-bit result # Use ecall to print the 32-bit floating-point number print_fp32: la a0, output_msg # Load "Output -> " string li a7, 4 # ecall 4: Print string ecall # Print the hexadecimal representation of the 32-bit floating-point number mv a0, a1 li a7, 34 # ecall 34: Print integer, hexadecimal format ecall print_ans: la a0, ans_msg # Load ", Ans -> " string li a7, 4 # ecall 4: Print string ecall mv a0, a2 # Output 32-bit number li a7, 34 # ecall 34: Print integer, hexadecimal format ecall ret # my_clz function: Count the number of leading zeros my_clz: addi sp, sp, -16 # Allocate stack space for saved registers sw t2, 12(sp) sw t1, 8(sp) sw ra, 4(sp) # Save return address sw t0, 0(sp) # Save register t0 li t0, 31 # Initialize i = 31 li t1, 0 # count = 0 li t2, 1 clz_loop: beqz a1, all_zeros # If x is 0, jump to all_zeros sll s3, t2, t0 # t0 = 1 << i and s3, a1, s3 # t0 = x & (1 << i) bne s3, zero, end_loop # If t0 != 0, jump to end_loop addi t1, t1, 1 # count++ addi t0, t0, -1 # i-- bge t0, zero, clz_loop # If i >= 0, continue looping all_zeros: li t1, 32 # If x is 0, count = 32 end_loop: mv a1, t1 # Return count lw t2, 12(sp) lw t1, 8(sp) lw ra, 4(sp) # Restore return address lw t0, 0(sp) # Restore register t0 addi sp, sp, 16 # Release stack space ret ``` | Execution info | | |:-------------- |:----- | | Cycles | 1680 | | Instr. retired | 1226 | | CPI | 1.37 | | IPC | 0.73 | --- ## Selected Problems My question is implement count leading zero and popcount in C and use them to solve LeetCode problem [1342.Number of Steps to Reduce a Number to Zero](https://leetcode.com/problems/number-of-steps-to-reduce-a-number-to-zero/description/) Given an integer num, return the number of steps to reduce it to zero. Count leading zero can be used for quick solving task selection problem in OS. In one step, if the current number is even, you have to divide it by 2, otherwise, you have to subtract 1 from it. - Example : >Input: num = 14 Output: 6 Explanation: Step 1) 14 is even; divide by 2 and obtain 7. Step 2) 7 is odd; subtract 1 and obtain 6. Step 3) 6 is even; divide by 2 and obtain 3. Step 4) 3 is odd; subtract 1 and obtain 2. Step 5) 2 is even; divide by 2 and obtain 1. Step 6) 1 is odd; subtract 1 and obtain 0. Constraints: 0 <= num <= 10^6^ The concept is from Quiz 1, question C, involving ==my_clz== function,which calculates how many leading zeros are in the binary representation of a 32-bit unsigned integer (uint32_t).==Popcount== function calculates the number of 1 bits in the binary representation of an integer. Based on these function. I would like to implement count leading zero and popcount in C and use them to solve LeetCode problem 1342 to reduce the number of clock cycles. ## my_clz The my_clz function counts the number of leading zeros in a 32-bit unsigned integer x. It initializes a counter called count to zero and uses a for loop to iterate from the most significant bit (MSB) to the least significant bit (LSB). The expression x & (1U << i) checks if the i-th bit of x is 1; if it finds a 1, the loop breaks. If the current bit is 0, the counter increments. This continues until a 1 is found or all bits have been checked, and the function returns the total count of leading zeros. :::danger Is it better? Can you prove? ::: :::success Thanks, This indeed guides me toward another direction of thought. The my_clz function uses a loop to count leading zeros by checking each bit from the MSB to LSB. It’s simple and predictable, especially when the input has many leading zeros. However, it has several downsides: - Performance: In the worst case, the loop runs 32 iterations. This adds unnecessary overhead compared to more efficient methods. - Branching: Each iteration introduces a conditional check, which may cause pipeline stalls and branch prediction failures, especially on architectures like RISC-V. - Inefficiency: Bitwise AND and shifting by i are costlier compared to other approaches that progressively reduce the bit range(for example,byte shift clz). But when using the optimized clz implementations that use byte shifts strategy, it can outperform the original loop version. Those versions check large blocks of bits at once, reducing instructions and improving performance. ::: ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } ``` The corresponding RISC-V assembly are showing below. ```c my_clz: li s1, 0 # Initialize s1 (count) to 0 li s0, 31 # Initialize s0 (bit position) to 31 li s2, 1 # Set s2 to 1, used for bit shifting clz_loop: beqz t3, all_zeros # If t3 (input value) is 0, jump to all_zeros sll s3, s2, s0 # Shift 1 left by s0 and store in s3 and s3, t3, s3 # Bitwise AND between t3 and s3 bne s3, zero, end_loop # If the result is not 0, jump to end_loop addi s1, s1, 1 # Increment count (s1) addi s0, s0, -1 # Decrement bit position (s0) bge s0, zero, clz_loop # If bit position (s0) >= 0, repeat the loop all_zeros: li s1, 32 # If the input is 0, set count to 32 end_loop: mv a0, s1 # Move count (s1) to a0 (return value) ``` ## Solution ### My thinking path Approaching it in an intuitive way, actually,it is a very easy problem.LeetCode 1342 asks for the number of operations required to reduce a given integer 𝑛 to zero. The operations are defined as follows: - If n is even, perform n/2. - If n is odd, perform nβˆ’1. We can use a loop to implement this logic. Here is a solution written in C language: ### Original C code (loop version) ```c int numberOfSteps(int num){ int stepCounter = 0; while(num != 0){ if(num % 2 == 0){ num = num /2; } else{ num--; } stepCounter++; } return stepCounter; } ``` However, if solving it using bit manipulation, it involves many interesting derived knowledge points. Since dividing a positive integer by 2 is equivalent to shifting it right by 1 bit, the total number of divisions required is equal to the number of times the most significant bit (MSB) needs to be shifted to the least significant position. Shifting the MSB to the least significant position takes ***31 - __builtin_clz(num)*** steps, so the number of divisions required is also the same. Additionally, when the binary representation of num contains k ones, it means a total of k subtractions of 1 are needed, as these ones will eventually be shifted to the least significant position. When the least significant bit is 1, it indicates that the number is odd and requires a subtraction of 1, so in total, ***popcount(num)*** subtractions of 1 are needed. ### C code(with clz_loop and popcnt_naive) ```c #include <stdio.h> #include <stdint.h> int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; i--) { if (x & (1U << i)) break; count++; } return count; } int my_popcnt(unsigned v) { int count = 0; while (v){ v &= (v - 1); count++; } return count; } int main() { int test_data[] = {14, 8, 123}; for (int i = 0; i < 3; i++) { int result = test_data[i] ? my_popcnt(test_data[i]) + 31 - my_clz(test_data[i]) : 0; printf("Input num = %d Output = %d\n",test_data[i],result); } return 0; } ``` ``` The program output Input num = 14 Output = 6 Input num = 8 Output = 4 Input num = 123 Output = 12 ``` ### C code(with clz_binary and popcnt_branchless) ```c #include <stdio.h> #include <stdint.h> int my_clz(uint32_t num) { int clz = 0; if ((num >> 16) == 0) { clz += 16,num <<= 16; } if ((num >> 24) == 0) { clz += 8,num <<= 8; } if ((num >> 28) == 0) { clz += 4,num <<= 4; } if ((num >> 30) == 0) { clz += 2,num <<= 2; } if ((num >> 31) == 0) { clz += 1; } return clz; } uint32_t my_popcnt (uint32_t u) { u = (u & 0x55555555) + ((u >> 1) & 0x55555555); u = (u & 0x33333333) + ((u >> 2) & 0x33333333); u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F); u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF); u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF); return u; } int main() { int test_data[] = {14, 8, 123}; for (int i = 0; i < 3; i++) { int result = test_data[i] ? my_popcnt(test_data[i]) + 31 - my_clz(test_data[i]) : 0; printf("Input num = %d Output = %d\n",test_data[i],result); } return 0; } ``` :::danger Does the above make sense to RV32I? Prove and discuss. ::: :::success Yes, the code is suited for RV32I. This approach to bit manipulations for clz and popcnt matches well with the capabilities of the RV32I instruction set. - Shifts: num >> X β†’ SRLI instruction - Bitwise AND: & β†’ AND instruction - Additions: + β†’ ADDI instruction - Loops and conditionals: if, for β†’ BNE, BEQ for branching - Function calls: β†’ JAL for function jumps, and JALR for returns. This version of the clz function counts the number of leading zeros in a 32-bit unsigned integer (x). Here's the step-by-step breakdown of its logic: - Initial Check for Zero: The function first checks if the input x is 0. If x == 0, the function immediately returns 32, since all bits are zero, meaning there are 32 leading zeros. - Checking and Shifting in Blocks: The function proceeds to check the leading zeros by analyzing blocks of bits, starting from the most significant ones (leftmost). Step 1: (x >> 16) == 0 It checks if the upper 16 bits of x are all zeros. If so, it adds 16 to n (since all 16 bits are zeros) and shifts x 16 bits to the left, discarding the leading zeros. Step 2: (x >> 24) == 0 After the shift, it checks if the upper 8 bits of the remaining 16-bit value are all zeros. If so, it adds 8 to n and shifts x left by 8 bits. Step 3: (x >> 28) == 0 Now, it checks if the upper 4 bits of the remaining 8-bit value are all zeros. If so, it adds 4 to n and shifts x left by 4 bits. Step 4: (x >> 30) == 0 It then checks if the upper 2 bits of the remaining 4-bit value are all zeros. If so, it adds 2 to n and shifts x left by 2 bits. Step 5: (x >> 31) == 0 Finally, it checks if the most significant bit (MSB) of the remaining 2-bit value is zero. If so, it adds 1 to n. And n is the count of leading zeros. I included another method for comparison, and both versions use the same basic strategy of bit shifts and bit checking. `Byte shift clz` ``` int clz(uint32_t x) { if (x == 0) return 32; int n = 1; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } n = n - (x >> 31); return n; } ``` `second version(My version)` ``` int clz(uint32_t x) { if (x == 0) return 32; int n = 0; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } if ((x >> 31) == 0) { n += 1; } return n; } ``` 1. **Number of conditionals (if):** First version: Reduces one if by handling the 31st bit with the final line n = n - (x >> 31). This avoids one conditional branch but introduces an additional subtraction operation. Second version: Contains an extra conditional branch for the 31st bit (x >> 31). This conditional statement causes RISC-V to execute one more comparison and branch. ***For RISC-V, conditional branches may lead to potential branch prediction failures and pipeline stalls, which increase CPU cycle costs.*** 2. **Code simplicity and ease of translation to RISC-V I:** The first version is slightly simpler in logic because it avoids extra conditional checks, especially when handling the highest bit. For RISC-V’s pipeline and branch prediction mechanisms, reducing branch jumps can decrease potential performance bottlenecks. The second version contains an extra branch, which requires more branch instruction checks in RISC-V I (e.g., BNE, BEQ). Although the impact is not significant, this slightly increases processing time. Thus, the first version might have a slight advantage in terms of fewer conditional branches. If there are any errors or gaps in the response, I would appreciate it if you could point them out, thanks. ::: ## RISC-V assembly code ### Loop version 1 ```c .data nums: .word 14, 8, 123 # Test data nums = 14, 8, 123 input_str: .string "Input: num = " # Defines the string "Input: num = " output_str: .string " Output: " # Defines the string "Output: " .text main: # Initialize pointers and array size la s0, nums # Load nums array start address li s1, 3 # Number of elements in nums (optimized: hardcoded) li t0, 0 # i = 0 loop_outer: beq t0, s1, done_outer # Exit if i == size lw t3, 0(s0) # Load nums[i] into t3 addi s0, s0, 4 # Move nums pointer addi t0, t0, 1 # i++ mv a1, t3 # Save original num jal ra, numberOfSteps # Call numberOfSteps, result in a1 # Print "Input: num = ", num, " Output: ", and step count in one go la a0, input_str # Load "Input: num = " string li a7, 4 # Syscall 4: print string ecall mv a0, a1 # Move original num into a0 li a7, 1 # Syscall 1: print integer ecall la a0, output_str # Load " Output: " string li a7, 4 # Syscall 4: print string ecall mv a0, a2 # Move step count into a0 li a7, 1 # Syscall 1: print integer ecall li a0, 0x0A # Print newline li a7, 11 # Syscall 11: print char ecall j loop_outer # Repeat for next element done_outer: li a7, 10 # Syscall 10: exit program ecall numberOfSteps: li a2, 0 # count = 0 loop_inner: beqz t3, done_inner # Exit if num == 0 andi t5, t3, 1 # Check if num is odd beqz t5, even # If even, jump to even addi t3, t3, -1 # If odd, subtract 1 j update_count # Jump to update count even: srli t3, t3, 1 # Divide by 2 if even update_count: addi a2, a2, 1 # Increment count j loop_inner # Loop until num is 0 done_inner: ret ``` ``` The program output Input num = 14 Output = 6 Input num = 8 Output = 4 Input num = 123 Output = 12 ``` | Execution info | | |:-------------- |:----- | | Cycles | 380 | | Instr. retired | 230 | | CPI | 1.65 | | IPC | 0.605 | :::danger Shorten the above, using fewer instructions. ::: ## Loop version 2 (Using fewer instructions.) I revised the code ```in loop_inner``` section. - Branch elimination: Using sub and add combined with condition flags avoids the need for two branch conditions for odd and even numbers, simplifying the control flow. - t5 stores the state of whether num is odd. - When num is odd, sub t3, t3, t5 essentially subtracts 1 (i.e., turning the odd number into an even one), and add a2, a2, t5 adds 1 to a2, which represents the step count for this operation. - Unified processing: The operation srli t3, t3, 1, which divides num by 2, is placed afterward, unifying the operation flow for both cases. ```c .data nums: .word 14, 8, 123, # Test data nums = 14, 8, 123 input_str: .string "Input: num = " # Defines the string "Input: num = " output_str: .string " Output: " # Defines the string "Output: " .text main: # Initialize pointers and array size la s0, nums # Load nums array start address li s1, 3 # Number of elements in nums (optimized: hardcoded) li t0, 0 # i = 0 loop_outer: beq t0, s1, done_outer # Exit if i == size lw t3, 0(s0) # Load nums[i] into t3 addi s0, s0, 4 # Move nums pointer addi t0, t0, 1 # i++ mv a1, t3 # Save original num jal ra, numberOfSteps # Call numberOfSteps, result in a1 # Print "Input: num = ", num, " Output: ", and step count in one go la a0, input_str # Load "Input: num = " string li a7, 4 # Syscall 4: print string ecall mv a0, a1 # Move original num into a0 li a7, 1 # Syscall 1: print integer ecall la a0, output_str # Load " Output: " string li a7, 4 # Syscall 4: print string ecall mv a0, a2 # Move step count into a0 li a7, 1 # Syscall 1: print integer ecall li a0, 0x0A # Print newline li a7, 11 # Syscall 11: print char ecall j loop_outer # Repeat for next element done_outer: li a7, 10 # Syscall 10: exit program ecall numberOfSteps: li a2, 0 # count = 0 loop_inner: andi t5, t3, 1 # Check if num is odd sub t3, t3, t5 # If odd, subtract 1 add a2, a2, t5 # If odd, count + 1(t5),else count+0(t5) srli t3, t3, 1 beqz t3, done_inner # Exit if num == 0 addi a2, a2, 1 # Increment count j loop_inner # Jump to update count done_inner: ret ``` | Execution info | | |:-------------- |:----- | | Cycles | 270 | | Instr. retired | 184 | | CPI | 1.47 | | IPC | 0.681 | ### Using clz_loop and popcount ```c .data nums: .word 14, 8, 123 # Test data nums input_str: .string "Input: num = " output_str: .string " Output: " .text main: la t0, nums # Load nums array start address li t6, 3 # Number of elements in nums (optimized: hardcoded) li t2, 0 # i = 0 loop_outer: beq t2, t6, done_outer # If i == size, exit the outer loop addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int) addi t2, t2, 1 # i++ lw t3, -4(t0) # Get nums[i], load it into t3 (num), using -4 offset mv t4, t3 # Save the original num into t4 for later display jal ra, my_popcnt # Call my_popcnt, store the step count in a1 # Display string "Input: num = " la a0, input_str # Load "Input: num = " li a7, 4 # System call number 4 (print string) ecall # Execute system call # Display the original num mv a0, t4 # Pass the original num to a0 (syscall 1: print integer) li a7, 1 # System call number 1 (print integer) ecall # Display the number # Display string " Output: " la a0, output_str # Load " Output: " li a7, 4 # System call number 4 (print string) ecall # Execute system call # Display the step count mv a0, a1 # Pass the step count (a1) to a0 li a7, 1 # System call number 1 (print integer) ecall # Display the step count # New line li a0, 0x0A # New line (0x0A is the newline character) li a7, 11 # System call number 11 (print character) ecall # Move to the next test data j loop_outer # Return to the outer loop done_outer: # End program li a7, 10 # System call number 10 (exit) ecall my_popcnt: # u = (u & 0x55555555) + ((u >> 1) & 0x55555555) li s0, 0x55555555 # Load mask 0x55555555 and s1, s0, t3 # s1 = u & 0x55555555 srli s2, t3, 1 # s2 = u >> 1 and s2, s2, s0 # s2 = (u >> 1) & 0x55555555 add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555) # u = (u & 0x33333333) + ((u >> 2) & 0x33333333) li s0, 0x33333333 # Load mask 0x33333333 and s1, s0, a1 # s1 = u & 0x33333333 srli s2, a1, 2 # s2 = u >> 2 and s2, s2, s0 # s2 = (u >> 2) & 0x33333333 add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333) # u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F and s1, s0, a1 # s1 = u & 0x0F0F0F0F srli s2, a1, 4 # s2 = u >> 4 and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) # u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) li s0, 0x00FF00FF # Load mask 0x00FF00FF and s1, s0, a1 # s1 = u & 0x00FF00FF srli s2, a1, 8 # s2= u >> 8 and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) # u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) li s0, 0x0000FFFF # Load mask 0x0000FFFF and s1, s0, a1 # s1 = u & 0x0000FFFF srli s2, a1, 16 # s2= u >> 16 and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) length: li s1, 0 # s1 = count = 0 li s0, 31 # s0 = i = 31 li s2, 1 # s2 = 1 beqz t3, all_zeros # If x is 0, jump to all_zeros clz_loop: sll s3, s2, s0 # s3 = 1 << i and s3, t3, s3 # s3 = x & (1 << i) bne s3, zero, end_loop # If s3 != 0, jump to end_loop addi s1, s1, 1 # count++ addi s0, s0, -1 # i-- bge s0, zero, clz_loop # If i >= 0, continue loop all_zeros: li s1, 32 # If x is 0, count = 32 end_loop: li s2, 31 # s2 = 31 sub a0, s2, s1 # a0 = 31 - clz (return value) add a1, a0, a1 ret # Return to main program ``` | Execution info | | |:-------------- |:----- | | Cycles | 915 | | Instr. retired | 688 | | CPI | 1.33 | | IPC | 0.752 | ### Using clz_binary and popcount ```c .data nums: .word 14, 8, 123 # Test data nums input_str: .string "Input: num = " output_str: .string " Output: " .text main: la t0, nums # Load nums array start address li t6, 3 # Number of elements in nums (optimized: hardcoded) li t2, 0 # i = 0 loop_outer: beq t2, t6, done_outer # If i == size, exit the outer loop addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int) addi t2, t2, 1 # Increment i lw t3, -4(t0) # Load nums[i] into t3 (num), note the -4 offset mv t4, t3 # Save the original num into t4 for later display jal ra, my_popcnt # Display the string "Input: num = " la a0, input_str li a7, 4 # Syscall number 4 (print string) ecall # Display the original num mv a0, t4 li a7, 1 # Syscall number 1 (print integer) ecall # Display the string " Output: " la a0, output_str li a7, 4 # Syscall number 4 (print string) ecall # Display the step count mv a0, a1 li a7, 1 # Syscall number 1 (print integer) ecall # Print a newline li a0, 0x0A # Newline (0x0A is the newline character) li a7, 11 # Syscall number 11 (print char) ecall # Move to the next test data j loop_outer done_outer: # Exit the program li a7, 10 # Syscall number 10 (exit) ecall my_popcnt: # u = (u & 0x55555555) + ((u >> 1) & 0x55555555) li s0, 0x55555555 # Load mask 0x55555555 and s1, s0, t3 # s1 = u & 0x55555555 srli s2, t3, 1 # s2 = u >> 1 and s2, s2, s0 # s2 = (u >> 1) & 0x55555555 add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555) # u = (u & 0x33333333) + ((u >> 2) & 0x33333333) li s0, 0x33333333 # Load mask 0x33333333 and s1, s0, a1 # s1 = u & 0x33333333 srli s2, a1, 2 # s2 = u >> 2 and s2, s2, s0 # s2 = (u >> 2) & 0x33333333 add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333) # u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F and s1, s0, a1 # s1 = u & 0x0F0F0F0F srli s2, a1, 4 # s2 = u >> 4 and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) # u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) li s0, 0x00FF00FF # Load mask 0x00FF00FF and s1, s0, a1 # s1 = u & 0x00FF00FF srli s2, a1, 8 # s2 = u >> 8 and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) # u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) li s0, 0x0000FFFF # Load mask 0x0000FFFF and s1, s0, a1 # s1 = u & 0x0000FFFF srli s2, a1, 16 # s2 = u >> 16 and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) my_clz: li s0, 0 # s0 = clz = 0 # if ((num >> 16) == 0) srli s1, t3, 16 # s1 = num >> 16 bnez s1, check_24 # If num >> 16 is not 0, jump to check_24 addi s0, s0, 16 # clz += 16 slli t3, t3, 16 # num <<= 16 check_24: # if ((num >> 24) == 0) srli s1, t3, 24 # s1 = num >> 24 bnez s1, check_28 # If num >> 24 is not 0, jump to check_28 addi s0, s0, 8 # clz += 8 slli t3, t3, 8 # num <<= 8 check_28: # if ((num >> 28) == 0) srli s1, t3, 28 # s1 = num >> 28 bnez s1, check_30 # If num >> 28 is not 0, jump to check_30 addi s0, s0, 4 # clz += 4 slli t3, t3, 4 # num <<= 4 check_30: # if ((num >> 30) == 0) srli s1, t3, 30 # s1 = num >> 30 bnez s1, check_31 # If num >> 30 is not 0, jump to check_31 addi s0, s0, 2 # clz += 2 slli t3, t3, 2 # num <<= 2 check_31: # if ((num >> 31) == 0) srli s1, t3, 31 # s1 = num >> 31 bnez s1, done # If num >> 31 is not 0, jump to done addi s0, s0, 1 # clz += 1 done: li s1, 31 sub a0, s1, s0 # a0 = 31 - clz (return value) add a1, a0, a1 # Add clz result to popcnt result ret # Return to main program ``` | Execution info | | |:-------------- |:----- | | Cycles | 302 | | Instr. retired | 231 | | CPI | 1.31 | | IPC | 0.765 | ## Analysis When I first got the result, I was a bit confused. It’s not surprising that the loop version has fewer cycles than the clz_loop version because clz_loop iterates up to 30 times for each data point(it iterates more times due to the value of test datas are small), while the loop version iterates approximately O(log~2~(10^6^)=19.931) times at most for each data point. However, it seems reasonable that the clz_binary version should have even fewer cycles. So, I tested again using 10^6^ as the test data and got the following result(I only used one test data) ### Loop version 2 (Using fewer instructions.) | Execution info | | |:-------------- |:----- | | Cycles | 235 | | Instr. retired | 171 | | CPI | 1.37 | | IPC | 0.728 | ### Using clz_binary and popcount | Execution info | | |:-------------- |:----- | | Cycles | 111 | | Instr. retired | 80 | | CPI | 1.39 | | IPC | 0.721 | ### Comparison Analysis - Cycle Count: clz_binary and popcount is significantly faster, using only 111 cycles compared to 235 cycles for Loop Version 2. This indicates that the clz_binary approach is much more efficient in terms of the number of cycles required to execute the program. - Instructions Retired: The clz_binary and popcount method has 80 instructions retired, while Loop Version 2 has 171 instructions retired. This suggests that clz_binary is not only executing fewer cycles but also utilizing fewer instructions to achieve its result. - CPI (Cycles Per Instruction): The CPI for Loop Version 2 is 1.37, while for clz_binary, it is slightly higher at 1.39. A lower CPI generally indicates a more efficient program. In this case, despite having a slightly higher CPI, the clz_binary method compensates for this by executing far fewer instructions overall, resulting in fewer total cycles. - IPC (Instructions Per Cycle): Loop Version 2 has an IPC of 0.728, whereas clz_binary has an IPC of 0.721. A higher IPC is generally better, as it indicates more instructions are being executed per cycle. In this case, Loop Version 2 performs slightly better in IPC, but again, this is overshadowed by the overall lower cycle count of clz_binary. ## Brief summary Efficiency: The clz_binary and popcount method outperforms Loop Version 2 in terms of execution cycles and instructions retired. It shows a more efficient implementation for this task. Performance Trade-off: While Loop Version 2 has a slightly better IPC and CPI, the significantly lower cycle count and instructions make the clz_binary approach more favorable overall. Optimization Considerations: The reduction in both cycles and instructions in the clz_binary and popcount implementation suggests that optimizing the algorithm for counting bits (like using bitwise operation) can lead to substantial performance gains, particularly in scenarios involving larger numbers or datasets. ## manual hazard detection A 5-stage pipeline processor with hazard detection and forwarding functionality can produce correct results for each test data. I would like to attempt manual hazard detection. When using a processor without hazard detection, the simulator shows the message ```"Unknown system call in register 'a7': 0."``` It's clear that there is a hazard around the "ecall" instruction. Furthermore,when the processor has a forwarding unit, most data hazards can be resolved, but a ```load-use``` data hazard still requires stalling for one cycle to prevent data loss. Below, I will discuss in detail and demonstrate the various situations and solutions I encountered during this process. ## Three type of Data Hazard - Structure Hazard - Insufficient hardware resources. (e.g. Assuming there is no difference between IM and DM, IF and MEM both have memory access, and problems will occur when they overlap, resulting in the inability to execute multiple instructions at the same time) - Data Hazard - Data Hazards occur when an instruction depends on the result of previous instruction and that result of instruction has not yet been computed. There are four types of data dependencies: Read after Write (RAW), Write after Read (WAR), Write after Write (WAW), and Read after Read (RAR). These are explained as follows below. - Read after Write (RAW) : It is also known as True dependency or Flow dependency. It occurs when the value produced by an instruction is required by a subsequent instruction. For example, `` ADD R1, --, --; SUB --, R1, --; `` - Write after Read (WAR) : It is also known as anti dependency. These hazards occur when the output register of an instruction is used right after read by a previous instruction. For example, ``ADD --, R1, --; SUB R1, --, --;`` - Write after Write (WAW) : It is also known as output dependency. These hazards occur when the output register of an instruction is used for write after written by previous instruction. For example, ``ADD R1, --, --; SUB R1, --, --;`` - Read after Read (RAR) : It occurs when the instruction both read from the same register. For example, ``ADD --, R1, --; SUB --, R1, --;`` Since reading a register value does not change the register value, these Read after Read (RAR) dependencies don’t cause a problem for the processor. - Control Hazard - Also known as branch hazards, occur when the pipeline makes wrong decisions on branch prediction, leading to the fetching of incorrect instructions. This typically happens with conditional branches and jumps. ## Load-Use Data Hazard Consider the following sequence of instructions: ``` lw $t2, 20($t1) # a load-use hazard and $t4, $t2, $t5 ``` This hazard cannot be resolved by simple forwarding. The value ``lw`` writes into ``$t2`` is not available until lw completes the ``MEM stage``, but ``and`` needs that value when it enters the ``EX stage``, which is when lw enters the MEM stage. | | c1 | c2 | c3 | c4 | c5 | c6 | |:----------------- |:--- | --- | --- |:--- |:--- |:--- | | lw $t2,20($t1) | IF | ID | EX | MEM | WB | | | and $t4, $t2, $t5 | | IF | ID | EX | MEM | WB | ## Handling Data Hazards : These are various methods we use to handle hazards: - `Forwarding` : It adds special circuitry to the pipeline. This method works because it takes less time for the required values to travel through a wire than it does for a pipeline segment to compute its result. - `Code reordering` : We need a special type of software to reorder code. We call this type of software a hardware-dependent compiler. - `Stall Insertion` : it inserts one or more stall (no-op instructions) into the pipeline, which delays the execution of the current instruction until the required operand is written to the register file, but this method decreases pipeline efficiency and throughput. A load-use hazard requires delaying the execution of the using instruction until the result from the loading instruction can be made available to the using instruction If we can stall the execution of the using instruction for one cycle: | | c1 | c2 | c3 | c4 | c5 | c6 | c7 | |:----------------- |:--- |:--- | --- |:--- |:--- |:--- | --- | | lw $t2,20($t1) | IF | ID | EX | MEM | WB | | | | NOP | | IF | ID | EX | MEM | WB | | | and $t4, $t2, $t5 | | | IF | ID | EX | MEM | WB | - value to be loaded to `$t2` will be available in the MEM/WB buffer when the using instruction moves from `ID` to `EX` - that value can be forwarded to the using instruction as the using instruction enters the EX stage ## Implement I chose the version with clz_binary and popcount to implement the assembly code for a processor without a hazard detection unit. First, I ran the original code and encountered the error ``Unknown system call in register 'a7': 0``. After referring to some previous notes for solutions, I learned that three NOPs need to be inserted before the ecall instruction for the system call to work. ``` done_outer: # Exit the program li a7, 10 # Syscall number 10 (exit) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall ``` After fixing all the system call instructions, I ran the code again and found that the output was empty. ![ζˆͺεœ– 2024-10-13 ζ™šδΈŠ8.14.03](https://hackmd.io/_uploads/H1t5YNF1yx.png) I summarized the data hazards of branch instructions and divided them into three scenarios for discussion. - If the branch instruction's register has a data dependency with the destination register of the second or third preceding ALU instruction, forwarding can resolve the issue. - If the branch instruction's register has a data dependency with the destination register of the immediately preceding ALU instruction or the second preceding load instruction, the pipeline must stall for one clock cycle. - If the branch instruction's register has a data dependency with the destination register of the immediately preceding load instruction, the pipeline must stall for two clock cycles. Based on this principle, I corrected my code and rearranged the la t0 nums instruction to be placed after the li t2 0 instruction without affecting the original program, allowing me to reduce one NOP. ```c main: # Initialize pointers and array size li t6, 3 # Number of elements in nums li t2, 0 # i = 0 la t0, nums # Load nums array start address loop_outer: beq t2, t6, done_outer # If i == size, exit the outer loop ``` --- I used this code as an example. ``` li s0, 0x55555555 # Load mask 0x55555555 and s1, s0, t3 # s1 = u & 0x55555555 srli s2, t3, 1 # s2 = u >> 1 and s2, s2, s0 # s2 = (u >> 1) & 0x55555555 add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555) ``` When it starts executing, it will be translated into the following machine code. ``` c8: 55555437 lui x8 0x55555 cc: 55540413 addi x8 x8 1365 d0: 01c474b3 and x9 x8 x28 d4: 001e5913 srli x18 x28 1 d8: 00897933 and x18 x18 x8 dc: 009905b3 add x11 x18 x9 ``` First, li s0, 0x55555555 will be translated into two instructions: lui x8, 0x55555, followed by addi x8, x8, 1365 to load the mask 0x55555555 into x8. When and s1, s0, t3 reaches the EX stage, since the previous instruction is addi x8, x8, 1365 and there is no load-use hazard, it can execute normally without adding a NOP. (When the li instruction is translated from pseudo code to machine code, it should inherently include a stall.) Next, I used this code as an example.I will delve into the internal circuit behavior. ``` loop_outer: beq t2, t6, done_outer # If i == size, exit the outer loop addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int) addi t2, t2, 1 # Increment i lw t3, -4(t0) # Load nums[i] into t3 (num), note the -4 offset addi x0,x0,0 #NOP mv t4, t3 # Save the original num into t4 for later display jal ra, my_popcnt # Call my_popcnt, step count is stored in a1 ``` When it starts executing, it will be translated into the following machine code. ``` 10: 0bf38063 beq x7 x31 160 <done_outer> 14: 00428293 addi x5 x5 4 18: 00138393 addi x7 x7 1 1c: ffc2ae03 lw x28 -4 x5 20: 00000013 addi x0 x0 0 24: 000e0e93 addi x29 x28 0 28: 09c000ef jal x1 156 <my_popcnt> ``` The first image shows that the lw t3, -4(t0) instruction retrieves the data 0x0000000e in the MEM stage but has not yet been stored in the Mem/WB pipeline register. Therefore, even with forwarding, the correct value cannot be passed back to the EX stage at this time. ![ζˆͺεœ– 2024-10-14 ε‡Œζ™¨12.54.22](https://hackmd.io/_uploads/HyGdsOY11e.png) The second image shows that, through forwarding, even if the addi x29 x28 0 instruction reads an incorrect value from x28, it can be corrected to the right value via forwarding. ![ζˆͺεœ– 2024-10-14 ε‡Œζ™¨12.51.30](https://hackmd.io/_uploads/HJ925dKyyl.png) The issue here confuses me a bit. In my opinion, a circuit with full forwarding capability when dealing with a data hazard, it should only require stalling for one cycle. However, it seems that the branch cannot receive values from two-stage forwarding. I initially thought that if there was a data hazard with an ALU instruction before a branch, the result could be forwarded from the EX stage to the ID stage to determine whether the branch should be taken, requiring only one clock cycle. However, after observing the circuit behavior, I found that the branch after the srli instruction needed two NOPs.I still need to conduct more testing and research to clarify this part. But for now, I have already modified a version without hazard detection. Below is my complete source code. ```c .data nums: .word 14, 8, 123 # Test data nums = 14, 8, 123 input_str: .string "Input: num = " # Defines the string "Input: num = " output_str: .string " Output: " # Defines the string "Output: " .text main: # Initialize pointers and array size li t6, 3 # Number of elements in nums (optimized: hardcoded) li t2, 0 # i = 0 la t0, nums # Load nums array start address loop_outer: beq t2, t6, done_outer # If i == size, exit the outer loop addi t0, t0, 4 # Move the nums pointer forward by 4 bytes (size of an int) addi t2, t2, 1 # Increment i lw t3, -4(t0) # Load nums[i] into t3 (num), note the -4 offset addi x0,x0,0 #NOP mv t4, t3 # Save the original num into t4 for later display jal ra, my_popcnt # Call my_popcnt, step count is stored in a1 # Display the string "Input: num = " la a0, input_str # Load the address of "Input: num = " li a7, 4 # Syscall number 4 (print string) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall # Execute syscall # Display the original num mv a0, t4 # Move the original num into a0 (syscall 1: print integer) li a7, 1 # Syscall number 1 (print integer) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall # Display the number # Display the string " Output: " la a0, output_str # Load the address of " Output: " li a7, 4 # Syscall number 4 (print string) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall # Execute syscall # Display the step count mv a0, a1 # Move the step count (a1) into a0 li a7, 1 # Syscall number 1 (print integer) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall # Display the step count # Print a newline li a0, 0x0A # Newline (0x0A is the newline character) li a7, 11 # Syscall number 11 (print char) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall # Move to the next test data j loop_outer # Jump back to the outer loop done_outer: # Exit the program li a7, 10 # Syscall number 10 (exit) addi x0,x0,0 #NOP addi x0,x0,0 #NOP addi x0,x0,0 #NOP ecall my_popcnt: # u = (u & 0x55555555) + ((u >> 1) & 0x55555555) li s0, 0x55555555 # Load mask 0x55555555 and s1, s0, t3 # s1 = u & 0x55555555 srli s2, t3, 1 # s2 = u >> 1 and s2, s2, s0 # s2 = (u >> 1) & 0x55555555 add a1, s2, s1 # a1 = (u & 0x55555555) + ((u >> 1) & 0x55555555) # u = (u & 0x33333333) + ((u >> 2) & 0x33333333) li s0, 0x33333333 # Load mask 0x33333333 and s1, s0, a1 # s1 = u & 0x33333333 srli s2, a1, 2 # s2 = u >> 2 and s2, s2, s0 # s2 = (u >> 2) & 0x33333333 add a1, s2, s1 # a1 = (u & 0x33333333) + ((u >> 2) & 0x33333333) # u = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) li s0, 0x0F0F0F0F # Load mask 0x0F0F0F0F and s1, s0, a1 # s1 = u & 0x0F0F0F0F srli s2, a1, 4 # s2 = u >> 4 and s2, s2, s0 # s2 = (u >> 4) & 0x0F0F0F0F add a1, s2, s1 # a1 = (u & 0x0F0F0F0F) + ((u >> 4) & 0x0F0F0F0F) # u = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) li s0, 0x00FF00FF # Load mask 0x00FF00FF and s1, s0, a1 # s1 = u & 0x00FF00FF srli s2, a1, 8 # s2 = u >> 8 and s2, s2, s0 # s2 = (u >> 8) & 0x00FF00FF add a1, s2, s1 # a1 = (u & 0x00FF00FF) + ((u >> 8) & 0x00FF00FF) # u = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) li s0, 0x0000FFFF # Load mask 0x0000FFFF and s1, s0, a1 # s1 = u & 0x0000FFFF srli s2, a1, 16 # s2 = u >> 16 and s2, s2, s0 # s2 = (u >> 16) & 0x0000FFFF add a1, s2, s1 # a1 = (u & 0x0000FFFF) + ((u >> 16) & 0x0000FFFF) my_clz: li s0, 0 # s0 = clz = 0 # if ((num >> 16) == 0) srli s1, t3, 16 # s1 = num >> 16 addi x0,x0,0 # NOP addi x0,x0,0 # NOP bnez s1, check_24 # If num >> 16 is not 0, jump to check_24 addi s0, s0, 16 # clz += 16 slli t3, t3, 16 # num <<= 16 check_24: # if ((num >> 24) == 0) srli s1, t3, 24 # s1 = num >> 24 addi x0,x0,0 # NOP addi x0,x0,0 # NOP bnez s1, check_28 # If num >> 24 is not 0, jump to check_28 addi s0, s0, 8 # clz += 8 slli t3, t3, 8 # num <<= 8 check_28: # if ((num >> 28) == 0) srli s1, t3, 28 # s1 = num >> 28 addi x0,x0,0 # NOP addi x0,x0,0 # NOP bnez s1, check_30 # If num >> 28 is not 0, jump to check_30 addi s0, s0, 4 # clz += 4 slli t3, t3, 4 # num <<= 4 check_30: # if ((num >> 30) == 0) srli s1, t3, 30 # s1 = num >> 30 addi x0,x0,0 # NOP addi x0,x0,0 # NOP bnez s1, check_31 # If num >> 30 is not 0, jump to check_31 addi s0, s0, 2 # clz += 2 slli t3, t3, 2 # num <<= 2 check_31: # if ((num >> 31) == 0) srli s1, t3, 31 # s1 = num >> 31 addi x0,x0,0 # NOP addi x0,x0,0 # NOP bnez s1, done # If num >> 31 is not 0, jump to done addi s0, s0, 1 # clz += 1 done: li s1, 31 # s1 = 31 addi x0,x0,0 # NOP sub a0, s1, s0 # a0 = 31 - clz (return value) add a1, a0, a1 # Add clz result to popcnt result ret # Return to main program ``` ### Clz_binary_popcnt (Without hazard detection) | Execution info | | |:-------------- |:----- | | Cycles | 383 | | Instr. retired | 315 | | CPI | 1.22 | | IPC | 0.822 | ## 參考資料 [Quiz1 of Computer Architecture (2024 Fall)](https://hackmd.io/@sysprog/arch2024-quiz1-sol) [Leetcode 1342.Number of Steps to Reduce a Number to Zero](https://leetcode.com/problems/number-of-steps-to-reduce-a-number-to-zero/description/) [The RISC-V Instruction Set Manual](https://riscv.org/technical/specifications/) :::danger Always refer to primary sources, such as official RISC-V documentation. :::