Computer Architecture Hw1

# Assignment1: RISC-V Assembly and Instruction Pipeline ## Quiz1 Problem C ### fabsf ```c static inline float fabsf(float x) { uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value x = *(float *)&i; // Write the modified bits back into the float return x; } ``` :::danger Don't paste code snip without comprehensive discussions. ::: ```c .data # Define three arbitrary 32-bit floating-point test values test_value1: .word 0xBF800000 # -1.0 in IEEE 754 floating-point format test_value2: .word 0x3F800000 # 1.0 in IEEE 754 floating-point format test_value3: .word 0xC1200000 # -10.0 in IEEE 754 floating-point format # Storage for the absolute values result1: .word 0 # Storage for fabsf(test_value1) result2: .word 0 # Storage for fabsf(test_value2) result3: .word 0 # Storage for fabsf(test_value3) .text .globl main main: # Load the first test value and call fabsf la t0, test_value1 # Load address of test_value1 lw t0, 0(t0) # Load the value into t0 jal ra, fabsf # Call the fabsf function # Store the result la t1, result1 # Load address of result1 sw t0, 0(t1) # Store the result # Print the result mv a0, t0 # Move the result into a0 li a7, 2 # Syscall code for print float ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Load the second test value and call fabsf la t0, test_value2 # Load address of test_value2 lw t0, 0(t0) # Load the value into t0 jal ra, fabsf # Call the fabsf function # Store the result la t1, result2 # Load address of result2 sw t0, 0(t1) # Store the result # Print the result mv a0, t0 # Move the result into a0 li a7, 2 # Syscall code for print float ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Load the third test value and call fabsf la t0, test_value3 # Load address of test_value3 lw t0, 0(t0) # Load the value into t0 jal ra, fabsf # Call the fabsf function # Store the result la t1, result3 # Load address of result3 sw t0, 0(t1) # Store the result # Print the result mv a0, t0 # Move the result into a0 li a7, 2 # Syscall code for print float ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Exit the program li a7, 10 # Syscall code for program exit ecall # Function: fabsf fabsf: # Remove the sign bit by masking with 0x7FFFFFFF li t1, 0x7FFFFFFF # Load mask value 0x7FFFFFFF into t1 and t0, t0, t1 # t0 = t0 & t1 (clear the sign bit) ret ``` ### my_clz ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } ``` ```c .data # Define three arbitrary 32-bit test values test_value1: .word 0x00F0F0F0 # Example value 1 test_value2: .word 0x00000001 # Example value 2 test_value3: .word 0x80000000 # Example value 3 .text .globl main main: # Load the first test value and call my_clz la t0, test_value1 # Load address of test_value1 lw t0, 0(t0) # Load the value into t0 call my_clz # Call the my_clz function mv t3, a0 # Move the result to t3 for later use # Print the result mv a0, a0 # The result is already in a0 li a7, 1 # Syscall code for print integer ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Load the second test value and call my_clz la t0, test_value2 # Load address of test_value2 lw t0, 0(t0) # Load the value into t0 call my_clz # Call the my_clz function mv t4, a0 # Move the result to t4 for later use # Print the result mv a0, a0 # The result is already in a0 li a7, 1 # Syscall code for print integer ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Load the third test value and call my_clz la t0, test_value3 # Load address of test_value3 lw t0, 0(t0) # Load the value into t0 call my_clz # Call the my_clz function mv t5, a0 # Move the result to t5 for later use # Print the result mv a0, a0 # The result is already in a0 li a7, 1 # Syscall code for print integer ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Exit the program li a7, 10 # Syscall code for program exit ecall # Function: my_clz my_clz: li t1, 0 # Initialize count to 0 li t2, 31 # Start checking from bit position 31 clz_loop: blt t2, zero, clz_end # If t2 < 0, exit the loop (all bits checked) li t3, 1 # Load 1 into t3 sll t3, t3, t2 # Compute (1 << t2), store in t3 and t4, t0, t3 # Perform bitwise AND with x and (1 << t2) bne t4, zero, clz_end # If the result is non-zero, break the loop addi t1, t1, 1 # Increment count addi t2, t2, -1 # Decrement t2 to move to the next bit j clz_loop # Repeat the loop clz_end: mv a0, t1 # Move the count result to a0 ret ``` ### fp16_to_fp32 ```c static inline uint32_t fp16_to_fp32(uint16_t h) { /* * Extends the 16-bit half-precision floating-point number to 32 bits * by shifting it to the upper half of a 32-bit word: * +---+-----+------------+-------------------+ * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| * +---+-----+------------+-------------------+ * Bits 31 26-30 16-25 0-15 * * S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits. */ const uint32_t w = (uint32_t) h << 16; /* * Isolates the sign bit from the input number, placing it in the most * significant bit of a 32-bit word: * * +---+----------------------------------+ * | S |0000000 00000000 00000000 00000000| * +---+----------------------------------+ * Bits 31 0-31 */ const uint32_t sign = w & UINT32_C(0x80000000); /* * Extracts the mantissa and exponent from the input number, placing * them in bits 0-30 of the 32-bit word: * * +---+-----+------------+-------------------+ * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| * +---+-----+------------+-------------------+ * Bits 30 27-31 17-26 0-16 */ const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); /* * The renorm_shift variable indicates how many bits the mantissa * needs to be shifted to normalize the half-precision number. * For normalized numbers, renorm_shift will be 0. For denormalized * numbers, renorm_shift will be greater than 0. Shifting a * denormalized number will move the mantissa into the exponent, * normalizing it. */ uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; /* * If the half-precision number has an exponent of 15, adding a * specific value will cause overflow into bit 31, which converts * the upper 9 bits into ones. Thus: * inf_nan_mask == * 0x7F800000 if the half-precision number is * NaN or infinity (exponent of 15) * 0x00000000 otherwise */ const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); /* * If nonsign equals 0, subtracting 1 will cause overflow, setting * bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result * propagates bit 31 across all bits in zero_mask. Thus: * zero_mask == * 0xFFFFFFFF if the half-precision number is * zero (+0.0h or -0.0h) * 0x00000000 otherwise */ const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; /* * 1. Shifts nonsign left by renorm_shift to normalize it (for denormal * inputs). * 2. Shifts nonsign right by 3, adjusting the exponent to fit in the * 8-bit exponent field and moving the mantissa into the correct * position within the 23-bit mantissa field of the single-precision * format. * 3. Adds 0x70 to the exponent to account for the difference in bias * between half-precision and single-precision. * 4. Subtracts renorm_shift from the exponent to account for any * renormalization that occurred. * 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input * was NaN or infinity. * 6. ANDs with the inverted zero_mask to set the mantissa and exponent * to zero if the input was zero. * 7. Combines everything with the sign bit of the input number. */ return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } ``` ```c .data # Input half-precision floating-point numbers input1: .half 0x3800 # 1st half-precision input input2: .half 0x3C00 # 2nd half-precision input input3: .half 0x7C00 # 3rd half-precision input # Storage for the converted single-precision floating-point results result1: .word 0 # Storage for 1st conversion result result2: .word 0 # Storage for 2nd conversion result result3: .word 0 # Storage for 3rd conversion result .text .globl main main: # Load the first input and call the conversion function lh t0, input1 # Load the first half-precision float from memory into t0 call fp16_to_fp32 # Call the conversion function la t1, result1 # Load the address of result1 sw a0, 0(t1) # Store the conversion result into result1 # Load the second input and call the conversion function lh t0, input2 # Load the second half-precision float into t0 call fp16_to_fp32 # Call the conversion function la t1, result2 # Load the address of result2 sw a0, 0(t1) # Store the conversion result into result2 # Load the third input and call the conversion function lh t0, input3 # Load the third half-precision float into t0 call fp16_to_fp32 # Call the conversion function la t1, result3 # Load the address of result3 sw a0, 0(t1) # Store the conversion result into result3 # Exit the program li a7, 10 # System call number 10 indicates program exit ecall # Function section # Convert half-precision float to single-precision float fp16_to_fp32: # t0 contains the 16-bit half-precision floating-point number # Step 1: Extract the sign bit srai t1, t0, 15 # Right shift by 15 bits to get the sign bit andi t1, t1, 0x1 # Keep only the least significant bit (sign bit) slli t1, t1, 31 # Left shift by 31 bits to position the sign bit at the highest bit # Step 2: Extract the exponent bits srai t2, t0, 10 # Right shift by 10 bits to get the exponent bits andi t2, t2, 0x1F # Keep only 5 bits of exponent # Step 3: Extract the mantissa bits andi t3, t0, 0x3FF # Keep only 10 bits of mantissa # Step 4: Handle special cases beqz t2, fp16_zero_or_subnormal # If exponent is zero, handle zero or subnormal numbers li t4, 0x1F beq t2, t4, fp16_infinite_or_nan # If exponent is all ones, handle infinity or NaN # Handle normalized numbers addi t2, t2, 112 # Adjust exponent bias: t2 = exponent + (127 - 15) slli t2, t2, 23 # Shift exponent to the correct position slli t3, t3, 13 # Shift mantissa to the correct position or t2, t2, t3 # Combine exponent and mantissa or a0, t1, t2 # Combine sign bit with the rest ret fp16_zero_or_subnormal: # Handle zero or subnormal numbers beqz t3, fp16_zero # If mantissa is zero, it's zero # Normalize subnormal numbers mv t5, t3 # Copy mantissa to t5 call my_clz_16 # Calculate the number of leading zeros in t5 mv t4, a0 # t4 = number of leading zeros # Adjust exponent li t6, 113 # t6 = 113 (127 - 15 + 1) sub t2, t6, t4 # t2 = 113 - number of leading zeros # Adjust mantissa addi a7, t4, 1 # t7 = number of leading zeros + 1 sll t3, t3, a7 # Left shift mantissa to normalize # Shift mantissa to the correct position slli t3, t3, 12 # Left shift by 12 bits to position mantissa at [22:0] # Combine exponent and mantissa slli t2, t2, 23 # Shift exponent to bits [30:23] or t2, t2, t3 # Combine exponent and mantissa or a0, t1, t2 # Combine sign bit with the rest ret fp16_infinite_or_nan: # Handle infinity or NaN li t2, 0xFF # Set exponent bits to 0xFF slli t2, t2, 23 # Shift exponent to the correct position slli t3, t3, 13 # Shift mantissa to the correct position or t2, t2, t3 # Combine exponent and mantissa or a0, t1, t2 # Combine sign bit with the rest ret fp16_zero: # Handle zero mv a0, t1 # Result is just the sign bit (zero) ret # Calculate the number of leading zeros in a 16-bit integer my_clz_16: # t5 contains a 16-bit value # Return value is in a0 li t1, 0 # Initialize counter li t2, 15 # Start from the highest bit clz16_loop: bltz t2, clz16_end # If t2 < 0, exit the loop srl t3, t5, t2 # Right shift t5 by t2 bits andi t3, t3, 1 clz16_end: mv a0, t1 ret ``` ## LeetCode : 2348. Number of Zero-Filled https://leetcode.com/problems/number-of-zero-filled-subarrays/ Given an integer array `nums`, return the number of subarrays filled with `0`. A subarray is a contiguous non-empty sequence of elements within an array. **Example 1:** > **Input:** nums = [1,3,0,0,2,0,0,4] **Output:** 6 **Explanation:** There are 4 occurrences of [0] as a subarray. There are 2 occurrences of [0,0] as a subarray. There is no occurrence of a subarray with a size more than 2 filled with 0. Therefore, we return 6. **Example 2:** > **Input:** nums = [0,0,0,2,0,0] **Output:** 9 **Explanation:** There are 5 occurrences of [0] as a subarray. There are 3 occurrences of [0,0] as a subarray. There is 1 occurrence of [0,0,0] as a subarray. There is no occurrence of a subarray with a size more than 3 filled with 0. Therefore, we return 9. **Example 3:** > **Input:** nums = [2,10,2019] **Output:** 0 **Explanation:** There is no subarray filled with 0. Therefore, we return 0. **Constraints:** * `1 <= nums.length <= 10^5` * `-10^9 <= nums[i] <= 10^9` ## Solution > *With C programming.* ### First version > *without using the clz_function* This code uses a `for` loop to iterate through each element of the array. For each element: If the current element is zero, `count_tmp += 1` (indicating how many consecutive zeros have been encountered so far). If the current element is not zero, `count_tmp` is reset to 0, meaning the current sequence of consecutive zeros has ended. With each new zero encountered, the count of subarrays is progressively accumulated. For example, if the number of consecutive zeros is `k`, then the number of subarrays formed by these zeros is `1 + 2 + 3 + ... + k`. This is accumulated in the result through `count_tmp` in the loop. ```c #include <stdio.h> long long zeroFilledSubarray(int* nums, int numsSize) { short count_tmp = 0; long long result = 0; for(int i = 0; i < numsSize; i++) { if (nums[i] == 0) { count_tmp += 1; result += count_tmp; } else{ count_tmp = 0; } } return result; } int main() { // Example 1 int nums1[] = {1, 3, 0, 0, 2, 0, 0, 4}; int numsSize1 = sizeof(nums1) / sizeof(nums1[0]); long long result1 = zeroFilledSubarray(nums1, numsSize1); printf("Example 1 - The number of subarrays filled with 0: %lld\n", result1); // Example 2 int nums2[] = {0, 0, 0, 2, 0, 0}; int numsSize2 = sizeof(nums2) / sizeof(nums2[0]); long long result2 = zeroFilledSubarray(nums2, numsSize2); printf("Example 2 - The number of subarrays filled with 0: %lld\n", result2); // Example 3 int nums3[] = {2, 10, 2019}; int numsSize3 = sizeof(nums3) / sizeof(nums3[0]); long long result3 = zeroFilledSubarray(nums3, numsSize3); printf("Example 3 - The number of subarrays filled with 0: %lld\n", result3); return 0; } ``` ### Second version > *with the clz_function* Based on the statement "If the number of consecutive zeros is `k`, then the number of subarrays is `1 + 2 + 3 + ... + k`" we can say that if we can directly determine `k` using the `clz_function` function, then we can calculate the result using the trapezoid area formula: `(k + 1) * k / 2`. ```c #include <stdio.h> static inline short clz(int* nums, int numsStart) { int count = 0; for (int i = numsStart - 1; i >= 0; --i) { if (nums[i] != 0) break; count++; } return count; } long long zeroFilledSubarray(int* nums, int numsSize) { long long result = 0; short zero_count = 0; while (numsSize > 0) { zero_count = clz(nums, numsSize); result += (zero_count + 1) * zero_count / 2; numsSize -= zero_count+1; } return result; } int main() { // Example 1 int nums1[] = {1, 3, 0, 0, 2, 0, 0, 4}; int numsSize1 = sizeof(nums1) / sizeof(nums1[0]); long long result1 = zeroFilledSubarray(nums1, numsSize1); printf("Example 1 - The number of subarrays filled with 0: %lld\n", result1); // Example 2 int nums2[] = {0, 0, 0, 2, 0, 0}; int numsSize2 = sizeof(nums2) / sizeof(nums2[0]); long long result2 = zeroFilledSubarray(nums2, numsSize2); printf("Example 2 - The number of subarrays filled with 0: %lld\n", result2); // Example 3 int nums3[] = {2, 10, 2019}; int numsSize3 = sizeof(nums3) / sizeof(nums3[0]); long long result3 = zeroFilledSubarray(nums3, numsSize3); printf("Example 3 - The number of subarrays filled with 0: %lld\n", result3); return 0; } ``` ## assembly code ```c .data nums1: .word 1, 3, 0, 0, 2, 0, 0, 4 numsSize1: .word 8 nums2: .word 0, 0, 0, 2 ,0 ,0 numsSize2: .word 6 nums3: .word 2, 10, 2019 numsSize3: .word 3 .text .globl main main: # Initialize pointers for nums1 array la t0, nums1 # Load the address of nums1 into t0 lw t1, numsSize1 call zeroFilledSubarray # Print result mv a0, a2 # Move result to a1 for printing li a7, 1 # Print integer syscall ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Initialize pointers for nums1 array la t0, nums2 # Load the address of nums2 into t0 lw t1, numsSize2 call zeroFilledSubarray # Print result mv a0, a2 # Move result to a1 for printing li a7, 1 # Print integer syscall ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Initialize pointers for nums1 array la t0, nums3 # Load the address of nums3 into t0 lw t1, numsSize3 call zeroFilledSubarray # Print result mv a0, a2 # Move result to a1 for printing li a7, 1 # Print integer syscall ecall # Print newline li a0, 10 # ASCII code for newline '\n' li a7, 11 # Syscall code for print character ecall # Exit li a7, 10 # Exit syscall ecall zeroFilledSubarray: # Arguments: # t0: nums (pointer to array) # t1: numsSize (size of array) addi sp, sp, -12 # Allocate stack space for 3 registers sw ra, 8(sp) # Save return address sw s1, 4(sp) # Save s1 (zero_count) sw s2, 0(sp) # Save s2 (result) # Initialize result and zero_count li s2, 0 # result = 0 li s1, 0 # zero_count = 0 loop: beqz t1, end # If numsSize == 0, exit loop # Call clz function call clz # Update result mv s1, a1 # Move clz result to s1 (zero_count) addi t2, s1, 1 # t2 = zero_count + 1 mul t2, t2, s1 # t2 = (zero_count + 1) * zero_count srai t2, t2, 1 # t2 = t2 / 2 add s2, s2, t2 # result += t2 # Update nums pointer and numsSize addi t0, t0, 4 beqz t1, loop # If numsSize == 0, numsSize don't have to -1 addi t1, t1, -1 # numsSize -= 1 (for the non-zero element) j loop # Repeat loop end: # Load count to return mv a2, s2 # Move count to a0 # Restore registers and return lw ra, 8(sp) # Restore return address lw s1, 4(sp) # Restore s1 lw s2, 0(sp) # Restore s2 addi sp, sp, 12 # Deallocate stack space ret clz: # Arguments: # t0: nums (pointer to array) # t1: numsStart (start index in array) addi sp, sp, -8 # Allocate stack space for 2 registers sw ra, 4(sp) # Save return address sw s0, 0(sp) # Save s0 (count) # Initialize count to 0 li s0, 0 # count = 0 # Loop from numsStart - 1 to 0 loop_clz: beqz t1, end_clz # If a1 == 0, exit loop # Load value from nums lb a0, 0(t0) # Load byte from nums bnez a0, end_clz # If nums[i] != 0, exit loop addi s0, s0, 1 # Increment count addi t0, t0, 4 addi t1, t1, -1 j loop_clz # Repeat loop end_clz: # Load count to return mv a1, s0 # Move count to a0 # Restore registers and return lw ra, 4(sp) # Restore return address lw s0, 0(sp) # Restore s0 addi sp, sp, 8 # Deallocate stack space ret ```

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.