owned this note
owned this note
Published
Linked with GitHub
# Assignment1: RISC-V Assembly and Instruction Pipeline
## Quiz1 Problem C
### fabsf
```c
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer
i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value
x = *(float *)&i; // Write the modified bits back into the float
return x;
}
```
:::danger
Don't paste code snip without comprehensive discussions.
:::
```c
.data
# Define three arbitrary 32-bit floating-point test values
test_value1: .word 0xBF800000 # -1.0 in IEEE 754 floating-point format
test_value2: .word 0x3F800000 # 1.0 in IEEE 754 floating-point format
test_value3: .word 0xC1200000 # -10.0 in IEEE 754 floating-point format
# Storage for the absolute values
result1: .word 0 # Storage for fabsf(test_value1)
result2: .word 0 # Storage for fabsf(test_value2)
result3: .word 0 # Storage for fabsf(test_value3)
.text
.globl main
main:
# Load the first test value and call fabsf
la t0, test_value1 # Load address of test_value1
lw t0, 0(t0) # Load the value into t0
jal ra, fabsf # Call the fabsf function
# Store the result
la t1, result1 # Load address of result1
sw t0, 0(t1) # Store the result
# Print the result
mv a0, t0 # Move the result into a0
li a7, 2 # Syscall code for print float
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Load the second test value and call fabsf
la t0, test_value2 # Load address of test_value2
lw t0, 0(t0) # Load the value into t0
jal ra, fabsf # Call the fabsf function
# Store the result
la t1, result2 # Load address of result2
sw t0, 0(t1) # Store the result
# Print the result
mv a0, t0 # Move the result into a0
li a7, 2 # Syscall code for print float
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Load the third test value and call fabsf
la t0, test_value3 # Load address of test_value3
lw t0, 0(t0) # Load the value into t0
jal ra, fabsf # Call the fabsf function
# Store the result
la t1, result3 # Load address of result3
sw t0, 0(t1) # Store the result
# Print the result
mv a0, t0 # Move the result into a0
li a7, 2 # Syscall code for print float
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Exit the program
li a7, 10 # Syscall code for program exit
ecall
# Function: fabsf
fabsf:
# Remove the sign bit by masking with 0x7FFFFFFF
li t1, 0x7FFFFFFF # Load mask value 0x7FFFFFFF into t1
and t0, t0, t1 # t0 = t0 & t1 (clear the sign bit)
ret
```
### my_clz
```c
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
```
```c
.data
# Define three arbitrary 32-bit test values
test_value1: .word 0x00F0F0F0 # Example value 1
test_value2: .word 0x00000001 # Example value 2
test_value3: .word 0x80000000 # Example value 3
.text
.globl main
main:
# Load the first test value and call my_clz
la t0, test_value1 # Load address of test_value1
lw t0, 0(t0) # Load the value into t0
call my_clz # Call the my_clz function
mv t3, a0 # Move the result to t3 for later use
# Print the result
mv a0, a0 # The result is already in a0
li a7, 1 # Syscall code for print integer
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Load the second test value and call my_clz
la t0, test_value2 # Load address of test_value2
lw t0, 0(t0) # Load the value into t0
call my_clz # Call the my_clz function
mv t4, a0 # Move the result to t4 for later use
# Print the result
mv a0, a0 # The result is already in a0
li a7, 1 # Syscall code for print integer
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Load the third test value and call my_clz
la t0, test_value3 # Load address of test_value3
lw t0, 0(t0) # Load the value into t0
call my_clz # Call the my_clz function
mv t5, a0 # Move the result to t5 for later use
# Print the result
mv a0, a0 # The result is already in a0
li a7, 1 # Syscall code for print integer
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Exit the program
li a7, 10 # Syscall code for program exit
ecall
# Function: my_clz
my_clz:
li t1, 0 # Initialize count to 0
li t2, 31 # Start checking from bit position 31
clz_loop:
blt t2, zero, clz_end # If t2 < 0, exit the loop (all bits checked)
li t3, 1 # Load 1 into t3
sll t3, t3, t2 # Compute (1 << t2), store in t3
and t4, t0, t3 # Perform bitwise AND with x and (1 << t2)
bne t4, zero, clz_end # If the result is non-zero, break the loop
addi t1, t1, 1 # Increment count
addi t2, t2, -1 # Decrement t2 to move to the next bit
j clz_loop # Repeat the loop
clz_end:
mv a0, t1 # Move the count result to a0
ret
```
### fp16_to_fp32
```c
static inline uint32_t fp16_to_fp32(uint16_t h) {
/*
* Extends the 16-bit half-precision floating-point number to 32 bits
* by shifting it to the upper half of a 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Isolates the sign bit from the input number, placing it in the most
* significant bit of a 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extracts the mantissa and exponent from the input number, placing
* them in bits 0-30 of the 32-bit word:
*
* +---+-----+------------+-------------------+
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 30 27-31 17-26 0-16
*/
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
/*
* The renorm_shift variable indicates how many bits the mantissa
* needs to be shifted to normalize the half-precision number.
* For normalized numbers, renorm_shift will be 0. For denormalized
* numbers, renorm_shift will be greater than 0. Shifting a
* denormalized number will move the mantissa into the exponent,
* normalizing it.
*/
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
/*
* If the half-precision number has an exponent of 15, adding a
* specific value will cause overflow into bit 31, which converts
* the upper 9 bits into ones. Thus:
* inf_nan_mask ==
* 0x7F800000 if the half-precision number is
* NaN or infinity (exponent of 15)
* 0x00000000 otherwise
*/
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
/*
* If nonsign equals 0, subtracting 1 will cause overflow, setting
* bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result
* propagates bit 31 across all bits in zero_mask. Thus:
* zero_mask ==
* 0xFFFFFFFF if the half-precision number is
* zero (+0.0h or -0.0h)
* 0x00000000 otherwise
*/
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
/*
* 1. Shifts nonsign left by renorm_shift to normalize it (for denormal
* inputs).
* 2. Shifts nonsign right by 3, adjusting the exponent to fit in the
* 8-bit exponent field and moving the mantissa into the correct
* position within the 23-bit mantissa field of the single-precision
* format.
* 3. Adds 0x70 to the exponent to account for the difference in bias
* between half-precision and single-precision.
* 4. Subtracts renorm_shift from the exponent to account for any
* renormalization that occurred.
* 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input
* was NaN or infinity.
* 6. ANDs with the inverted zero_mask to set the mantissa and exponent
* to zero if the input was zero.
* 7. Combines everything with the sign bit of the input number.
*/
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
```c
.data
# Input half-precision floating-point numbers
input1: .half 0x3800 # 1st half-precision input
input2: .half 0x3C00 # 2nd half-precision input
input3: .half 0x7C00 # 3rd half-precision input
# Storage for the converted single-precision floating-point results
result1: .word 0 # Storage for 1st conversion result
result2: .word 0 # Storage for 2nd conversion result
result3: .word 0 # Storage for 3rd conversion result
.text
.globl main
main:
# Load the first input and call the conversion function
lh t0, input1 # Load the first half-precision float from memory into t0
call fp16_to_fp32 # Call the conversion function
la t1, result1 # Load the address of result1
sw a0, 0(t1) # Store the conversion result into result1
# Load the second input and call the conversion function
lh t0, input2 # Load the second half-precision float into t0
call fp16_to_fp32 # Call the conversion function
la t1, result2 # Load the address of result2
sw a0, 0(t1) # Store the conversion result into result2
# Load the third input and call the conversion function
lh t0, input3 # Load the third half-precision float into t0
call fp16_to_fp32 # Call the conversion function
la t1, result3 # Load the address of result3
sw a0, 0(t1) # Store the conversion result into result3
# Exit the program
li a7, 10 # System call number 10 indicates program exit
ecall
# Function section
# Convert half-precision float to single-precision float
fp16_to_fp32:
# t0 contains the 16-bit half-precision floating-point number
# Step 1: Extract the sign bit
srai t1, t0, 15 # Right shift by 15 bits to get the sign bit
andi t1, t1, 0x1 # Keep only the least significant bit (sign bit)
slli t1, t1, 31 # Left shift by 31 bits to position the sign bit at the highest bit
# Step 2: Extract the exponent bits
srai t2, t0, 10 # Right shift by 10 bits to get the exponent bits
andi t2, t2, 0x1F # Keep only 5 bits of exponent
# Step 3: Extract the mantissa bits
andi t3, t0, 0x3FF # Keep only 10 bits of mantissa
# Step 4: Handle special cases
beqz t2, fp16_zero_or_subnormal # If exponent is zero, handle zero or subnormal numbers
li t4, 0x1F
beq t2, t4, fp16_infinite_or_nan # If exponent is all ones, handle infinity or NaN
# Handle normalized numbers
addi t2, t2, 112 # Adjust exponent bias: t2 = exponent + (127 - 15)
slli t2, t2, 23 # Shift exponent to the correct position
slli t3, t3, 13 # Shift mantissa to the correct position
or t2, t2, t3 # Combine exponent and mantissa
or a0, t1, t2 # Combine sign bit with the rest
ret
fp16_zero_or_subnormal:
# Handle zero or subnormal numbers
beqz t3, fp16_zero # If mantissa is zero, it's zero
# Normalize subnormal numbers
mv t5, t3 # Copy mantissa to t5
call my_clz_16 # Calculate the number of leading zeros in t5
mv t4, a0 # t4 = number of leading zeros
# Adjust exponent
li t6, 113 # t6 = 113 (127 - 15 + 1)
sub t2, t6, t4 # t2 = 113 - number of leading zeros
# Adjust mantissa
addi a7, t4, 1 # t7 = number of leading zeros + 1
sll t3, t3, a7 # Left shift mantissa to normalize
# Shift mantissa to the correct position
slli t3, t3, 12 # Left shift by 12 bits to position mantissa at [22:0]
# Combine exponent and mantissa
slli t2, t2, 23 # Shift exponent to bits [30:23]
or t2, t2, t3 # Combine exponent and mantissa
or a0, t1, t2 # Combine sign bit with the rest
ret
fp16_infinite_or_nan:
# Handle infinity or NaN
li t2, 0xFF # Set exponent bits to 0xFF
slli t2, t2, 23 # Shift exponent to the correct position
slli t3, t3, 13 # Shift mantissa to the correct position
or t2, t2, t3 # Combine exponent and mantissa
or a0, t1, t2 # Combine sign bit with the rest
ret
fp16_zero:
# Handle zero
mv a0, t1 # Result is just the sign bit (zero)
ret
# Calculate the number of leading zeros in a 16-bit integer
my_clz_16:
# t5 contains a 16-bit value
# Return value is in a0
li t1, 0 # Initialize counter
li t2, 15 # Start from the highest bit
clz16_loop:
bltz t2, clz16_end # If t2 < 0, exit the loop
srl t3, t5, t2 # Right shift t5 by t2 bits
andi t3, t3, 1
clz16_end:
mv a0, t1
ret
```
## LeetCode : 2348. Number of Zero-Filled
https://leetcode.com/problems/number-of-zero-filled-subarrays/
Given an integer array `nums`, return the number of subarrays filled with `0`.
A subarray is a contiguous non-empty sequence of elements within an array.
**Example 1:**
> **Input:** nums = [1,3,0,0,2,0,0,4]
**Output:** 6
**Explanation:**
There are 4 occurrences of [0] as a subarray.
There are 2 occurrences of [0,0] as a subarray.
There is no occurrence of a subarray with a size more than 2 filled with 0. Therefore, we return 6.
**Example 2:**
> **Input:** nums = [0,0,0,2,0,0]
**Output:** 9
**Explanation:**
There are 5 occurrences of [0] as a subarray.
There are 3 occurrences of [0,0] as a subarray.
There is 1 occurrence of [0,0,0] as a subarray.
There is no occurrence of a subarray with a size more than 3 filled with 0. Therefore, we return 9.
**Example 3:**
> **Input:** nums = [2,10,2019]
**Output:** 0
**Explanation:** There is no subarray filled with 0. Therefore, we return 0.
**Constraints:**
* `1 <= nums.length <= 10^5`
* `-10^9 <= nums[i] <= 10^9`
## Solution
> *With C programming.*
### First version
> *without using the clz_function*
This code uses a `for` loop to iterate through each element of the array. For each element:
If the current element is zero, `count_tmp += 1` (indicating how many consecutive zeros have been encountered so far).
If the current element is not zero, `count_tmp` is reset to 0, meaning the current sequence of consecutive zeros has ended.
With each new zero encountered, the count of subarrays is progressively accumulated. For example, if the number of consecutive zeros is `k`, then the number of subarrays formed by these zeros is `1 + 2 + 3 + ... + k`. This is accumulated in the result through `count_tmp` in the loop.
```c
#include <stdio.h>
long long zeroFilledSubarray(int* nums, int numsSize) {
short count_tmp = 0;
long long result = 0;
for(int i = 0; i < numsSize; i++) {
if (nums[i] == 0) {
count_tmp += 1;
result += count_tmp;
} else{
count_tmp = 0;
}
}
return result;
}
int main() {
// Example 1
int nums1[] = {1, 3, 0, 0, 2, 0, 0, 4};
int numsSize1 = sizeof(nums1) / sizeof(nums1[0]);
long long result1 = zeroFilledSubarray(nums1, numsSize1);
printf("Example 1 - The number of subarrays filled with 0: %lld\n", result1);
// Example 2
int nums2[] = {0, 0, 0, 2, 0, 0};
int numsSize2 = sizeof(nums2) / sizeof(nums2[0]);
long long result2 = zeroFilledSubarray(nums2, numsSize2);
printf("Example 2 - The number of subarrays filled with 0: %lld\n", result2);
// Example 3
int nums3[] = {2, 10, 2019};
int numsSize3 = sizeof(nums3) / sizeof(nums3[0]);
long long result3 = zeroFilledSubarray(nums3, numsSize3);
printf("Example 3 - The number of subarrays filled with 0: %lld\n", result3);
return 0;
}
```
### Second version
> *with the clz_function*
Based on the statement "If the number of consecutive zeros is `k`, then the number of subarrays is `1 + 2 + 3 + ... + k`" we can say that if we can directly determine `k` using the `clz_function` function, then we can calculate the result using the trapezoid area formula: `(k + 1) * k / 2`.
```c
#include <stdio.h>
static inline short clz(int* nums, int numsStart) {
int count = 0;
for (int i = numsStart - 1; i >= 0; --i) {
if (nums[i] != 0)
break;
count++;
}
return count;
}
long long zeroFilledSubarray(int* nums, int numsSize) {
long long result = 0;
short zero_count = 0;
while (numsSize > 0) {
zero_count = clz(nums, numsSize);
result += (zero_count + 1) * zero_count / 2;
numsSize -= zero_count+1;
}
return result;
}
int main() {
// Example 1
int nums1[] = {1, 3, 0, 0, 2, 0, 0, 4};
int numsSize1 = sizeof(nums1) / sizeof(nums1[0]);
long long result1 = zeroFilledSubarray(nums1, numsSize1);
printf("Example 1 - The number of subarrays filled with 0: %lld\n", result1);
// Example 2
int nums2[] = {0, 0, 0, 2, 0, 0};
int numsSize2 = sizeof(nums2) / sizeof(nums2[0]);
long long result2 = zeroFilledSubarray(nums2, numsSize2);
printf("Example 2 - The number of subarrays filled with 0: %lld\n", result2);
// Example 3
int nums3[] = {2, 10, 2019};
int numsSize3 = sizeof(nums3) / sizeof(nums3[0]);
long long result3 = zeroFilledSubarray(nums3, numsSize3);
printf("Example 3 - The number of subarrays filled with 0: %lld\n", result3);
return 0;
}
```
## assembly code
```c
.data
nums1: .word 1, 3, 0, 0, 2, 0, 0, 4
numsSize1: .word 8
nums2: .word 0, 0, 0, 2 ,0 ,0
numsSize2: .word 6
nums3: .word 2, 10, 2019
numsSize3: .word 3
.text
.globl main
main:
# Initialize pointers for nums1 array
la t0, nums1 # Load the address of nums1 into t0
lw t1, numsSize1
call zeroFilledSubarray
# Print result
mv a0, a2 # Move result to a1 for printing
li a7, 1 # Print integer syscall
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Initialize pointers for nums1 array
la t0, nums2 # Load the address of nums2 into t0
lw t1, numsSize2
call zeroFilledSubarray
# Print result
mv a0, a2 # Move result to a1 for printing
li a7, 1 # Print integer syscall
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Initialize pointers for nums1 array
la t0, nums3 # Load the address of nums3 into t0
lw t1, numsSize3
call zeroFilledSubarray
# Print result
mv a0, a2 # Move result to a1 for printing
li a7, 1 # Print integer syscall
ecall
# Print newline
li a0, 10 # ASCII code for newline '\n'
li a7, 11 # Syscall code for print character
ecall
# Exit
li a7, 10 # Exit syscall
ecall
zeroFilledSubarray:
# Arguments:
# t0: nums (pointer to array)
# t1: numsSize (size of array)
addi sp, sp, -12 # Allocate stack space for 3 registers
sw ra, 8(sp) # Save return address
sw s1, 4(sp) # Save s1 (zero_count)
sw s2, 0(sp) # Save s2 (result)
# Initialize result and zero_count
li s2, 0 # result = 0
li s1, 0 # zero_count = 0
loop:
beqz t1, end # If numsSize == 0, exit loop
# Call clz function
call clz
# Update result
mv s1, a1 # Move clz result to s1 (zero_count)
addi t2, s1, 1 # t2 = zero_count + 1
mul t2, t2, s1 # t2 = (zero_count + 1) * zero_count
srai t2, t2, 1 # t2 = t2 / 2
add s2, s2, t2 # result += t2
# Update nums pointer and numsSize
addi t0, t0, 4
beqz t1, loop # If numsSize == 0, numsSize don't have to -1
addi t1, t1, -1 # numsSize -= 1 (for the non-zero element)
j loop # Repeat loop
end:
# Load count to return
mv a2, s2 # Move count to a0
# Restore registers and return
lw ra, 8(sp) # Restore return address
lw s1, 4(sp) # Restore s1
lw s2, 0(sp) # Restore s2
addi sp, sp, 12 # Deallocate stack space
ret
clz:
# Arguments:
# t0: nums (pointer to array)
# t1: numsStart (start index in array)
addi sp, sp, -8 # Allocate stack space for 2 registers
sw ra, 4(sp) # Save return address
sw s0, 0(sp) # Save s0 (count)
# Initialize count to 0
li s0, 0 # count = 0
# Loop from numsStart - 1 to 0
loop_clz:
beqz t1, end_clz # If a1 == 0, exit loop
# Load value from nums
lb a0, 0(t0) # Load byte from nums
bnez a0, end_clz # If nums[i] != 0, exit loop
addi s0, s0, 1 # Increment count
addi t0, t0, 4
addi t1, t1, -1
j loop_clz # Repeat loop
end_clz:
# Load count to return
mv a1, s0 # Move count to a0
# Restore registers and return
lw ra, 4(sp) # Restore return address
lw s0, 0(sp) # Restore s0
addi sp, sp, 8 # Deallocate stack space
ret
```