Try   HackMD

Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by < 李尚宸 >

Quiz1 Problem C (Counting Leading Zeros)

Choose one problem (A, B, or C) from Quiz1, translate it from C code to a complete RISC-V assembly program, and include the relevant test data.

C programs

fabsf.c

#include <stdio.h>
#include <stdint.h>

static inline float fabsf(float x) {
    uint32_t i = *(uint32_t *)&x;
    i &= 0x7FFFFFFF;
    x = *(float *)&i;
    return x;
}

int main() {
    float float_nums[] = {3.14f, -3.14f, 0.0f, -0.0f, 1.0f, -1.0f, 123.99f, -123.99f};
    int n = sizeof(float_nums) / sizeof(float_nums[0]);
    for (int i = 0; i < n; i++) {
        printf("%f\n", fabsf(float_nums[i]));
    }
    return 0;
}

my_clz.c

#include <stdio.h>
#include <stdint.h>

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

int main() {
    /* 
        Test case 0: Input 0, binary: 00000000 00000000 00000000 00000000, Expected: 32
        Test case 1: Input 1, binary: 00000000 00000000 00000000 00000001, Expected: 31
        Test case 2: Input 1024, binary: 00000000 00000000 00000100 00000000, Expected: 21
    */
    int test_case[] = {0, 1, 1024};

    for (int i = 0; i < 3; ++i) {
        printf("%d\n", my_clz(test_case[i]));
    }
    
    return 0;
}

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

Show the full C source code corresponding to the original problem set.

fp16_to_fp32.c

#include <stdio.h>
#include <stdint.h>

static inline uint32_t fp16_to_fp32(uint16_t h) {
    const uint32_t w = (uint32_t)h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
    uint32_t renorm_shift = __builtin_clz(nonsign);
    renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
    const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
    const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;

    return sign | ((((nonsign << renorm_shift >> 3) +
            ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}

void test_fp16_to_fp32(uint16_t h) {
    uint32_t result = fp16_to_fp32(h);
    printf("fp16: 0x%04X, fp32: 0x%08X\n", h, result);
}

int main() {
    uint16_t test_cases[] = {
        0x3C00,  // 1.0 in half-precision
        0x4000,  // 2.0 in half-precision
        0x4040,  // 3.0 in half-precision
        0x0000,  // +0.0 in half-precision
        0x8000,  // -0.0 in half-precision
        0x7C00,  // +Inf in half-precision
        0xFC00,  // -Inf in half-precision
        0x7E00,  // NaN in half-precision
        0xB800,  // -2.0 in half-precision
        0x0400   // Very small positive number in half-precision
    };

    int num_tests = sizeof(test_cases) / sizeof(test_cases[0]);
    for (int i = 0; i < num_tests; i++) {
        test_fp16_to_fp32(test_cases[i]);
    }

    return 0;
}

Assembly programs

fabsf.s

.data
.align 4
    
float_nums:
    .word 0x4048F5C3  # 3.14f (IEEE 754 representation)
    .word 0xC048F5C3  # -3.14f
    .word 0x00000000  # 0.0f
    .word 0x80000000  # -0.0f
    .word 0x3F800000  # 1.0f
    .word 0xBF800000  # -1.0f
    .word 0x42F7E666  # 123.99f
    .word 0xC2F7E666  # -123.99f

newline:
    .asciz "\n" 

.text
.globl main

main:
    la t0, float_nums 
    li t1, 8                # number of elements in float_nums

loop:
    lw t2, 0(t0)            # float_nums[i]
    li t3, 0x7FFFFFFF       # clear the sign bit
    and t2, t2, t3          # float_nums[i] = float_nums[i] & 0x7FFFFFFF

    jal ra, print_float

    addi t0, t0, 4          # Next element
    addi t1, t1, -1         # t1--
    bnez t1, loop           # If (t1 != 0), continue the loop

    li a0, 0                # Exit status code
    li a7, 93               # System call for exit
    ecall 

print_float:
    li a7, 2    
    mv a0, t2  
    ecall  

    li a7, 4   
    la a0, newline
    ecall

    ret 

my_clz.s

.data
num_test_cases: 
    .word 3

test_case_array: 
    .word 0, 1, 1024
    
newline:
    .asciz "\n" 

.text
.globl main

main:
    la t0, num_test_cases 
    lw t1, 0(t0)
    la t0, test_case_array
    li t2, 0  

loop_main:
    beq t1, t2, end_main    # if (i == number) of test cases, exit loop
    lw a0, 0(t0)            # test_case[i]

    jal ra, my_clz

    li a7, 1                # System call code for print integer
    ecall

    la a0, newline  
    li a7, 4                # System call code for print string
    ecall

    addi t0, t0, 4          # Next test_case
    addi t2, t2, 1          # i++
    j loop_main

end_main:
    li a7, 10               # System call code for exit
    ecall

my_clz:
    li t3, 31               # i = 31
    li t4, 0                # count = 0

loop_my_clz:
    li t5, 1                # t5 = 1
    sll t5, t5, t3          # t5 = 1 << i
    and t5, a0, t5          # t5 = x & (1 << i)
    bnez t5, end_my_clz     # if ((x & (1 << i)) != 0), exit loop
    addi t4, t4, 1          # count++
    addi t3, t3, -1         # i--
    bltz t3, end_my_clz     # if (i < 0), exit loop
    j loop_my_clz 

end_my_clz:
    mv a0, t4               # return count
    ret 

Use fewer instructions.

image

fp16_to_fp32.s


Summary of Registers Used

my_clz.s

  • main function:
    • t0: Pointers for num_test_cases and test_case_array.
    • t1: Number of test cases.
    • t2:i
    • a7: System call register.
  • my_clz function:
    • a0: Argument register used for passing the current test case and returning the result
    • t3: i
    • t4: count
    • t5: x & (1U << i)

Loop Unrolling

my_clz_loop_unrolling.s

  • We perform loop unrolling in the main function
.data
num_test_cases: 
    .word 3

test_case_array: 
    .word 0, 1, 1024
    
newline:
    .asciz "\n" 

.text
.globl main

main:
    lw t1, 3
    la t0, test_case_array
    li t2, 0  

test_case0:
    lw a0, 0(t0) 
    jal ra, my_clz

    li a7, 1                # System call code for print integer
    ecall

    la a0, newline  
    li a7, 4                # System call code for print string
    ecall

    addi t0, t0, 4          # Next test_case
    addi t2, t2, 1          # i++

test_case1:
    lw a0, 0(t0)
    jal ra, my_clz

    li a7, 1            
    ecall

    la a0, newline  
    li a7, 4  
    ecall

    addi t0, t0, 4  
    addi t2, t2, 1 
    
test_case2:
    lw a0, 0(t0) 
    jal ra, my_clz

    li a7, 1    
    ecall

    la a0, newline  
    li a7, 4
    ecall

end_main:
    li a7, 10               # System call code for exit
    ecall

my_clz:
    li t3, 31               # i = 31
    li t4, 0                # count = 0

loop_my_clz:
    li t5, 1                # t5 = 1
    sll t5, t5, t3          # t5 = 1 << i
    and t5, a0, t5          # t5 = x & (1 << i)
    bnez t5, end_my_clz     # if ((x & (1 << i)) != 0), exit loop
    addi t4, t4, 1          # count++
    addi t3, t3, -1         # i--
    bltz t3, end_my_clz     # if (i < 0), exit loop
    j loop_my_clz 

end_my_clz:
    mv a0, t4               # return count
    ret 

image

A Comparison of C, Non-Loop Unrolling, and Loop Unrolling Implementations

my_clz

C Non-Loop Unrolling Assembly Loop Unrolling Assembly
Cycles 7469 946 928
Instrs. retired 5603 736 726
CPI 1.33 1.29 1.28
IPC 0.75 0.778 0.782
Clock rate 6.48 KHz 59.17 Hz 1.33 KHz

Leetcode 268. Missing Number

Problem

Given an array nums containing n distinct numbers in the range [0, n], return the only number in the range that is missing from the array.

Example 1:

Input: nums = [3,0,1]
Output: 2
Explanation: n = 3 since there are 3 numbers, so all numbers are in the range [0,3]. 2 is the missing number in the range since it does not appear in nums.

Example 2:

Input: nums = [0,1]
Output: 2
Explanation: n = 2 since there are 2 numbers, so all numbers are in the range [0,2]. 2 is the missing number in the range since it does not appear in nums.

Example 3:

Input: nums = [9,6,4,2,3,5,7,0,1]
Output: 8
Explanation: n = 9 since there are 9 numbers, so all numbers are in the range [0,9]. 8 is the missing number in the range since it does not appear in nums.

C Program

#include <stdio.h>

int missing_number(int* nums, int n) {
    int mask = 0, i = 0;
    for (i = 0; i < n; i++) {
        mask = mask ^ i ^ nums[i];
    }
    return mask ^ i;
}

int main() {
    int nums0[] = {3, 0, 1};
    int nums1[] = {0, 1};
    int nums2[] = {9, 6, 4, 2, 3, 5, 7, 0, 1};
    
    printf("%d\n", missing_number(nums0, sizeof(nums0) / sizeof(nums0[0])));  // Expected missing number is 2
    printf("%d\n", missing_number(nums1, sizeof(nums1) / sizeof(nums1[0])));  // Expected missing number is 2
    printf("%d\n", missing_number(nums2, sizeof(nums2) / sizeof(nums2[0])));  // Expected missing number is 8

    return 0;
}

image

You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.

Assembly Program

.data
nums0:      
    .word 3, 0, 1  

nums1:      
    .word 0, 1 

nums2:      
    .word 9, 6, 4, 2, 3, 5, 7, 0, 1 

len_nums0:  
    .word 3

len_nums1:  
    .word 2 

len_nums2:  
    .word 9  

newline:    
    .asciz "\n"

.text
.globl main

main:
    # missing number of num0
    la a0, nums0
    lw a1, len_nums0
    jal ra, missing_number
    li a7, 1                        # System call code for print integer
    ecall

    la a0, newline
    li a7, 4                        # System call code for print string
    ecall

    # missing number of num1
    la a0, nums1
    lw a1, len_nums1 
    jal ra, missing_number 
    li a7, 1 
    ecall

    la a0, newline  
    li a7, 4  
    ecall

    # missing number of num2
    la a0, nums2 
    lw a1, len_nums2
    jal ra, missing_number
    li a7, 1 
    ecall

    la a0, newline
    li a7, 4
    ecall

end_main:
    li a7, 10                       # System call code for exit
    ecall

missing_number:
    li t0, 0                        # mask = 0
    li t1, 0                        # i = 0

loop_missing_number:
    beq t1, a1, end_missing_number  # if (i == n), exit loop
    lw t2, 0(a0)                    # Load nums[i]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums[i]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
    j loop_missing_number

end_missing_number:
    xor a0, t0, t1                  # mask ^ n
    ret

Use fewer instructions.

image

Summary of Registers Used

  • main function:
    • a0: Pointer to the current array (nums1, nums2, nums3).
    • a1: Length of the current array (len_nums1, len_nums2, len_nums3).
    • a7: System call register.
  • missing_number function:
    • t0: mask
    • t1: i
    • t2: nums[i]

Loop Unrolling

  • We perform loop unrolling in the missing_number function
.data
nums0:      
    .word 3, 0, 1  

nums1:      
    .word 0, 1 

nums2:      
    .word 9, 6, 4, 2, 3, 5, 7, 0, 1 

len_nums0:  
    .word 3

len_nums1:  
    .word 2 

len_nums2:  
    .word 9  

newline:    
    .asciz "\n"

.text
.globl main

main:

nums0_missing_number:
    la a0, nums0
    li t0, 0                        # mask = 0
    li t1, 0                        # i = 0
    
    lw t2, 0(a0)                    # Load nums0[0]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums0[0]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
    
    lw t2, 0(a0)                    # Load nums0[1]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums0[1]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++

    lw t2, 0(a0)                    # Load nums0[2]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums0[2]
    addi t1, t1, 1                  # i++
    
    xor a0, t0, t1                  # mask ^ n

    li a7, 1                        # System call code for print integer
    ecall

    la a0, newline
    li a7, 4                        # System call code for print string
    ecall

nums1_missing_number:
    la a0, nums1
    li t0, 0                        # mask = 0
    li t1, 0                        # i = 0

    lw t2, 0(a0)                    # Load nums1[0]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums1[0]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums1[1]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums1[1]
    addi t1, t1, 1                  # i++
    
    xor a0, t0, t1                  # mask ^ n

    li a7, 1
    ecall

    la a0, newline  
    li a7, 4  
    ecall

nums2_missing_number:
    la a0, nums2
    li t0, 0                        # mask = 0
    li t1, 0                        # i = 0
    
    lw t2, 0(a0)                    # Load nums2[0]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[0]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[1]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[1]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[2]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[2]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[3]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[3]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[4]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[4]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[5]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[5]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[6]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[6]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
        
    lw t2, 0(a0)                    # Load nums2[7]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[7]
    addi a0, a0, 4                  # Next element
    addi t1, t1, 1                  # i++
    
    lw t2, 0(a0)                    # Load nums2[8]
    xor t0, t0, t1                  # mask = mask ^ i
    xor t0, t0, t2                  # mask = mask ^ nums2[8]
    addi t1, t1, 1                  # i++
    
    xor a0, t0, t1                  # mask ^ n

    li a7, 1 
    ecall

    la a0, newline
    li a7, 4
    ecall

end_main:
    li a7, 10                       # System call code for exit
    ecall

image

A Comparison of C, Non-Loop Unrolling, and Loop Unrolling Implementations

C Non-Loop Unrolling Assembly Loop Unrolling Assembly
Cycles 5112 212 120
Instrs. retired 3847 148 102
CPI 1.33 1.43 1.18
IPC 0.753 0.698 0.85
Clock rate 7.83 KHz 27.25 Hz 77.42 KHz

Hazard

  1. Data Hazards:

    • RAW (Read After Write): An instruction reads data before a previous instruction writes to it.

      • Example:
        ​​​​​​​​add x1, x2, x3   # Writes to x1
        ​​​​​​​​sub x4, x1, x5   # Reads x1 before write completes
        
      • Solution: Data forwarding, pipeline stalling.
    • WAR (Write After Read): An instruction writes data before a previous one reads it.

      • Example:
        ​​​​​​​​sub x1, x2, x3   # Reads x3
        ​​​​​​​​add x3, x4, x5   # Writes to x3 before the read occurs
        
      • Solution: Reordering instructions, or register renaming.
    • WAW (Write After Write): Two instructions write to the same register in the wrong order, causing data loss.

      • Example:
        ​​​​​​​​add x1, x2, x3   # Writes to x1
        ​​​​​​​​sub x1, x4, x5   # Overwrites x1 before the first write completes
        
      • Solution: Pipeline stall, register renaming.
  2. Control Hazards:

    • Caused by branch instructions that change the instruction flow. Mispredicting can delay execution.
    • Solution: Branch prediction, delayed branching.
  3. Structural Hazards:

    • Occur when multiple instructions require the same hardware resource at the same time.
    • Solution: Adding more resources (e.g., multiple ALUs), or using multiple pipelines.

Analysis

Ripes Simulator

We use Ripes, a graphical processor simulator and assembly editor for the RISC-V.

There are five stages in the Ripes pipeline.

  • IF: Instruction Fetch
  • ID: Instruction Decode & Register Read
  • EX: Execution or Address Calculation
  • MEM: Data Memory Access
  • WB: Write Back

image

Case 1: my_clz.s

  • Data Hazard
loop_my_clz:
    li t5, 1                # t5 = 1
    sll t5, t5, t3          # t5 = 1 << i
  • There is a RAW hazard between the li t5, 1 instruction and the following instructions (sll).

image

  • Data forwarding ensures that the value of t5 is immediately available to the sll instruction, without having to wait for it to propagate through the entire pipeline and write back to the register file.

  • Control Hazard

  • Before jal x1 48 <my_clz> execution

image
image

  • After jal x1 48 <my_clz> execution

image
image

Case 2: missing_number.s

  • Data Hazard
xor t0, t0, t1                  # mask = mask ^ i
xor t0, t0, t2                  # mask = mask ^ nums[i]

image

  • There is a RAW hazard between two xor instruction.

image

  • This example, like the one above in my_clz.s, uses data forwarding to avoid data hazards.

  • Control Hazard

  • Before jal x1 124 <missing_number> execution

image
image

  • After jal x1 124 <missing_number> execution

image
image

  • Due to branch instructions that alter the flow of execution, nop(flush) instructions are inserted here.