# Assignment1: RISC-V Assembly and Instruction Pipeline [GitHub ](https://github.com/h0w726/CA-hw) ## Problem B of Quiz1 Problem B of Quiz1 is for converting between 32-bit floating point values (IEEE 754 single-precision) and 16-bit bfloat16 floating point values. **float32** ``` ┌ sign │ │ ┌ exponent │ │ │ │ ┌ mantissa │ │ │ │┌──┴────┐┌──────────┴─────────┐ 0b00000000000000000000000000000000 float32 31 22 ``` **bfloat16** ``` ┌ sign │ │ ┌ exponent │ │ │ │ ┌ mantissa │ │ │ │┌──┴────┐┌─┴──┐ 0b0000000000000000 bfloat16 ``` bfloat16 (bf16) shares the same number of exponent bits as a 32-bit float. ### C Code ```c= #include <stdio.h> #include <stdint.h> typedef struct { uint16_t bits; // 16-bit bfloat16 representation } bf16_t; /*bf16 to fp32*/ static inline float bf16_to_fp32(bf16_t h) { union { float f; uint32_t i; } u = {.i = (uint32_t)h.bits << 16}; return u.f; } /*fp32 to bf16*/ static inline bf16_t fp32_to_bf16(float s) { bf16_t h; union { float f; uint32_t i; } u = {.f = s}; if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */ h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10; return h; } /*print*/ void process(float t){ printf("Original float value: %f\n", t); union { float a; uint32_t i; } f= {.a = t}; printf("float32 bits: "); for (int j = 31; j >= 0; j--) { printf("%d", (f.i >> j) & 1); } printf("\n"); bf16_t bf = fp32_to_bf16(t); printf("Converted bfloat16 bits:"); for (int i = 15; i >= 0; i--) { printf("%d", (bf.bits >> i) & 1); } printf("\n"); /*bfloat16 to float32*/ float fp = bf16_to_fp32(bf); union { float b; uint32_t i; } fb= {.b= t}; printf("Converted back to float32 bits:"); for (int j = 31; j >= 0; j--) { printf("%d", (fb.i >> j) & 1); } printf("\n"); } int main() { float test0= -1.45; float test1= 2.55; float test2= -3.99; process(test0); process(test1); process(test2); return 0; } ``` #### C Code Output **Test0:** ``` float value : -1.450000 float32 bits : 10111111101110011001100110011010 Converted bfloat16 bits : 1011111110111010 Converted back to float32 bits : 10111111101110011001100110011010 ``` **Test1:** ``` float value : 2.550000 float32 bits : 01000000001000110011001100110011 Converted bfloat16 bits : 0100000000100011 Converted back to float32 bits : 01000000001000110011001100110011 ``` **Test2:** ``` float value : -3.990000 float32 bits : 11000000011111110101110000101001 Converted bfloat16 bits : 1100000001111111 Converted back to float32 bits : 11000000011111110101110000101001 ``` ### Assembly Code ```c= .data test0: .word 0xBFB9999A #-1.45 in fp32 test1: .word 0x40233333 # 2.55 in fp32 test2: .word 0xC07F5C29 #-3.99 in fp32 newline: .string "\n" .text main: la s0 ,test0 # load address of test0 lw t0 ,0(s0) jal fp32_to_bf16 la s0 ,test1 # load address of test1 lw t0 ,0(s0) jal fp32_to_bf16 la s0 ,test2 # load address of test2 lw t0 ,0(s0) jal fp32_to_bf16 j Exit fp32_to_bf16: li t1,0x7fffffff # t1 = 0x7fffffff li t2,0x7f800000 # t2 = 0x7f800000 and t3,t1,t0 # u.i & 0x7fffffff bge t3,t2 Else # NaN go to Else srli t4,t0,0x10 # t4 = u.i >> 0x10 andi t4,t4,1 # t4 &1 li t5,0x7fff # t5 = 0x7fff add t4,t5,t4 # t4 = (0x7fff + ((u.i >> 0x10) & 1)) add t4,t0,t4 # t4 = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) srli t4,t4,0x10 # t4 = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10; addi a0,t4,0 j print jr ra Else: srli t4,t0,0x10 # t4 = (u.i >> 16) ori t4,t4,64 # t4 = (u.i >> 16) | 64; addi a0,t4,0 j print jr ra print: li a7 ,1 # syscall code for print integer ecall la a0,newline li a7,4 # syscall code for print string ecall ret Exit: li a7,10 ecall ``` ## Leetcode 1310. Xor Queries of a Subarray ### Question [Leetcode 1310. Xor Queries of a Subarray](https://leetcode.com/problems/xor-queries-of-a-subarray/description/) You are given an array arr of positive integers. You are also given the array queries where queries[i] = [left~i~, right~i~]. For each query i compute the XOR of elements from left~i~ to right~i~ (that is, arr[left~i~] XOR arr[left~i~ + 1] XOR ... XOR arr[right~i~] ). Return an array answer where answer[i] is the answer to the ith query. ``` Example 1: Input: arr = [1,3,4,8], queries = [[0,1],[1,2],[0,3],[3,3]] Output: [2,7,14,8] Explanation: The binary representation of the elements in the array are: 1 = 0001 3 = 0011 4 = 0100 8 = 1000 The XOR values for queries are: [0,1] = 1 xor 3 = 2 [1,2] = 3 xor 4 = 7 [0,3] = 1 xor 3 xor 4 xor 8 = 14 [3,3] = 8 Example 2: Input: arr = [4,8,2,10], queries = [[2,3],[1,3],[0,0],[0,3]] Output: [8,0,4,4] ``` ### Solution **Use Prefix XOR** #### Prefix XOR ``` Example 1: Input: arr = [1,3,4,8] prefix[0] = arr[0] = 1 prefix[1] = arr[0] ^ arr[1] = 1 ^ 3 = 2 prefix[2] = arr[0] ^ arr[1] ^ arr[2] = 1 ^ 3 ^ 4 = 6 prefix[3] = arr[0] ^ arr[1] ^ arr[2] ^ arr[3] = 1 ^ 3 ^ 4 ^ 8 = 14 ``` For the query `[left, right]` , the answer is `prefix[left] XOR prefix[right+1]` ### C Code ```c= #include <stdio.h> void xorQueries(int* arr, int arrSize, int queries[][2], int queriesSize, int* result) { int prefix[arrSize + 1]; prefix[0] = 0; for (int i = 1; i <= arrSize; i++) { prefix[i] = prefix[i - 1] ^ arr[i - 1]; } for (int i = 0; i < queriesSize; i++) { int left = queries[i][0]; int right = queries[i][1]; result[i] = prefix[right + 1] ^ prefix[left]; } } int main() { // Input arrays int arr0[] = {4, 8, 2, 10}; int arr1[] = {1, 3, 4, 8}; int arr2[] = {1, 2, 3, 4}; // Queries (left, right) int queries[][2] = {{0, 3}, {0, 1}}; int queriesSize = sizeof(queries) / sizeof(queries[0]); int result[queriesSize]; // Testing with each array printf("Results for arr0:\n"); xorQueries(arr0, 4, queries, queriesSize, result); for (int i = 0; i < queriesSize; i++) { printf("%d ", result[i]); } printf("\n"); printf("Results for arr1:\n"); xorQueries(arr1, 4, queries, queriesSize, result); for (int i = 0; i < queriesSize; i++) { printf("%d ", result[i]); } printf("\n"); printf("Results for arr2:\n"); xorQueries(arr2, 4, queries, queriesSize, result); for (int i = 0; i < queriesSize; i++) { printf("%d ", result[i]); } printf("\n"); return 0; } ``` **Test0:** * arr0= [4, 8, 2, 10] * queries = [[0, 3], [0, 1]] ``` Results for arr0: 4 12 ``` **Test1:** * arr1 = [1, 3, 4, 8] * queries = [[0, 3], [0, 1]] ``` Results for arr1: 14 2 ``` **Test2:** * arr2 = [1,2,3,4] * queries = [[0, 3], [0, 1]] ``` Results for arr2: 4 3 ``` ### Assembly Code ```c= .data test0: .word 4,8,2,10 test1: .word 1,3,4,8 test2: .word 1,2,3,4 space: .string " " newline: .string "\n" queries: .word 0,3,0,1 .text main: la a1,test0 # a1=arr[] addi a2,x0,5 # a2=arrsize + 1 la a3,queries # a3=queries[] jal ra initial la a1,test1 # a1=arr[] addi a2,x0,5 # a2=arrsize + 1 la a3,queries # a3=queries[] addi a2,x0,5 la a3,queries jal ra initial la a1,test2 # a1=arr[] addi a2,x0,5 # a2=arrsize + 1 la a3,queries # a3=queries[] la a3,queries jal ra initial j Exit initial: addi sp, sp, -28 # allocate for prefix loop sw s4,0(sp) # store prefix[0] in stack addi a5, sp, 0 addi t0 zero,1 # set i=1 j prefix_loop prefix_loop: bge t0, a2, prefix_done# if i >= arrsize, exit loop lw t1,0(a1) # load value of arr[i-1] addi a1,a1,4 # move to next element in arr lw t2, 0(a5) # load value of prefix[i-1] xor t2, t1, t2 # prefix[i]=arr[i-1]^prefix[i-1] addi a5,a5,4 # move to next space in prefix sw t2, 0(a5) # store prefix[i] addi t0, t0, 1 # i++ j prefix_loop prefix_done: addi s6, x0, 4 # queriesSize = 4 # Compute results for each query addi t0, x0, 0 # Initialize loop index for queries to 0 j queries_loop queries_loop: bge t0,s6 ,print # If loop index >= queriesSize, exit loop lw t1, 0(a3) # Load left value of query lw t2, 4(a3) # Load right value of query addi a3, a3, 8 # Increment queries pointer to the next query slli t1, t1, 2 # Calculate byte offset for prefix[left] add t3, sp, t1 # Address of prefix[left] lw s7, 0(t3) # load prefix[left] addi t2, t2, 1 # right + 1 slli t2, t2, 2 # calculate byte offset for prefix[right + 1] add t4 ,sp ,t2 # address of prefix[right + 1] lw s8 ,0(t4) # load prefix[right + 1] xor s8,s7,s8 # result = prefix[right + 1] ^ prefix[left] addi t0, t0, 2 # move to next query addi a0, s8,0 # Load result[t0] into a0 li a7, 1 # syscall code for print integer ecall la a0 ,space li a7 ,4 # syscall code for print string ecall j queries_loop print: addi sp, sp, 28 # Restore stack pointer la a0 ,newline li a7,4 ecall ret Exit: li a7,10 ecall ``` #### Assembly Output ![2024-10-14 (16)](https://hackmd.io/_uploads/HkLP2Hckye.png) | Execution Info | | | --------------- | --- | | Cycles | 436 | | Instrs.retired: | 308 | | CPI: | 1.42 | | IPC: | 0.706 | ## Analysis We use 5-Stage RISC-V Processor in Ripes. ### 5-Stage RISC-V Processor 5-stage pipeline is a common design in processors where the instruction execution process is split into five stages. This approach allows multiple instructions to be processed simultaneously in different stages, improving the processor’s throughput and efficiency. ![2024-10-13](https://hackmd.io/_uploads/HJcNrEtkkx.png) The "5-stage" are: 1. Instruction fetch (IF) 2. Instruction decode and register fetch (ID) 3. Execute (EX) 4. Memory access (MEM) 5. Register write back (WB) #### Instruction fetch(IF) Program Counter (PC) is a register that store the address of the next instruction to be fetched. At the start of the Instruction Fetch stage, the processor uses the address stored in the PC to determine where to fetch the next instruction. ![2024-10-14 (3)](https://hackmd.io/_uploads/BydAabqJJe.png) We start from instruction store at `0x00000000`, so `addr` is `0x00000000`. The first instruction is `auipc x11 0x10000`,and its machine code is `0x10000597` We use RV32I Base Instruction Set. **AUIPC** | imm[31:12] | rd | opcode | | -------- | -------- | -------- | | 00010000000000000000|01011| 0010111| So the machine code is `0x10000597`. After fetching the instruction, if no branch occurs, the value of the PC will be incremented by 4 to fetch the next instruction. #### Instruction decode and register fetch (ID) ![2024-10-14 (6)](https://hackmd.io/_uploads/r1u59m9JJg.png) In this stage Instruction `0x10000597` is decoded to three part: * `imm` = `0x10000000` * `Wr_reg_idx` = `0x0b` * `opcode` = `AUIPC` #### Execute (EX) ![2024-10-14 (8)](https://hackmd.io/_uploads/SJojE45Jkl.png) * Op1 : PC value = `0x00000000` * Op2 : immediate value = `0x10000000` * Res : The result of the addition in the ALU is `0x10000000`. #### Memory access (MEM) ![2024-10-14 (10)](https://hackmd.io/_uploads/HkXdPE9J1e.png) * `Addr` = `0x10000000` * `Data in` = `0x00000000` * `Wr en` = `0x0` * `Read out` = `0x00000004` `Addr` is target address for memory access. `Data in` shows the data that is to be written to memory. `Wr en` = `0x0` represents memory doesn't enable writing. `Read out` represents the data that is being read from memory. In this case , `Read out` = `0x00000004` means that the memory location at address `0x10000000` store `0x00000004`. As shown in the table below. ![2024-10-14 (11)](https://hackmd.io/_uploads/rk5wCE9yJx.png) #### Register write back (WB) ![2024-10-14 (12)](https://hackmd.io/_uploads/SJ6AJrqy1x.png) The value `0x10000000` is written into register `x11`. After these stages are done, the value of register is: ![2024-10-14 (14)](https://hackmd.io/_uploads/rJYLVBcJye.png) ## Reference [Leetcode 1310](https://leetcode.com/problems/xor-queries-of-a-subarray/description/) [Single-precision floating-point format](https://en.wikipedia.org/wiki/Single-precision_floating-point_format) [bfloat16 floating-point format](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) [Quiz1 of Computer Architecture (2024 Fall)](https://hackmd.io/@sysprog/arch2024-quiz1-sol)