--- tags: computer-arch --- # Quiz3 of Computer Architecture (2024 Fall) > Solutions ## Problem `A` Consider a RISC-V program that outputs the "Hello World!" message. Its assembly code is shown below. ```c .global _start _start: addi a7, zero, 64 # syscall 64 = write addi a0, zero, 1 # fd = stdout la a1, .helloworld addi a2, zero, 13 # count = 13 ecall addi a7, zero, 93 # syscall 93 = exit addi a0, zero, 42 ecall .helloworld: .ascii "Hello World!\n" ``` The content of this program is displayed using the [hexdump](https://man7.org/linux/man-pages/man1/hexdump.1.html) tool. ``` 0000000 0893 0400 0513 0010 0597 0000 8593 01c5 0000010 0613 00d0 0073 0000 0893 05d0 0513 02a0 0000020 0073 0000 6548 6c6c 206f 6f57 6c72 2164 0000030 000a 0000 0000034 ``` You are tasked with writing a simplified RISC-V instruction set simulator capable of running the program above. The simulator should be able to fetch, decode, and execute a subset of RISC-V instructions. The partial implementation is shown below, and you are required to complete the code. ```c #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #define fatal(args...) \ do { \ printf("Fatal: "); \ printf(args); \ exit(-1); \ } while (0) enum { /* I-type integer computation or integer register-immediate */ INTEGER_COMP_RI = 0x13, LUI = #A01 /* Your answer here */, AUIPC = #A02 /* Your answer here */, /* ECALL and EBREAK */ ECALL = 0x73, }; /* FUNCT3 for INTEGER_COMPI_RI */ enum { ADDI = 0x0, }; enum { OPCODE_MASK = 0x7F, REG_ADDR_MASK = 0x1F }; enum { RD_SHIFT = #A03 /* Your answer here */, RS1_SHIFT = #A04 /* Your answer here */, RS2_SHIFT = 20, FUNCT3_SHIFT = #A05 /* Your answer here */, FUNCT7_SHIFT = 25 }; enum { FUNCT3_MASK = 0x7, FUNCT7_MASK = 0x7F }; enum { RAM_SIZE = 1204 * 1024 * 2, /* 2 MiB */ RAM_BASE = 0x0, }; typedef struct { uint8_t *mem; } ram_t; typedef struct { uint32_t regs[32]; size_t pc; ram_t *ram; } cpu_t; typedef struct { uint8_t opcode; uint8_t rd, rs1, rs2, funct3, funct7; } insn_t; ram_t *ram_new(uint8_t *code, size_t len) { ram_t *ram = malloc(sizeof(ram_t)); ram->mem = malloc(RAM_SIZE); memcpy(ram->mem, code, len); return ram; } void ram_free(ram_t *mem) { free(mem->mem); free(mem); } static inline void check_addr(int32_t addr) { size_t index = addr - RAM_BASE; if (index >= RAM_SIZE) fatal("out of memory.\n"); } static uint32_t ram_load8(ram_t *mem, uint32_t addr) { size_t index = addr - RAM_BASE; return (uint32_t) mem->mem[index]; } static uint32_t ram_load16(ram_t *mem, uint32_t addr) { size_t index = addr - RAM_BASE; return ((uint32_t) mem->mem[index] | ((uint32_t) mem->mem[index + 1] << 8)); } static uint32_t ram_load32(ram_t *mem, uint32_t addr) { size_t index = addr - RAM_BASE; return ((uint32_t) mem->mem[index] | ((uint32_t) mem->mem[index + 1] << 8) | ((uint32_t) mem->mem[index + 2] << 16) | ((uint32_t) mem->mem[index + 3] << 24)); } uint32_t ram_load(ram_t *mem, uint32_t addr, uint8_t size) { check_addr(addr); uint32_t r = 0; switch (size) { case 8: r = ram_load8(mem, addr); break; case 16: r = ram_load16(mem, addr); break; case 32: r = ram_load32(mem, addr); break; default: fatal("RAM: invalid load size(%d)\n", size); } return r; } cpu_t *cpu_new(uint8_t *code, size_t len) { cpu_t *cpu = malloc(sizeof(cpu_t)); memset(cpu->regs, 0, sizeof(cpu->regs)); cpu->regs[0] = 0; /* Stack pointer */ cpu->regs[2] = RAM_BASE + RAM_SIZE; cpu->pc = RAM_BASE; cpu->ram = ram_new(code, len); return cpu; } void cpu_free(cpu_t *cpu) { ram_free(cpu->ram); free(cpu); } uint32_t cpu_load(cpu_t *cpu, uint32_t addr, uint8_t size) { return ram_load(cpu->ram, addr, size); } uint32_t cpu_fetch(cpu_t *cpu) { return cpu_load(cpu, cpu->pc, 32); } void cpu_decode(uint32_t raw, insn_t *inst) { uint8_t opcode = raw & OPCODE_MASK; uint8_t rd = (raw >> RD_SHIFT) & REG_ADDR_MASK; uint8_t rs1 = (raw >> RS1_SHIFT) & REG_ADDR_MASK; uint8_t rs2 = (raw >> RS2_SHIFT) & REG_ADDR_MASK; uint8_t funct3 = (raw >> FUNCT3_SHIFT) & FUNCT3_MASK; uint8_t funct7 = (raw >> FUNCT7_SHIFT) & FUNCT7_MASK; inst->opcode = opcode; inst->rd = rd, inst->rs1 = rs1, inst->rs2 = rs2; inst->funct3 = funct3; inst->funct7 = funct7; } static inline int32_t i_imm(uint32_t raw) { /* imm[11:0] = inst[31:20] */ return ((int32_t) raw) >> #A06 /* Your answer here */; } static inline int32_t u_imm(uint32_t raw) { /* imm[20:0] = inst[31:12] */ return ((int32_t) raw & 0xFFFFF000); } enum { SYSCALL_WRITE = 64, SYSCALL_EXIT = 93 }; enum { STDOUT = 1, STDERR = 2 }; void ecall_handler(cpu_t *cpu) { uint32_t syscall_nr = cpu->regs[17]; switch (syscall_nr) { case SYSCALL_WRITE: { uint32_t fd = cpu->regs[10]; uint32_t addr = cpu->regs[11]; uint32_t count = cpu->regs[12]; FILE *stream; switch (fd) { case STDOUT: stream = stdout; break; case STDERR: stream = stderr; break; default: fatal("invalid file-descriptor %d.\n", fd); } for (uint32_t i = 0; i < count; i++) fprintf(stream, "%c", cpu_load(cpu, addr + i, 8)); break; } case SYSCALL_EXIT: { int32_t exit_code = cpu->regs[10]; exit(exit_code); break; } default: fatal("unkown syscall number %d.\n", syscall_nr); } } static insn_t inst; void cpu_execute(cpu_t *cpu, uint32_t raw) { cpu_decode(raw, &inst); cpu->regs[0] = 0; switch (inst.opcode) { case INTEGER_COMP_RI: { int32_t imm = i_imm(raw); switch (inst.funct3) { case ADDI: cpu->regs[inst.rd] = cpu->regs[inst.rs1] + imm; break; default: fatal("Unknown FUNCT3 for INTEGER_COMP_RI"); } break; } case LUI: { int32_t imm = u_imm(raw); cpu->regs[inst.rd] = imm; break; } case AUIPC: { int32_t imm = u_imm(raw); cpu->regs[inst.rd] = (cpu->pc + (#A07 /* Your answer here */)) + imm; break; } case ECALL: if (raw == 0x100073) break; /* EBREAK */ ecall_handler(cpu); break; default: fatal("Illegal Instruction 0x%x\n", inst.opcode); } } int main(int argc, char *argv[]) { if (argc < 2) { printf("Usage: %s [filename]\n", argv[0]); exit(-1); } const char *filename = argv[1]; FILE *file = fopen(filename, "rb"); if (!file) { printf("Failed to open %s\n", filename); exit(-1); } fseek(file, 0L, SEEK_END); size_t code_size = ftell(file); rewind(file); uint8_t *code = malloc(code_size); fread(code, sizeof(uint8_t), code_size, file); fclose(file); cpu_t *cpu = cpu_new(code, code_size); while (1) { uint32_t raw = cpu_fetch(cpu); cpu->pc += #A08 /* Your answer here */; if (cpu->pc > code_size) break; cpu_execute(cpu, raw); if (!cpu->pc) break; } cpu_free(cpu); return 0; } ``` The values for `#A01` and `#A02` must be expressed as hexadecimal integer literals in their shortest form. For example, use 0x7C00 rather than 0x007C00. The values for `#A03`, `#A04`, `#A05`, `#A06`, `#A07`, and `#A08` must be expressed as decimal integer literals in their shortest form. In the decimal system, negative integers are simply indicated by placing a `-` sign in front of the number (e.g., `-5`). A01 = ==`0x37`== A02 = ==`0x17`== A03 = ==`7`== A04 = ==`15`== A05 = ==`12`== A06 = ==`20`== A07 = ==`-4`== A08 = ==`4`== --- ## Problem `B` [Booth's multiplication algorithm](https://en.wikipedia.org/wiki/Booth%27s_multiplication_algorithm) is a method used to multiply two signed binary numbers in two's complement notation. This algorithm examines adjacent pairs of bits in the 'N'-bit multiplier $Y$, which is represented in two's complement form, with an implicit bit below the least significant bit ($y_{-1} = 0$). For each bit $y_i$ (where $i$ runs from 0 to $N - 1$), the algorithm considers the current bit $y_i$ and the previous bit $y_{i-1}$. The actions taken are as follows: - If $y_i$ and $y_{i-1}$ are equal, no changes are made to the product accumulator $P$. - If $y_i = 0$ and $y_{i-1} = 1$, the multiplicand multiplied by $2^i$ is added to $P$. - If $y_i = 1$ and $y_{i-1} = 0$, the multiplicand multiplied by $2^i$ is subtracted from $P$. The final value of $P$ represents the signed product of the multiplication. Booth's algorithm does not specify how the multiplicand and product are represented, though they are typically in two's complement notation like the multiplier. However, the algorithm can work with any number system that supports addition and subtraction. The steps generally proceed from the least significant bit (LSB) to the most significant bit (MSB), starting at $i = 0$. Instead of directly multiplying by $2^i$, the algorithm typically shifts the accumulator $P$ to the right between steps. This allows low bits to be shifted out, and subsequent operations (additions and subtractions) can focus on the highest $N$ bits of $P$. The algorithm effectively converts sequences of 1s in the multiplier into a high-order +1 and a low-order -1 at the boundaries of the sequence. When a sequence extends through the MSB, there is no high-order +1, and the effect is interpreted as the negative of the corresponding value. The following code is an implementation of Booth's multiplication algorithm, written in C99. ```c #include <stdbool.h> #include <stdint.h> typedef uint64_t u64; typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; typedef int64_t s64; /* Generate a mask with a range of bits set to 1 */ static inline u64 mask(int lo, int hi) { u64 size = hi - lo; return ((1ull << size) - 1) << lo; } /* Check if the nth bit of a 64-bit unsigned integer is set. */ static inline bool bit(u64 x, int n) { return (x >> n) & 1; } static inline u64 sign_extend(u64 value, int from_size, int to_size) { bool negative = bit(value, from_size - 1); if (negative) value |= mask(from_size, to_size); return value; }; /* Perform an arithmetic shift right (ASR) operation on a 64-bit value. * This function sign-extends the 'value' based on the given 'size' and then * shifts it right. The result is masked to ensure it stays within the * 'size'-bit range. */ static inline u64 asr(u64 value, int shift, int size) { s64 sign_extended = (s64) sign_extend(value, size, 64); sign_extended >>= shift; return sign_extended & mask(0, size); } /* A realistic limit for this is a 3-bit value */ typedef u8 booth_chunk_t; typedef struct { u64 recoded_out; bool carry; } booth_recording_out_t; typedef struct { booth_recording_out_t m[4]; } recorded_multiplicands_t; booth_recording_out_t booth_recode(u64 input, booth_chunk_t chunk) { booth_recording_out_t out; switch (chunk) { case 0: out = (booth_recording_out_t) {0, 0}; break; case 1: out = (booth_recording_out_t) {input, 0}; break; case 2: out = (booth_recording_out_t) {input, 0}; break; case 3: out = (booth_recording_out_t) {2 * input, 0}; break; case 4: out = (booth_recording_out_t) {~(2 * input), 1}; break; case 5: out = (booth_recording_out_t) {~input, 1}; break; case 6: out = (booth_recording_out_t) {~input, 1}; break; case 7: out = (booth_recording_out_t) {0, 0}; break; } out.recoded_out &= 0x3FFFFFFFFULL; return out; } /* Structure to store the result of the Carry-Save Adder (CSA) */ typedef struct { u64 out; /* holds the partial sum */ u64 carry; /* holds the propagated carry bits */ } csa_out_t; csa_out_t perform_csa(u64 a, u64 b, u64 c) { u64 out = a ^ b ^ c; u64 carry = (a & b) | (b & c) | (c & a); return (csa_out_t) {out, carry}; } /* Store the current upper 31 bits of the accumulator, which are shifted by * 2 after each CSA (Carry-Save Adder) operation. This register holds the * accumulator's upper bits and aids in the processing of multi-bit addends. */ u64 acc_shift_register = 0; /* Perform a series of CSA operations on an array of recoded multiplicands. */ csa_out_t perform_csa_array(u64 partial_sum, u64 partial_carry, recorded_multiplicands_t addends) { csa_out_t csa_out = {partial_sum, partial_carry}; csa_out_t final_csa_out = {0, 0}; for (int i = 0; i < 4; i++) { csa_out.out &= 0x1FFFFFFFFULL; csa_out.carry &= 0x1FFFFFFFFULL; csa_out_t result = perform_csa(csa_out.out, addends.m[i].recoded_out & 0x1FFFFFFFFULL, csa_out.carry); /* Introduce the carry generated by Booth recoding: * - Shift the carry left by 1 to align it properly. * - Add the Booth recoding's carry into the result. */ result.carry <<= 1; result.carry |= addends.m[i].carry; /* Extract the lowest two bits and inject them into the final.out: * - The lower two bits are directly inserted into the final result * because they will not be affected by subsequent addends, which are * at least 4 times larger. * - This saves space in hardware by reducing the size of the final CSA * stage. */ final_csa_out.out |= (result.out & 3) << (2 * i); final_csa_out.carry |= (result.carry & 3) << (2 * i); /* The next CSA will only operate on the upper bits: * The lower two bits have already been processed, so now we shift both * the.out and carry by 2 to continue processing the higher bits. */ result.out >>= 2; result.carry >>= 2; /* Perform the operations specified in the tables for handling UpperHalf * and High: * - The 'acc_shift_register' holds the upper 31 bits of the accumulator * and the logic updates it accordingly. * - A "magic" value is computed based on the bits of the accumulator * and addends, and it's inserted into the result's.out and carry. */ u64 magic = bit(acc_shift_register, #B01) + !bit(csa_out.carry, #B02) + !bit(addends.m[i].recoded_out, #B03); result.out |= magic << 31; result.carry |= (u64) !bit(acc_shift_register, 1) << 32; /* Shift the acc_shift_register by 2 bits for the next iteration */ acc_shift_register >>= 2; csa_out = result; } /* Final step: shift the remaining.out and carry left by 8 and insert * them into the final result. */ final_csa_out.out |= csa_out.out << 8; final_csa_out.carry |= csa_out.carry << 8; return final_csa_out; } recorded_multiplicands_t get_recoded_multiplicands(u64 multiplicand, u64 multiplier) { recorded_multiplicands_t rm; for (int i = 0; i < 4; i++) rm.m[i] = booth_recode(multiplicand, (multiplier >> (2 * i)) & 0b111); return rm; } csa_out_t perform_one_cycle_of_booths_mutliplication(csa_out_t prev_out, u64 multiplicand, u64 multiplier) { recorded_multiplicands_t rm = get_recoded_multiplicands(multiplicand, multiplier); return perform_csa_array(prev_out.out, prev_out.carry, rm); } typedef enum { SHORT, LONG_SIGNED, LONG_UNSIGNED, } mul_flavor_t; bool is_long(mul_flavor_t flavor) { return flavor == LONG_SIGNED || flavor == LONG_UNSIGNED; } bool is_signed(mul_flavor_t flavor) { return flavor == LONG_SIGNED || flavor == SHORT; } bool should_terminate(u64 multiplier, mul_flavor_t flavor) { if (is_signed(flavor)) return multiplier == 0x1FFFFFFFF || multiplier == 0; return multiplier == 0; } /* Structure to hold the result of the adder operation */ typedef struct { u32 out; /* stores the sum */ bool carry; /* indicates if a carry/overflow occurred */ } adder_out_t; /* Perform addition with carry */ adder_out_t adder(u32 a, u32 b, bool carry) { u32 out = a + b + carry; bool overflow = out < a || out < b; return (adder_out_t) {out, overflow}; } /* Structure to hold the result of a multiplication operation */ typedef struct { u64 out; /* stores the lower 64 bits of the result */ /* 'carry' indicates if there was an overflow (carry) during the * multiplication. */ bool carry; } mul_out_t; struct u128 { u64 lo, hi; }; static inline struct u128 u128_ror(struct u128 input, int shift) { return (struct u128) { (input.lo >> shift) | (input.hi << (64 - shift)), (input.hi >> shift) | (input.lo << (64 - shift)), }; } mul_out_t booths_multiplication(mul_flavor_t flavor, u64 multiplicand, u64 multiplier, u64 accumulator) { csa_out_t csa_out = {0, 0}; bool alu_carry_in = multiplier & 1; if (is_signed(flavor)) multiplier = sign_extend(multiplier, 32, 34); else multiplier = multiplier & 0x1FFFFFFFFull; if (is_signed(flavor)) multiplicand = sign_extend(multiplicand, 32, 34); else multiplicand = multiplicand & 0x1FFFFFFFFull; csa_out.carry = (multiplier & 1) ? ~(multiplicand) : 0; csa_out.out = accumulator; acc_shift_register = accumulator >> #B04; struct u128 partial_sum = {0, 0}; struct u128 partial_carry = {0, 0}; partial_sum.lo = csa_out.out & 1; partial_carry.lo = csa_out.carry & 1; csa_out.out >>= 1; csa_out.carry >>= 1; partial_sum = u128_ror(partial_sum, 1); partial_carry = u128_ror(partial_carry, 1); int n_iterations = 0; do { csa_out = #B05; partial_sum.lo |= csa_out.out & 0xFF; partial_carry.lo |= csa_out.carry & 0xFF; csa_out.out >>= 8; csa_out.carry >>= 8; partial_sum = u128_ror(partial_sum, 8); partial_carry = u128_ror(partial_carry, 8); multiplier = asr(multiplier, 8, 33); n_iterations++; } while (!should_terminate(multiplier, flavor)); partial_sum.lo |= csa_out.out; partial_carry.lo |= csa_out.carry; /* The 'partial_sum' and 'partial_carry' were rotated right by * 8 * n_iterations + 1. Now, they need to be rotated back, but the result * is still off by one, despite attempts to align with the table. */ int correction_ror; if (n_iterations == 1) correction_ror = #B06; if (n_iterations == 2) correction_ror = #B07; if (n_iterations == 3) correction_ror = #B08; if (n_iterations == 4) correction_ror = #B09; partial_sum = u128_ror(partial_sum, correction_ror); partial_carry = u128_ror(partial_carry, correction_ror); if (is_long(flavor)) { if (n_iterations == 4) { adder_out_t adder_out_lo = adder(partial_sum.hi, partial_carry.hi, alu_carry_in); adder_out_t adder_out_hi = adder(partial_sum.hi >> 32, partial_carry.hi >> 32, adder_out_lo.carry); return (mul_out_t) { ((u64) adder_out_hi.out << 32) | adder_out_lo.out, (partial_carry.hi >> 63) & 1}; } adder_out_t adder_out_lo = adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in); int shift_amount = 1 + 8 * n_iterations; shift_amount++; partial_carry.lo = #B10; partial_sum.lo |= acc_shift_register << (shift_amount); adder_out_t adder_out_hi = #B11; return (mul_out_t) {((u64) adder_out_hi.out << 32) | adder_out_lo.out, (partial_carry.hi >> 63) & 1}; } if (n_iterations == 4) { adder_out_t adder_out = #B11; return (mul_out_t) {adder_out.out, (partial_carry.hi >> 31) & 1}; } adder_out_t adder_out = adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in); return (mul_out_t) {adder_out.out, (partial_carry.hi >> 63) & 1}; } /* Return the result of the multiplication using Booth's algorithm */ mul_out_t mul(u32 rm /* multiplier */, u32 rs /* multiplicand */) { return booths_multiplication(SHORT, rm, rs, 0); } /* Perform a multiply-accumulate operation using Booth's algorithm */ mul_out_t mla(u32 rm, u32 rs, u32 rn /* value to accumulate */) { return booths_multiplication(SHORT, rm, rs, rn); } /* Perform an unsigned long multiplication (64-bit result) */ mul_out_t umull(u32 rm, u32 rs) { return booths_multiplication(LONG_UNSIGNED, rm, rs, 0); } /* Perform an unsigned long multiply-accumulate operation (64-bit result) * @rdlo and @rdhi represent the lower and upper parts of the initial 64-bit * value to accumulate. */ mul_out_t umlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs) { return booths_multiplication(LONG_UNSIGNED, rm, rs, ((u64) rdhi << 32) | (u64) rdlo); } /* Perform a signed long multiplication (64-bit result) */ mul_out_t smull(u32 rm, u32 rs) { return booths_multiplication(LONG_SIGNED, rm, rs, 0); } /* Perform a signed long multiply-accumulate operation (64-bit result) */ mul_out_t smlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs) { return booths_multiplication(LONG_SIGNED, rm, rs, (u64) rdhi << 32 | (u64) rdlo); } ``` The corresponding test code: ```c #include <stdio.h> #include <stdlib.h> #include <time.h> /* generate interesting random numbers */ unsigned int get_rand_32() { retry: switch (rand() & 0b1111) { case 0: return rand() & 0xFF; case 1: return rand() & 0xFFFF; case 2: return rand() & 0xFFFFFF; case 3: return 0xFFFFFFFF - (rand() & 0xFF); case 4: return 0xFFFFFFFF - (rand() & 0xFFFF); case 5: return 0xFFFFFFFF - (rand() & 0xFFFFFF); case 6: return 0xAAAAAAAA & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16); case 7: return 0x55555555 & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16); case 8: return 0; case 9: return ((rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16)); default: goto retry; } } bool guess_mul_zero(uint32_t x) { if (x >> 8 == 0xFFFFFF) { unsigned int masked_x = x & 0xFF; if (masked_x >= 0xC0) return false; return (masked_x & 0x55) != 0; } if (x >> 16 == 0xFFFF) { unsigned int masked_x = x & 0xFFFF; if (masked_x >= 0xC000) return false; return (masked_x & 0x5555) != 0; } if (x >> 24 == 0xFF) { unsigned int masked_x = x & 0xFFFFFF; if (masked_x >= 0xC00000) return false; return (masked_x & 0x555555) != 0; } else { return (x >> 30) == 2; } } int main() { srand(time(NULL)); printf("Running mul carry regression tests...\n"); for (int i = 0; i < 256; i++) { unsigned int multiplicand = 0; unsigned int multiplier = 0xFFFFFFFF - i; unsigned int guess = mul(multiplicand, multiplier).carry; unsigned int actual = guess_mul_zero(multiplier); if (guess != actual) { printf( "[MUL CARRY REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); return 1; } printf("[MUL CARRY REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); } printf("Running mul regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); unsigned int guess = mul(multiplicand, multiplier).out; unsigned int actual = (unsigned int) multiplicand * (unsigned int) multiplier; if (guess != actual) { printf("[MUL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); return 1; } printf("[MUL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); } printf("Running mla regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); unsigned int accumulate = get_rand_32(); unsigned int guess = mla(multiplicand, multiplier, accumulate).out; unsigned int actual = (unsigned int) multiplicand * (unsigned int) multiplier + (unsigned int) accumulate; if (guess != actual) { printf( "[MLA REGRESSION] Failed: %llx * %llx + %llx = %llx, got " "%llx\n", multiplicand, multiplier, accumulate, actual, guess); return 1; } } printf("Running umull regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); unsigned int accumulate = get_rand_32(); unsigned long long guess = umull(multiplicand, multiplier).out; unsigned long long actual = (unsigned long long) multiplicand * (unsigned long long) multiplier; if (guess != actual) { printf( "[UMULL REGRESSION] Failed #%d: %llx * %llx = %llx, got %llx\n", i, multiplicand, multiplier, actual, guess); return 1; } printf("[UMULL REGRESSION] Passed #%d: %llx * %llx = %llx, got %llx\n", i, multiplicand, multiplier, actual, guess); } printf("Running umlal regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); unsigned int accumulate = get_rand_32(); unsigned int accumulate2 = get_rand_32(); unsigned long long guess = umlal(accumulate, accumulate2, multiplicand, multiplier).out; unsigned long long actual_acc = (unsigned long long) accumulate + ((unsigned long long) accumulate2 << 32); unsigned long long actual = (unsigned long long) multiplicand * (unsigned long long) multiplier + actual_acc; if (guess != actual) { printf( "[UMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got " "%llx\n", multiplicand, multiplier, actual_acc, actual, guess); return 1; } printf( "[UMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got " "%llx\n", multiplicand, multiplier, actual_acc, actual, guess); } printf("Running smull regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); long long guess = smull(multiplicand, multiplier).out; long long actual = (long long) (int) multiplicand * (long long) (int) multiplier; if (guess != actual) { printf("[SMULL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); return 1; } printf("[SMULL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n", multiplicand, multiplier, actual, guess); } printf("Running smlal regression tests...\n"); for (int i = 0; i < 10000; i++) { unsigned int multiplicand = get_rand_32(); unsigned int multiplier = get_rand_32(); unsigned int accumulate = get_rand_32(); unsigned int accumulate2 = get_rand_32(); long long guess = smlal(accumulate, accumulate2, multiplicand, multiplier).out; long long actual_acc = (long long) accumulate + ((long long) accumulate2 << 32); long long actual = (long long) (int) multiplicand * (long long) (int) multiplier + actual_acc; if (guess != actual) { printf( "[SMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got " "%llx\n", multiplicand, multiplier, actual_acc, actual, guess); return 1; } printf( "[SMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got " "%llx\n", multiplicand, multiplier, actual_acc, actual, guess); } printf("All tests passed!\n"); } ``` The reference output: ``` Running mul carry regression tests... [MUL CARRY REGRESSION] Passed: 0 * ffffffff = 0, got 0 [MUL CARRY REGRESSION] Passed: 0 * fffffffe = 0, got 0 ... [SMLAL REGRESSION] Passed: fffff312 * ffffdd11 + 2ac800000000 = 2ac801c3ae32, got 2ac801c3ae32 [SMLAL REGRESSION] Passed: 73 * 4060 + ff03f84273551101 = ff03f8427371fc21, got ff03f8427371fc21 All tests passed! ``` Complete the code above to ensure it always passes the test cases. The values for #B01, #B02, #B03, #B04, #B06, #B07, #B08, and #B09 must be expressed as decimal integer literals in their shortest form. #B05, #B10, and #B11 are valid C99 expressions without semicolons that invoke the helper functions defined in the code above. B01 = ==0== B02 = ==32== B03 = ==33== B04 = ==34== B05 = ==`perform_one_cycle_of_booths_mutliplication(csa_out, multiplicand, multiplier)`== B06 = ==23== B07 = ==15== B08 = ==7== B09 = ==31== B10 = ==`sign_extend(partial_carry.lo, shift_amount, 64)`== B11 = ==`adder(partial_sum.hi, partial_carry.hi, alu_carry_in)`== --- ## Problem `C` Besides the RV32I instructions, you are allowed to use the following multiply instruction introduced in the RISC-V M extension: ``` MUL rd, rs1, rs2 ``` This instruction multiplies operands `rs1` and `rs2` and stores the lower 32 bits of the result in `rd`. Note that there are no signed or unsigned variants; it doesn’t matter whether `rs1` and `rs2` are signed or unsigned because the result is the same. You can easily verify this with a 2-bit × 2-bit example: - Unsigned: `2'b11 × 2'b11 = 3 × 3 = 9 = 4'b1001` $\to$ `2'b01` - Signed: `2'b11 × 2'b11 = -1 × -1 = 1 = 4'b0001` $\to$ `2'b01` - Unsigned: `2'b01 × 2'b11 = 1 × 3 = 3 = 4'b0011` $\to$ `2'b11` - Signed: `2'b01 × 2'b11 = 1 × -1 = -1 = 4'b1111` $\to$ `2'b11` When the result is truncated to the same number of bits as the operands, the outcome is identical. Mnemonic: bseti rd, rs1,shamt The `bseti` (Single-Bit Set) instruction sets a single bit in `rs1` at the position specified by `shamt`. The bit position is determined by the lower log2(XLEN) bits of `shamt`. For RV32, encodings where `shamt[5] = 1` are reserved. ![image](https://hackmd.io/_uploads/SJKnDXVeJl.png) ![image](https://hackmd.io/_uploads/BJ9gdm4gyx.png) [Fixed-point arithmetic](https://en.wikipedia.org/wiki/Fixed-point_arithmetic) can be challenging, especially for complex algorithms or poorly behaved data. However, with careful preparation and testing, it can be implemented effectively. Below is a general approach to transitioning from floating-point to fixed-point arithmetic, using a 32-bit signed integer as the fixed-point data type. For each variable, pick an appropriate Q format based on its range. The Q format defines the number of fractional bits used in a fixed-point number. Here are some examples: - A variable with a range of $[6, -6)$ can be represented using Q28 (4 bits for the integer part, 28 bits for the fractional part). - If you want to include an extra guard bit to protect against overflow, use Q27 instead. - A variable with a range of $[10, -10)$ can be represented in Q27. The idea is to fit each variable's range into the number of bits available while keeping track of the precision. The `sqrtf` function, which computes the square root using Newton-Raphson iterations, will require careful handling when transitioning to fixed-point. Here is how you can approach it: 1. Initial Floating-Point Prototype: Before converting to fixed-point, the algorithm must be simulated and tested using floating-point arithmetic, ensuring all edge cases (e.g., small inputs, negative values, large values) are covered. 2. Range Determination: For the `sqrtf` implementation: - The mantissa values typically fall in a normalized range (e.g., between 1 and 2). - The reciprocal square root approximation will also fall within a specific range, depending on the input data. - You need to simulate the maximum possible input to estimate the worst-case range for both the mantissa and the square root approximation. 3. Q Format Assignment: For example: - The mantissa might be assigned a Q27 format, giving sufficient precision while covering the expected range. - The reciprocal square root approximation can be in Q24 or Q27, depending on the accuracy required for the Newton-Raphson iterations. 4. Handling Iterations and Adjustments: - During the Newton-Raphson iterations, maintain the Q format for all intermediate results. - Operations like multiplication will require adjustment of the Q format (e.g., a multiplication in Q27 may need a shift right to adjust back to Q27). 5. Final Rounding: After the iterations, ensure that any remaining error is properly rounded, using appropriate bit shifts to bring the result back into the desired format. 6. Guard Bits for Clipping: In cases where the algorithm processes large or near-boundary values, include guard bits to protect against overflow, ensuring that the results remain within range without excessive clipping. By following these steps, you can methodically implement the fixed-point version of `sqrtf`, ensuring precision while handling the constraints of the fixed-point format. Q-format (or Q notation) is a way to represent fixed-point numbers. It allows fractional numbers to be stored using integers by implicitly representing the fraction through bit positions. This is often used in digital signal processing (DSP), embedded systems, and other performance-constrained applications where floating-point operations are too costly in terms of speed or resources. In Q-format, the number is represented as a signed or unsigned integer, and the fractional part is encoded by assuming that the lower bits of the integer represent the fraction. The format is denoted as `Qm.n`, where: - $m$: Number of bits used for the integer (or whole part). - $n$: Number of bits used for the fractional part. For example, in a 32-bit number represented as `Q4.28`, 4 bits are reserved for the integer part, and 28 bits are reserved for the fractional part. Key Concepts of Q-Format 1. Fixed-Point Representation: A fixed-point number is essentially an integer that represents a scaled real number. The scaling factor is typically a power of two, so the decimal point is assumed to be "fixed" at a certain position in the binary representation. 2. Q-Format Representation: The general Q-format, denoted as **Qm.n**, divides a word (32-bit or 16-bit) into an integer part and a fractional part: - The $m$ bits represent the integer part. - The $n$ bits represent the fractional part. For instance: - `Q4.28`: Uses 4 bits for the integer part and 28 bits for the fractional part, providing a high level of precision for small numbers. - `Q15.16`: Uses 15 bits for the integer part and 16 bits for the fractional part, offering a balance between integer range and fractional precision. Assume we are using a 32-bit signed integer to represent a number in `Q15.16` format: - 15 bits for the integer part, including the sign bit (most significant bit). - 16 bits for the fractional part. In this case, a 32-bit integer $N$ in `Q15.16` format is interpreted as: $$ N = I + F \times 2^{-16} $$ Where: - $I$ is the integer value (stored in the upper 16 bits). - $F$ is the fractional value (stored in the lower 16 bits). For example, the number 3.25 would be stored as: $$ Binary: 0000 0000 0000 0011.0100 0000 0000 0000 $$ - The integer part (3) is stored in the upper 16 bits. - The fractional part (0.25) is stored as $0.25 \times 2^{16} = 16384$ in the lower 16 bits. Scaling and Range in Q-Format - Scaling: The real value represented by a fixed-point number is obtained by dividing the integer by $2^n$, where $n$ is the number of fractional bits. For example, in Q15.16 format, dividing by $2^{16}$ gives the real number. - Range: The range of values that can be represented in Q-format depends on the number of integer bits $m$ and whether the format is signed or unsigned: - Signed `Qm.n`: For signed numbers, the range is $-2^m \leq x < 2^m$. For example, in Q15.16, the range is $-32768 \leq x < 32768$. - Unsigned `Qm.n`: For unsigned numbers, the range is $0 \leq x < 2^m$. For example, in Q16.16, the range is $0 \leq x < 65536$. When multiplying two numbers in Q-format, the result will have twice the fractional bits (Qn × Qn = Q2n). After the multiplication, you will need to right-shift by $n$ bits to bring the result back into the original Q-format. For example, in `Q15.16`: $$ Result = (Num1 \times Num2) >> 16 $$ In the `sqrtf` implementation (for square root approximation), using Q-format would require careful handling of variables: - Mantissa: You could represent the mantissa in **Q27** format, giving enough precision for the initial reciprocal square root approximation. - Newton-Raphson Iterations: The iterative process of squaring and correcting would need careful management of Q-formats (e.g., after multiplication, shift the result appropriately). By ensuring the correct scaling (shifts) during operations, the accuracy of the algorithm can be preserved while transitioning from floating-point to fixed-point representation. You are now tasked with implementing the RISC-V assembly code to replace [sqrtf](https://man7.org/linux/man-pages/man3/sqrt.3.html). You should use RV32I instructions and the `MUL` and `bseti` instruction mentioned above. Code: ```c // The square root routine starts with an initial approximation of the // reciprocal of the square root based on the top four bits of the // mantissa (potentially shifted to make the exponent even). Two // Newton-Raphson iterations are performed, providing approximately // 14 bits of accuracy. The reciprocal is multiplied by the original // argument to give an approximation of the square root, also with // around 14 bits of accuracy. The remainder is then calculated and // multiplied by the reciprocal estimate to generate a correction term, // resulting in an accuracy of about 28 bits. Finally, a remainder check // ensures correct rounding if necessary. The fixed-point calculation // carefully maintains accuracy, similar to the considerations made in // the fast division routine. The reciprocal square root calculation has // been thoroughly tested with all possible input mantissa values // (potentially shifted). .global sqrtf sqrtf: // Check if the argument is negative. bltz a0, sq_0 // If negative, jump to sq_0. // Extract mantissa (23+1 bits) without the exponent and add the // implied hidden bit '1' -> A1. slli a1, a0, 8 // Shift mantissa left by 8 bits. bseti a1, a1, 31 // Set the implied hidden bit '1'. #C01 // Restore mantissa to its original position. // Extract the exponent (8 bits, without the sign bit) -> A2. srli a2, a0, 23 // Extract exponent and store it in A2. // Check if the number is zero. beqz a2, sq_2 // If exponent is zero, jump to sq_2. // Check if the number is infinity. li a4, 255 // Load the exponent value for infinity. #C02 // If exponent equals 255, jump to sq_1. // Pre-correction for packing - add 125 to the exponent. add a2, a2, 125 // Adjust exponent for packing. // Halve the exponent (square root operation halves the exponent) - if // odd, double the mantissa. slli a4, a2, 31 // Save the lowest bit of the exponent in A4. #C03 // Divide the exponent by 2. beqz a4, 1f // If exponent is even, jump to label 1. slli a1, a1, 1 // If exponent was odd, double the mantissa. // Retrieve the first approximation from the table based on the top 4 // bits of the mantissa. 1: lui a4, %hi(rsqrtapp-4) // Load the high address of the table. #C04 // Extract the top 4 bits of the mantissa. add a4, a4, a3 // Compute the table address. lbu a4, %lo(rsqrtapp-4)(a4) // Load the initial approximation. // Perform the first Newton-Raphson iteration. srli a0, a1, 7 // Shift right to prepare for iteration. mul a0, a0, a4 // Multiply m0 by y (Q24 result). mul a0, a0, a4 // Square the result (Q32). #C05 // Scale back to Q20. mul a0, a0, a4 // Multiply by m0 again (dy0 Q28). srai a0, a0, 13 // Scale back to Q15. slli a4, a4, 8 // Scale m0 to Q16. sub a4, a4, a0 // Subtract dy0/2 from m0 (m1 Q16). addi a4, a4, 170 // Apply correction to gain ~1 bit accuracy. // Perform the second Newton-Raphson iteration. mv a0, a4 // Prepare for the second iteration. #C06 // Square m1 (Q32). #C07 // Scale back to Q17. srli a3, a1, 8 // Prepare y (Q15). mul a0, a0, a3 // Multiply m1*m1 by y (Q32). srai a0, a0, 12 // Scale back to Q20. mul a0, a0, a4 // Multiply by m1 again (dy1 Q36). #C08 // Scale back to Q15. sub a4, a4, a0 // Subtract dy1/2 from m1 (m2 Q16). mul a3, a3, a4 // Multiply y by m2 (Q31). srli a3, a3, 15 // Scale back to Q16. // At this point, m2 is the reciprocal square root approximation, and // m3 is the square root approximation. mv a0, a3 // Move m3 to a0. mul a0, a0, a0 // Square m3 (Q32). #C09 // Scale y to Q32. sub a0, a1, a0 // Calculate the remainder (a2 = y - m3*m3). #C10 // Scale remainder to Q27. mul a4, a4, a0 // Multiply a2 by m2 (Q43). slli a3, a3, 7 // Scale m3 to Q23. srai a0, a4, 15 // Scale the result (a2*m2 Q28). #C11 // Apply rounding to Q24. srai a0, a0, 6 // Scale back to Q22. // Perform final rounding. add a3, a3, a0 // Add the correction term to m4 (Q23). #C12 // Check for overflow. beqz a4, sq_3 // If no overflow, skip rounding. add a4, a4, a3 // Add 0.5 ULP to m4 (Q24). mul a4, a4, a4 // Square m4 (Q48). #C13 // Scale y to Q48. sub a1, a1, a4 // Calculate the remainder (Q48). bltz a1, sq_3 // If negative, skip. add a3, a3, 1 // Round up. // Pack the final result. sq_3: #C14 // Pack the exponent. add a0, a2, a3 // Combine the exponent and mantissa. sq_6: ret // Return the result. // Handle negative argument. sq_0: slli a1, a0, 1 // Clear the sign bit. srli a1, a1, 24 // Check if exponent is zero. beqz a1, sq_2 // If -0, return it. // Return -Inf for negative non-zero values. srai a0, a0, 31 sq_5: slli a0, a0, 23 ret // Return +Inf for positive infinity. sq_1: srli a0, a0, 23 j sq_5 // Return ±zero. sq_2: srli a0, a0, 31 slli a0, a0, 31 ret // Lookup table for reciprocal square root approximations. rsqrtapp: .byte 0xf1, 0xda, 0xc9, 0xbb, 0xb0, 0xa6, 0x9e, 0x97, 0x91, 0x8b, 0x86, 0x82 ``` Pseudoinstructions are not allowed in your answers. C01 = ==`srli a1, a1, 8`== C02 = ==`beq a2, a4, sq_1`== C03 = ==`srai a2, a2, 1`== C04 = ==`srli a3, a1, 21`== C05 = ==`srai a0, a0, 12`== C06 = ==`mul a0, a0, a0`== C07 = ==`srli a0, a0, 15`== C08 = ==`srai a0, a0, 21`== C09 = ==`slli a1, a1, 9`== C10 = ==`srai a0, a0, 5`== C11 = ==`addi a0, a0, 16 `== C12 = ==`sltu a4, a3, a0`== C13 = ==`slli a1, a1, 16`== C14 = ==`slli a2, a2, 23`== --- ## Problem `D` What is the logical expression for the circuit shown below? __ #D01 __ (Hint: You can derive the expression using a truth table.). ![circuit](https://hackmd.io/_uploads/BkbFY8VgJg.png) * D01 = ==`Out = (¬A + ¬B)¬C`== > Alternative: ![answer](https://hackmd.io/_uploads/BycotU4e1g.png =50%x) Below is a state transition diagram of a finite state machine (FSM) with 4 states. ![FSM](https://hackmd.io/_uploads/BJ8bjL4x1l.png) Given the input bit sequence 10011010 to the FSM, the output is __ `#D02` __ * D02 = ==00101000== In which state does the FSM end? __ `#D03` __ * D03 = ==A== Assume that the 1-bit full adder is implemented as shown in the figure below. The delay at each gate is 1ps. What is the propagation delay of the 1-bit full adder? __ `#D04` __ ps ![full adder](https://hackmd.io/_uploads/HJfS1wNlyx.png) * D04 = ==3== Consider the following circuit. ![circuit](https://hackmd.io/_uploads/r10NlwVx1l.png) Each square-shaped component represents a 1-bit full adder: $A$ and $B$ are its two 1-bit inputs, $C_{in}$ is the 1-bit carry-in, $\Sigma$ is the 1-bit output, and $C_{out}$ is the 1-bit carry-out. $A_3, A_2, A_1, A_0$ and $B_3, B_2, B_1, B_0$ are all 1-bit inputs. $P_3, P_2, P_1, P_0$ are the outputs of the four XOR gates. $\Sigma_3, \Sigma_2, \Sigma_1, \Sigma_0$ are the 1-bit outputs of the four full adders. What is the total propagation delay of the entire circuit shown above, assuming that the XOR gates outside the full adder also have a delay of 1 ps? __ `#D05` __ * D05 = ==9== > the critical path is > B0 -(1 gate)- P0 > P0 -(3 gate)- Cout0 > Cout0 -(2 gate)- Cout1 > Cout1 -(2 gate)- Cout2 > Cout2 -(1 gate)- Σ3 --- ## Problem `E` The following circuit will be analyzed: ![circuit](https://hackmd.io/_uploads/rJXpGDVx1l.png) Consider the following information: - AND gates have a propagation delay of 9 ns - OR gates have a propagation delay of 14 ns - NOT gates have a propagation delay of 5 ns - The $x_{input}$ switches value (i.e., from 1 to 0, or 0 to 1) 30 ns after the rising edge of the clock - The $y_{output}$ is directly connected to a register - Setup time is 3 ns - Clock-to-Q delay time is 4 ns What is the minimum clock period in nanoseconds? __ E01 __ ns * E01 = ==42== > Critical path = clk-to-q + longest CL + setup = 30ns for x_input to change (includes clk-to-q) + 9 AND + 3 setup = 42 ns What is the max hold time in ns? __ E02 __ * E02 = ==18== > Shortest CL: NOT $\to$ AND = 5 + 9 = 14ns clk-to-q + shortest CL = 4ns + 14ns = 18ns Assume the clock period is 50 ns, with the first rising edge occurring at 25 ns, and $x_{input}$ initialized to 0 at 0 ns. At what time will $y_{output}$ become 1? __ E03 __ ns * E03 = ==102== > 25 + 50 (clock period) + 4 (clk-to-q) + 14 (OR) + 9 (AND) = 102 ns --- ## Problem `F` You are working on a new chip for an embedded application and want to create a new ISA. Frustrated with the various RISC-V instruction types, you decide to include only one universal instruction type: the X-type instruction. Consider the following instructions: * `add rd1, rs1, rs2` * `and rd1, rs1, rs2` * `lw rd1, offset1(rs1)` * `sw rs2, offset1(rs1)` * `addi rd1, rs1, imm1` * `beq rs1, rs2, offset1` * `lui rd1, offset1` * `jal rd1, imm` * `stw rs3, offset1, offset2(rs1)` (new instruction) The new `stw` instruction stores the contents of `rs3` into both `rs1 + offset1` and `rs1 + offset2`. C-like description of the `stw` instruction: ```c Mem[R[rs1] + offset1] = R[rs3]; /* Store rs3 into memory at address rs1 + offset1 */ Mem[R[rs1] + offset2] = R[rs3]; /* Store rs3 into memory at address rs1 + offset2 */ ``` You want to eliminate the funct3 and funct7 fields, using only an opcode. If we only need to support the instructions listed above, what is the minimum number of bits required for the opcode field? __ F01 __ * F01 = ==4== > We have 9 instructions, so we need 4 bits to represent them. We need the ability to jump up to 64 KiB in either direction with a single instruction. How many bits are required to encode an immediate that allows this? __ F02 __ bits. Assume, similar to RV32, that the least significant bit is implicitly 0 and not stored in the instruction. Note: `jal` is the only jump instruction provided, so `jal` will determine the size of the immediate field. * F02 = ==16== > Recall that, in RISC-V, the immediates encoded in instructions work in units of half-instructions, so jumping up to 64 KiB in either direction is the same as saying we want to jump up to $2^{15}$ half instructions in both directions. Since immediates are signed, we need 16 bits to represent this range of values.