Quiz3 of Computer Architecture (2024 Fall)
Solutions
Problem A
Consider a RISC-V program that outputs the "Hello World!" message. Its assembly code is shown below.
The content of this program is displayed using the hexdump tool.
You are tasked with writing a simplified RISC-V instruction set simulator capable of running the program above. The simulator should be able to fetch, decode, and execute a subset of RISC-V instructions. The partial implementation is shown below, and you are required to complete the code.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define fatal(args...) \
do { \
printf("Fatal: "); \
printf(args); \
exit(-1); \
} while (0)
enum {
INTEGER_COMP_RI = 0x13,
LUI = #A01 ,
AUIPC = #A02 ,
ECALL = 0x73,
};
enum {
ADDI = 0x0,
};
enum { OPCODE_MASK = 0x7F, REG_ADDR_MASK = 0x1F };
enum {
RD_SHIFT = #A03 ,
RS1_SHIFT = #A04 ,
RS2_SHIFT = 20,
FUNCT3_SHIFT = #A05 ,
FUNCT7_SHIFT = 25
};
enum { FUNCT3_MASK = 0x7, FUNCT7_MASK = 0x7F };
enum {
RAM_SIZE = 1204 * 1024 * 2,
RAM_BASE = 0x0,
};
typedef struct {
uint8_t *mem;
} ram_t;
typedef struct {
uint32_t regs[32];
size_t pc;
ram_t *ram;
} cpu_t;
typedef struct {
uint8_t opcode;
uint8_t rd, rs1, rs2, funct3, funct7;
} insn_t;
ram_t *ram_new(uint8_t *code, size_t len)
{
ram_t *ram = malloc(sizeof(ram_t));
ram->mem = malloc(RAM_SIZE);
memcpy(ram->mem, code, len);
return ram;
}
void ram_free(ram_t *mem)
{
free(mem->mem);
free(mem);
}
static inline void check_addr(int32_t addr)
{
size_t index = addr - RAM_BASE;
if (index >= RAM_SIZE) fatal("out of memory.\n");
}
static uint32_t ram_load8(ram_t *mem, uint32_t addr)
{
size_t index = addr - RAM_BASE;
return (uint32_t) mem->mem[index];
}
static uint32_t ram_load16(ram_t *mem, uint32_t addr)
{
size_t index = addr - RAM_BASE;
return ((uint32_t) mem->mem[index] | ((uint32_t) mem->mem[index + 1] << 8));
}
static uint32_t ram_load32(ram_t *mem, uint32_t addr)
{
size_t index = addr - RAM_BASE;
return ((uint32_t) mem->mem[index] | ((uint32_t) mem->mem[index + 1] << 8) |
((uint32_t) mem->mem[index + 2] << 16) |
((uint32_t) mem->mem[index + 3] << 24));
}
uint32_t ram_load(ram_t *mem, uint32_t addr, uint8_t size)
{
check_addr(addr);
uint32_t r = 0;
switch (size) {
case 8: r = ram_load8(mem, addr); break;
case 16: r = ram_load16(mem, addr); break;
case 32: r = ram_load32(mem, addr); break;
default: fatal("RAM: invalid load size(%d)\n", size);
}
return r;
}
cpu_t *cpu_new(uint8_t *code, size_t len)
{
cpu_t *cpu = malloc(sizeof(cpu_t));
memset(cpu->regs, 0, sizeof(cpu->regs));
cpu->regs[0] = 0;
cpu->regs[2] = RAM_BASE + RAM_SIZE;
cpu->pc = RAM_BASE;
cpu->ram = ram_new(code, len);
return cpu;
}
void cpu_free(cpu_t *cpu)
{
ram_free(cpu->ram);
free(cpu);
}
uint32_t cpu_load(cpu_t *cpu, uint32_t addr, uint8_t size)
{
return ram_load(cpu->ram, addr, size);
}
uint32_t cpu_fetch(cpu_t *cpu) { return cpu_load(cpu, cpu->pc, 32); }
void cpu_decode(uint32_t raw, insn_t *inst)
{
uint8_t opcode = raw & OPCODE_MASK;
uint8_t rd = (raw >> RD_SHIFT) & REG_ADDR_MASK;
uint8_t rs1 = (raw >> RS1_SHIFT) & REG_ADDR_MASK;
uint8_t rs2 = (raw >> RS2_SHIFT) & REG_ADDR_MASK;
uint8_t funct3 = (raw >> FUNCT3_SHIFT) & FUNCT3_MASK;
uint8_t funct7 = (raw >> FUNCT7_SHIFT) & FUNCT7_MASK;
inst->opcode = opcode;
inst->rd = rd, inst->rs1 = rs1, inst->rs2 = rs2;
inst->funct3 = funct3;
inst->funct7 = funct7;
}
static inline int32_t i_imm(uint32_t raw)
{
return ((int32_t) raw) >> #A06 ;
}
static inline int32_t u_imm(uint32_t raw)
{
return ((int32_t) raw & 0xFFFFF000);
}
enum { SYSCALL_WRITE = 64, SYSCALL_EXIT = 93 };
enum { STDOUT = 1, STDERR = 2 };
void ecall_handler(cpu_t *cpu)
{
uint32_t syscall_nr = cpu->regs[17];
switch (syscall_nr) {
case SYSCALL_WRITE: {
uint32_t fd = cpu->regs[10];
uint32_t addr = cpu->regs[11];
uint32_t count = cpu->regs[12];
FILE *stream;
switch (fd) {
case STDOUT: stream = stdout; break;
case STDERR: stream = stderr; break;
default: fatal("invalid file-descriptor %d.\n", fd);
}
for (uint32_t i = 0; i < count; i++)
fprintf(stream, "%c", cpu_load(cpu, addr + i, 8));
break;
}
case SYSCALL_EXIT: {
int32_t exit_code = cpu->regs[10];
exit(exit_code);
break;
}
default: fatal("unkown syscall number %d.\n", syscall_nr);
}
}
static insn_t inst;
void cpu_execute(cpu_t *cpu, uint32_t raw)
{
cpu_decode(raw, &inst);
cpu->regs[0] = 0;
switch (inst.opcode) {
case INTEGER_COMP_RI: {
int32_t imm = i_imm(raw);
switch (inst.funct3) {
case ADDI: cpu->regs[inst.rd] = cpu->regs[inst.rs1] + imm; break;
default: fatal("Unknown FUNCT3 for INTEGER_COMP_RI");
}
break;
}
case LUI: {
int32_t imm = u_imm(raw);
cpu->regs[inst.rd] = imm;
break;
}
case AUIPC: {
int32_t imm = u_imm(raw);
cpu->regs[inst.rd] = (cpu->pc + (#A07 )) + imm;
break;
}
case ECALL:
if (raw == 0x100073) break;
ecall_handler(cpu);
break;
default: fatal("Illegal Instruction 0x%x\n", inst.opcode);
}
}
int main(int argc, char *argv[])
{
if (argc < 2) {
printf("Usage: %s [filename]\n", argv[0]);
exit(-1);
}
const char *filename = argv[1];
FILE *file = fopen(filename, "rb");
if (!file) {
printf("Failed to open %s\n", filename);
exit(-1);
}
fseek(file, 0L, SEEK_END);
size_t code_size = ftell(file);
rewind(file);
uint8_t *code = malloc(code_size);
fread(code, sizeof(uint8_t), code_size, file);
fclose(file);
cpu_t *cpu = cpu_new(code, code_size);
while (1) {
uint32_t raw = cpu_fetch(cpu);
cpu->pc += #A08 ;
if (cpu->pc > code_size) break;
cpu_execute(cpu, raw);
if (!cpu->pc) break;
}
cpu_free(cpu);
return 0;
}
The values for #A01
and #A02
must be expressed as hexadecimal integer literals in their shortest form. For example, use 0x7C00 rather than 0x007C00. The values for #A03
, #A04
, #A05
, #A06
, #A07
, and #A08
must be expressed as decimal integer literals in their shortest form. In the decimal system, negative integers are simply indicated by placing a -
sign in front of the number (e.g., -5
).
A01 = 0x37
A02 = 0x17
A03 = 7
A04 = 15
A05 = 12
A06 = 20
A07 = -4
A08 = 4
Problem B
Booth's multiplication algorithm is a method used to multiply two signed binary numbers in two's complement notation. This algorithm examines adjacent pairs of bits in the 'N'-bit multiplier , which is represented in two's complement form, with an implicit bit below the least significant bit ().
For each bit (where runs from 0 to ), the algorithm considers the current bit and the previous bit . The actions taken are as follows:
- If and are equal, no changes are made to the product accumulator .
- If and , the multiplicand multiplied by is added to .
- If and , the multiplicand multiplied by is subtracted from .
The final value of represents the signed product of the multiplication.
Booth's algorithm does not specify how the multiplicand and product are represented, though they are typically in two's complement notation like the multiplier. However, the algorithm can work with any number system that supports addition and subtraction. The steps generally proceed from the least significant bit (LSB) to the most significant bit (MSB), starting at . Instead of directly multiplying by , the algorithm typically shifts the accumulator to the right between steps. This allows low bits to be shifted out, and subsequent operations (additions and subtractions) can focus on the highest bits of .
The algorithm effectively converts sequences of 1s in the multiplier into a high-order +1 and a low-order -1 at the boundaries of the sequence. When a sequence extends through the MSB, there is no high-order +1, and the effect is interpreted as the negative of the corresponding value.
The following code is an implementation of Booth's multiplication algorithm, written in C99.
#include <stdbool.h>
#include <stdint.h>
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
typedef int64_t s64;
static inline u64 mask(int lo, int hi)
{
u64 size = hi - lo;
return ((1ull << size) - 1) << lo;
}
static inline bool bit(u64 x, int n) { return (x >> n) & 1; }
static inline u64 sign_extend(u64 value, int from_size, int to_size)
{
bool negative = bit(value, from_size - 1);
if (negative) value |= mask(from_size, to_size);
return value;
};
static inline u64 asr(u64 value, int shift, int size)
{
s64 sign_extended = (s64) sign_extend(value, size, 64);
sign_extended >>= shift;
return sign_extended & mask(0, size);
}
typedef u8 booth_chunk_t;
typedef struct {
u64 recoded_out;
bool carry;
} booth_recording_out_t;
typedef struct {
booth_recording_out_t m[4];
} recorded_multiplicands_t;
booth_recording_out_t booth_recode(u64 input, booth_chunk_t chunk)
{
booth_recording_out_t out;
switch (chunk) {
case 0: out = (booth_recording_out_t) {0, 0}; break;
case 1: out = (booth_recording_out_t) {input, 0}; break;
case 2: out = (booth_recording_out_t) {input, 0}; break;
case 3: out = (booth_recording_out_t) {2 * input, 0}; break;
case 4: out = (booth_recording_out_t) {~(2 * input), 1}; break;
case 5: out = (booth_recording_out_t) {~input, 1}; break;
case 6: out = (booth_recording_out_t) {~input, 1}; break;
case 7: out = (booth_recording_out_t) {0, 0}; break;
}
out.recoded_out &= 0x3FFFFFFFFULL;
return out;
}
typedef struct {
u64 out;
u64 carry;
} csa_out_t;
csa_out_t perform_csa(u64 a, u64 b, u64 c)
{
u64 out = a ^ b ^ c;
u64 carry = (a & b) | (b & c) | (c & a);
return (csa_out_t) {out, carry};
}
u64 acc_shift_register = 0;
csa_out_t perform_csa_array(u64 partial_sum,
u64 partial_carry,
recorded_multiplicands_t addends)
{
csa_out_t csa_out = {partial_sum, partial_carry};
csa_out_t final_csa_out = {0, 0};
for (int i = 0; i < 4; i++) {
csa_out.out &= 0x1FFFFFFFFULL;
csa_out.carry &= 0x1FFFFFFFFULL;
csa_out_t result =
perform_csa(csa_out.out, addends.m[i].recoded_out & 0x1FFFFFFFFULL,
csa_out.carry);
result.carry <<= 1;
result.carry |= addends.m[i].carry;
final_csa_out.out |= (result.out & 3) << (2 * i);
final_csa_out.carry |= (result.carry & 3) << (2 * i);
result.out >>= 2;
result.carry >>= 2;
u64 magic = bit(acc_shift_register, #B01) +
!bit(csa_out.carry, #B02) +
!bit(addends.m[i].recoded_out, #B03);
result.out |= magic << 31;
result.carry |= (u64) !bit(acc_shift_register, 1) << 32;
acc_shift_register >>= 2;
csa_out = result;
}
final_csa_out.out |= csa_out.out << 8;
final_csa_out.carry |= csa_out.carry << 8;
return final_csa_out;
}
recorded_multiplicands_t get_recoded_multiplicands(u64 multiplicand,
u64 multiplier)
{
recorded_multiplicands_t rm;
for (int i = 0; i < 4; i++)
rm.m[i] = booth_recode(multiplicand, (multiplier >> (2 * i)) & 0b111);
return rm;
}
csa_out_t perform_one_cycle_of_booths_mutliplication(csa_out_t prev_out,
u64 multiplicand,
u64 multiplier)
{
recorded_multiplicands_t rm =
get_recoded_multiplicands(multiplicand, multiplier);
return perform_csa_array(prev_out.out, prev_out.carry, rm);
}
typedef enum {
SHORT,
LONG_SIGNED,
LONG_UNSIGNED,
} mul_flavor_t;
bool is_long(mul_flavor_t flavor)
{
return flavor == LONG_SIGNED || flavor == LONG_UNSIGNED;
}
bool is_signed(mul_flavor_t flavor)
{
return flavor == LONG_SIGNED || flavor == SHORT;
}
bool should_terminate(u64 multiplier, mul_flavor_t flavor)
{
if (is_signed(flavor)) return multiplier == 0x1FFFFFFFF || multiplier == 0;
return multiplier == 0;
}
typedef struct {
u32 out;
bool carry;
} adder_out_t;
adder_out_t adder(u32 a, u32 b, bool carry)
{
u32 out = a + b + carry;
bool overflow = out < a || out < b;
return (adder_out_t) {out, overflow};
}
typedef struct {
u64 out;
bool carry;
} mul_out_t;
struct u128 {
u64 lo, hi;
};
static inline struct u128 u128_ror(struct u128 input, int shift)
{
return (struct u128) {
(input.lo >> shift) | (input.hi << (64 - shift)),
(input.hi >> shift) | (input.lo << (64 - shift)),
};
}
mul_out_t booths_multiplication(mul_flavor_t flavor,
u64 multiplicand,
u64 multiplier,
u64 accumulator)
{
csa_out_t csa_out = {0, 0};
bool alu_carry_in = multiplier & 1;
if (is_signed(flavor))
multiplier = sign_extend(multiplier, 32, 34);
else
multiplier = multiplier & 0x1FFFFFFFFull;
if (is_signed(flavor))
multiplicand = sign_extend(multiplicand, 32, 34);
else
multiplicand = multiplicand & 0x1FFFFFFFFull;
csa_out.carry = (multiplier & 1) ? ~(multiplicand) : 0;
csa_out.out = accumulator;
acc_shift_register = accumulator >> #B04;
struct u128 partial_sum = {0, 0};
struct u128 partial_carry = {0, 0};
partial_sum.lo = csa_out.out & 1;
partial_carry.lo = csa_out.carry & 1;
csa_out.out >>= 1;
csa_out.carry >>= 1;
partial_sum = u128_ror(partial_sum, 1);
partial_carry = u128_ror(partial_carry, 1);
int n_iterations = 0;
do {
csa_out = #B05;
partial_sum.lo |= csa_out.out & 0xFF;
partial_carry.lo |= csa_out.carry & 0xFF;
csa_out.out >>= 8;
csa_out.carry >>= 8;
partial_sum = u128_ror(partial_sum, 8);
partial_carry = u128_ror(partial_carry, 8);
multiplier = asr(multiplier, 8, 33);
n_iterations++;
} while (!should_terminate(multiplier, flavor));
partial_sum.lo |= csa_out.out;
partial_carry.lo |= csa_out.carry;
int correction_ror;
if (n_iterations == 1) correction_ror = #B06;
if (n_iterations == 2) correction_ror = #B07;
if (n_iterations == 3) correction_ror = #B08;
if (n_iterations == 4) correction_ror = #B09;
partial_sum = u128_ror(partial_sum, correction_ror);
partial_carry = u128_ror(partial_carry, correction_ror);
if (is_long(flavor)) {
if (n_iterations == 4) {
adder_out_t adder_out_lo =
adder(partial_sum.hi, partial_carry.hi, alu_carry_in);
adder_out_t adder_out_hi =
adder(partial_sum.hi >> 32, partial_carry.hi >> 32,
adder_out_lo.carry);
return (mul_out_t) {
((u64) adder_out_hi.out << 32) | adder_out_lo.out,
(partial_carry.hi >> 63) & 1};
}
adder_out_t adder_out_lo =
adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in);
int shift_amount = 1 + 8 * n_iterations;
shift_amount++;
partial_carry.lo = #B10;
partial_sum.lo |= acc_shift_register << (shift_amount);
adder_out_t adder_out_hi = #B11;
return (mul_out_t) {((u64) adder_out_hi.out << 32) | adder_out_lo.out,
(partial_carry.hi >> 63) & 1};
}
if (n_iterations == 4) {
adder_out_t adder_out = #B11;
return (mul_out_t) {adder_out.out, (partial_carry.hi >> 31) & 1};
}
adder_out_t adder_out =
adder(partial_sum.hi >> 32, partial_carry.hi >> 32, alu_carry_in);
return (mul_out_t) {adder_out.out, (partial_carry.hi >> 63) & 1};
}
mul_out_t mul(u32 rm , u32 rs )
{
return booths_multiplication(SHORT, rm, rs, 0);
}
mul_out_t mla(u32 rm, u32 rs, u32 rn )
{
return booths_multiplication(SHORT, rm, rs, rn);
}
mul_out_t umull(u32 rm, u32 rs)
{
return booths_multiplication(LONG_UNSIGNED, rm, rs, 0);
}
mul_out_t umlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs)
{
return booths_multiplication(LONG_UNSIGNED, rm, rs,
((u64) rdhi << 32) | (u64) rdlo);
}
mul_out_t smull(u32 rm, u32 rs)
{
return booths_multiplication(LONG_SIGNED, rm, rs, 0);
}
mul_out_t smlal(u32 rdlo, u32 rdhi, u32 rm, u32 rs)
{
return booths_multiplication(LONG_SIGNED, rm, rs,
(u64) rdhi << 32 | (u64) rdlo);
}
The corresponding test code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
unsigned int get_rand_32()
{
retry:
switch (rand() & 0b1111) {
case 0: return rand() & 0xFF;
case 1: return rand() & 0xFFFF;
case 2: return rand() & 0xFFFFFF;
case 3: return 0xFFFFFFFF - (rand() & 0xFF);
case 4: return 0xFFFFFFFF - (rand() & 0xFFFF);
case 5: return 0xFFFFFFFF - (rand() & 0xFFFFFF);
case 6: return 0xAAAAAAAA & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16);
case 7: return 0x55555555 & (rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16);
case 8: return 0;
case 9: return ((rand() & 0xFFFF) | ((rand() & 0xFFFF) << 16));
default: goto retry;
}
}
bool guess_mul_zero(uint32_t x)
{
if (x >> 8 == 0xFFFFFF) {
unsigned int masked_x = x & 0xFF;
if (masked_x >= 0xC0) return false;
return (masked_x & 0x55) != 0;
}
if (x >> 16 == 0xFFFF) {
unsigned int masked_x = x & 0xFFFF;
if (masked_x >= 0xC000) return false;
return (masked_x & 0x5555) != 0;
}
if (x >> 24 == 0xFF) {
unsigned int masked_x = x & 0xFFFFFF;
if (masked_x >= 0xC00000) return false;
return (masked_x & 0x555555) != 0;
}
else {
return (x >> 30) == 2;
}
}
int main()
{
srand(time(NULL));
printf("Running mul carry regression tests...\n");
for (int i = 0; i < 256; i++) {
unsigned int multiplicand = 0;
unsigned int multiplier = 0xFFFFFFFF - i;
unsigned int guess = mul(multiplicand, multiplier).carry;
unsigned int actual = guess_mul_zero(multiplier);
if (guess != actual) {
printf(
"[MUL CARRY REGRESSION] Failed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
return 1;
}
printf("[MUL CARRY REGRESSION] Passed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
}
printf("Running mul regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
unsigned int guess = mul(multiplicand, multiplier).out;
unsigned int actual =
(unsigned int) multiplicand * (unsigned int) multiplier;
if (guess != actual) {
printf("[MUL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
return 1;
}
printf("[MUL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
}
printf("Running mla regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
unsigned int accumulate = get_rand_32();
unsigned int guess = mla(multiplicand, multiplier, accumulate).out;
unsigned int actual =
(unsigned int) multiplicand * (unsigned int) multiplier +
(unsigned int) accumulate;
if (guess != actual) {
printf(
"[MLA REGRESSION] Failed: %llx * %llx + %llx = %llx, got "
"%llx\n",
multiplicand, multiplier, accumulate, actual, guess);
return 1;
}
}
printf("Running umull regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
unsigned int accumulate = get_rand_32();
unsigned long long guess = umull(multiplicand, multiplier).out;
unsigned long long actual =
(unsigned long long) multiplicand * (unsigned long long) multiplier;
if (guess != actual) {
printf(
"[UMULL REGRESSION] Failed #%d: %llx * %llx = %llx, got %llx\n",
i, multiplicand, multiplier, actual, guess);
return 1;
}
printf("[UMULL REGRESSION] Passed #%d: %llx * %llx = %llx, got %llx\n",
i, multiplicand, multiplier, actual, guess);
}
printf("Running umlal regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
unsigned int accumulate = get_rand_32();
unsigned int accumulate2 = get_rand_32();
unsigned long long guess =
umlal(accumulate, accumulate2, multiplicand, multiplier).out;
unsigned long long actual_acc =
(unsigned long long) accumulate +
((unsigned long long) accumulate2 << 32);
unsigned long long actual = (unsigned long long) multiplicand *
(unsigned long long) multiplier +
actual_acc;
if (guess != actual) {
printf(
"[UMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got "
"%llx\n",
multiplicand, multiplier, actual_acc, actual, guess);
return 1;
}
printf(
"[UMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got "
"%llx\n",
multiplicand, multiplier, actual_acc, actual, guess);
}
printf("Running smull regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
long long guess = smull(multiplicand, multiplier).out;
long long actual =
(long long) (int) multiplicand * (long long) (int) multiplier;
if (guess != actual) {
printf("[SMULL REGRESSION] Failed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
return 1;
}
printf("[SMULL REGRESSION] Passed: %llx * %llx = %llx, got %llx\n",
multiplicand, multiplier, actual, guess);
}
printf("Running smlal regression tests...\n");
for (int i = 0; i < 10000; i++) {
unsigned int multiplicand = get_rand_32();
unsigned int multiplier = get_rand_32();
unsigned int accumulate = get_rand_32();
unsigned int accumulate2 = get_rand_32();
long long guess =
smlal(accumulate, accumulate2, multiplicand, multiplier).out;
long long actual_acc =
(long long) accumulate + ((long long) accumulate2 << 32);
long long actual =
(long long) (int) multiplicand * (long long) (int) multiplier +
actual_acc;
if (guess != actual) {
printf(
"[SMLAL REGRESSION] Failed: %llx * %llx + %llx = %llx, got "
"%llx\n",
multiplicand, multiplier, actual_acc, actual, guess);
return 1;
}
printf(
"[SMLAL REGRESSION] Passed: %llx * %llx + %llx = %llx, got "
"%llx\n",
multiplicand, multiplier, actual_acc, actual, guess);
}
printf("All tests passed!\n");
}
The reference output:
Complete the code above to ensure it always passes the test cases.
The values for #B01, #B02, #B03, #B04, #B06, #B07, #B08, and #B09 must be expressed as decimal integer literals in their shortest form. #B05, #B10, and #B11 are valid C99 expressions without semicolons that invoke the helper functions defined in the code above.
B01 = 0
B02 = 32
B03 = 33
B04 = 34
B05 = perform_one_cycle_of_booths_mutliplication(csa_out, multiplicand, multiplier)
B06 = 23
B07 = 15
B08 = 7
B09 = 31
B10 = sign_extend(partial_carry.lo, shift_amount, 64)
B11 = adder(partial_sum.hi, partial_carry.hi, alu_carry_in)
Problem C
Besides the RV32I instructions, you are allowed to use the following multiply instruction introduced in the RISC-V M extension:
This instruction multiplies operands rs1
and rs2
and stores the lower 32 bits of the result in rd
.
Note that there are no signed or unsigned variants; it doesn’t matter whether rs1
and rs2
are signed or unsigned because the result is the same. You can easily verify this with a 2-bit × 2-bit example:
- Unsigned:
2'b11 × 2'b11 = 3 × 3 = 9 = 4'b1001
2'b01
- Signed:
2'b11 × 2'b11 = -1 × -1 = 1 = 4'b0001
2'b01
- Unsigned:
2'b01 × 2'b11 = 1 × 3 = 3 = 4'b0011
2'b11
- Signed:
2'b01 × 2'b11 = 1 × -1 = -1 = 4'b1111
2'b11
When the result is truncated to the same number of bits as the operands, the outcome is identical.
Mnemonic: bseti rd, rs1,shamt
The bseti
(Single-Bit Set) instruction sets a single bit in rs1
at the position specified by shamt
. The bit position is determined by the lower log2(XLEN) bits of shamt
. For RV32, encodings where shamt[5] = 1
are reserved.


Fixed-point arithmetic can be challenging, especially for complex algorithms or poorly behaved data. However, with careful preparation and testing, it can be implemented effectively. Below is a general approach to transitioning from floating-point to fixed-point arithmetic, using a 32-bit signed integer as the fixed-point data type.
For each variable, pick an appropriate Q format based on its range. The Q format defines the number of fractional bits used in a fixed-point number. Here are some examples:
- A variable with a range of can be represented using Q28 (4 bits for the integer part, 28 bits for the fractional part).
- If you want to include an extra guard bit to protect against overflow, use Q27 instead.
- A variable with a range of can be represented in Q27.
The idea is to fit each variable's range into the number of bits available while keeping track of the precision.
The sqrtf
function, which computes the square root using Newton-Raphson iterations, will require careful handling when transitioning to fixed-point. Here is how you can approach it:
- Initial Floating-Point Prototype:
Before converting to fixed-point, the algorithm must be simulated and tested using floating-point arithmetic, ensuring all edge cases (e.g., small inputs, negative values, large values) are covered.
- Range Determination:
For the sqrtf
implementation:
- The mantissa values typically fall in a normalized range (e.g., between 1 and 2).
- The reciprocal square root approximation will also fall within a specific range, depending on the input data.
- You need to simulate the maximum possible input to estimate the worst-case range for both the mantissa and the square root approximation.
- Q Format Assignment:
For example:
- The mantissa might be assigned a Q27 format, giving sufficient precision while covering the expected range.
- The reciprocal square root approximation can be in Q24 or Q27, depending on the accuracy required for the Newton-Raphson iterations.
- Handling Iterations and Adjustments:
- During the Newton-Raphson iterations, maintain the Q format for all intermediate results.
- Operations like multiplication will require adjustment of the Q format (e.g., a multiplication in Q27 may need a shift right to adjust back to Q27).
- Final Rounding:
After the iterations, ensure that any remaining error is properly rounded, using appropriate bit shifts to bring the result back into the desired format.
- Guard Bits for Clipping:
In cases where the algorithm processes large or near-boundary values, include guard bits to protect against overflow, ensuring that the results remain within range without excessive clipping.
By following these steps, you can methodically implement the fixed-point version of sqrtf
, ensuring precision while handling the constraints of the fixed-point format.
Q-format (or Q notation) is a way to represent fixed-point numbers. It allows fractional numbers to be stored using integers by implicitly representing the fraction through bit positions. This is often used in digital signal processing (DSP), embedded systems, and other performance-constrained applications where floating-point operations are too costly in terms of speed or resources.
In Q-format, the number is represented as a signed or unsigned integer, and the fractional part is encoded by assuming that the lower bits of the integer represent the fraction. The format is denoted as Qm.n
, where:
- : Number of bits used for the integer (or whole part).
- : Number of bits used for the fractional part.
For example, in a 32-bit number represented as Q4.28
, 4 bits are reserved for the integer part, and 28 bits are reserved for the fractional part.
Key Concepts of Q-Format
- Fixed-Point Representation:
A fixed-point number is essentially an integer that represents a scaled real number. The scaling factor is typically a power of two, so the decimal point is assumed to be "fixed" at a certain position in the binary representation.
- Q-Format Representation:
The general Q-format, denoted as Qm.n, divides a word (32-bit or 16-bit) into an integer part and a fractional part:
- The bits represent the integer part.
- The bits represent the fractional part.
For instance:
Q4.28
: Uses 4 bits for the integer part and 28 bits for the fractional part, providing a high level of precision for small numbers.
Q15.16
: Uses 15 bits for the integer part and 16 bits for the fractional part, offering a balance between integer range and fractional precision.
Assume we are using a 32-bit signed integer to represent a number in Q15.16
format:
- 15 bits for the integer part, including the sign bit (most significant bit).
- 16 bits for the fractional part.
In this case, a 32-bit integer in Q15.16
format is interpreted as:
Where:
- is the integer value (stored in the upper 16 bits).
- is the fractional value (stored in the lower 16 bits).
For example, the number 3.25 would be stored as:
- The integer part (3) is stored in the upper 16 bits.
- The fractional part (0.25) is stored as in the lower 16 bits.
Scaling and Range in Q-Format
- Scaling: The real value represented by a fixed-point number is obtained by dividing the integer by , where is the number of fractional bits. For example, in Q15.16 format, dividing by gives the real number.
- Range: The range of values that can be represented in Q-format depends on the number of integer bits and whether the format is signed or unsigned:
- Signed
Qm.n
: For signed numbers, the range is . For example, in Q15.16, the range is .
- Unsigned
Qm.n
: For unsigned numbers, the range is . For example, in Q16.16, the range is .
When multiplying two numbers in Q-format, the result will have twice the fractional bits (Qn × Qn = Q2n). After the multiplication, you will need to right-shift by bits to bring the result back into the original Q-format.
For example, in Q15.16
:
In the sqrtf
implementation (for square root approximation), using Q-format would require careful handling of variables:
- Mantissa: You could represent the mantissa in Q27 format, giving enough precision for the initial reciprocal square root approximation.
- Newton-Raphson Iterations: The iterative process of squaring and correcting would need careful management of Q-formats (e.g., after multiplication, shift the result appropriately).
By ensuring the correct scaling (shifts) during operations, the accuracy of the algorithm can be preserved while transitioning from floating-point to fixed-point representation.
You are now tasked with implementing the RISC-V assembly code to replace sqrtf. You should use RV32I instructions and the MUL
and bseti
instruction mentioned above.
Code:
.global sqrtf
sqrtf:
bltz a0, sq_0
slli a1, a0, 8
bseti a1, a1, 31
#C01
srli a2, a0, 23
beqz a2, sq_2
li a4, 255
#C02
add a2, a2, 125
slli a4, a2, 31
#C03
beqz a4, 1f
slli a1, a1, 1
1: lui a4, %hi(rsqrtapp-4)
#C04
add a4, a4, a3
lbu a4, %lo(rsqrtapp-4)(a4)
srli a0, a1, 7
mul a0, a0, a4
mul a0, a0, a4
#C05
mul a0, a0, a4
srai a0, a0, 13
slli a4, a4, 8
sub a4, a4, a0
addi a4, a4, 170
mv a0, a4
#C06
#C07
srli a3, a1, 8
mul a0, a0, a3
srai a0, a0, 12
mul a0, a0, a4
#C08
sub a4, a4, a0
mul a3, a3, a4
srli a3, a3, 15
mv a0, a3
mul a0, a0, a0
#C09
sub a0, a1, a0
#C10
mul a4, a4, a0
slli a3, a3, 7
srai a0, a4, 15
#C11
srai a0, a0, 6
add a3, a3, a0
#C12
beqz a4, sq_3
add a4, a4, a3
mul a4, a4, a4
#C13
sub a1, a1, a4
bltz a1, sq_3
add a3, a3, 1
sq_3:
#C14
add a0, a2, a3
sq_6: ret
sq_0:
slli a1, a0, 1
srli a1, a1, 24
beqz a1, sq_2
srai a0, a0, 31
sq_5:
slli a0, a0, 23
ret
sq_1:
srli a0, a0, 23
j sq_5
sq_2:
srli a0, a0, 31
slli a0, a0, 31
ret
rsqrtapp:
.byte 0xf1, 0xda, 0xc9, 0xbb, 0xb0, 0xa6, 0x9e, 0x97, 0x91, 0x8b, 0x86, 0x82
Pseudoinstructions are not allowed in your answers.
C01 = srli a1, a1, 8
C02 = beq a2, a4, sq_1
C03 = srai a2, a2, 1
C04 = srli a3, a1, 21
C05 = srai a0, a0, 12
C06 = mul a0, a0, a0
C07 = srli a0, a0, 15
C08 = srai a0, a0, 21
C09 = slli a1, a1, 9
C10 = srai a0, a0, 5
C11 = addi a0, a0, 16
C12 = sltu a4, a3, a0
C13 = slli a1, a1, 16
C14 = slli a2, a2, 23
Problem D
What is the logical expression for the circuit shown below? __ #D01 __ (Hint: You can derive the expression using a truth table.).

Alternative: 
Below is a state transition diagram of a finite state machine (FSM) with 4 states.

Given the input bit sequence 10011010 to the FSM, the output is __ #D02
__
In which state does the FSM end? __ #D03
__
Assume that the 1-bit full adder is implemented
as shown in the figure below. The delay at each gate is 1ps. What is the propagation delay of the 1-bit full adder? __ #D04
__ ps

Consider the following circuit.

Each square-shaped component represents a 1-bit full adder: and are its two 1-bit inputs, is the 1-bit carry-in, is the 1-bit output, and is the 1-bit carry-out. and are all 1-bit inputs. are the outputs of the four XOR gates. are the 1-bit outputs of the four full adders.
What is the total propagation delay of the entire circuit shown above, assuming that the XOR gates outside the full adder also have a delay of 1 ps? __ #D05
__
the critical path is
B0 -(1 gate)- P0
P0 -(3 gate)- Cout0
Cout0 -(2 gate)- Cout1
Cout1 -(2 gate)- Cout2
Cout2 -(1 gate)- Σ3
Problem E
The following circuit will be analyzed:

Consider the following information:
- AND gates have a propagation delay of 9 ns
- OR gates have a propagation delay of 14 ns
- NOT gates have a propagation delay of 5 ns
- The switches value (i.e., from 1 to 0, or 0 to 1) 30 ns after the rising edge of the clock
- The is directly connected to a register
- Setup time is 3 ns
- Clock-to-Q delay time is 4 ns
What is the minimum clock period in nanoseconds? __ E01 __ ns
Critical path = clk-to-q + longest CL + setup = 30ns for x_input to change (includes clk-to-q) + 9 AND + 3 setup = 42 ns
What is the max hold time in ns? __ E02 __
Shortest CL: NOT AND = 5 + 9 = 14ns clk-to-q + shortest CL = 4ns + 14ns = 18ns
Assume the clock period is 50 ns, with the first rising edge occurring at 25 ns, and initialized to 0 at 0 ns. At what time will become 1? __ E03 __ ns
25 + 50 (clock period) + 4 (clk-to-q) + 14 (OR) + 9 (AND) = 102 ns
Problem F
You are working on a new chip for an embedded application and want to create a new ISA. Frustrated with the various RISC-V instruction types, you decide to include only one universal instruction type: the X-type instruction.
Consider the following instructions:
add rd1, rs1, rs2
and rd1, rs1, rs2
lw rd1, offset1(rs1)
sw rs2, offset1(rs1)
addi rd1, rs1, imm1
beq rs1, rs2, offset1
lui rd1, offset1
jal rd1, imm
stw rs3, offset1, offset2(rs1)
(new instruction)
The new stw
instruction stores the contents of rs3
into both rs1 + offset1
and rs1 + offset2
.
C-like description of the stw
instruction:
You want to eliminate the funct3 and funct7 fields, using only an opcode. If we only need to support the instructions listed above, what is the minimum number of bits required for the opcode field? __ F01 __
We have 9 instructions, so we need 4 bits to represent them.
We need the ability to jump up to 64 KiB in either direction with a single instruction. How many bits are required to encode an immediate that allows this? __ F02 __ bits. Assume, similar to RV32, that the least significant bit is implicitly 0 and not stored in the instruction. Note: jal
is the only jump instruction provided, so jal
will determine the size of the immediate field.
Recall that, in RISC-V, the immediates encoded in instructions work in units of half-instructions, so jumping up to 64 KiB in either direction is the same as saying we want to jump up to half instructions in both directions. Since immediates are signed, we need 16 bits to represent this range of values.