
Quiz6 of Computer Architecture (2024 Fall)

Solutions

Problem A

Given that we are developing an operating system kernel for RISC-V, implement mutex routines to handle locking and unlocking.

#include <stdbool.h>

typedef enum {
    MUTEX_UNLOCKED,
    MUTEX_LOCKED
} mutex_t;

/* Attempt to acquire the mutex lock. Returns true if the lock was
 * successfully acquired, and false otherwise.
 */
bool mutex_trylock(mutex_t *mutex)
{
    int old;
    asm volatile (
        "__ A01 __ %0, %1, (%2)" /* Fill in your answer */
        : "=r"(old)
        : "r"(MUTEX_LOCKED), "r"(mutex)
        : "memory"
    );

    /* If the previous value of the mutex was MUTEX_LOCKED, it means
     * the lock was already held. Return false to indicate the lock
     * was not acquired.
     */
    return __ A02 __ ; /* Fill in your answer */
}

/* Spin until the mutex lock is acquired. */
void mutex_lock(mutex_t *mutex)
{
    while (!mutex_trylock(mutex)) {
        /* Busy wait */
    }
}

/* Release the mutex lock. */
void mutex_unlock(mutex_t *mutex)
{
    asm volatile (
        "__ A03 __ zero, zero, (%0)" /* Fill in your answer */
        : 
        : "r"(mutex)
        : "memory"
    );
}

Complete the above implementation to ensure the mutex functions correctly using the RISC-V Atomic extension.

A01 = ?

amoswap.w.aq

A02 = ?

old != MUTEX_LOCKED (equivalently, old != 1, since MUTEX_LOCKED == 1)

A03 = ?

amoswap.w.rl
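
For reference, here are the routines with A01–A03 filled in. This is simply the code above with the blanks substituted (a sketch assuming an RV32/RV64 target with the A extension):

bool mutex_trylock(mutex_t *mutex)
{
    int old;
    asm volatile (
        "amoswap.w.aq %0, %1, (%2)" /* atomically swap MUTEX_LOCKED into *mutex
                                       with acquire ordering */
        : "=r"(old)
        : "r"(MUTEX_LOCKED), "r"(mutex)
        : "memory"
    );
    /* The lock was acquired only if it was previously unlocked. */
    return old != MUTEX_LOCKED;
}

void mutex_unlock(mutex_t *mutex)
{
    asm volatile (
        "amoswap.w.rl zero, zero, (%0)" /* store MUTEX_UNLOCKED with release ordering */
        :
        : "r"(mutex)
        : "memory"
    );
}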

Next, we will implement a condition variable based on the mutex implementation provided earlier.

/*
 * RISC-V lightweight condition variable implementation using mutexes.
 */

typedef struct {
    volatile unsigned int status;
    volatile unsigned int waiting;

    /* Mutex for protecting the condition variable state */
    mutex_t mutex;
} cond_t;

/* Initialize the condition variable */
#define cond_init(cond)               \
    do {                              \
        (cond)->status = 0;           \
        (cond)->waiting = 0;          \
        mutex_unlock(&(cond)->mutex); \
    } while (0)

/* Broadcast to all threads waiting on the condition variable */
#define cond_broadcast(cond)          \
    do {                              \
        mutex_lock(&(cond)->mutex);   \
        (cond)->status = 1;           \
        mutex_unlock(&(cond)->mutex); \
    } while (0)

/* Wait on the condition variable */
#define cond_wait(cond, mtx)                  \
    do {                                      \
        mutex_lock(&(cond)->mutex);           \
        __ A04 __ ; /* Fill in your answer */ \
        mutex_unlock(&(cond)->mutex);         \
                                              \
        /* Release the external mutex */      \
        mutex_unlock(mtx);                    \
                                              \
        /* Wait for broadcast signal */       \
        while (1) {                           \
            __ A05 __ ; /* Your answer */     \
            if ((cond)->status == 1) {        \
                (cond)->waiting--;            \
                if ((cond)->waiting == 0) {   \
                    __ A06 __ ; /* Your answer */ \
                }                             \
                __ A07 __ ; /* Your answer */ \
                break;                        \
            }                                 \
            __ A08 __ ; /* Your answer */     \
        }                                     \
                                              \
        /* Reacquire the external mutex */    \
        mutex_lock(mtx);                      \
    } while (0)

Complete the above implementation so that the condition variable function-like macros work correctly.

A04 = ?

(cond)->waiting++

A05 = ?

mutex_lock(&(cond)->mutex)

A06 = ?

(cond)->status = 0

A07 = ?

mutex_unlock(&(cond)->mutex)

A08 = ?

mutex_unlock(&(cond)->mutex)
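
To see how these macros fit together, here is a hypothetical usage sketch built on the mutex and condition-variable code above. The names flag_cond, flag_mutex, flag, setup, waiter, and notifier are illustrative only and are not part of the quiz:

static cond_t flag_cond;
static mutex_t flag_mutex = MUTEX_UNLOCKED;
static volatile int flag;

void setup(void)
{
    cond_init(&flag_cond); /* status = 0, waiting = 0 */
}

/* Runs in a thread that must block until flag becomes non-zero. */
void waiter(void)
{
    mutex_lock(&flag_mutex);
    while (!flag)
        cond_wait(&flag_cond, &flag_mutex); /* releases and reacquires flag_mutex */
    mutex_unlock(&flag_mutex);
    /* flag is guaranteed to be set here */
}

/* Runs in the thread that produces the event. */
void notifier(void)
{
    mutex_lock(&flag_mutex);
    flag = 1;
    mutex_unlock(&flag_mutex);
    cond_broadcast(&flag_cond); /* wake every thread spinning in cond_wait() */
}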

Lastly, we aim to implement pthread_spin_lock using the RISC-V Atomic extension. Below is the C implementation:

typedef int pthread_spinlock_t;
int pthread_spin_trylock(pthread_spinlock_t *lock);
#define	EBUSY		16	/* Device or resource busy */

And the RISC-V implementation:

/* Attempt to acquire a spinlock without blocking. Return 0 on success or EBUSY if the lock is already held. */
pthread_spin_trylock:
    /* Save the input arguments */
    /* a0: lock pointer */

    /* Load the value of the lock into t0 */
    lw t0, 0(a0)               /* t0 = *lock */

    /* Set t1 to the value of EBUSY */
    li t1, EBUSY               /* t1 = EBUSY */

    /* Check if the lock is already held */
    __ A09 __ # Your answer!

    /* Attempt to acquire the lock using an atomic swap */
    __ A10 __ # Your answer!  /* t0 = atomic swap(*lock, EBUSY) */

lock_held:  /* Return the result */
    mv a0, t0                  /* Move the result into a0 (return value) */
    ret                        /* Return from the function */

A09 = ?

bnez t0, lock_held

A10 = ?

amoswap.w.aq t0, t1, (a0)
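
For comparison, here is a rough C rendering of the completed routine using the GCC/Clang __atomic builtins (a sketch for reference, not the official solution; it assumes the pthread_spinlock_t typedef and EBUSY definition above):

int pthread_spin_trylock(pthread_spinlock_t *lock)
{
    int old = *lock;          /* lw t0, 0(a0) */
    if (old != 0)             /* bnez t0, lock_held */
        return old;           /* lock already held: the stored value is EBUSY */

    /* amoswap.w.aq t0, t1, (a0): atomically store EBUSY and return the old value */
    old = __atomic_exchange_n(lock, EBUSY, __ATOMIC_ACQUIRE);
    return old;               /* 0 on success, EBUSY if another core won the race */
}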


Problem B

Let's assume we are developing an operating system kernel, and now it's time to implement the context switching mechanism for RISC-V.


The generic code:

typedef struct {
	/*
	 * Note that sp must be the first element in the task struct
	 * for __switchto() to work.
	 */
	uint32_t sp; /* Saved stack pointer for context switch */
	atomic_t events; /* Bitmaps of received events */
	uint64_t runtime; /* Time spent in task */
	uint32_t *stack; /* Start of stack */
} task_;

/* Contexts for all tasks */
static task_ tasks[TASK_ID_COUNT] __attribute__((section(".bss.tasks")));

/* Reserve space to discard context on first context switch. */
uint32_t scratchpad[TASK_SCRATCHPAD_SIZE]
	__attribute__((section(".bss.task_scratchpad")));

task_ *current_task = (task_ *) scratchpad;

/*
 * Determines whether IRQs should chain to svc_handler(). This flag should be set if:
 *
 * 1) Task scheduling is active.
 *
 * 2) An interrupt sets an event that could unblock a higher-priority task. 
 *    svc_handler() will check for a potential task switch and clear this flag 
 *    afterward.
 */
int need_resched;

static inline task_ *__task_id_to_ptr(task_id_t id)
{
    return tasks + id;
}

task_ *next_sched_task(void)
{
    task_ *new_task = __task_id_to_ptr(__fls(tasks_ready & tasks_enabled));
    return new_task;
}

/* in interrupt context */
volatile bool in_interrupt;

void end_irq_handler(void)
{
}

IRQ code:

__irq_exit:
    jal end_irq_handler    /* Call end_irq_handler() before leaving the handler */
    la t0, in_interrupt    /* Clear the in_interrupt flag to signal the end of the interrupt */
    sb zero, 0(t0)
    csrr sp, mscratch      /* Restore the stack pointer (sp) from the mscratch register */
/**
 * Task context switching
 *
 * Change the task that will run after returning from an interrupt.
 *
 * This function must be called in interrupt context.
 *
 * Save the registers of the current task below the interrupt context on
 * its task stack, then restore the live registers of the next task and set the
 * process stack pointer to the new stack.
 *
 * The structure of the saved context on the stack is:
 * ra, a0-a7, t0-t6 (caller-saved), s0-s11 (callee-saved), mepc
 *
 */
.global __switch_task
__switch_task:
    /* Get the (new) highest priority task pointer in a0 */
    jal next_sched_task

    /* Pointer to the current task (which we are switching from) */
    la t1, current_task
    lw t0, 0(t1)

    /* Reset the rescheduling request */
    la t2, need_resched
    sw zero, 0(t2)

    /* If no task switch is needed, return to the current task */
    beq a0, t0, __irq_exit

    /* Save the new scheduled task */
    sw a0, 0(t1)

    /* Save the current system stack pointer */
    add t3, sp, zero

    /* Restore the current process stack pointer */
    csrr sp, mscratch

    /* Save the task program counter */
    csrr t5, mepc

    /* Save callee-saved registers (s0-s11) on the current process stack */
    sw s11, -12*4(sp)
    sw s10, -11*4(sp)
    sw s9,  -10*4(sp)
    sw s8,  -9*4(sp)
    sw s7,  -8*4(sp)
    sw s6,  -7*4(sp)
    sw s5,  -6*4(sp)
    sw s4,  -5*4(sp)
    sw s3,  -4*4(sp)
    sw s2,  -3*4(sp)
    sw s1,  -2*4(sp)
    sw s0,  -1*4(sp)

    /* Save the program counter on the current process stack */
    __ B01 __ # Your answer here

    /* Adjust the stack pointer */
    addi sp, sp, __ B02 __ # Your answer here

    /* Save the task stack pointer in its context */
    sw sp, 0(t0)

    /* Get the new scheduled task stack pointer */
    lw sp, 0(a0)

    /* Adjust the stack pointer */
    addi sp, sp, __ B03 __ # Your answer here

    /* Restore the program counter from the next task's stack context */
    lw t0, __ B06 __
    csrw mepc, t0

    /* Restore callee-saved registers (s0-s11) from the next task's stack */
    lw s11, -12*4(sp)
    lw s10, -11*4(sp)
    lw s9,  -10*4(sp)
    lw s8,  -9*4(sp)
    lw s7,  -8*4(sp)
    lw s6,  -7*4(sp)
    lw s5,  -6*4(sp)
    lw s4,  -5*4(sp)
    lw s3,  -4*4(sp)
    lw s2,  -3*4(sp)
    lw s1,  -2*4(sp)
    lw s0,  -1*4(sp)

    /* Save sp to the scratch register and switch to the system stack */
    csrw __ B04 __ , __ B05 __

    /* Restore the system stack */
    add sp, t3, zero

    j __irq_exit

B01 = ?

sw t5, -13*4(sp)

B02 = ?

-13*4

B03 = ?

13*4

B04 = ?

mscratch

B05 = ?

sp

B06 = ?

-13*4(sp)
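
The offsets are easier to see as a picture of the frame that __switch_task builds below the process stack pointer. The struct below is an illustrative sketch inferred from the assembly above; it is not part of the kernel code:

#include <stdint.h>

struct switch_frame {
    uint32_t mepc; /* sp - 13*4 : saved task PC (stored by B01, reloaded via B06) */
    uint32_t s11;  /* sp - 12*4 */
    uint32_t s10;  /* sp - 11*4 */
    uint32_t s9;   /* sp - 10*4 */
    uint32_t s8;   /* sp -  9*4 */
    uint32_t s7;   /* sp -  8*4 */
    uint32_t s6;   /* sp -  7*4 */
    uint32_t s5;   /* sp -  6*4 */
    uint32_t s4;   /* sp -  5*4 */
    uint32_t s3;   /* sp -  4*4 */
    uint32_t s2;   /* sp -  3*4 */
    uint32_t s1;   /* sp -  2*4 */
    uint32_t s0;   /* sp -  1*4 */
};

/* The frame is 13*4 bytes, which is why sp moves down by 13*4 when saving the
 * outgoing task (B02) and back up by 13*4 when restoring the incoming task (B03). */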


Problem C

Consider a complex task divided into four segments: PartA, PartB, PartC, and PartD. Each segment is assigned to a core in a quad-core processor system, where each core has its own dedicated cache. Cache coherence is maintained using a snoopy-based, write-invalidate MSI (Modified, Shared, Invalid) protocol, as depicted. The objective is to analyze the performance of this system and explore potential optimizations for improvement.

Each core begins by accessing and updating its local copy of the population data, stored under a designated label (e.g., pop_A for the population of PartA). This process is illustrated in the pseudocode below:

call pop_count      # Invoke a function to calculate the additional population in the area.
lw a1, <pop_label>  # Load the previous population count.
add a1, a1, a0      # Compute the new total by adding the additional population.
sw a1, <pop_label>  # Store the updated population count back into memory.

The sequence of memory accesses for this process is as follows:

    Core A            Core B            Core C            Core D
(1) lw a1, pop_A  (2) lw a1, pop_B  (3) lw a1, pop_C  (4) lw a1, pop_D
(5) sw a1, pop_A  (6) sw a1, pop_B  (7) sw a1, pop_C  (8) sw a1, pop_D

Our focus is to closely analyze the activities occurring on Core A. Since the memory accesses are independent (i.e., pop_A, pop_B, pop_C, and pop_D correspond to distinct memory locations), we can concentrate solely on Core A's memory interactions to understand its behavior. Please complete the table below by filling in columns C01 and C02. For each entry, describe the bus transaction(s) triggered by each access and the resulting state of the cache line for pop_A after the access.

Access          Shared bus transactions    Cache A
Initial state   -                          pop_A: I
lw a1, pop_A    BusRd                      pop_A: C01
sw a1, pop_A    BusRdX                     pop_A: C02

C01 = ?

S

C02 = ?

M

We are interested in exploring whether adopting the MESI protocol, as illustrated, could improve the performance of this processor. Assuming the four cores utilize a snoopy-based, write-invalidate MESI protocol, what would be the sequence of bus transactions and the cache line states for pop_A on Core A? Please complete the table below by filling in columns C03 and C04.

[Figure: MESI protocol state-transition diagram (not reproduced here)]

Access          Shared bus transactions    Cache A
Initial state   -                          pop_A: I
lw a1, pop_A    BusRd                      pop_A: C03
sw a1, pop_A    -                          pop_A: C04

C03 = ?

E

C04 = ?

M

The line is in E (Exclusive) because no other cache held pop_A when it was read, so the store silently upgrades it to M without any bus transaction; under MSI the same store required a BusRdX, which is the performance benefit MESI offers here.

Each core then updates the overall census count, stored at a shared location labeled sum. This process is illustrated in the following pseudocode:

lw a2, sum       # Load the current global census count.
add a2, a2, a0   # Add the local population count to the global count.
sw a2, sum       # Store the updated global census count.

The resulting sequence of memory accesses is as follows:

    Core A          Core B          Core C          Core D
(1) lw a2, sum  (3) lw a2, sum  (4) lw a2, sum  (6) lw a2, sum
(2) sw a2, sum  (7) sw a2, sum  (5) sw a2, sum  (8) sw a2, sum

We observe a critical issue in the current census counting implementation: there is a possibility that two processors may simultaneously read sum, each add their local population count to it independently, and then write back to sum, resulting in inaccuracies.

To ensure the final value of sum accurately reflects the combined a0 values from all processors, semaphores should be integrated strategically to enforce mutual exclusion. These semaphores should minimize instruction blocking while preserving data correctness. Define and initialize any necessary semaphores, then insert WAIT and SIGNAL commands into the pseudocode below to ensure accurate updates to sum. It is not required to include ecall instructions for this modification.

Semaphore Initialization:

sum_sem = 1

Pseudocode:

call pop_count     # L1: counts the new population in the area
lw a1, <pop_label> # L2: loads the previous population
add a1, a1, a0     # L3: calculates the new total
sw a1, <pop_label> # L4: stores the updated population
lw a2, sum         # L5: retrieves the current global count
add a2, a2, a0     # L6: updates the global population count
sw a2, sum         # L7: writes the updated population count

Specify the exact instruction, labeled as L1 to L7, after which the WAIT(sum_sem) command should be inserted to ensure exclusive access to sum during critical updates, thus maintaining data accuracy.

C05 = ?

L4 or sw a1, <pop_label>

Identify the exact instruction, labeled as L1 to L7, after which the SIGNAL(sum_sem) command should be placed. This ensures exclusive access to sum during critical updates, maintaining data accuracy.

C06 = ?

L7 or sw a2, sum
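
Combining C05 and C06, the per-core pseudocode with the semaphore operations in place becomes:

call pop_count      # L1: counts the new population in the area
lw a1, <pop_label>  # L2: loads the previous population
add a1, a1, a0      # L3: calculates the new total
sw a1, <pop_label>  # L4: stores the updated population
WAIT(sum_sem)       #     enter the critical section protecting sum
lw a2, sum          # L5: retrieves the current global count
add a2, a2, a0      # L6: updates the global population count
sw a2, sum          # L7: writes the updated population count
SIGNAL(sum_sem)     #     leave the critical section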

We are writing code to compute the greatest common divisor (GCD) using popcount.

/* Count the number of 1 bits in the binary representation of x */
int popcount(long long x)
{
    x = (x & 0x5555555555555555LL) + ((x >> 1) & 0x5555555555555555LL);
    x = (x & 0x3333333333333333LL) + ((x >> 2) & 0x3333333333333333LL);
    x = (x & 0x0f0f0f0f0f0f0f0fLL) + ((x >> 4) & 0x0f0f0f0f0f0f0f0fLL);
    x = (x & 0x00ff00ff00ff00ffLL) + ((x >> 8) & 0x00ff00ff00ff00ffLL);
    x = (x & 0x0000ffff0000ffffLL) + ((x >> 16) & 0x0000ffff0000ffffLL);
    return (x & 0x00000000ffffffffLL) + (x >> 32 & 0x00000000ffffffffLL);
}

/* Count the number of trailing zeros in x */
int ctz(long long x)
{
    return popcount( /* C07: Your answer! */ );
}

/* binary GCD algorithm */
long long gcd(long long a, long long b)
{
    if (a == 0) return b; // GCD(0, b) = b
    if (b == 0) return a; // GCD(a, 0) = a

    // Count the common factors of 2 in a and b
    int k = ctz( /* C08: Your answer! */ );
    a >>= ctz(a); // Remove factors of 2 from a
    b >>= ctz(b); // Remove factors of 2 from b

    while (a != b) {
        if (a > b) {
            a -= b; // Reduce a
            a >>= ctz(a); // Remove factors of 2
        } else {
            b -= a; // Reduce b
            b >>= ctz(b); // Remove factors of 2
        }
    }

    /* Restore the common factors of 2 */
    return /* C09: Your answer! */;
}

Complete the above GCD implementation to make it work as expected.

Reference: Binary GCD algorithm

C07 = ?

(x & (-x)) - 1

Here x & (-x) isolates the lowest set bit of x, and subtracting 1 produces a mask covering exactly the trailing zero bits, so its popcount equals ctz(x).

C08 = ?

a | b

C09 = ?

a << k
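
As a quick sanity check, here is a hypothetical test harness for the completed functions above (the chosen inputs and expected outputs are illustrative only):

#include <stdio.h>

int main(void)
{
    printf("%d\n", ctz(40));       /* 40 = 0b101000 has 3 trailing zeros -> 3 */
    printf("%lld\n", gcd(48, 36)); /* common factor 2^2, odd parts reduce to 3 -> 12 */
    printf("%lld\n", gcd(0, 7));   /* GCD(0, b) = b -> 7 */
    return 0;
}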


Problem D

The diagram below illustrates a classic fully-bypassed 5-stage pipeline, enhanced with an unpipelined divider operating in parallel with the ALU. While bypass paths are not depicted, the iterative divider generates 2 bits per cycle until producing a complete 32-bit result.

In this pipeline design, asynchronous interrupts are handled during the MEM stage and trigger a jump to a dedicated interrupt trap handler address. The interrupt latency is defined as the number of cycles from when an interrupt request is raised in the MEM stage to when the first instruction of the interrupt handler reaches the MEM stage.

Consider the following code execution scenario. An interrupt is raised during cycle 8, causing a jump to isr. The label isr refers to an Interrupt Service Routine, which increments a counter at a fixed memory address before resuming the original execution context. The architectural guarantee of precise interrupts must be preserved. Assume all memory accesses take one cycle in the MEM stage.

    lw x2, 0(x1)
    div x1, x2, x3
    slli x3, x2, 1
    lui x4, 0x100
    addi x4, x4, 0xf
    xor x5, x3, x4
    sub x3, x5, x2
    ...
isr:
    sw x1, 0(x0) # Save register to a known location
    lw x1, 4(x0) # Load counter value
    addi x1, x1, 1
    sw x1, 4(x0) # Increment counter and store it
    lw x1, 0(x0) # Restore register before returning
    mret         # Return from interrupt handler

Notably, the div instruction in RISC-V does not raise data-dependent exceptions. To prevent pipeline stalls during a multi-cycle divide operation, the pipeline control logic permits subsequent instructions that are independent of the divide result to proceed and complete before the divide finishes execution.

The RISC-V instructions MRET, HRET, SRET, and URET are used to return from traps in M-mode, H-mode, S-mode, and U-mode, respectively. When executing an xRET instruction:

  • The privilege level changes to the value stored in xPP.
  • The interrupt-enable bit yIE (where y is the mode held in xPP) is set to the value in xPIE.
  • The xPIE field is set to 1.
  • The xPP field is reset to U.

Trap handlers typically save the current system state:

PC              -> MEPC
Privileged mode -> MPP
MIE             -> MPIE

And restore the system state upon executing the MRET instruction:

MEPC            -> PC
MPP             -> Privileged mode
MPIE            -> MIE
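
In pseudo-C, the mret state transfer described above can be sketched as follows (the enum, struct, and variable names are made up for illustration and do not correspond to a real API):

enum priv { PRV_U = 0, PRV_S = 1, PRV_M = 3 };

struct mstatus_model {
    enum priv mpp; /* previous privilege mode */
    int mie;       /* machine interrupt-enable bit */
    int mpie;      /* previous value of MIE */
};

static unsigned int pc, mepc;
static enum priv privilege;
static struct mstatus_model mstatus;

static void do_mret(void)
{
    pc           = mepc;         /* MEPC -> PC */
    privilege    = mstatus.mpp;  /* MPP  -> privileged mode */
    mstatus.mie  = mstatus.mpie; /* MPIE -> MIE */
    mstatus.mpie = 1;            /* xPIE is set to 1 */
    mstatus.mpp  = PRV_U;        /* xPP is reset to U */
}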

Reference: The RISC-V Instruction Set Manual: Volume II - Privileged Architecture

What is the interrupt latency for the code above? __ D01 __ cycles

14
cycles 8 to 21, inclusive.

The interrupt is raised in cycle 4. The earliest uncommitted instruction when the interrupt is taken is labeled with "EPC" (exception PC), while "MTVEC" denotes the first instruction of the interrupt trap handler.

To ensure the program continues executing correctly, which instruction should the isr return to? Answer in one of the instructions shown above.
D02 = ?

lui x4, 0x100
The EPC (exception PC) should point to the earliest uncommitted instruction.

To reduce interrupt latency, we propose constraining the destination register for a div instruction. The ABI could be modified to reserve a specific x register to always store the divide result, allowing the interrupt handler to avoid using it. In a more generalized exception handler, reading this register could be deferred until the end of the context save, potentially hiding the divide latency.

Does this approach require any hardware modifications? __ D03 __ (Answer in Yes or No)
What is the corresponding limitation? __ D04 __ (Explain!)

No
It relies on code voluntarily complying with the ABI. (Answers with a similar meaning receive full credit.)


Problem E

Suppose you have been tasked with evaluating the performance of two processor designs:

  • Processor1
  • Processor2

You plan to benchmark both processors using the following assembly sequence:

L1:     slli x11, x10, 2       # x11 = x10 << 2 (calculate the byte offset)
        lw x12, 0x80(x11)      # Load word from memory at address (0x80 + x11) into x12
        add x13, x13, x12      # x13 = x13 + x12
        addi x10, x10, 1       # Increment x10 by 1
        blt x10, x14, L1       # Branch to L1 if x10 < x14 (loop condition)
        sub x14, x14, x15      # x14 = x14 - x15
        xori x15, x14, 0x1     # x15 = x14 ^ 0x1
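
A plausible C rendering of this benchmark (array and variable names are illustrative) makes the two dependencies that drive the pipeline analysis explicit: the load-use dependency from lw into add, and the addi result feeding blt:

#include <stdint.h>

int32_t benchmark(const int32_t *mem, int32_t x14, int32_t x15)
{
    int32_t x13 = 0;
    for (int32_t x10 = 0; x10 < x14; x10++) {   /* addi + blt */
        int32_t x12 = mem[0x80 / 4 + x10];      /* slli + lw from 0x80 + (x10 << 2) */
        x13 += x12;                             /* add uses the just-loaded x12 */
    }
    x14 = x14 - x15;                            /* sub  x14, x14, x15 */
    x15 = x14 ^ 0x1;                            /* xori x15, x14, 0x1 */
    return x13;
}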

Processor1 is a 4-stage pipelined processor with full bypassing, designed to reduce the number of cycles required to execute each instruction. It combines the EXE and MEM stages, with load requests sent in the EXE stage and received in the WB stage one cycle later. The IF stage speculatively fetches the instruction at PC+4 and annuls incorrectly fetched instructions after a branch misprediction. Branches are resolved in the EXE stage.

Processor2 is a 6-stage pipelined processor with full bypassing, designed to improve performance by increasing the clock speed. The EXE stage is divided into two stages (EXE1 and EXE2) to reduce the critical path through the ALU. The IF stage always fetches the instruction at PC+4 and annuls incorrectly fetched instructions after a taken branch. ALU and Branch ALU results become available in the EXE2 stage.

How many cycles does it take Processor1 to execute one iteration of the loop? __ E01 __ cycle(s)

8

How many cycles does it take Processor2 to execute one iteration of the loop? __ E02 __ cycle(s)

12

Given that Processor1 has a clock period of 6 ns, how long does one loop iteration take? __ E03 __ ns per loop iteration

48
8 cycles × 6 ns/cycle = 48 ns per loop iteration