Quiz7 of Computer Architecture (2022 Fall)

General Information

You are allowed to read lecture materials.
- That is, an open book exam.
- Even ChatGPT is available for consultation, however keep in mind the restrictions below.
  $\to$ Please let the instructor know if you do receive the right answers through ChatGPT.
We are using the honor system during this quiz, and would like you to accept the following:
1. You will not share the quiz with anyone.
2. You will not talk to anyone about the information on the quiz until the answers are made public.
Each answer has 4 points.
You must answer in valid numeric/C99/assembly representation and/or English alphabet except your formal name.
Message the instructor via Facebook if you found something wrong.
09:10 ~ 10:15AM on Dec 27, 2022

Problem `A`

Coroutines are computer program components that generalize subroutines for non-preemptive multitasking, by allowing execution to be suspended and resumed. In order to implement general-purpose coroutines, a second call stack must be obtained, which is a feature not directly supported by the C language. A reliable (albeit platform-specific) way to achieve this is to use a small amount of inline assembly to explicitly manipulate the stack pointer during initial creation of the coroutine. Once a second call stack has been obtained, the setjmp and longjmp functions in the standard C library can then be used to implement the switches between coroutines.

These functions save and restore, respectively, the stack pointer, program counter, callee-saved registers, and any other internal state as required by the ABI, such that returning to a coroutine after having yielded restores all the state that would be restored upon returning from a function call. Minimalist implementations, which do not piggyback off the setjmp and longjmp functions, may achieve the same result via a small block of inline assembly which swaps merely the stack pointer and program counter, and clobbers all other registers. This can be significantly faster, as setjmp and longjmp must conservatively store all registers which may be in use according to the ABI, whereas the clobber method allows the compiler to store (by spilling to the stack) only what it knows is actually in use.

Let's implement a minimalist coroutine in RV32I, assuming no floating points in 32-bit.

Makefile

CFLAGS = -march=rv32i -mabi=ilp32 -O2 -Wall

OBJS = \
    context.o \
    main.o
BIN = coroutine.elf

%.o: %.S
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

%.o: %.c
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

all: $(BIN)

$(BIN): $(OBJS)
	 $(CROSS_COMPILE)gcc -o $@ $^

main.c

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    uintptr_t ra;
    uintptr_t sp;
    uintptr_t s0;
    uintptr_t s1;
    uintptr_t s2;
    uintptr_t s3;
    uintptr_t s4;
    uintptr_t s5;
    uintptr_t s6;
    uintptr_t s7;
    uintptr_t s8;
    uintptr_t s9;
    uintptr_t s10;
    uintptr_t s11;

    void (*entry)(void *);
    void *data;
} context_t[1];

extern void switch_context(context_t from, context_t to);

extern void helper(void);

static void initialize_context(context_t ctx,
                               void *stack_base,
                               size_t stack_size,
                               void (*entry)(void *data),
                               void *data)
{
#define RED_ZONE 128
    uintptr_t stack_end = (uintptr_t) stack_base + stack_size - RED_ZONE;
    stack_end &= -16; /* ensure that the stack is 16-byte aligned */

    memset(ctx, 0, sizeof *ctx);
    ctx->sp = stack_end;
    ctx->ra = (uintptr_t) helper;
    ctx->data = data;
    ctx->entry = entry;
}

#define ITERATIONS (1000 * 1000)
#define STACKSIZE 32768

static context_t thread1, thread2, thread3;

static void thread1_fun(void *data)
{
    for (int i = 0; i < ITERATIONS; ++i)
        switch_context(thread1, thread2);

    printf("%d context switches\n", 3 * ITERATIONS);
}

static void thread2_fun(void *data)
{
    /* thread 2 -> thread 1 */
    switch_context(thread2, thread1);

    /* thread 2 -> thread 1 */
    switch_context(thread2, thread1);

    /* thread 2 -> thread 3 */
    switch_context(thread2, thread3);

    /* thread 2 -> thread 1 */
    switch_context(thread2, thread1);

    /* thread 2 -> thread 3 */
    for (int i = 0;; ++i)
        switch_context(thread2, thread3);
}

static void thread3_fun(void *data)
{
    /* thread 3 -> thread 2 */
    switch_context(thread3, thread2);

    /* thread 3 -> thread 1 */
    switch_context(thread3, thread1);

    /* thread 3 -> thread 1 */
    for (int i = 0;; ++i)
        switch_context(thread3, thread1);
}

#define ALIGN 64 /* cache line */
static void *aligned_malloc(int size)
{
    void *mem = malloc(size + ALIGN + sizeof(void *));
    void **ptr =
        (void **) (((uintptr_t) mem + ALIGN + sizeof(void *)) & ~(ALIGN - 1));
    ptr[-1] = mem;
    return ptr;
}

#define RED_ZONE 128
static void init_context(context_t ctx, void (*entry)(void *data), void *data)
{
    void *stack = aligned_malloc(STACKSIZE + RED_ZONE + 8) - 8;
    initialize_context(ctx, stack, STACKSIZE, entry, data);
}

int main()
{
    init_context(thread2, thread2_fun, (void *) 0xDEEEECAF);
    init_context(thread3, thread3_fun, (void *) 0xF000000D);
    thread1_fun((void *) 0xBABEBABE);

    return 0;
}

context.S (incomplete)

.text
.align 4
.globl switch_context
.type switch_context, @function

/*
 * All we have to save is the callee saved registers. Everything is
 * dumped to the stack and the only thing stored in, and restored from,
 * the context is the stack pointer.
 */
 switch_context:
    sw ra,   0(a0)
    sw sp,   4(a0)
    sw s0,   8(a0)
    sw s1,  12(a0)
    sw s2,  16(a0)
    sw s3,  20(a0)
    sw s4,  24(a0)
    sw s5,  28(a0)
    sw s6,  32(a0)
    sw s7,  36(a0)
    sw s8,  40(a0)
    sw s9,  44(a0)
    sw s10, 48(a0)
    sw s11, 52(a0)

    lw ra,  0(a1)
    lw sp,  4(a1)
    lw s0,  8(a1)
    lw s1,  12(a1)
    lw s2,  16(a1)
    lw s3,  20(a1)
    lw s4,  24(a1)
    lw s5,  28(a1)
    lw s6,  32(a1)
    lw s7,  36(a1)
    lw s8,  40(a1)
    lw s9,  44(a1)
    lw s10, 48(a1)
    lw s11, 52(a1)
    ret

.size switch_context, .- switch_context

/*
 * The helper function is the first to be entered in a new context
 * and serves to call the user entry function with the correct
 * argument. The reason we need a helper is that user entry function
 * argument is not one of the saved registers.
 */
.align 4
.globl helper
.type helper, @function
helper:
    lw A01 # Implement this!
    lw A02 # Implement this!
    jr t0

.size helper, .- helper

Please implement helper with minimal offset to make it work on rv32emu. You should answer with register ABI names.

A01 = ?

A02 = ?

Problem `B`

Assume that we are about to implement spinlock with RV32A.

#include <stdint.h>
volatile uint32_t lock;

static inline void spin_lock(volatile uint32_t *plock)
{
    uint32_t t = 1;
    while (t)
        asm volatile("B01 %0, %0, (B02)" : "+r"(t) : "r"(plock));
}

inline void spin_unlock(volatile uint32_t *plock)
{
    asm volatile("B03 x0, x0, (B04)" : : "r"(plock));
}

The above is shown with GNU inline assembly.

Both B02 and B02 MUST contain %. You should pick one of %0 and %1. Both B01 and B03 are atomic instructions and containg .w. in their mnemonic form.

B01 = ?

B02 = ?

B03 = ?

B04 = ?

Problem `C`

Given the following RISC-V machine code (and instruction addresses), disassembly manually and fill in C01 and C02 for the following instructions.

0x002cff00: loop: add t1, t2, t0 |_0000000|___00101|___00111|_____000|___00110|__0x33__|
0x002cff04:       C01            |_____________0x14|_____________________00001|__0x6F__|
0x002cff08:       C02            |_1111111|___00000|___00110|_____001|___11001|__0x63__|
...
0x002cff2c: foo: jr ra ra=0x002cff08

See RISC-V instructions for encoding.

Both C01 and C02 MUST contain label names. That is, loop and foo.

C01 = ?

C02 = ?

Problem `D`

We are going to write a function sumsqr in RISC-V that, when given an integer

n

, returns the summation below. If

n

is not positive, then the function returns 0.

n^{2} + (n - 1)^{2} + (n - 2)^{2} + . . . + 1^{2}

You are provided a RISC-V function named square for this problem, which accepts an integer and returns its square. Square is used as a subroutine in sumsqr to be implemented.

sumsqr:
    addi sp, sp, -12
    sw   ra, 8(sp)
    sw   s0, 4(sp)
    sw   s1, 0(sp)

    add  s0, a0, x0
    add  s1, x0, x0

loop:
    D01 # Implement this!

    add  a0, s0, x0
    jal  ra, square
    add  s1, s1, a0

    addi s0, s0, -1
    j    loop

xxx:
    D02 # Implement this!

    lw   ra, 8(sp)
    lw   s0, 4(sp)
    lw   s1, 0(sp)

    jr   ra

Fill the above D01 and D02 with valid RISC-V assembly.

D01 = ?

D02 = ?

Problem `E`

Assume that our system certains memory addresses correspond to registers in I/O devices and not normal memory. That is, memory-mapped I/O.

0xFFFF0000 – Receiver Control: LSB is the ready bit, there may be other bits set that we do not need right now.
0xFFFF0004 – Receiver Data: Received data stored at lowest byte.
0xFFFF0008 – Transmitter Control: LSB is the ready bit, there may be other bits set that we do not need right now.
0xFFFF000C – Transmitter Data: Transmitted data stored at lowest byte

We write the following RISC-V code to read a byte from the receiver and immediately send it to the transmitter.

    lui  t0, 0xffff    # load memmory mapped I/O address

receiver_poll:
    lw   t1, 0(t1)     # wait for the
    andi t1, t1, 0x1   # receiver
    beq  E01           # "ready" state

    lb   t2, 4(t0)     # load data from the receiver

transmitter_poll:
    lw   t1, 8(t0)     # wait for the
    andi t1, t1, 0x1   # transmitter
    beq  E02           # "ready" state
    E03                # write data to the transmitter device

Fill E01, E02, and E03 to make it work as expected.

E01 = ?

E02 = ?

E03 = ?

Problem `F`

The benefits of multi-threading programming come only after you understand concurrency. Here are two most common concurrency issues:

Cache-incoherence: each hardware thread has its own cache, hence data modified in one thread may not be immediately reflected in the other.
- They can often be solved by bypassing cache and writing directly to memory, i.e. using volatile keyword in many languages.
The famous Read-modify-write: Read-modify-write is a very common pattern in programming.
- In the context of multi-thread programming, the interleaving of R,M,W stages often produces a lot of issues.

To solve problem with Read-modify-write, we have to rely on the idea of undisrupted execution. Consider the following code using Amo.swap (single, undisrupted memory operation) to achieve the atomic primitive Test-and-set (TaS).

start:
    addi t0, x0, 1 	# locked state is 1
    amoswap.w.aq t1, t0, (a0)
    bne t1, x0, start
    
    # critical section

    amoswap.w.rl x0, x0, a0	# release lock

Fill the above F01 to make TaS work as expected.
- F01 = ?
why do we need special instructions for these operations? Why can't we use normal load and store? Explain!
- F02 = ?

Problem `G`

The following shows a classic fully-bypassed 5-stage pipeline that has been augmented with an unpipelined divider in parallel with the ALU. Bypass paths are not shown in the diagram. This iterative divider produces 2 bits per cycle until it outputs a full 32-bit result.

In this design, asynchronous interrupts are handled in the MEM stage and cause a jump to a dedicated interrupt trap handler address. The interrupt latency is defined as the number of cycles from when an interrupt request is raised in the MEM stage until the first instruction of the interrupt handler reaches the MEM stage.

Consider the execution of the code below. Suppose an interrupt is raised during cycle 8, which causes a jump to isr. Here, isr stands for interrupt Service Routine or interrupt handler. The handler increments a counter at a fixed memory address before returning to the original context. The architectural guarantee of precise interrupts should be upheld. Assume that all memory accesses take one cycle in the MEM stage.

    lw x2, 0(x1)
    div x1, x2, x3
    slli x3, x2, 1
    lui x4, 0x100
    addi x4, x4, 0xf
    xor x5, x3, x4
    sub x3, x5, x2
    ...
isr:
    sw x1, 0(x0) # Save register in known location
    lw x1, 4(x0) # Use register to increment counter
    addi x1, x1, 1
    sw x1, 4(x0)
    lw x1, 0(x0) # Restore register before returning
    mret # Return from interrupt handler

Note that the div instruction in RISC-V can not raise a data-dependent exception. To avoid pipeline stalls while a multi-cycle divide operation is in progress, the pipeline control logic allows subsequent instructions that do not depend on the divide result to be issued and completed before the divide has completed.

The RISC-V MRET, HRET, SRET, or URET instructions are used to return from traps in M-mode, H-mode, S-mode, or U-mode respectively. When executing an xRET instruction, supposing xPP holds the value y, yIE is set to xPIE; the privilege mode is changed to y; xPIE is set to 1; and xPP is set to U.

Typically the trap handlers will save the current state

PC              MEPC
Priv   ->       MPP
MIE             MPIE

and restore the state on the MRET instruction:

MEPC            PC
MPP     ->      Priv
MPIE            MIE

Reference: The RISC-V Instruction Set Manual Volume II: Privileged Architecture

What is the interrupt latency for the code above? (Answer in cycles)
- G01 = ?
Which instruction should isr return to in order to ensure that the program will continue to execute correctly? Answer in one of the instructions shown above.
- G02 = ?
In order to reduce the interrupt latency, we would constrain the destination register for a div instruction. The ABI could be changed to reserve a specific x register to always hold the divide result, so the interrupt handler can avoid using it. In a more generalized exception handler, reading the register can be deferred towards the end of the context save, which might hide the divide latency. Does this approach require any hardware modifications? G03 (Answer in Yes or No) What is the corresponding limitation? G04 (Explain!)
- G03 = ?
- G04 = ?
Let's think of the alternative approach to reduce the interrupt latency. The div instruction could be redefined to write to a special-purpose register separate from the x register file. This is the approach adopted by MIPS with its hi and lo registers, but it involves an ISA change with RISC-V. A tempting proposition is to kill the ongoing divide operation and save the intermediate pipeline state so that it can be restarted after the interrupt. Would this approach create precise interrupt? Explain!
- G05 = ?

Problem `H`

Consider a RISC-V processor that includes 2³⁰ bytes of virtual memory, 2²⁰ bytes of physical memory, and uses a page size of 2⁷ bytes. With a 4-element, fully-associative Translation Lookaside Buffer (TLB) with an LRU replacement policy. A program running on the processor is halted right before executing the following instruction located at address 0x4BA:

. = 0x4BA
lw x3, 0(x4) // x4 = 0xCAFE4

The first 8 entries of the page table are shown below. The page table uses an LRU replacement policy. Assume that all physical pages are currently in use.

	VPN	V	R	D	PPN
LRU $\to$	0xD09	1	1	0	0x2
	0x1	1	1	0	0xFA
	0xCAF	1	1	1	0xFF
	0x3	1	1	0	0xB

$↑$ TLB

	VPN	R	D	PPN
Next LRU $\to$	0	1	0	0x13
	1	1	0	0xFA
	2	0	0	0x3
	3	1	0	0xB
	4	1	0	0x1
	5	0	-	-
	6	0	-	-
LRU $\to$	7	1	0	0xF
	…

$↑$ Page Table

For each virtual address in the chart below, please indicate the VPN, whether or not the access results in a TLB Hit, whether or not the access results in a page fault, the PPN, and the physical address. If there is not enough information given to determine a given value, please write None. Please write all numerical values in hexadecimal.

Virtual Address	VPN	TLB Hit (Answer in Yes/No)	Page Fault (Answer in Yes/No)	PPN	Physical Address
`0x4BA`	`0x4`	H01	H02	H03	H04
`0xCAFE4`	H05	H06	H07	H08	H09

H01 = ?

H02 = ?

H03 = ?

H04 = ?

H05 = ?

H06 = ?

H07 = ?

H08 = ?

H09 = ?

Problem `I`

Below shows the solution to two process mutual exclusion problem. Answer True or False to the following questions:

Since variable turn can take only two values then it imposes "strict alternations" between two processes.
- I01 = ?
This algorithm satisfies the "mutual exclusion" and "progress" conditions but not the "bounded waiting" condition.
- I02 = ?
This algorithm may cause "deadlock" if both processes set their flags to True at the same time.
- I03 = ?
This algorithm has a flaw as the variable turn can be modified by both processes at the same time.
- I04 = ?

Problem `J`

Consider the C code below:

void QuickSort(int *lst, int start, int end)
{
    if (end > start) {
        int i = start, j = end, key = lst[start];
        while (i < j) {
            for (; i < j && key <= lst[j]; j--);
            lst[i] = lst[j];
            for (; i < j && key >= lst[i]; i++);
            lst[j] = lst[i];
        }
        lst[i] = key;
        QuickSort(lst, start, i - 1);
        QuickSort(lst, i + 1, end);
    }
}

We are about to convert to RISC-V assembly:














































































































































.org 0x0
    .global _start
_start:

main:
    lui    a0, 0x00000
    addi   sp, a0, 0x400

    or     a2, a0, zero
    
    addi   t0, zero, -3
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -7
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 6
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 5
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -2
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 2
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -9
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -4
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -6
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 8
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 1
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -5
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 7
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 0
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 3
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -1
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 4
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, 9
    sw     t0,  (a2)
    addi   a2, a2, 4
    addi   t0, zero, -8
    sw     t0,  (a2)

    or     a1, zero, zero
    sub    a2, a2, a0
    jal    ra, QuickSort
infinity_loop:
    jal   zero, infinity_loop

QuickSort:
        bge    a1, a2, QuickSortReturn                # if a1>=a2, end<=start, jump to return
        or     t1, a1, zero                           # t1=i=a1=start
        or     t2, a2, zero                           # t2=j=a2=end
        add    t0, a0, t1                             # 
        lw     t0, (t0)                               # t0=key=lst[start]

        PartationStart:          
            PartationFirstStart:                      # start of for loop
                bge    t1, t2, PartationEnd           # if i>=j, branch to next step
                add    t3, a0, t2                     # 
                lw     t3, (t3)                       # t3=lst[j]
                blt    t3, t0, PartationFirstEnd      # if lst[j]<key, branch to next step
                addi   t2, t2, -4                     # t2-=4  j--
                jal    zero, PartationFirstStart      # for loop
            PartationFirstEnd:                        # end of for loop
            add    t4, a0, t1                         # t4=lst+i
            sw     t3, (t4)                           # lst[i] = t3 = lst[j]
            
            PartationSecondStart:                     # start of for loop
                bge    t1, t2, PartationEnd           # if i>=j, branch to next step
                add    t3, a0, t1                     # 
                lw     t3, (t3)                       # t3=lst[i]
                blt    t0, t3, PartationSecondEnd     # if key<lst[i], branch to next step
                addi   t1, t1, 4                      # t1+=4  i++
                jal    zero, PartationSecondStart     # for loop
            PartationSecondEnd:                       # end of for loop 
            add    t4, a0, t2                         # t4=lst+j
            sw     t3, (t4)                           # lst[j] = t3 = lst[i]
            
            J01 # Implement this!
        PartationEnd:

        add    t4, a0, t1                             # t4=lst+i
        sw     t0, (t4)                               # lst[i] = t0 = key
        
        addi   sp, sp, -4                              # sp-=4        
        sw     ra, (sp)                                # mem[sp] = ra # push ra to stack
        addi   sp, sp, -4                              # sp-=4
        sw     a1, (sp)                                # mem[sp] = a1 # push a1 to stack, save start
        addi   sp, sp, -4                              # sp-=4        
        sw     a2, (sp)                                # mem[sp] = a2 # push a2 to stack, save end
        addi   sp, sp, -4                              # sp-=4        
        sw     t1, (sp)                                # mem[sp] = t1 # push t1 to stack, save i
        addi   a2, t1, -4                              # a2 = i-4, a parameter for recursive call
        J02 # Implement this!
        lw     t1, (sp)                                # pop i form stack 
        addi   sp, sp,  4                              # sp+=4
        lw     a2, (sp)                                # pop end form stack 
        addi   sp, sp,  4                              # sp+=4
        lw     a1, (sp)                                # pop start form stack 

        addi   sp, sp, -4                              # sp-=4        
        sw     a2, (sp)                                # mem[sp] = a2 # push a2 to stack, save end
        addi   sp, sp, -4                              # sp-=4        
        sw     t1, (sp)                                # mem[sp] = t1 # push t1 to stack, save i
        addi   a1, t1, 4                               # a1 = i+4, a parameter for recursive call
        J03 # Implement this!
        lw     t1, (sp)                                # pop i form stack 
        addi   sp, sp,  4                              # sp+=4
        lw     a2, (sp)                                # pop end form stack 
        addi   sp, sp,  4                              # sp+=4
        lw     a1, (sp)                                # pop start form stack 
        addi   sp, sp,  4                              # sp+=4
        lw     ra, (sp)                                # pop ra form stack 
        addi   sp, sp,  4                              # sp+=4

    QuickSortReturn:
        jalr   zero, ra, 0

Fill in J01 (Line 104), J02 (Line 119), and J03 (Line 131) to make the above program work as expected.

J01 = ?

J02 = ?

J03 = ?

Problem `K`

Assume that we are working on a new microprocessor with new ISA. We are bored with the different RISC-V instruction types, and therefore we decide to include only one, universal type – the Z-type instruction.

Say we wish to include the following instructions:

add rd1, rs1, rs2
and rd1, rs1, rs2
lw rd1, offset1 (rs1)
sw rs2, offset1 (rs1)
addi rd1, rs1, imm1
beq rs1, rs2, offset1
lui rd1, offset1
jal rd1, imm
stw rs3, offset1, offset2 (rs1)

The new stw instruction stores the contents of rs3 into both rs1 + offset1 and rs1 + offset2. The RTL is:

Mem(R[rs1] + offset1) ← R[rs3] AND Mem(R[rs1] + offset2) ← R[rs3]

The funct3 and funct7 fields should be eliminated, and simply an opcode should be used instead. How many bits must the opcode field have if we simply want to support the above instructions? (Answer in bits)
- K01 = ?
With only one instruction, we want to be able to jump up to 64 KiB in either direction. How many bits are required to encode an immediate for us to be able to achieve this? Assume that, like RV32, the instruction's least significant bit is an implicit 0 and not saved. (Answer in bits)
- K02 = ?
Then, we ultimately choose the following instruction format. We have added a few fields to account for any future additions of additional instructions. The list index provided at the start of this problem's instructions is also the opcode for each instruction (sw, for instance, has opcode 3).

imm2 imm1 rs3 rs2 rs1 rd2 rd1 opcode

This instruction format is quite long, so we decide to work on a 64-bit machine. Each immediate field is 11 bits, and the opcode is 7 bits. What is the maximum number of registers we can have? (Answer in registers)
- K03 = ?

Quiz7 of Computer Architecture (2022 Fall)

Problem A

Problem B

Problem C

Problem D

Problem E

Problem F

Problem G

Problem H

Problem I

Problem J

Problem K