Assignment1: RISC-V Assembly and Instruction Pipeline
===
> contributed by [weiweiwei68](https://github.com/weiweiwei68/RISC-V-Assembly-and-Instruction-Pipeline)
Translate C code to RISC-V assembly language
---
In this assignment, Problem A from Quiz1 is chosen to convert it from C code to a RISC-V assembly program. For the convenience, the C code will be presented below.
```c!
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555);
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (64 - (x & 0x7f));
}
```
As RV32I instructions are required, the input of this function, defined as ```uint64_t x```, should be split into two parts. For more specific, since ```x``` is a 64-bit value, it should be loaded into two separates on a RV32I system.
The following block presents the RISC-V instruction corresponding to the previously mentioned C code.
```c
.data
# num1 and num2 represent two separate 32-bit data values that can be combined to form a single 64-bit data value
num1: .word 0x2B52EF8D
num2: .word 0x465C46C8
num3: .word 0xFF54C6B8
num4: .word 0x58AFE26C
num5: .word 0xF5C486AB
num6: .word 0xEFA52F6C
.text
main:
lw a0, num2
lw a1, num1
jal ra, count_leading_zeros
lw a0, num4
lw a1, num3
jal ra, count_leading_zeros
lw a0, num6
lw a1, num5
jal ra, count_leading_zeros
#Exit program
li a7, 10
ecall
count_leading_zeros:
addi sp, sp, -24
sw ra, 16(sp)
sw a0, 8(sp)
sw a1, 0(sp)
# Prepare two registers for storing argument x
# Arguments: a0 = x_low (lower 32 bits of x)
# a1 = x_high (upper 32 bits of x)
# Return value: a0 = number of leading zeros
# Initialize the x as input value
add a0, a0, x0 # a0 = x_low
add a1, a1, x0 # a1 = x_high
addi t0, x0, 32 # t0 = 32
addi t1, x0, 1 # t1 = 1, 2, 4, 8, 16, 32 for each iteration
loop: # x |= (x >> n)
sub t2, t0, t1 # t2 = 31, 30, 28, 24, 16, 0 for each iteration
sll a3, a1, t2 #put the shift value from x_high to correct position
srl a2, a0, t1 # shift right t1 bits in x_low
or a2, a2, a3 # add the shift value from x_high to x_low
or a0, a0, a2 # or x_low
srl a2, a1, t1 # shift x_high
or a1, a1, a2 # or x_high
slli t1, t1, 1
bge t0, t1, loop
count_ones: # Count ones (population count)
# x -= ((x >> 1) & 0x5555555555555555)
slli a3, a1, 31 #put the shift value from x_high to correct position
srli a2, a0, 1 # shift right 1 bit in x_low
or a2, a2, a3
srli a4, a1, 1 #shift right 1 bit in x_high
li a7, 0x55555555
add t0, x0, a7
and a2, a2, t0
and a4, a4, t0
# x = a1 a0 - a4 a2
bge a2, a0, sub # a5 = 1 when a0 < a2
sub a0, a0, a2
sub a1, a1, a4
#sub a1, a1, a5
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
slli a3, a1, 30 #put the shift value from x_high to correct position
srli a2, a0, 2 # shift right 2 bits in x_low
or a2, a2, a3
srli a4, a1, 2 #shift right 2 bits in x_high
li a7, 0x33333333
add t0, x0, a7
and a2, a2, t0 # lower 32 bits of ((x >> 2) & 0x3333333333333333)
and a4, a4, t0 # higher 32 bits of ((x >> 2) & 0x3333333333333333)
and a0, a0, t0 # lower 32 bits of (x & 0x3333333333333333)
and a1, a1, t0 # higher 32 bits of (x & 0x3333333333333333)
# x = a4 a2 + a1 a0
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f
slli a3, a1, 28 #put the shift value from x_high to correct position
srli a2, a0, 4 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 4 # shift right 4 bits in x_high
# add a4 a2 and a1 a0. ((x >> 4) + x)
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# a1 a0 & 0x0f0f0f0f0f0f0f0f
li a7, 0x0f0f0f0f
add t0, x0, a7
and a0, a0, t0
and a1, a1, t0
# x += (x >> 8)
slli a3, a1, 24 #put the shift value from x_high to correct position
srli a2, a0, 8 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 8 #shift right 4 bits in x_high
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x += (x >> 16)
slli a3, a1, 16 #put the shift value from x_high to correct position
srli a2, a0, 16 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 16 #shift right 4 bits in x_high
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x += (x >> 32)
slli a3, a1, 0
li a7, 32 #put the shift value from x_high to correct position
srl a2, a0, a7
or a2, a2, a3
srl a4, a1, s7
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# (64 - (x & 0x7F))
addi t0, x0, 64
andi a0, a0, 0x7f
sub a0, t0, a0 # a0 represent the number of leading zeros
lw ra, 16(sp)
addi sp, sp, 24
ret
sub:
addi a1, a1, -1
li a7, 0xffffffff
sub a7, a7, a2
addi a7, a7, 1
add a0, a7, a0
sub a1, a1, a4
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
slli a3, a1, 30 #put the shift value from x_high to correct position
srli a2, a0, 2 # shift right 2 bits in x_low
or a2, a2, a3
srli a4, a1, 2 #shift right 2 bits in x_high
li a7, 0x33333333
add t0, x0, a7
and a2, a2, t0 # lower 32 bits of ((x >> 2) & 0x3333333333333333)
and a4, a4, t0 # higher 32 bits of ((x >> 2) & 0x3333333333333333)
and a0, a0, t0 # lower 32 bits of (x & 0x3333333333333333)
and a1, a1, t0 # higher 32 bits of (x & 0x3333333333333333)
# x = a4 a2 + a1 a0
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f
slli a3, a1, 28 #put the shift value from x_high to correct position
srli a2, a0, 4 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 4 # shift right 4 bits in x_high
# add a4 a2 and a1 a0. ((x >> 4) + x)
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# a1 a0 & 0x0f0f0f0f0f0f0f0f
li a7, 0x0f0f0f0f
add t0, x0, a7
and a0, a0, t0
and a1, a1, t0
# x += (x >> 8)
slli a3, a1, 24 #put the shift value from x_high to correct position
srli a2, a0, 8 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 8 #shift right 4 bits in x_high
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x += (x >> 16)
slli a3, a1, 16 #put the shift value from x_high to correct position
srli a2, a0, 16 # shift right 4 bits in x_low
or a2, a2, a3
srli a4, a1, 16 #shift right 4 bits in x_high
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# x += (x >> 32)
slli a3, a1, 0
li a7, 32 #put the shift value from x_high to correct position
srl a2, a0, a7
or a2, a2, a3
srl a4, a1, s7
# x = a1 a0 + a4 a2
add a0, a0, a2
slt a5, a0, a2 # a5 = 1 when a0 < a2
add a1, a1, a4
add a1, a1, a5
# (64 - (x & 0x7F))
addi t0, x0, 63
andi a0, a0, 0x7f
sub a0, t0, a0 # a0 represent the number of leading zeros
lw ra, 16(sp)
addi sp, sp, 24
ret
```
Explaination
---
```c
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
```
The code snippet provided is a part of the Count Leading Zeros (CLZ) algorithm. Each line in the code performs a right shift operation on the variable ```x``` with a varying number of bits to the right. To streamline and enhance the code's readability, we can use a loop to control the right-shifting operation, with each iteration using a unique shift value.
The following RISC-V code is the initial version before introducing the loop function, also referred to as a branch. The primary variation in this code lies in the number of bits shifted to the right. To streamline the code, we can use the ```bge t0, t1, loop``` instruction, where ```t0``` is set to 32 and ```t1``` is updated to 1, 2, 4, 8, 16, and 32 during each loop iteration. This enables more efficient control of the right-shifting process.
```
# x |= (x >> 1)
slli a3, a1, 31 #put the shift value from x_high to correct position
srli a2, a0, 1 # shift x_low
or a2, a2, a3 # add the shift value from x_high
or a0, a0, a2 # or x_low
srli a2, a1, 1 # shift x_high
or a1, a1, a2 # or x_high
# x |= (x >> 2)
slli a3, a1, 30
srli a2, a0, 2
or a2, a2, a3
or a0, a0, a2
srli a2, a1, 2
or a1, a1, a2
# x |= (x >> 4)
slli a3, a1, 28
srli a2, a0, 4
or a2, a2, a3
or a0, a0, a2
srli a2, a1, 4
or a1, a1, a2
# x |= (x >> 8)
slli a3, a1, 24
srli a2, a0, 8
or a2, a2, a3
or a0, a0, a2
srli a2, a1, 8
or a1, a1, a2
# x |= (x >> 16)
slli a3, a1, 16
srli a2, a0, 16
or a2, a2, a3
or a0, a0, a2
srli a2, a1, 16
or a1, a1, a2
# x |= (x >> 32)
li a7, 32
sll a3, a1, a7
srli a2, a0, 0
or a2, a2, a3
or a0, a0, a2
srli a2, a1, 0
or a1, a1, a2
```
The following image demonstrate the result of assembly.

Enhancing buffer overflow defense with return address protection and CLZ
---
Return Address Protection is a security mechanism aimed at preventing buffer overflow attacks by safeguarding the integrity of the return addresses stores on the stack. In buffer overflow attacks, attackers often manipulate the return address of a function to redirect program execution to the malicious code. Buffer overflow attacks involve overwriting the return address on the stack with a malicious address, making the program jump to the attacker's code instead of returning to the intended location.
In this assignment, i will uitilize the CLZ(counting leading zeros) to help detect the return address and check if the address has been tampered with.
```c
#include <stdio.h>
#include <stdint.h>
#define CANARY_SIZE 8 // Size of the canary in bytes (64 bits)
// Define the CLZ function (as provided)
uint16_t count_leading_zeros(uint64_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555);
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
return (64 - (x & 0x7f));
}
// Function to simulate a buffer overflow attack
void buffer_overflow_attack(char* buffer)
{
// Overwrite the return address with a malicious address
uint64_t malicious_address = 0xDEADBEEF;
*((uint64_t*)(buffer + CANARY_SIZE)) = malicious_address;
}
int main()
{
char buffer[32]; // Example buffer with space for the canary and data
// Generate a random canary value (simulated)
uint64_t canary_value = 0x123456789ABCDEF0;
// Place the canary value at the beginning of the buffer
*((uint64_t*)buffer) = canary_value;
// Simulate a buffer overflow attack
buffer_overflow_attack(buffer);
// Check the integrity of the canary value using CLZ
uint64_t stored_canary = *((uint64_t*)buffer);
uint16_t clz_result = count_leading_zeros(stored_canary ^ canary_value);
// If CLZ detects a change in the canary value, it indicates an attack
if (clz_result != 64) {
printf("Buffer overflow detected! The value start to be different at %d bits from left.\n", clz_result+1);
} else {
printf("No buffer overflow detected.\n");
}
return 0;
}
```
In this C code, the CLZ is employed to check if the canary value and the stored canary are the smae. If they are identical in binary format, the operation ```stored_canary ^ canary_value``` would equal to zero among 64 bits. Oppositely, if the two values are different, the CLZ can find the left most bits where the value is tampered.
```c
void buffer_overflow_attack(char* buffer)
{
// Overwrite the return address with a malicious address
uint64_t malicious_address = 0xDEADBEEF;
*((uint64_t*)(buffer + CANARY_SIZE)) = malicious_address;
}
```
The function ```buffer_overflow_attack``` simulate a buffer overflow attack. This function takes a return address as input and output a tampered return address. This tampered address may lead to malicious instructions exercuted. So our work is to determine whether the return address is changed or not.
In this code, we have the following variables and actions:
1. ```canary_value```: Initially set to ```0x123456789ABCDEF0```, this variable represents a security canary value, often used to detect buffer overflow attacks.
2. A buffer overflow attack simulation is performed, which changes the ```canary_value``` to ```0x12345678DEADBEEF```. This change symbolizes the tampering of the return address in a real-world buffer overflow attack.
3. The operation ```stored_canary ^ canary_value``` is used to check whether the original return address matches the final return address. If they are identical, this operation will result in all zeros. In this instance, this operation produces a result with 32 leading zeros, indicating that the original and tampered return addresses match for the first 32 bits.
4. The 33rd bit from the left is the first bit to be different between the original and tampered return addresses, indicating a potential security breach or tampering.
Reference
===
[Buffer overflow](https://medium.com/@b3rm1nG/%E7%B7%A9%E8%A1%9D%E5%8D%80%E6%BA%A2%E4%BD%8D%E6%94%BB%E6%93%8A%E4%B9%8B%E4%B8%80-buffer-overflow-83516aa80240)