林濤
Implementation of "Decimation-in-time" FFT algorithm by using C and RISC-V. Test performance using spike.
Since the complexity of "Discrete Fourier Transform" is
In order to have less complexity, FFT(Fast Fourier Transform) technic was introduced.
Consider the definition of DFT mentioned above
the series
For example, consider
Continue after above case. Take
Further step, we can decomposite
In 8 points case, finally, we can obtain 2 points DFT. Now we can also represent 2 points DFT in the form of twiddle factor.
Worthy to notice is that in the each stage(3 in 8 points), we can represent only one pair of twiddle factors. That is, take stage 2,
With this idea, we can decomposite
In this version, all function are implemented using C. The logint() and reverse() functions are refered to Problem D, quize 5. The major function, DIT_FFT, is shown below.
void DIT_FFT(int size, float *real, float *image) {
int i, j, k, stage, step;
int halfStep;
float tempReal, tempImage;
// data swift to new index
for (i = 0; i < size; i++) {
int revIndex = reverse(size, i);
if (i < revIndex) {
// Swap real parts
tempReal = real[i];
real[i] = real[revIndex];
real[revIndex] = tempReal;
// Swap imaginary parts
tempImage = image[i];
image[i] = image[revIndex];
image[revIndex] = tempImage;
}
}
// Perform FFT computation
for (stage = 1; stage <= logint(size); stage++) {
step = 1 << stage; // 2^stage
halfStep = step >> 1; // Step size / 2
for (k = 0; k < size; k += step) {
for (j = 0; j < halfStep; j++) {
// Compute twiddle factor
float angle = -2.0f * M_PI * (float)j / (float)step; // Angle for twiddle factor
float cosValue = cos(angle); // Replace with precomputed cosine lookup table for optimization
float sinValue = sin(angle); // Replace with precomputed sine lookup table for optimization
// Butterfly computation
int index1 = k + j;
int index2 = index1 + halfStep;
tempReal = cosValue * real[index2] - sinValue * image[index2];
tempImage = sinValue * real[index2] + cosValue * image[index2];
real[index2] = real[index1] - tempReal;
image[index2] = image[index1] - tempImage;
real[index1] += tempReal;
image[index1] += tempImage;
}
}
}
}
Function of this assembly code, we can reordered index in the algorithm of FFT now.
This code with some revices refer to quize5 problem D, C.A.2024.
.global logint
logint:
addi sp, sp, -4
sw t0, 0(sp)
add t0, a0, zero # k = N
add a0, zero, zero # i = 0
logloop:
beq t0, zero, logloop_end # Exit if k == 0
srai t0, t0, 1 # k >>= 1
addi a0, a0, 1 # i++
j logloop
logloop_end:
addi a0, a0, -1 # Return i - 1
lw t0, 0(sp)
addi sp, sp, 4
jr ra
reverse:
addi sp, sp, -28
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
sw s3, 16(sp)
sw s4, 20(sp)
sw s5, 24(sp)
jal logint # jump to logint and save position to ra
addi s0, zero, 1 # j = 1
add s1, zero, zero # p = 0
forloop_reverse:
bgt s0, a0, forloop_reverse_end
sub s2, a0, s0# s2 = a0 - s0
addi s3, zero, 1
sll s3, s3, s2
and s3, a1, s3
beq s3, zero, elses3 # If not, skip
ifs3:
addi s4, s0, -1 # s4 = j - 1
addi s5, zero, 1
sll s5, s5, s4
or s1,s1,s5
elses3:
addi s0, s0, 1 # j++
j forloop_reverse
forloop_reverse_end:
add a0, s1, zero # Return p
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
lw s3, 16(sp)
lw s4, 20(sp)
lw s5, 24(sp)
addi sp, sp, 28
jr ra
// a0=points
// a1=current index
// a2=address of real sequence
// a3=address of image sequence
swift:
addi sp, sp, -28
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
sw s3, 16(sp)
sw s4, 20(sp)
sw s5, 24(sp)
addi s0, a1, 0 #s0 = a1, current index save to s0
# a0 = N(size), a1 = n(current index)
jal reverse # now a0 = reverse (a0)
slli s0, s0, 2 # s0 = s0 * 4 address shift of current index
slli s1, a0, 2 # s1 = a0 * 4 address shift of reversed index
# Deal with real part exchange
add s2, a2, s0 # s2 = address of current index
add s3, a2, s1 # s3 = address of exchange index
lw s4, 0(s2) # s4 = the value of current index
lw s5, 0(s3) # s5 = the value of reverse index
sw s5, 0(s2) # store reversed value to current index
sw s4, 0(s3) # store current value to reversed index
# Deal with image part exchange
add s2, a3, s0 # s2 = address of current index
add s3, a3, s1 # s3 = address of exchange index
lw s4, 0(s2) # s4 = the value of current index
lw s5, 0(s3) # s5 = the value of reverse index
sw s5, 0(s2) # store reversed value to current index
sw s4, 0(s3) # store current value to reversed index
#recover registers
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
lw s4,20(sp)
lw s5,24(sp)
addi sp,sp,28
jr ra
In this assembly, I try to implement the addition operation of two float point numbers of IEEE 754 standard. Below are the steps how I realized this assembly.
However, due to deadline, I still can't overcome problems about overflow, even some specil cases.
.global f_add
f_add:
addi sp,sp,-24
sw ra, 0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
sw s4,20(sp)
slli t0, a0, 1
slli t1, a1, 1 # ignore sign bit, compare which is larger
srli t0, t0, 1
srli t1, t1, 1
#####(now a0 >= a1)
lui t2 ,0x80000
and t2, a0, t2
and s0, a1, t2 #check sign difference, store in s0
bgt t1, t0, reorder
keep_add:
lui t0, 0x7F800 # mask for exp
lui t1, 0x80000 # mask for sign
and s1, a0, t0
and s2, a1, t0 # exp in s1, s2
lui t1, 0x007FF
li t0, 0xFFF
add t0, t0, t1 # mask for mantissa
srli s1,s1,23
srli s2,s2,23
and s3, t0, a0
and s4, t0, a1 # mantissa in s3, s4
lui t1, 0x00800 # implicit bit
sub s2, s1, s2 # exp s1-s2
add s3, s3, t1
add s4, s4, t1 # with implicit
bnez s0, sign_diff
# sign same
# mantissa +
# consider bit =? 0 at position24, if so, shift mantissa and exp -1
srl s4, s4, s2 # align exp
add s2, s3, s4 # result mantissa = s3
j clnz
sign_diff:
#mantissa a0- mantissa a1
# use sign of a0 as major
# case if all zero
srl s4, s4, s2 # align exp
sub s2, s3, s4 # result mantissa = s3
j clnz
reorder:
addi t0, a0, 0
addi a0, a1, 0
addi a1, t0, 0
j keep_add
mantissa_0:
li a0,0
j over
# used for exp shift
# s1 exp, s2, mantissa result with implicit
clnz:
li t0,32 #count non leading zero poosition
li t1,128
beqz s2, mantissa_0
bgt s2,t1, over
clnz_loop:
lui t1, 0x80000 # fitst bit mask
and t1, s2, t1 # check first bit
bnez t1, end_clnz
slli s2, s2,1
addi t0, t0, -1 # position -1
j clnz_loop
end_clnz:
addi t0, t0,-24 # bit position should at 24
lui t1, 0x00800 # implicit bit
bltz t0, clnz_exp_minus
add s1, s1, t0
srli s2, s2, 8
sub s2, s2, t1
slli s1, s1, 23
j result # return with mantissa(no implict)
clnz_exp_minus:
neg t0, t0
sub s1, s1, t0
srli s2, s2, 8 # shift implicit bit to 24th bit position
sub s2, s2, t1
slli s1, s1, 23
result:
# combine , s1 = exp, s2 = result mantissa
add s1, s1, s2
lui t0, 0x80000
and a0, t0, a0 # sign bit
or a0, a0, s1
over: #special case
lw ra,0(sp)
lw s0,4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
lw s3, 16(sp)
lw s4,20 (sp)
addi sp,sp,24
jr ra
Below are my float point multiplication code.
.global f_mul
f_mul:
addi sp,sp,-28
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
sw s4,20(sp)
sw s5,24(sp)
srli s0, a0, 31 # s0 = a0 >> 31, sign bit of s0
srli s1, a1, 31 # s1 = a1 >> 31, sign bit of s1
li t0, 1
lui s2, 0x007FF
li s3, 0xFFF
add s3, s3, s2 # s3, mantissa mask
and s2, a0, s3 # s2 = a0 & 0x7FFFFF, mantissa of a0
and s3, a1, s3 # s3 = a1 & 0x7FFFFF, mantissa of a1
slli t0, t0, 23 # t0 = implicit 1, at bit position 24
li s4, 1
xor s4, s0, s1 # s4 = s0 ^ s1, sign of result
lui s5, 0x7F800 # s5 = 0x7F800000, exponent mask
add s2, s2, t0 # s2 = s2 + t0, mantissa of a0 with implicit 1
add s3, s3, t0 # s3 = s3 + t0, mantissa of a1 with implicit 1
jal mul_mantissa # s2, s3 input
and s1, a0, s5 # s1 = a0 & 0x7F800000, exponent of a0
and s2, a1, s5 # s2 = a1 & 0x7F800000, exponent of a1
slli s4,s4,31 # s4 = s4 << 31, sign of result
add s3, s2, s1 # s3 = s2 + s1, exponent of result
srli s3, s3, 23 # s3 = s3 >> 23, s3 = exponent of result
bgtz s3, normal_exp
addi s3, s3, -126 # s3 = s3 - 127, one bias, s3 = exponent of result
slli s3, s3, 23
or s4, s4, s0 # s4 = s4 + s0, sign + mantissa
or s4, s4, s3 # s4 = s4 + s3, sign + mantissa + exponent
j end_f_mul
normal_exp:
addi s3, s3, -127 # s3 = s3 - 127, one bias, s3 = exponent of result
slli s3, s3, 23 # s3 = s3 << 23, s3 = exponent of result
or s4, s4, s0 # s4 = s4 + s0, sign + mantissa
or s4, s4, s3 # s4 = s4 + s3, sign + mantissa + exponent
end_f_mul:
add a0, s4, zero # a0 = s4 + 0, return value
lw ra, 0(sp) # restore ra
lw s0, 4(sp) # restore s0
lw s1, 8(sp) # restore s1
lw s2, 12(sp) # restore s2
lw s3, 16(sp) # restore s3
lw s4, 20(sp) # restore s4
lw s5, 24(sp) # restore s5
addi sp, sp, 28 # restore sp
jr ra
mul_mantissa:
# mantissa s2, s3
srli s2, s2, 8
srli s3, s3, 8
lui t1, 0x00008 # bit position of implicit 1, start at 16
li t2, 0 # t2 = 0, temp value
add_mantissa:
lui t0, 0x80000 # bit position 32 check
and t3, s3, t1 # t3 = s3 & 0x000008, addition check bit
beqz t3, skip_mul_add # if this bit = 0, skip add
add t2,t2,s2 # t2 = t2 + s2
slli t2, t2, 1 # right shift temp value
slli s3,s3,1 # left shift new 16th bit of s3
and t0, t0, t2 # take out 32th bit of t2
bnez t0, end_mul_mantissa # if 32th bit is 1
j add_mantissa
skip_mul_add:
slli t2, t2, 1 # right shift temp value
slli s3,s3,1 # left shift new 16th bit of s3
and t0, t0, t2 # take out 32th bit of t2
bnez t0, end_mul_mantissa # if 32th bit is 1
j add_mantissa
end_mul_mantissa:
lui t1, 0x80000
sub t2, t2, t1
srli s0,t2,8 # right shift to right position of mantissa
jr ra