劉益祥
The CORDIC algorithm operates using iterative shift-add arithmetic and is based on vector rotation in a plane. By sequentially rotating a vector through predefined angles, the algorithm computes trigonometric functions like sine and cosine. It avoids the need for direct multiplication or division by using only addition, subtraction, bit shifts, and table lookups.
Trying to use uint32_t
with shift and add operations to replace float
with floating-point operations in cordic algorithm.
refrence from 2024 Quiz1 Problem A
static inline float bits_to_fp32(uint32_t w)
{
union {
uint32_t as_bits;
float as_value;
} fp32 = {.as_bits = w};
return fp32.as_value;
}
refrence from 2024 Quiz1 Problem A
static inline uint32_t fp32_to_bits(float f)
{
union {
float as_value;
uint32_t as_bits;
} fp32 = {.as_value = f};
return fp32.as_bits;
}
modified from 2023 Quiz1 Problem A
uint32_t myclz_32(uint32_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x -= ((x >> 1) & 0x55555555);
x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
return (32 - (x & 0x7f));
}
uint32_t add_32(uint32_t a, uint32_t b){
uint32_t rst_sign, rst_exp, rst_fra, rst;
uint32_t a_sign = a >> 31;
uint32_t a_exp = (uint32_t)(a << 1) >> 24;
uint32_t a_fra = (a & 0x7fffff) | 0x800000;
uint32_t b_sign = b >> 31;
uint32_t b_exp = (uint32_t)(b << 1) >> 24;
uint32_t b_fra = (b & 0x7fffff) | 0x800000;
if(a_exp > b_exp){
uint32_t sft = a_exp - b_exp;
b_fra >>= sft;
if(a_sign ^ b_sign)
rst_fra = a_fra - b_fra;
else
rst_fra = a_fra + b_fra;
rst_exp = a_exp;
rst_sign = a_sign;
}
else if(a_exp < b_exp){
uint32_t sft = b_exp - a_exp;
a_fra >>= sft;
if(a_sign ^ b_sign)
rst_fra = b_fra - a_fra;
else
rst_fra = a_fra + b_fra;
rst_exp = b_exp;
rst_sign = b_sign;
}
else{
rst_exp = a_exp;
if(a_sign ^ b_sign){
if(a_fra > b_fra){
rst_fra = a_fra - b_fra;
rst_sign = a_sign;
}
else{
rst_fra = b_fra - a_fra;
rst_sign = b_sign;
}
}
else{
rst_fra = a_fra + b_fra;
rst_sign = a_sign;
}
}
//normalize
int lz = myclz_32(rst_fra);
if(lz <= 8){
lz = 8 - lz;
rst_fra >>= lz;
rst_exp += lz;
}
else{
lz -= 8;
rst_fra <<= lz;
rst_exp -= lz;
}
rst_fra &= 0x7fffff;
rst_sign <<= 31;
rst_exp <<= 23;
rst = rst_sign | rst_exp | rst_fra;
return rst;
}
uint32_t mul_32(uint32_t a, uint32_t b){
uint32_t a_sign = a >> 31;
uint32_t a_exp = (uint32_t)(a << 1) >> 24;
uint32_t a_fra = (a & 0x7fffff) | 0x800000;
uint32_t b_sign = b >> 31;
uint32_t b_exp = (uint32_t)(b << 1) >> 24;
uint32_t b_fra = (b & 0x7fffff) | 0x800000;
uint32_t sign = a_sign ^ b_sign;
uint32_t exp = a_exp + b_exp - 127;
uint64_t fra_64 = (uint64_t)a_fra * b_fra;
fra_64 >>= 23;
uint32_t fra = (uint32_t)fra_64;
//normalize
uint32_t lz = myclz_32(fra);
lz = 8 - lz;
fra >>= lz;
exp += lz;
fra -= 0x800000;
sign <<= 31;
exp <<= 23;
uint32_t rst = sign | exp | fra;
return rst;
}
input1 | input2 | answer | myanswer | error rate |
---|---|---|---|---|
15.961235 | 43.132904 | 59.094139 | 59.094139 | 0.000000e+00 |
36.952332 | 17.209785 | 54.162117 | 54.162117 | 0.000000e+00 |
-22.274956 | 44.483910 | 22.208954 | 22.208954 | 0.000000e+00 |
-27.907221 | -45.500877 | -73.408096 | -73.408096 | 0.000000e+00 |
-32.052887 | 31.473969 | -0.578918 | -0.578918 | 0.000000e+00 |
5.428631 | 45.491280 | 50.919910 | 50.919910 | 0.000000e+00 |
34.613480 | -40.486797 | -5.873318 | -5.873318 | 0.000000e+00 |
37.345215 | 38.584961 | 75.930176 | 75.930176 | 0.000000e+00 |
-22.956657 | -22.030470 | -44.987129 | -44.987125 | 8.479530e-08 |
12.832539 | 9.602684 | 22.435223 | 22.435223 | 0.000000e+00 |
input1 | input2 | answer | myanswer | error rate |
---|---|---|---|---|
-2.152653 | -18.543297 | 39.917278 | 39.917278 | 0.000000e+00 |
29.927048 | -44.895565 | -1343.591675 | -1343.591675 | 0.000000e+00 |
19.867577 | 34.633919 | 688.092041 | 688.091980 | 8.870202e-08 |
-13.439705 | 9.803593 | -131.757385 | -131.757385 | 0.000000e+00 |
49.927086 | 24.040535 | 1200.273804 | 1200.273804 | 0.000000e+00 |
1.543598 | 47.617355 | 73.502060 | 73.502060 | 0.000000e+00 |
-43.442417 | -24.092621 | 1046.641724 | 1046.641602 | 1.166305e-07 |
-23.320192 | -19.306118 | 450.222382 | 450.222382 | 0.000000e+00 |
44.823502 | 24.745850 | 1109.195679 | 1109.195557 | 1.100530e-07 |
-5.549877 | 11.534317 | -64.014046 | -64.014038 | 1.191831e-07 |
Noticed that when the hardware does not support floating-point operations, the compiler will translate floating-point computations into a software-supported form, similar to what I did above but with better performance.
Due to poor performance and the discovery that the mado
project uses fixed-point arithmetic for trigonometric calculations, an alternative method is adopted.
focus on the trig.c
todo : Replace the division in twin_tan()
with another method that requires fewer cycles.
Try using jemalloc
instead of division operations in twin_tan()
.
/* find last (most-significant) bit set */
#define fls32(x) ((x) ? sizeof(int) * 8 - __builtin_clz(x) : 0)
void fastdiv_prepare(uint32_t div, uint32_t *m, uint8_t *s1, uint8_t *s2)
{
const int l = fls32(div - 1);
const uint64_t nextpow2 = UINT64_C(1) << l;
uint32_t magic = ((nextpow2 - div) << 32) / div;
*m = magic + 1;
*s1 = (l > 1) ? 1 : l;
*s2 = (l == 0) ? 0 : l - 1;
if (div == 1)
return;
if (magic == 0) { /* div == nextpow2 */
*s1 -= 1, *s2 += 1;
} else {
magic = (magic + UINT64_C(0x100000000)) / 2;
uint32_t rem = (nextpow2 << 31) - magic * div;
assert(rem > 0 && rem < div);
if (rem > div - (nextpow2 >> 1)) {
*m = magic + 1;
*s1 -= 1;
}
}
}
static uint32_t fastdiv(uint32_t val, uint64_t mul, uint8_t s1, uint8_t s2)
{
const uint32_t hi = (val * mul) >> 32;
return (hi + ((val - hi) >> s1)) >> s2;
}
Need to know the value of the divisor in advance. Can try precomputing the divisor using fastdiv_prepare()
, storing the resulting values of m
, s1
, and s2
. When needed, retrieve these precomputed values from a lookup table for fast computation. The issue is that this approach requires a significant amount of storage space. Even if the divisor precision is limited to two decimal places, it would still require 100 × 48 bits, exceeding 4 kilobits of storage space.
table_gen()
full code
Code to generate lookup table. The original divisor had a precision of 16 bits, but due to table size considerations, only 7 bits of precision are retained.
void table_gen(){
FILE *f = fopen("data.txt", "w");
uint32_t m[127];
uint8_t s1[127], s2[127];
for(int i = 512, j = 0; i < 65536; i += 512, j++){
fastdiv_prepare(i, &m[j], &s1[j], &s2[j]);
fprintf(f, "%d,%d ", j, i);
}
fprintf(f, "\n\n");
for(int i = 0; i < 127; i++){
fprintf(f, "%u", m[i]);
if(i != 126)
fprintf(f, ", ");
}
fprintf(f, "\n\n");
for(int i = 0; i < 127; i++){
fprintf(f, "%u", s1[i]);
if(i != 126)
fprintf(f, ", ");
}
fprintf(f, "\n\n");
for(int i = 0; i < 127; i++){
fprintf(f, "%u", s2[i]);
if(i != 126)
fprintf(f, ", ");
}
fclose(f);
}
table be generated by table_gen()
static const uint32_t m[] = {1, 1, 2863311531, 1, 3435973837, 2863311531, 613566757, 1, 3817748708, 3435973837, 3123612579, 2863311531,
2643056798, 613566757, 2290649225, 1, 4042322161, 3817748708, 2938661835, 3435973837, 2249744775, 3123612579, 2987803337, 2863311531,
2748779070, 2643056798, 795364315, 613566757, 2369637129, 2290649225, 138547333, 1, 4164816772, 4042322161, 3558687189, 3817748708,
3134165325, 2938661835, 2753184165, 3435973837, 3352169597, 2249744775, 3196254732, 3123612579, 1813430637, 2987803337, 2924233053,
2863311531, 2804876602, 2748779070, 2694881441, 2643056798, 891408307, 795364315, 702812831, 613566757, 527452125, 2369637129, 2329473788,
2290649225, 2253097598, 138547333, 68174085, 1, 4228890877, 4164816772, 4102655328, 4042322161, 3983737782, 3558687189, 3871519817,
3817748708, 3235934265, 3134165325, 3665038760, 2938661835, 3569842948, 2753184165, 3479467177, 3435973837, 3393554407, 3352169597,
3311782012, 2249744775, 3233857729, 3196254732, 3159516172, 3123612579, 3088515809, 1813430637, 1746305385, 2987803337, 2955676419,
2924233053, 1491936009, 2863311531, 1372618415, 2804876602, 2776544515, 2748779070, 1148159575, 2694881441, 1042467791, 2643056798,
940802361, 891408307, 842937507, 795364315, 748664025, 702812831, 657787785, 613566757, 570128403, 527452125, 485518043, 2369637129,
403800345, 2329473788, 2309898378, 2290649225, 248469183, 2253097598, 174592167, 138547333, 2199023256, 68174085, 33818641};
static const uint8_t s1[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
1, 0, 1, 1, 0, 1, 1};
static const uint8_t s2[] = {9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
twin_tan()
use lookup table to apply jemalloc's fast division approach in the twin_tan()
function.
twin_fixed_t twin_tan_my(twin_angle_t a)
{
twin_fixed_t s, c;
twin_sincos(a, &s, &c);
if (c == 0) {
if (s > 0)
return TWIN_FIXED_MAX;
return TWIN_FIXED_MIN;
}
if (s == 0)
return 0;
int sign = s >> 31 ^ c >> 31;
if(c < 0)
c = -c;
if(s < 0)
s = -s;
int c1 = (c >> 9) + ((c & 256) >> 8);
c = c >> 9;
if(c == 127)
return sign ? -s : s;
else{
twin_fixed_t res = fastdiv(s << 15, m[c], s1[c], s2[c]) << 1;
return sign ? -res : res;
}
}
twin_tan()
without lookup tableDirectly using jemalloc fast division to replace the division in twin_tan()
twin_fixed_t twin_tan_fast(twin_angle_t a)
{
twin_fixed_t s, c;
twin_sincos(a, &s, &c);
if (c == 0) {
if (s > 0)
return TWIN_FIXED_MAX;
return TWIN_FIXED_MIN;
}
if (s == 0)
return 0;
int sign = s >> 31 ^ c >> 31 ? -1 : 1;
if(c < 0)
c = -c;
if(s < 0)
s = -s;
uint32_t mul;
uint8_t s1, s2;
fastdiv_prepare(c, &mul, &s1, &s2);
return sign * fastdiv(s << 15, mul, s1, s2) << 1;
}
code used to test accuracy
full code
int main(){
FILE *f = fopen("test_fastdiv.txt", "w");
fprintf(f, "| angle | answer | myanswer | fast_div | error rate | fast_div error rate |\n");
fprintf(f, "| ----- | ------ | -------- | -------- | ---------- | ------------------- |\n");
int16_t test[20] = {14659, 41967, 21825, 41634, 20873, 36471, 41746, 30660, 60680, 17029, 28151, 22910, 40912, 64147, 10802, 2959, 34279, 21628, 12564, 3480};
for(int i = 0; i < 20; i++){
float error_rate, error_rate_fast;
twin_fixed_t my = twin_tan_my(test[i]);
twin_fixed_t golden = twin_tan(test[i]);
twin_fixed_t fast = twin_tan_fast(test[i]);
int d = abs(my - golden), dfast = abs(fast - golden);
error_rate = (float)d / abs(golden);
error_rate_fast = (float)dfast / abs(golden);
fprintf(f, "| %d | %d | %d | %d | %f | %f |\n", test[i], golden, my, fast, error_rate, error_rate_fast);
}
fclose(f);
}
angle | answer | myanswer | fast_div answer | error rate | fast_div error rate |
---|---|---|---|---|---|
14659 | 35398 | 35282 | 35398 | 0.003277 | 0.000000 |
-23569 | 2513772 | 2096448 | 2513772 | 0.166015 | 0.000000 |
21825 | -122226 | -231048 | -122226 | 0.890334 | 0.000000 |
-23902 | 110156 | 112652 | 112652 | 0.022659 | 0.022659 |
20873 | 45094 | 44860 | 74302 | 0.005189 | 0.647714 |
-29065 | -45094 | -44860 | -74302 | 0.005189 | 0.647714 |
-23790 | 171578 | 244916 | 171578 | 0.427432 | 0.000000 |
30660 | -6044 | -6019 | -12038 | 0.004136 | 0.991727 |
-4856 | -152984 | -240992 | -152984 | 0.575276 | 0.000000 |
17029 | 99770 | 109556 | 109556 | 0.098086 | 0.098086 |
28151 | -67372 | -66810 | -93954 | 0.008342 | 0.394556 |
22910 | 43480 | 43344 | 72468 | 0.003128 | 0.666697 |
-24624 | -4830 | -4818 | -9636 | 0.002484 | 0.995031 |
-1389 | 104600 | 111078 | 111078 | 0.061931 | 0.061931 |
10802 | 76458 | 75822 | 76458 | 0.008318 | 0.000000 |
2959 | 374556 | 516480 | 374556 | 0.378913 | 0.000000 |
-31257 | -70772 | -96172 | -70772 | 0.358899 | 0.000000 |
21628 | -340624 | -514888 | -340624 | 0.511602 | 0.000000 |
12564 | 29512 | 29442 | 53826 | 0.002372 | 0.823868 |
3480 | -90698 | -106244 | -106244 | 0.171404 | 0.171404 |
The accuracy fluctuates significantly. Even when directly using jemalloc fast division to replace the division, there are still some results with excessively large accuracy errors.
Todo : Identify the cause of low accuracy.
Use Ripes to test the cycle count.
twin_tan()
directly using the division
twin_tan_my()
using jemalloc fast division with lookup table
twin_tan_fast()
using jemalloc fast division without lookup table
The version that uses a lookup table requires significantly fewer cycles compared to the version without the lookup table, but it still takes more cycles than directly performing division.