Try   HackMD

Optimization Design of CORDIC Algorithm

劉益祥

Study the CORDIC algorithm

The CORDIC algorithm operates using iterative shift-add arithmetic and is based on vector rotation in a plane. By sequentially rotating a vector through predefined angles, the algorithm computes trigonometric functions like sine and cosine. It avoids the need for direct multiplication or division by using only addition, subtraction, bit shifts, and table lookups.

Perform floating-point operations without using floating-point arithmetic

Trying to use uint32_t with shift and add operations to replace float with floating-point operations in cordic algorithm.

code

bits_to_fp32

refrence from 2024 Quiz1 Problem A

static inline float bits_to_fp32(uint32_t w)
{
    union {
        uint32_t as_bits;
        float as_value;
    } fp32 = {.as_bits = w};
    return fp32.as_value;
}

fp32_to_bits

refrence from 2024 Quiz1 Problem A

static inline uint32_t fp32_to_bits(float f)
{
    union {
        float as_value;
        uint32_t as_bits;
    } fp32 = {.as_value = f};
    return fp32.as_bits;
}

myclz_32

modified from 2023 Quiz1 Problem A

uint32_t myclz_32(uint32_t x)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    
    x -= ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;
    x += (x >> 8);
    x += (x >> 16);

    return (32 - (x & 0x7f));
}

add32

uint32_t add_32(uint32_t a, uint32_t b){
    uint32_t rst_sign, rst_exp, rst_fra, rst;
    uint32_t a_sign = a >> 31;
    uint32_t a_exp = (uint32_t)(a << 1) >> 24;
    uint32_t a_fra = (a & 0x7fffff) | 0x800000;

    uint32_t b_sign = b >> 31;
    uint32_t b_exp = (uint32_t)(b << 1) >> 24;
    uint32_t b_fra = (b & 0x7fffff) | 0x800000;
    if(a_exp > b_exp){
        uint32_t sft = a_exp - b_exp;
        b_fra >>= sft;
        if(a_sign ^ b_sign)
            rst_fra = a_fra - b_fra;
        else
            rst_fra = a_fra + b_fra;
        rst_exp = a_exp;
        rst_sign = a_sign;
    }
    else if(a_exp < b_exp){
        uint32_t sft = b_exp - a_exp;
        a_fra >>= sft;
        if(a_sign ^ b_sign)
            rst_fra = b_fra - a_fra;
        else
            rst_fra = a_fra + b_fra;
        rst_exp = b_exp;
        rst_sign = b_sign;
    }
    else{
        rst_exp = a_exp;
        if(a_sign ^ b_sign){
            if(a_fra > b_fra){
                rst_fra = a_fra - b_fra;
                rst_sign = a_sign;
            }
            else{
                rst_fra = b_fra - a_fra;
                rst_sign = b_sign;
            }
        }
        else{
            rst_fra = a_fra + b_fra;
            rst_sign = a_sign;
        }
    }
    //normalize
    int lz = myclz_32(rst_fra);
    if(lz <= 8){
        lz = 8 - lz;
        rst_fra >>= lz;
        rst_exp += lz; 
    }
    else{
        lz -= 8;
        rst_fra <<= lz;
        rst_exp -= lz;
    }
    rst_fra &= 0x7fffff;
    rst_sign <<= 31;
    rst_exp <<= 23;
    rst = rst_sign | rst_exp | rst_fra;

    return rst;
}

mul32

uint32_t mul_32(uint32_t a, uint32_t b){
    uint32_t a_sign = a >> 31;
    uint32_t a_exp = (uint32_t)(a << 1) >> 24;
    uint32_t a_fra = (a & 0x7fffff) | 0x800000;

    uint32_t b_sign = b >> 31;
    uint32_t b_exp = (uint32_t)(b << 1) >> 24;
    uint32_t b_fra = (b & 0x7fffff) | 0x800000;

    uint32_t sign = a_sign ^ b_sign;
    uint32_t exp = a_exp + b_exp - 127;
    uint64_t fra_64 = (uint64_t)a_fra * b_fra;
    fra_64 >>= 23;
    uint32_t fra = (uint32_t)fra_64;
    //normalize
    uint32_t lz = myclz_32(fra);
    lz = 8 - lz;
    fra >>= lz;
    exp += lz;
    fra -= 0x800000;

    sign <<= 31;
    exp <<= 23;
    uint32_t rst = sign | exp | fra;
    return rst;
}

verify correctness

add32

input1 input2 answer myanswer error rate
15.961235 43.132904 59.094139 59.094139 0.000000e+00
36.952332 17.209785 54.162117 54.162117 0.000000e+00
-22.274956 44.483910 22.208954 22.208954 0.000000e+00
-27.907221 -45.500877 -73.408096 -73.408096 0.000000e+00
-32.052887 31.473969 -0.578918 -0.578918 0.000000e+00
5.428631 45.491280 50.919910 50.919910 0.000000e+00
34.613480 -40.486797 -5.873318 -5.873318 0.000000e+00
37.345215 38.584961 75.930176 75.930176 0.000000e+00
-22.956657 -22.030470 -44.987129 -44.987125 8.479530e-08
12.832539 9.602684 22.435223 22.435223 0.000000e+00

mul32

input1 input2 answer myanswer error rate
-2.152653 -18.543297 39.917278 39.917278 0.000000e+00
29.927048 -44.895565 -1343.591675 -1343.591675 0.000000e+00
19.867577 34.633919 688.092041 688.091980 8.870202e-08
-13.439705 9.803593 -131.757385 -131.757385 0.000000e+00
49.927086 24.040535 1200.273804 1200.273804 0.000000e+00
1.543598 47.617355 73.502060 73.502060 0.000000e+00
-43.442417 -24.092621 1046.641724 1046.641602 1.166305e-07
-23.320192 -19.306118 450.222382 450.222382 0.000000e+00
44.823502 24.745850 1109.195679 1109.195557 1.100530e-07
-5.549877 11.534317 -64.014046 -64.014038 1.191831e-07

Noticed that when the hardware does not support floating-point operations, the compiler will translate floating-point computations into a software-supported form, similar to what I did above but with better performance.
Due to poor performance and the discovery that the mado project uses fixed-point arithmetic for trigonometric calculations, an alternative method is adopted.

Understand how trigonometric functions work in mado

focus on the trig.c

todo : Replace the division in twin_tan() with another method that requires fewer cycles.

Study Linux 核心原始程式碼的整數除法

Try using jemalloc instead of division operations in twin_tan().

/* find last (most-significant) bit set */
#define fls32(x) ((x) ? sizeof(int) * 8 - __builtin_clz(x) : 0)

void fastdiv_prepare(uint32_t div, uint32_t *m, uint8_t *s1, uint8_t *s2)
{
    const int l = fls32(div - 1);
    const uint64_t nextpow2 = UINT64_C(1) << l;
    uint32_t magic = ((nextpow2 - div) << 32) / div;

    *m = magic + 1;
    *s1 = (l > 1) ? 1 : l;
    *s2 = (l == 0) ? 0 : l - 1;

    if (div == 1)
        return;

    if (magic == 0) { /* div == nextpow2 */
        *s1 -= 1, *s2 += 1;
    } else {
        magic = (magic + UINT64_C(0x100000000)) / 2;
        uint32_t rem = (nextpow2 << 31) - magic * div; 
        assert(rem > 0 && rem < div);
        if (rem > div - (nextpow2 >> 1)) {
            *m = magic + 1;
            *s1 -= 1;
        }
    }
}

static uint32_t fastdiv(uint32_t val, uint64_t mul, uint8_t s1, uint8_t s2)
{   
    const uint32_t hi = (val * mul) >> 32;
    return (hi + ((val - hi) >> s1)) >> s2;
}

Need to know the value of the divisor in advance. Can try precomputing the divisor using fastdiv_prepare(), storing the resulting values of m, s1, and s2. When needed, retrieve these precomputed values from a lookup table for fast computation. The issue is that this approach requires a significant amount of storage space. Even if the divisor precision is limited to two decimal places, it would still require 100 × 48 bits, exceeding 4 kilobits of storage space.

code

table_gen()

full code
Code to generate lookup table. The original divisor had a precision of 16 bits, but due to table size considerations, only 7 bits of precision are retained.

void table_gen(){
    FILE *f = fopen("data.txt", "w");
    uint32_t m[127];
    uint8_t s1[127], s2[127];
    for(int i = 512, j = 0; i < 65536; i += 512, j++){
        fastdiv_prepare(i, &m[j], &s1[j], &s2[j]);
        fprintf(f, "%d,%d ", j, i);
    }
    fprintf(f, "\n\n");
    for(int i = 0; i < 127; i++){
        fprintf(f, "%u", m[i]);
        if(i != 126)
            fprintf(f, ", ");
    }
    fprintf(f, "\n\n");
    for(int i = 0; i < 127; i++){
        fprintf(f, "%u", s1[i]);
        if(i != 126)
            fprintf(f, ", ");
    }
    fprintf(f, "\n\n");
    for(int i = 0; i < 127; i++){
        fprintf(f, "%u", s2[i]);
        if(i != 126)
            fprintf(f, ", ");
    }
    fclose(f);
}

table be generated by table_gen()

static const uint32_t m[] = {1, 1, 2863311531, 1, 3435973837, 2863311531, 613566757, 1, 3817748708, 3435973837, 3123612579, 2863311531, 
    2643056798, 613566757, 2290649225, 1, 4042322161, 3817748708, 2938661835, 3435973837, 2249744775, 3123612579, 2987803337, 2863311531, 
    2748779070, 2643056798, 795364315, 613566757, 2369637129, 2290649225, 138547333, 1, 4164816772, 4042322161, 3558687189, 3817748708, 
    3134165325, 2938661835, 2753184165, 3435973837, 3352169597, 2249744775, 3196254732, 3123612579, 1813430637, 2987803337, 2924233053, 
    2863311531, 2804876602, 2748779070, 2694881441, 2643056798, 891408307, 795364315, 702812831, 613566757, 527452125, 2369637129, 2329473788, 
    2290649225, 2253097598, 138547333, 68174085, 1, 4228890877, 4164816772, 4102655328, 4042322161, 3983737782, 3558687189, 3871519817, 
    3817748708, 3235934265, 3134165325, 3665038760, 2938661835, 3569842948, 2753184165, 3479467177, 3435973837, 3393554407, 3352169597, 
    3311782012, 2249744775, 3233857729, 3196254732, 3159516172, 3123612579, 3088515809, 1813430637, 1746305385, 2987803337, 2955676419, 
    2924233053, 1491936009, 2863311531, 1372618415, 2804876602, 2776544515, 2748779070, 1148159575, 2694881441, 1042467791, 2643056798, 
    940802361, 891408307, 842937507, 795364315, 748664025, 702812831, 657787785, 613566757, 570128403, 527452125, 485518043, 2369637129, 
    403800345, 2329473788, 2309898378, 2290649225, 248469183, 2253097598, 174592167, 138547333, 2199023256, 68174085, 33818641};

static const uint8_t s1[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 
    1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
     1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 
     1, 0, 1, 1, 0, 1, 1};

static const uint8_t s2[] = {9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
    13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};

modified twin_tan()

use lookup table to apply jemalloc's fast division approach in the twin_tan() function.

twin_fixed_t twin_tan_my(twin_angle_t a)
{
    twin_fixed_t s, c;
    twin_sincos(a, &s, &c);
    if (c == 0) {
        if (s > 0)
            return TWIN_FIXED_MAX;
        return TWIN_FIXED_MIN;
    }
    if (s == 0)
        return 0;
    int sign = s >> 31 ^ c >> 31;
    if(c < 0)
        c = -c;
    if(s < 0)
        s = -s;
    int c1 = (c >> 9) + ((c & 256) >> 8);
    c = c >> 9;

    if(c == 127)
        return sign ? -s : s;
    else{
        twin_fixed_t res = fastdiv(s << 15, m[c], s1[c], s2[c]) << 1;
        return sign ? -res : res;
    }
}

twin_tan() without lookup table

Directly using jemalloc fast division to replace the division in twin_tan()

twin_fixed_t twin_tan_fast(twin_angle_t a)
{
    twin_fixed_t s, c;
    twin_sincos(a, &s, &c);

    if (c == 0) {
        if (s > 0)
            return TWIN_FIXED_MAX;
        return TWIN_FIXED_MIN;
    }
    if (s == 0)
        return 0;
    int sign = s >> 31 ^ c >> 31 ? -1 : 1;
    if(c < 0)
        c = -c;
    if(s < 0)
        s = -s;
    uint32_t mul;
    uint8_t s1, s2;

    fastdiv_prepare(c, &mul, &s1, &s2); 
    return sign * fastdiv(s << 15, mul, s1, s2) << 1;
}

Accuracy

code used to test accuracy
full code

int main(){
    FILE *f = fopen("test_fastdiv.txt", "w");
    fprintf(f, "| angle | answer | myanswer | fast_div | error rate | fast_div error rate |\n");
    fprintf(f, "| ----- | ------ | -------- | -------- | ---------- | ------------------- |\n");
    int16_t test[20] = {14659, 41967, 21825, 41634, 20873, 36471, 41746, 30660, 60680, 17029, 28151, 22910, 40912, 64147, 10802, 2959, 34279, 21628, 12564, 3480};
    for(int i = 0; i < 20; i++){
        float error_rate, error_rate_fast;
        twin_fixed_t my = twin_tan_my(test[i]);
        twin_fixed_t golden = twin_tan(test[i]);
        twin_fixed_t fast = twin_tan_fast(test[i]);
        int d = abs(my - golden), dfast = abs(fast - golden);
        error_rate = (float)d / abs(golden);
        error_rate_fast = (float)dfast / abs(golden);
        fprintf(f, "| %d | %d | %d | %d | %f | %f |\n", test[i], golden, my, fast, error_rate, error_rate_fast);
    }
    fclose(f);
}
angle answer myanswer fast_div answer error rate fast_div error rate
14659 35398 35282 35398 0.003277 0.000000
-23569 2513772 2096448 2513772 0.166015 0.000000
21825 -122226 -231048 -122226 0.890334 0.000000
-23902 110156 112652 112652 0.022659 0.022659
20873 45094 44860 74302 0.005189 0.647714
-29065 -45094 -44860 -74302 0.005189 0.647714
-23790 171578 244916 171578 0.427432 0.000000
30660 -6044 -6019 -12038 0.004136 0.991727
-4856 -152984 -240992 -152984 0.575276 0.000000
17029 99770 109556 109556 0.098086 0.098086
28151 -67372 -66810 -93954 0.008342 0.394556
22910 43480 43344 72468 0.003128 0.666697
-24624 -4830 -4818 -9636 0.002484 0.995031
-1389 104600 111078 111078 0.061931 0.061931
10802 76458 75822 76458 0.008318 0.000000
2959 374556 516480 374556 0.378913 0.000000
-31257 -70772 -96172 -70772 0.358899 0.000000
21628 -340624 -514888 -340624 0.511602 0.000000
12564 29512 29442 53826 0.002372 0.823868
3480 -90698 -106244 -106244 0.171404 0.171404

The accuracy fluctuates significantly. Even when directly using jemalloc fast division to replace the division, there are still some results with excessively large accuracy errors.
Todo : Identify the cause of low accuracy.

Performance

Use Ripes to test the cycle count.

twin_tan()

directly using the division
image

twin_tan_my()

using jemalloc fast division with lookup table
image

twin_tan_fast()

using jemalloc fast division without lookup table
image

The version that uses a lookup table requires significantly fewer cycles compared to the version without the lookup table, but it still takes more cycles than directly performing division.

Study mado: 三角函數