# Parallel Programming Homework 1
## Part 1
### 1
```C
void clampedExpVector(float *values, int *exponents, float *output, int N)
{
  __pp_vec_float val;
  __pp_vec_int ex;
  __pp_vec_float result;
  __pp_vec_float clamp = _pp_vset_float(9.999999f);
  __pp_vec_int zero = _pp_vset_int(0);
  __pp_vec_int ones = _pp_vset_int(1);
  __pp_mask m_data;
  int i = 0;
  float *data = values;
  int *expn = exponents;
  float *out = output;
  do
  {
    // Handle a possibly partial batch at the tail of the array.
    int t = (N - i >= VECTOR_WIDTH) ? VECTOR_WIDTH : (N - i);
    m_data = _pp_init_ones(t);
    _pp_vload_float(val, data, m_data);
    _pp_vload_int(ex, expn, m_data);
    __pp_mask m_active = m_data;
    result = _pp_vset_float(1.f);
    while (true)
    {
      __pp_mask m_clamp, m_zero, m_t1;
      // Clamp lanes whose running product exceeded the bound.
      _pp_vgt_float(m_clamp, result, clamp, m_active);
      _pp_vset_float(result, 9.999999f, m_clamp);
      // Lanes whose exponent has reached zero are finished.
      _pp_veq_int(m_zero, ex, zero, m_active);
      // Deactivate clamped and finished lanes.
      m_t1 = _pp_mask_or(m_zero, m_clamp);
      m_t1 = _pp_mask_not(m_t1);
      m_active = _pp_mask_and(m_active, m_t1);
      if (_pp_cntbits(m_active) == 0)
        break;
      // Multiply once more and decrement the remaining exponents.
      _pp_vmult_float(result, result, val, m_active);
      _pp_vsub_int(ex, ex, ones, m_active);
    }
    _pp_vstore_float(out, result, m_data);
    i += VECTOR_WIDTH;
    data += VECTOR_WIDTH;
    expn += VECTOR_WIDTH;
    out += VECTOR_WIDTH;
  } while (i < N);
}
```
### 2: Q1-1
We can see a decrease in utilization as _VECTOR_WIDTH_ grows.

| VECTOR_WIDTH | Utilization (%) |
|:------------:|:---------------:|
| 2            | 80.3            |
| 4            | 74.7            |
| 8            | 71.4            |
| 16           | 69.6            |

The decrease in utilization is expected: for each batch of data processed by the vector instructions, the number of iterations needed to finish the batch is set by the largest exponent in the batch, and lanes that finish early sit idle until it completes. Since each exponent is an independent, uniformly distributed random variable, the expected maximum grows with the batch size, so the expected fraction of active lanes decreases as _VECTOR_WIDTH_ increases.
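To make the trend concrete, the following is a minimal Monte Carlo sketch of this model. It assumes the exponents are drawn uniformly from [0, 10) (the bound and trial count are illustrative, not taken from the assignment), and it approximates utilization as useful lane-iterations over total lane-iterations per batch; clamping and the simulator's per-instruction mask accounting are ignored, so it reproduces the trend rather than the exact numbers above.
```C
#include <stdio.h>
#include <stdlib.h>

/* A batch of W lanes runs for max(exponents) iterations, but only
 * sum(exponents) lane-iterations do useful work. */
static double estimate_utilization(int W, int trials)
{
  double total = 0.0;
  for (int t = 0; t < trials; t++)
  {
    int sum = 0, max = 0;
    for (int l = 0; l < W; l++)
    {
      int e = rand() % 10; /* assumed: exponents uniform in [0, 10) */
      sum += e;
      if (e > max)
        max = e;
    }
    /* An all-zero batch finishes immediately: count it as fully utilized. */
    total += (max > 0) ? (double)sum / ((double)max * W) : 1.0;
  }
  return total / trials;
}

int main(void)
{
  for (int W = 2; W <= 16; W *= 2)
    printf("W = %2d: ~%.1f%%\n", W, 100.0 * estimate_utilization(W, 1000000));
  return 0;
}
```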
### 3
```C
float arraySumVector(float *values, int N)
{
  // Scratch buffer holding the input plus every level of partial sums.
  // The extra zeroed slots pad the tail so the final full-width loads
  // and half-width stores stay in bounds.
  float t[2 * N + 2 * VECTOR_WIDTH];
  for (int j = 0; j < 2 * N + 2 * VECTOR_WIDTH; j++)
    t[j] = 0.f;
  float *p_end, *p_r, *p_w;
  p_end = t + 2 * N + VECTOR_WIDTH;
  p_r = t;
  p_w = t + N;
  __pp_vec_float tmp, res;
  __pp_mask m_all, m_half;
  m_all = _pp_init_ones();
  m_half = _pp_init_ones(VECTOR_WIDTH >> 1);
  // Copy the input into the scratch buffer.
  for (int i = 0; i < N; i += VECTOR_WIDTH)
  {
    _pp_vload_float(tmp, values + i, m_all);
    _pp_vstore_float(t + i, tmp, m_all);
  }
  do
  {
    // Reduce VECTOR_WIDTH elements to VECTOR_WIDTH / 2 pairwise sums
    // and append them at the write cursor.
    _pp_vload_float(res, p_r, m_all);
    _pp_hadd_float(tmp, res);
    _pp_interleave_float(res, tmp);
    _pp_vstore_float(p_w, res, m_half);
    // The read cursor must never pass the write cursor.
    p_r = (p_r + VECTOR_WIDTH < p_w) ? (p_r + VECTOR_WIDTH) : p_w;
    p_w += (VECTOR_WIDTH >> 1);
  } while (p_r != p_end);
  // res now carries the grand total in its first lane.
  _pp_vstore_float(t, res, m_all);
  return t[0];
}
```
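To illustrate the buffer traffic, here is a scalar emulation of the same two-cursor reduction for the specific case W = VECTOR_WIDTH = 4 and N = 8 (the helper and parameters are mine, not part of the assignment): each step mimics `hadd` + `interleave` + the masked half-store by summing W consecutive slots pairwise and appending the W/2 partial sums.
```C
#include <stdio.h>

int main(void)
{
  enum { W = 4, N = 8 };
  float t[2 * N + 2 * W] = {1, 2, 3, 4, 5, 6, 7, 8}; /* rest zeroed */
  int r = 0, w = N, end = 2 * N + W;
  while (r != end)
  {
    /* The vector load happens before the store, so stage a copy first. */
    float buf[W];
    for (int k = 0; k < W; k++)
      buf[k] = t[r + k];
    /* hadd + interleave + half store: append W/2 pairwise sums. */
    for (int k = 0; k < W / 2; k++)
      t[w + k] = buf[2 * k] + buf[2 * k + 1];
    printf("appended t[%d..%d] = [%g, %g]\n", w, w + W / 2 - 1, t[w], t[w + 1]);
    r = (r + W < w) ? (r + W) : w; /* read cursor never passes write cursor */
    w += W / 2;
  }
  printf("total = %g\n", t[end]); /* 36 = 1 + 2 + ... + 8 */
  return 0;
}
```
The appended chunks are [3, 7], [11, 15], [10, 26], [36, 0], after which the 36 is carried forward through the zero padding until the read cursor reaches `p_end`.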
## Part 2
### 1: Q2-1
```C
void test1(float *__restrict a, float *__restrict b, float *__restrict c, int N)
{
  __builtin_assume(N == 1024);
  a = (float *)__builtin_assume_aligned(a, 32);
  b = (float *)__builtin_assume_aligned(b, 32);
  c = (float *)__builtin_assume_aligned(c, 32);

  fasttime_t time1 = gettime();
  for (int i = 0; i < I; i++)
  {
    for (int j = 0; j < N; j++)
    {
      c[j] = a[j] + b[j];
    }
  }
  fasttime_t time2 = gettime();

  double elapsedf = tdiff(time1, time2);
  std::cout << "Elapsed execution time of the loop in test1():\n"
            << elapsedf << "sec (N: " << N << ", I: " << I << ")\n";
}
```
The code can be fixed by changing the assumed alignment to 32 bytes: an AVX2 register is 256 bits wide and holds eight 4-byte floats, so the aligned vector loads and stores it generates require the arrays to be 32-byte aligned.
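For the alignment assumption to be sound, the arrays passed in must actually be 32-byte aligned. A minimal sketch of how a caller could guarantee that, assuming the arrays are heap-allocated (the allocation site is not shown in the excerpt above), using C11's `aligned_alloc`:
```C
#include <stdlib.h>

int main(void)
{
  /* aligned_alloc requires the size to be a multiple of the alignment:
   * 1024 floats * 4 bytes = 4096 bytes, a multiple of 32. */
  float *a = (float *)aligned_alloc(32, 1024 * sizeof(float));
  float *b = (float *)aligned_alloc(32, 1024 * sizeof(float));
  float *c = (float *)aligned_alloc(32, 1024 * sizeof(float));
  if (!a || !b || !c)
    return 1;
  /* ... initialize a and b, then call test1(a, b, c, 1024) ... */
  free(a);
  free(b);
  free(c);
  return 0;
}
```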
### 2: Q2-2
The performance of _test1_ is as follows (times in seconds):

|         | NoVec   | Vec     | Vec+AVX2 |
|---------|---------|---------|----------|
| rep1    | 8.42606 | 2.82125 | 1.54542  |
| rep2    | 8.89745 | 2.79625 | 1.57578  |
| rep3    | 8.71283 | 2.79518 | 1.56529  |
| rep4    | 8.426   | 2.94945 | 1.61671  |
| rep5    | 8.51389 | 2.94363 | 1.57235  |
| median  | 8.51389 | 2.82125 | 1.57235  |
| speedup | 100.00% | 301.78% | 541.48%  |

From the speedup we can infer that the vector instructions process at least four elements at a time, whereas _AVX2_ processes at least eight. Since each element is a four-byte floating-point number, the vector instruction register should be 128 bits wide, and the _AVX2_ register should be 256 bits wide.
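As a sanity check on the 256-bit figure, here is a sketch using the AVX intrinsics directly (illustrative only; the assignment relies on the compiler's auto-vectorizer rather than hand-written intrinsics). A single `__m256` register holds exactly eight floats, so the loop covers eight elements per addition:
```C
#include <immintrin.h>

/* Assumes a, b, c are 32-byte aligned and N is a multiple of 8. */
void add_avx(const float *a, const float *b, float *c, int N)
{
  for (int j = 0; j < N; j += 8)
  {
    __m256 va = _mm256_load_ps(a + j); /* 256 bits = 8 floats per load */
    __m256 vb = _mm256_load_ps(b + j);
    _mm256_store_ps(c + j, _mm256_add_ps(va, vb));
  }
}
```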
### 3: Q2-3
The original _test2_ assigns _c[j]_ multiple times within a single loop iteration. This complicates the data dependences, so the compiler cannot prove that the iterations are independent and declines to vectorize the loop. Rewriting the body so that _c[j]_ is assigned exactly once removes the obstacle, as in the sketch below.
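A sketch of the transformation being described; the exact body of _test2_ is not reproduced here, so the before/after below is an assumed reconstruction of a loop that writes _c[j]_ twice per iteration:
```C
/* Assumed shape of the original body: c[j] is written twice per
 * iteration, which obscures the dependence analysis. */
void test2_original(float *a, float *b, float *c, int N)
{
  for (int j = 0; j < N; j++)
  {
    c[j] = a[j];
    if (b[j] > a[j])
      c[j] = b[j];
  }
}

/* Fixed: c[j] is assigned exactly once and the branch becomes a
 * select, which the auto-vectorizer can turn into a vector max. */
void test2_fixed(float *a, float *b, float *c, int N)
{
  for (int j = 0; j < N; j++)
    c[j] = (b[j] > a[j]) ? b[j] : a[j];
}
```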