###### tags `Parallel Programming` `Computer Science`
# Parallel Programming HW2 楊淨 109062530
## Implementation
For this assignment I first implemented the pthread version (HW2A), and then extended that version to the MPI + OpenMP model (HW2B).


HW2A:
1. Start by looking at the output: pixels with large iteration counts ("energy") tend to sit next to each other, so neighbouring pixels should be assigned to different threads. I therefore interleave the pixels across threads according to the thread count, so that each thread computes every MAX_THREADS-th pixel:
    `for(int index = tid; index < maxidx; index+=MAX_THREADS)`
2. After implementing this, my time on the scoreboard was still about twice that of the leading submissions. After going over the program I saw little left to trim, so SIMD instructions were the remaining option to try.
3. With SIMD, I use __m128d so that each thread computes 2 adjacent pixels at a time:
    `for(int index = tid*2; index < maxidx; index+=MAX_THREADS*2)`
4. This sped up the computation considerably: handling 2 double-precision values per instruction gives a speedup of nearly 2x. (A simplified scalar version of this kernel is sketched below for reference.)
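
For reference, here is a minimal scalar sketch of the interleaved kernel from steps 1-2, before the SIMD change. It reuses the globals of the full listing further down (my_width, my_height, my_iters, MAX_THREADS, my_image, y0_base, x0_base, lower, left); the name iterCal_scalar is only for illustration.

```cpp
// Scalar sketch of the interleaved per-pixel kernel (pre-SIMD).
// Pixels index, index + MAX_THREADS, index + 2*MAX_THREADS, ... go to thread tid,
// so neighbouring (similarly expensive) pixels land on different threads.
void* iterCal_scalar(void* threadid) {
    int tid = *(int*)threadid;
    int maxidx = my_height * my_width;
    for (int index = tid; index < maxidx; index += MAX_THREADS) {
        // Row-major pixel index -> complex-plane coordinates.
        double y0 = index / my_width * y0_base + lower;
        double x0 = index % my_width * x0_base + left;
        double x = 0, y = 0, length_squared = 0;
        int repeats = 0;
        while (repeats < my_iters && length_squared < 4) {
            double temp = x * x - y * y + x0;
            y = 2 * x * y + y0;
            x = temp;
            length_squared = x * x + y * y;
            ++repeats;
        }
        my_image[index] = repeats;  // iteration count ("energy") of this pixel
    }
    pthread_exit(NULL);
}
```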
HW2B:
1. I first converted the previously fine-grained, interleaved pthread version directly into an OpenMP version.
2. Then I split the image into large contiguous blocks, one block per node.

3. The result was dreadful: the load balance across nodes was very poor and the runtime was extremely long.

4. My next idea was to implement a master-slave scheme for handing out work, but I judged that both the implementation effort and the communication overhead would be hard to manage.
5. So, going back to the original observation that neighbouring points have similarly large or small iteration counts, I changed each node's partition to the pattern below, with rows dealt round-robin across ranks.

6. Finally, I use MPI_Reduce to merge the per-node results (see the sketch after this list).
7. The outcome is reasonable: the judge time dropped from over 1000 seconds to about 300 seconds.
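
A minimal sketch of the row distribution and merge from steps 5-6; the per-pixel work is hidden behind a hypothetical helper pixel_iters, and the buffer names are only illustrative:

```cpp
// Sketch only: rank r computes rows r, r+size, r+2*size, ...; every other entry
// of its local buffer stays 0. Every computed pixel stores at least 1 iteration,
// so reducing with MPI_MAX reassembles the complete image on rank 0.
int* local_image = (int*)calloc((size_t)width * height, sizeof(int));
int* final_image = (int*)calloc((size_t)width * height, sizeof(int));
for (int row = rank; row < height; row += size) {
    for (int col = 0; col < width; ++col)
        local_image[row * width + col] = pixel_iters(row, col);  // hypothetical helper
}
MPI_Reduce(local_image, final_image, width * height, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
if (rank == 0)
    write_png(filename, iters, width, height, final_image);
```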
The HW2A implementation is as follows:
```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <gmp.h>
#include <emmintrin.h>
int MAX_THREADS;
const double two= 2;
__m128d _2 = _mm_load1_pd(&two);
int my_j,my_i;
int my_iters ;
int my_width ;
int my_height;
double left ;
double right ;
double lower;
double upper;
/* allocate memory for image */
int* my_image ;
double y0_base;
double x0_base;
double *global_treadtime;
void* iterCal(void* threadid){
double length_squared[2]={0,0};
__m128d tmp_x, tmp_y,_length_squared ;
__m128d x_square,y_square ;
__m128d _x,_y;
__m128d _y0,_x0;
int a,b;
int tid = *(int*)threadid;
int maxidx=my_height*my_width;
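/* Interleave pixel pairs across threads: thread tid handles indices tid*2,
   tid*2 + 2*MAX_THREADS, ..., computing two adjacent pixels per SSE2 vector. */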
for (int index = tid*2; index < maxidx; index+=MAX_THREADS*2) {
if (index!=maxidx-1){
a=index;
b=index+1;
double y0[2],x0[2] ;
y0[0]=a/my_width * y0_base + lower;
y0[1]=b/my_width * y0_base + lower;
x0[0] = (a%my_width) * x0_base + left;
x0[1] = (b%my_width) * x0_base + left;
_y0=_mm_load_pd(y0);
_x0=_mm_load_pd(x0);
int repeats[2];
repeats[0]=0;
repeats[1]=0;
x_square = y_square =tmp_y=_y=_x=tmp_x =_length_squared = _mm_setzero_pd();
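/* Advance both lanes in lockstep; leave the vector loop as soon as either
   lane escapes (|z|^2 >= 4) or the shared iteration cap is reached. */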
while (1) {
tmp_x=_x;
tmp_y=_y;
x_square=_mm_mul_pd(tmp_x, tmp_x);
y_square=_mm_mul_pd(tmp_y, tmp_y);
_length_squared =_mm_add_pd(x_square,y_square);
_mm_store_pd(length_squared, _length_squared);
if (length_squared[0] >= 4 || length_squared[1] >= 4){
break;
}
_x = _mm_add_pd(_mm_sub_pd(x_square, y_square), _x0);
_y = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(tmp_x, tmp_y), _2), _y0);
repeats[0]++;
repeats[1]++;
if (repeats[0] >= my_iters || repeats[1] >= my_iters)
break;
}
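/* At most one lane can still need more iterations here; finish it in scalar. */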
if(length_squared[0]<4 && repeats[0]<my_iters){
double y0 = a/my_width * y0_base + lower;
double x0 = (a%my_width) * x0_base + left;
double x = 0;
double y = 0;
_mm_storel_pd(&x,_x);
_mm_storel_pd(&y,_y);
while (repeats[0] < my_iters && length_squared[0] < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared[0] = x * x + y * y;
++repeats[0];
}
}
else if(length_squared[1]<4 && repeats[1]<my_iters){
double y0 = b/my_width * y0_base + lower;
double x0 = (b%my_width)* x0_base + left;
double x = 0;
double y = 0;
_mm_storeh_pd(&x,_x);
_mm_storeh_pd(&y,_y);
while (repeats[1] < my_iters && length_squared[1] < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared[1] = x * x + y * y;
++repeats[1];
}
}
my_image[a]=repeats[0];
my_image[b]=repeats[1];
}
else{
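/* Odd total pixel count: the last pixel has no partner, compute it in scalar. */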
double y0 = index/my_width * y0_base + lower;
double x0 = (index%my_width)* x0_base + left;
double x = 0;
double y = 0;
int repeats_L = 0;
double length_squared_L = 0;
while (repeats_L < my_iters && length_squared_L < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared_L = x * x + y * y;
++repeats_L;
}
my_image[index]=repeats_L;
}
}
pthread_exit(NULL);
}
void write_png(const char* filename, int iters, int width, int height, const int* buffer) {
FILE* fp = fopen(filename, "wb");
assert(fp);
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
assert(png_ptr);
png_infop info_ptr = png_create_info_struct(png_ptr);
assert(info_ptr);
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
png_write_info(png_ptr, info_ptr);
png_set_compression_level(png_ptr, 1);
size_t row_size = 3 * width * sizeof(png_byte);
png_bytep row = (png_bytep)malloc(row_size);
for (int y = 0; y < height; ++y) {
memset(row, 0, row_size);
for (int x = 0; x < width; ++x) {
int p = buffer[(height - 1 - y) * width + x];
png_bytep color = row + x * 3;
if (p != iters) {
if (p & 16) {
color[0] = 240;
color[1] = color[2] = p % 16 * 16;
} else {
color[0] = p % 16 * 16;
}
}
}
png_write_row(png_ptr, row);
}
free(row);
png_write_end(png_ptr, NULL);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
int main(int argc, char** argv) {
cpu_set_t cpu_set;
sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
MAX_THREADS=500;
//printf("%d cpus available\n", CPU_COUNT(&cpu_set));
pthread_t threads[MAX_THREADS];
long threadnum=0;
int rc;
int ID[MAX_THREADS];
void *ret; // child-thread return value
/* argument parsing */
assert(argc == 9);
const char* filename = argv[1];
my_iters = strtol(argv[2], 0, 10);
left = strtod(argv[3], 0);
right = strtod(argv[4], 0);
lower = strtod(argv[5], 0);
upper = strtod(argv[6], 0);
my_width = strtol(argv[7], 0, 10);
my_height = strtol(argv[8], 0, 10);
my_image = (int*)malloc(my_width * my_height * sizeof(int));
y0_base= ((upper - lower) / my_height);
x0_base= ((right - left) / my_width);
for (int t = 0; t < MAX_THREADS; t++) {
ID[t] = t;
rc = pthread_create(&threads[t], NULL, iterCal, (void*)&ID[t]);
}
/* mandelbrot set */
for (int t = 0; t < MAX_THREADS; t++) {
pthread_join(threads[t], &ret);
}
/* draw and cleanup */
write_png(filename, my_iters, my_width, my_height, my_image);
free(my_image);
}
```
The HW2B implementation is as follows:
```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <gmp.h>
#include <emmintrin.h>
#include <omp.h>
#include <mpi.h>
#include <iostream>
#include <string>
int MAX_THREADS;
const double two= 2;
__m128d _2 = _mm_load1_pd(&two);
int my_j,my_i;
int my_iters ;
int my_width ;
int my_height;
double left ;
double right ;
double lower;
double upper;
/* allocate memory for image */
int* my_image ;
int* global_image;
double y0_base;
double x0_base;
double *global_treadtime;
int spiltsize;
int rank, size, omp_threads, omp_thread;
int node_statue;
void write_png(const char* filename, int iters, int width, int height, const int* buffer) {
FILE* fp = fopen(filename, "wb");
assert(fp);
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
assert(png_ptr);
png_infop info_ptr = png_create_info_struct(png_ptr);
assert(info_ptr);
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
png_write_info(png_ptr, info_ptr);
png_set_compression_level(png_ptr, 1);
size_t row_size = 3 * width * sizeof(png_byte);
png_bytep row = (png_bytep)malloc(row_size);
for (int y = 0; y < height; ++y) {
memset(row, 0, row_size);
for (int x = 0; x < width; ++x) {
int p = buffer[(height - 1 - y) * width + x];
png_bytep color = row + x * 3;
if (p != iters) {
if (p & 16) {
color[0] = 240;
color[1] = color[2] = p % 16 * 16;
} else {
color[0] = p % 16 * 16;
}
}
}
png_write_row(png_ptr, row);
}
free(row);
png_write_end(png_ptr, NULL);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
int main(int argc, char** argv) {
double CpuTime=0,MPIIO=0,MPICOMM=0;
double starttime,endtime,comm_start,comm_end;
cpu_set_t cpu_set;
sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
MAX_THREADS=CPU_COUNT(&cpu_set);
MPI_Init(&argc, &argv);
comm_start = MPI_Wtime();
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
long threadnum=0;
int rc;
int ID[MAX_THREADS];
pthread_t thread;
void *ret; // child-thread return value
/* argument parsing */
assert(argc == 9);
const char* filename = argv[1];
my_iters = strtol(argv[2], 0, 10);
left = strtod(argv[3], 0);
right = strtod(argv[4], 0);
lower = strtod(argv[5], 0);
upper = strtod(argv[6], 0);
my_width = strtol(argv[7], 0, 10);
my_height = strtol(argv[8], 0, 10);
my_image = (int*)malloc(my_width * my_height * sizeof(int));
memset(my_image,0,my_width * my_height* sizeof(int));
global_image = (int*)malloc(my_width * my_height * sizeof(int));
memset(global_image,0,my_width * my_height* sizeof(int));
y0_base= ((upper - lower) / my_height);
x0_base= ((right - left) / my_width);
int totalDataLen=my_width * my_height;
//分割資料
int i=0;
//unsigned long long pixels = 0;
#pragma omp parallel
{
omp_threads = omp_get_num_threads();
}
spiltsize=my_height;
#pragma omp parallel num_threads(MAX_THREADS)
{
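/* Rows are dealt round-robin to MPI ranks (this rank takes rows rank, rank+size, ...);
   within each row, OpenMP threads interleave SSE2 pixel pairs as in HW2A. */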
for(int i=rank;i<spiltsize; i+=size){
int localstart_end_n[3];
localstart_end_n[0]=i*my_width;
localstart_end_n[1]=(i+1)*my_width-1;
localstart_end_n[2]=my_width;
int tid = omp_get_thread_num( );
double length_squared[2]={0,0};
__m128d tmp_x, tmp_y,_length_squared ;
__m128d x_square,y_square ;
__m128d _x,_y;
__m128d _y0,_x0;
int a,b;
int maxidx=my_height*my_width;
int index=0;
for (index = tid*2+localstart_end_n[0]; index < localstart_end_n[1]; index+=MAX_THREADS*2) {
a=index;
b=index+1;
double y0[2],x0[2] ;
y0[0]=a/my_width * y0_base + lower;
y0[1]=b/my_width * y0_base + lower;
x0[0] = (a%my_width) * x0_base + left;
x0[1] = (b%my_width) * x0_base + left;
_y0=_mm_load_pd(y0);
_x0=_mm_load_pd(x0);
int repeats[2];
repeats[0]=0;
repeats[1]=0;
x_square = y_square =tmp_y=_y=_x=tmp_x =_length_squared = _mm_setzero_pd();
while (1) {
tmp_x=_x;
tmp_y=_y;
x_square=_mm_mul_pd(tmp_x, tmp_x);
y_square=_mm_mul_pd(tmp_y, tmp_y);
_length_squared =_mm_add_pd(x_square,y_square);
_mm_store_pd(length_squared, _length_squared);
if (length_squared[0] >= 4 || length_squared[1] >= 4){
break;
}
_x = _mm_add_pd(_mm_sub_pd(x_square, y_square), _x0);
_y = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(tmp_x, tmp_y), _2), _y0);
repeats[0]++;
repeats[1]++;
if (repeats[0] >= my_iters || repeats[1] >= my_iters)
break;
}
if(length_squared[0]<4 && repeats[0]<my_iters){
double y0 = a/my_width * y0_base + lower;
double x0 = (a%my_width) * x0_base + left;
double x = 0;
double y = 0;
_mm_storel_pd(&x,_x);
_mm_storel_pd(&y,_y);
while (repeats[0] < my_iters && length_squared[0] < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared[0] = x * x + y * y;
++repeats[0];
}
}
else if(length_squared[1]<4 && repeats[1]<my_iters){
double y0 = b/my_width * y0_base + lower;
double x0 = (b%my_width)* x0_base + left;
double x = 0;
double y = 0;
_mm_storeh_pd(&x,_x);
_mm_storeh_pd(&y,_y);
while (repeats[1] < my_iters && length_squared[1] < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared[1] = x * x + y * y;
++repeats[1];
}
}
my_image[a]=repeats[0];
my_image[b]=repeats[1];
}
if(index==localstart_end_n[1]) {
double y0 = index/my_width * y0_base + lower;
double x0 = (index%my_width)* x0_base + left;
double x = 0;
double y = 0;
int repeats_L=0;
double length_squared_L=0;
while (repeats_L < my_iters && length_squared_L < 4) {
double temp = x * x - y * y + x0;
y = 2 * x * y + y0;
x = temp;
length_squared_L = x * x + y * y;
++repeats_L;
}
my_image[index]=repeats_L;
}
}
}
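/* Each rank wrote only its own rows; the rest of my_image is still 0, so a
   max-reduction reassembles the complete image on rank 0 (every computed
   pixel stores at least 1 iteration). */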
MPI_Reduce((void *)my_image,(void *)global_image,totalDataLen,MPI_INT,MPI_MAX,0,MPI_COMM_WORLD);
if(rank==0){
write_png(filename, my_iters, my_width, my_height, global_image);
}
MPI_Finalize();
free(my_image);
free(global_image);
}
```
---
## Experiment & Analysis
1. Methodology
* System Spec:
For this part I used a machine in our lab.

* Performance Metrics:
Time measurement:
For HW2A I measure with clock_gettime(CLOCK_MONOTONIC, &end).
For HW2B I measure with MPI_Wtime().
Every rank is timed, and I report the slowest processor.
The time is broken down into the following categories:
1. CPU Time: total CPU computation time.
2. MPI IO: total time spent on I/O.
3. MPI COMM: time spent on MPI data exchange.
The test data is the judge's strict23 case:
`srun -n4 -c8 hw2b out.png 10000 0.3182631 0.3182632 -0.4128295 -0.4128296 2575 3019`
Its workload is large enough to expose the differences between configurations.
Each configuration is run 5 times; I drop the highest and lowest runs and average the rest.
The final data point uses a larger jump in resources as an extra comparison.
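
The HW2B listing above only declares the timing variables (CpuTime, MPIIO, MPICOMM); as a minimal sketch (with my own variable names, not the exact harness), the slowest-rank numbers can be collected with MPI_Wtime like this:

```cpp
// Sketch: time each phase with MPI_Wtime() and keep the slowest rank's numbers.
double t0 = MPI_Wtime();
/* ... per-rank Mandelbrot computation ... */
double cpu_time = MPI_Wtime() - t0;

t0 = MPI_Wtime();
MPI_Reduce(my_image, global_image, totalDataLen, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
double comm_time = MPI_Wtime() - t0;

t0 = MPI_Wtime();
if (rank == 0) write_png(filename, my_iters, my_width, my_height, global_image);
double io_time = MPI_Wtime() - t0;

// The reported number for each category is the maximum over all ranks,
// i.e. the slowest processor.
double local_t[3] = {cpu_time, comm_time, io_time}, slowest[3];
MPI_Reduce(local_t, slowest, 3, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (rank == 0)
    printf("CPU %.3f  COMM %.3f  IO %.3f (seconds)\n", slowest[0], slowest[1], slowest[2]);
```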


2. Plots: Speedup Factor & Time Profile
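The speedup factor in the plots below is the usual ratio of the baseline time to the time with $n$ workers (baseline: 1 CPU for HW2A, 1 node for HW2B):

$$\mathrm{speedup}(n) = \frac{T_{1}}{T_{n}}$$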
#### HW2A: runtime with different numbers of CPUs

#### HW2A: speedup factor

#### HW2B: multiple nodes, 8 CPUs per node

#### HW2B: speedup factor

3. Discussion (based on the results in the plots)
For HW2A, as the number of processors on a single node grows, the CPU time keeps decreasing.
I am not sure whether measurement noise is the reason 12 CPUs end up more than 12x faster than 1 CPU,
or whether the interleaved partitioning happens to break up some extreme stretches of consecutive values (for example boundaries where the count jumps between 1 and 10000),
so that threads spend less time waiting and the speedup exceeds the ideal value.
In any case, I consider the scaling of this pthread + SIMD version to be quite good; the speedup is not dragged down by excessive overhead.
For HW2B, the first thing to note is that with a single CPU the speed is almost identical to the HW2A version, which shows that my rewrite does not add much overhead up front.
When nodes are added, the speedup still holds at roughly a 1.7x improvement all the way to the fourth node.
I consider this acceptable as well, but the MPI IO time clearly grows,
because the MPI_Reduce I use needs more time to merge all of the results.
4. Experiences / Conclusion
This assignment made me much more familiar with Pthreads and showed me how important load balancing is. Using the SIMD instruction set was also a fairly rare experience; I had touched it as an undergraduate, but never for anything as involved as this.
I also learned that for this kind of computation, deciding which pixels each worker computes is a craft of its own; how the data is scattered across workers is a factor that has to be considered.