###### tags: `Parallel Programming` `Computer Science`

# Parallel Programming HW2 楊淨 109062530

## Implementation

For this assignment I first implemented the pthread version (HW2A), and then extended that version into the MPI + OpenMP version (HW2B).

![](https://i.imgur.com/pnk2rnB.png)
![](https://i.imgur.com/rVmB1qZ.png)

HW2A:
1. Observing the output first: pixels with large iteration counts tend to be clustered together, so when distributing work, neighboring pixels should go to different threads. Based on the thread count, I therefore hand out consecutive pixels to different threads in a round-robin fashion: `for (int index = tid; index < maxidx; index += MAX_THREADS)`
2. After finishing this version, my time on the scoreboard was still about twice that of the leading submissions. After checking everything I could, I concluded there was not much left to trim in the program itself, so SIMD instructions were the only thing left to try.
3. With SIMD, I use `__m128d` so that each thread computes two adjacent pixels at a time (a scalar reference version of this kernel appears right after the HW2A listing below): `for (int index = tid*2; index < maxidx; index += MAX_THREADS*2)`
4. This speeds up the computation considerably: two double-precision values are processed per operation, giving close to a 2x improvement.

HW2B:
1. I first converted the earlier pthread version, which already spreads the pixels out very finely, into an OpenMP version.
2. Then I simply cut the image into large contiguous blocks according to the number of nodes.
   ![](https://i.imgur.com/GyhP6xJ.png)
3. The result was terrible: the load balance between nodes was very poor and the runtime was extremely long.
   ![](https://i.imgur.com/nuKRjJq.png)
4. My next idea was a master-slave work-distribution scheme, but I judged that neither the implementation nor the communication cost would be easy to handle (a rough sketch of this idea appears after the hw2b listing).
5. So, going back to the initial observation that neighboring pixels have similarly large or small costs, I changed each node's share to the interleaved pattern below (see the sketch just after this list).
   ![](https://i.imgur.com/FHPVflQ.png)
6. Finally, I use MPI_Reduce to combine the results.
7. The final result is not bad: the judge time dropped from over 1000 seconds to about 300 seconds.
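To make the two hw2b partitionings concrete before the full listings, here is a small illustrative sketch of my own (not part of the submission) that prints which rows each rank would compute under the contiguous block split of step 2 versus the interleaved split of step 5; the `size` and `height` values are arbitrary examples.

```cpp
#include <cstdio>

// Illustration only: which rows each MPI rank computes under the two schemes.
// "size" plays the role of the number of ranks, "height" the image height.
int main() {
    const int size = 4;      // number of MPI ranks (assumed example value)
    const int height = 12;   // image height in rows (assumed example value)

    // Scheme of step 2: contiguous blocks -- the heavy region can land on one rank.
    for (int rank = 0; rank < size; ++rank) {
        int chunk = (height + size - 1) / size;
        int begin = rank * chunk;
        int end = begin + chunk < height ? begin + chunk : height;
        printf("block      rank %d: rows [%d, %d)\n", rank, begin, end);
    }

    // Scheme of step 5: interleaved rows -- neighboring (similar-cost) rows are
    // spread across ranks, matching `for (int i = rank; i < height; i += size)`.
    for (int rank = 0; rank < size; ++rank) {
        printf("interleave rank %d: rows", rank);
        for (int i = rank; i < height; i += size) printf(" %d", i);
        printf("\n");
    }
    return 0;
}
```

With interleaving, each rank receives rows scattered over the whole image, so the expensive region near the set boundary is shared by all nodes instead of falling on a single one.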
The HW2A implementation is as follows:

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <gmp.h>
#include <emmintrin.h>

int MAX_THREADS;
const double two = 2;
__m128d _2 = _mm_load1_pd(&two);
int my_j, my_i;
int my_iters;
int my_width;
int my_height;
double left;
double right;
double lower;
double upper;
/* allocate memory for image */
int* my_image;
double y0_base;
double x0_base;
double* global_treadtime;

void* iterCal(void* threadid) {
    double length_squared[2] = {0, 0};
    __m128d tmp_x, tmp_y, _length_squared;
    __m128d x_square, y_square;
    __m128d _x, _y;
    __m128d _y0, _x0;
    int a, b;
    int tid = *(int*)threadid;
    int maxidx = my_height * my_width;
    /* round-robin: each thread takes two adjacent pixels per step */
    for (int index = tid * 2; index < maxidx; index += MAX_THREADS * 2) {
        if (index != maxidx - 1) {
            a = index;
            b = index + 1;
            double y0[2], x0[2];
            y0[0] = a / my_width * y0_base + lower;
            y0[1] = b / my_width * y0_base + lower;
            x0[0] = (a % my_width) * x0_base + left;
            x0[1] = (b % my_width) * x0_base + left;
            _y0 = _mm_load_pd(y0);
            _x0 = _mm_load_pd(x0);
            int repeats[2];
            repeats[0] = 0;
            repeats[1] = 0;
            x_square = y_square = tmp_y = _y = _x = tmp_x = _length_squared = _mm_setzero_pd();
            /* vectorized escape-time iteration for pixels a and b */
            while (1) {
                tmp_x = _x;
                tmp_y = _y;
                x_square = _mm_mul_pd(tmp_x, tmp_x);
                y_square = _mm_mul_pd(tmp_y, tmp_y);
                _length_squared = _mm_add_pd(x_square, y_square);
                _mm_store_pd(length_squared, _length_squared);
                if (length_squared[0] >= 4 || length_squared[1] >= 4) {
                    break;
                }
                _x = _mm_add_pd(_mm_sub_pd(x_square, y_square), _x0);
                _y = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(tmp_x, tmp_y), _2), _y0);
                repeats[0]++;
                repeats[1]++;
                if (repeats[0] >= my_iters || repeats[1] >= my_iters) break;
            }
            /* finish whichever pixel has not escaped yet with the scalar loop */
            if (length_squared[0] < 4 && repeats[0] < my_iters) {
                double y0 = a / my_width * y0_base + lower;
                double x0 = (a % my_width) * x0_base + left;
                double x = 0;
                double y = 0;
                _mm_storel_pd(&x, _x);
                _mm_storel_pd(&y, _y);
                while (repeats[0] < my_iters && length_squared[0] < 4) {
                    double temp = x * x - y * y + x0;
                    y = 2 * x * y + y0;
                    x = temp;
                    length_squared[0] = x * x + y * y;
                    ++repeats[0];
                }
            } else if (length_squared[1] < 4 && repeats[1] < my_iters) {
                double y0 = b / my_width * y0_base + lower;
                double x0 = (b % my_width) * x0_base + left;
                double x = 0;
                double y = 0;
                _mm_storeh_pd(&x, _x);
                _mm_storeh_pd(&y, _y);
                while (repeats[1] < my_iters && length_squared[1] < 4) {
                    double temp = x * x - y * y + x0;
                    y = 2 * x * y + y0;
                    x = temp;
                    length_squared[1] = x * x + y * y;
                    ++repeats[1];
                }
            }
            my_image[a] = repeats[0];
            my_image[b] = repeats[1];
        } else {
            /* leftover single pixel at the very end: plain scalar iteration */
            double y0 = index / my_width * y0_base + lower;
            double x0 = (index % my_width) * x0_base + left;
            double x = 0;
            double y = 0;
            int repeats_L = 0;
            double length_squared_L = 0;
            while (repeats_L < my_iters && length_squared_L < 4) {
                double temp = x * x - y * y + x0;
                y = 2 * x * y + y0;
                x = temp;
                length_squared_L = x * x + y * y;
                ++repeats_L;
            }
            my_image[index] = repeats_L;
        }
    }
    pthread_exit(NULL);
}

void write_png(const char* filename, int iters, int width, int height, const int* buffer) {
    FILE* fp = fopen(filename, "wb");
    assert(fp);
    png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
    assert(png_ptr);
    png_infop info_ptr = png_create_info_struct(png_ptr);
    assert(info_ptr);
    png_init_io(png_ptr, fp);
    png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
                 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
    png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
    png_write_info(png_ptr, info_ptr);
    png_set_compression_level(png_ptr, 1);
    size_t row_size = 3 * width * sizeof(png_byte);
    png_bytep row = (png_bytep)malloc(row_size);
    for (int y = 0; y < height; ++y) {
        memset(row, 0, row_size);
        for (int x = 0; x < width; ++x) {
            int p = buffer[(height - 1 - y) * width + x];
            png_bytep color = row + x * 3;
            if (p != iters) {
                if (p & 16) {
                    color[0] = 240;
                    color[1] = color[2] = p % 16 * 16;
                } else {
                    color[0] = p % 16 * 16;
                }
            }
        }
        png_write_row(png_ptr, row);
    }
    free(row);
    png_write_end(png_ptr, NULL);
    png_destroy_write_struct(&png_ptr, &info_ptr);
    fclose(fp);
}

int main(int argc, char** argv) {
    cpu_set_t cpu_set;
    sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
    MAX_THREADS = 500;
    //printf("%d cpus available\n", CPU_COUNT(&cpu_set));
    pthread_t threads[MAX_THREADS];
    long threadnum = 0;
    int rc;
    int ID[MAX_THREADS];
    void* ret;  /* return value from the child threads */

    /* argument parsing */
    assert(argc == 9);
    const char* filename = argv[1];
    my_iters = strtol(argv[2], 0, 10);
    left = strtod(argv[3], 0);
    right = strtod(argv[4], 0);
    lower = strtod(argv[5], 0);
    upper = strtod(argv[6], 0);
    my_width = strtol(argv[7], 0, 10);
    my_height = strtol(argv[8], 0, 10);
    my_image = (int*)malloc(my_width * my_height * sizeof(int));
    y0_base = ((upper - lower) / my_height);
    x0_base = ((right - left) / my_width);

    for (int t = 0; t < MAX_THREADS; t++) {
        ID[t] = t;
        rc = pthread_create(&threads[t], NULL, iterCal, (void*)&ID[t]);
    }
    /* mandelbrot set */
    for (int t = 0; t < MAX_THREADS; t++) {
        pthread_join(threads[t], &ret);
    }
    /* draw and cleanup */
    write_png(filename, my_iters, my_width, my_height, my_image);
    free(my_image);
}
```
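For reference, the scalar escape-time kernel that the SSE2 code above evaluates two pixels at a time is shown below as a standalone function. The function name and the small `main` are mine, added only for illustration; the loop body matches the scalar fallback path in the listing.

```cpp
#include <cstdio>

// Minimal scalar escape-time iteration for one pixel at (x0, y0): this is the
// computation the __m128d code above performs for two pixels per step.
int mandelbrot_scalar(double x0, double y0, int max_iters) {
    double x = 0, y = 0, length_squared = 0;
    int repeats = 0;
    while (repeats < max_iters && length_squared < 4) {
        double temp = x * x - y * y + x0;  // real part of z^2 + c
        y = 2 * x * y + y0;                // imaginary part of z^2 + c
        x = temp;
        length_squared = x * x + y * y;    // escape test on |z|^2
        ++repeats;
    }
    return repeats;  // stored into my_image[] in the real code
}

int main() {
    // Example point: iteration count for c = (-0.5, 0.5) with 10000 iterations.
    printf("%d\n", mandelbrot_scalar(-0.5, 0.5, 10000));
    return 0;
}
```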
hw2b:

```cpp
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <gmp.h>
#include <emmintrin.h>
#include <omp.h>
#include <mpi.h>
#include <iostream>
#include <string>

int MAX_THREADS;
const double two = 2;
__m128d _2 = _mm_load1_pd(&two);
int my_j, my_i;
int my_iters;
int my_width;
int my_height;
double left;
double right;
double lower;
double upper;
/* allocate memory for image */
int* my_image;
int* global_image;
double y0_base;
double x0_base;
double* global_treadtime;
int spiltsize;
int rank, size, omp_threads, omp_thread;
int node_statue;

void write_png(const char* filename, int iters, int width, int height, const int* buffer) {
    FILE* fp = fopen(filename, "wb");
    assert(fp);
    png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
    assert(png_ptr);
    png_infop info_ptr = png_create_info_struct(png_ptr);
    assert(info_ptr);
    png_init_io(png_ptr, fp);
    png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
                 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
    png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
    png_write_info(png_ptr, info_ptr);
    png_set_compression_level(png_ptr, 1);
    size_t row_size = 3 * width * sizeof(png_byte);
    png_bytep row = (png_bytep)malloc(row_size);
    for (int y = 0; y < height; ++y) {
        memset(row, 0, row_size);
        for (int x = 0; x < width; ++x) {
            int p = buffer[(height - 1 - y) * width + x];
            png_bytep color = row + x * 3;
            if (p != iters) {
                if (p & 16) {
                    color[0] = 240;
                    color[1] = color[2] = p % 16 * 16;
                } else {
                    color[0] = p % 16 * 16;
                }
            }
        }
        png_write_row(png_ptr, row);
    }
    free(row);
    png_write_end(png_ptr, NULL);
    png_destroy_write_struct(&png_ptr, &info_ptr);
    fclose(fp);
}

int main(int argc, char** argv) {
    double CpuTime = 0, MPIIO = 0, MPICOMM = 0;
    double starttime, endtime, comm_start, comm_end;
    cpu_set_t cpu_set;
    sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
    MAX_THREADS = CPU_COUNT(&cpu_set);
    MPI_Init(&argc, &argv);
    comm_start = MPI_Wtime();
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    long threadnum = 0;
    int rc;
    int ID[MAX_THREADS];
    pthread_t thread;
    void* ret;  /* return value from the child threads */

    /* argument parsing */
    assert(argc == 9);
    const char* filename = argv[1];
    my_iters = strtol(argv[2], 0, 10);
    left = strtod(argv[3], 0);
    right = strtod(argv[4], 0);
    lower = strtod(argv[5], 0);
    upper = strtod(argv[6], 0);
    my_width = strtol(argv[7], 0, 10);
    my_height = strtol(argv[8], 0, 10);
    my_image = (int*)malloc(my_width * my_height * sizeof(int));
    memset(my_image, 0, my_width * my_height * sizeof(int));
    global_image = (int*)malloc(my_width * my_height * sizeof(int));
    memset(global_image, 0, my_width * my_height * sizeof(int));
    y0_base = ((upper - lower) / my_height);
    x0_base = ((right - left) / my_width);
    int totalDataLen = my_width * my_height;

    /* split the work across nodes */
    int i = 0;
    //unsigned long long pixels = 0;
    #pragma omp parallel
    {
        omp_threads = omp_get_num_threads();
    }
    spiltsize = my_height;
    #pragma omp parallel num_threads(MAX_THREADS)
    {
        /* each rank takes every size-th row (interleaved rows across nodes) */
        for (int i = rank; i < spiltsize; i += size) {
            int localstart_end_n[3];
            localstart_end_n[0] = i * my_width;            /* first pixel of this row */
            localstart_end_n[1] = (i + 1) * my_width - 1;  /* last pixel of this row */
            localstart_end_n[2] = my_width;
            int tid = omp_get_thread_num();
            double length_squared[2] = {0, 0};
            __m128d tmp_x, tmp_y, _length_squared;
            __m128d x_square, y_square;
            __m128d _x, _y;
            __m128d _y0, _x0;
            int a, b;
            int maxidx = my_height * my_width;
            int index = 0;
            /* within the row, each OpenMP thread takes two adjacent pixels per step */
            for (index = tid * 2 + localstart_end_n[0]; index < localstart_end_n[1]; index += MAX_THREADS * 2) {
                a = index;
                b = index + 1;
                double y0[2], x0[2];
                y0[0] = a / my_width * y0_base + lower;
                y0[1] = b / my_width * y0_base + lower;
                x0[0] = (a % my_width) * x0_base + left;
                x0[1] = (b % my_width) * x0_base + left;
                _y0 = _mm_load_pd(y0);
                _x0 = _mm_load_pd(x0);
                int repeats[2];
                repeats[0] = 0;
                repeats[1] = 0;
                x_square = y_square = tmp_y = _y = _x = tmp_x = _length_squared = _mm_setzero_pd();
                /* vectorized escape-time iteration for pixels a and b */
                while (1) {
                    tmp_x = _x;
                    tmp_y = _y;
                    x_square = _mm_mul_pd(tmp_x, tmp_x);
                    y_square = _mm_mul_pd(tmp_y, tmp_y);
                    _length_squared = _mm_add_pd(x_square, y_square);
                    _mm_store_pd(length_squared, _length_squared);
                    if (length_squared[0] >= 4 || length_squared[1] >= 4) {
                        break;
                    }
                    _x = _mm_add_pd(_mm_sub_pd(x_square, y_square), _x0);
                    _y = _mm_add_pd(_mm_mul_pd(_mm_mul_pd(tmp_x, tmp_y), _2), _y0);
                    repeats[0]++;
                    repeats[1]++;
                    if (repeats[0] >= my_iters || repeats[1] >= my_iters) break;
                }
                /* finish whichever pixel has not escaped yet with the scalar loop */
                if (length_squared[0] < 4 && repeats[0] < my_iters) {
                    double y0 = a / my_width * y0_base + lower;
                    double x0 = (a % my_width) * x0_base + left;
                    double x = 0;
                    double y = 0;
                    _mm_storel_pd(&x, _x);
                    _mm_storel_pd(&y, _y);
                    while (repeats[0] < my_iters && length_squared[0] < 4) {
                        double temp = x * x - y * y + x0;
                        y = 2 * x * y + y0;
                        x = temp;
                        length_squared[0] = x * x + y * y;
                        ++repeats[0];
                    }
                } else if (length_squared[1] < 4 && repeats[1] < my_iters) {
                    double y0 = b / my_width * y0_base + lower;
                    double x0 = (b % my_width) * x0_base + left;
                    double x = 0;
                    double y = 0;
                    _mm_storeh_pd(&x, _x);
                    _mm_storeh_pd(&y, _y);
                    while (repeats[1] < my_iters && length_squared[1] < 4) {
                        double temp = x * x - y * y + x0;
                        y = 2 * x * y + y0;
                        x = temp;
                        length_squared[1] = x * x + y * y;
                        ++repeats[1];
                    }
                }
                my_image[a] = repeats[0];
                my_image[b] = repeats[1];
            }
            /* leftover single pixel at the end of the row: plain scalar iteration */
            if (index == localstart_end_n[1]) {
                double y0 = index / my_width * y0_base + lower;
                double x0 = (index % my_width) * x0_base + left;
                double x = 0;
                double y = 0;
                int repeats_L = 0;
                double length_squared_L = 0;
                while (repeats_L < my_iters && length_squared_L < 4) {
                    double temp = x * x - y * y + x0;
                    y = 2 * x * y + y0;
                    x = temp;
                    length_squared_L = x * x + y * y;
                    ++repeats_L;
                }
                my_image[index] = repeats_L;
            }
        }
    }
    /* untouched pixels stay 0, so MPI_MAX keeps each rank's computed rows */
    MPI_Reduce((void*)my_image, (void*)global_image, totalDataLen, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        write_png(filename, my_iters, my_width, my_height, global_image);
    }
    MPI_Finalize();
    free(my_image);
    free(global_image);
}
```
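Step 4 of the hw2b notes mentions a master-slave scheme that was considered but not adopted. Purely as an illustration of what that would involve (and of the extra messaging it brings), a rough sketch is given below; the function, tag values, and the `compute_row` placeholder are all hypothetical, and none of this was written or benchmarked for the assignment.

```cpp
// Hypothetical master-worker row dispatch -- NOT the submitted approach.
// Rank 0 hands out one row index at a time; a worker asks for more work by
// returning its finished row. Assumes at least two MPI ranks.
#include <mpi.h>
#include <algorithm>
#include <vector>

static const int TAG_WORK = 1, TAG_DONE = 2;

void master_worker_rows(int height, int width, int* image /* used on rank 0 only */) {
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0) {
        int next_row = 0, active = size - 1;
        std::vector<int> buf(width + 1);  // buf[0] = row index, rest = pixel data
        // Seed every worker with one row, or a stop signal (-1) if none remain.
        for (int w = 1; w < size; ++w) {
            int row = (next_row < height) ? next_row++ : -1;
            if (row < 0) --active;
            MPI_Send(&row, 1, MPI_INT, w, TAG_WORK, MPI_COMM_WORLD);
        }
        while (active > 0) {
            MPI_Status st;
            MPI_Recv(buf.data(), width + 1, MPI_INT, MPI_ANY_SOURCE, TAG_DONE,
                     MPI_COMM_WORLD, &st);
            std::copy(buf.begin() + 1, buf.end(), image + buf[0] * width);
            int row = (next_row < height) ? next_row++ : -1;  // next row or stop
            if (row < 0) --active;
            MPI_Send(&row, 1, MPI_INT, st.MPI_SOURCE, TAG_WORK, MPI_COMM_WORLD);
        }
    } else {
        std::vector<int> buf(width + 1);
        while (true) {
            int row;
            MPI_Recv(&row, 1, MPI_INT, 0, TAG_WORK, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            if (row < 0) break;            // stop signal from the master
            buf[0] = row;
            // compute_row(row, &buf[1]);  // placeholder: run the OpenMP+SSE2 kernel here
            MPI_Send(buf.data(), width + 1, MPI_INT, 0, TAG_DONE, MPI_COMM_WORLD);
        }
    }
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    const int height = 8, width = 16;           // toy sizes for illustration
    std::vector<int> image(height * width, 0);  // only meaningful on rank 0
    master_worker_rows(height, width, image.data());
    MPI_Finalize();
    return 0;
}
```

Every finished row costs one result message plus one new work message, which is exactly the communication overhead the report chose to avoid by using the static interleaved split instead.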
---

## Experiment & Analysis

1. Methodology
    * System Spec: I used our lab machines for this part.
      ![](https://i.imgur.com/ugbwgv8.png)
    * Performance Metrics:
      For time measurement, HW2A is measured with `clock_gettime(CLOCK_MONOTONIC, &end);` and HW2B with `MPI_Wtime()`. All ranks are measured, and I record the processor with the longest elapsed time (a minimal sketch of this measurement is included at the end of this report). The time is split into the following categories:
        1. CPU Time: total CPU computation time.
        2. MPI IO: total I/O time.
        3. MPI COMM: time spent in MPI data exchange.

      The test data is the judge's strict23 case: `srun -n4 -c8 hw2b out.png 10000 0.3182631 0.3182632 -0.4128295 -0.4128296 2575 3019`. Its workload is large enough to reveal the differences. Each configuration is run 5 times; the highest and lowest are dropped and the rest averaged. For the last data point, the run with the larger gap is used for comparison.
      ![](https://i.imgur.com/TtPgBcx.png)
      ![](https://i.imgur.com/xSb4jYv.png)
2. Plots: Speedup Factor & Time Profile
    #### HW2A: time with different CPU counts
    ![](https://i.imgur.com/T67Rusq.png)
    #### HW2A speedup factor
    ![](https://i.imgur.com/YZbs0qM.png)
    #### HW2B: multiple nodes, 8 CPUs per node
    ![](https://i.imgur.com/jHrVSBo.png)
    #### HW2B speedup factor
    ![](https://i.imgur.com/EgCfK5I.png)
3. Discussion (Must base on the results in your plots)

    For HW2A on a single node, the CPU time keeps dropping as more processors are added. With 12 CPUs the speedup even exceeds 12x over 1 CPU; I am not sure whether this is measurement noise, or whether the interleaved thread partitioning happens to break up some extreme neighboring values (for example at boundaries where the iteration count jumps between 1 and 10000), so that no thread has to wait on a long run of heavy pixels and the speedup exceeds the ideal value. Overall, I consider the scaling of this pthread + SIMD version to be quite good; there is no noticeable overhead dragging the speedup down.

    For HW2B, the single-CPU time is almost identical to the HW2A version, which shows that my rewrite does not add much up-front overhead. When nodes are added, the speedup roughly holds at a 1.7x improvement all the way to the fourth node, which I also consider acceptable. However, the MPI IO time clearly grows: because I use MPI_Reduce, combining all the results takes more time.
4. Experiences / Conclusion

    This assignment made me more familiar with pthreads and taught me the importance of load balancing. Using the SIMD instruction set was also a rather rare experience; I had used it as an undergraduate, but never in anything this complex. I also found that for this kind of computation, how to split the work across points is a craft in itself: how the data is scattered is a factor that must be considered.
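As referenced in the Methodology section, the per-rank timing itself is not visible in the listings above. Below is a minimal sketch of how such a measurement can be taken with `MPI_Wtime()` and a max-reduction; the variable names are illustrative, and for hw2a the same pattern applies with `clock_gettime(CLOCK_MONOTONIC, ...)` around the pthread create/join calls.

```cpp
// Minimal sketch of the per-rank timing described in the Methodology section.
// Names here (compute_time, slowest) are illustrative, not from the submission.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double start = MPI_Wtime();
    // ... compute this rank's share of the image (OpenMP + SSE2 kernel) ...
    double compute_time = MPI_Wtime() - start;

    // Keep the slowest rank's time, matching "record the processor with the
    // longest elapsed time" in the methodology.
    double slowest = 0;
    MPI_Reduce(&compute_time, &slowest, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0) printf("CPU time (slowest rank): %f s\n", slowest);

    MPI_Finalize();
    return 0;
}
```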