```cpp= #include <algorithm> #include <chrono> #include <cmath> #include <iomanip> #include <iostream> #include <numa.h> #include <numeric> #include <omp.h> #include <random> #include <utility> #include <vector> using Matrix = std::vector<double *>; void usage(const char *name) { std::cout << "usage: " << name << " matrix_size n_workers\n"; exit(-1); } void print_matrix(const Matrix &mat) { std::cout << std::setprecision(4) << std::fixed; int n = mat.size(); for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { std::cout << mat[i][j] << ' '; } std::cout << '\n'; } } void lu_initialize(Matrix &A, Matrix &A_copy) { #pragma omp parallel { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<> dis(0.0, 1.0); int n = A.size(), row_size = sizeof(double) * n; int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads(); for (int i = thread_num; i < n; i += num_threads) { A[i] = (double *) numa_alloc_local(row_size); A_copy[i] = (double *) numa_alloc_local(row_size); for (int j = 0; j < n; ++j) { A[i][j] = A_copy[i][j] = dis(gen); } } } } void lu_decomposition(Matrix& A, Matrix &L, Matrix &U, std::vector<int> &pi, std::vector<int> &pi_inverse) { double global_pivot = 0.0; int global_pivot_index = -1; // std::iota(pi.begin(), pi.end(), 0); // std::iota(pi_inverse.begin(), pi_inverse.end(), 0); #pragma omp parallel { int n = A.size(), row_size = sizeof(double) * n; int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads(); for (int i = thread_num; i < n; i += num_threads) { L[i] = (double *) numa_alloc_local(row_size); U[i] = (double *) numa_alloc_local(row_size); memset(L[i], 0, row_size); memset(U[i], 0, row_size); L[i][i] = 1.0; pi[i] = pi_inverse[i] = i; } #pragma omp barrier for (int k = 0; k < n; ++k) { double local_pivot = 0.0; int local_pivot_index = -1; // #pragma omp for nowait // for (int i = k; i < n; ++i) { // if (local_pivot < std::abs(A[i][k])) { // local_pivot = std::abs(A[i][k]); // local_pivot_index = i; // } // } for (int i = thread_num; i < n; i += num_threads) { // if (pi_inverse[i] < k) continue; // if (local_pivot < std::abs(A[pi_inverse[i]][k])) { // local_pivot = std::abs(A[pi_inverse[i]][k]); // local_pivot_index = pi_inverse[i]; // } if (i < k) continue; if (local_pivot < std::abs(A[i][k])) { local_pivot = std::abs(A[i][k]); local_pivot_index = i; } } #pragma omp critical { if (local_pivot > global_pivot) { global_pivot = local_pivot; global_pivot_index = local_pivot_index; } } #pragma omp barrier #pragma omp master { if (global_pivot == 0.0) { std::cout << "singular matrix!\n"; exit(-1); } if (global_pivot_index != k) { // pi_inverse[pi[k]] = global_pivot_index; // pi_inverse[pi[global_pivot_index]] = k; std::swap(pi[k], pi[global_pivot_index]); // std::swap(A[k], A[global_pivot_index]); std::swap_ranges(A[k], A[k] + n, A[global_pivot_index]); std::swap_ranges(L[k], L[k] + k, L[global_pivot_index]); } U[k][k] = A[k][k]; global_pivot = 0.0; global_pivot_index = -1; } #pragma omp barrier // #pragma omp parallel for // for (int i = k + 1; i < n; ++i) { // L[i][k] = A[i][k] / U[k][k]; // U[k][i] = A[k][i]; // } for (int i = thread_num; i < n; i += num_threads) { // if (pi_inverse[i] <= k) continue; // L[pi_inverse[i]][k] = A[pi_inverse[i]][k] / U[k][k]; // U[k][pi_inverse[i]] = A[k][pi_inverse[i]]; if (i <= k) continue; L[i][k] = A[i][k] / U[k][k]; U[k][i] = A[k][i]; } #pragma omp barrier // #pragma omp parallel // { // #pragma omp for nowait // for (int i = k + 1; i < n; ++i) { // for (int j = k + 1; j < n; ++j) { // A[i][j] -= L[i][k] * U[k][j]; // } // } // } for (int i = thread_num; i < n; i += num_threads) { // if (pi_inverse[i] <= k) continue; // for (int j = k + 1; j < n; ++j) { // A[pi_inverse[i]][j] -= L[pi_inverse[i]][k] * U[k][j]; // } if (i <= k) continue; for (int j = k + 1; j < n; ++j) { A[i][j] -= L[i][k] * U[k][j]; } } } } } double lu_verify(const Matrix &A, const Matrix &L, const Matrix &U, const std::vector<int> &pi) { int n = A.size(); double residual = 0.0; std::vector<double> column_norms(n); Matrix LU(n); #pragma omp parallel { int n = A.size(), row_size = sizeof(double) * n; int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads(); for (int i = thread_num; i < n; i += num_threads) { LU[i] = (double *) numa_alloc_local(row_size); for (int k = 0; k < n; ++k) { if (i < k) break; for (int j = 0; j < n; ++j) { if (j < k) continue; LU[i][j] += L[i][k] * U[k][j]; } } } #pragma omp barrier // for (int i = thread_num; i < n; i += num_threads) { // for (int j = 0; j < n; ++j) { // // std::cout << A[pi[i]][j] - LU[i][j] << '\n'; // column_norms[j] += pow(A[pi[i]][j] - LU[i][j], 2); // } // } for (int j = 0; j < n; ++j) { double square_sum = 0.0; for (int i = thread_num; i < n; i += num_threads) { square_sum += pow(A[pi[i]][j] - LU[i][j], 2); } #pragma omp critical { column_norms[j] += square_sum; } } } for (double& column_norm : column_norms) { residual += sqrt(column_norm); } // #pragma omp parallel for // for (int i = 0; i < n; ++i) { // for (int k = 0; k < n; ++k) { // if (i < k) break; // for (int j = 0; j < n; ++j) { // if (j < k) continue; // LU[i][j] += L[i][k] * U[k][j]; // } // } // } // #pragma omp parallel for reduction(+: residual) // for (int j = 0; j < n; ++j) { // double norm = 0.0; // for (int i = 0; i < n; ++i) { // norm += pow(A[p[i]][j] - LU[i][j], 2); // } // residual += sqrt(norm); // } // std::cout << "L:\n"; // print_matrix(L); // std::cout << '\n'; // std::cout << "U:\n"; // print_matrix(U); // std::cout << '\n'; // std::cout << "LU:\n"; // print_matrix(LU); // std::cout << '\n'; // std::cout << "A:\n"; // for (int i = 0; i < n; ++i) { // for (int j = 0; j < n; ++j) { // std::cout << A[pi[i]][j] << ' '; // } // std::cout << '\n'; // } // std::cout << '\n'; return residual; } int main(int argc, char **argv) { const char *name = argv[0]; if (argc < 3) { usage(name); } int matrix_size = atoi(argv[1]); int n_workers = atoi(argv[2]); omp_set_num_threads(n_workers); std::cout << name << ": " << matrix_size << " " << n_workers << '\n'; std::cout << "Initializaing matrix A with random numbers ...\n"; auto start = std::chrono::steady_clock::now(); Matrix A(matrix_size), A_copy(matrix_size); lu_initialize(A, A_copy); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> diff = end - start; std::cout << "duration: " << diff.count() << " s\n"; std::cout << "Running a LU decomposition ...\n"; start = std::chrono::steady_clock::now(); Matrix L(matrix_size); Matrix U(matrix_size); std::vector<int> pi(matrix_size); std::vector<int> pi_inverse(matrix_size); lu_decomposition(A, L, U, pi, pi_inverse); end = std::chrono::steady_clock::now(); diff = end - start; std::cout << "duration: " << diff.count() << " s\n"; std::cout << "Verifying the LU decomposition ...\n"; start = std::chrono::steady_clock::now(); double residual = lu_verify(A_copy, L, U, pi); std::cout << "residual = " << residual << '\n'; end = std::chrono::steady_clock::now(); diff = end - start; std::cout << "duration: " << diff.count() << " s\n"; return 0; } ```