```cpp=
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <numa.h>
#include <numeric>
#include <omp.h>
#include <random>
#include <utility>
#include <vector>
using Matrix = std::vector<double *>;
void usage(const char *name) {
std::cout << "usage: " << name << " matrix_size n_workers\n";
exit(-1);
}
void print_matrix(const Matrix &mat) {
std::cout << std::setprecision(4) << std::fixed;
int n = mat.size();
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
std::cout << mat[i][j] << ' ';
}
std::cout << '\n';
}
}
void lu_initialize(Matrix &A, Matrix &A_copy) {
#pragma omp parallel
{
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dis(0.0, 1.0);
int n = A.size(), row_size = sizeof(double) * n;
int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads();
for (int i = thread_num; i < n; i += num_threads) {
A[i] = (double *) numa_alloc_local(row_size);
A_copy[i] = (double *) numa_alloc_local(row_size);
for (int j = 0; j < n; ++j) {
A[i][j] = A_copy[i][j] = dis(gen);
}
}
}
}
void lu_decomposition(Matrix& A, Matrix &L, Matrix &U, std::vector<int> &pi, std::vector<int> &pi_inverse) {
double global_pivot = 0.0;
int global_pivot_index = -1;
// std::iota(pi.begin(), pi.end(), 0);
// std::iota(pi_inverse.begin(), pi_inverse.end(), 0);
#pragma omp parallel
{
int n = A.size(), row_size = sizeof(double) * n;
int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads();
for (int i = thread_num; i < n; i += num_threads) {
L[i] = (double *) numa_alloc_local(row_size);
U[i] = (double *) numa_alloc_local(row_size);
memset(L[i], 0, row_size);
memset(U[i], 0, row_size);
L[i][i] = 1.0;
pi[i] = pi_inverse[i] = i;
}
#pragma omp barrier
for (int k = 0; k < n; ++k) {
double local_pivot = 0.0;
int local_pivot_index = -1;
// #pragma omp for nowait
// for (int i = k; i < n; ++i) {
// if (local_pivot < std::abs(A[i][k])) {
// local_pivot = std::abs(A[i][k]);
// local_pivot_index = i;
// }
// }
for (int i = thread_num; i < n; i += num_threads) {
// if (pi_inverse[i] < k) continue;
// if (local_pivot < std::abs(A[pi_inverse[i]][k])) {
// local_pivot = std::abs(A[pi_inverse[i]][k]);
// local_pivot_index = pi_inverse[i];
// }
if (i < k) continue;
if (local_pivot < std::abs(A[i][k])) {
local_pivot = std::abs(A[i][k]);
local_pivot_index = i;
}
}
#pragma omp critical
{
if (local_pivot > global_pivot) {
global_pivot = local_pivot;
global_pivot_index = local_pivot_index;
}
}
#pragma omp barrier
#pragma omp master
{
if (global_pivot == 0.0) {
std::cout << "singular matrix!\n";
exit(-1);
}
if (global_pivot_index != k) {
// pi_inverse[pi[k]] = global_pivot_index;
// pi_inverse[pi[global_pivot_index]] = k;
std::swap(pi[k], pi[global_pivot_index]);
// std::swap(A[k], A[global_pivot_index]);
std::swap_ranges(A[k], A[k] + n, A[global_pivot_index]);
std::swap_ranges(L[k], L[k] + k, L[global_pivot_index]);
}
U[k][k] = A[k][k];
global_pivot = 0.0;
global_pivot_index = -1;
}
#pragma omp barrier
// #pragma omp parallel for
// for (int i = k + 1; i < n; ++i) {
// L[i][k] = A[i][k] / U[k][k];
// U[k][i] = A[k][i];
// }
for (int i = thread_num; i < n; i += num_threads) {
// if (pi_inverse[i] <= k) continue;
// L[pi_inverse[i]][k] = A[pi_inverse[i]][k] / U[k][k];
// U[k][pi_inverse[i]] = A[k][pi_inverse[i]];
if (i <= k) continue;
L[i][k] = A[i][k] / U[k][k];
U[k][i] = A[k][i];
}
#pragma omp barrier
// #pragma omp parallel
// {
// #pragma omp for nowait
// for (int i = k + 1; i < n; ++i) {
// for (int j = k + 1; j < n; ++j) {
// A[i][j] -= L[i][k] * U[k][j];
// }
// }
// }
for (int i = thread_num; i < n; i += num_threads) {
// if (pi_inverse[i] <= k) continue;
// for (int j = k + 1; j < n; ++j) {
// A[pi_inverse[i]][j] -= L[pi_inverse[i]][k] * U[k][j];
// }
if (i <= k) continue;
for (int j = k + 1; j < n; ++j) {
A[i][j] -= L[i][k] * U[k][j];
}
}
}
}
}
double lu_verify(const Matrix &A, const Matrix &L, const Matrix &U, const std::vector<int> &pi) {
int n = A.size();
double residual = 0.0;
std::vector<double> column_norms(n);
Matrix LU(n);
#pragma omp parallel
{
int n = A.size(), row_size = sizeof(double) * n;
int thread_num = omp_get_thread_num(), num_threads = omp_get_num_threads();
for (int i = thread_num; i < n; i += num_threads) {
LU[i] = (double *) numa_alloc_local(row_size);
for (int k = 0; k < n; ++k) {
if (i < k) break;
for (int j = 0; j < n; ++j) {
if (j < k) continue;
LU[i][j] += L[i][k] * U[k][j];
}
}
}
#pragma omp barrier
// for (int i = thread_num; i < n; i += num_threads) {
// for (int j = 0; j < n; ++j) {
// // std::cout << A[pi[i]][j] - LU[i][j] << '\n';
// column_norms[j] += pow(A[pi[i]][j] - LU[i][j], 2);
// }
// }
for (int j = 0; j < n; ++j) {
double square_sum = 0.0;
for (int i = thread_num; i < n; i += num_threads) {
square_sum += pow(A[pi[i]][j] - LU[i][j], 2);
}
#pragma omp critical
{
column_norms[j] += square_sum;
}
}
}
for (double& column_norm : column_norms) {
residual += sqrt(column_norm);
}
// #pragma omp parallel for
// for (int i = 0; i < n; ++i) {
// for (int k = 0; k < n; ++k) {
// if (i < k) break;
// for (int j = 0; j < n; ++j) {
// if (j < k) continue;
// LU[i][j] += L[i][k] * U[k][j];
// }
// }
// }
// #pragma omp parallel for reduction(+: residual)
// for (int j = 0; j < n; ++j) {
// double norm = 0.0;
// for (int i = 0; i < n; ++i) {
// norm += pow(A[p[i]][j] - LU[i][j], 2);
// }
// residual += sqrt(norm);
// }
// std::cout << "L:\n";
// print_matrix(L);
// std::cout << '\n';
// std::cout << "U:\n";
// print_matrix(U);
// std::cout << '\n';
// std::cout << "LU:\n";
// print_matrix(LU);
// std::cout << '\n';
// std::cout << "A:\n";
// for (int i = 0; i < n; ++i) {
// for (int j = 0; j < n; ++j) {
// std::cout << A[pi[i]][j] << ' ';
// }
// std::cout << '\n';
// }
// std::cout << '\n';
return residual;
}
int main(int argc, char **argv) {
const char *name = argv[0];
if (argc < 3) {
usage(name);
}
int matrix_size = atoi(argv[1]);
int n_workers = atoi(argv[2]);
omp_set_num_threads(n_workers);
std::cout << name << ": " << matrix_size << " " << n_workers << '\n';
std::cout << "Initializaing matrix A with random numbers ...\n";
auto start = std::chrono::steady_clock::now();
Matrix A(matrix_size), A_copy(matrix_size);
lu_initialize(A, A_copy);
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
std::cout << "duration: " << diff.count() << " s\n";
std::cout << "Running a LU decomposition ...\n";
start = std::chrono::steady_clock::now();
Matrix L(matrix_size);
Matrix U(matrix_size);
std::vector<int> pi(matrix_size);
std::vector<int> pi_inverse(matrix_size);
lu_decomposition(A, L, U, pi, pi_inverse);
end = std::chrono::steady_clock::now();
diff = end - start;
std::cout << "duration: " << diff.count() << " s\n";
std::cout << "Verifying the LU decomposition ...\n";
start = std::chrono::steady_clock::now();
double residual = lu_verify(A_copy, L, U, pi);
std::cout << "residual = " << residual << '\n';
end = std::chrono::steady_clock::now();
diff = end - start;
std::cout << "duration: " << diff.count() << " s\n";
return 0;
}
```