```cpp #include "core/common/graupel.hpp" #include <algorithm> #include <iostream> #include <numeric> #include <vector> #include <execution> #include <chrono> #include <thread> #include <ranges> #include <atomic> using namespace property; using namespace thermo; using namespace transition; using namespace idx; using namespace graupel_ct; struct t_qx_ptr { t_qx_ptr (array_1d_t< real_t > &p_, array_1d_t< real_t > &x_) : p (p_), x (x_) { } array_1d_t< real_t > &p; array_1d_t< real_t > &x; }; // pointer vector /** * @brief TODO * * @param precip time step for integration of microphysics (s) * @param params fall speed parameters * @param zeta dt/(2dz) * @param vc state dependent fall speed correction * @param flx flux into cell from above * @param vt terminal velocity * @param q specific mass of hydrometeor * @param q_kp1 specific mass in next lower cell * @param rho density */ void precip (const real_t (&params)[3], real_t (&precip)[3], real_t zeta, real_t vc, real_t flx, real_t vt, real_t q, real_t q_kp1, real_t rho) { real_t rho_x, flx_eff, flx_partial; rho_x = q * rho; flx_eff = (rho_x / zeta) + static_cast< real_t > (2.0) * flx; flx_partial = rho_x * vc * fall_speed (rho_x, params); flx_partial = std::fmin (flx_partial, flx_eff); precip[0] = (zeta * (flx_eff - flx_partial)) / ((static_cast< real_t > (1.0) + zeta * vt) * rho); // q update precip[1] = (precip[0] * rho * vt + flx_partial) * static_cast< real_t > (0.5); // flx rho_x = (precip[0] + q_kp1) * static_cast< real_t > (0.5) * rho; precip[2] = vc * fall_speed (rho_x, params); // vt } struct Result { bool flag; bool is_sig; }; void graupel (size_t &nvec, size_t &ke, size_t &ivstart, size_t &ivend, size_t &kstart, real_t &dt, array_1d_t< real_t > &dz, array_1d_t< real_t > &t, array_1d_t< real_t > &rho, array_1d_t< real_t > &p, array_1d_t< real_t > &qv, array_1d_t< real_t > &qc, array_1d_t< real_t > &qi, array_1d_t< real_t > &qr, array_1d_t< real_t > &qs, array_1d_t< real_t > &qg, real_t &qnc, array_1d_t< real_t > &prr_gsp, array_1d_t< real_t > &pri_gsp, array_1d_t< real_t > &prs_gsp, array_1d_t< real_t > &prg_gsp, array_1d_t< real_t > &pre_gsp, array_1d_t< real_t > &pflx) { // std::cout << "sequential graupel" << std::endl; array_1d_t< char > is_sig_present ( nvec * ke); // is snow, ice or graupel present? array_1d_t< size_t > ind_k (nvec * ke), ind_i (nvec * ke); // k index of gathered point, iv index of gathered point array_2d_t< size_t > kmin ( nvec, array_1d_t< size_t > (np)); // first level with condensate real_t cv, vc, eta, zeta, qvsi, qice, qliq, qtot, dvsw, dvsw0, dvsi, n_ice, m_ice, x_ice, n_snow, l_snow, ice_dep, e_int, stot, xrho; real_t update[3], // scratch array with output from precipitation step sink[nx], // tendencies dqdt[nx]; // tendencies array_1d_t< real_t > eflx ( nvec); // internal energy flux from precipitation (W/m2 ) array_2d_t< real_t > sx2x ( nx, array_1d_t< real_t > (nx, ZERO)), // conversion rates vt (nvec, array_1d_t< real_t > ( np)); // terminal velocity for different hydrometeor categories array_1d_t< t_qx_ptr > q{}; // vector of pointers to point to four hydrometeor inouts array_1d_t< real_t > emptyArray; q.emplace_back (prr_gsp, qr); q.emplace_back (pri_gsp, qi); q.emplace_back (prs_gsp, qs); q.emplace_back (prg_gsp, qg); q.emplace_back (emptyArray, qc); q.emplace_back (emptyArray, qv); size_t jmx = 0; size_t jmx_ = jmx; size_t count = ke * (ivend - ivstart); std::vector< int > flag_array (count); std::vector< int > is_sig_array (count); std::for_each_n (std::execution::par, std::views::iota (0).begin (), count, [=, // global variable lqc = lqc, lqr = lqr, lqs = lqs, lqi = lqi, lqg = lqg, qmin = qmin, tfrz_het2 = tfrz_het2, np = np, ZERO = ZERO, c1es = c1es, c3ies = c3ies, tmelt = thermodyn::tmelt, c4ies = c4ies, rv = rv, // vector / vector reference / vector pointer ind_k = ind_k.data (), ind_i = ind_i.data (), is_sig_present = is_sig_present.data (), // this is vector<bool>, change to vector<char> kmin = kmin.data (), t = t.data (), rho = rho.data (), flag_array = flag_array.data (), is_sig_array = is_sig_array.data (), // struct with vector q_lqc_x = q[lqc].x.data (), q_lqr_x = q[lqr].x.data (), q_lqs_x = q[lqs].x.data (), q_lqi_x = q[lqi].x.data (), q_lqg_x = q[lqg].x.data (), q_lqv_x = q[lqv].x.data ()] (size_t jmx) { size_t i_offset = jmx / (ivend - ivstart); size_t j_offset = jmx % (ivend - ivstart); size_t i = ke - 1 - i_offset; size_t j = ivstart + j_offset; size_t oned_vec_index = i * ivend + j; //printf ("%llu, %llu, %llu\n", i, j, jmx); bool cond = ((std::max (std::max (std::max (std::max (q_lqc_x[oned_vec_index], q_lqr_x[oned_vec_index]), q_lqs_x[oned_vec_index]), q_lqi_x[oned_vec_index]), q_lqg_x[oned_vec_index]) > qmin) || ((t[oned_vec_index] < tfrz_het2) && (q_lqv_x[oned_vec_index] > (c1es * exp (c3ies * (t[oned_vec_index] - thermodyn::tmelt) / (t[oned_vec_index] - c4ies)) / (rho[oned_vec_index] * rv * t[oned_vec_index]))))); if (cond) { flag_array[jmx] = true; is_sig_array[jmx] = (std::max (std::max (q_lqs_x[oned_vec_index], q_lqi_x[oned_vec_index]), q_lqg_x[oned_vec_index]) > qmin); } }); // The loop is intentionally i<nlev; since we are using an unsigned integer // data type, when i reaches 0, and you try to decrement further, (to -1), it // wraps to the maximum value representable by size_t. size_t oned_vec_index; //std::vector< int > counter (count, 0); //counter = counter.data (), //counter[jmx]++; //int sum = 0; //for (auto i : counter) //{ //sum += i; //} //std::cout << "DEBUG counter= " << sum << std::endl; for (size_t idx = 0; idx < flag_array.size (); ++idx) { if (flag_array[idx]) { size_t i_offset = idx / (ivend - ivstart); size_t j_offset = idx % (ivend - ivstart); size_t i = ke - 1 - i_offset; size_t j = ivstart + j_offset; ind_k[jmx_] = i; ind_i[jmx_] = j; is_sig_present[jmx_] = is_sig_array[idx]; jmx_++; } } //for (auto i : ind_k) //{ //std::cout << i; //} //std::cout << std::endl; for (size_t i = ke - 1; i < ke; --i) { for (size_t j = ivstart; j < ivend; j++) { size_t oned_vec_index = i * ivend + j; for (size_t ix = 0; ix < np; ix++) { if (i == (ke - 1)) { kmin[j][qp_ind[ix]] = ke + 1; q[qp_ind[ix]].p[j] = ZERO; vt[j][ix] = ZERO; } if (q[qp_ind[ix]].x[oned_vec_index] > qmin) { kmin[j][qp_ind[ix]] = i; } } } } ``` ### sbatch ``` #!/bin/bash #SBATCH --account=ka1273 #SBATCH --job-name=scc #SBATCH --partition=gpu #SBATCH --gpus=1 #SBATCH --output=%x.%j.log #SBATCH --exclusive #SBATCH --time=00:10:00 # compiler flags NOFLAGS="-O0 -std=c++20 -stdpar=gpu -gpu=cc80,noflushz -nofma -Minfo -Kieee -Mfcon -Mnofma -Mdalign -Minfo=all" FLAGS="-O0 -std=c++20 -stdpar=gpu -gpu=cc80 -nofma" #FLAGS='-O0 -stdpar=gpu -gpu=cc80,nofma,noflushz' # input file IN_FILE=$(pwd)/tasks/11k.nc # output file OUT_FILE=output_gpu_double.nc # reference file associated with input file REF_FILE=$(pwd)/reference_results/seq_11k_double.nc # setup env . scripts/levante-setup.sh nvidia gpu rm -r build_std_gpu # build the code cmake -B build_std_gpu -S . \ -DMU_IMPL=std \ -DMU_ARCH=a100 \ -DMU_ENABLE_SINGLE=OFF \ -DCMAKE_CXX_COMPILER=nvc++ \ -DCMAKE_CXX_FLAGS="${FLAGS}" cmake --build build_std_gpu -j 2 # cleanup previous results (if any) if [ -f $OUT_FILE ]; then rm $OUT_FILE fi # run the executable ./build_std_gpu/bin/graupel $IN_FILE $OUT_FILE # check correctness . scripts/check-correctness.sh check $OUT_FILE $REF_FILE echo "Job done!" ```