```cpp
#include "core/common/graupel.hpp"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <execution>
#include <chrono>
#include <thread>
#include <ranges>
#include <atomic>
using namespace property;
using namespace thermo;
using namespace transition;
using namespace idx;
using namespace graupel_ct;
struct t_qx_ptr
{
t_qx_ptr (array_1d_t< real_t > &p_, array_1d_t< real_t > &x_)
: p (p_), x (x_)
{
}
array_1d_t< real_t > &p;
array_1d_t< real_t > &x;
}; // pointer vector
/**
* @brief TODO
*
* @param precip time step for integration of microphysics (s)
* @param params fall speed parameters
* @param zeta dt/(2dz)
* @param vc state dependent fall speed correction
* @param flx flux into cell from above
* @param vt terminal velocity
* @param q specific mass of hydrometeor
* @param q_kp1 specific mass in next lower cell
* @param rho density
*/
void
precip (const real_t (¶ms)[3],
real_t (&precip)[3],
real_t zeta,
real_t vc,
real_t flx,
real_t vt,
real_t q,
real_t q_kp1,
real_t rho)
{
real_t rho_x, flx_eff, flx_partial;
rho_x = q * rho;
flx_eff = (rho_x / zeta) + static_cast< real_t > (2.0) * flx;
flx_partial = rho_x * vc * fall_speed (rho_x, params);
flx_partial = std::fmin (flx_partial, flx_eff);
precip[0] = (zeta * (flx_eff - flx_partial))
/ ((static_cast< real_t > (1.0) + zeta * vt) * rho); // q update
precip[1] = (precip[0] * rho * vt + flx_partial)
* static_cast< real_t > (0.5); // flx
rho_x = (precip[0] + q_kp1) * static_cast< real_t > (0.5) * rho;
precip[2] = vc * fall_speed (rho_x, params); // vt
}
struct Result
{
bool flag;
bool is_sig;
};
void
graupel (size_t &nvec,
size_t &ke,
size_t &ivstart,
size_t &ivend,
size_t &kstart,
real_t &dt,
array_1d_t< real_t > &dz,
array_1d_t< real_t > &t,
array_1d_t< real_t > &rho,
array_1d_t< real_t > &p,
array_1d_t< real_t > &qv,
array_1d_t< real_t > &qc,
array_1d_t< real_t > &qi,
array_1d_t< real_t > &qr,
array_1d_t< real_t > &qs,
array_1d_t< real_t > &qg,
real_t &qnc,
array_1d_t< real_t > &prr_gsp,
array_1d_t< real_t > &pri_gsp,
array_1d_t< real_t > &prs_gsp,
array_1d_t< real_t > &prg_gsp,
array_1d_t< real_t > &pre_gsp,
array_1d_t< real_t > &pflx)
{
// std::cout << "sequential graupel" << std::endl;
array_1d_t< char > is_sig_present (
nvec * ke); // is snow, ice or graupel present?
array_1d_t< size_t > ind_k (nvec * ke),
ind_i (nvec * ke); // k index of gathered point, iv index of gathered point
array_2d_t< size_t > kmin (
nvec, array_1d_t< size_t > (np)); // first level with condensate
real_t cv, vc, eta, zeta, qvsi, qice, qliq, qtot, dvsw, dvsw0, dvsi, n_ice,
m_ice, x_ice, n_snow, l_snow, ice_dep, e_int, stot, xrho;
real_t update[3], // scratch array with output from precipitation step
sink[nx], // tendencies
dqdt[nx]; // tendencies
array_1d_t< real_t > eflx (
nvec); // internal energy flux from precipitation (W/m2 )
array_2d_t< real_t > sx2x (
nx, array_1d_t< real_t > (nx, ZERO)), // conversion rates
vt (nvec,
array_1d_t< real_t > (
np)); // terminal velocity for different hydrometeor categories
array_1d_t< t_qx_ptr >
q{}; // vector of pointers to point to four hydrometeor inouts
array_1d_t< real_t > emptyArray;
q.emplace_back (prr_gsp, qr);
q.emplace_back (pri_gsp, qi);
q.emplace_back (prs_gsp, qs);
q.emplace_back (prg_gsp, qg);
q.emplace_back (emptyArray, qc);
q.emplace_back (emptyArray, qv);
size_t jmx = 0;
size_t jmx_ = jmx;
size_t count = ke * (ivend - ivstart);
std::vector< int > flag_array (count);
std::vector< int > is_sig_array (count);
std::for_each_n (std::execution::par,
std::views::iota (0).begin (),
count,
[=,
// global variable
lqc = lqc,
lqr = lqr,
lqs = lqs,
lqi = lqi,
lqg = lqg,
qmin = qmin,
tfrz_het2 = tfrz_het2,
np = np,
ZERO = ZERO,
c1es = c1es,
c3ies = c3ies,
tmelt = thermodyn::tmelt,
c4ies = c4ies,
rv = rv,
// vector / vector reference / vector pointer
ind_k = ind_k.data (),
ind_i = ind_i.data (),
is_sig_present
= is_sig_present.data (), // this is vector<bool>, change to vector<char>
kmin = kmin.data (),
t = t.data (),
rho = rho.data (),
flag_array = flag_array.data (),
is_sig_array = is_sig_array.data (),
// struct with vector
q_lqc_x = q[lqc].x.data (),
q_lqr_x = q[lqr].x.data (),
q_lqs_x = q[lqs].x.data (),
q_lqi_x = q[lqi].x.data (),
q_lqg_x = q[lqg].x.data (),
q_lqv_x = q[lqv].x.data ()] (size_t jmx)
{
size_t i_offset = jmx / (ivend - ivstart);
size_t j_offset = jmx % (ivend - ivstart);
size_t i = ke - 1 - i_offset;
size_t j = ivstart + j_offset;
size_t oned_vec_index = i * ivend + j;
//printf ("%llu, %llu, %llu\n", i, j, jmx);
bool cond
= ((std::max (std::max (std::max (std::max (q_lqc_x[oned_vec_index],
q_lqr_x[oned_vec_index]),
q_lqs_x[oned_vec_index]),
q_lqi_x[oned_vec_index]),
q_lqg_x[oned_vec_index])
> qmin)
|| ((t[oned_vec_index] < tfrz_het2)
&& (q_lqv_x[oned_vec_index]
> (c1es
* exp (c3ies * (t[oned_vec_index] - thermodyn::tmelt)
/ (t[oned_vec_index] - c4ies))
/ (rho[oned_vec_index] * rv * t[oned_vec_index])))));
if (cond)
{
flag_array[jmx] = true;
is_sig_array[jmx] = (std::max (std::max (q_lqs_x[oned_vec_index],
q_lqi_x[oned_vec_index]),
q_lqg_x[oned_vec_index])
> qmin);
}
});
// The loop is intentionally i<nlev; since we are using an unsigned integer
// data type, when i reaches 0, and you try to decrement further, (to -1), it
// wraps to the maximum value representable by size_t.
size_t oned_vec_index;
//std::vector< int > counter (count, 0);
//counter = counter.data (),
//counter[jmx]++;
//int sum = 0;
//for (auto i : counter)
//{
//sum += i;
//}
//std::cout << "DEBUG counter= " << sum << std::endl;
for (size_t idx = 0; idx < flag_array.size (); ++idx)
{
if (flag_array[idx])
{
size_t i_offset = idx / (ivend - ivstart);
size_t j_offset = idx % (ivend - ivstart);
size_t i = ke - 1 - i_offset;
size_t j = ivstart + j_offset;
ind_k[jmx_] = i;
ind_i[jmx_] = j;
is_sig_present[jmx_] = is_sig_array[idx];
jmx_++;
}
}
//for (auto i : ind_k)
//{
//std::cout << i;
//}
//std::cout << std::endl;
for (size_t i = ke - 1; i < ke; --i)
{
for (size_t j = ivstart; j < ivend; j++)
{
size_t oned_vec_index = i * ivend + j;
for (size_t ix = 0; ix < np; ix++)
{
if (i == (ke - 1))
{
kmin[j][qp_ind[ix]] = ke + 1;
q[qp_ind[ix]].p[j] = ZERO;
vt[j][ix] = ZERO;
}
if (q[qp_ind[ix]].x[oned_vec_index] > qmin)
{
kmin[j][qp_ind[ix]] = i;
}
}
}
}
```
### sbatch
```
#!/bin/bash
#SBATCH --account=ka1273
#SBATCH --job-name=scc
#SBATCH --partition=gpu
#SBATCH --gpus=1
#SBATCH --output=%x.%j.log
#SBATCH --exclusive
#SBATCH --time=00:10:00
# compiler flags
NOFLAGS="-O0 -std=c++20 -stdpar=gpu -gpu=cc80,noflushz -nofma -Minfo -Kieee -Mfcon -Mnofma -Mdalign -Minfo=all"
FLAGS="-O0 -std=c++20 -stdpar=gpu -gpu=cc80 -nofma"
#FLAGS='-O0 -stdpar=gpu -gpu=cc80,nofma,noflushz'
# input file
IN_FILE=$(pwd)/tasks/11k.nc
# output file
OUT_FILE=output_gpu_double.nc
# reference file associated with input file
REF_FILE=$(pwd)/reference_results/seq_11k_double.nc
# setup env
. scripts/levante-setup.sh nvidia gpu
rm -r build_std_gpu
# build the code
cmake -B build_std_gpu -S . \
-DMU_IMPL=std \
-DMU_ARCH=a100 \
-DMU_ENABLE_SINGLE=OFF \
-DCMAKE_CXX_COMPILER=nvc++ \
-DCMAKE_CXX_FLAGS="${FLAGS}"
cmake --build build_std_gpu -j 2
# cleanup previous results (if any)
if [ -f $OUT_FILE ]; then
rm $OUT_FILE
fi
# run the executable
./build_std_gpu/bin/graupel $IN_FILE $OUT_FILE
# check correctness
. scripts/check-correctness.sh
check $OUT_FILE $REF_FILE
echo "Job done!"
```