# Different approaches to make `reverse_indices_in_groups` faster

[Results table](https://docs.google.com/spreadsheets/d/1ZkAHW2fJy8q5UfkVBx0PyaPF4BUdbHoTf60xSgFR18M/edit?usp=sharing)

## Old

```cpp
static Array<int> reverse_indices_in_groups_old(const Span<int> group_indices,
                                                const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  Array<int> counts(offsets.size(), -1);
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int index_in_group = atomic_add_and_fetch_int32(&counts[group_index], 1);
      results[offsets[group_index][index_in_group]] = int(i);
    }
  });
  sort_small_groups(offsets, 1024, results);
  return results;
}
```
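Every variant below ends with a call to `sort_small_groups`, which is not shown in this note. Because the atomic increments hand out slots in whatever order the threads happen to run, the indices inside each group come out shuffled, and the helper restores ascending order per group. The following is only a plausible sketch inferred from the call sites (the signature and grain size come from the calls above, the body is an assumption; the "small" in the name hints that the real helper may treat large groups differently):

```cpp
static void sort_small_groups(const OffsetIndices<int> groups,
                              const int grain_size,
                              MutableSpan<int> indices)
{
  /* Restore ascending order inside every group; the atomic scatter hands out
   * destination slots in a scheduling-dependent order. */
  threading::parallel_for(groups.index_range(), grain_size, [&](const IndexRange range) {
    for (const int64_t group : range) {
      MutableSpan<int> slice = indices.slice(groups[group]);
      std::sort(slice.begin(), slice.end());
    }
  });
}
```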
## Manual allocation of the array of per-group int counters

```cpp
static Array<int> reverse_indices_in_groups_malloc(const Span<int> group_indices,
                                                   const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* `calloc` can be measurably faster than a parallel fill of zero. */
  int *counts = MEM_cnew_array<int>(size_t(offsets.size()), __func__);
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int index_in_group = atomic_fetch_and_add_int32(&counts[group_index], 1);
      results[offsets[group_index][index_in_group]] = int(i);
    }
  });
  sort_small_groups(offsets, 1024, results);
  MEM_freeN(counts);
  return results;
}
```

## Temporary copy of the prefix-sum offsets, used directly as write positions

```cpp
static Array<int> reverse_indices_in_groups_copy_offsets(const Span<int> group_indices,
                                                         const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < groups.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* Use a mutable copy of the existing offsets as per-group write cursors to fill the groups in
   * parallel. */
  Array<int> offsets(groups.size());
  array_utils::copy(groups.data().drop_back(1), offsets.as_mutable_span());
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int result_index = atomic_fetch_and_add_int32(&offsets[group_index], 1);
      results[result_index] = int(i);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```

## Temporary copies of both the offsets and the group indices, rewritten in the main loop and then used to scatter an iota

```cpp
static Array<int> reverse_indices_in_groups_copy_offsets_and_v(const Span<int> group_indices,
                                                               const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < groups.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* Use a mutable copy of the existing offsets as per-group write cursors to fill the groups in
   * parallel. */
  Array<int> offsets(groups.size());
  Array<int> tmp_results(group_indices.size());
  array_utils::copy(groups.data().drop_back(1), offsets.as_mutable_span());
  array_utils::copy(group_indices, tmp_results.as_mutable_span());

  threading::parallel_for(group_indices.index_range(), 2048, [&](const IndexRange range) {
    for (int &group_index : tmp_results.as_mutable_span().slice(range)) {
      group_index = atomic_fetch_and_add_int32(&offsets[group_index], 1);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(results.index_range(), 2048, [&](const IndexRange range) {
    for (const int64_t index : range) {
      results[tmp_results[index]] = int(index);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```

## Only a temporary copy of the group indices, rewritten in the main loop and then used to scatter an iota

```cpp
static Array<int> reverse_indices_in_groups_copy_v(const Span<int> group_indices,
                                                   const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);

  Array<int> group_indices_tmp(group_indices.size());
  Array<int> counts(groups.size(), -1);
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    MutableSpan<int> local_group_indices = group_indices_tmp.as_mutable_span().slice(range);
    const Span<int> local_group_indices_src = group_indices.slice(range);
    std::copy(local_group_indices_src.begin(),
              local_group_indices_src.end(),
              local_group_indices.begin());
    for (int &group_index : local_group_indices) {
      const int index_in_group = atomic_add_and_fetch_int32(&counts[group_index], 1);
      group_index = int(groups[group_index][index_in_group]);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(results.index_range(), 2048, [&](const IndexRange range) {
    for (const int64_t index : range) {
      results[group_indices_tmp[index]] = int(index);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```

## Results

![](https://hackmd.io/_uploads/rJA5m9w0n.png)
![](https://hackmd.io/_uploads/SkRqX9DRn.png)
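For readers outside the Blender code base, here is a minimal standalone sketch of the core technique all of the variants share: an atomic per-group cursor hands out destination slots inside each group's slice of the output, and each slice is sorted afterwards because the atomic order depends on thread scheduling. The function name is made up for illustration, the `offsets` layout (one start offset per group plus the total at the end) is assumed to mirror `OffsetIndices`, and the loop is kept single-threaded for brevity.

```cpp
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <vector>

static std::vector<int> reverse_indices_in_groups_sketch(const std::vector<int> &group_indices,
                                                         const std::vector<int> &offsets)
{
  /* The cursors start at each group's start offset, mirroring the "copy offsets" variants. */
  std::vector<std::atomic<int>> cursors(offsets.size() - 1);
  for (std::size_t group = 0; group < cursors.size(); group++) {
    cursors[group].store(offsets[group], std::memory_order_relaxed);
  }

  std::vector<int> results(group_indices.size());
  /* A real implementation splits this loop across threads; the atomic fetch-add makes
   * concurrent increments of the same group's cursor safe. */
  for (std::size_t i = 0; i < group_indices.size(); i++) {
    const int group = group_indices[i];
    const int dst = cursors[group].fetch_add(1, std::memory_order_relaxed);
    results[dst] = int(i);
  }

  /* With real threads the order inside each group depends on scheduling, so each group's
   * slice is sorted back into ascending order (the role `sort_small_groups` plays above). */
  for (std::size_t group = 0; group + 1 < offsets.size(); group++) {
    std::sort(results.begin() + offsets[group], results.begin() + offsets[group + 1]);
  }
  return results;
}
```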