# Different approaches to make reverse-indices-in-groups filling faster
[Results table](https://docs.google.com/spreadsheets/d/1ZkAHW2fJy8q5UfkVBx0PyaPF4BUdbHoTf60xSgFR18M/edit?usp=sharing)
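Before the variants, here is my reading of what the function has to compute (a toy, single-threaded sketch, not code from the patch): `group_indices` maps every element to its group, `offsets` describes where each group starts in the result, and the result lists, per group, the indices of the elements that belong to it.
```cpp
#include <cassert>
#include <vector>

int main()
{
  /* Element -> group. */
  const std::vector<int> group_indices = {1, 0, 1, 2, 0};
  /* Group g occupies [offsets[g], offsets[g + 1]) in the result. */
  const std::vector<int> offsets = {0, 2, 4, 5};
  /* Expected result: group 0 -> {1, 4}, group 1 -> {0, 2}, group 2 -> {3}. */
  const std::vector<int> expected = {1, 4, 0, 2, 3};

  std::vector<int> results(group_indices.size());
  /* One write cursor per group, starting at the group's begin offset. */
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (int i = 0; i < int(group_indices.size()); i++) {
    const int group = group_indices[size_t(i)];
    results[size_t(cursor[size_t(group)]++)] = i;
  }
  assert(results == expected);
  return 0;
}
```
All variants below compute exactly this; they differ only in how the write positions are tracked.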
## Old
```cpp
static Array<int> reverse_indices_in_groups_old(const Span<int> group_indices,
                                                const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* Counters start at -1 because `atomic_add_and_fetch_int32` returns the incremented value. */
  Array<int> counts(offsets.size(), -1);
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int index_in_group = atomic_add_and_fetch_int32(&counts[group_index], 1);
      results[offsets[group_index][index_in_group]] = int(i);
    }
  });
  sort_small_groups(offsets, 1024, results);
  return results;
}
```
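All variants end with `sort_small_groups`, which is not shown in this note. My guess at its job: the atomic counters hand out slots in whatever order the threads arrive, so the indices inside each group come out unsorted and need a per-group sort to make the result deterministic. A minimal single-threaded sketch of that idea (the real helper presumably parallelizes over groups with the given grain size):
```cpp
#include <algorithm>
#include <vector>

/* Toy version: sort the slice of the result that belongs to each group. */
void sort_groups(const std::vector<int> &offsets, std::vector<int> &r_indices)
{
  for (size_t group = 0; group + 1 < offsets.size(); group++) {
    std::sort(r_indices.begin() + offsets[group], r_indices.begin() + offsets[group + 1]);
  }
}
```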
## With a manually allocated array for the int counters
```cpp
static Array<int> reverse_indices_in_groups_malloc(const Span<int> group_indices,
                                                   const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* `calloc` can be measurably faster than a parallel fill of zero. */
  int *counts = MEM_cnew_array<int>(size_t(offsets.size()), __func__);
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int index_in_group = atomic_fetch_and_add_int32(&counts[group_index], 1);
      results[offsets[group_index][index_in_group]] = int(i);
    }
  });
  sort_small_groups(offsets, 1024, results);
  MEM_freeN(counts);
  return results;
}
```
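Side note on why the counters start at 0 here but at -1 in the old version: `atomic_fetch_and_add_int32` returns the value before the increment, while `atomic_add_and_fetch_int32` returns the value after it. A tiny illustration using `std::atomic` (not Blender's atomics API):
```cpp
#include <atomic>
#include <cassert>

int main()
{
  std::atomic<int> count_fetch_add{0};   /* Like the zero-initialized counters here. */
  std::atomic<int> count_add_fetch{-1};  /* Like the -1-initialized counters in the old version. */
  /* The first element of a group must get index 0 in both schemes. */
  assert(count_fetch_add.fetch_add(1) == 0);      /* Fetch-and-add returns the old value. */
  assert(count_add_fetch.fetch_add(1) + 1 == 0);  /* Add-and-fetch returns the new value. */
  return 0;
}
```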
## A temporary copy of the offsets (prefix sum) used directly as write cursors
```cpp
static Array<int> reverse_indices_in_groups_copy_offsets(const Span<int> group_indices,
                                                         const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < groups.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* A mutable copy of the group start offsets is advanced directly, so the atomic add
   * already yields the destination index without the `offsets[group][index]` lookup. */
  Array<int> offsets(groups.size());
  array_utils::copy(groups.data().drop_back(1), offsets.as_mutable_span());
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int result_index = atomic_fetch_and_add_int32(&offsets[group_index], 1);
      results[result_index] = int(i);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```
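Why a mutable copy of the start offsets can replace the counters entirely: the k-th slot of group `g` is just `start[g] + k`, so atomically advancing the copied `start[g]` produces the destination index directly. A toy model with plain `std::vector` (not `blender::OffsetIndices`):
```cpp
#include <cassert>
#include <vector>

int main()
{
  /* Group g spans [offsets[g], offsets[g + 1]) in the result. */
  const std::vector<int> offsets = {0, 3, 5, 9};
  /* Mutable copy of the start offsets, used as per-group write cursors. */
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);

  const int g = 1;
  const int first = cursor[g]++;   /* Stands in for atomic_fetch_and_add_int32. */
  const int second = cursor[g]++;
  assert(first == offsets[g] + 0);
  assert(second == offsets[g] + 1);
  return 0;
}
```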
## Temporary copies of the offsets and the group indices, rewritten in the main loop, then scattered
```cpp
static Array<int> reverse_indices_in_groups_copy_offsets_and_v(const Span<int> group_indices,
                                                               const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < groups.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* Copy both the start offsets and the group indices. The first pass rewrites every
   * group index into its destination index, the second pass scatters the iota. */
  Array<int> offsets(groups.size());
  Array<int> tmp_results(group_indices.size());
  array_utils::copy(groups.data().drop_back(1), offsets.as_mutable_span());
  array_utils::copy(group_indices, tmp_results.as_mutable_span());
  threading::parallel_for(group_indices.index_range(), 2048, [&](const IndexRange range) {
    for (int &group_index : tmp_results.as_mutable_span().slice(range)) {
      group_index = atomic_fetch_and_add_int32(&offsets[group_index], 1);
    }
  });
  Array<int> results(group_indices.size());
  threading::parallel_for(results.index_range(), 2048, [&](const IndexRange range) {
    for (const int64_t index : range) {
      results[tmp_results[index]] = int(index);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```
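The second pass is what the headings call "scatter_iota": after the first pass every entry of `tmp_results` is a destination index, and the second pass just scatters the iota 0..n into those destinations. A standalone sketch of that step (the name and the free-standing helper are mine; the code above does the same inline with `threading::parallel_for`):
```cpp
#include <cassert>
#include <vector>

/* Write each source index i to results[dst_indices[i]]. */
void scatter_iota(const std::vector<int> &dst_indices, std::vector<int> &r_results)
{
  for (size_t i = 0; i < dst_indices.size(); i++) {
    r_results[size_t(dst_indices[i])] = int(i);
  }
}

int main()
{
  const std::vector<int> dst_indices = {2, 0, 1};
  std::vector<int> results(3);
  scatter_iota(dst_indices, results);
  assert(results == (std::vector<int>{1, 2, 0}));
  return 0;
}
```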
## Only a temporary copy of the group indices, rewritten in the main loop, then scattered
```cpp
static Array<int> reverse_indices_in_groups_copy_v(const Span<int> group_indices,
                                                   const OffsetIndices<int> groups)
{
  SCOPED_TIMER_AVERAGED(__func__);
  Array<int> group_indices_tmp(group_indices.size());
  /* Counters start at -1 because `atomic_add_and_fetch_int32` returns the incremented value. */
  Array<int> counts(groups.size(), -1);
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    /* Copy this chunk of group indices locally, then rewrite each one in place
     * into its destination index. */
    MutableSpan<int> local_group_indices = group_indices_tmp.as_mutable_span().slice(range);
    const Span<int> local_group_indices_src = group_indices.slice(range);
    std::copy(local_group_indices_src.begin(),
              local_group_indices_src.end(),
              local_group_indices.begin());
    for (int &group_index : local_group_indices) {
      const int index_in_group = atomic_add_and_fetch_int32(&counts[group_index], 1);
      group_index = int(groups[group_index][index_in_group]);
    }
  });
  Array<int> results(group_indices.size());
  threading::parallel_for(results.index_range(), 2048, [&](const IndexRange range) {
    for (const int64_t index : range) {
      results[group_indices_tmp[index]] = int(index);
    }
  });
  sort_small_groups(groups, 1024, results);
  return results;
}
```
## Results
![](https://hackmd.io/_uploads/rJA5m9w0n.png)
![](https://hackmd.io/_uploads/SkRqX9DRn.png)