Cuboid with 1000x1000x1000 points, 150 iterations:

1. reverse_indices_in_groups_new2: 209.8 ms
2. reverse_indices_in_groups_new0: 212.1 ms
3. reverse_indices_in_groups_old: 219.6 ms
4. reverse_indices_in_groups_new1: 222.0 ms
5. reverse_indices_in_groups_oldest: 341.6 ms

## reverse_indices_in_groups_new0

```Cpp
static Array<int> reverse_indices_in_groups_new0(const Span<int> group_indices,
                                                 const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }

  static constexpr const int list_start = -1;
  Array<int> counts(offsets.size(), list_start);
  Array<int> group_lists(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    MutableSpan<int> local_group_lists = group_lists.as_mutable_span().slice(range);
    std::iota(local_group_lists.begin(), local_group_lists.end(), range.first());
    for (int &index : local_group_lists) {
      const int group_index = group_indices[index];
      atomic_a_swap_b(counts[group_index], index);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(counts.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group_v : range) {
      MutableSpan<int> dst_results = results.as_mutable_span().slice(offsets[group_v]);
      int next_index = counts[group_v];
      dst_results.first() = next_index;
      for (int &result_value : dst_results.drop_front(1)) {
        next_index = group_lists[next_index];
        result_value = next_index;
      }
      std::sort(dst_results.begin(), dst_results.end());
    }
  });
  BLI_assert(!results.as_span().contains(list_start));
  return results;
}
```

## reverse_indices_in_groups_new1

```Cpp
static Array<int> reverse_indices_in_groups_new1(const Span<int> group_indices,
                                                 const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }

  static constexpr const int list_start = -1;
  Array<int> counts(offsets.size(), list_start);
  Array<int> group_lists(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      group_lists[i] = int(i);
      atomic_a_swap_b(counts[group_index], group_lists[i]);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(counts.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group_v : range) {
      MutableSpan<int> dst_results = results.as_mutable_span().slice(offsets[group_v]);
      int next_index = counts[group_v];
      dst_results.first() = next_index;
      for (int &result_value : dst_results.drop_front(1)) {
        next_index = group_lists[next_index];
        result_value = next_index;
      }
      std::sort(dst_results.begin(), dst_results.end());
    }
  });
  BLI_assert(!results.as_span().contains(list_start));
  return results;
}
```

## reverse_indices_in_groups_new2

```Cpp
static Array<int> reverse_indices_in_groups_new2(const Span<int> group_indices,
                                                 const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }

  static constexpr const int list_start = -1;
  Array<int> counts(offsets.size(), list_start);
  Array<int> group_lists(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    MutableSpan<int> local_group_lists = group_lists.as_mutable_span().slice(range);
    std::iota(local_group_lists.begin(), local_group_lists.end(), range.first());
    for (int &index : local_group_lists) {
      const int group_index = group_indices[index];
      atomic_a_swap_b(counts[group_index], index);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(counts.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group_v : range) {
      MutableSpan<int> dst_results = results.as_mutable_span().slice(offsets[group_v]);
      int next_index = counts[group_v];
      dst_results.first() = next_index;
      for (int &result_value : dst_results.drop_front(1)) {
        next_index = group_lists[next_index];
        result_value = next_index;
      }
    }
  });
  BLI_assert(!results.as_span().contains(list_start));
  sort_small_groups(offsets, 1024, results);
  return results;
}
```
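For context, all five variants compute the same result: for each group, the ascending list of element indices assigned to it, written into the slice of the output described by `offsets`. Below is a minimal serial sketch of that contract with plain standard containers and made-up data; it is only an illustration, not Blender code.

```Cpp
#include <cassert>
#include <vector>

/* Serial reference: invert the element->group mapping. `offsets[g]`..`offsets[g + 1]`
 * is the destination range of group `g` and must match the group sizes. */
static std::vector<int> reverse_indices_in_groups_reference(const std::vector<int> &group_indices,
                                                            const std::vector<int> &offsets)
{
  std::vector<int> results(group_indices.size());
  /* One running write cursor per group, starting at the group's offset. */
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (int i = 0; i < int(group_indices.size()); i++) {
    /* Elements are visited in ascending order, so every group slice ends up sorted. */
    results[cursor[group_indices[i]]++] = i;
  }
  return results;
}

int main()
{
  /* Made-up example: elements 0..5 assigned to 3 groups. */
  const std::vector<int> group_indices = {1, 0, 2, 1, 0, 1};
  const std::vector<int> offsets = {0, 2, 5, 6}; /* Group sizes: 2, 3, 1. */
  const std::vector<int> results = reverse_indices_in_groups_reference(group_indices, offsets);
  /* Group 0 -> {1, 4}, group 1 -> {0, 3, 5}, group 2 -> {2}. */
  assert((results == std::vector<int>{1, 4, 0, 3, 5, 2}));
  return 0;
}
```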
## reverse_indices_in_groups_old

```Cpp
static Array<int> reverse_indices_in_groups_old(const Span<int> group_indices,
                                                const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  constexpr const int list_start = -1;
  Array<int> lists_ends(offsets.size(), list_start);
  Array<int> group_lists(group_indices.size());
  /* Build multiple singly linked lists, one per group. Each element points to the previously
   * inserted element of its group, or to `list_start`. Each list is as long as its group.
   * The head of each group's list is stored in the `lists_ends` array. */
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      group_lists[i] = int(i);
      /* Swap the current index with whatever is stored in the list head. */
      atomic_a_swap_b(lists_ends[group_index], group_lists[i]);
    }
  });

  Array<int> results(group_indices.size());
  threading::parallel_for(lists_ends.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group_v : range) {
      MutableSpan<int> dst_results = results.as_mutable_span().slice(offsets[group_v]);
      int next_index = lists_ends[group_v];
      dst_results.first() = next_index;
      for (int &result_value : dst_results.drop_front(1)) {
        next_index = group_lists[next_index];
        result_value = next_index;
      }
    }
  });
  sort_small_groups(offsets, 1024, results);
  return results;
}
```
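The linked-list trick shared by `_old` and the `_new*` variants can be shown without threads or atomics. In the single-threaded sketch below (same made-up data as above), each element stores the current group head as its `next` pointer and becomes the new head; in the parallel versions those two statements are the single atomic swap of the head with `group_lists[i]`. Walking a list from its head yields the group's indices in reverse insertion order, and with multiple threads that order also depends on scheduling, which is why every group is sorted afterwards.

```Cpp
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  /* Same made-up input as the reference sketch above. */
  const std::vector<int> group_indices = {1, 0, 2, 1, 0, 1};
  const std::vector<int> offsets = {0, 2, 5, 6};
  const int groups_num = int(offsets.size()) - 1;

  /* Phase 1: build one singly linked list per group. `heads[g]` is the most recently
   * inserted element of group `g`, `next[i]` is the element inserted before `i`. */
  std::vector<int> heads(groups_num, -1);
  std::vector<int> next(group_indices.size());
  for (int i = 0; i < int(group_indices.size()); i++) {
    next[i] = heads[group_indices[i]];
    heads[group_indices[i]] = i;
  }

  /* Phase 2: unroll every list into its slice of `results`, then sort the slice,
   * because the list yields indices in reverse insertion order. */
  std::vector<int> results(group_indices.size());
  for (int g = 0; g < groups_num; g++) {
    int index = heads[g];
    for (int dst = offsets[g]; dst < offsets[g + 1]; dst++) {
      results[dst] = index;
      index = next[index];
    }
    std::sort(results.begin() + offsets[g], results.begin() + offsets[g + 1]);
  }

  assert((results == std::vector<int>{1, 4, 0, 3, 5, 2}));
  return 0;
}
```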
## reverse_indices_in_groups_oldest

```Cpp
static Array<int> reverse_indices_in_groups_oldest(const Span<int> group_indices,
                                                   const OffsetIndices<int> offsets)
{
  SCOPED_TIMER_AVERAGED(__func__);
  if (group_indices.is_empty()) {
    return {};
  }
  BLI_assert(*std::max_element(group_indices.begin(), group_indices.end()) < offsets.size());
  BLI_assert(*std::min_element(group_indices.begin(), group_indices.end()) >= 0);

  /* `counts` keeps track of how many elements have been added to each group, and is incremented
   * atomically by many threads in parallel. `calloc` can be measurably faster than a parallel
   * fill of zero. Alternatively the offsets could be copied and incremented directly, but the
   * cost of the copy is slightly higher than the cost of `calloc`. */
  int *counts = MEM_cnew_array<int>(size_t(offsets.size()), __func__);
  BLI_SCOPED_DEFER([&]() { MEM_freeN(counts); })
  Array<int> results(group_indices.size());
  threading::parallel_for(group_indices.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t i : range) {
      const int group_index = group_indices[i];
      const int index_in_group = atomic_fetch_and_add_int32(&counts[group_index], 1);
      results[offsets[group_index][index_in_group]] = int(i);
    }
  });
  sort_small_groups(offsets, 1024, results);
  return results;
}
```

---

## sort_small_groups

```Cpp
static void sort_small_groups(const OffsetIndices<int> groups,
                              const int grain_size,
                              MutableSpan<int> indices)
{
  threading::parallel_for(groups.index_range(), grain_size, [&](const IndexRange range) {
    for (const int64_t index : range) {
      MutableSpan<int> group = indices.slice(groups[index]);
      std::sort(group.begin(), group.end());
    }
  });
}
```

## atomic_a_swap_b

```Cpp
static void atomic_a_swap_b(int &a, int &b)
{
  /* Atomically exchange the values: install `b` into `a` exactly once and hand the previous
   * value of `a` back through `b`. The CAS has succeeded when the observed value equals the
   * expected one; otherwise retry with the value that was actually observed. */
  int prev_a_state = a;
  while (true) {
    const int result = atomic_cas_int32(&a, prev_a_state, b);
    if (result == prev_a_state) {
      break;
    }
    prev_a_state = result;
  }
  b = prev_a_state;
}
```
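For comparison only, the same helper written with standard atomics. This is a sketch assuming C++20 `std::atomic_ref` is acceptable (the benchmarked code uses Blender's `atomic_ops`, and the name `atomic_a_swap_b_std` is made up here); it mainly shows that the operation is a plain atomic exchange, so no explicit CAS loop is needed.

```Cpp
#include <atomic>

/* Standard-atomics sketch: publish `b` into `a` and receive the previous value of `a`
 * back in `b`. `exchange()` performs the swap in a single atomic read-modify-write. */
static void atomic_a_swap_b_std(int &a, int &b)
{
  std::atomic_ref<int> a_ref(a);
  b = a_ref.exchange(b, std::memory_order_acq_rel);
}
```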