### Datasets:
1. **1.100.000** edges and face corners: 
2. **8.600.000** edges and face corners: 
### Original version:
1. `Timer 'build_edge_to_loop_map': (Average: 25.5 ms, Min: 22.3 ms, Last: 27.7 ms)`
2. `Timer 'build_edge_to_loop_map': (Average: 509.1 ms, Min: 405.2 ms, Last: 539.0 ms)`
```C++
/* Build, for every edge, the list of face corners that reference it.
 * Classic counting-sort scatter: compute per-edge bucket offsets, then walk
 * the corners once and drop each one into the next free slot of its bucket. */
GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
                                        const int edges_num,
                                        Array<int> &r_offsets,
                                        Array<int> &r_indices)
{
  SCOPED_TIMER_AVERAGED(__func__);
  r_offsets = create_reverse_offsets(corner_edges, edges_num);
  r_indices.reinitialize(r_offsets.last());
  /* Number of corners already written into each edge's bucket. */
  Array<int> used(edges_num, 0);
  for (int64_t corner = 0; corner < corner_edges.size(); corner++) {
    const int edge = corner_edges[corner];
    const int slot = used[edge]++;
    r_indices[r_offsets[edge] + slot] = int(corner);
  }
  return {OffsetIndices<int>(r_offsets), r_indices};
}
```
### Iota-sorting and offsets parallel:
1. `Timer 'build_edge_to_loop_map': (Average: 18.5 ms, Min: 17.5 ms, Last: 18.7 ms)`
2. `Timer 'build_edge_to_loop_map': (Average: 250.9 ms, Min: 241.1 ms, Last: 243.5 ms)`
```C++
/* Return the corner indices reordered so they are grouped by edge index,
 * keeping corners that share an edge in ascending (original) order. */
static Array<int> gather_reverse(const Span<int> indices)
{
  Array<int> order(indices.size());
  std::iota(order.begin(), order.end(), 0);
  parallel_sort(order.begin(), order.end(), [indices](const int a, const int b) -> bool {
    if (indices[a] != indices[b]) {
      return indices[a] < indices[b];
    }
    /* Equal edges are rare; break the tie by original position so the
     * resulting order is stable. */
    return a < b;
  });
  return order;
}
/* Build, for every edge, the list of face corners that reference it. */
GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
const int edges_num,
Array<int> &r_offsets,
Array<int> &r_indices)
{
SCOPED_TIMER_AVERAGED(__func__);
/* The bucket offsets and the edge-sorted corner order are computed from
 * `corner_edges` independently of each other, so run both concurrently. */
threading::parallel_invoke(
[&]() { r_offsets = create_reverse_offsets(corner_edges, edges_num); },
[&]() { r_indices = gather_reverse(corner_edges); });
return {OffsetIndices<int>(r_offsets), r_indices};
}
```
### Iota-sorting and offsets non-parallel:
1. `Timer 'build_edge_to_loop_map': (Average: 19.2 ms, Min: 18.4 ms, Last: 19.0 ms)`
2. `Timer 'build_edge_to_loop_map': (Average: 266.9 ms, Min: 258.9 ms, Last: 258.9 ms)`
```C++
/* Return the corner indices reordered so they are grouped by edge index.
 * The tie-break on `a < b` keeps corners of the same edge in ascending
 * (original) order, i.e. the sort is effectively stable. */
static Array<int> gather_reverse(const Span<int> indices)
{
Array<int> results(indices.size());
std::iota(results.begin(), results.end(), 0);
parallel_sort(results.begin(), results.end(), [indices](const int a, const int b) -> bool {
if (UNLIKELY(indices[a] == indices[b])) {
return a < b;
}
return indices[a] < indices[b];
});
return results;
}
/* Build, for every edge, the list of face corners that reference it.
 * Same as the parallel_invoke variant, but the two steps run sequentially. */
GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
const int edges_num,
Array<int> &r_offsets,
Array<int> &r_indices)
{
SCOPED_TIMER_AVERAGED(__func__);
r_offsets = create_reverse_offsets(corner_edges, edges_num);
r_indices = gather_reverse(corner_edges);
return {OffsetIndices<int>(r_offsets), r_indices};
}
```
### Iota-sorting and offsets non-parallel, with a second step to make the group indices order stable:
*Previously, stability was provided by a more complex comparator in `gather_reverse`.*
1. `Timer 'build_edge_to_loop_map': (Average: 17.5 ms, Min: 16.5 ms, Last: 17.3 ms)`
2. `Timer 'build_edge_to_loop_map': (Average: 212.1 ms, Min: 204.7 ms, Last: 220.8 ms)`
```C++
/* Return the corner indices grouped by edge index. Corners that share an
 * edge end up in unspecified relative order; a later per-group sort is
 * responsible for restoring a canonical order. */
static Array<int> gather_reverse(const Span<int> indices)
{
  Array<int> order(indices.size());
  std::iota(order.begin(), order.end(), 0);
  const auto by_edge = [indices](const int a, const int b) -> bool {
    return indices[a] < indices[b];
  };
  parallel_sort(order.begin(), order.end(), by_edge);
  return order;
}
/* Sort the indices inside every group into ascending order, so the final map
 * has the same deterministic, ascending per-edge corner order as the original
 * counting implementation. */
static void sort_groups(const OffsetIndices<int> groups, MutableSpan<int> indices)
{
  /* Ascending comparator. The previous `index_a > index_b` produced a
   * descending per-group order, which does not match the original version's
   * output (and disagrees with the later variant of this function). */
  const auto comparator = [&](const int index_a, const int index_b) {
    return index_a < index_b;
  };
  threading::parallel_for(groups.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t index : range) {
      MutableSpan<int> group = indices.slice(groups[index]);
      parallel_sort(group.begin(), group.end(), comparator);
    }
  });
}
/* Build, for every edge, the list of face corners that reference it.
 * Stability is no longer provided by the gather_reverse comparator; instead
 * each group is sorted afterwards into a canonical order. */
GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
const int edges_num,
Array<int> &r_offsets,
Array<int> &r_indices)
{
SCOPED_TIMER_AVERAGED(__func__);
r_offsets = create_reverse_offsets(corner_edges, edges_num);
r_indices = gather_reverse(corner_edges);
/* Canonicalize the per-edge corner order within each offset group. */
sort_groups(r_offsets.as_span(), r_indices);
return {OffsetIndices<int>(r_offsets), r_indices};
}
```
### Parallel add-and-fetch + post group sorting:
1. `Timer 'build_edge_to_loop_map': (Average: 8.2 ms, Min: 7.6 ms, Last: 8.5 ms)`
2. `Timer 'build_edge_to_loop_map': (Average: 136.2 ms, Min: 127.2 ms, Last: 135.3 ms)`
```C++
/* Sort each group's indices ascending, so the per-edge corner order is
 * deterministic and matches the original counting implementation. */
static void sort_groups(const OffsetIndices<int> groups, MutableSpan<int> indices)
{
  threading::parallel_for(groups.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t group_index : range) {
      MutableSpan<int> group = indices.slice(groups[group_index]);
      parallel_sort(
          group.begin(), group.end(), [](const int a, const int b) { return a < b; });
    }
  });
}
/* Build, for every edge, the list of face corners that reference it.
 * Corners are scattered in parallel, each claiming a slot in its edge's
 * bucket with an atomic counter; a post-pass sorts every bucket to make the
 * nondeterministic fill order canonical again. */
GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges,
                                        const int edges_num,
                                        Array<int> &r_offsets,
                                        Array<int> &r_indices)
{
  SCOPED_TIMER_AVERAGED(__func__);
  r_offsets = create_reverse_offsets(corner_edges, edges_num);
  const OffsetIndices offsets(r_offsets.as_span());
  /* Next free slot per edge; starts at -1 so the first add_and_fetch yields 0. */
  Array<int> counts(edges_num, -1);
  r_indices.reinitialize(corner_edges.size());
  threading::parallel_for(corner_edges.index_range(), 1024, [&](const IndexRange range) {
    for (const int64_t corner : range) {
      const int edge = corner_edges[corner];
      /* Renamed from `range`: the previous name shadowed the parallel_for
       * chunk parameter of the enclosing lambda. */
      const IndexRange edge_range = offsets[edge];
      const int index = atomic_add_and_fetch_int32(&counts[edge], 1);
      r_indices[edge_range[index]] = int(corner);
    }
  });
  /* Slots were claimed in nondeterministic thread order; restore a
   * deterministic ascending corner order within each group. */
  sort_groups(offsets, r_indices);
  return {OffsetIndices<int>(r_offsets), r_indices};
}
```