### Datasets: 1. **1.100.000** edges and face corners: ![](https://hackmd.io/_uploads/BkUpH0vj2.png) 2. **8.600.000** edges and face corners: ![](https://hackmd.io/_uploads/BJsw8Cwon.png) ### Original version: 1. `Timer 'build_edge_to_loop_map': (Average: 25.5 ms, Min: 22.3 ms, Last: 27.7 ms)` 2. `Timer 'build_edge_to_loop_map': (Average: 509.1 ms, Min: 405.2 ms, Last: 539.0 ms)` ```C++ GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges, const int edges_num, Array<int> &r_offsets, Array<int> &r_indices) { SCOPED_TIMER_AVERAGED(__func__); r_offsets = create_reverse_offsets(corner_edges, edges_num); r_indices.reinitialize(r_offsets.last()); Array<int> counts(edges_num, 0); for (const int64_t corner : corner_edges.index_range()) { const int edge = corner_edges[corner]; r_indices[r_offsets[edge] + counts[edge]] = int(corner); counts[edge]++; } return {OffsetIndices<int>(r_offsets), r_indices}; } ``` ### Iota-sorting and offsets parallel: 1. `Timer 'build_edge_to_loop_map': (Average: 18.5 ms, Min: 17.5 ms, Last: 18.7 ms)` 2. `Timer 'build_edge_to_loop_map': (Average: 250.9 ms, Min: 241.1 ms, Last: 243.5 ms)` ```C++ static Array<int> gather_reverse(const Span<int> indices) { Array<int> results(indices.size()); std::iota(results.begin(), results.end(), 0); parallel_sort(results.begin(), results.end(), [indices](const int a, const int b) -> bool { if (UNLIKELY(indices[a] == indices[b])) { return a < b; } return indices[a] < indices[b]; }); return results; } GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges, const int edges_num, Array<int> &r_offsets, Array<int> &r_indices) { SCOPED_TIMER_AVERAGED(__func__); threading::parallel_invoke( [&]() { r_offsets = create_reverse_offsets(corner_edges, edges_num); }, [&]() { r_indices = gather_reverse(corner_edges); }); return {OffsetIndices<int>(r_offsets), r_indices}; } ``` ### Iota-sorting and offsets non-parallel: 1. 
`Timer 'build_edge_to_loop_map': (Average: 19.2 ms, Min: 18.4 ms, Last: 19.0 ms)` 2. `Timer 'build_edge_to_loop_map': (Average: 266.9 ms, Min: 258.9 ms, Last: 258.9 ms)` ```C++ static Array<int> gather_reverse(const Span<int> indices) { Array<int> results(indices.size()); std::iota(results.begin(), results.end(), 0); parallel_sort(results.begin(), results.end(), [indices](const int a, const int b) -> bool { if (UNLIKELY(indices[a] == indices[b])) { return a < b; } return indices[a] < indices[b]; }); return results; } GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges, const int edges_num, Array<int> &r_offsets, Array<int> &r_indices) { SCOPED_TIMER_AVERAGED(__func__); r_offsets = create_reverse_offsets(corner_edges, edges_num); r_indices = gather_reverse(corner_edges); return {OffsetIndices<int>(r_offsets), r_indices}; } ``` ### Iota-sorting and offsets non-parallel, with a second step to make the group indices order stable: *Previously, stability was provided by a more complex comparator in `gather_reverse`.* 1. `Timer 'build_edge_to_loop_map': (Average: 17.5 ms, Min: 16.5 ms, Last: 17.3 ms)` 2. 
`Timer 'build_edge_to_loop_map': (Average: 212.1 ms, Min: 204.7 ms, Last: 220.8 ms)` ```C++ static Array<int> gather_reverse(const Span<int> indices) { Array<int> results(indices.size()); std::iota(results.begin(), results.end(), 0); parallel_sort(results.begin(), results.end(), [indices](const int a, const int b) -> bool { return indices[a] < indices[b]; }); return results; } static void sort_groups(const OffsetIndices<int> groups, MutableSpan<int> indices) { const auto comparator = [&](const int index_a, const int index_b) { return index_a > index_b; }; threading::parallel_for(groups.index_range(), 1024, [&](const IndexRange range) { for (const int64_t index : range) { MutableSpan<int> group = indices.slice(groups[index]); parallel_sort(group.begin(), group.end(), comparator); } }); } GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges, const int edges_num, Array<int> &r_offsets, Array<int> &r_indices) { SCOPED_TIMER_AVERAGED(__func__); r_offsets = create_reverse_offsets(corner_edges, edges_num); r_indices = gather_reverse(corner_edges); sort_groups(r_offsets.as_span(), r_indices); return {OffsetIndices<int>(r_offsets), r_indices}; } ``` ### Parallel fetch-and-add + post group sorting 1. `Timer 'build_edge_to_loop_map': (Average: 8.2 ms, Min: 7.6 ms, Last: 8.5 ms)` 2. 
`Timer 'build_edge_to_loop_map': (Average: 136.2 ms, Min: 127.2 ms, Last: 135.3 ms)` ```C++ static void sort_groups(const OffsetIndices<int> groups, MutableSpan<int> indices) { const auto comparator = [&](const int index_a, const int index_b) { return index_a < index_b; }; threading::parallel_for(groups.index_range(), 1024, [&](const IndexRange range) { for (const int64_t index : range) { MutableSpan<int> group = indices.slice(groups[index]); parallel_sort(group.begin(), group.end(), comparator); } }); } GroupedSpan<int> build_edge_to_loop_map(const Span<int> corner_edges, const int edges_num, Array<int> &r_offsets, Array<int> &r_indices) { SCOPED_TIMER_AVERAGED(__func__); r_offsets = create_reverse_offsets(corner_edges, edges_num); const OffsetIndices offsets(r_offsets.as_span()); Array<int> counts(edges_num, -1); r_indices.reinitialize(corner_edges.size()); threading::parallel_for(corner_edges.index_range(), 1024, [&](const IndexRange range) { for (const int64_t corner : range) { const int edge = corner_edges[corner]; const IndexRange range = offsets[edge]; const int index = atomic_add_and_fetch_int32(&counts[edge], 1); r_indices[range[index]] = int(corner); } }); sort_groups(offsets, r_indices); return {OffsetIndices<int>(r_offsets), r_indices}; } ```