# Hash Map in C++ --- ### Agenda * Introduction to hash map * Data Structure of std::unordered_map and std::unordered_set (gnu) * Implementation of some functions in std::unordered_<map/set> (gnu) --- ### Introduction to hash map * A data structure that is used to store and retrieve values based on keys. * As we know, the operations ==insert==, ==find==, and ==remove==... are ==O(1)== on average. * ![](https://hackmd.io/_uploads/BJ0CTNad2.png) * Solution for hash collision. ---- ### Solution for hash collision * What is hash collision? * When two different objects have the same hash value. * ![](https://hackmd.io/_uploads/r1cChVpOh.png) ---- ### Solution for hash collision * [Collision Resolution](https://learn.saylor.org/mod/book/view.php?id=32990&chapterid=12838) * Open Addressing - Linear Probing * ![](https://hackmd.io/_uploads/S1EvA4p_2.png) * Open Addressing Quadratic Probing * ==h==, ==h+1==, ==h+1+3==, ==h+1+3+5==, ... * ![](https://hackmd.io/_uploads/r17CRNpd2.png) ---- ### Solution for hash collision * [Collision Resolution](https://learn.saylor.org/mod/book/view.php?id=32990&chapterid=12838) * Separate Chaining Technique (gnu) * ![](https://hackmd.io/_uploads/rywqkrad2.png) --- ### Data Structure of std::unordered_map and std::unordered_set ---- ### Data Structure ```mermaid graph TB; A[unordered_map] ---> C[_Hashtable]; B[unordered_set] ---> C[_Hashtable]; C -- derived_from ---> D[__detail::_Hashtable_base]; C -- derived_from ----> E[__detail::_Map_base]; C -- derived_from ---> F[__detail::_Insert]; C -- derived_from ----> G[__detail::_Rehash_base]; C -- derived_from ---> H[__detail::_Equality]; C -- derived_from ----> I[__detail::_Hashtable_alloc]; ``` ---- ```cpp= // In /depotbld/RHEL7.0/gcc-9.2.0/include/c++/9.2.0/bits/unordered_set.h template <typename _Value, typename _Hash = hash<_Value>, typename _Pred = equal_to<_Value>, typename _Alloc = allocator<_Value>> class unordered_set { typedef __uset_hashtable<_Value, _Hash, _Pred, _Alloc> _Hashtable; _Hashtable _M_h; 
} template <typename _Value, typename _Hash = hash<_Value>, typename _Pred = std::equal_to<_Value>, typename _Alloc = std::allocator<_Value>, typename _Tr = __uset_traits<__cache_default<_Value, _Hash>::value>> using __uset_hashtable = _Hashtable<_Value, _Value, _Alloc, __detail::_Identity, _Pred, _Hash, __detail::_Mod_range_hashing, __detail::_Default_ranged_hash, __detail::_Prime_rehash_policy, _Tr>; // In /depotbld/RHEL7.0/gcc-9.2.0/include/c++/9.2.0/bits/hashtable.h template <typename _Key, typename _Value, typename _Alloc, typename _ExtractKey, typename _Equal, typename _H1, typename _H2, typename _Hash, typename _RehashPolicy, typename _Traits> class _Hashtable : public __detail::_Hashtable_base<_Key, _Value, _ExtractKey, _Equal, _H1, _H2, _Hash, _Traits>, public __detail::_Map_base<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1, _H2, _Hash, _RehashPolicy, _Traits>, public __detail::_Insert<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1, _H2, _Hash, _RehashPolicy, _Traits>, public __detail::_Rehash_base<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1, _H2, _Hash, _RehashPolicy, _Traits>, public __detail::_Equality<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1, _H2, _Hash, _RehashPolicy, _Traits>, private __detail::_Hashtable_alloc<__alloc_rebind< _Alloc, __detail::_Hash_node<_Value, _Traits::__hash_cached::value>>> ``` ---- * ==__detail== namespace you can find here: * /global/freeware/Linux/3.10/gcc-9.2.0/include/c++/9.2.0/bits/hashtable_policy.h ```mermaid graph LR; B[unordered_set::insert] -- Call ---> C[__detail::_Insert::insert]; C -- Call ---> D[_Hashtable::_M_insert] ``` ---- ```cpp= class unordered_set { std::pair<iterator, bool> insert(const value_type &__x) { return _M_h.insert(__x); } } namespace __detail { struct _Insert { __ireturn_type insert(value_type &&__v) { // return the _Hashtable __hashtable &__h = this->_M_conjure_hashtable(); __node_gen_type __node_gen(__h); return __h._M_insert(std::move(__v), __node_gen, __unique_keys{}); } } } auto 
_Hashtable< _Key, _Value, _Alloc, _ExtractKey, _Equal, _Hash, _RangeHash,_Unused, _RehashPolicy, _Traits> ::_M_insert (_Arg && __v, const _NodeGenerator &__node_gen, true_type /* __uks */) ->pair<iterator, bool> { // ... } ``` ---- Data Structure ```cpp= class _HashTable { __bucket_type * _M_buckets = &_M_single_bucket; size_type _M_bucket_count = 1; __node_base _M_before_begin; size_type _M_element_count = 0; _RehashPolicy _M_rehash_policy; __bucket_type _M_single_bucket = nullptr; }; using __bucket_type = __node_base*; using __node_base = __detail::_Hash_node_base; struct _Hash_node_base { _Hash_node_base *_M_nxt; }; template <typename _Value> struct _Hash_node<_Value, true> : _Hash_node_value_base<_Value> { std::size_t _M_hash_code; }; template <typename _Value> struct _Hash_node<_Value, false> : _Hash_node_value_base<_Value> { // don't cache the hash code // std::size_t _M_hash_code; }; template <typename _Value> struct _Hash_node_value_base : _Hash_node_base { __gnu_cxx::__aligned_buffer<_Value> _M_storage; }; // _RehashPolicy = struct __detail::_Prime_rehash_policy ``` ---- ![](https://hackmd.io/_uploads/HJpF_JCdn.png) --- ### Implementation of some functions in std::unordered_set * begin / end * find * insert * reserve / rehash * erase * clear * extract (C++17) * merge (C++17) ---- ### preliminary * _M_bucket_index(node_type* n, size_t bucket_size) * h2(h1(n), bucket_size) 1. key = extract_key(n) 2. h1(n) = ==std::hash\<key\>(key)== 3. h2(h1(n), bucket_size) = ==h1(n) % bucket_size== * Complexity * std::hash\<int\> is O(1) * std::hash\<string\> is O(s.size()/8); * Be careful when using a long string as a key. 
---- ### preliminary * max_load, next_rehash * Load : ==#element / #buckets== ```cpp= class _HashTable { _RehashPolicy _M_rehash_policy; }; struct _Prime_rehash_policy { _Prime_rehash_policy(float __z = 1.0) noexcept : _M_max_load_factor(__z), _M_next_resize(0) {} float _M_max_load_factor; mutable std::size_t _M_next_resize; } ``` ---- ### preliminary * max_load, next_rehash * Load : ==#element / #buckets== ![](https://hackmd.io/_uploads/Sy3jJKR_n.png) ---- ### begin/end * iterator ```cpp= struct _Node_iterator_base { __node_type *_M_cur; } struct iterator : public _Node_iterator_base{ // ... } iterator begin() { return iterator{_M_begin()}; } iterator end() {return iterator{nullptr};} __node_type* _M_begin() { return static_cast<__node_type *>(_M_before_begin._M_nxt); } ``` ---- begin() ![](https://hackmd.io/_uploads/Bkh8qFAd3.png) ---- ### unordered_set::find ```cpp= iterator __HashTable::find(const key_type& __k) const { __hash_code __code = this->_M_hash_code(__k); // h1(key) std::size_t __n = _M_bucket_index(__k, __code); // h2(h1(key), bkt) __node_type *__p = _M_find_node(__n, __k, __code); return __p ? 
iterator(__p) : end(); } __node_type *_M_find_node( size_type __bkt, // bucket index const key_type &__key, // key __hash_code __c // hash code ) const { __node_base *__before_n = _M_find_before_node(__bkt, __key, __c); if (__before_n) return static_cast<__node_type *>(__before_n->_M_nxt); return nullptr; } __node_base * _HashTable::_M_find_before_node( size_type __n, const key_type &__k, __hash_code __code) const { __node_base *__prev_p = _M_buckets[__n]; if (!__prev_p) return nullptr; for (__node_type *__p = static_cast<__node_type *>(__prev_p->_M_nxt);; __p = __p->_M_next()) { if (this->_M_equals(__k, __code, __p)) return __prev_p; if (!__p->_M_nxt || _M_bucket_index(__p->_M_next()) != __n) break; __prev_p = __p; } return nullptr; } ``` ---- ### Find a value * Find 2, 9, 17 ![](https://hackmd.io/_uploads/HJ6ryt0u3.png) ---- ![](https://hackmd.io/_uploads/B1pwxKCun.png) ---- Errata: _M_buckets[2] -> _M_buckets[1] ![](https://hackmd.io/_uploads/rJLPTh0_2.png) ---- Errata: _M_buckets[2] -> _M_buckets[1] ![](https://hackmd.io/_uploads/BJEIWtC_2.png) --- ### insert ```cpp= pair<iterator, bool> _HashTable::_M_insert( _Arg &&__v, const _NodeGenerator &__node_gen, true_type, /* unique key */ size_type __n_elt) { const key_type &__k = this->_M_extract()(__v); __hash_code __code = this->_M_hash_code(__k); size_type __bkt = _M_bucket_index(__k, __code); // Return if we can find the key in the hashtable __node_type *__n = _M_find_node(__bkt, __k, __code); if (__n) { return std::make_pair(iterator(__n), false); } // generate a node and insert __n = __node_gen(std::forward<_Arg>(__v)); return {_M_insert_unique_node(__bkt, __code, __n, __n_elt), true}; } iterator _HashTable::_M_insert_unique_node( size_type __bkt, __hash_code __code, __node_type *__node, size_type __n_elt) { // backup the state const __rehash_state &__saved_state = _M_rehash_policy._M_state(); // check if we need to rehash std::pair<bool, std::size_t> __do_rehash = 
_M_rehash_policy._M_need_rehash(_M_bucket_count, _M_element_count, __n_elt); __try { // Rehash if (__do_rehash.first) { _M_rehash(__do_rehash.second, __saved_state); __bkt = _M_bucket_index(this->_M_extract()(__node->_M_v()), __code); } // cache the hashcode for the node this->_M_store_code(__node, __code); // Always insert at the beginning of the bucket. _M_insert_bucket_begin(__bkt, __node); ++_M_element_count; return iterator(__node); } __catch(...) { this->_M_deallocate_node(__node); __throw_exception_again; } } void _HashTable::_M_insert_bucket_begin( size_type __bkt, __node_type *__node) { if (_M_buckets[__bkt]) { // if __bkt has the nodes already. // push the new node in the head. __node->_M_nxt = _M_buckets[__bkt]->_M_nxt; _M_buckets[__bkt]->_M_nxt = __node; } else { // Make the node as the new begin __node->_M_nxt = _M_before_begin._M_nxt; _M_before_begin._M_nxt = __node; if (__node->_M_nxt) { // Update the _M_buckets if needed _M_buckets[_M_bucket_index(__node->_M_next())] = __node; } // Update _M_buckets[__bkt] _M_buckets[__bkt] = &_M_before_begin; } } ``` ---- Insert 0, 17 ![](https://hackmd.io/_uploads/SkP3e50_h.png) ---- Insert 17 ![](https://hackmd.io/_uploads/HyRHGqRun.png) ---- Insert 0 ![](https://hackmd.io/_uploads/SJqEEqCuh.png) ---- ### Rehash policy 1. When to rehash * ==size() + #InsertElement >= _M_next_resize== 2. How the bucket_count growth * About 2x~3x * Line 46 in [GNU source code](https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/src/c%2B%2B11/hashtable_c%2B%2B0x.cc) * [GodBolt](https://godbolt.org/z/4Y4nj9EnW) --- ### reserve Call the rehash actually. 
```cpp= void reserve(std::size_t __n) { __hashtable *__this = static_cast<__hashtable *>(this); __this->rehash(__builtin_ceil(__n / max_load_factor())); } ``` ---- ### Rehash * unordered_set::rehash(size_t count) * makes the bucket count at least max(count, size() / max_load_factor()) ```cpp= void _HashTable::_M_rehash_aux( size_type __n, std::true_type) { // allocate the new buckets __bucket_type *__new_buckets = _M_allocate_buckets(__n); // get the begin node of the list __node_type *__p = _M_begin(); // reset the _M_before_begin _M_before_begin._M_nxt = nullptr; std::size_t __bbegin_bkt = 0; // Insert the original nodes into __new_buckets // Similar to insert() while (__p) { __node_type *__next = __p->_M_next(); std::size_t __bkt = __hash_code_base::_M_bucket_index(__p, __n); if (!__new_buckets[__bkt]) { __p->_M_nxt = _M_before_begin._M_nxt; _M_before_begin._M_nxt = __p; __new_buckets[__bkt] = &_M_before_begin; if (__p->_M_nxt) __new_buckets[__bbegin_bkt] = __p; __bbegin_bkt = __bkt; } else { __p->_M_nxt = __new_buckets[__bkt]->_M_nxt; __new_buckets[__bkt]->_M_nxt = __p; } __p = __next; } // deallocate the original buckets _M_deallocate_buckets(); // update the new bucket_count and buckets _M_bucket_count = __n; _M_buckets = __new_buckets; } ``` ---- rehash(9) ![](https://hackmd.io/_uploads/HJ6ryt0u3.png) ---- rehash(9) ![](https://hackmd.io/_uploads/SkojtcRO3.png) ---- rehash(9) ![](https://hackmd.io/_uploads/B1yccq0O3.png) ---- rehash(9) ![](https://hackmd.io/_uploads/HJwzjqC_2.png) ---- rehash(9) * the order of begin to end is different * before: 9 1 5 6 * after: 6 5 1 9 ![](https://hackmd.io/_uploads/Skr2sqAd3.png) --- ### erase ```cpp= size_type _HashTable::_M_erase( std::true_type, const key_type &__k) { // Calculate bucket index __hash_code __code = this->_M_hash_code(__k); std::size_t __bkt = _M_bucket_index(__k, __code); // Early return if the key is not in hashtable __node_base *__prev_n = _M_find_before_node(__bkt, __k, __code); if (!__prev_n) { return 0; } // We found a 
matching node, erase it. __node_type *__n = static_cast<__node_type *>(__prev_n->_M_nxt); _M_erase(__bkt, __prev_n, __n); return 1; } iterator _HashTable::_M_erase( size_type __bkt, __node_base *__prev_n, // previous of the erased node __node_type *__n // Erased node ) { if (__prev_n == _M_buckets[__bkt]) { _M_remove_bucket_begin(__bkt, __n->_M_next(), __n->_M_nxt ? _M_bucket_index(__n->_M_next()) : 0); } else if (__n->_M_nxt) { size_type __next_bkt = _M_bucket_index(__n->_M_next()); if (__next_bkt != __bkt) { _M_buckets[__next_bkt] = __prev_n; } } // Disconnect the previous __prev_n->_M_nxt = __n->_M_nxt; // Store the return type iterator __result(__n->_M_next()); // Deallocate the __n this->_M_deallocate_node(__n); --_M_element_count; return __result; } void _HashTable::_M_remove_bucket_begin( size_type __bkt, __node_type *__next, size_type __next_bkt) { // clean the _M_buckets[__bkt] // if the node is the last one in this bucket if (!__next || __next_bkt != __bkt) { if (__next) { _M_buckets[__next_bkt] = _M_buckets[__bkt]; } if (&_M_before_begin == _M_buckets[__bkt]) { _M_before_begin._M_nxt = __next; } _M_buckets[__bkt] = nullptr; } } ``` ---- erase(1) and erase(5) ![](https://hackmd.io/_uploads/SyeiXjRun.png) ---- erase(1) ![](https://hackmd.io/_uploads/B1qYEiRuh.png) ---- erase(1) ![](https://hackmd.io/_uploads/rkruHo0dn.png) ---- erase(1) ![](https://hackmd.io/_uploads/SJx-8s0Oh.png) ---- erase(5) ![](https://hackmd.io/_uploads/B1T4IiRdn.png) ---- erase(5) ![](https://hackmd.io/_uploads/Hy3MvjAO2.png) ---- erase(5) ![](https://hackmd.io/_uploads/Sk3qDsRdh.png) --- ### clear ```cpp= clear() noexcept { this->_M_deallocate_nodes(_M_begin()); __builtin_memset(_M_buckets, 0, _M_bucket_count * sizeof(__bucket_type)); _M_element_count = 0; _M_before_begin._M_nxt = nullptr; } ``` ---- clear ![](https://hackmd.io/_uploads/Sk68siROh.png) ---- clear * Complexity is O(n) where n is element_size * ![](https://hackmd.io/_uploads/rkhAsj0O2.png) ---- clear * Actually, 
complexity is * std::max( O(element_size), O(bucket_count)) * the memset over the buckets is O(bucket_count) * [Clear is slower than erase](https://shininglionking.blogspot.com/2019/04/c-deep-into-hash-map-in-stl.html) * Be careful when the bucket count is large and the element count is small ```cpp= std::unordered_set<int> s; s.reserve(99999999); // reserve a large number of buckets while(...) { s.insert(...); // Only insert 1 element in the loop // ... s.clear(); // clear the unordered_set } ``` --- ### Extract (C++17) * Extract the node_handle from unordered_<set/map> * Reinsert the node_handle without extra allocation ```cpp= int main() { std::unordered_map<int, char> cont{{1, 'a'}, {2, 'b'}, {3, 'c'}}; auto nh = cont.extract(1); nh.key() = 4; cont.insert(std::move(nh)); } /* Start: 1(a) 2(b) 3(c) End: 2(b) 3(c) 4(a) */ ``` ---- node_handle ```cpp= node_type extract(const key_type &__key); using node_type = _Node_handle<_Key, _Value, __node_alloc_type>; struct _Node_handle { __node_type* _M_ptr; __hashtable_alloc* _M_alloc; } ``` ---- extract(9) ![](https://hackmd.io/_uploads/SyeiXjRun.png) ---- extract(9) ![](https://hackmd.io/_uploads/H1C1VnRdh.png) --- ### Merge (C++17) * Merge the key-value pairs from ==source== to ==destination== if the key does not already exist in the destination. 
```cpp= // void merge( std::unordered_map<Key, T, H2, P2, Allocator>& source ); int main() { std::unordered_map<std::string, int> destination{ {"C", 3}, {"B", 2}, {"A", 1} }, source{ {"E", 6}, {"D", 5}, {"A", 4} }; destination.merge(source); } /* destination: [3] { {A, 1}, {B, 2}, {C, 3} } source: [3] { {A, 4}, {D, 5}, {E, 6} } destination: [5] { {E, 6}, {D, 5}, {A, 1}, {B, 2}, {C, 3} } source: [1] { {A, 4} } */ ``` ---- merge ```cpp= void _M_merge_unique(_Compatible_Hashtable &__src) noexcept { static_assert(is_same_v<typename _Compatible_Hashtable::node_type, node_type>, "Node types are compatible"); // allocators must be the same __glibcxx_assert(get_allocator() == __src.get_allocator()); // __n_elt is # src elements // To avoid rehashing multiple times in _M_insert_unique_node auto __n_elt = __src.size(); // iterate over all nodes in source for (auto __i = __src.begin(), __end = __src.end(); __i != __end;) { auto __pos = __i++; // calculate __bkt const key_type &__k = this->_M_extract()(__pos._M_cur->_M_v()); __hash_code __code = this->_M_hash_code(__k); size_type __bkt = _M_bucket_index(__k, __code); // extract and insert if the key does not exist in this table. if (_M_find_node(__bkt, __k, __code) == nullptr) { auto __nh = __src.extract(__pos); _M_insert_unique_node(__bkt, __code, __nh._M_ptr, __n_elt); __nh._M_ptr = nullptr; __n_elt = 1; } else if (__n_elt != 1) { --__n_elt; } } } ``` --- ### Q&A ---
{"breaks":true,"description":"Handle hash collision","slideOptions":"{\"theme\":\"white\"}","title":"std::unordered_set / std::unordered_map","contributors":"[{\"id\":\"09379b25-db04-47a4-8912-78e722b7a548\",\"add\":26848,\"del\":7354}]"}
    672 views