# Hash Map in C++
---
### Agenda
* Introduction to hash_map
* Data Structure of std::unordered_map and std::unordered_set (GNU libstdc++)
* Implementation of some functions in std::unordered_<map/set> (GNU libstdc++)
---
### Introduction to hash map
* A data structure used to store and retrieve values based on keys.
* Operations such as ==insert==, ==find==, and ==remove== are ==O(1)== on average.
* Solutions for hash collisions.
----
### Solution for hash collision
* What is a hash collision?
* When two different objects have the same hash value.
----
### Solution for hash collision
* [Collision Resolution](https://learn.saylor.org/mod/book/view.php?id=32990&chapterid=12838)
* Open Addressing - Linear Probing
* ==h==, ==h+1==, ==h+2==, ==h+3==, ...
* Open Addressing - Quadratic Probing
* ==h==, ==h+1==, ==h+1+3==, ==h+1+3+5==, ... (i.e. ==h + i^2==)
----
### Solution for hash collision
* [Collision Resolution](https://learn.saylor.org/mod/book/view.php?id=32990&chapterid=12838)
* Separate Chaining Technique (used by GNU libstdc++); a minimal sketch follows.
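A minimal sketch of separate chaining (a hypothetical `ToyChainedSet`, illustrative only, not the libstdc++ layout): every bucket owns a linked list, and keys that hash to the same bucket are chained together.
```cpp=
#include <cstddef>
#include <forward_list>
#include <functional>
#include <vector>

// Toy separate-chaining set (hypothetical, simplified).
struct ToyChainedSet {
    std::vector<std::forward_list<int>> buckets;
    ToyChainedSet() : buckets(8) {}                     // 8 empty buckets

    std::size_t bucket_of(int key) const {
        return std::hash<int>{}(key) % buckets.size();  // map hash value to a bucket
    }
    void insert(int key) {
        auto &chain = buckets[bucket_of(key)];
        for (int k : chain)
            if (k == key) return;                       // already stored
        chain.push_front(key);                          // colliding keys share one chain
    }
    bool contains(int key) const {
        for (int k : buckets[bucket_of(key)])
            if (k == key) return true;
        return false;
    }
};
```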
---
### Data Structure of std::unordered_map and std::unordered_set
----
### Data Structure
```mermaid
graph TB;
A[unordered_map] ---> C[_Hashtable];
B[unordered_set] ---> C[_Hashtable];
C -- derived_from ---> D[__detail::_Hashtable_base];
C -- derived_from ----> E[__detail::_Map_base];
C -- derived_from ---> F[__detail::_Insert];
C -- derived_from ----> G[__detail::_Rehash_base];
C -- derived_from ---> H[__detail::_Equality];
C -- derived_from ----> I[__detail::_Hashtable_alloc];
```
----
```cpp=
// In /depotbld/RHEL7.0/gcc-9.2.0/include/c++/9.2.0/bits/unordered_set.h
template <typename _Value, typename _Hash = hash<_Value>,
typename _Pred = equal_to<_Value>,
typename _Alloc = allocator<_Value>>
class unordered_set {
typedef
__uset_hashtable<_Value, _Hash, _Pred, _Alloc>
_Hashtable;
_Hashtable _M_h;
};
template <typename _Value, typename _Hash = hash<_Value>,
typename _Pred = std::equal_to<_Value>,
typename _Alloc = std::allocator<_Value>,
typename _Tr = __uset_traits<__cache_default<_Value, _Hash>::value>>
using __uset_hashtable =
_Hashtable<_Value, _Value, _Alloc, __detail::_Identity, _Pred, _Hash,
__detail::_Mod_range_hashing, __detail::_Default_ranged_hash,
__detail::_Prime_rehash_policy, _Tr>;
// In /depotbld/RHEL7.0/gcc-9.2.0/include/c++/9.2.0/bits/hashtable.h
template <typename _Key, typename _Value, typename _Alloc, typename _ExtractKey,
typename _Equal, typename _H1, typename _H2, typename _Hash,
typename _RehashPolicy, typename _Traits>
class _Hashtable
: public __detail::_Hashtable_base<_Key, _Value, _ExtractKey, _Equal, _H1,
_H2, _Hash, _Traits>,
public __detail::_Map_base<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1,
_H2, _Hash, _RehashPolicy, _Traits>,
public __detail::_Insert<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1,
_H2, _Hash, _RehashPolicy, _Traits>,
public __detail::_Rehash_base<_Key, _Value, _Alloc, _ExtractKey, _Equal,
_H1, _H2, _Hash, _RehashPolicy, _Traits>,
public __detail::_Equality<_Key, _Value, _Alloc, _ExtractKey, _Equal, _H1,
_H2, _Hash, _RehashPolicy, _Traits>,
private __detail::_Hashtable_alloc<__alloc_rebind<
_Alloc,
__detail::_Hash_node<_Value, _Traits::__hash_cached::value>>>
{ /* ... */ };
```
----
* The ==__detail== namespace can be found here:
* /global/freeware/Linux/3.10/gcc-9.2.0/include/c++/9.2.0/bits/hashtable_policy.h
```mermaid
graph LR;
B[unordered_set::insert] -- Call ---> C[__detail::_Insert::insert];
C -- Call ---> D[_Hashtable::_M_insert]
```
----
```cpp=
class unordered_set {
std::pair<iterator, bool> insert(const value_type &__x) {
return _M_h.insert(__x);
}
};
namespace __detail {
struct _Insert {
__ireturn_type insert(value_type &&__v) {
// get a reference to the enclosing _Hashtable
__hashtable &__h = this->_M_conjure_hashtable();
__node_gen_type __node_gen(__h);
return __h._M_insert(std::move(__v), __node_gen, __unique_keys{});
}
};
}
auto _Hashtable<
_Key, _Value, _Alloc,
_ExtractKey, _Equal, _Hash,
_RangeHash,_Unused, _RehashPolicy, _Traits>
::_M_insert
(_Arg && __v, const _NodeGenerator &__node_gen, true_type /* __uks */)
->pair<iterator, bool> {
// ...
}
```
----
Data Structure
```cpp=
class _HashTable {
__bucket_type * _M_buckets = &_M_single_bucket;
size_type _M_bucket_count = 1;
__node_base _M_before_begin;
size_type _M_element_count = 0;
_RehashPolicy _M_rehash_policy;
__bucket_type _M_single_bucket = nullptr;
};
using __bucket_type = __node_base*;
using __node_base = __detail::_Hash_node_base;
struct _Hash_node_base {
_Hash_node_base *_M_nxt;
};
template <typename _Value>
struct _Hash_node<_Value, true> : _Hash_node_value_base<_Value> {
std::size_t _M_hash_code;
};
template <typename _Value>
struct _Hash_node<_Value, false> : _Hash_node_value_base<_Value> {
// don't cache the hash code
// std::size_t _M_hash_code;
};
template <typename _Value> struct _Hash_node_value_base : _Hash_node_base {
__gnu_cxx::__aligned_buffer<_Value> _M_storage;
};
// _RehashPolicy = struct __detail::_Prime_rehash_policy
```
----

---
### Implementation of some functions in std::unordered_set
* begin / end
* find
* insert
* reserve / rehash
* erase
* clear
* extract (C++17)
* merge (C++17)
----
### Preliminaries
* _M_bucket_index(node_type* n, size_t bucket_size) (a sketch follows this list)
* h2(h1(n), bucket_size)
1. key = extract_key(n)
2. h1(n) = ==std::hash\<key\>(key)==
3. h2(h1(n), bucket_size) = ==h1(n) % bucket_size==
* Complexity
* std::hash\<int\> is O(1)
* std::hash\<string\> is O(s.size()/8) (the bytes are hashed in word-sized chunks, so still linear in the length)
* Be careful when using a long string as a key.
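A hedged sketch of the two-step mapping above (the helper name `bucket_index_of` is illustrative, not libstdc++ code):
```cpp=
#include <cstddef>
#include <functional>
#include <string>

// Illustrative helper: h2(h1(key), bucket_count).
std::size_t bucket_index_of(const std::string &key, std::size_t bucket_count) {
    std::size_t h1 = std::hash<std::string>{}(key); // h1: hash the key, O(key.size())
    return h1 % bucket_count;                       // h2: reduce modulo the bucket count
}
```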
----
### Preliminaries
* max_load, next_rehash
* Load factor: ==#elements / #buckets==
```cpp=
class _HashTable {
_RehashPolicy _M_rehash_policy;
};
struct _Prime_rehash_policy {
_Prime_rehash_policy(float __z = 1.0) noexcept
: _M_max_load_factor(__z), _M_next_resize(0) {}
float _M_max_load_factor;
mutable std::size_t _M_next_resize;
};
```
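The same quantities are visible through the public interface; a small usage sketch (the printed values are implementation-specific):
```cpp=
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s;
    for (int i = 0; i < 100; ++i)
        s.insert(i);
    // load_factor() == size() / bucket_count(); rehashing keeps it <= max_load_factor().
    std::cout << "size=" << s.size()
              << " buckets=" << s.bucket_count()
              << " load=" << s.load_factor()
              << " max_load=" << s.max_load_factor() << '\n';
}
```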
----
### Preliminaries
* max_load, next_rehash
* Load factor: ==#elements / #buckets==

----
### begin/end
* iterator
```cpp=
struct _Node_iterator_base {
__node_type *_M_cur;
};
struct iterator : public _Node_iterator_base{
// ...
};
iterator begin() { return iterator{_M_begin()}; }
iterator end() {return iterator{nullptr};}
__node_type* _M_begin() {
return static_cast<__node_type *>(_M_before_begin._M_nxt);
}
```
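Usage sketch: because every element sits on the single list starting at `_M_before_begin`, iterating from `begin()` to `end()` is just a walk of that list (the order is unspecified):
```cpp=
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s{1, 5, 6, 9};
    for (auto it = s.begin(); it != s.end(); ++it)  // walks the internal node list
        std::cout << *it << ' ';
    std::cout << '\n';
}
```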
----
begin()

----
### unordered_set::find
```cpp=
iterator _HashTable::find(const key_type &__k) const {
__hash_code __code = this->_M_hash_code(__k); // h1(key)
std::size_t __n = _M_bucket_index(__k, __code); // h2(h1(key), bkt)
__node_type *__p = _M_find_node(__n, __k, __code);
return __p ? iterator(__p) : end();
}
__node_type *_M_find_node(
size_type __bkt, // bucket index
const key_type &__key, // key
__hash_code __c // hash code
) const {
__node_base *__before_n = _M_find_before_node(__bkt, __key, __c);
if (__before_n)
return static_cast<__node_type *>(__before_n->_M_nxt);
return nullptr;
}
__node_base * _HashTable::_M_find_before_node(
size_type __n,
const key_type &__k,
__hash_code __code) const {
__node_base *__prev_p = _M_buckets[__n];
if (!__prev_p)
return nullptr;
for (__node_type *__p = static_cast<__node_type *>(__prev_p->_M_nxt);;
__p = __p->_M_next()) {
if (this->_M_equals(__k, __code, __p))
return __prev_p;
if (!__p->_M_nxt || _M_bucket_index(__p->_M_next()) != __n)
break;
__prev_p = __p;
}
return nullptr;
}
```
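Usage sketch of the public `find` built on the internals above: hash the key, pick the bucket, scan only that bucket's chain.
```cpp=
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s{2, 9, 17};
    auto it = s.find(9);          // hashes 9, then scans one bucket's chain
    if (it != s.end())
        std::cout << "found " << *it << '\n';
    if (s.find(3) == s.end())     // a miss also touches only one bucket
        std::cout << "3 is not in the set\n";
}
```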
----
### Find a value
* Find 2, 9, 17

----

----
Errata: _M_buckets[2] -> _M_buckets[1]

----
Errata: _M_buckets[2] -> _M_buckets[1]

---
### insert
```cpp=
pair<iterator, bool> _HashTable::_M_insert(
_Arg &&__v,
const _NodeGenerator &__node_gen,
true_type, /* unique key */
size_type __n_elt) {
const key_type &__k = this->_M_extract()(__v);
__hash_code __code = this->_M_hash_code(__k);
size_type __bkt = _M_bucket_index(__k, __code);
// Return if we can find the key in the hashtable
__node_type *__n = _M_find_node(__bkt, __k, __code);
if (__n) {
return std::make_pair(iterator(__n), false);
}
// generate a node and insert
__n = __node_gen(std::forward<_Arg>(__v));
return {_M_insert_unique_node(__bkt, __code, __n, __n_elt), true};
}
iterator _HashTable::_M_insert_unique_node(
size_type __bkt,
__hash_code __code,
__node_type *__node,
size_type __n_elt) {
// backup the state
const __rehash_state &__saved_state = _M_rehash_policy._M_state();
// check if we need to rehash
std::pair<bool, std::size_t> __do_rehash =
_M_rehash_policy._M_need_rehash(_M_bucket_count, _M_element_count, __n_elt);
__try {
// Rehash
if (__do_rehash.first) {
_M_rehash(__do_rehash.second, __saved_state);
__bkt = _M_bucket_index(this->_M_extract()(__node->_M_v()), __code);
}
// cache the hashcode for the node
this->_M_store_code(__node, __code);
// Always insert at the beginning of the bucket.
_M_insert_bucket_begin(__bkt, __node);
++_M_element_count;
return iterator(__node);
}
__catch(...) {
this->_M_deallocate_node(__node);
__throw_exception_again;
}
}
void _HashTable::_M_insert_bucket_begin(
size_type __bkt,
__node_type *__node)
{
if (_M_buckets[__bkt]) {
// if __bkt already contains nodes,
// push the new node at the front of that bucket's chain.
__node->_M_nxt = _M_buckets[__bkt]->_M_nxt;
_M_buckets[__bkt]->_M_nxt = __node;
} else {
// Make the node the new global begin
__node->_M_nxt = _M_before_begin._M_nxt;
_M_before_begin._M_nxt = __node;
if (__node->_M_nxt) {
// Update the _M_buckets if needed
_M_buckets[_M_bucket_index(__node->_M_next())] = __node;
}
// Update _M_buckets[__bkt]
_M_buckets[__bkt] = &_M_before_begin;
}
}
```
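Usage sketch: for unique-key containers, `insert` returns `pair<iterator, bool>`; the bool reports whether the key was newly inserted (the `false` path corresponds to the `_M_find_node` hit above).
```cpp=
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s;
    auto r1 = s.insert(17);  // new key: r1.second == true
    auto r2 = s.insert(17);  // duplicate: r2.second == false, r2.first points at the existing 17
    std::cout << r1.second << ' ' << r2.second << ' ' << *r2.first << '\n';
}
```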
----
Insert 0, 17

----
Insert 17

----
Insert 0

----
### Rehash policy
1. When to rehash
* ==size() + #InsertElement >= _M_next_resize==
2. How bucket_count grows
* Roughly 2x~3x per rehash (see the probe below)
* Line 46 in [GNU source code](https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/src/c%2B%2B11/hashtable_c%2B%2B0x.cc)
* [GodBolt](https://godbolt.org/z/4Y4nj9EnW)
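A small probe of this policy (a sketch; the exact bucket counts come from libstdc++'s prime table, so the printed numbers are implementation-specific):
```cpp=
#include <cstddef>
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s;
    std::size_t last = s.bucket_count();
    for (int i = 0; i < 1000; ++i) {
        s.insert(i);
        if (s.bucket_count() != last) {  // this insert triggered a rehash
            last = s.bucket_count();
            std::cout << "size=" << s.size() << " buckets=" << last << '\n';
        }
    }
}
```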
---
### reserve
It actually calls rehash.
```cpp=
void reserve(std::size_t __n) {
__hashtable *__this = static_cast<__hashtable *>(this);
__this->rehash(__builtin_ceil(__n / max_load_factor()));
}
```
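Usage sketch: after `reserve(n)`, inserting up to `n` elements should not trigger another rehash with the default `max_load_factor()` of 1.0 (an expectation derived from the formula above, not a separate guarantee checked here against the source):
```cpp=
#include <cassert>
#include <cstddef>
#include <unordered_set>

int main() {
    std::unordered_set<int> s;
    s.reserve(1000);                      // rehash(ceil(1000 / max_load_factor()))
    const std::size_t buckets = s.bucket_count();
    for (int i = 0; i < 1000; ++i)
        s.insert(i);
    assert(s.bucket_count() == buckets);  // expected: no rehash while inserting 1000 elements
}
```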
----
### Rehash
* unordered_set::rehash(size_t count)
* the resulting bucket count is at least ==max(count, size() / max_load_factor())==
```cpp=
void _HashTable::_M_rehash_aux(
size_type __n,
std::true_type) {
// allocate new buckets
__bucket_type *__new_buckets = _M_allocate_buckets(__n);
// get the begin node of the list
__node_type *__p = _M_begin();
// reset the _M_before_begin
_M_before_begin._M_nxt = nullptr;
std::size_t __bbegin_bkt = 0;
// Re-insert the original nodes into __new_buckets,
// similar to insert()
while (__p) {
__node_type *__next = __p->_M_next();
std::size_t __bkt = __hash_code_base::_M_bucket_index(__p, __n);
if (!__new_buckets[__bkt]) {
__p->_M_nxt = _M_before_begin._M_nxt;
_M_before_begin._M_nxt = __p;
__new_buckets[__bkt] = &_M_before_begin;
if (__p->_M_nxt)
__new_buckets[__bbegin_bkt] = __p;
__bbegin_bkt = __bkt;
} else {
__p->_M_nxt = __new_buckets[__bkt]->_M_nxt;
__new_buckets[__bkt]->_M_nxt = __p;
}
__p = __next;
}
// deallocate the original buckets
_M_deallocate_buckets();
// update the new bucket_count and buckets
_M_bucket_count = __n;
_M_buckets = __new_buckets;
}
```
----
rehash(9)

----
rehash(9)

----
rehash(9)

----
rehash(9)

----
rehash(9)
* the order from begin to end is different
* before: 9 1 5 6
* after: 6 5 1 9

---
### erase
```cpp=
size_type _HashTable::_M_erase(
std::true_type, const key_type &__k) {
// Calculate bucket index
__hash_code __code = this->_M_hash_code(__k);
std::size_t __bkt = _M_bucket_index(__k, __code);
// Early return if the key is not in hashtable
__node_base *__prev_n = _M_find_before_node(__bkt, __k, __code);
if (!__prev_n) {
return 0;
}
// We found a matching node, erase it.
__node_type *__n = static_cast<__node_type *>(__prev_n->_M_nxt);
_M_erase(__bkt, __prev_n, __n);
return 1;
}
iterator _HashTable::_M_erase(
size_type __bkt,
__node_base *__prev_n, // previous of the erased node
__node_type *__n // Erased node
) {
if (__prev_n == _M_buckets[__bkt]) {
_M_remove_bucket_begin(__bkt, __n->_M_next(), __n->_M_nxt ? _M_bucket_index(__n->_M_next()) : 0);
} else if (__n->_M_nxt) {
size_type __next_bkt = _M_bucket_index(__n->_M_next());
if (__next_bkt != __bkt) {
_M_buckets[__next_bkt] = __prev_n;
}
}
// Unlink the node from its predecessor
__prev_n->_M_nxt = __n->_M_nxt;
// Build the iterator to return
iterator __result(__n->_M_next());
// Deallocate __n
this->_M_deallocate_node(__n);
--_M_element_count;
return __result;
}
void _HashTable::_M_remove_bucket_begin(
size_type __bkt,
__node_type *__next,
size_type __next_bkt) {
// clean the _M_buckets[__bkt]
// if the node is the last one in this bucket
if (!__next || __next_bkt != __bkt) {
if (__next) {
_M_buckets[__next_bkt] = _M_buckets[__bkt];
}
if (&_M_before_begin == _M_buckets[__bkt]) {
_M_before_begin._M_nxt = __next;
}
_M_buckets[__bkt] = nullptr;
}
}
```
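Usage sketch: erasing by key returns the number of removed elements, which is 0 or 1 for `unordered_set`:
```cpp=
#include <iostream>
#include <unordered_set>

int main() {
    std::unordered_set<int> s{1, 5, 6, 9};
    std::cout << s.erase(1) << '\n';  // 1: the key was present and has been removed
    std::cout << s.erase(1) << '\n';  // 0: nothing left to erase
    std::cout << s.size() << '\n';    // 3
}
```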
----
erase(1) and erase(5)

----
erase(1)

----
erase(1)

----
erase(1)

----
erase(5)

----
erase(5)

----
erase(5)

---
### clear
```cpp=
void clear() noexcept {
this->_M_deallocate_nodes(_M_begin());
__builtin_memset(_M_buckets, 0, _M_bucket_count * sizeof(__bucket_type));
_M_element_count = 0;
_M_before_begin._M_nxt = nullptr;
}
```
----
clear

----
clear
* Complexity is O(n), where n is the element count
----
clear
* Actually, the complexity is
* std::max(O(element_count), O(bucket_count))
* the memset over the bucket array is O(bucket_count)
* [Clear is slower than erase](https://shininglionking.blogspot.com/2019/04/c-deep-into-hash-map-in-stl.html)
* Be careful when the bucket count is large but the element count is small
```cpp=
std::unordered_set<int> s; s.reserve(99999999); // reserve a huge bucket array
while(...) {
s.insert(...); // only insert one element per iteration
// ...
s.clear(); // clear the unordered_set: O(bucket_count) every time
}
```
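One hedged workaround for that pattern (an alternative suggested by the linked "clear is slower than erase" comparison, not something taken from libstdc++): erase the live elements one by one, which is O(size()) on average and does not sweep the whole bucket array the way clear()'s memset does.
```cpp=
#include <unordered_set>

// Erase every element without memset-ing the (possibly huge) bucket array.
void erase_all(std::unordered_set<int> &s) {
    for (auto it = s.begin(); it != s.end(); )
        it = s.erase(it);
}
```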
---
### Extract (C++17)
* Extract the node_handle from unordered_<set/map>
* Reinsert the node_handle without extra allocation
```cpp=
int main()
{
std::unordered_map<int, char> cont{{1, 'a'}, {2, 'b'}, {3, 'c'}};
auto nh = cont.extract(1);
nh.key() = 4;
cont.insert(std::move(nh));
}
/*
Start: 1(a) 2(b) 3(c)
End: 2(b) 3(c) 4(a)
*/
```
----
node_handle
```cpp=
node_type extract(const key_type &__key);
using node_type = _Node_handle<_Key, _Value, __node_alloc_type>;
struct _Node_handle {
__node_type* _M_ptr;
__hashtable_alloc* _M_alloc;
};
```
----
extract(9)

----
extract(9)

---
### Merge (C++17)
* Moves the key-value pairs from ==source== into ==destination== for keys that do not already exist in ==destination==.
```cpp=
// void merge( std::unordered_map<Key, T, H2, P2, Allocator>& source );
int main()
{
std::unordered_map<std::string, int>
destination{ {"C", 3}, {"B", 2}, {"A", 1} },
source{ {"E", 6}, {"D", 5}, {"A", 4} };
destination.merge(source);
}
/*
Before merge:
destination: [3] { {A, 1}, {B, 2}, {C, 3} }
source: [3] { {A, 4}, {D, 5}, {E, 6} }
After merge:
destination: [5] { {E, 6}, {D, 5}, {A, 1}, {B, 2}, {C, 3} }
source: [1] { {A, 4} }
*/
```
----
merge
```cpp=
void _M_merge_unique(_Compatible_Hashtable &__src) noexcept {
static_assert(is_same_v<typename _Compatible_Hashtable::node_type, node_type>, "Node types are compatible");
// the allocators must compare equal
__glibcxx_assert(get_allocator() == __src.get_allocator());
// __n_elt is the number of elements in __src;
// passing it lets _M_insert_unique_node size the table once instead of rehashing repeatedly
auto __n_elt = __src.size();
// iterate over all nodes in __src
for (auto __i = __src.begin(),
__end = __src.end();
__i != __end;)
{
auto __pos = __i++;
// calculate __bkt
const key_type &__k = this->_M_extract()(__pos._M_cur->_M_v());
__hash_code __code = this->_M_hash_code(__k);
size_type __bkt = _M_bucket_index(__k, __code);
// extract and insert if the key does not already exist in *this
if (_M_find_node(__bkt, __k, __code) == nullptr) {
auto __nh = __src.extract(__pos);
_M_insert_unique_node(__bkt, __code, __nh._M_ptr, __n_elt);
__nh._M_ptr = nullptr;
__n_elt = 1;
} else if (__n_elt != 1) {
--__n_elt;
}
}
}
```
---
### Q&A
---
{"breaks":true,"description":"Handle hash collision","slideOptions":"{\"theme\":\"white\"}","title":"std::unordered_set / std::unordered_map","contributors":"[{\"id\":\"09379b25-db04-47a4-8912-78e722b7a548\",\"add\":26848,\"del\":7354}]"}