From df2c27eb486383a291dfe1db3dfda51f830f59c5 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Thu, 2 Jul 2020 22:33:52 +0000
Subject: [PATCH 1/2] stage1 HashMap: store hash & do robin hood hashing

This adds these two fields to a HashMap Entry:

    uint32_t hash
    uint32_t distance_from_start_index

Compared to the master branch, standard library tests compiled 8.4% faster
and used a negligible amount (0.001%) more memory to complete. The amount
of memory used is still down from before 8b82c4010480, which moved indexes
to be stored separately from entries.

So it turns out that keeping robin hood hashing plus separating the indexes
did result in a performance improvement. What happened previously is that
the gains from separating indexes balanced out the losses from removing
robin hood hashing, resulting in a wash. This also serves as inspiration
for adding a benchmark to std.AutoHashMap and improving that implementation.
---
 src/hash_map.hpp | 129 ++++++++++++++++++++++++++++++++++++-----------
 src/list.hpp     |   3 ++
 2 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/src/hash_map.hpp b/src/hash_map.hpp
index 45440d61e7..ce5369eac8 100644
--- a/src/hash_map.hpp
+++ b/src/hash_map.hpp
@@ -25,6 +25,8 @@ public:
     }
 
     struct Entry {
+        uint32_t hash;
+        uint32_t distance_from_start_index;
         K key;
         V value;
     };
@@ -56,21 +58,24 @@ public:
                 Entry *entry = &_entries.items[i];
                 switch (sz) {
                     case 1:
-                        put_index(key_to_index(entry->key), i, (uint8_t*)_index_bytes);
+                        put_index(entry, i, (uint8_t*)_index_bytes);
                         continue;
                     case 2:
-                        put_index(key_to_index(entry->key), i, (uint16_t*)_index_bytes);
+                        put_index(entry, i, (uint16_t*)_index_bytes);
                         continue;
                     case 4:
-                        put_index(key_to_index(entry->key), i, (uint32_t*)_index_bytes);
+                        put_index(entry, i, (uint32_t*)_index_bytes);
                         continue;
                     default:
-                        put_index(key_to_index(entry->key), i, (size_t*)_index_bytes);
+                        put_index(entry, i, (size_t*)_index_bytes);
                         continue;
                 }
             }
         }
 
+        // This allows us to take a pointer to an entry in `internal_put` which
+        // will not become a dead pointer when the array list is appended.
+        _entries.ensure_capacity(_entries.length + 1);
+
         switch (capacity_index_size(_indexes_len)) {
             case 1: return internal_put(key, value, (uint8_t*)_index_bytes);
@@ -187,42 +192,99 @@ private:
 
     template<typename I>
     void internal_put(const K &key, const V &value, I *indexes) {
-        size_t start_index = key_to_index(key);
-        for (size_t roll_over = 0, distance_from_start_index = 0;
-            roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1)
+        uint32_t hash = HashFunction(key);
+        uint32_t distance_from_start_index = 0;
+        size_t start_index = hash_to_index(hash);
+        for (size_t roll_over = 0; roll_over < _indexes_len;
+            roll_over += 1, distance_from_start_index += 1)
         {
             size_t index_index = (start_index + roll_over) % _indexes_len;
             I index_data = indexes[index_index];
             if (index_data == 0) {
-                _entries.append({key, value});
+                _entries.append_assuming_capacity({ hash, distance_from_start_index, key, value });
                 indexes[index_index] = _entries.length;
                 if (distance_from_start_index > _max_distance_from_start_index)
                     _max_distance_from_start_index = distance_from_start_index;
                 return;
            }
+            // This pointer survives the following append because we call
+            // _entries.ensure_capacity before internal_put.
             Entry *entry = &_entries.items[index_data - 1];
-            if (EqualFn(entry->key, key)) {
-                *entry = {key, value};
+            if (entry->hash == hash && EqualFn(entry->key, key)) {
+                *entry = {hash, distance_from_start_index, key, value};
                 if (distance_from_start_index > _max_distance_from_start_index)
                     _max_distance_from_start_index = distance_from_start_index;
                 return;
             }
+            if (entry->distance_from_start_index < distance_from_start_index) {
+                // In this case, we did not find the item. We will put a new entry.
+                // However, we will use this index for the new entry, and move
+                // the previous index down the line, to keep the _max_distance_from_start_index
+                // as small as possible.
+                _entries.append_assuming_capacity({ hash, distance_from_start_index, key, value });
+                indexes[index_index] = _entries.length;
+                if (distance_from_start_index > _max_distance_from_start_index)
+                    _max_distance_from_start_index = distance_from_start_index;
+
+                distance_from_start_index = entry->distance_from_start_index;
+
+                // Find somewhere to put the index we replaced by shifting
+                // following indexes backwards.
+                roll_over += 1;
+                distance_from_start_index += 1;
+                for (; roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1) {
+                    size_t index_index = (start_index + roll_over) % _indexes_len;
+                    I next_index_data = indexes[index_index];
+                    if (next_index_data == 0) {
+                        if (distance_from_start_index > _max_distance_from_start_index)
+                            _max_distance_from_start_index = distance_from_start_index;
+                        entry->distance_from_start_index = distance_from_start_index;
+                        indexes[index_index] = index_data;
+                        return;
+                    }
+                    Entry *next_entry = &_entries.items[next_index_data - 1];
+                    if (next_entry->distance_from_start_index < distance_from_start_index) {
+                        if (distance_from_start_index > _max_distance_from_start_index)
+                            _max_distance_from_start_index = distance_from_start_index;
+                        entry->distance_from_start_index = distance_from_start_index;
+                        indexes[index_index] = index_data;
+                        distance_from_start_index = next_entry->distance_from_start_index;
+                        entry = next_entry;
+                        index_data = next_index_data;
+                    }
+                }
+                zig_unreachable();
+            }
         }
         zig_unreachable();
     }
 
     template<typename I>
-    void put_index(size_t start_index, size_t entry_index, I *indexes) {
+    void put_index(Entry *entry, size_t entry_index, I *indexes) {
+        size_t start_index = hash_to_index(entry->hash);
+        size_t index_data = entry_index + 1;
         for (size_t roll_over = 0, distance_from_start_index = 0;
             roll_over < _indexes_len; roll_over += 1, distance_from_start_index += 1)
         {
             size_t index_index = (start_index + roll_over) % _indexes_len;
-            if (indexes[index_index] == 0) {
-                indexes[index_index] = entry_index + 1;
+            size_t next_index_data = indexes[index_index];
+            if (next_index_data == 0) {
                 if (distance_from_start_index > _max_distance_from_start_index)
                     _max_distance_from_start_index = distance_from_start_index;
+                entry->distance_from_start_index = distance_from_start_index;
+                indexes[index_index] = index_data;
                 return;
             }
+            Entry *next_entry = &_entries.items[next_index_data - 1];
+            if (next_entry->distance_from_start_index < distance_from_start_index) {
+                if (distance_from_start_index > _max_distance_from_start_index)
+                    _max_distance_from_start_index = distance_from_start_index;
+                entry->distance_from_start_index = distance_from_start_index;
+                indexes[index_index] = index_data;
+                distance_from_start_index = next_entry->distance_from_start_index;
+                entry = next_entry;
+                index_data = next_index_data;
+            }
         }
         zig_unreachable();
     }
@@ -238,7 +300,8 @@ private:
 
     template<typename I>
     Entry *internal_get2(const K &key, I *indexes) const {
-        size_t start_index = key_to_index(key);
+        uint32_t hash = HashFunction(key);
+        size_t start_index = hash_to_index(hash);
         for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
             size_t index_index = (start_index + roll_over) % _indexes_len;
             size_t index_data = indexes[index_index];
@@ -246,19 +309,20 @@ private:
                 return nullptr;
 
             Entry *entry = &_entries.items[index_data - 1];
-            if (EqualFn(entry->key, key))
+            if (entry->hash == hash && EqualFn(entry->key, key))
                 return entry;
         }
         return nullptr;
     }
 
-    size_t key_to_index(const K &key) const {
-        return ((size_t)HashFunction(key)) % _indexes_len;
+    size_t hash_to_index(uint32_t hash) const {
+        return ((size_t)hash) % _indexes_len;
     }
 
     template<typename I>
     bool internal_remove(const K &key, I *indexes) {
-        size_t start_index = key_to_index(key);
+        uint32_t hash = HashFunction(key);
+        size_t start_index = hash_to_index(hash);
         for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
             size_t index_index = (start_index + roll_over) % _indexes_len;
             size_t index_data = indexes[index_index];
@@ -267,10 +331,10 @@ private:
 
             size_t index = index_data - 1;
             Entry *entry = &_entries.items[index];
-            if (!EqualFn(entry->key, key))
+            if (entry->hash != hash || !EqualFn(entry->key, key))
                 continue;
 
-            indexes[index_index] = 0;
+            size_t prev_index = index_index;
             _entries.swap_remove(index);
             if (_entries.length > 0 && _entries.length != index) {
                 // Because of the swap remove, now we need to update the index that was
@@ -280,24 +344,29 @@ private:
 
             // Now we have to shift over the following indexes.
             roll_over += 1;
-            for (; roll_over <= _max_distance_from_start_index; roll_over += 1) {
+            for (; roll_over < _indexes_len; roll_over += 1) {
                 size_t next_index = (start_index + roll_over) % _indexes_len;
-                if (indexes[next_index] == 0)
-                    break;
-                size_t next_start_index = key_to_index(_entries.items[indexes[next_index]].key);
-                if (next_start_index != start_index)
-                    break;
-                indexes[next_index - 1] = indexes[next_index];
+                if (indexes[next_index] == 0) {
+                    indexes[prev_index] = 0;
+                    return true;
+                }
+                Entry *next_entry = &_entries.items[indexes[next_index] - 1];
+                if (next_entry->distance_from_start_index == 0) {
+                    indexes[prev_index] = 0;
+                    return true;
+                }
+                indexes[prev_index] = indexes[next_index];
+                prev_index = next_index;
+                next_entry->distance_from_start_index -= 1;
             }
-
-            return true;
+            zig_unreachable();
         }
         return false;
     }
 
     template<typename I>
     void update_entry_index(size_t old_entry_index, size_t new_entry_index, I *indexes) {
-        size_t start_index = key_to_index(_entries.items[new_entry_index].key);
+        size_t start_index = hash_to_index(_entries.items[new_entry_index].hash);
         for (size_t roll_over = 0; roll_over <= _max_distance_from_start_index; roll_over += 1) {
             size_t index_index = (start_index + roll_over) % _indexes_len;
             if (indexes[index_index] == old_entry_index + 1) {
diff --git a/src/list.hpp b/src/list.hpp
index 2c6d90c855..803a251437 100644
--- a/src/list.hpp
+++ b/src/list.hpp
@@ -19,6 +19,9 @@ struct ZigList {
         ensure_capacity(length + 1);
         items[length++] = item;
     }
+    void append_assuming_capacity(const T& item) {
+        items[length++] = item;
+    }
     // remember that the pointer to this item is invalid after you
     // modify the length of the list
     const T & at(size_t index) const {
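The robin hood rule that the first patch reinstates is easiest to see in isolation. The sketch below is illustrative only: it uses a simplified flat table of slots rather than the stage1 HashMap's entry list plus separate index array, uses plain int keys instead of the templated K/V and HashFunction/EqualFn parameters, and all of its names (Slot, hash_int, robin_hood_put) are hypothetical. What it demonstrates is the probe loop that stores the hash, tracks distance_from_start_index, and lets an incoming entry steal a slot whose resident is closer to its ideal position.

    // Minimal robin hood insert sketch (not the stage1 implementation).
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    struct Slot {
        bool used;
        uint32_t hash;
        uint32_t distance_from_start_index; // probes away from hash % capacity
        int key;
        int value;
    };

    static uint32_t hash_int(int key) {
        // Any hash works for the sketch; this is a Knuth-style multiplier.
        return (uint32_t)key * 2654435761u;
    }

    static void robin_hood_put(std::vector<Slot> &slots, int key, int value) {
        uint32_t hash = hash_int(key);
        Slot incoming = {true, hash, 0, key, value};
        size_t index = hash % slots.size();
        for (size_t probes = 0; probes < slots.size(); probes += 1) {
            Slot &slot = slots[index];
            if (!slot.used) {
                // Empty slot: the incoming entry lands here with its current distance.
                slot = incoming;
                return;
            }
            if (slot.hash == incoming.hash && slot.key == incoming.key) {
                // Same key: overwrite the value in place.
                slot.value = incoming.value;
                return;
            }
            // Robin hood rule: if the resident is closer to its start index than
            // the incoming entry, swap them and keep inserting the evicted one.
            if (slot.distance_from_start_index < incoming.distance_from_start_index)
                std::swap(slot, incoming);
            incoming.distance_from_start_index += 1;
            index = (index + 1) % slots.size();
        }
        // Unreachable as long as the table never becomes completely full.
    }

    int main() {
        std::vector<Slot> table(16); // value-initialized, so every slot starts unused
        robin_hood_put(table, 1, 10);
        robin_hood_put(table, 17, 20); // collides with key 1 at capacity 16
        robin_hood_put(table, 1, 11);  // overwrites the existing entry
        for (const Slot &s : table) {
            if (s.used)
                printf("key=%d value=%d distance=%u\n", s.key, s.value, s.distance_from_start_index);
        }
        return 0;
    }

Because a displaced resident simply continues probing as the new incoming entry, the maximum probe distance stays small, which is what lets lookups stop scanning at _max_distance_from_start_index in the real HashMap.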
From 22f0a103c39f84140ee1fbfe2bffed5fcec19a26 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Fri, 3 Jul 2020 03:49:03 +0000
Subject: [PATCH 2/2] stage1 HashMap: linear scan for < 16 entries

---
 src/hash_map.hpp | 58 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 9 deletions(-)

diff --git a/src/hash_map.hpp b/src/hash_map.hpp
index ce5369eac8..8681e5b761 100644
--- a/src/hash_map.hpp
+++ b/src/hash_map.hpp
@@ -45,6 +45,26 @@ public:
     void put(const K &key, const V &value) {
         _modification_count += 1;
 
+        // This allows us to take a pointer to an entry in `internal_put` which
+        // will not become a dead pointer when the array list is appended.
+        _entries.ensure_capacity(_entries.length + 1);
+
+        if (_index_bytes == nullptr) {
+            if (_entries.length < 16) {
+                _entries.append({HashFunction(key), 0, key, value});
+                return;
+            } else {
+                _indexes_len = 32;
+                _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len);
+                _max_distance_from_start_index = 0;
+                for (size_t i = 0; i < _entries.length; i += 1) {
+                    Entry *entry = &_entries.items[i];
+                    put_index(entry, i, _index_bytes);
+                }
+                return internal_put(key, value, _index_bytes);
+            }
+        }
+
         // if we would get too full (60%), double the indexes size
         if ((_entries.length + 1) * 5 >= _indexes_len * 3) {
             heap::c_allocator.deallocate(_index_bytes,
@@ -73,10 +93,6 @@ public:
             }
         }
 
-        // This allows us to take a pointer to an entry in `internal_put` which
-        // will not become a dead pointer when the array list is appended.
-        _entries.ensure_capacity(_entries.length + 1);
-
         switch (capacity_index_size(_indexes_len)) {
             case 1: return internal_put(key, value, (uint8_t*)_index_bytes);
             case 2: return internal_put(key, value, (uint16_t*)_index_bytes);
@@ -114,6 +130,16 @@ public:
     bool maybe_remove(const K &key) {
         _modification_count += 1;
 
+        if (_index_bytes == nullptr) {
+            uint32_t hash = HashFunction(key);
+            for (size_t i = 0; i < _entries.length; i += 1) {
+                if (_entries.items[i].hash == hash && EqualFn(_entries.items[i].key, key)) {
+                    _entries.swap_remove(i);
+                    return true;
+                }
+            }
+            return false;
+        }
         switch (capacity_index_size(_indexes_len)) {
             case 1: return internal_remove(key, (uint8_t*)_index_bytes);
             case 2: return internal_remove(key, (uint16_t*)_index_bytes);
@@ -170,11 +196,16 @@ private:
     void init_capacity(size_t capacity) {
         _entries = {};
         _entries.ensure_capacity(capacity);
-        // So that at capacity it will only be 60% full.
-        _indexes_len = capacity * 5 / 3;
-        size_t sz = capacity_index_size(_indexes_len);
-        // This zero initializes _index_bytes which sets them all to empty.
-        _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len * sz);
+        _indexes_len = 0;
+        if (capacity >= 16) {
+            // So that at capacity it will only be 60% full.
+            _indexes_len = capacity * 5 / 3;
+            size_t sz = capacity_index_size(_indexes_len);
+            // This zero initializes _index_bytes which sets them all to empty.
+            _index_bytes = heap::c_allocator.allocate<uint8_t>(_indexes_len * sz);
+        } else {
+            _index_bytes = nullptr;
+        }
 
         _max_distance_from_start_index = 0;
         _modification_count = 0;
@@ -290,6 +321,15 @@ private:
     }
 
     Entry *internal_get(const K &key) const {
+        if (_index_bytes == nullptr) {
+            uint32_t hash = HashFunction(key);
+            for (size_t i = 0; i < _entries.length; i += 1) {
+                if (_entries.items[i].hash == hash && EqualFn(_entries.items[i].key, key)) {
+                    return &_entries.items[i];
+                }
+            }
+            return nullptr;
+        }
         switch (capacity_index_size(_indexes_len)) {
             case 1: return internal_get2(key, (uint8_t*)_index_bytes);
             case 2: return internal_get2(key, (uint16_t*)_index_bytes);
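The second patch keeps small maps entirely in the entry array: below 16 entries there is no index table at all, and get/maybe_remove fall back to a linear scan that compares the stored 32-bit hash before calling EqualFn. The following standalone sketch shows only that linear-scan side of the idea; the threshold where the real HashMap allocates its index array is marked with a comment, and every name in it (SmallMap, SmallMapEntry, hash_str, linear_scan_limit) is hypothetical rather than taken from the patch.

    // Minimal "small map" linear-scan sketch (not the stage1 implementation).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct SmallMapEntry {
        uint32_t hash;
        const char *key;
        int value;
    };

    class SmallMap {
    public:
        static constexpr size_t linear_scan_limit = 16;

        void put(const char *key, int value) {
            uint32_t hash = hash_str(key);
            if (SmallMapEntry *existing = find(hash, key)) {
                existing->value = value; // existing key: overwrite in place
                return;
            }
            if (entries_.size() == linear_scan_limit) {
                // This is where the real stage1 HashMap allocates its index array
                // and re-indexes the entries; the sketch just keeps scanning.
            }
            entries_.push_back(SmallMapEntry{hash, key, value});
        }

        int *get(const char *key) {
            SmallMapEntry *entry = find(hash_str(key), key);
            return entry ? &entry->value : nullptr;
        }

        bool maybe_remove(const char *key) {
            uint32_t hash = hash_str(key);
            for (size_t i = 0; i < entries_.size(); i += 1) {
                if (entries_[i].hash == hash && strcmp(entries_[i].key, key) == 0) {
                    entries_[i] = entries_.back(); // swap-remove keeps the array dense
                    entries_.pop_back();
                    return true;
                }
            }
            return false;
        }

    private:
        SmallMapEntry *find(uint32_t hash, const char *key) {
            // Compare the stored hash first so a mismatch is a cheap integer test.
            for (SmallMapEntry &entry : entries_) {
                if (entry.hash == hash && strcmp(entry.key, key) == 0)
                    return &entry;
            }
            return nullptr;
        }

        static uint32_t hash_str(const char *s) {
            uint32_t h = 2166136261u; // FNV-1a, just to have a concrete hash
            for (; *s != 0; s += 1) {
                h ^= (uint8_t)*s;
                h *= 16777619u;
            }
            return h;
        }

        std::vector<SmallMapEntry> entries_;
    };

    int main() {
        SmallMap map;
        map.put("len", 3);
        map.put("cap", 8);
        if (int *v = map.get("len"))
            printf("len=%d\n", *v);
        printf("removed cap: %d\n", (int)map.maybe_remove("cap"));
        return 0;
    }

The reason storing the hash in each entry pays off even for the linear scan is that the common negative comparison becomes a single integer compare; the full key comparison only runs when the hashes already match.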