.section .text
// This is an implementation of hashtables based on Google's Swiss table
// hashtables, using
// ["Designing a Fast, Efficient, Cache-friendly Hash Table, Step by Step"](https://www.youtube.com/watch?v=ncHmEUmJZf4)
// as a reference.
//
// Because entries are fixed at 8 bytes, this also implements the
// "cache-line-aware groups" optimization described at 47:25 of that talk:
// each group packs its metadata and seven values into one 64-byte cache line.
//
// A hashtable is a region laid out as:
//
// +-------------------+-------------------+ 0x00
// | Group Pointer | log2(Group Count) |
// +-------------------+-------------------+ 0x10
// | Length | Capacity |
// +-------------------+-------------------+ 0x20
// | Alloc Function | Free Function |
// +-------------------+-------------------+ 0x30
// | Equals Function | Hash Function |
// +-------------------+-------------------+ 0x40
//
// All the functions defined in this file take hashtables by reference, so they
// take a pointer to the above.
//
// The signatures of the Alloc, Free, Equals, and Hash functions are described
// in the documentation for the `hashtable_init` procedure.
//
// The Group Pointer points to an array of (1 << log2(Group Count)) hash
// groups, which are laid out as:
//
// 0x00 0x08 0x40
// +-+-+-+-+-+-+-+-+----- ... -----+
// | |H|H|H|H|H|H|H| Values |
// +-+-+-+-+-+-+-+-+----- ... -----+
// | \
// | +----------+
// | \
// +-+-+-+-+-+-+-+-+
// |F|P|P|P|P|P|P|P|
// +-+-+-+-+-+-+-+-+
// 0x00 0x01
//
// - `H`: The high 8 bits of the hash of the value in the corresponding slot.
// - `P`: A flag for whether the corresponding slot holds a value. The `P` bit
//   for slot `i` is bit `i` of the flags byte.
// - `F`: A flag for whether the group has ever been full since the last
//   resize, i.e. whether an insert has ever spilled past this group. This is
//   bit 0 of the flags byte.
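//
// For illustration only, the header and a group could be described in C
// roughly as below. This is a sketch: the struct and field names are invented
// for this comment and appear nowhere else in this file.
//
//     #include <stddef.h>
//
//     struct hashtable_group {
//         unsigned char flags;          // 0x00: F (bit 0) and P (bits 1-7)
//         unsigned char hashes[7];      // 0x01-0x07: H bytes for slots 1-7
//         unsigned long long values[7]; // 0x08-0x3f: values for slots 1-7
//     };
//
//     struct hashtable {
//         struct hashtable_group* groups;      // 0x00: Group Pointer
//         unsigned long long log2_group_count; // 0x08
//         unsigned long long length;           // 0x10
//         unsigned long long capacity;         // 0x18
//         void* (*alloc)(size_t);              // 0x20
//         void (*free)(void*, size_t);         // 0x28
//         unsigned long long (*equals)(unsigned long long,
//                                      unsigned long long); // 0x30
//         unsigned long long (*hash)(unsigned long long);   // 0x38
//     };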
/**
* Initializes a hashtable.
*
* ## Inputs
*
* - `x0`: A pointer to the area to initialize as a hashtable.
* - `x1`: log2() of the initial number of groups.
* - `x2`: The `alloc` function. See below for details.
* - `x3`: The `free` function. See below for details.
* - `x4`: The `equals` function. See below for details.
* - `x5`: The `hash` function. See below for details.
*
* ## Notes
*
* All of the functions must obey the
* [Procedure Call Standard for the ARM 64-bit Architecture (AArch64)](https://github.com/ARM-software/abi-aa/blob/78b0cd76d3b49f8212be1945e8ba10b15c7cf7e0/aapcs64/aapcs64.rst).
*
* In C-like syntax, the functions must have the given signatures:
*
* - `void* alloc(size_t size);`
* - `void free(void* ptr, size_t size);`
* - `unsigned long long equals(unsigned long long x, unsigned long long y);`
* - `unsigned long long hash(unsigned long long value);`
*
* Furthermore:
*
 * - The `alloc` function will always be called with a size that is a multiple
 *   of 64, and must always return a valid, 64-byte-aligned pointer to a
 *   zeroed region. (A wrapper around `posix_memalign` and `memset` is
 *   recommended on Unix systems; see the sketch after this list.)
* - The `free` function will be passed the address and size of an allocation
* returned by `alloc`, and should free it.
* - The `equals` function must be a reflexive and symmetric relation that
* returns `-1` when true and `0` when false.
* - The `hash` function should mix entropy well into high bits, and must
* respect `equals`; that is, `equals(x, y) == -1` implies
* `hash(x) == hash(y)`.
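 *
 * As an illustration, `alloc` and `free` wrappers satisfying these rules
 * might look like the following C sketch on a Unix system (the names
 * `table_alloc` and `table_free` are invented here):
 *
 *     #include <stdlib.h>
 *     #include <string.h>
 *
 *     void* table_alloc(size_t size) {
 *         void* ptr = NULL;
 *         // posix_memalign returns a 64-byte-aligned region, or fails.
 *         if (posix_memalign(&ptr, 64, size) != 0)
 *             return NULL;
 *         // The contract requires the returned region to be zeroed.
 *         return memset(ptr, 0, size);
 *     }
 *
 *     void table_free(void* ptr, size_t size) {
 *         (void)size; // free() does not need the size.
 *         free(ptr);
 *     }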
*/
.global hashtable_init
.p2align 6
hashtable_init:
// Make a frame and save some callee-save registers.
stp fp, lr, [sp, #-0x40]!
stp x20, x21, [sp, #0x10]
stp x22, x23, [sp, #0x20]
stp x24, x25, [sp, #0x30]
mov fp, sp
// If log2 of the initial number of groups is 50 or above, bail out.
cmp x1, #50
b.hs hashtable_init.too_many_groups
// Save the arguments to callee-save registers.
mov x20, x0
mov x21, x1
mov x22, x2
mov x23, x3
mov x24, x4
mov x25, x5
// Allocate 64 bytes per group.
mov x6, #64
lsl x0, x6, x1
blr x2
cbz x0, hashtable_init.oom
// Store the saved arguments to the table.
stp x0, x21, [x20]
stp x22, x23, [x20, #0x20]
stp x24, x25, [x20, #0x30]
// Compute the capacity and store it, along with an initial length of
// zero. We use a maximum load factor of 15/16 (93.75%), giving a
// capacity of 7 * group count * 15/16 = 105 * group count / 16; this
// is close to the recommended 95% load factor while being much
// cheaper to compute.
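// For example, with a single group (x21 == 0), this computes
// (105 << 0) >> 4 == 6: six of the group's seven slots are usable.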
mov x6, #105
lsl x6, x6, x21
lsr x6, x6, #4
stp xzr, x6, [x20, #0x10]
// Reload the callee-save registers, clean up the stack frame, and return.
ldp x20, x21, [sp, #0x10]
ldp x22, x23, [sp, #0x20]
ldp x24, x25, [sp, #0x30]
ldp fp, lr, [sp], #0x40
ret
hashtable_init.too_many_groups:
adr x0, hashtable_init.too_many_groups.msg
mov x1, hashtable_init.too_many_groups.len
bl panic
udf #0
hashtable_init.oom:
adr x0, hashtable_init.oom.msg
mov x1, hashtable_init.oom.len
bl panic
udf #0
/**
* Cleans up a hashtable. Note that this does not free the values of the table.
*
* ## Inputs
*
* - `x0`: The pointer to the hashtable.
*/
.global hashtable_destroy
.p2align 6
hashtable_destroy:
// Load the free function.
ldr x2, [x0, #0x28]
// Load the group pointer and compute the total size of the groups
// (64 bytes per group, matching the size passed to the alloc function).
ldp x0, x1, [x0]
mov x3, #64
lsl x1, x3, x1
// Tail-call the free function.
br x2
/**
* Finds a value in a hashtable.
*
* ## Inputs
*
* - `x0`: The pointer to the hashtable.
* - `x1`: The value to search for.
*
* ## Outputs
*
 * - `x0`: The address of the group the value is in (if it was found), or
 *   could be placed in (if it was not found).
* - `x1`: The 1-based index of the value within the group if it was found, or
* 0 if it was not.
* - `x2`: The hash of the value.
*
* ## Notes
*
 * If the value was not found, the group at `x0` is guaranteed not to be full.
*/
.global hashtable_find_internal
.p2align 6
hashtable_find_internal:
// Make a frame and save some callee-save registers.
stp fp, lr, [sp, #-0x50]!
stp x20, x21, [sp, #0x10]
stp x22, x23, [sp, #0x20]
stp x24, x25, [sp, #0x30]
stp x26, x27, [sp, #0x40]
mov fp, sp
// Save the arguments to callee-save registers.
mov x20, x0
mov x21, x1
// Hash the element and store the hash in x22.
ldr x1, [x20, #0x38]
mov x0, x21
blr x1
mov x22, x0
// Create a mask in x23 covering the hash bits *not* used for group
// indexing, and clear those bits from the hash to get the index of
// the first group to check, stored in x24. At the same time (to save
// a cycle in the common case), load the group pointer into x2.
ldp x2, x0, [x20]
mov x1, #-1
lsl x23, x1, x0
bic x24, x22, x23
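// For example, with 8 groups (log2(Group Count) == 3), x23 is ~0x7
// and x24 is hash & 0x7.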
// We store the high byte of the hash in every byte of v0. This will
// allow us to perform very fast and loop-free checks for the hash
// within a group.
lsr x0, x22, #56
dup v0.16b, w0
// The outer loop traverses the groups, visiting every one that could
// contain values whose low hash bits match.
hashtable_find_internal.outer_loop:
// Store the address of the group in x25.
add x25, x2, x24, lsl #6
// Read the metadata into v1, and compare it against the hash bytes
// from v0, storing the result of the comparison in x26.
ldr x1, [x25]
mov v1.d[0], x1
cmeq v1.8b, v0.8b, v1.8b
mov x26, v1.d[0]
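// As a worked example: if the high hash byte is 0x5c and the group's
// metadata bytes are 0a 5c 00 5c 00 00 00 00 (slots 1 and 3 present
// and matching), the comparison yields x26 == 0x00000000ff00ff00.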
// The inner loop checks the current group for values whose high hash
// byte matches.
hashtable_find_internal.inner_loop:
// Find the highest match. We do this by counting the leading zeros,
// dividing by 8 to get the number of non-matching leading bytes, and
// subtracting that from 7. x27 is then the match index.
clz x2, x26
lsr x2, x2, #3
mov x3, #7
sub x27, x3, x2
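// Continuing the worked example, x26 == 0x00000000ff00ff00 gives
// clz == 32, so x27 == 7 - 4 == 3: slot 3 is checked first.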
// If x27 is zero, only the metadata byte matched; if it is negative
// one, no bytes matched. Either way, the element isn't in this hash
// group.
cmp x27, #0
b.le hashtable_find_internal.not_in_hash_group
// Then check that the present bit is actually set for that slot; if
// not, try the next one.
mov x0, #1
lsl x0, x0, x27
tst x0, x1
b.eq hashtable_find_internal.next_slot
// Otherwise, load the value into x1 and original value into x0, then
// call the equals function.
ldr x0, [x25, x27, lsl #3]
mov x1, x21
ldr x2, [x20, #0x30]
blr x2
// If it returned false, we should go try the next slot.
cbz x0, hashtable_find_internal.next_slot
// If it returned true, we can return this element, so we load the
// return values into the appropriate registers.
mov x0, x25
mov x1, x27
mov x2, x22
hashtable_find_internal.end:
// Reload the callee-save registers, clean up the stack frame, and return.
ldp x20, x21, [sp, #0x10]
ldp x22, x23, [sp, #0x20]
ldp x24, x25, [sp, #0x30]
ldp x26, x27, [sp, #0x40]
ldp fp, lr, [sp], #0x50
ret
hashtable_find_internal.next_slot:
// The post-comparison metadata is still in x26, and the just-failed
// match index is in x27. We construct a mask, remove the
// now-known-to-be-different slot from the post-comparison metadata,
// and run another iteration of the inner loop.
mov x0, #0xff
lsl x3, x27, #3
lsl x0, x0, x3
bic x26, x26, x0
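// Continuing the worked example, after slot 3 fails the equals check,
// x27 == 3 clears byte 3: x26 goes from 0x00000000ff00ff00 to
// 0x000000000000ff00, so slot 1 is tried next.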
b hashtable_find_internal.inner_loop
hashtable_find_internal.not_in_hash_group:
// We next check whether the search should continue. If the group's
// "has ever been full" bit is set, it should: an element could then
// have spilled into a later hash group. The metadata is still in x1,
// so we can test it directly from there.
tbnz x1, #0, hashtable_find_internal.next_hash_group
// Since it wasn't found, we're done.
mov x0, x25
mov x1, #0
mov x2, x22
b hashtable_find_internal.end
hashtable_find_internal.next_hash_group:
// At this point, the mask for the hash is in x23 and the index of the
// current group is in x24. We increment the index, then re-mask the
// result to get the new group index.
add x24, x24, #1
bic x24, x24, x23
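// For example, with 8 groups (x23 == ~0x7), group index 7 increments
// to 8 and wraps back to 0.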
// Reload the group pointer, and continue looping over groups.
ldr x2, [x20]
b hashtable_find_internal.outer_loop
/**
* Inserts a value into a hashtable, writing back any previous version.
*
* ## Inputs
*
* - `x0`: The pointer to the hashtable.
* - `x1`: The value to insert.
* - `x2`: A pointer to a location to write the old value to if a value was
* replaced, or null if the value shouldn't be written back.
*
* ## Outputs
*
* - `x0`: A flag for whether a value was replaced; `-1` if so, `0` if not.
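 *
 * ## Notes
 *
 * Since this procedure follows AAPCS64, it can be called from C. Below is a
 * sketch of a matching prototype and call, written here for illustration
 * (neither appears elsewhere in this file):
 *
 *     long long hashtable_insert(void* table, unsigned long long value,
 *                                unsigned long long* old_value);
 *
 *     unsigned long long old = 0;
 *     // Replaces an equal value if one is present, writing it to `old`.
 *     long long replaced = hashtable_insert(table, 42, &old);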
*/
.global hashtable_insert
.p2align 6
hashtable_insert:
// Make a frame and save some callee-save registers.
stp fp, lr, [sp, #-0x30]!
stp x20, x21, [sp, #0x10]
stp x22, x23, [sp, #0x20]
mov fp, sp
// Save the arguments to callee-save registers.
mov x20, x0
mov x21, x1
mov x22, x2
hashtable_insert.before_find:
// Find the existing item, or the place where an item would be inserted.
bl hashtable_find_internal
// Branch on whether a replacement is being performed or not.
cbz x1, hashtable_insert.new_element
// If the old-value location pointer was null, skip writing back to it.
// Otherwise, do the writeback.
cbz x22, hashtable_insert.after_writeback
ldr x2, [x0, x1, lsl #3]
str x2, [x22]
hashtable_insert.after_writeback:
// Now we can write the value to the hashtable.
str x21, [x0, x1, lsl #3]
// And finally, set the return value and jump to the end.
mov x0, #-1
b hashtable_insert.end
.p2align 6
hashtable_insert.new_element:
// At this point, x0 is the address of the group the value can be
// placed in, x1 is zero, and x2 is the hash.
// Check if a resize is required: if the incremented length would
// reach the capacity, resize and loop back to the beginning.
// Otherwise, write back the new length.
ldp x3, x4, [x20, #0x10]
add x3, x3, #1
cmp x3, x4
b.eq hashtable_insert.resize
str x3, [x20, #0x10]
hashtable_insert.find_free_slot:
// Load the metadata byte into x1, then find the highest free slot and
// write its index to x4: set bit 0 (the F flag) and every bit above
// bit 7, invert, and take 63 minus the count of leading zeros. The
// correctness of this bitwise math is justified in the docs as "Find
// Free Slot". If no slot is free, x4 will be 0 or -1; otherwise, it
// is the index of a free slot.
ldrb w1, [x0]
orr x3, x1, #0xffffffffffffff01
mvn x3, x3
clz x4, x3
mov x5, #63
sub x4, x5, x4
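// As a worked example, w1 == 0x17 (F set; slots 1, 2, and 4 present)
// gives x3 == 0xe8 after the inversion, so clz == 56 and
// x4 == 63 - 56 == 7: slot 7 is free.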
// Check if no slot was free, and if so advance to the next group.
cmp x4, #0
b.le hashtable_insert.next_group
// Write the high byte of the hash and the value out.
lsr x5, x2, #56
strb w5, [x0, x4]
str x21, [x0, x4, lsl #3]
// Set the appropriate present bit for the value.
mov x5, #1
lsl x5, x5, x4
orr x1, x1, x5
strb w1, [x0]
// Set the return flag to 0 (no value was replaced), and fall through
// to the end.
mov x0, #0
hashtable_insert.end:
// Reload the callee-save registers, clean up the stack frame, and return.
ldp x20, x21, [sp, #0x10]
ldp x22, x23, [sp, #0x20]
ldp fp, lr, [sp], #0x30
ret
hashtable_insert.next_group:
// First, we update the metadata byte of the group to indicate that
// we've spilled into the next group.
orr w1, w1, #1
strb w1, [x0]
// Next, get the group pointer and log2 of the group count, and
// convert x0 (the previous group address) back to an index.
ldp x6, x7, [x20]
sub x0, x0, x6
lsr x0, x0, #6
// Then, convert the log2 group count to an (inverted) index mask.
mov x8, #-1
lsl x7, x8, x7
// Finally, increment the group index, mask it to wrap around, and
// compute the new group address.
add x0, x0, #1
bic x0, x0, x7
add x0, x6, x0, lsl #6
// We can then go back to trying to insert.
b hashtable_insert.find_free_slot
hashtable_insert.resize:
// TODO check for correctness
mov x7, #0x0606060606060606
bl todo
// Do the resize.
mov x0, x20
bl hashtable_grow
// We then retry the insert, which is now guaranteed to succeed. To do
// so, we restore the arguments to hashtable_find_internal and jump
// back to its call site.
mov x0, x20
mov x1, x21
b hashtable_insert.before_find
/**
* Grows a hashtable, doubling the number of groups.
*
* ## Inputs
*
* - `x0`: The pointer to the hashtable.
*/
hashtable_grow:
// TODO
hashtable_grow.oom:
adr x0, hashtable_grow.oom.msg
mov x1, hashtable_grow.oom.len
bl panic
udf #0
/**
* Gets the number of elements in the hashtable.
*
* ## Inputs
*
* - `x0`: The pointer to the hashtable.
*
* ## Outputs
*
* - `x0`: The number of elements in the hashtable.
*/
.global hashtable_size
hashtable_size:
ldr x0, [x0, #0x10]
ret
todo:
adr x0, todo.msg
mov x1, todo.len
bl panic
udf #0
.section .rodata
hashtable_init.too_many_groups.msg: .ascii "hashtable_init: too many groups"
.set hashtable_init.too_many_groups.len, . - hashtable_init.too_many_groups.msg
hashtable_init.oom.msg: .ascii "hashtable_init: out of memory"
.set hashtable_init.oom.len, . - hashtable_init.oom.msg
hashtable_grow.oom.msg: .ascii "hashtable_grow: out of memory"
.set hashtable_grow.oom.len, . - hashtable_grow.oom.msg
todo.msg: .ascii "TODO"
.set todo.len, . - todo.msg
// vim: set ft=arm64asm :