.section .text

// This is an implementation of hashtables based on Google's Swiss table
// hashtables, using as reference
// ["Designing a Fast, Efficient, Cache-friendly Hash Table, Step by Step"](https://www.youtube.com/watch?v=ncHmEUmJZf4).
// This also implements the "cache-line-aware groups" optimization described at
// 47:25 of "Designing a Fast, Efficient, Cache-friendly Hash Table, Step by
// Step," since it only supports 8-byte entries.

// A hashtable is a region laid out as:
//     +-------------------+-------------------+ 0x00
//     |   Group Pointer   | log2(Group Count) |
//     +-------------------+-------------------+ 0x10
//     |       Length      |      Capacity     |
//     +-------------------+-------------------+ 0x20
//     |  Alloc Function   |   Free Function   |
//     +-------------------+-------------------+ 0x30
//     |  Equals Function  |   Hash Function   |
//     +-------------------+-------------------+ 0x40
// All the functions defined in this file take hashtables by reference, so they
// take a pointer to the above.
// The signatures of the Alloc, Equals, and Hash functions are described in the
// documentation on the `hashtable_init` procedure.
// The Group Pointer points to an array of (1 << log2(Group Count)) hash
// groups, which are laid out as:
//     0x00            0x08         0x40
//     +-+-+-+-+-+-+-+-+----- ... -----+
//     | |H|H|H|H|H|H|H|     Values    |
//     +-+-+-+-+-+-+-+-+----- ... -----+
//     |  \
//     |   +----------+
//     |               \
//     +-+-+-+-+-+-+-+-+
//     |F|P|P|P|P|P|P|P|
//     +-+-+-+-+-+-+-+-+
//     0x00         0x01
// - `H`: The high 8 bits of the hash of the value in the corresponding slot.
// - `P`: A flag for whether the entry is present.
// - `F`: A flag for whether the entire group has been full (since the last resize).

 * Initializes a hashtable.
 * ## Inputs
 * - `x0`: A pointer to the area to initialize as a hashtable.
 * - `x1`: log2() of the initial number of groups.
 * - `x2`: The `alloc` function. See below for details.
 * - `x3`: The `free` function. See below for details.
 * - `x4`: The `equals` function. See below for details.
 * - `x5`: The `hash` function. See below for details.
 * ## Notes
 * All of the functions must obey the
 * [Procedure Call Standard for the ARM 64-bit Architecture (AArch64)](https://github.com/ARM-software/abi-aa/blob/78b0cd76d3b49f8212be1945e8ba10b15c7cf7e0/aapcs64/aapcs64.rst).
 * In C-like syntax, the functions must have the given signatures:
 * - `void* alloc(size_t size);`
 * - `void free(void* ptr, size_t size);`
 * - `unsigned long long equals(unsigned long long x, unsigned long long y);`
 * - `unsigned long long hash(unsigned long long value);`
 * Furthermore:
 * - The `alloc` function will always be called with a multiple of 64, and must
 *   return pointers aligned to 64 bytes, and must always return a valid
 *   pointer to a zeroed region. (A wrapper around `posix_memalign` and
 *   `memset` is recommended on Unix systems.)
 * - The `free` function will be passed the address and size of an allocation
 *   returned by `alloc`, and should free it.
 * - The `equals` function must be a reflexive and symmetric relation that
 *   returns `-1` when true and `0` when false.
 * - The `hash` function should mix entropy well into high bits, and must
 *   respect `equals`; that is, `equals(x, y) == -1` implies
 *   `hash(x) == hash(y)`.
.global hashtable_init
.p2align 6
	// Make a frame and save some callee-save registers.
	stp fp, lr, [sp, #-0x40]!
	stp x20, x21, [sp, #0x10]
	stp x22, x23, [sp, #0x20]
	stp x24, x25, [sp, #0x30]
	mov fp, sp

	// If the initial number of groups is 50 or above, bail out.
	cmp x1, #50
	b.hs hashtable_init.too_many_groups

	// Save the arguments to callee-save registers.
	mov x20, x0
	mov x21, x1
	mov x22, x2
	mov x23, x3
	mov x24, x4
	mov x25, x5

	// Allocate 64 bytes per group.
	mov x6, #64
	lsl x0, x6, x1
	blr x2
	cbz x0, hashtable_init.oom

	// Store the saved arguments to the table.
	stp x0, x21, [x20]
	stp x22, x23, [x20, #0x20]
	stp x24, x25, [x20, #0x30]

	// Compute the capacity and store it as well. We use a maximum load
	// factor of 15/16 (93.75%), giving us a capacity of
	// (7 * group count * 15/16); this gives a load factor close to the
	// recommended 95%, while being much cheaper to compute.
	mov x6, #105
	lsl x6, x6, x21
	lsr x6, x6, #4
	str x6, [x20, #0x18]

	// Reload the callee-save registers, clean up the stack frame, and return.
	ldp x20, x21, [sp, #0x10]
	ldp x22, x23, [sp, #0x20]
	ldp x24, x25, [sp, #0x30]
	ldp fp, lr, [sp], #0x40

	adr x0, hashtable_init.too_many_groups.msg
	mov x1, hashtable_init.too_many_groups.len
	bl panic
	udf #0

	adr x0, hashtable_init.oom.msg
	mov x1, hashtable_init.oom.len
	bl panic
	udf #0

 * Cleans up a hashtable. Note that this does not free the values of the table.
 * ## Inputs
 * - `x0`: The pointer to the hashtable.
.global hashtable_destroy
.p2align 6
	// Load the free function.
	ldr x2, [x0, #0x28]

	// Load the group pointer and compute the size of the groups.
	ldp x0, x1, [x0]
	mov x3, #1
	lsl x1, x3, x1

	// Tail-call the free function.
	br x2

 * Finds a value in a hashtable.
 * ## Inputs
 * - `x0`: The pointer to the hashtable.
 * - `x1`: The value to search for.
 * ## Outputs
 * - `x0`: The address of the group the value is in (if it was found), or could
 *         be placed in in (if it was not found).
 * - `x1`: The 1-based index of the value within the group if it was found, or
 *         0 if it was not.
 * - `x2`: The hash of the value.
 * ## Notes
 * If the value was not found, `x0` is guaranteed to not be full.
.global hashtable_find_internal
.p2align 6
	// Make a frame and save some callee-save registers.
	stp fp, lr, [sp, #-0x50]!
	stp x20, x21, [sp, #0x10]
	stp x22, x23, [sp, #0x20]
	stp x24, x25, [sp, #0x30]
	stp x26, x27, [sp, #0x40]
	mov fp, sp

	// Save the arguments to callee-save registers.
	mov x20, x0
	mov x21, x1

	// Hash the element and store the hash in x22.
	ldr x1, [x20, #0x38]
	mov x0, x21
	blr x1
	mov x22, x0

	// Create a mask for indexing the hash groups in x23, and use it to
	// mask the hash to get the index of the first group to check, stored
	// in x24. At the same time (to save a cycle in the common case), load
	// the group pointer into x2.
	ldp x2, x0, [x20]
	mov x1, #-1
	lsl x23, x1, x0
	bic x24, x22, x23

	// We store the high byte of the hash in every byte of v0. This will
	// allow us to perform very fast and loop-free checks for the hash
	// within a group.
	lsr x0, x22, #56
	dup v0.16b, w0

	// The outer loop traverses over groups, visiting every one that could
	// contain values with the matching lower-bits of hash.
	// Store the address of the group in x25.
	add x25, x2, x24, lsl #6

	// Read the metadata into v1, and compare it against the hash bytes
	// from v0, storing the result of the comparison in x26.
	ldr x1, [x25]
	mov v1.d[0], x1
	cmeq v1.8b, v0.8b, v1.8b
	mov x26, v1.d[0]

	// The inner loop checks groups for values with matching top-bits of
	// hash.
	// Find the first match. We do this by finding the highest set bit,
	// dividing by 8, and subtracting it from 8. x27 is then the match
	// index.
	clz x2, x26
	lsr x2, x2, #3
	mov x3, #7
	sub x27, x3, x2

	// If x2 is zero or negative one, only the metadata byte matched, or no
	// bytes matched, so we know the element isn't in this hash group.
	cmp x27, #0
	b.le hashtable_find_internal.not_in_hash_group

	// Then check that the present bit is actually set for that slot; if
	// not, try the next one.
	mov x0, #1
	lsl x0, x0, x27
	tst x0, x1
	b.eq hashtable_find_internal.next_slot

	// Otherwise, load the value into x1 and original value into x0, then
	// call the equals function.
	ldr x0, [x25, x27, lsl #3]
	mov x1, x21
	ldr x2, [x20, #0x30]
	blr x2

	// If it returned false, we should go try the next slot.
	cbz x0, hashtable_find_internal.next_slot

	// If it returned true, we can return this element, so we load all the
	// returned values to the appropriate registers.
	mov x0, x25
	mov x1, x27
	mov x2, x22

	// Reload the callee-save registers, clean up the stack frame, and return.
	ldp x20, x21, [sp, #0x10]
	ldp x22, x23, [sp, #0x20]
	ldp x24, x25, [sp, #0x30]
	ldp x26, x27, [sp, #0x40]
	ldp fp, lr, [sp], #0x50

	// The post-comparison metadata is still in x26, and the just-failed
	// match index is in x27. We construct a mask, remove the
	// now-known-to-be-different slot from the post-comparison metadata,
	// then go another iteration of the inner loop.
	mov x0, #0xff
	lsl x3, x27, #3
	lsl x0, x0, x3
	bic x26, x26, x0
	b hashtable_find_internal.inner_loop

	// We next check to see if the search should continue. If the "has ever
	// been full" bit of the hash group is set, it should, since that's the
	// case where an element could be in another hash group. The metadata
	// is still in x1, so we can directly test from there.
	tbnz x1, #0, hashtable_find_internal.next_hash_group

	// Since it wasn't found, we're done.
	mov x0, x25
	mov x1, #0
	mov x2, x22
	b hashtable_find_internal.end

	// At this point, the mask for the hash is in x23 and the index of the
	// current group is in x24. We increment the index, then re-mask the
	// result to get the new group index.
	add x24, x24, #1
	bic x24, x24, x23

	// Reload the group pointer, and continue looping over groups.
	ldr x2, [x20]
	b hashtable_find_internal.outer_loop

 * Inserts a value into a hashtable, writing back any previous version.
 * ## Inputs
 * - `x0`: The pointer to the hashtable.
 * - `x1`: The value to insert.
 * - `x2`: A pointer to a location to write the old value to if a value was
 *         replaced, or null if the value shouldn't be written back.
 * ## Outputs
 * - `x0`: A flag for whether a value was replaced; `-1` if so, `0` if not.
.global hashtable_insert
.p2align 6
	// Make a frame and save some callee-save registers.
	stp fp, lr, [sp, #-0x30]!
	stp x20, x21, [sp, #0x10]
	stp x22, x23, [sp, #0x20]
	mov fp, sp

	// Save the arguments to callee-save registers.
	mov x20, x0
	mov x21, x1
	mov x22, x2

	// Find the existing item, or the place where an item would be inserted.
	bl hashtable_find_internal

	// Branch on whether a replacement is being performed or not.
	cbz x1, hashtable_insert.new_element

	// If the old-value location pointer was null, skip writing back to it.
	// Otherwise, do the writeback.
	cbz x22, hashtable_insert.after_writeback
	ldr x2, [x0, x1, lsl #3]
	str x2, [x22]

	// Now we can write the value to the hashtable.
	str x21, [x0, x1, lsl #3]

	// And finally, set the return value and jump to the end.
	mov x0, #-1
	b hashtable_insert.end

.p2align 6
	// At this point, x0 is the address of the group the value can be
	// placed in, x1 is zero, and x2 is the hash.

	// Check if a resize is required; if the length is one less than the
	// capacity, resize and loop back to the beginning. Otherwise, write
	// back the new length.
	ldp x3, x4, [x20, #0x10]
	add x3, x3, #1
	cmp x3, x4
	b.eq hashtable_insert.resize
	str x3, [x20, #0x10]

	// Load the metadata byte into x1, then find the first one in it and
	// write its index to x4. The correctness of the funny bitwise math
	// here is justified in the docs as "Find Free Slot". If no slot is
	// free, x4 will be 0 or -1. Otherwise, it'll be the index of a free
	// slot.
	ldrb w1, [x0]
	orr x3, x1, #0xffffffffffffff01
	mvn x3, x3
	clz x4, x3
	mov x5, #63
	sub x4, x5, x4

	// Check if no slot was free, and if so advance to the next group.
	cmp x4, #0
	b.le hashtable_insert.next_group

	// Write the hash and value out.
	strb w2, [x0, x4]
	str x21, [x0, x4, lsl #3]

	// Set the appropriate present bit for the value.
	mov x5, #1
	lsl x5, x5, x4
	orr x1, x1, x5
	strb w1, [x0]

	// Set the flag for new-value-added, and return.
	mov x0, #0

	// Reload the callee-save registers, clean up the stack frame, and return.
	ldp x20, x21, [sp, #0x10]
	ldp x22, x23, [sp, #0x20]
	ldp fp, lr, [sp], #0x30

	// First, we update the metadata byte of the group to indicate that
	// we've spilled into the next group.
	orr w1, w1, #1
	strb w1, [x0]

	// Next, get the group pointer and number of groups, and convert x0
	// (the previous group address) back to an index.
	ldp x6, x7, [x20]
	sub x0, x0, x6
	lsr x0, x0, #6

	// Then, convert the number of groups to a mask.
	mov x8, #-1
	lsl x7, x8, x7

	// Finally, increment the group count, mask it, and compute a new group
	// pointer.
	add x0, x0, #1
	bic x0, x0, x7
	add x0, x6, x0, lsl #6

	// We can then go back to trying to insert.
	b hashtable_insert.TODO_name_me

	// TODO check for correctness
	mov x7, #0x0606060606060606
	bl todo

	// Do the resize.
	mov x0, x20
	bl hashtable_grow

	// We then retry to insert, which should definitely succeed. To do so,
	// we restore the arguments to hashtable_find_internal, and jump back
	// to its call.
	mov x0, x20
	mov x1, x21
	b hashtable_insert.before_find

 * Grows a hashtable, doubling the number of groups.
 * ## Inputs
 * - `x0`: The pointer to the hashtable.
	// TODO

	adr x0, hashtable_grow.oom.msg
	mov x1, hashtable_grow.oom.len
	bl panic
	udf #0

 * Gets the number of elements in the hashtable.
 * ## Inputs
 * - `x0`: The pointer to the hashtable.
 * ## Outputs
 * - `x0`: The number of elements in the hashtable.
.global hashtable_size
	ldr x0, [x0, #0x10]

	adr x0, todo.msg
	mov x1, todo.len
	bl panic
	udf #0

.section .rodata

hashtable_init.too_many_groups.msg: .ascii "hashtable_init: too many groups"
.set hashtable_init.too_many_groups.len, . - hashtable_init.too_many_groups.msg

hashtable_init.oom.msg: .ascii "hashtable_init: out of memory"
.set hashtable_init.oom.len, . - hashtable_init.oom.msg

hashtable_grow.oom.msg: .ascii "hashtable_grow: out of memory"
.set hashtable_grow.oom.len, . - hashtable_grow.oom.msg

todo.msg: .ascii "TODO"
.set todo.len, . - todo.msg

// vim: set ft=arm64asm :