.section .text.crypto

// This is an implementation of BLAKE2s based on RFC7693. This has not been
// audited, don't put mission-critical data in StahlOS, etc., etc.

// Produces the starting state for an unkeyed BLAKE2s hash: the IV with the
// parameter word 0x01010020 XORed into the first state word. In the RFC7693
// formula `h[0] ^ 0x01010000 ^ (kk << 8) ^ nn` that is kk = 0 (no key) and
// nn = 0x20 (a 32-byte digest).
//
// ## Returns
//
// - v0-v1: The initialized hash state.
//
// ## Side Effects
//
// - Trashes x0, x1
// - Trashes v14
.global blake2s_init
blake2s_init:
	// w1 = 0x01010020, assembled 16 bits at a time.
	mov w1, #0x0020
	movk w1, #0x0101, lsl #16

	// v14 = { w1, 0, 0, 0 }. Writing the S view of the register clears all of
	// the remaining bits, so no separate zeroing step is needed.
	fmov s14, w1

	// h = IV, then h[0] ^= parameter word.
	adr x0, iv
	ldp q0, q1, [x0]
	eor v0.16b, v0.16b, v14.16b
	ret

// Hashes an additional block into the hash state. The pointer to the data
// (`x0`) must be aligned to 16 bytes, and the data it points to must be
// exactly 64 bytes.
//
// ## Arguments
//
// - x0: The pointer to the block of data.
// - x1: The total number of bytes of data that have been accumulated in the
//       hash so far. The value is XORed unmodified into the first two words
//       of the last row of the working vector (the RFC7693 offset counter,
//       which counts the bytes of the block being compressed as well).
// - v0-v1: The prior hash state.
//
// ## Returns
//
// - v0-v1: The new hash state.
//
// ## Side Effects
//
// - Trashes x2, x3
// - Trashes v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
.global blake2s_update
blake2s_update:
	// Not the last block: the finalization mask is all zeroes.
	eor v14.16b, v14.16b, v14.16b

	// Tail-branch so that lr still points at our caller; blake2s_compress
	// returns there directly.
	b blake2s_compress

// Hashes the final block and returns the hash. The pointer to the data (`x0`)
// must be aligned to 16 bytes, and the data it points to must be zero-padded
// to 64 bytes.
//
// ## Arguments
//
// - x0: The pointer to the padded last block of data.
// - x1: The total number of bytes of data that are accumulated in the hash.
// - v0-v1: The prior hash state.
//
// ## Returns
//
// - v0-v1: The BLAKE2s hash.
//
// ## Side Effects
//
// - Trashes x2, x3
// - Trashes v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
.global blake2s_final
blake2s_final:
	// Last block: lane 2 of the mask is all-ones, so the third word of the
	// last row of the working vector (v[14] in RFC7693) gets inverted.
	eor v14.16b, v14.16b, v14.16b
	mov w3, #-1
	mov v14.4s[2], w3

	// Tail-branch so that lr still points at our caller; blake2s_compress
	// returns there directly.
	b blake2s_compress

// The body shared by blake2s_update and blake2s_final: one run of the
// compression function over a 64-byte block. The two entry points differ only
// in the mask they pass in v14. This routine is private to this file.
//
// ## Arguments
//
// - x0: The pointer to the 64-byte block of data.
// - x1: The byte counter to mix into the working vector.
// - v0-v1: The prior hash state.
// - v14: The finalization mask. Only the upper 64 bits are used; the lower 64
//        bits are overwritten with x1 before the mask is applied.
//
// ## Returns
//
// - v0-v1: The new hash state.
//
// ## Side Effects
//
// - Trashes x2, x3
// - Trashes v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
blake2s_compress:
	// mix_four is called with bl, which overwrites lr, so keep a copy on the
	// stack (a 16-byte slot keeps sp 16-byte aligned).
	str lr, [sp, #-16]!

	// Load the message.
	ldp q2, q3, [x0]
	ldp q4, q5, [x0, #32]

	// Initialize the working vector: rows v6 and v7 are the hash state, rows
	// v8 and v9 are the IV.
	mov v6.16b, v0.16b
	mov v7.16b, v1.16b
	adr x2, iv
	ldp q8, q9, [x2]

	// Complete the mask with the 64-bit counter in its low two lanes, then
	// apply counter and finalization flag to the last row in a single XOR.
	mov v14.2d[0], x1
	eor v9.16b, v9.16b, v14.16b

	// Load the loop counter. We use the sigma table itself as our counter.
	adr x2, sigma
	adr x3, sigma.end
blake2s_compress.loop:
	// Fetch this round's 64-byte sigma row as four `tbl` index vectors.
	ldp q10, q11, [x2], #32
	ldp q12, q13, [x2], #32

	// Column step: gather the message words selected by the first two index
	// vectors and mix the four columns.
	tbl v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v10.16b
	tbl v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v11.16b
	bl mix_four

	// Diagonal step: gather the message words selected by the last two index
	// vectors, rotate rows v7, v8, and v9 left by one, two, and three lanes so
	// that each diagonal lines up as a column, mix, and rotate the rows back.
	tbl v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v12.16b
	tbl v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v13.16b
	ext v7.16b, v7.16b, v7.16b, #4
	ext v8.16b, v8.16b, v8.16b, #8
	ext v9.16b, v9.16b, v9.16b, #12
	bl mix_four
	ext v7.16b, v7.16b, v7.16b, #12
	ext v8.16b, v8.16b, v8.16b, #8
	ext v9.16b, v9.16b, v9.16b, #4

	// Done once the read pointer has walked off the end of the sigma table.
	cmp x2, x3
	b.ne blake2s_compress.loop

	// Mix the working vector back into the hash.
	eor v0.16b, v0.16b, v6.16b
	eor v1.16b, v1.16b, v7.16b
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b

	ldr lr, [sp], #16
	ret

// Runs the mixing function, G, from section 3.1 of RFC7693 on four sets of
// words at once, one set per 32-bit lane.
//
// ## Arguments
//
// - v6-v9: The working vector. Lane i of v6, v7, v8, and v9 is a, b, c, and d
//          of the i-th invocation of G.
// - v10-v11: The shuffled data from the current block to add during this
//            half-round. Lane i of v10 is x and lane i of v11 is y of the
//            i-th invocation of G.
//
// ## Returns
//
// - v6-v9: The mixed working vector.
//
// ## Side Effects
//
// - Trashes v14

// Rotates every 32-bit lane of \row right by \bits bits. Uses v14 as scratch.
.macro blake2s_rotr row, bits
	ushr v14.4s, \row\().4s, #\bits
	shl \row\().4s, \row\().4s, #(32 - \bits)
	orr \row\().16b, \row\().16b, v14.16b
.endm

// One half of G, applied to all four lanes:
//
//     a = a + b + msg
//     d = (d ^ a) >>> rot_d
//     c = c + d
//     b = (b ^ c) >>> rot_b
.macro blake2s_half_g msg, rot_d, rot_b
	add v6.4s, v6.4s, v7.4s
	add v6.4s, v6.4s, \msg\().4s
	eor v9.16b, v9.16b, v6.16b
	blake2s_rotr v9, \rot_d

	add v8.4s, v8.4s, v9.4s
	eor v7.16b, v7.16b, v8.16b
	blake2s_rotr v7, \rot_b
.endm

mix_four:
	// First half adds x and rotates by 16 and 12; second half adds y and
	// rotates by 8 and 7.
	blake2s_half_g v10, 16, 12
	blake2s_half_g v11, 8, 7

	ret

.section .rodata.crypto

// The eight 32-bit BLAKE2s initialization vector words from RFC7693.
// blake2s_init loads them as the base of the hash state, and the block
// functions load them as the lower two rows of the working vector. Both read
// the table with a single `ldp` of two Q registers; it is aligned to 16 bytes.
.p2align 4
iv: .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

// Emits the four byte offsets occupied by 32-bit message word \idx within the
// 64-byte block. Used as `tbl` indices, they copy that whole word into one
// lane of the destination.
.macro sigma_entry idx
	.byte 4*\idx + 0, 4*\idx + 1, 4*\idx + 2, 4*\idx + 3
.endm

// Emits one 64-byte row of the sigma table from the sixteen message word
// indices of a round, given in RFC7693 order. The indices are regrouped into
// the four 16-byte `tbl` index vectors the round loop loads, in this order:
//
//   1. positions 0, 2, 4, 6     (first tbl of the first mix_four call)
//   2. positions 1, 3, 5, 7     (second tbl of the first mix_four call)
//   3. positions 8, 10, 12, 14  (first tbl of the second mix_four call)
//   4. positions 9, 11, 13, 15  (second tbl of the second mix_four call)
.macro sigma_row i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
	// Vector 1: even positions of the first half.
	sigma_entry \i0
	sigma_entry \i2
	sigma_entry \i4
	sigma_entry \i6

	// Vector 2: odd positions of the first half.
	sigma_entry \i1
	sigma_entry \i3
	sigma_entry \i5
	sigma_entry \i7

	// Vector 3: even positions of the second half.
	sigma_entry \i8
	sigma_entry \i10
	sigma_entry \i12
	sigma_entry \i14

	// Vector 4: odd positions of the second half.
	sigma_entry \i9
	sigma_entry \i11
	sigma_entry \i13
	sigma_entry \i15
.endm

// The message schedule: one row per round, ten rows in total, each listing the
// sixteen message word indices for that round. sigma_row expands every row to
// 64 bytes of `tbl` indices, which the round loops consume with two
// post-incremented `ldp` loads of 32 bytes each. sigma.end marks the first
// byte past the table and is the address the round loops compare their read
// pointer against to terminate.
.p2align 4
sigma:
        sigma_row 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
        sigma_row 0xe, 0xa, 0x4, 0x8, 0x9, 0xf, 0xd, 0x6, 0x1, 0xc, 0x0, 0x2, 0xb, 0x7, 0x5, 0x3
        sigma_row 0xb, 0x8, 0xc, 0x0, 0x5, 0x2, 0xf, 0xd, 0xa, 0xe, 0x3, 0x6, 0x7, 0x1, 0x9, 0x4
        sigma_row 0x7, 0x9, 0x3, 0x1, 0xd, 0xc, 0xb, 0xe, 0x2, 0x6, 0x5, 0xa, 0x4, 0x0, 0xf, 0x8
        sigma_row 0x9, 0x0, 0x5, 0x7, 0x2, 0x4, 0xa, 0xf, 0xe, 0x1, 0xb, 0xc, 0x6, 0x8, 0x3, 0xd
        sigma_row 0x2, 0xc, 0x6, 0xa, 0x0, 0xb, 0x8, 0x3, 0x4, 0xd, 0x7, 0x5, 0xf, 0xe, 0x1, 0x9
        sigma_row 0xc, 0x5, 0x1, 0xf, 0xe, 0xd, 0x4, 0xa, 0x0, 0x7, 0x6, 0x3, 0x9, 0x2, 0x8, 0xb
        sigma_row 0xd, 0xb, 0x7, 0xe, 0xc, 0x1, 0x3, 0x9, 0x5, 0x0, 0xf, 0x4, 0x8, 0x6, 0x2, 0xa
        sigma_row 0x6, 0xf, 0xe, 0x9, 0xb, 0x3, 0x0, 0x8, 0xc, 0x2, 0xd, 0x7, 0x1, 0x4, 0xa, 0x5
        sigma_row 0xa, 0x2, 0x8, 0x4, 0x7, 0x6, 0x1, 0x5, 0xf, 0xb, 0x9, 0xe, 0x3, 0xc, 0xd, 0x0
sigma.end:

// vim: set ft=arm64asm :