.section .text.crypto
// This is an implementation of BLAKE2s based on RFC 7693. It has not been
// audited, so don't put mission-critical data in StahlOS.
// Produces the starting hash state for an unkeyed BLAKE2s hash.
//
// The state is the eight IV words, with the first word XORed with the
// parameter word 0x01010020. RFC 7693 (section 3.2) defines that word as
// 0x0101kknn, where kk is the key length (0x00 here) and nn is the digest
// length in bytes (0x20 here).
//
// ## Returns
//
// - v0-v1: The initialized hash state.
//
// ## Side Effects
//
// - Trashes x0, x1
// - Trashes v14
.global blake2s_init
blake2s_init:
    // w1 = 0x01010020, assembled from its two 16-bit halves.
    mov     w1, #0x0020
    movk    w1, #0x0101, lsl #16
    // v14 = {w1, 0, 0, 0}; writing the scalar view clears the other lanes.
    fmov    s14, w1
    // v0 = IV[0..3], v1 = IV[4..7].
    adr     x0, iv
    ldp     q0, q1, [x0]
    // Only h[0] receives the parameter word; v1 stays as plain IV words.
    eor     v0.16b, v0.16b, v14.16b
    ret
// Compresses one full 64-byte block into the hash state. The finalization
// flag is left clear, so this is the routine for every block that is not the
// last one.
//
// ## Arguments
//
// - x0: The pointer to the block of data. It must be aligned to 16 bytes and
//   must point to exactly 64 bytes.
// - x1: The total number of bytes of data that have been accumulated in the
//   hash so far. The value is mixed in verbatim as the 64-bit byte counter.
//   NOTE(review): RFC 7693 counts the bytes of the block being compressed as
//   part of this counter; confirm that callers pass it that way.
// - v0-v1: The prior hash state.
//
// ## Returns
//
// - v0-v1: The new hash state.
//
// ## Side Effects
//
// - Trashes x2, x3 and the condition flags
// - Trashes v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
// - Temporarily uses 16 bytes of stack to keep the link register across the
//   calls to mix_four
.global blake2s_update
blake2s_update:
    str     x30, [sp, #-16]!

    // v2-v5 = the sixteen message words m[0..15].
    ldp     q4, q5, [x0, #32]
    ldp     q2, q3, [x0]

    // Working vector, one row of four words per register:
    //   v6 = h[0..3]     v7 = h[4..7]
    //   v8 = IV[0..3]    v9 = IV[4..7] ^ {t0, t1, 0, 0}
    adr     x2, iv
    ldp     q8, q9, [x2]
    fmov    d14, x1                     // v14 = {t0, t1, 0, 0}
    eor     v9.16b, v9.16b, v14.16b
    mov     v6.16b, v0.16b
    mov     v7.16b, v1.16b

    // x2 walks the sigma table, one 64-byte row per round. The table itself
    // is the loop counter: the loop ends when x2 reaches sigma.end (x3).
    adr     x3, sigma.end
    adr     x2, sigma
blake2s_update.loop:
    // v10-v13 = the four byte-shuffle masks for this round.
    ldp     q10, q11, [x2], #64
    ldp     q12, q13, [x2, #-32]

    // Column step: gather the message words picked by the first two masks
    // and mix the four columns.
    tbl     v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v11.16b
    tbl     v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v10.16b
    bl      mix_four

    // Diagonal step: rotate rows 1, 2 and 3 left by 1, 2 and 3 lanes so that
    // each diagonal lines up as a column, then mix again.
    ext     v9.16b, v9.16b, v9.16b, #12
    ext     v8.16b, v8.16b, v8.16b, #8
    ext     v7.16b, v7.16b, v7.16b, #4
    tbl     v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v12.16b
    tbl     v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v13.16b
    bl      mix_four

    // Rotate the rows back into column order.
    ext     v9.16b, v9.16b, v9.16b, #4
    ext     v8.16b, v8.16b, v8.16b, #8
    ext     v7.16b, v7.16b, v7.16b, #12

    cmp     x2, x3
    b.ne    blake2s_update.loop

    // Feed forward: h[i] ^= v[i] ^ v[i + 8].
    eor     v0.16b, v0.16b, v6.16b
    eor     v0.16b, v0.16b, v8.16b
    eor     v1.16b, v1.16b, v7.16b
    eor     v1.16b, v1.16b, v9.16b

    ldr     x30, [sp], #16
    ret
// Compresses the last block of the message with the finalization flag set,
// leaving the finished hash in the state registers.
//
// ## Arguments
//
// - x0: The pointer to the padded last block of data. It must be aligned to
//   16 bytes, and the data must be zero-padded to 64 bytes.
// - x1: The total number of bytes of data that are accumulated in the hash.
//   The value is mixed in verbatim as the 64-bit byte counter.
//   NOTE(review): RFC 7693 expects this counter to include the unpadded bytes
//   of the last block; confirm that callers pass it that way.
// - v0-v1: The prior hash state.
//
// ## Returns
//
// - v0-v1: The BLAKE2s hash.
//
// ## Side Effects
//
// - Trashes x2, x3 and the condition flags
// - Trashes v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
// - Temporarily uses 16 bytes of stack to keep the link register across the
//   calls to mix_four
.global blake2s_final
blake2s_final:
    str     x30, [sp, #-16]!

    // v2-v5 = the sixteen message words m[0..15].
    ldp     q4, q5, [x0, #32]
    ldp     q2, q3, [x0]

    // Working vector, one row of four words per register:
    //   v6 = h[0..3]     v7 = h[4..7]
    //   v8 = IV[0..3]    v9 = IV[4..7] ^ {t0, t1, 0xffffffff, 0}
    // The all-ones word inverts v[14], which marks this as the last block.
    adr     x2, iv
    ldp     q8, q9, [x2]
    fmov    d14, x1                     // v14 = {t0, t1, 0, 0}
    mov     w3, #-1
    ins     v14.s[2], w3                // v14 = {t0, t1, 0xffffffff, 0}
    eor     v9.16b, v9.16b, v14.16b
    mov     v6.16b, v0.16b
    mov     v7.16b, v1.16b

    // x2 walks the sigma table, one 64-byte row per round. The table itself
    // is the loop counter: the loop ends when x2 reaches sigma.end (x3).
    adr     x3, sigma.end
    adr     x2, sigma
blake2s_final.loop:
    // v10-v13 = the four byte-shuffle masks for this round.
    ldp     q10, q11, [x2], #64
    ldp     q12, q13, [x2, #-32]

    // Column step: gather the message words picked by the first two masks
    // and mix the four columns.
    tbl     v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v11.16b
    tbl     v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v10.16b
    bl      mix_four

    // Diagonal step: rotate rows 1, 2 and 3 left by 1, 2 and 3 lanes so that
    // each diagonal lines up as a column, then mix again.
    ext     v9.16b, v9.16b, v9.16b, #12
    ext     v8.16b, v8.16b, v8.16b, #8
    ext     v7.16b, v7.16b, v7.16b, #4
    tbl     v10.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v12.16b
    tbl     v11.16b, {v2.16b, v3.16b, v4.16b, v5.16b}, v13.16b
    bl      mix_four

    // Rotate the rows back into column order.
    ext     v9.16b, v9.16b, v9.16b, #4
    ext     v8.16b, v8.16b, v8.16b, #8
    ext     v7.16b, v7.16b, v7.16b, #12

    cmp     x2, x3
    b.ne    blake2s_final.loop

    // Feed forward: h[i] ^= v[i] ^ v[i + 8].
    eor     v0.16b, v0.16b, v6.16b
    eor     v0.16b, v0.16b, v8.16b
    eor     v1.16b, v1.16b, v7.16b
    eor     v1.16b, v1.16b, v9.16b

    ldr     x30, [sp], #16
    ret
// Rotates every 32-bit lane of the given vector register right by the given
// number of bits. Uses v14 as scratch: on exit v14 holds the lanes shifted
// right by that amount.
.macro mix_four_rotr reg, bits
    ushr    v14.4s, \reg\().4s, #\bits
    shl     \reg\().4s, \reg\().4s, #(32 - \bits)
    orr     \reg\().16b, \reg\().16b, v14.16b
.endm

// Runs the mixing function G from section 3.1 of RFC7693 on four sets of
// words at once, one per 32-bit lane. In terms of G's operands, v6 holds the
// four `a` words, v7 the `b` words, v8 the `c` words and v9 the `d` words;
// v10 and v11 hold the `x` and `y` message words.
//
// ## Arguments
//
// - v6-v9: The working vector
// - v10-v11: The shuffled data from the current block to add during this
//   half-round.
//
// ## Returns
//
// - v6-v9: The mixed working vector
//
// ## Side Effects
//
// - Trashes v14
mix_four:
    // a = a + b + x
    add     v6.4s, v6.4s, v7.4s
    add     v6.4s, v6.4s, v10.4s
    // d = (d ^ a) >>> 16
    eor     v9.16b, v9.16b, v6.16b
    mix_four_rotr v9, 16
    // c = c + d
    add     v8.4s, v8.4s, v9.4s
    // b = (b ^ c) >>> 12
    eor     v7.16b, v7.16b, v8.16b
    mix_four_rotr v7, 12
    // a = a + b + y
    add     v6.4s, v6.4s, v7.4s
    add     v6.4s, v6.4s, v11.4s
    // d = (d ^ a) >>> 8
    eor     v9.16b, v9.16b, v6.16b
    mix_four_rotr v9, 8
    // c = c + d
    add     v8.4s, v8.4s, v9.4s
    // b = (b ^ c) >>> 7
    eor     v7.16b, v7.16b, v8.16b
    mix_four_rotr v7, 7
    ret
.section .rodata.crypto
// The eight 32-bit BLAKE2s initialization vector words, IV[0..7] (RFC 7693,
// section 2.6). The hash routines read them as two 128-bit vectors with a
// single `ldp q`, so the table is aligned to 16 bytes.
.p2align 4
iv: .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
// Emits the four `tbl` byte indices that select message word `n`, i.e. byte
// offsets 4*n through 4*n+3 within the 64-byte block held in v2-v5.
.macro sigma_entry n
    .irp byte_offset, 0, 1, 2, 3
    .byte (4*\n) + \byte_offset
    .endr
.endm
// Emits one 64-byte row of the sigma table from one round's message schedule
// (sixteen word indices). The row is stored as four 16-byte `tbl` masks, in
// the order the hash loop loads them:
//
//   1. column step, first message word of each G  (schedule entries 0, 2, 4, 6)
//   2. column step, second message word of each G (schedule entries 1, 3, 5, 7)
//   3. diagonal step, first message word          (schedule entries 8, 10, 12, 14)
//   4. diagonal step, second message word         (schedule entries 9, 11, 13, 15)
.macro sigma_row x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15
    .irp idx, \x0, \x2, \x4, \x6, \x1, \x3, \x5, \x7
    sigma_entry \idx
    .endr
    .irp idx, \x8, \x10, \x12, \x14, \x9, \x11, \x13, \x15
    sigma_entry \idx
    .endr
.endm
// The message schedule: one row per round, ten rounds in total. Each row lists
// the sixteen message word indices for that round in the order given by the
// SIGMA table of RFC 7693 (section 2.7); sigma_row expands it into 64 bytes of
// `tbl` masks. blake2s_update and blake2s_final step through the table 64
// bytes at a time and stop when their pointer reaches sigma.end, so the number
// of rows here is the number of rounds performed.
//
// Aligned to 16 bytes because the rows are read with `ldp q`.
.p2align 4
sigma:
sigma_row 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
sigma_row 0xe, 0xa, 0x4, 0x8, 0x9, 0xf, 0xd, 0x6, 0x1, 0xc, 0x0, 0x2, 0xb, 0x7, 0x5, 0x3
sigma_row 0xb, 0x8, 0xc, 0x0, 0x5, 0x2, 0xf, 0xd, 0xa, 0xe, 0x3, 0x6, 0x7, 0x1, 0x9, 0x4
sigma_row 0x7, 0x9, 0x3, 0x1, 0xd, 0xc, 0xb, 0xe, 0x2, 0x6, 0x5, 0xa, 0x4, 0x0, 0xf, 0x8
sigma_row 0x9, 0x0, 0x5, 0x7, 0x2, 0x4, 0xa, 0xf, 0xe, 0x1, 0xb, 0xc, 0x6, 0x8, 0x3, 0xd
sigma_row 0x2, 0xc, 0x6, 0xa, 0x0, 0xb, 0x8, 0x3, 0x4, 0xd, 0x7, 0x5, 0xf, 0xe, 0x1, 0x9
sigma_row 0xc, 0x5, 0x1, 0xf, 0xe, 0xd, 0x4, 0xa, 0x0, 0x7, 0x6, 0x3, 0x9, 0x2, 0x8, 0xb
sigma_row 0xd, 0xb, 0x7, 0xe, 0xc, 0x1, 0x3, 0x9, 0x5, 0x0, 0xf, 0x4, 0x8, 0x6, 0x2, 0xa
sigma_row 0x6, 0xf, 0xe, 0x9, 0xb, 0x3, 0x0, 0x8, 0xc, 0x2, 0xd, 0x7, 0x1, 0x4, 0xa, 0x5
sigma_row 0xa, 0x2, 0x8, 0x4, 0x7, 0x6, 0x1, 0x5, 0xf, 0xb, 0x9, 0xe, 0x3, 0xc, 0xd, 0x0
// End marker for the round loop; no data follows it in this table.
sigma.end:
// vim: set ft=arm64asm :