.section .text.crypto

// This is an implementation of ChaCha20 based on RFC8439. This has not been
// audited, don't put mission-critical data in StahlOS, etc., etc.

// One add/xor/rotate step of the ChaCha quarter round, applied to every 32-bit
// lane of the operands at once:
//
//     sum += addend; mix ^= sum; mix = rotate_left(mix, bits)
//
// The rotate is assembled from a left shift into \tmp, a right shift by
// (32 - bits) in place, and an OR of the two halves, so \tmp is overwritten.
.macro chacha_arx sum, addend, mix, tmp, bits
	add \sum\().4s, \sum\().4s, \addend\().4s
	eor \mix\().16b, \mix\().16b, \sum\().16b
	shl \tmp\().4s, \mix\().4s, #\bits
	ushr \mix\().4s, \mix\().4s, #(32 - \bits)
	orr \mix\().16b, \mix\().16b, \tmp\().16b
.endm

// Runs the ChaCha quarter round on the four 32-bit lanes of \a, \b, \c and \d
// in parallel: lane i of each register forms one (a, b, c, d) tuple.
//
// When each register holds one row of the ChaCha state, this amounts to a full
// column round. \tmp is a scratch vector register and is left holding an
// intermediate shifted value.
.macro column_round a, b, c, d, tmp
	chacha_arx \a, \b, \d, \tmp, 16
	chacha_arx \c, \d, \b, \tmp, 12
	chacha_arx \a, \b, \d, \tmp, 8
	chacha_arx \c, \d, \b, \tmp, 7
.endm

// Rotates the four 32-bit lanes of \reg in place. \bytes is the EXT byte
// offset: 4 moves lane k+1 into lane k, 8 moves lane k+2 into lane k, and 12
// moves lane k+3 into lane k (lane indices taken modulo 4).
.macro chacha_rotate_lanes reg, bytes
	ext \reg\().16b, \reg\().16b, \reg\().16b, #\bytes
.endm

// Performs one ChaCha double round: a column round followed by a diagonal
// round.
//
// \a, \b, \c and \d each hold one row of the ChaCha state; \tmp is a scratch
// vector register that gets overwritten. The diagonal round reuses
// column_round by first rotating rows \b, \c and \d by one, two and three
// lanes so that each diagonal lines up in a column, and rotating them back
// afterwards, so the rows leave in the same lane order they arrived in.
.macro double_round a, b, c, d, tmp
	// Column round on the rows as given.
	column_round \a, \b, \c, \d, \tmp

	// Line the diagonals up as columns (the three rotations are independent).
	chacha_rotate_lanes \d, 12
	chacha_rotate_lanes \c, 8
	chacha_rotate_lanes \b, 4

	// Diagonal round.
	column_round \a, \b, \c, \d, \tmp

	// Undo the rotations to restore the row layout.
	chacha_rotate_lanes \d, 4
	chacha_rotate_lanes \c, 8
	chacha_rotate_lanes \b, 12
.endm

// Number of double rounds per block; each double round is two ChaCha rounds,
// giving the 20 rounds of ChaCha20.
.equ CHACHA20_DOUBLE_ROUNDS, 10

// The ChaCha20 block function: runs 20 rounds of ChaCha over the state and
// then adds the initial state back into the result, lane by lane.
//
// ## Arguments
//
// - v0-v3: The ChaCha state, one row per register.
//
// ## Returns
//
// - v0-v3: The output ChaCha state.
//
// ## Side Effects
//
// - Trashes x0 (used as the loop counter; it is zero on return).
// - Trashes the condition flags (the loop counts down with subs).
// - Trashes v4, v5, v6, v7, and v8; all five are zeroed before returning so
//   that no copy of the input state or round scratch is left behind.
//
// NOTE(review): under AAPCS64 the low 64 bits of v8 are callee-saved, and this
// routine does not preserve them. Callers are presumably written against the
// contract above rather than AAPCS64 -- confirm.
.global chacha20
chacha20:
	// Keep the input state in v4-v7 for the final feed-forward addition.
	mov v7.16b, v3.16b
	mov v6.16b, v2.16b
	mov v5.16b, v1.16b
	mov v4.16b, v0.16b

	mov x0, #CHACHA20_DOUBLE_ROUNDS
1:
	double_round v0, v1, v2, v3, v8
	subs x0, x0, #1
	b.ne 1b

	// Feed-forward: output = rounds(state) + state, per 32-bit lane.
	add v3.4s, v3.4s, v7.4s
	add v2.4s, v2.4s, v6.4s
	add v1.4s, v1.4s, v5.4s
	add v0.4s, v0.4s, v4.4s

	// Scrub the saved state and the round scratch register.
	movi v4.16b, #0
	movi v5.16b, #0
	movi v6.16b, #0
	movi v7.16b, #0
	movi v8.16b, #0
	ret

// vim: set ft=arm64asm :