.section .text.crypto
// This is an implementation of ChaCha20 based on RFC8439. This has not been
// audited, don't put mission-critical data in StahlOS, etc., etc.
// Rotates every 32-bit lane of a vector register left by a constant amount.
//
// ## Arguments
//
// - reg:  The vector register to rotate, updated in place.
// - tmp:  A scratch vector register. On exit it holds the lanes of the
//         original reg shifted left by `bits`.
// - bits: The rotation amount, an assemble-time constant in the range 1-31.
.macro chacha_rotl32 reg, tmp, bits
        shl     \tmp\().4s, \reg\().4s, #\bits
        ushr    \reg\().4s, \reg\().4s, #(32 - \bits)
        orr     \reg\().16b, \reg\().16b, \tmp\().16b
.endm

// Performs one ChaCha column round: the add / xor / rotate quarter-round
// sequence (rotations of 16, 12, 8 and 7 bits) is applied to all four 32-bit
// lanes at once, so lane i of a, b, c and d forms one quarter-round input.
//
// Each vector register contains a single row of the ChaCha state, which makes
// each lane position a column of the state.
//
// ## Arguments
//
// - a, b, c, d: The vector registers holding the four rows, updated in place.
// - tmp:        An extra temporary vector register. It is overwritten and
//               left holding bits derived from the state.
//
// ## Side Effects
//
// - Only the five named vector registers are written; the condition flags and
//   memory are untouched.
.macro column_round a, b, c, d, tmp
        // a += b; d ^= a; d <<<= 16
        add     \a\().4s, \a\().4s, \b\().4s
        eor     \d\().16b, \d\().16b, \a\().16b
        chacha_rotl32 \d, \tmp, 16

        // c += d; b ^= c; b <<<= 12
        add     \c\().4s, \c\().4s, \d\().4s
        eor     \b\().16b, \b\().16b, \c\().16b
        chacha_rotl32 \b, \tmp, 12

        // a += b; d ^= a; d <<<= 8
        add     \a\().4s, \a\().4s, \b\().4s
        eor     \d\().16b, \d\().16b, \a\().16b
        chacha_rotl32 \d, \tmp, 8

        // c += d; b ^= c; b <<<= 7
        add     \c\().4s, \c\().4s, \d\().4s
        eor     \b\().16b, \b\().16b, \c\().16b
        chacha_rotl32 \b, \tmp, 7
.endm
// Rotates the lanes of the b, c and d rows of the ChaCha state. Each register
// is rotated by whole 32-bit lanes: a rotation of N bytes moves the old lane
// i + N/4 (mod 4) into lane i.
//
// ## Arguments
//
// - b, c, d: The vector registers holding rows 1-3, updated in place.
// - b_bytes: The byte rotation applied to b (4 = one lane, 12 = three lanes).
// - d_bytes: The byte rotation applied to d.
//
// The c row is always rotated by 8 bytes (two lanes), which is its own inverse.
.macro chacha_shift_rows b, c, d, b_bytes, d_bytes
        ext     \b\().16b, \b\().16b, \b\().16b, #\b_bytes
        ext     \c\().16b, \c\().16b, \c\().16b, #8
        ext     \d\().16b, \d\().16b, \d\().16b, #\d_bytes
.endm

// Performs a double round of ChaCha on four vector registers: a column round
// followed by a diagonal round.
//
// The diagonal round reuses column_round. Rows b, c and d are first rotated by
// one, two and three lanes so that each diagonal of the state lines up in a
// single lane position; afterwards the inverse rotation restores the row
// layout.
//
// ## Arguments
//
// - a, b, c, d: The vector registers holding the four rows of the ChaCha
//               state (one row per register), updated in place.
// - tmp:        An extra temporary vector register, overwritten by
//               column_round.
.macro double_round a, b, c, d, tmp
        // Column round.
        column_round \a, \b, \c, \d, \tmp

        // Line the diagonals up as columns, run the round, then undo the shift.
        chacha_shift_rows \b, \c, \d, 4, 12
        column_round \a, \b, \c, \d, \tmp
        chacha_shift_rows \b, \c, \d, 12, 4
.endm
// The ChaCha20 block function: runs 20 rounds of ChaCha (10 double rounds)
// over the state and then adds the initial state to the result, lane by lane.
//
// ## Arguments
//
// - v0-v3: The ChaCha state, one row of four 32-bit words per register.
//
// ## Returns
//
// - v0-v3: The output ChaCha state.
//
// ## Side Effects
//
// - Trashes x0 (it is zero on return).
// - Trashes the condition flags (the loop counter is decremented with subs).
// - Trashes v4, v5, v6, v7, and v8. All five are zeroed before returning, so
//   no copy of the input state or round intermediates is left in them.
//
// NOTE(review): v8 is callee-saved (low 64 bits) under AAPCS64; confirm that
// every caller uses a convention in which v8 may be clobbered.
.global chacha20
chacha20:
        // Keep the initial state in v4-v7 for the final addition.
        mov     v4.16b, v0.16b
        mov     v5.16b, v1.16b
        mov     v6.16b, v2.16b
        mov     v7.16b, v3.16b

        // x0 counts the remaining double rounds: 10 double rounds = 20 rounds.
        mov     x0, #10
chacha20.loop:
        double_round v0, v1, v2, v3, v8
        subs    x0, x0, #1
        b.ne    chacha20.loop

        // Add the initial state into the permuted state.
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s

        // Zero the saved state copy (v4-v7) and the round scratch register (v8).
.irp scratch, v4, v5, v6, v7, v8
        eor     \scratch\().16b, \scratch\().16b, \scratch\().16b
.endr
        ret
// vim: set ft=arm64asm :