[PATCH 06/19] crypto: arm/ghash - Move NEON GHASH assembly into its own file
Eric Biggers
ebiggers@kernel.org
Thu Mar 19 17:17:07 AEDT 2026

arch/arm/crypto/ghash-ce-core.S implements pmull_ghash_update_p8(),
which is used only by a crypto_shash implementation of GHASH. It also
implements pmull_ghash_update_p64() and other functions that are used
only by a crypto_aead implementation of AES-GCM.

While some code is shared between pmull_ghash_update_p8() and
pmull_ghash_update_p64(), it's not very much. Since
pmull_ghash_update_p8() will also need to be migrated into lib/crypto/
to achieve parity in the standalone GHASH support, let's move it into a
separate file, ghash-neon-core.S.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
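For reviewers who want a plain-C model of the operation that both
pmull_ghash_update_p8() and pmull_ghash_update_p64() compute, a minimal
sketch following the GHASH definition in NIST SP 800-38D (dg = (dg ^ block)
* H in GF(2^128)) is below. The type and function names are made up for
this note, and the real routines use a different in-memory layout for dg[]
and the precomputed key powers, so treat this as a reference for the math
rather than a model of the asm interface.

#include <stdint.h>
#include <stddef.h>

struct be128 { uint64_t hi, lo; };	/* hi = first 8 bytes of the block */

/* GF(2^128) multiply as specified for GCM (NIST SP 800-38D, algorithm 1) */
static struct be128 gf128_mul(struct be128 x, struct be128 y)
{
	struct be128 z = { 0, 0 };
	struct be128 v = y;
	int i;

	for (i = 0; i < 128; i++) {
		/* bit i of x, bit 0 being the MSB of the first byte */
		uint64_t xi = i < 64 ? (x.hi >> (63 - i)) & 1
				     : (x.lo >> (127 - i)) & 1;

		if (xi) {
			z.hi ^= v.hi;
			z.lo ^= v.lo;
		}
		/* v >>= 1, folding in R = 0xe1 << 120 on carry-out */
		if (v.lo & 1) {
			v.lo = (v.lo >> 1) | (v.hi << 63);
			v.hi = (v.hi >> 1) ^ 0xe100000000000000ULL;
		} else {
			v.lo = (v.lo >> 1) | (v.hi << 63);
			v.hi >>= 1;
		}
	}
	return z;
}

/* dg = (dg ^ block) * H for each 16-byte block of src */
static void ghash_update_ref(struct be128 *dg, struct be128 h,
			     const uint8_t *src, size_t blocks)
{
	while (blocks--) {
		struct be128 t = { 0, 0 };
		int i;

		for (i = 0; i < 8; i++) {
			t.hi = (t.hi << 8) | src[i];
			t.lo = (t.lo << 8) | src[8 + i];
		}
		src += 16;
		dg->hi ^= t.hi;
		dg->lo ^= t.lo;
		*dg = gf128_mul(*dg, h);
	}
}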
arch/arm/crypto/Makefile | 2 +-
arch/arm/crypto/ghash-ce-core.S | 171 ++----------------------
arch/arm/crypto/ghash-neon-core.S | 207 ++++++++++++++++++++++++++++++
3 files changed, 222 insertions(+), 158 deletions(-)
create mode 100644 arch/arm/crypto/ghash-neon-core.S
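On the point that the p8 and p64 paths share only a little code: what they
do share is the shape of the multiply. The ghash_update macro builds each
128x128 carry-less product from three 64x64 products, XH = a1*b1,
XL = a0*b0 and XM = (a1+a0)*(b1+b0), i.e. one level of Karatsuba, and then
recovers the middle term as XM ^ XH ^ XL; __pmull_p8 (moved by this patch)
is only the vmull.p8 substitute for each 64x64 multiply. Below is a small
self-checking C sketch of that identity, where clmul64() is a bit-serial
stand-in written for this note rather than a kernel helper.

#include <stdint.h>
#include <stdio.h>

struct cl128 { uint64_t hi, lo; };

/* bit-serial 64x64 -> 128 carry-less multiply (stand-in for vmull.p64) */
static struct cl128 clmul64(uint64_t a, uint64_t b)
{
	struct cl128 r = { 0, 0 };
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	}
	return r;
}

int main(void)
{
	uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
	uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x8796a5b4c3d2e1f0ULL;

	/* the three products the assembly computes */
	struct cl128 xh = clmul64(a1, b1);		/* a1 * b1 */
	struct cl128 xl = clmul64(a0, b0);		/* a0 * b0 */
	struct cl128 xm = clmul64(a1 ^ a0, b1 ^ b0);	/* (a1+a0)(b1+b0) */

	/* Karatsuba: middle term = XM ^ XH ^ XL = a1*b0 ^ a0*b1 */
	xm.hi ^= xh.hi ^ xl.hi;
	xm.lo ^= xh.lo ^ xl.lo;

	/* cross-check against the schoolbook middle term; prints 1 */
	struct cl128 m1 = clmul64(a1, b0), m2 = clmul64(a0, b1);
	printf("%d\n", xm.hi == (m1.hi ^ m2.hi) && xm.lo == (m1.lo ^ m2.lo));
	return 0;
}
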
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index e73099e120b3..cedce94d5ee5 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
-ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
+ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o ghash-neon-core.o
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 858c0d66798b..a449525d61f8 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
+ * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
*
* Copyright (C) 2015 - 2017 Linaro Ltd.
* Copyright (C) 2023 Google LLC. <ardb at google.com>
*/
@@ -27,43 +27,14 @@
XL_H .req d5
XM_L .req d6
XM_H .req d7
XH_L .req d8
- t0l .req d10
- t0h .req d11
- t1l .req d12
- t1h .req d13
- t2l .req d14
- t2h .req d15
- t3l .req d16
- t3h .req d17
- t4l .req d18
- t4h .req d19
-
- t0q .req q5
- t1q .req q6
- t2q .req q7
- t3q .req q8
- t4q .req q9
XH2 .req q9
- s1l .req d20
- s1h .req d21
- s2l .req d22
- s2h .req d23
- s3l .req d24
- s3h .req d25
- s4l .req d26
- s4h .req d27
-
MASK .req d28
- SHASH2_p8 .req d28
- k16 .req d29
- k32 .req d30
- k48 .req d31
SHASH2_p64 .req d31
HH .req q10
HH3 .req q11
HH4 .req q12
@@ -91,76 +62,10 @@
T3_L .req d16
T3_H .req d17
.text
- .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4
- vmull.p64 \rd, \rn, \rm
- .endm
-
- /*
- * This implementation of 64x64 -> 128 bit polynomial multiplication
- * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
- * "Fast Software Polynomial Multiplication on ARM Processors Using
- * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
- * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
- *
- * It has been slightly tweaked for in-order performance, and to allow
- * 'rq' to overlap with 'ad' or 'bd'.
- */
- .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
- vext.8 t0l, \ad, \ad, #1 @ A1
- .ifc \b1, t4l
- vext.8 t4l, \bd, \bd, #1 @ B1
- .endif
- vmull.p8 t0q, t0l, \bd @ F = A1*B
- vext.8 t1l, \ad, \ad, #2 @ A2
- vmull.p8 t4q, \ad, \b1 @ E = A*B1
- .ifc \b2, t3l
- vext.8 t3l, \bd, \bd, #2 @ B2
- .endif
- vmull.p8 t1q, t1l, \bd @ H = A2*B
- vext.8 t2l, \ad, \ad, #3 @ A3
- vmull.p8 t3q, \ad, \b2 @ G = A*B2
- veor t0q, t0q, t4q @ L = E + F
- .ifc \b3, t4l
- vext.8 t4l, \bd, \bd, #3 @ B3
- .endif
- vmull.p8 t2q, t2l, \bd @ J = A3*B
- veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
- veor t1q, t1q, t3q @ M = G + H
- .ifc \b4, t3l
- vext.8 t3l, \bd, \bd, #4 @ B4
- .endif
- vmull.p8 t4q, \ad, \b3 @ I = A*B3
- veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
- vmull.p8 t3q, \ad, \b4 @ K = A*B4
- vand t0h, t0h, k48
- vand t1h, t1h, k32
- veor t2q, t2q, t4q @ N = I + J
- veor t0l, t0l, t0h
- veor t1l, t1l, t1h
- veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
- vand t2h, t2h, k16
- veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
- vmov.i64 t3h, #0
- vext.8 t0q, t0q, t0q, #15
- veor t2l, t2l, t2h
- vext.8 t1q, t1q, t1q, #14
- vmull.p8 \rq, \ad, \bd @ D = A*B
- vext.8 t2q, t2q, t2q, #13
- vext.8 t3q, t3q, t3q, #12
- veor t0q, t0q, t1q
- veor t2q, t2q, t3q
- veor \rq, \rq, t0q
- veor \rq, \rq, t2q
- .endm
-
- //
- // PMULL (64x64->128) based reduction for CPUs that can do
- // it in a single instruction.
- //
.macro __pmull_reduce_p64
vmull.p64 T1, XL_L, MASK
veor XH_L, XH_L, XM_H
vext.8 T1, T1, T1, #8
@@ -168,34 +73,11 @@
veor T1, T1, XL
vmull.p64 XL, T1_H, MASK
.endm
- //
- // Alternative reduction for CPUs that lack support for the
- // 64x64->128 PMULL instruction
- //
- .macro __pmull_reduce_p8
- veor XL_H, XL_H, XM_L
- veor XH_L, XH_L, XM_H
-
- vshl.i64 T1, XL, #57
- vshl.i64 T2, XL, #62
- veor T1, T1, T2
- vshl.i64 T2, XL, #63
- veor T1, T1, T2
- veor XL_H, XL_H, T1_L
- veor XH_L, XH_L, T1_H
-
- vshr.u64 T1, XL, #1
- veor XH, XH, XL
- veor XL, XL, T1
- vshr.u64 T1, T1, #6
- vshr.u64 XL, XL, #1
- .endm
-
- .macro ghash_update, pn, enc, aggregate=1, head=1
+ .macro ghash_update, enc, aggregate=1, head=1
vld1.64 {XL}, [r1]
.if \head
/* do the head block first, if supplied */
ldr ip, [sp]
@@ -204,12 +86,11 @@
vld1.64 {T1}, [ip]
teq r0, #0
b 3f
.endif
-0: .ifc \pn, p64
- .if \aggregate
+0: .if \aggregate
tst r0, #3 // skip until #blocks is a
bne 2f // round multiple of 4
vld1.8 {XL2-XM2}, [r2]!
1: vld1.8 {T2-T3}, [r2]!
@@ -286,11 +167,10 @@
veor T1, T1, XH
veor XL, XL, T1
b 1b
.endif
- .endif
2: vld1.8 {T1}, [r2]!
.ifnb \enc
\enc\()_1x T1
@@ -306,29 +186,29 @@
vext.8 IN1, T1, T1, #8
veor T1_L, T1_L, XL_H
veor XL, XL, IN1
- __pmull_\pn XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ vmull.p64 XH, XL_H, SHASH_H @ a1 * b1
veor T1, T1, XL
- __pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
- __pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
+ vmull.p64 XL, XL_L, SHASH_L @ a0 * b0
+ vmull.p64 XM, T1_L, SHASH2_p64 @ (a1+a0)(b1+b0)
4: veor T1, XL, XH
veor XM, XM, T1
- __pmull_reduce_\pn
+ __pmull_reduce_p64
veor T1, T1, XH
veor XL, XL, T1
bne 0b
.endm
/*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
+ * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+ * u64 const h[4][2], const char *head)
*/
ENTRY(pmull_ghash_update_p64)
vld1.64 {SHASH}, [r3]!
vld1.64 {HH}, [r3]!
vld1.64 {HH3-HH4}, [r3]
@@ -339,39 +219,16 @@ ENTRY(pmull_ghash_update_p64)
veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
- ghash_update p64
+ ghash_update
vst1.64 {XL}, [r1]
bx lr
ENDPROC(pmull_ghash_update_p64)
-ENTRY(pmull_ghash_update_p8)
- vld1.64 {SHASH}, [r3]
- veor SHASH2_p8, SHASH_L, SHASH_H
-
- vext.8 s1l, SHASH_L, SHASH_L, #1
- vext.8 s2l, SHASH_L, SHASH_L, #2
- vext.8 s3l, SHASH_L, SHASH_L, #3
- vext.8 s4l, SHASH_L, SHASH_L, #4
- vext.8 s1h, SHASH_H, SHASH_H, #1
- vext.8 s2h, SHASH_H, SHASH_H, #2
- vext.8 s3h, SHASH_H, SHASH_H, #3
- vext.8 s4h, SHASH_H, SHASH_H, #4
-
- vmov.i64 k16, #0xffff
- vmov.i64 k32, #0xffffffff
- vmov.i64 k48, #0xffffffffffff
-
- ghash_update p8
- vst1.64 {XL}, [r1]
-
- bx lr
-ENDPROC(pmull_ghash_update_p8)
-
e0 .req q9
e1 .req q10
e2 .req q11
e3 .req q12
e0l .req d18
@@ -534,11 +391,11 @@ ENTRY(pmull_gcm_encrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, enc, head=0
+ ghash_update enc, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
@@ -552,11 +409,11 @@ ENTRY(pmull_gcm_decrypt)
ldrd r4, r5, [sp, #24]
ldrd r6, r7, [sp, #32]
vld1.64 {SHASH}, [r3]
- ghash_update p64, dec, head=0
+ ghash_update dec, head=0
vst1.64 {XL}, [r1]
pop {r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
@@ -601,11 +458,11 @@ ENTRY(pmull_gcm_enc_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
@@ -658,11 +515,11 @@ ENTRY(pmull_gcm_dec_final)
vmov.i8 MASK, #0xe1
veor SHASH2_p64, SHASH_L, SHASH_H
vshl.u64 MASK, MASK, #57
mov r0, #1
bne 3f // process head block first
- ghash_update p64, aggregate=0, head=0
+ ghash_update aggregate=0, head=0
vrev64.8 XL, XL
vext.8 XL, XL, XL, #8
veor XL, XL, e1
diff --git a/arch/arm/crypto/ghash-neon-core.S b/arch/arm/crypto/ghash-neon-core.S
new file mode 100644
index 000000000000..bdf6fb6d063c
--- /dev/null
+++ b/arch/arm/crypto/ghash-neon-core.S
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GHASH implementation with NEON vmull.p8 instructions.
+ *
+ * Copyright (C) 2015 - 2017 Linaro Ltd.
+ * Copyright (C) 2023 Google LLC. <ardb at google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .fpu neon
+
+ SHASH .req q0
+ T1 .req q1
+ XL .req q2
+ XM .req q3
+ XH .req q4
+ IN1 .req q4
+
+ SHASH_L .req d0
+ SHASH_H .req d1
+ T1_L .req d2
+ T1_H .req d3
+ XL_L .req d4
+ XL_H .req d5
+ XM_L .req d6
+ XM_H .req d7
+ XH_L .req d8
+
+ t0l .req d10
+ t0h .req d11
+ t1l .req d12
+ t1h .req d13
+ t2l .req d14
+ t2h .req d15
+ t3l .req d16
+ t3h .req d17
+ t4l .req d18
+ t4h .req d19
+
+ t0q .req q5
+ t1q .req q6
+ t2q .req q7
+ t3q .req q8
+ t4q .req q9
+
+ s1l .req d20
+ s1h .req d21
+ s2l .req d22
+ s2h .req d23
+ s3l .req d24
+ s3h .req d25
+ s4l .req d26
+ s4h .req d27
+
+ SHASH2_p8 .req d28
+
+ k16 .req d29
+ k32 .req d30
+ k48 .req d31
+
+ T2 .req q7
+
+ .text
+
+ /*
+ * This implementation of 64x64 -> 128 bit polynomial multiplication
+ * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
+ * "Fast Software Polynomial Multiplication on ARM Processors Using
+ * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
+ * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+ *
+ * It has been slightly tweaked for in-order performance, and to allow
+ * 'rq' to overlap with 'ad' or 'bd'.
+ */
+ .macro __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+ vext.8 t0l, \ad, \ad, #1 @ A1
+ .ifc \b1, t4l
+ vext.8 t4l, \bd, \bd, #1 @ B1
+ .endif
+ vmull.p8 t0q, t0l, \bd @ F = A1*B
+ vext.8 t1l, \ad, \ad, #2 @ A2
+ vmull.p8 t4q, \ad, \b1 @ E = A*B1
+ .ifc \b2, t3l
+ vext.8 t3l, \bd, \bd, #2 @ B2
+ .endif
+ vmull.p8 t1q, t1l, \bd @ H = A2*B
+ vext.8 t2l, \ad, \ad, #3 @ A3
+ vmull.p8 t3q, \ad, \b2 @ G = A*B2
+ veor t0q, t0q, t4q @ L = E + F
+ .ifc \b3, t4l
+ vext.8 t4l, \bd, \bd, #3 @ B3
+ .endif
+ vmull.p8 t2q, t2l, \bd @ J = A3*B
+ veor t0l, t0l, t0h @ t0 = (L) (P0 + P1) << 8
+ veor t1q, t1q, t3q @ M = G + H
+ .ifc \b4, t3l
+ vext.8 t3l, \bd, \bd, #4 @ B4
+ .endif
+ vmull.p8 t4q, \ad, \b3 @ I = A*B3
+ veor t1l, t1l, t1h @ t1 = (M) (P2 + P3) << 16
+ vmull.p8 t3q, \ad, \b4 @ K = A*B4
+ vand t0h, t0h, k48
+ vand t1h, t1h, k32
+ veor t2q, t2q, t4q @ N = I + J
+ veor t0l, t0l, t0h
+ veor t1l, t1l, t1h
+ veor t2l, t2l, t2h @ t2 = (N) (P4 + P5) << 24
+ vand t2h, t2h, k16
+ veor t3l, t3l, t3h @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 t3h, #0
+ vext.8 t0q, t0q, t0q, #15
+ veor t2l, t2l, t2h
+ vext.8 t1q, t1q, t1q, #14
+ vmull.p8 \rq, \ad, \bd @ D = A*B
+ vext.8 t2q, t2q, t2q, #13
+ vext.8 t3q, t3q, t3q, #12
+ veor t0q, t0q, t1q
+ veor t2q, t2q, t3q
+ veor \rq, \rq, t0q
+ veor \rq, \rq, t2q
+ .endm
+
+ .macro __pmull_reduce_p8
+ veor XL_H, XL_H, XM_L
+ veor XH_L, XH_L, XM_H
+
+ vshl.i64 T1, XL, #57
+ vshl.i64 T2, XL, #62
+ veor T1, T1, T2
+ vshl.i64 T2, XL, #63
+ veor T1, T1, T2
+ veor XL_H, XL_H, T1_L
+ veor XH_L, XH_L, T1_H
+
+ vshr.u64 T1, XL, #1
+ veor XH, XH, XL
+ veor XL, XL, T1
+ vshr.u64 T1, T1, #6
+ vshr.u64 XL, XL, #1
+ .endm
+
+ .macro ghash_update
+ vld1.64 {XL}, [r1]
+
+ /* do the head block first, if supplied */
+ ldr ip, [sp]
+ teq ip, #0
+ beq 0f
+ vld1.64 {T1}, [ip]
+ teq r0, #0
+ b 3f
+
+0:
+ vld1.8 {T1}, [r2]!
+ subs r0, r0, #1
+
+3: /* multiply XL by SHASH in GF(2^128) */
+ vrev64.8 T1, T1
+
+ vext.8 IN1, T1, T1, #8
+ veor T1_L, T1_L, XL_H
+ veor XL, XL, IN1
+
+ __pmull_p8 XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h @ a1 * b1
+ veor T1, T1, XL
+ __pmull_p8 XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
+ __pmull_p8 XM, T1_L, SHASH2_p8 @ (a1+a0)(b1+b0)
+
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p8
+
+ veor T1, T1, XH
+ veor XL, XL, T1
+
+ bne 0b
+ .endm
+
+ /*
+ * void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+ * u64 const h[1][2], const char *head)
+ */
+ENTRY(pmull_ghash_update_p8)
+ vld1.64 {SHASH}, [r3]
+ veor SHASH2_p8, SHASH_L, SHASH_H
+
+ vext.8 s1l, SHASH_L, SHASH_L, #1
+ vext.8 s2l, SHASH_L, SHASH_L, #2
+ vext.8 s3l, SHASH_L, SHASH_L, #3
+ vext.8 s4l, SHASH_L, SHASH_L, #4
+ vext.8 s1h, SHASH_H, SHASH_H, #1
+ vext.8 s2h, SHASH_H, SHASH_H, #2
+ vext.8 s3h, SHASH_H, SHASH_H, #3
+ vext.8 s4h, SHASH_H, SHASH_H, #4
+
+ vmov.i64 k16, #0xffff
+ vmov.i64 k32, #0xffffffff
+ vmov.i64 k48, #0xffffffffffff
+
+ ghash_update
+ vst1.64 {XL}, [r1]
+
+ bx lr
+ENDPROC(pmull_ghash_update_p8)
--
2.53.0