crypto: arm64/crc-t10dif - move NEON yield to C code

Instead of yielding from the bowels of the asm routine if a reschedule is needed, divide up the input into 4 KB chunks in the C glue. This simplifies the code substantially, and avoids scheduling out the task with the asm routine on the call stack, which is undesirable from a CFI/instrumentation point of view.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2025-07-23 20:51:03 +02:00 · 2021-02-03 12:36:25 +01:00
parent f0070f4a79
commit fc754c024a
2 changed files with 35 additions and 38 deletions
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -68,10 +68,10 @@
 	.text
 	.arch		armv8-a+crypto
-	init_crc	.req	w19
+	init_crc	.req	w0
-	buf		.req	x20
+	buf		.req	x1
-	len		.req	x21
+	len		.req	x2
-	fold_consts_ptr	.req	x22
+	fold_consts_ptr	.req	x3
 	fold_consts	.req	v10
@@ -257,12 +257,6 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	.endm
 	.macro		crc_t10dif_pmull, p
-	frame_push	4, 128
-	mov		init_crc, w0
-	mov		buf, x1
-	mov		len, x2
 	__pmull_init_\p
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
@@ -317,26 +311,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	fold_32_bytes	\p, v6, v7
 	subs		len, len, #128
-	b.lt		.Lfold_128_bytes_loop_done_\@
+	b.ge		.Lfold_128_bytes_loop_\@
-	if_will_cond_yield_neon
-	stp		q0, q1, [sp, #.Lframe_local_offset]
-	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	do_cond_yield_neon
-	ldp		q0, q1, [sp, #.Lframe_local_offset]
-	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
-	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
-	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
-	ld1		{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_init_\p
-	__pmull_pre_\p	fold_consts
-	endif_yield_neon
-	b		.Lfold_128_bytes_loop_\@
-.Lfold_128_bytes_loop_done_\@:
 	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
@@ -453,7 +428,9 @@ CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
 	umov		w0, v0.h[0]
-	frame_pop
+	.ifc		\p, p8
+	ldp		x29, x30, [sp], #16
+	.endif
 	ret
 .Lless_than_256_bytes_\@:
@@ -489,6 +466,8 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 // Assumes len >= 16.
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
 	// Push a frame record (x29, x30) and point x29 at it. The matching
 	// "ldp x29, x30, [sp], #16" is emitted by the crc_t10dif_pmull macro
 	// right before its ret, but only when it is expanded with \p == p8.
 	// NOTE(review): presumably the p8 expansion needs x30 preserved
 	// because it branches-with-link to a helper -- confirm against the
 	// __pmull_*_p8 macros, which are not visible here.
 	stp		x29, x30, [sp, #-16]!
 	mov		x29, sp
 	// Expand the shared CRC-T10DIF body for the p8 variant; the macro
 	// supplies the epilogue and the ret.
 	crc_t10dif_pmull p8
 SYM_FUNC_END(crc_t10dif_pmull_p8)
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
 	u16 *crc = shash_desc_ctx(desc);
 	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
 		do {
 			unsigned int chunk = length;
 			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
 				chunk = SZ_4K;
 			kernel_neon_begin();
-		*crc = crc_t10dif_pmull_p8(*crc, data, length);
+			*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
 			kernel_neon_end();
 			data += chunk;
 			length -= chunk;
 		} while (length);
 	} else {
 		*crc = crc_t10dif_generic(*crc, data, length);
 	}
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
 	u16 *crc = shash_desc_ctx(desc);
 	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
 		do {
 			unsigned int chunk = length;
 			if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
 				chunk = SZ_4K;
 			kernel_neon_begin();
-		*crc = crc_t10dif_pmull_p64(*crc, data, length);
+			*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
 			kernel_neon_end();
 			data += chunk;
 			length -= chunk;
 		} while (length);
 	} else {
 		*crc = crc_t10dif_generic(*crc, data, length);
 	}