mirror of
https://github.com/tbsdtv/linux_media.git
synced 2025-07-23 20:51:03 +02:00
crypto: arm64/crc-t10dif - move NEON yield to C code
Instead of yielding from the bowels of the asm routine if a reschedule is needed, divide up the input into 4 KB chunks in the C glue. This simplifies the code substantially, and avoids scheduling out the task with the asm routine on the call stack, which is undesirable from a CFI/instrumentation point of view.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
committed by
Herbert Xu
parent
f0070f4a79
commit
fc754c024a
@@ -68,10 +68,10 @@
|
|||||||
.text
|
.text
|
||||||
.arch armv8-a+crypto
|
.arch armv8-a+crypto
|
||||||
|
|
||||||
init_crc .req w19
|
init_crc .req w0
|
||||||
buf .req x20
|
buf .req x1
|
||||||
len .req x21
|
len .req x2
|
||||||
fold_consts_ptr .req x22
|
fold_consts_ptr .req x3
|
||||||
|
|
||||||
fold_consts .req v10
|
fold_consts .req v10
|
||||||
|
|
||||||
@@ -257,12 +257,6 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
|
|||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro crc_t10dif_pmull, p
|
.macro crc_t10dif_pmull, p
|
||||||
frame_push 4, 128
|
|
||||||
|
|
||||||
mov init_crc, w0
|
|
||||||
mov buf, x1
|
|
||||||
mov len, x2
|
|
||||||
|
|
||||||
__pmull_init_\p
|
__pmull_init_\p
|
||||||
|
|
||||||
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
|
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
|
||||||
@@ -317,26 +311,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|||||||
fold_32_bytes \p, v6, v7
|
fold_32_bytes \p, v6, v7
|
||||||
|
|
||||||
subs len, len, #128
|
subs len, len, #128
|
||||||
b.lt .Lfold_128_bytes_loop_done_\@
|
b.ge .Lfold_128_bytes_loop_\@
|
||||||
|
|
||||||
if_will_cond_yield_neon
|
|
||||||
stp q0, q1, [sp, #.Lframe_local_offset]
|
|
||||||
stp q2, q3, [sp, #.Lframe_local_offset + 32]
|
|
||||||
stp q4, q5, [sp, #.Lframe_local_offset + 64]
|
|
||||||
stp q6, q7, [sp, #.Lframe_local_offset + 96]
|
|
||||||
do_cond_yield_neon
|
|
||||||
ldp q0, q1, [sp, #.Lframe_local_offset]
|
|
||||||
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
|
|
||||||
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
|
|
||||||
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
|
|
||||||
ld1 {fold_consts.2d}, [fold_consts_ptr]
|
|
||||||
__pmull_init_\p
|
|
||||||
__pmull_pre_\p fold_consts
|
|
||||||
endif_yield_neon
|
|
||||||
|
|
||||||
b .Lfold_128_bytes_loop_\@
|
|
||||||
|
|
||||||
.Lfold_128_bytes_loop_done_\@:
|
|
||||||
|
|
||||||
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
|
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
|
||||||
|
|
||||||
@@ -453,7 +428,9 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
|
|||||||
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
|
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
|
||||||
|
|
||||||
umov w0, v0.h[0]
|
umov w0, v0.h[0]
|
||||||
frame_pop
|
.ifc \p, p8
|
||||||
|
ldp x29, x30, [sp], #16
|
||||||
|
.endif
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.Lless_than_256_bytes_\@:
|
.Lless_than_256_bytes_\@:
|
||||||
@@ -489,6 +466,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|||||||
// Assumes len >= 16.
|
// Assumes len >= 16.
|
||||||
//
|
//
|
||||||
SYM_FUNC_START(crc_t10dif_pmull_p8)
|
SYM_FUNC_START(crc_t10dif_pmull_p8)
|
||||||
|
stp x29, x30, [sp, #-16]!
|
||||||
|
mov x29, sp
|
||||||
crc_t10dif_pmull p8
|
crc_t10dif_pmull p8
|
||||||
SYM_FUNC_END(crc_t10dif_pmull_p8)
|
SYM_FUNC_END(crc_t10dif_pmull_p8)
|
||||||
|
|
||||||
|
@@ -37,9 +37,18 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
|
|||||||
u16 *crc = shash_desc_ctx(desc);
|
u16 *crc = shash_desc_ctx(desc);
|
||||||
|
|
||||||
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
||||||
|
do {
|
||||||
|
unsigned int chunk = length;
|
||||||
|
|
||||||
|
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
|
||||||
|
chunk = SZ_4K;
|
||||||
|
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
*crc = crc_t10dif_pmull_p8(*crc, data, length);
|
*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
|
data += chunk;
|
||||||
|
length -= chunk;
|
||||||
|
} while (length);
|
||||||
} else {
|
} else {
|
||||||
*crc = crc_t10dif_generic(*crc, data, length);
|
*crc = crc_t10dif_generic(*crc, data, length);
|
||||||
}
|
}
|
||||||
@@ -53,9 +62,18 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
|
|||||||
u16 *crc = shash_desc_ctx(desc);
|
u16 *crc = shash_desc_ctx(desc);
|
||||||
|
|
||||||
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
||||||
|
do {
|
||||||
|
unsigned int chunk = length;
|
||||||
|
|
||||||
|
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
|
||||||
|
chunk = SZ_4K;
|
||||||
|
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
*crc = crc_t10dif_pmull_p64(*crc, data, length);
|
*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
|
data += chunk;
|
||||||
|
length -= chunk;
|
||||||
|
} while (length);
|
||||||
} else {
|
} else {
|
||||||
*crc = crc_t10dif_generic(*crc, data, length);
|
*crc = crc_t10dif_generic(*crc, data, length);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user