x86: improve on the non-rep 'clear_user' function

The old version was oddly written to have the repeat count in multiple
registers.  So instead of taking advantage of %rax being zero, it had
some sub-counts in it.  All just for a "single word clearing" loop,
which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same
registers we got it in (and that we should return things in).  That not
only makes this act much more like 'rep stos' (which this function is
replacing), but makes it much easier to actually do the obvious loop
unrolling.
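
For readers following along, the shape of the new code can be expressed as a short C sketch. This is illustrative only (the names are made up and the real implementation is the assembly in the diff below), but it shows the point of the rewrite: destination, count and the zero value stay in the same three slots that 'rep stos' uses, and the main loop clears 64 bytes per pass before falling back to word and byte tails.

/*
 * Illustrative sketch, not kernel code: mirrors the shape of
 * rep_stos_alternative.  'dst' plays %rdi, 'count' plays %rcx and
 * 'zero' plays %rax; the return value is the uncleared byte count
 * (always 0 here, since this sketch does not model faults).
 */
static unsigned long clear_sketch(unsigned char *dst, unsigned long count)
{
	const unsigned long zero = 0;

	while (count >= 64) {			/* .Lunrolled: eight 8-byte stores */
		for (int i = 0; i < 64; i += 8)
			__builtin_memcpy(dst + i, &zero, sizeof(zero));
		dst += 64;
		count -= 64;
	}
	while (count >= 8) {			/* .Lword: one 8-byte store */
		__builtin_memcpy(dst, &zero, sizeof(zero));
		dst += 8;
		count -= 8;
	}
	while (count) {				/* .Lclear_user_tail: byte stores */
		*dst++ = 0;
		count--;
	}
	return count;				/* 0: everything cleared */
}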

Also rename the function from the now nonsensical 'clear_user_original'
to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a
fast fallback for it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Linus Torvalds, 2023-04-16 14:06:58 -07:00
commit 8c9b6a88b7, parent 577e6a7fd5
3 changed files with 74 additions and 48 deletions

arch/x86/include/asm/uaccess_64.h

@@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
  */
 
 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);
 
 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
 	asm volatile(
 		"1:\n\t"
 		ALTERNATIVE("rep stosb",
-			    "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
 		"2:\n"
 	       _ASM_EXTABLE_UA(1b, 2b)
 	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
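
In the hunk above, ALTERNATIVE() keeps the inline 'rep stosb' on CPUs with X86_FEATURE_FSRS and patches in a call to rep_stos_alternative otherwise. The "+c" (size) and "+D" (addr) constraints, with %rax carrying the zero value (as the new "rax is zero" comment below notes), are exactly the 'rep stos' register convention, so the two paths look the same to callers. A minimal usage sketch, not part of this patch, assuming only the standard <linux/uaccess.h> API:

#include <linux/errno.h>
#include <linux/uaccess.h>

/*
 * Usage sketch only (not from this patch): clear_user() returns the
 * number of bytes that could NOT be cleared, i.e. whatever is left in
 * %rcx when either the 'rep stosb' or the rep_stos_alternative path
 * returns.  Zero means the whole range was cleared.
 */
static int zero_user_buffer(void __user *ubuf, unsigned long len)
{
	unsigned long uncleared = clear_user(ubuf, len);

	return uncleared ? -EFAULT : 0;
}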

arch/x86/lib/clear_page_64.S

@@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
  * Input:
  * rdi destination
  * rcx count
+ * rax is zero
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
  */
-SYM_FUNC_START(clear_user_original)
-	/*
-	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
-	 * i.e., no need for a 'q' suffix and thus a REX prefix.
-	 */
-	mov %ecx,%eax
-	shr $3,%rcx
-	jz .Lrest_bytes
-
-	# do the qwords first
-	.p2align 4
-.Lqwords:
-	movq $0,(%rdi)
-	lea 8(%rdi),%rdi
-	dec %rcx
-	jnz .Lqwords
-
-.Lrest_bytes:
-	and $7, %eax
-	jz .Lexit
-
-	# now do the rest bytes
-.Lbytes:
-	movb $0,(%rdi)
+SYM_FUNC_START(rep_stos_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled
+
+	cmp $8,%ecx
+	jae .Lword
+
+	testl %ecx,%ecx
+	je .Lexit
+
+.Lclear_user_tail:
+0:	movb %al,(%rdi)
 	inc %rdi
-	dec %eax
-	jnz .Lbytes
-
+	dec %rcx
+	jnz .Lclear_user_tail
 .Lexit:
+	RET
+
+	_ASM_EXTABLE_UA( 0b, .Lexit)
+
+.Lword:
+1:	movq %rax,(%rdi)
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lclear_user_tail
+
+	.p2align 4
+.Lunrolled:
+10:	movq %rax,(%rdi)
+11:	movq %rax,8(%rdi)
+12:	movq %rax,16(%rdi)
+13:	movq %rax,24(%rdi)
+14:	movq %rax,32(%rdi)
+15:	movq %rax,40(%rdi)
+16:	movq %rax,48(%rdi)
+17:	movq %rax,56(%rdi)
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lclear_user_tail
+	RET
+
 	/*
-	 * %rax still needs to be cleared in the exception case because this function is called
-	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
-	 * in case it might reuse it somewhere.
+	 * If we take an exception on any of the
+	 * word stores, we know that %rcx isn't zero,
+	 * so we can just go to the tail clearing to
+	 * get the exact count.
+	 *
+	 * The unrolled case might end up clearing
+	 * some bytes twice. Don't care.
+	 *
+	 * We could use the value in %rdi to avoid
+	 * a second fault on the exact count case,
+	 * but do we really care? No.
+	 *
+	 * Finally, we could try to align %rdi at the
+	 * top of the unrolling. But unaligned stores
+	 * just aren't that common or expensive.
 	 */
-	xor %eax,%eax
-	RET
-
-.Lqwords_exception:
-	# convert remaining qwords back into bytes to return to caller
-	shl $3, %rcx
-	and $7, %eax
-	add %rax,%rcx
-	jmp .Lexit
-
-.Lbytes_exception:
-	mov %eax,%ecx
-	jmp .Lexit
-
-	_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
-	_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
+	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
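
The exception-table layout above can be read as the following C model, a hedged sketch rather than the real code: store8() and store1() are made-up stand-ins for the numbered movq/movb user stores, with a zero return modelling a fault caught by _ASM_EXTABLE_UA. A fault in any word or unrolled store drops into the byte-at-a-time tail, which walks forward to the exact faulting byte, and a fault in the byte loop itself exits with the precise uncleared count still in %rcx.

/* Illustrative stand-ins: returning 0 models a faulting user store. */
static int store8(unsigned char *p) { __builtin_memset(p, 0, 8); return 1; }
static int store1(unsigned char *p) { *p = 0; return 1; }

/* Model of the fault handling only; the real loops are the asm above. */
static unsigned long clear_model(unsigned char *dst, unsigned long count)
{
	while (count >= 8) {
		if (!store8(dst))	/* 1: or 10:-17: fault -> .Lclear_user_tail */
			break;
		dst += 8;
		count -= 8;
	}
	while (count) {			/* .Lclear_user_tail */
		if (!store1(dst))	/* 0: fault -> .Lexit */
			break;
		dst++;
		count--;
	}
	return count;			/* exact number of uncleared bytes */
}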

tools/objtool/check.c

@@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_original",
+	"rep_stos_alternative",
 	"copy_user_generic_unrolled",
 	"__copy_user_nocache",
 	NULL