From 6becb5026b8192e0ed6619d6e7793c2f1288244f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Feb 2023 18:10:51 +0100 Subject: [PATCH 01/11] x86/alternative: Make debug-alternative selective Using debug-alternative generates a *LOT* of output, extend it a bit to select which of the many rewrites it reports on. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230208171431.253636689@infradead.org --- arch/x86/kernel/alternative.c | 62 +++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f615e0cb6d93..b3ae6cfa92f4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) -static int __initdata_or_module debug_alternative; +#define DA_ALL (~0) +#define DA_ALT 0x01 +#define DA_RET 0x02 +#define DA_RETPOLINE 0x04 +#define DA_ENDBR 0x08 +#define DA_SMP 0x10 + +static unsigned int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { - debug_alternative = 1; + if (str && *str == '=') + str++; + + if (!str || kstrtouint(str, 0, &debug_alternative)) + debug_alternative = DA_ALL; + return 1; } __setup("debug-alternative", debug_alt); @@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#define DPRINTK(fmt, args...) \ +#define DPRINTK(type, fmt, args...) \ do { \ - if (debug_alternative) \ + if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) -#define DUMP_BYTES(buf, len, fmt, args...) \ +#define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ - if (unlikely(debug_alternative)) { \ + if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ @@ -148,7 +160,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) tgt_rip = next_rip + o_dspl; n_dspl = tgt_rip - orig_insn; - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); + DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); if (tgt_rip - orig_insn >= 0) { if (n_dspl - 2 <= 127) @@ -183,7 +195,7 @@ five_byte_jmp: done: - DPRINTK("final displ: 0x%08x, JMP 0x%lx", + DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx", n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } @@ -217,7 +229,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) add_nops(instr + off, nnops); local_irq_restore(flags); - DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); return nnops; } @@ -270,7 +282,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; - DPRINTK("alt table %px, -> %px", start, end); + DPRINTK(ALT, "alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -297,15 +309,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) goto next; - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->flags & ALT_FLAG_NOT) ? "!" 
: "", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; @@ -318,7 +330,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ if (a->replacementlen == 5 && *insn_buff == 0xe8) { *(s32 *)(insn_buff + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx", *(s32 *)(insn_buff + 1), (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); } @@ -329,7 +341,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); @@ -555,15 +567,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) continue; } - DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(bytes, len); - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -630,14 +642,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) addr, dest, 5, addr)) continue; - DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } @@ -667,13 +679,13 @@ static void poison_endbr(void *addr, bool warn) return; } - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); /* * When we have IBT, the lack of ENDBR will trigger #CP */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); text_poke_early(addr, &poison, 4); } @@ -1148,7 +1160,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); From 270a69c4485d7d07516d058bcc0473c90ee22185 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Feb 2023 18:10:52 +0100 Subject: [PATCH 02/11] x86/alternative: Support relocations in 
alternatives A little while ago someone (Kirill) ran into the whole 'alternatives don't do relocations nonsense' again and I got annoyed enough to actually look at the code. Since the whole alternative machinery already fully decodes the instructions it is simple enough to adjust immediates and displacement when needed. Specifically, the immediates for IP modifying instructions (JMP, CALL, Jcc) and the displacement for RIP-relative instructions. [ bp: Massage comment some more and get rid of third loop in apply_relocation(). ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230208171431.313857925@infradead.org --- arch/x86/kernel/alternative.c | 261 ++++++++++++++++++++----------- tools/objtool/arch/x86/special.c | 8 +- 2 files changed, 173 insertions(+), 96 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b3ae6cfa92f4..28eb1d0bc4a0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -134,71 +134,6 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); -/* - * Are we looking at a near JMP with a 1 or 4-byte displacement. - */ -static inline bool is_jmp(const u8 opcode) -{ - return opcode == 0xeb || opcode == 0xe9; -} - -static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) -{ - u8 *next_rip, *tgt_rip; - s32 n_dspl, o_dspl; - int repl_len; - - if (a->replacementlen != 5) - return; - - o_dspl = *(s32 *)(insn_buff + 1); - - /* next_rip of the replacement JMP */ - next_rip = repl_insn + a->replacementlen; - /* target rip of the replacement JMP */ - tgt_rip = next_rip + o_dspl; - n_dspl = tgt_rip - orig_insn; - - DPRINTK(ALT, "target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); - - if (tgt_rip - orig_insn >= 0) { - if (n_dspl - 2 <= 127) - goto two_byte_jmp; - else - goto five_byte_jmp; - /* negative offset */ - } else { - if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) - goto two_byte_jmp; - else - goto five_byte_jmp; - } - -two_byte_jmp: - n_dspl -= 2; - - insn_buff[0] = 0xeb; - insn_buff[1] = (s8)n_dspl; - add_nops(insn_buff + 2, 3); - - repl_len = 2; - goto done; - -five_byte_jmp: - n_dspl -= 5; - - insn_buff[0] = 0xe9; - *(s32 *)&insn_buff[1] = n_dspl; - - repl_len = 5; - -done: - - DPRINTK(ALT, "final displ: 0x%08x, JMP 0x%lx", - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); -} - /* * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) * @@ -265,6 +200,139 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) } } +/* + * In this context, "source" is where the instructions are placed in the + * section .altinstr_replacement, for example during kernel build by the + * toolchain. + * "Destination" is where the instructions are being patched in by this + * machinery. 
+ * + * The source offset is: + * + * src_imm = target - src_next_ip (1) + * + * and the target offset is: + * + * dst_imm = target - dst_next_ip (2) + * + * so rework (1) as an expression for target like: + * + * target = src_imm + src_next_ip (1a) + * + * and substitute in (2) to get: + * + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) + * + * Now, since the instruction stream is 'identical' at src and dst (it + * is being copied after all) it can be stated that: + * + * src_next_ip = src + ip_offset + * dst_next_ip = dst + ip_offset (4) + * + * Substitute (4) in (3) and observe ip_offset being cancelled out to + * obtain: + * + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) + * = src_imm + src - dst + ip_offset - ip_offset + * = src_imm + src - dst (5) + * + * IOW, only the relative displacement of the code block matters. + */ + +#define apply_reloc_n(n_, p_, d_) \ + do { \ + s32 v = *(s##n_ *)(p_); \ + v += (d_); \ + BUG_ON((v >> 31) != (v >> (n_-1))); \ + *(s##n_ *)(p_) = (s##n_)v; \ + } while (0) + + +static __always_inline +void apply_reloc(int n, void *ptr, uintptr_t diff) +{ + switch (n) { + case 1: apply_reloc_n(8, ptr, diff); break; + case 2: apply_reloc_n(16, ptr, diff); break; + case 4: apply_reloc_n(32, ptr, diff); break; + default: BUG(); + } +} + +static __always_inline +bool need_reloc(unsigned long offset, u8 *src, size_t src_len) +{ + u8 *target = src + offset; + /* + * If the target is inside the patched block, it's relative to the + * block itself and does not need relocation. + */ + return (target < src || target > src + src_len); +} + +static void __init_or_module noinline +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +{ + for (int next, i = 0; i < len; i = next) { + struct insn insn; + + if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) + return; + + next = i + insn.length; + + switch (insn.opcode.bytes[0]) { + case 0x0f: + if (insn.opcode.bytes[1] < 0x80 || + insn.opcode.bytes[1] > 0x8f) + break; + + fallthrough; /* Jcc.d32 */ + case 0x70 ... 0x7f: /* Jcc.d8 */ + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case CALL_INSN_OPCODE: + if (need_reloc(next + insn.immediate.value, src, src_len)) { + apply_reloc(insn.immediate.nbytes, + buf + i + insn_offset_immediate(&insn), + src - dest); + } + + /* + * Where possible, convert JMP.d32 into JMP.d8. + */ + if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { + s32 imm = insn.immediate.value; + imm += src - dest; + imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; + if ((imm >> 31) == (imm >> 7)) { + buf[i+0] = JMP8_INSN_OPCODE; + buf[i+1] = (s8)imm; + + memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); + } + } + break; + } + + if (insn_rip_relative(&insn)) { + if (need_reloc(next + insn.displacement.value, src, src_len)) { + apply_reloc(insn.displacement.nbytes, + buf + i + insn_offset_displacement(&insn), + src - dest); + } + } + + + /* + * See if this and any potentially following NOPs can be + * optimized. + */ + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + next = i + optimize_nops_range(buf, len, i); + } +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -306,8 +374,10 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. 
*/ - if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) - goto next; + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { + optimize_nops(instr, a->instrlen); + continue; + } DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", (a->flags & ALT_FLAG_NOT) ? "!" : "", @@ -316,37 +386,19 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - /* - * 0xe8 is a relative jump; fix the offset. - * - * Instruction length is checked before the opcode to avoid - * accessing uninitialized bytes for zero-length replacements. - */ - if (a->replacementlen == 5 && *insn_buff == 0xe8) { - *(s32 *)(insn_buff + 1) += replacement - instr; - DPRINTK(ALT, "Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insn_buff + 1), - (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); - } - - if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insn_buff); - for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; + apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); + + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); - -next: - optimize_nops(instr, a->instrlen); } } @@ -1344,6 +1396,35 @@ static noinline void __init int3_selftest(void) unregister_die_notifier(&int3_exception_nb); } +static __initdata int __alt_reloc_selftest_addr; + +__visible noinline void __init __alt_reloc_selftest(void *arg) +{ + WARN_ON(arg != &__alt_reloc_selftest_addr); +} + +static noinline void __init alt_reloc_selftest(void) +{ + /* + * Tests apply_relocation(). + * + * This has a relative immediate (CALL) in a place other than the first + * instruction and additionally on x86_64 we get a RIP-relative LEA: + * + * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c + * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 + * + * Getting this wrong will either crash and burn or tickle the WARN + * above. + */ + asm_inline volatile ( + ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) + : /* output */ + : [mem] "m" (__alt_reloc_selftest_addr) + : _ASM_ARG1 + ); +} + void __init alternative_instructions(void) { int3_selftest(); @@ -1431,6 +1512,8 @@ void __init alternative_instructions(void) restart_nmi(); alternatives_patched = 1; + + alt_reloc_selftest(); } /** diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 7c97b7391279..799ad6bb72e5 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -42,13 +42,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, struct reloc *reloc) { - /* - * The x86 alternatives code adjusts the offsets only when it - * encounters a branch instruction at the very beginning of the - * replacement group. 
- */ - return insn->offset == special_alt->new_off && - (insn->type == INSN_CALL || is_jump(insn)); + return true; } /* From 14e4ec9c3e9164c6719f98d8a3065c487be2aaa5 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sun, 26 Feb 2023 21:04:26 +0100 Subject: [PATCH 03/11] x86/lib/memmove: Decouple ERMS from FSRM Up until now it was perceived that FSRM is an improvement to ERMS and thus it was made dependent on latter. However, there are AMD BIOSes out there which allow for disabling of either features and thus preventing kernels from booting due to the CMP disappearing and thus breaking the logic in the memmove() function. Similar observation happens on some VM migration scenarios. Patch the proper sequences depending on which feature is enabled. Reported-by: Daniel Verkamp Reported-by: Jiri Slaby Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/Y/yK0dyzI0MMdTie@zn.tnic --- arch/x86/lib/memmove_64.S | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 02661861e5dd..0559b206fb11 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -38,10 +38,12 @@ SYM_FUNC_START(__memmove) cmp %rdi, %r8 jg 2f - /* FSRM implies ERMS => no length checks, do the copy directly */ +#define CHECK_LEN cmp $0x20, %rdx; jb 1f +#define MEMMOVE_BYTES movq %rdx, %rcx; rep movsb; RET .Lmemmove_begin_forward: - ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM - ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS + ALTERNATIVE_2 __stringify(CHECK_LEN), \ + __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \ + __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM /* * movsq instruction have many startup latency @@ -207,11 +209,6 @@ SYM_FUNC_START(__memmove) movb %r11b, (%rdi) 13: RET - -.Lmemmove_erms: - movq %rdx, %rcx - rep movsb - RET SYM_FUNC_END(__memmove) EXPORT_SYMBOL(__memmove) From 6c480f22212826425b57932f09b1f0abbec85485 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Feb 2023 18:10:53 +0100 Subject: [PATCH 04/11] x86/alternative: Rewrite optimize_nops() some Address two issues: - it no longer hard requires single byte NOP runs - now it accepts any NOP and NOPL encoded instruction (but not the more complicated 32bit NOPs). - it writes a single 'instruction' replacement. Specifically, ORC unwinder relies on the tail NOP of an alternative to be a single instruction. In particular, it relies on the inner bytes not being executed. Once the max supported NOP length has been reached (currently 8, could easily be extended to 11 on x86_64), switch to JMP.d8 and INT3 padding to achieve the same result. Objtool uses this guarantee in the analysis of alternative/overlapping CFI state for the ORC unwinder data. Every instruction edge gets a CFI state and the more instructions the larger the chance of conflicts. 
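As a user-space illustration of that padding scheme (a sketch only: the NOP table, the 128-byte JMP.d8 cutoff and the helper names below are simplifications for illustration, not the kernel's add_nop() itself):

  #include <stdint.h>
  #include <string.h>
  #include <stdio.h>

  #define NOP_MAX      8		/* assumed cap, mirrors ASM_NOP_MAX here */
  #define JMP8_OPCODE  0xEB
  #define JMP32_OPCODE 0xE9
  #define INT3_OPCODE  0xCC

  /* Multi-byte NOP encodings, indexed by length. */
  static const uint8_t nops[NOP_MAX + 1][NOP_MAX] = {
  	[1] = { 0x90 },
  	[2] = { 0x66, 0x90 },
  	[3] = { 0x0F, 0x1F, 0x00 },
  	[4] = { 0x0F, 0x1F, 0x40, 0x00 },
  	[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 },
  	[6] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 },
  	[7] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 },
  	[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
  };

  /*
   * Fill buf[0..len) with one "effective" instruction: a single long NOP
   * when it fits, otherwise a JMP to the end of the range with INT3 bytes
   * as never-executed padding, so only one instruction boundary exists
   * inside the range.
   */
  static void fill_single_insn(uint8_t *buf, unsigned int len)
  {
  	unsigned int i;

  	if (!len)
  		return;

  	if (len <= NOP_MAX) {
  		memcpy(buf, nops[len], len);
  		return;
  	}

  	if (len < 128) {			/* JMP.d8 reaches the end */
  		buf[0] = JMP8_OPCODE;
  		buf[1] = (uint8_t)(len - 2);	/* rel8 from next IP; <= 125 here */
  		i = 2;
  	} else {				/* need JMP.d32 */
  		int32_t rel = (int32_t)len - 5;
  		buf[0] = JMP32_OPCODE;
  		memcpy(&buf[1], &rel, 4);	/* little-endian assumed (x86) */
  		i = 5;
  	}

  	memset(buf + i, INT3_OPCODE, len - i);	/* never-executed padding */
  }

  int main(void)
  {
  	uint8_t buf[44];

  	fill_single_insn(buf, sizeof(buf));
  	for (unsigned int i = 0; i < sizeof(buf); i++)
  		printf("%02x ", buf[i]);
  	printf("\n");				/* expect: eb 2a cc cc ... */
  	return 0;
  }

Filling a 44-byte range this way prints "eb 2a" followed by 0xcc padding, the same shape as the single-JMP output shown in the optimize_nops() example further down the series.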
[ bp: - Add a comment over add_nop() to explain why it does it this way - Make add_nops() PARAVIRT only as it is used solely there now ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230208171431.373412974@infradead.org --- arch/x86/kernel/alternative.c | 133 +++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 60 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 28eb1d0bc4a0..839bc6d1498a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -113,17 +113,35 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) +/* + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) + * for every single-byte NOP, try to generate the maximally available NOP of + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for + * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and + * *jump* over instead of executing long and daft NOPs. + */ +static void __init_or_module add_nop(u8 *instr, unsigned int len) { - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; + u8 *target = instr + len; + + if (!len) + return; + + if (len <= ASM_NOP_MAX) { + memcpy(instr, x86_nops[len], len); + return; } + + if (len < 128) { + __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); + instr += JMP8_INSN_SIZE; + } else { + __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); + instr += JMP32_INSN_SIZE; + } + + for (;instr < target; instr++) + *instr = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; @@ -134,39 +152,32 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); -/* - * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) - * - * @instr: instruction byte stream - * @instrlen: length of the above - * @off: offset within @instr where the first NOP has been detected - * - * Return: number of NOPs found (and replaced). 
- */ -static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) +static bool insn_is_nop(struct insn *insn) { - unsigned long flags; - int i = off, nnops; + if (insn->opcode.bytes[0] == 0x90) + return true; - while (i < instrlen) { - if (instr[i] != 0x90) + if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) + return true; + + /* TODO: more nops */ + + return false; +} + +static int skip_nops(u8 *instr, int offset, int len) +{ + struct insn insn; + + for (; offset < len; offset += insn.length) { + if (insn_decode_kernel(&insn, &instr[offset])) break; - i++; + if (!insn_is_nop(&insn)) + break; } - nnops = i - off; - - if (nnops <= 1) - return nnops; - - local_irq_save(flags); - add_nops(instr + off, nnops); - local_irq_restore(flags); - - DUMP_BYTES(ALT, instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); - - return nnops; + return offset; } /* @@ -175,28 +186,19 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) */ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { - struct insn insn; - int i = 0; + for (int next, i = 0; i < len; i = next) { + struct insn insn; - /* - * Jump over the non-NOP insns and optimize single-byte NOPs into bigger - * ones. - */ - for (;;) { if (insn_decode_kernel(&insn, &instr[i])) return; - /* - * See if this and any potentially following NOPs can be - * optimized. - */ - if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, len, i); - else - i += insn.length; + next = i + insn.length; - if (i >= len) - return; + if (insn_is_nop(&insn)) { + next = skip_nops(instr, next, len); + add_nop(instr + i, next - i); + DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next); + } } } @@ -323,13 +325,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } } - - /* - * See if this and any potentially following NOPs can be - * optimized. - */ - if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - next = i + optimize_nops_range(buf, len, i); + if (insn_is_nop(&insn)) { + next = skip_nops(buf, next, len); + add_nop(buf + i, next - i); + } } } @@ -1289,6 +1288,20 @@ int alternatives_text_reserved(void *start, void *end) #endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, x86_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) { From b6c881b248ef9d629ec2365808cb4894991c0837 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Feb 2023 18:10:54 +0100 Subject: [PATCH 05/11] x86/alternative: Complicate optimize_nops() some more Because: SMP alternatives: ffffffff810026dc: [2:44) optimized NOPs: eb 2a eb 28 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc is quite daft, make things more complicated and have the NOP runlength detection eat the preceding JMP if they both end at the same target. 
SMP alternatives: ffffffff810026dc: [0:44) optimized NOPs: eb 2a cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230208171431.433132442@infradead.org --- arch/x86/kernel/alternative.c | 60 +++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 839bc6d1498a..b78d55f0dfad 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -114,6 +114,8 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = }; /* + * Fill the buffer with a single effective instruction of size @len. + * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) * for every single-byte NOP, try to generate the maximally available NOP of * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for @@ -152,6 +154,9 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); +/* + * Matches NOP and NOPL, not any of the other possible NOPs. + */ static bool insn_is_nop(struct insn *insn) { if (insn->opcode.bytes[0] == 0x90) @@ -165,6 +170,10 @@ static bool insn_is_nop(struct insn *insn) return false; } +/* + * Find the offset of the first non-NOP instruction starting at @offset + * but no further than @len. + */ static int skip_nops(u8 *instr, int offset, int len) { struct insn insn; @@ -180,12 +189,47 @@ static int skip_nops(u8 *instr, int offset, int len) return offset; } +/* + * Optimize a sequence of NOPs, possibly preceded by an unconditional jump + * to the end of the NOP sequence into a single NOP. + */ +static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn, + int *next, int *prev, int *target) +{ + int i = *next - insn->length; + + switch (insn->opcode.bytes[0]) { + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + *prev = i; + *target = *next + insn->immediate.value; + return false; + } + + if (insn_is_nop(insn)) { + int nop = i; + + *next = skip_nops(instr, *next, len); + if (*target && *next == *target) + nop = *prev; + + add_nop(instr + nop, *next - nop); + DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); + return true; + } + + *target = 0; + return false; +} + /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. 
*/ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { + int prev, target = 0; + for (int next, i = 0; i < len; i = next) { struct insn insn; @@ -194,11 +238,7 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) next = i + insn.length; - if (insn_is_nop(&insn)) { - next = skip_nops(instr, next, len); - add_nop(instr + i, next - i); - DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, i, next); - } + __optimize_nops(instr, len, &insn, &next, &prev, &target); } } @@ -275,6 +315,8 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) static void __init_or_module noinline apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) { + int prev, target = 0; + for (int next, i = 0; i < len; i = next) { struct insn insn; @@ -283,6 +325,9 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) next = i + insn.length; + if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) + continue; + switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || @@ -324,11 +369,6 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) src - dest); } } - - if (insn_is_nop(&insn)) { - next = skip_nops(buf, next, len); - add_nop(buf + i, next - i); - } } } From d2408e043e7296017420aa5929b3bba4d5e61013 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 12 May 2023 14:05:11 +0200 Subject: [PATCH 06/11] x86/alternative: Optimize returns patching Instead of decoding each instruction in the return sites range only to realize that that return site is a jump to the default return thunk which is needed - X86_FEATURE_RETHUNK is enabled - lift that check before the loop and get rid of that loop overhead. Add comments about what gets patched, while at it. Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230512120952.7924-1-bp@alien8.de --- arch/x86/kernel/alternative.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b78d55f0dfad..3bb0a5f61e8c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -693,13 +693,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; + /* Patch the custom return thunks... */ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { - if (x86_return_thunk == __x86_return_thunk) - return -1; - i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { + /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } @@ -712,6 +711,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; + /* + * Do not patch out the default return thunks if those needed are the + * ones generated by the compiler. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && + (x86_return_thunk == __x86_return_thunk)) + return; + for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; From d42a2a89121071cc8dd285235253a4c739641635 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 13 May 2023 16:01:39 +0200 Subject: [PATCH 07/11] x86/alternatives: Fix section mismatch warnings Fix stuff like: WARNING: modpost: vmlinux.o: section mismatch in reference: \ __optimize_nops (section: .text) -> debug_alternative (section: .init.data) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230513160146.16039-1-bp@alien8.de --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3bb0a5f61e8c..93aa95afd005 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -193,8 +193,8 @@ static int skip_nops(u8 *instr, int offset, int len) * Optimize a sequence of NOPs, possibly preceded by an unconditional jump * to the end of the NOP sequence into a single NOP. */ -static bool __optimize_nops(u8 *instr, size_t len, struct insn *insn, - int *next, int *prev, int *target) +static bool __init_or_module +__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) { int i = *next - insn->length; @@ -765,7 +765,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); From df25edbac31ea87b488789d44a362063542b5967 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 May 2023 11:28:05 +0200 Subject: [PATCH 08/11] x86/alternatives: Add longer 64-bit NOPs By adding support for longer NOPs there are a few more alternatives that can turn into a single instruction. Add up to NOP11, the same limit where GNU as .nops also stops generating longer nops. This is because a number of uarchs have severe decode penalties for more than 3 prefixes. [ bp: Sync up with the version in tools/ while at it. 
] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230515093020.661756940@infradead.org --- arch/x86/include/asm/nops.h | 16 ++++++++++++++-- arch/x86/kernel/alternative.c | 10 ++++++++++ tools/arch/x86/include/asm/nops.h | 16 ++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index c5573eaa5bb9..1c1b7550fa55 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -34,6 +34,8 @@ #define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x3e,BYTES_NOP7 +#define ASM_NOP_MAX 8 + #else /* @@ -47,6 +49,9 @@ * 6: osp nopl 0x00(%eax,%eax,1) * 7: nopl 0x00000000(%eax) * 8: nopl 0x00000000(%eax,%eax,1) + * 9: cs nopl 0x00000000(%eax,%eax,1) + * 10: osp cs nopl 0x00000000(%eax,%eax,1) + * 11: osp osp cs nopl 0x00000000(%eax,%eax,1) */ #define BYTES_NOP1 0x90 #define BYTES_NOP2 0x66,BYTES_NOP1 @@ -56,6 +61,15 @@ #define BYTES_NOP6 0x66,BYTES_NOP5 #define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +#define BYTES_NOP9 0x2e,BYTES_NOP8 +#define BYTES_NOP10 0x66,BYTES_NOP9 +#define BYTES_NOP11 0x66,BYTES_NOP10 + +#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9) +#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10) +#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11) + +#define ASM_NOP_MAX 11 #endif /* CONFIG_64BIT */ @@ -68,8 +82,6 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#define ASM_NOP_MAX 8 - #ifndef __ASSEMBLY__ extern const unsigned char * const x86_nops[]; #endif diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 93aa95afd005..0747d29c8eaf 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -98,6 +98,11 @@ static const unsigned char x86nops[] = BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, +#ifdef CONFIG_64BIT + BYTES_NOP9, + BYTES_NOP10, + BYTES_NOP11, +#endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = @@ -111,6 +116,11 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +#ifdef CONFIG_64BIT + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, +#endif }; /* diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h index c5573eaa5bb9..1c1b7550fa55 100644 --- a/tools/arch/x86/include/asm/nops.h +++ b/tools/arch/x86/include/asm/nops.h @@ -34,6 +34,8 @@ #define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x3e,BYTES_NOP7 +#define ASM_NOP_MAX 8 + #else /* @@ -47,6 +49,9 @@ * 6: osp nopl 0x00(%eax,%eax,1) * 7: nopl 0x00000000(%eax) * 8: nopl 0x00000000(%eax,%eax,1) + * 9: cs nopl 0x00000000(%eax,%eax,1) + * 10: osp cs nopl 0x00000000(%eax,%eax,1) + * 11: osp osp cs nopl 0x00000000(%eax,%eax,1) */ #define BYTES_NOP1 0x90 #define BYTES_NOP2 0x66,BYTES_NOP1 @@ -56,6 +61,15 @@ #define BYTES_NOP6 0x66,BYTES_NOP5 #define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +#define BYTES_NOP9 0x2e,BYTES_NOP8 +#define BYTES_NOP10 0x66,BYTES_NOP9 +#define BYTES_NOP11 0x66,BYTES_NOP10 + +#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9) +#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10) +#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11) + +#define ASM_NOP_MAX 11 #endif /* CONFIG_64BIT */ @@ -68,8 +82,6 @@ #define ASM_NOP7 
_ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#define ASM_NOP_MAX 8 - #ifndef __ASSEMBLY__ extern const unsigned char * const x86_nops[]; #endif From 3496d1c64a0fcc9bae3ed40decc3ecd7f8ac072f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Feb 2023 10:10:57 +0000 Subject: [PATCH 09/11] x86/nospec: Shorten RESET_CALL_DEPTH RESET_CALL_DEPTH is a pretty fat monster and blows up UNTRAIN_RET to 20 bytes: 19: 48 c7 c0 80 00 00 00 mov $0x80,%rax 20: 48 c1 e0 38 shl $0x38,%rax 24: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0 29: R_X86_64_32S pcpu_hot+0x10 Shrink it by 4 bytes: 0: 31 c0 xor %eax,%eax 2: 48 0f ba e8 3f bts $0x3f,%rax 7: 65 48 89 04 25 00 00 00 00 mov %rax,%gs:0x0 Shrink RESET_CALL_DEPTH_FROM_CALL by 5 bytes by only setting %al, the other bits are shifted out (the same could be done for RESET_CALL_DEPTH, but the XOR+BTS sequence has less dependencies due to the zeroing). Suggested-by: Andrew Cooper Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230515093020.729622326@infradead.org --- arch/x86/include/asm/nospec-branch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index edb2b0cb8efe..55388c9f7601 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -84,12 +84,12 @@ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH \ - mov $0x80, %rax; \ - shl $56, %rax; \ + xor %eax, %eax; \ + bts $63, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH_FROM_CALL \ - mov $0xfc, %rax; \ + movb $0xfc, %al; \ shl $56, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS From 9350a629e839ca1c2b529a83a916cf2370bd1c64 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 31 May 2023 09:24:19 -0400 Subject: [PATCH 10/11] x86/alternatives: Add cond_resched() to text_poke_bp_batch() Debugging in the kernel has started slowing down the kernel by a noticeable amount. The ftrace start up tests are triggering the softlockup watchdog on some boxes. This is caused by the start up tests that enable function and function graph tracing several times. Sprinkling cond_resched() just in the start up test code was not enough to stop the softlockup from triggering. It would sometimes trigger in the text_poke_bp_batch() code. When function tracing enables all functions, it will call text_poke_queue() to queue the places that need to be patched. Every 256 entries will do a "flush" that calls text_poke_bp_batch() to do the update of the 256 locations. As this is in a scheduleable context, calling cond_resched() at the start of text_poke_bp_batch() will ensure that other tasks could get a chance to run while the patching is happening. This keeps the softlockup from triggering in the start up tests. 
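A minimal user-space sketch of the queue/flush pattern described above (names, the fake addresses and the hard-coded batch size are illustrative stand-ins, not the kernel's text_poke_queue()/text_poke_bp_batch()):

  #include <stdio.h>

  #define BATCH 256	/* mirrors the "every 256 entries" flush above */

  static void *vec[BATCH];
  static int nr, flushes;

  /*
   * Stand-in for text_poke_bp_batch(): in the kernel, the added
   * cond_resched() sits at the start of this step, so a huge ftrace
   * update yields to other tasks once per batch instead of
   * monopolizing the CPU and tripping the softlockup watchdog.
   */
  static void flush_batch(void)
  {
  	if (!nr)
  		return;
  	/* cond_resched() would go here in the kernel */
  	flushes++;
  	nr = 0;
  }

  /* Stand-in for text_poke_queue(): queue one site, flushing when full. */
  static void queue_site(void *addr)
  {
  	if (nr == BATCH)
  		flush_batch();
  	vec[nr++] = addr;
  }

  int main(void)
  {
  	/* e.g. function tracing enabling ~50000 call sites */
  	for (long i = 0; i < 50000; i++)
  		queue_site((void *)(0x1000 + i * 16));
  	flush_batch();

  	printf("%d batches, i.e. %d reschedule points\n", flushes, flushes);
  	return 0;
  }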
Signed-off-by: Steven Rostedt (Google) Signed-off-by: Borislav Petkov (AMD) Acked-by: Masami Hiramatsu (Google) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230531092419.4d051374@rorschach.local.home --- arch/x86/kernel/alternative.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0747d29c8eaf..bbfbf7ad17ca 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2118,6 +2118,16 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries */ atomic_set_release(&bp_desc.refs, 1); + /* + * Function tracing can enable thousands of places that need to be + * updated. This can take quite some time, and with full kernel debugging + * enabled, this could cause the softlockup watchdog to trigger. + * This function gets called every 256 entries added to be patched. + * Call cond_resched() here to make sure that other tasks can get scheduled + * while processing all the functions being patched. + */ + cond_resched(); + /* * Corresponding read barrier in int3 notifier for making sure the * nr_entries and handler are correctly ordered wrt. patching. From 2bd4aa9325821551648cf9738d6aa3a49317d7e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jun 2023 16:35:50 +0200 Subject: [PATCH 11/11] x86/alternative: PAUSE is not a NOP While chasing ghosts, I did notice that optimize_nops() was replacing 'REP NOP' aka 'PAUSE' with NOP2. This is clearly not right. Fixes: 6c480f222128 ("x86/alternative: Rewrite optimize_nops() some") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/linux-next/20230524130104.GR83892@hirez.programming.kicks-ass.net/ --- arch/x86/kernel/alternative.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index bbfbf7ad17ca..a7e1ec50ad29 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -169,9 +169,12 @@ void text_poke_early(void *addr, const void *opcode, size_t len); */ static bool insn_is_nop(struct insn *insn) { - if (insn->opcode.bytes[0] == 0x90) + /* Anything NOP, but no REP NOP */ + if (insn->opcode.bytes[0] == 0x90 && + (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) return true; + /* NOPL */ if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) return true;
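As a closing illustration of the distinction this last patch draws, here is a tiny user-space classifier (a sketch only: the helper name is made up, and it peeks at raw bytes instead of going through the kernel's instruction decoder, so it only covers the simple encodings shown):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /*
   * Simplified take on the final insn_is_nop() logic: plain NOP (0x90) and
   * NOPL (0F 1F /0) count as NOPs, REP NOP aka PAUSE (F3 90) must not.
   * The kernel checks decoded opcode/prefix bytes via struct insn.
   */
  static bool bytes_are_nop(const uint8_t *b)
  {
  	if (b[0] == 0xF3)			/* REP prefix: F3 90 is PAUSE, not a NOP */
  		return false;
  	if (b[0] == 0x90)			/* NOP */
  		return true;
  	if (b[0] == 0x0F && b[1] == 0x1F)	/* NOPL */
  		return true;
  	return false;
  }

  int main(void)
  {
  	const uint8_t nop[]   = { 0x90 };
  	const uint8_t nopl[]  = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
  	const uint8_t pause[] = { 0xF3, 0x90 };

  	printf("nop: %d nopl: %d pause: %d\n",	/* expect: 1 1 0 */
  	       bytes_are_nop(nop), bytes_are_nop(nopl), bytes_are_nop(pause));
  	return 0;
  }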