On 25/10/2022 at 06:44, Benjamin Gray wrote:
> From: "Christopher M. Riedl" <c...@bluescreens.de>
> 
> x86 supports the notion of a temporary mm which restricts access to
> temporary PTEs to a single CPU. A temporary mm is useful for situations
> where a CPU needs to perform sensitive operations (such as patching a
> STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
> said mappings to other CPUs. Another benefit is that other CPU TLBs do
> not need to be flushed when the temporary mm is torn down.
> 
> Mappings in the temporary mm can be set in the userspace portion of the
> address-space.
> 
> Interrupts must be disabled while the temporary mm is in use. HW
> breakpoints, which may have been set by userspace as watchpoints on
> addresses now within the temporary mm, are saved and disabled when
> loading the temporary mm. The HW breakpoints are restored when unloading
> the temporary mm. All HW breakpoints are indiscriminately disabled while
> the temporary mm is in use - this may include breakpoints set by perf.
> 
> Use the `poking_init` init hook to prepare a temporary mm and patching
> address. Initialize the temporary mm by copying the init mm. Choose a
> randomized patching address inside the temporary mm userspace address
> space. The patching address is randomized between PAGE_SIZE and
> DEFAULT_MAP_WINDOW-PAGE_SIZE.
> 
> Bits of entropy with 64K page size on BOOK3S_64:
> 
>       bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)
> 
>       PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
>       bits of entropy = log2(128TB / 64K)
>       bits of entropy = 31
> 
> The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU
> operates - by default the space above DEFAULT_MAP_WINDOW is not
> available. Currently the Hash MMU does not use a temporary mm so
> technically this upper limit isn't necessary; however, a larger
> randomization range does not further "harden" this overall approach and
> future work may introduce patching with a temporary mm on Hash as well.
> 
> Randomization occurs only once during initialization for each CPU as it
> comes online.
> 
> The patching page is mapped with PAGE_KERNEL, which sets EAA[0] in the PTE;
> with EAA[0] set the AMR is ignored (so there is no need to unlock/lock
> KUAP) according to PowerISA v3.0b Figure 35 on Radix.
> 
> Based on x86 implementation:
> 
> commit 4fc19708b165
> ("x86/alternatives: Initialize temporary mm for patching")
> 
> and:
> 
> commit b3fd8e83ada0
> ("x86/alternatives: Use temporary mm for text poking")
> 
> From: Benjamin Gray <bg...@linux.ibm.com>
> 
> Synchronisation is done according to ISA 3.1B Book 3 Chapter 13
> "Synchronization Requirements for Context Alterations". Switching the mm
> is a change to the PID, which requires a CSI before and after the change,
> and a hwsync between the last instruction that performs address
> translation for an associated storage access and the change itself.
> 
> Instruction fetch is an associated storage access, but the instruction
> address mappings are not being changed, so it should not matter which
> context they use. We must still perform a hwsync to guard arbitrary
> prior code that may have accessed a userspace address.
> 
> TLB invalidation is local and VA specific. Local because only this core
> used the patching mm, and VA specific because we only care that the
> writable mapping is purged. Leaving the other mappings intact is more
> efficient, especially when performing many code patches in a row (e.g.,
> as ftrace would).
> 
> Signed-off-by: Christopher M. Riedl <c...@bluescreens.de>
> Signed-off-by: Benjamin Gray <bg...@linux.ibm.com>
> ---
> v9:   * Add back Christopher M. Riedl signed-off-by
>       * Remove temp_mm_state
> ---
>   arch/powerpc/lib/code-patching.c | 221 ++++++++++++++++++++++++++++++-
>   1 file changed, 216 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index b0a12b2d5a9b..3fe99d0086fc 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -4,12 +4,17 @@
>    */
>   
>   #include <linux/kprobes.h>
> +#include <linux/mmu_context.h>
> +#include <linux/random.h>
>   #include <linux/vmalloc.h>
>   #include <linux/init.h>
>   #include <linux/cpuhotplug.h>
>   #include <linux/uaccess.h>
>   #include <linux/jump_label.h>
>   
> +#include <asm/debug.h>
> +#include <asm/pgalloc.h>
> +#include <asm/tlb.h>
>   #include <asm/tlbflush.h>
>   #include <asm/page.h>
>   #include <asm/code-patching.h>
> @@ -42,11 +47,54 @@ int raw_patch_instruction(u32 *addr, ppc_inst_t instr)
>   }
>   
>   #ifdef CONFIG_STRICT_KERNEL_RWX
> +
>   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> +static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm);
> +static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
> +static DEFINE_PER_CPU(pte_t *, cpu_patching_pte);
>   
>   static int map_patch_area(void *addr, unsigned long text_poke_addr);
>   static void unmap_patch_area(unsigned long addr);
>   
> +static bool mm_patch_enabled(void)
> +{
> +     return IS_ENABLED(CONFIG_SMP) && radix_enabled();
> +}
> +
> +/*
> + * The following applies for Radix MMU. Hash MMU has different requirements,
> + * and so is not supported.
> + *
> + * Changing mm requires context synchronising instructions on both sides of
> + * the context switch, as well as a hwsync between the last instruction for
> + * which the address of an associated storage access was translated using
> + * the current context, and the context switch itself.
> + *
> + * switch_mm_irqs_off() performs an isync after the context switch. It is
> + * the responsibility of the caller to perform the CSI and hwsync before
> + * starting/stopping the temp mm.
> + */
> +static struct mm_struct *start_using_temp_mm(struct mm_struct *temp_mm)
> +{
> +     struct mm_struct *orig_mm = current->active_mm;
> +
> +     lockdep_assert_irqs_disabled();
> +     switch_mm_irqs_off(orig_mm, temp_mm, current);
> +
> +     WARN_ON(!mm_is_thread_local(temp_mm));
> +
> +     suspend_breakpoints();
> +     return orig_mm;
> +}
> +
> +static void stop_using_temp_mm(struct mm_struct *temp_mm,
> +                            struct mm_struct *orig_mm)
> +{
> +     lockdep_assert_irqs_disabled();
> +     switch_mm_irqs_off(temp_mm, orig_mm, current);
> +     restore_breakpoints();
> +}
> +
>   static int text_area_cpu_up(unsigned int cpu)
>   {
>       struct vm_struct *area;
> @@ -80,14 +128,127 @@ static int text_area_cpu_down(unsigned int cpu)
>       return 0;
>   }
>   
> +static int text_area_cpu_up_mm(unsigned int cpu)
> +{
> +     struct mm_struct *mm;
> +     unsigned long addr;
> +     pgd_t *pgdp;
> +     p4d_t *p4dp;
> +     pud_t *pudp;
> +     pmd_t *pmdp;
> +     pte_t *ptep;
> +
> +     mm = copy_init_mm();
> +     if (WARN_ON(!mm))
> +             goto fail_no_mm;
> +
> +     /*
> +      * Choose a random page-aligned address from the interval
> +      * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE].
> +      * The lower address bound is PAGE_SIZE to avoid the zero-page.
> +      */
> +     addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT;

There is some work in progress to get rid of (get_random_long() % 
something), see 
https://patchwork.kernel.org/project/linux-media/cover/20221010230613.1076905-1-ja...@zx2c4.com/
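
For illustration only, a minimal sketch of what the conversion might look
like once the helpers from that series land (choose_patching_addr is a
made-up name, and the 32-bit helper is only valid while
DEFAULT_MAP_WINDOW / PAGE_SIZE fits in a u32, which it does for 64K pages
where the ratio is 2^31):

#include <linux/random.h>	/* get_random_u32_below(), added by the linked series */

/* Hypothetical helper, not part of this patch. */
static unsigned long choose_patching_addr(void)
{
	/* Candidate page indices are [1 .. DEFAULT_MAP_WINDOW / PAGE_SIZE - 2]. */
	u32 npages = DEFAULT_MAP_WINDOW / PAGE_SIZE - 2;

	/* get_random_u32_below(n) returns a uniform value in [0, n). */
	return ((unsigned long)get_random_u32_below(npages) + 1) << PAGE_SHIFT;
}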

> +
> +     /*
> +      * PTE allocation uses GFP_KERNEL which means we need to
> +      * pre-allocate the PTE here because we cannot do the
> +      * allocation during patching when IRQs are disabled.
> +      */
> +     pgdp = pgd_offset(mm, addr);
> +
> +     p4dp = p4d_alloc(mm, pgdp, addr);
> +     if (WARN_ON(!p4dp))
> +             goto fail_no_p4d;
> +
> +     pudp = pud_alloc(mm, p4dp, addr);
> +     if (WARN_ON(!pudp))
> +             goto fail_no_pud;
> +
> +     pmdp = pmd_alloc(mm, pudp, addr);
> +     if (WARN_ON(!pmdp))
> +             goto fail_no_pmd;
> +
> +     ptep = pte_alloc_map(mm, pmdp, addr);
> +     if (WARN_ON(!ptep))
> +             goto fail_no_pte;

Isn't there a standard generic function to do that?

For instance, __get_locked_pte() seems to do more or less the same.
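
Rough sketch of that suggestion (preallocate_patching_pte is a made-up
name; it assumes the PTE pointer remains usable after pte_unmap(), which
holds on 64-bit where pte_unmap() is a no-op):

#include <linux/mm.h>	/* get_locked_pte(), pte_unmap_unlock() */

/* Hypothetical pre-allocation helper, not part of this patch. */
static pte_t *preallocate_patching_pte(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep;

	/* Allocates the p4d/pud/pmd/pte levels as needed, returns the PTE locked. */
	ptep = get_locked_pte(mm, addr, &ptl);
	if (!ptep)
		return NULL;

	/* Only the allocation is wanted here; the PTE is filled in at patch time. */
	pte_unmap_unlock(ptep, ptl);

	return ptep;
}

That would also replace the per-level error unwinding below with a single
NULL check.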

> +
> +     this_cpu_write(cpu_patching_mm, mm);
> +     this_cpu_write(cpu_patching_addr, addr);
> +     this_cpu_write(cpu_patching_pte, ptep);
> +
> +     return 0;
> +
> +fail_no_pte:
> +     pmd_free(mm, pmdp);
> +     mm_dec_nr_pmds(mm);
> +fail_no_pmd:
> +     pud_free(mm, pudp);
> +     mm_dec_nr_puds(mm);
> +fail_no_pud:
> +     p4d_free(mm, p4dp);
> +fail_no_p4d:
> +     mmput(mm);
> +fail_no_mm:
> +     return -ENOMEM;
> +}
> +
> +static int text_area_cpu_down_mm(unsigned int cpu)
> +{
> +     struct mm_struct *mm;
> +     unsigned long addr;
> +     pte_t *ptep;
> +     pmd_t *pmdp;
> +     pud_t *pudp;
> +     p4d_t *p4dp;
> +     pgd_t *pgdp;
> +
> +     mm = this_cpu_read(cpu_patching_mm);
> +     addr = this_cpu_read(cpu_patching_addr);
> +
> +     pgdp = pgd_offset(mm, addr);
> +     p4dp = p4d_offset(pgdp, addr);
> +     pudp = pud_offset(p4dp, addr);
> +     pmdp = pmd_offset(pudp, addr);
> +     ptep = pte_offset_map(pmdp, addr);
> +
> +     pte_free(mm, ptep);
> +     pmd_free(mm, pmdp);
> +     pud_free(mm, pudp);
> +     p4d_free(mm, p4dp);
> +     /* pgd is dropped in mmput */
> +
> +     mm_dec_nr_ptes(mm);
> +     mm_dec_nr_pmds(mm);
> +     mm_dec_nr_puds(mm);

Same question: can't something generic be used, something like
free_pgd_range()?
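
Something along these lines, perhaps (put_patching_mm is a made-up name;
free_pgd_range() also does the mm_dec_nr_ptes/pmds/puds accounting
internally):

#include <linux/mm.h>	/* free_pgd_range() */
#include <asm/tlb.h>	/* tlb_gather_mmu(), tlb_finish_mmu() */

/* Hypothetical teardown helper, not part of this patch. */
static void put_patching_mm(struct mm_struct *mm, unsigned long addr)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	/* Frees the now-empty page-table pages covering the patching page. */
	free_pgd_range(&tlb, addr, addr + PAGE_SIZE, 0, 0);
	tlb_finish_mmu(&tlb);

	mmput(mm);
}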

> +
> +     mmput(mm);
> +
> +     this_cpu_write(cpu_patching_mm, NULL);
> +     this_cpu_write(cpu_patching_addr, 0);
> +     this_cpu_write(cpu_patching_pte, NULL);
> +
> +     return 0;
> +}
> +
>   static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done);
>   
>   void __init poking_init(void)
>   {
> -     WARN_ON(cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> -                               "powerpc/text_poke:online",
> -                               text_area_cpu_up,
> -                               text_area_cpu_down) < 0);
> +     int ret;
> +
> +     if (mm_patch_enabled())
> +             ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +                                     "powerpc/text_poke_mm:online",
> +                                     text_area_cpu_up_mm,
> +                                     text_area_cpu_down_mm);
> +     else
> +             ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +                                     "powerpc/text_poke:online",
> +                                     text_area_cpu_up,
> +                                     text_area_cpu_down);
> +
> +     /* cpuhp_setup_state returns >= 0 on success */
> +     WARN_ON(ret < 0);
>   
>       static_branch_enable(&poking_init_done);
>   }
> @@ -145,6 +306,53 @@ static void unmap_patch_area(unsigned long addr)
>       flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>   }
>   
> +static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> +{
> +     int err;
> +     u32 *patch_addr;
> +     unsigned long text_poke_addr;
> +     pte_t *pte;
> +     unsigned long pfn = get_patch_pfn(addr);
> +     struct mm_struct *patching_mm;
> +     struct mm_struct *orig_mm;
> +
> +     patching_mm = __this_cpu_read(cpu_patching_mm);
> +     pte = __this_cpu_read(cpu_patching_pte);
> +     text_poke_addr = __this_cpu_read(cpu_patching_addr);
> +     patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> +
> +     if (unlikely(!patching_mm))
> +             return -ENOMEM;
> +
> +     set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL));
> +
> +     /* order PTE update before use, also serves as the hwsync */
> +     asm volatile("ptesync": : :"memory");

You assume it is radix only?

> +
> +     /* order context switch after arbitrary prior code */
> +     isync();
> +
> +     orig_mm = start_using_temp_mm(patching_mm);
> +
> +     err = __patch_instruction(addr, instr, patch_addr);
> +
> +     /* hwsync performed by __patch_instruction (sync) if successful */
> +     if (err)
> +             mb();  /* sync */
> +
> +     /* context synchronisation performed by __patch_instruction (isync or exception) */
> +     stop_using_temp_mm(patching_mm, orig_mm);
> +
> +     pte_clear(patching_mm, text_poke_addr, pte);
> +     /*
> +      * ptesync to order PTE update before TLB invalidation done
> +      * by radix__local_flush_tlb_page_psize (in _tlbiel_va)
> +      */
> +     local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize);
> +
> +     return err;
> +}
> +
>   static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)
>   {
>       int err;
> @@ -189,7 +397,10 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr)
>               return raw_patch_instruction(addr, instr);
>   
>       local_irq_save(flags);
> -     err = __do_patch_instruction(addr, instr);
> +     if (mm_patch_enabled())
> +             err = __do_patch_instruction_mm(addr, instr);
> +     else
> +             err = __do_patch_instruction(addr, instr);
>       local_irq_restore(flags);
>   
>       WARN_ON(!err && !ppc_inst_equal(instr, ppc_inst_read(addr)));
