[PATCH v9 6/7] powerpc/code-patching: Use temporary mm for Radix MMU

Benjamin Gray bgray at linux.ibm.com
Thu Nov 3 14:10:42 AEDT 2022


On Wed, 2022-11-02 at 10:11 +0000, Christophe Leroy wrote:
> On 25/10/2022 at 06:44, Benjamin Gray wrote:
> > +static int text_area_cpu_up_mm(unsigned int cpu)
> > +{
> > +       struct mm_struct *mm;
> > +       unsigned long addr;
> > +       pgd_t *pgdp;
> > +       p4d_t *p4dp;
> > +       pud_t *pudp;
> > +       pmd_t *pmdp;
> > +       pte_t *ptep;
> > +
> > +       mm = copy_init_mm();
> > +       if (WARN_ON(!mm))
> > +               goto fail_no_mm;
> > +
> > +       /*
> > +        * Choose a random page-aligned address from the interval
> > +        * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE].
> > +        * The lower address bound is PAGE_SIZE to avoid the zero-page.
> > +        */
> > +       addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT;
> 
> There is some work in progress to get rid of (get_random_long() % 
> something), see 
> https://patchwork.kernel.org/project/linux-media/cover/20221010230613.1076905-1-Jason@zx2c4.com/

I had read that series before sending. get_random_int() is removed, and
the guidelines give instructions for the fixed-size types (u32, u64),
but get_random_long() doesn't have a corresponding
get_random_long_max().
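
If such a helper existed, the conversion would presumably look like the
below (get_random_long_max() is hypothetical, it doesn't exist in that
series or in mainline):

	/*
	 * Hypothetical: get_random_long_max() does not exist today.
	 * The bound DEFAULT_MAP_WINDOW / PAGE_SIZE - 2 can exceed 32 bits
	 * (e.g. with 4K pages), so the u32 helpers from that series
	 * aren't enough on their own.
	 */
	addr = (1 + get_random_long_max(DEFAULT_MAP_WINDOW / PAGE_SIZE - 2)) << PAGE_SHIFT;

So for now I think the open-coded modulo is the closest fit.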

> > +
> > +       /*
> > +        * PTE allocation uses GFP_KERNEL which means we need to
> > +        * pre-allocate the PTE here because we cannot do the
> > +        * allocation during patching when IRQs are disabled.
> > +        */
> > +       pgdp = pgd_offset(mm, addr);
> > +
> > +       p4dp = p4d_alloc(mm, pgdp, addr);
> > +       if (WARN_ON(!p4dp))
> > +               goto fail_no_p4d;
> > +
> > +       pudp = pud_alloc(mm, p4dp, addr);
> > +       if (WARN_ON(!pudp))
> > +               goto fail_no_pud;
> > +
> > +       pmdp = pmd_alloc(mm, pudp, addr);
> > +       if (WARN_ON(!pmdp))
> > +               goto fail_no_pmd;
> > +
> > +       ptep = pte_alloc_map(mm, pmdp, addr);
> > +       if (WARN_ON(!ptep))
> > +               goto fail_no_pte;
> 
> Isn't there a standard generic function to do that?
> 
> For instance, __get_locked_pte() seems to do more or less the same.

__get_locked_pte() invokes walk_to_pmd(), which leaks memory if an
allocation fails. That may not necessarily be a concern at boot (though
I still don't like it), but this startup hook runs every time a CPU
comes online, so the leak is theoretically unbounded.

There's no need to leak it in this context, because we know that each
page is exclusively used by the corresponding patching mm.
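
For reference, walk_to_pmd() in mm/memory.c is structured roughly like
this (paraphrased from memory, not verbatim):

	static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd = pgd_offset(mm, addr);
		p4d_t *p4d = p4d_alloc(mm, pgd, addr);
		pud_t *pud;
		pmd_t *pmd;

		if (!p4d)
			return NULL;
		pud = pud_alloc(mm, p4d, addr);
		if (!pud)
			return NULL;	/* newly allocated p4d is not freed */
		pmd = pmd_alloc(mm, pud, addr);
		if (!pmd)
			return NULL;	/* nor are the p4d and pud */
		...
	}

so a mid-walk allocation failure leaves the already-allocated upper
levels behind. Doing the allocation by hand here lets each level be
unwound explicitly on failure.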

> > +
> > +       this_cpu_write(cpu_patching_mm, mm);
> > +       this_cpu_write(cpu_patching_addr, addr);
> > +       this_cpu_write(cpu_patching_pte, ptep);
> > +
> > +       return 0;
> > +
> > +fail_no_pte:
> > +       pmd_free(mm, pmdp);
> > +       mm_dec_nr_pmds(mm);
> > +fail_no_pmd:
> > +       pud_free(mm, pudp);
> > +       mm_dec_nr_puds(mm);
> > +fail_no_pud:
> > +       p4d_free(mm, p4dp);
> > +fail_no_p4d:
> > +       mmput(mm);
> > +fail_no_mm:
> > +       return -ENOMEM;
> > +}
> > +
> > +static int text_area_cpu_down_mm(unsigned int cpu)
> > +{
> > +       struct mm_struct *mm;
> > +       unsigned long addr;
> > +       pte_t *ptep;
> > +       pmd_t *pmdp;
> > +       pud_t *pudp;
> > +       p4d_t *p4dp;
> > +       pgd_t *pgdp;
> > +
> > +       mm = this_cpu_read(cpu_patching_mm);
> > +       addr = this_cpu_read(cpu_patching_addr);
> > +
> > +       pgdp = pgd_offset(mm, addr);
> > +       p4dp = p4d_offset(pgdp, addr);
> > +       pudp = pud_offset(p4dp, addr);
> > +       pmdp = pmd_offset(pudp, addr);
> > +       ptep = pte_offset_map(pmdp, addr);
> > +
> > +       pte_free(mm, ptep);
> > +       pmd_free(mm, pmdp);
> > +       pud_free(mm, pudp);
> > +       p4d_free(mm, p4dp);
> > +       /* pgd is dropped in mmput */
> > +
> > +       mm_dec_nr_ptes(mm);
> > +       mm_dec_nr_pmds(mm);
> > +       mm_dec_nr_puds(mm);
> 
> Same question, can't something generic be used, something like
> free_pgd_range()?

Possibly, but I don't have a struct mmu_gather to give it. If there's a
way to make one temporarily then it might work.
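
Something along these lines might do it (untested sketch; I'm assuming
tlb_gather_mmu()/tlb_finish_mmu() are safe to use on the patching mm in
the CPU-down path):

	/*
	 * Untested sketch: free the patching page tables with a
	 * temporary mmu_gather instead of freeing each level by hand.
	 */
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	free_pgd_range(&tlb, addr, addr + PAGE_SIZE, 0, 0);
	tlb_finish_mmu(&tlb);
	mmput(mm);

If that holds up it would replace most of text_area_cpu_down_mm().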

> > +static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> > +{
> > +       int err;
> > +       u32 *patch_addr;
> > +       unsigned long text_poke_addr;
> > +       pte_t *pte;
> > +       unsigned long pfn = get_patch_pfn(addr);
> > +       struct mm_struct *patching_mm;
> > +       struct mm_struct *orig_mm;
> > +
> > +       patching_mm = __this_cpu_read(cpu_patching_mm);
> > +       pte = __this_cpu_read(cpu_patching_pte);
> > +       text_poke_addr = __this_cpu_read(cpu_patching_addr);
> > +       patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> > +
> > +       if (unlikely(!patching_mm))
> > +               return -ENOMEM;
> > +
> > +       set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL));
> > +
> > +       /* order PTE update before use, also serves as the hwsync */
> > +       asm volatile("ptesync": : :"memory");
> 
> You assume it is radix only ?

I enforce it in mm_patch_enabled().
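
mm_patch_enabled() is introduced earlier in this patch; from memory it
is roughly (exact condition may differ):

	static bool mm_patch_enabled(void)
	{
		return IS_ENABLED(CONFIG_SMP) && radix_enabled();
	}

so the temporary-mm path (and this ptesync) is only taken on Radix.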

