[RFC PATCH 3/3] powerpc/lib: Use a temporary mm for code patching

Wed Apr 15 15:11:16 AEST 2020

> On March 24, 2020 11:25 AM Christophe Leroy <christophe.leroy at c-s.fr> wrote:
> 
>  
> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> > Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
> > mappings to other CPUs. These mappings should be kept local to the CPU
> > doing the patching. Use the pre-initialized temporary mm and patching
> > address for this purpose. Also add a check after patching to ensure the
> > patch succeeded.
> > 
> > Based on x86 implementation:
> > 
> > commit b3fd8e83ada0
> > ("x86/alternatives: Use temporary mm for text poking")
> > 
> > Signed-off-by: Christopher M. Riedl <cmr at informatik.wtf>
> > ---
> >   arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
> >   1 file changed, 57 insertions(+), 71 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 18b88ecfc5a8..f156132e8975 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -19,6 +19,7 @@
> >   #include <asm/page.h>
> >   #include <asm/code-patching.h>
> >   #include <asm/setup.h>
> > +#include <asm/mmu_context.h>
> >   
> >   static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
> >   			       unsigned int *patch_addr)
> > @@ -65,99 +66,79 @@ void __init poking_init(void)
> >   	pte_unmap_unlock(ptep, ptl);
> >   }
> >   
> > -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > -
> > -static int text_area_cpu_up(unsigned int cpu)
> > -{
> > -	struct vm_struct *area;
> > -
> > -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> > -	if (!area) {
> > -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> > -			cpu);
> > -		return -1;
> > -	}
> > -	this_cpu_write(text_poke_area, area);
> > -
> > -	return 0;
> > -}
> > -
> > -static int text_area_cpu_down(unsigned int cpu)
> > -{
> > -	free_vm_area(this_cpu_read(text_poke_area));
> > -	return 0;
> > -}
> > -
> > -/*
> > - * Run as a late init call. This allows all the boot time patching to be done
> > - * simply by patching the code, and then we're called here prior to
> > - * mark_rodata_ro(), which happens after all init calls are run. Although
> > - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
> > - * it as being preferable to a kernel that will crash later when someone tries
> > - * to use patch_instruction().
> > - */
> > -static int __init setup_text_poke_area(void)
> > -{
> > -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > -		"powerpc/text_poke:online", text_area_cpu_up,
> > -		text_area_cpu_down));
> > -
> > -	return 0;
> > -}
> > -late_initcall(setup_text_poke_area);
> > +struct patch_mapping {
> > +	spinlock_t *ptl; /* for protecting pte table */
> > +	struct temp_mm temp_mm;
> > +};
> >   
> >   /*
> >    * This can be called for kernel text or a module.
> >    */
> > -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> > +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> 
> Why change the name ?
> 

It's not really an "area" anymore.

> >   {
> > -	unsigned long pfn;
> > -	int err;
> > +	struct page *page;
> > +	pte_t pte, *ptep;
> > +	pgprot_t pgprot;
> >   
> >   	if (is_vmalloc_addr(addr))
> > -		pfn = vmalloc_to_pfn(addr);
> > +		page = vmalloc_to_page(addr);
> >   	else
> > -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> > +		page = virt_to_page(addr);
> >   
> > -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> > +	if (radix_enabled())
> > +		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> > +	else
> > +		pgprot = PAGE_SHARED;
> 
> Can you explain the difference between radix and non radix ?
> 
> Why PAGE_KERNEL for a page that is mapped in userspace ?
> 
> Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just 
> using PAGE_KERNEL ?
> 

On hash there is a manual check which prevents setting _PAGE_PRIVILEGED for
kernel to userspace access in __hash_page - hence we cannot access the mapping
if the page is mapped PAGE_KERNEL on hash. However, I would like to use
PAGE_KERNEL here as well and am working on understanding why this check is
done in hash and if this can change. On radix this works just fine.

The page is mapped PAGE_KERNEL because the address is technically a userspace
address - but only to keep the mapping local to this CPU doing the patching.
PAGE_KERNEL makes it clear both in intent and protection that this is a kernel
mapping.

I think the correct way is pgprot_val(PAGE_KERNEL) since PAGE_KERNEL is defined
as:

#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)

and __pgprot() is defined as:

typedef struct { unsigned long pgprot; } pgprot_t;
#define pgprot_val(x)   ((x).pgprot)
#define __pgprot(x)     ((pgprot_t) { (x) })

> >   
> > -	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> > -	if (err)
> > +	ptep = get_locked_pte(patching_mm, patching_addr, &patch_mapping->ptl);
> > +	if (unlikely(!ptep)) {
> > +		pr_warn("map patch: failed to allocate pte for patching\n");
> >   		return -1;
> > +	}
> > +
> > +	pte = mk_pte(page, pgprot);
> > +	set_pte_at(patching_mm, patching_addr, ptep, pte);
> > +
> > +	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> > +	use_temporary_mm(&patch_mapping->temp_mm);
> >   
> >   	return 0;
> >   }
> >   
> > -static inline int unmap_patch_area(unsigned long addr)
> > +static int unmap_patch(struct patch_mapping *patch_mapping)
> >   {
> >   	pte_t *ptep;
> >   	pmd_t *pmdp;
> >   	pud_t *pudp;
> >   	pgd_t *pgdp;
> >   
> > -	pgdp = pgd_offset_k(addr);
> > +	pgdp = pgd_offset(patching_mm, patching_addr);
> >   	if (unlikely(!pgdp))
> >   		return -EINVAL;
> >   
> > -	pudp = pud_offset(pgdp, addr);
> > +	pudp = pud_offset(pgdp, patching_addr);
> >   	if (unlikely(!pudp))
> >   		return -EINVAL;
> >   
> > -	pmdp = pmd_offset(pudp, addr);
> > +	pmdp = pmd_offset(pudp, patching_addr);
> >   	if (unlikely(!pmdp))
> >   		return -EINVAL;
> >   
> > -	ptep = pte_offset_kernel(pmdp, addr);
> > +	ptep = pte_offset_kernel(pmdp, patching_addr);
> 
> ptep should be stored in the patch_mapping struct instead of walking 
> again the page tables.
> 

Oh yes - this will be in the next version.

> >   	if (unlikely(!ptep))
> >   		return -EINVAL;
> >   
> > -	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
> > +	/*
> > +	 * In hash, pte_clear flushes the tlb
> > +	 */
> > +	pte_clear(patching_mm, patching_addr, ptep);
> > +	unuse_temporary_mm(&patch_mapping->temp_mm);
> >   
> >   	/*
> > -	 * In hash, pte_clear flushes the tlb, in radix, we have to
> > +	 * In radix, we have to explicitly flush the tlb (no-op in hash)
> >   	 */
> > -	pte_clear(&init_mm, addr, ptep);
> > -	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> > +	local_flush_tlb_mm(patching_mm);
> > +	pte_unmap_unlock(ptep, patch_mapping->ptl);
> >   
> >   	return 0;
> >   }
> > @@ -167,33 +148,38 @@ static int do_patch_instruction(unsigned int *addr, unsigned int instr)
> >   	int err;
> >   	unsigned int *patch_addr = NULL;
> >   	unsigned long flags;
> > -	unsigned long text_poke_addr;
> > -	unsigned long kaddr = (unsigned long)addr;
> > +	struct patch_mapping patch_mapping;
> >   
> >   	/*
> > -	 * During early early boot patch_instruction is called
> > -	 * when text_poke_area is not ready, but we still need
> > -	 * to allow patching. We just do the plain old patching
> > +	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
> > +	 * to this, patch_instruction is called when we don't have (and don't
> > +	 * need) the patching_mm so just do plain old patching.
> >   	 */
> > -	if (!this_cpu_read(text_poke_area))
> > +	if (!patching_mm)
> >   		return raw_patch_instruction(addr, instr);
> >   
> >   	local_irq_save(flags);
> >   
> > -	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> > -	if (map_patch_area(addr, text_poke_addr)) {
> > -		err = -1;
> > +	err = map_patch(addr, &patch_mapping);
> > +	if (err)
> >   		goto out;
> > -	}
> >   
> > -	patch_addr = (unsigned int *)(text_poke_addr) +
> > -			((kaddr & ~PAGE_MASK) / sizeof(unsigned int));
> > +	patch_addr = (unsigned int *)(patching_addr) +
> > +			(offset_in_page((unsigned long)addr) /
> > +				sizeof(unsigned int));
> >   
> >   	__patch_instruction(addr, instr, patch_addr);
> 
> The error returned by __patch_instruction() should be managed.
> 

Agreed, will do something in the next spin.

> >   
> > -	err = unmap_patch_area(text_poke_addr);
> > +	err = unmap_patch(&patch_mapping);
> >   	if (err)
> > -		pr_warn("failed to unmap %lx\n", text_poke_addr);
> > +		pr_warn("unmap patch: failed to unmap patch\n");
> > +
> > +	/*
> > +	 * Something is wrong if what we just wrote doesn't match what we
> > +	 * think we just wrote.
> > +	 * XXX: BUG_ON() instead?
> 
> No, not a BUG_ON(). If patching fails, that's no a vital fault, we can 
> fail gracefully. You should return a fault instead.
> 

Yup - will make these changes in the next version.

> > +	 */
> > +	WARN_ON(memcmp(addr, &instr, sizeof(instr)));
> 
> Come on. addr is an *int, instr is an int. By doing a memcmp() on 
> &instr, you for the compiler to write instr into the stack whereas local 
> vars are mainly in registers on RISC processors like powerpc. Following 
> should do it:
> 
> 	WARN_ON(*addr != instr);
> 

Oh man - I agree, that's just embarrassing.
Appreciate your feedback on this RFC series, thanks!

> >   
> >   out:
> >   	local_irq_restore(flags);
> > 
> 
> Christophe