[PATCH v5 2/8] lkdtm/powerpc: Add test to hijack a patch mapping

Thu Aug 5 19:13:03 AEST 2021

Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> When live patching with STRICT_KERNEL_RWX the CPU doing the patching
> must temporarily remap the page(s) containing the patch site with +W
> permissions. While this temporary mapping is in use, another CPU could
> write to the same mapping and maliciously alter kernel text. Implement a
> LKDTM test to attempt to exploit such an opening during code patching.
> The test is implemented on powerpc and requires LKDTM built into the
> kernel (building LKDTM as a module is insufficient).
> 
> The LKDTM "hijack" test works as follows:
> 
>    1. A CPU executes an infinite loop to patch an instruction. This is
>       the "patching" CPU.
>    2. Another CPU attempts to write to the address of the temporary
>       mapping used by the "patching" CPU. This other CPU is the
>       "hijacker" CPU. The hijack either fails with a fault/error or
>       succeeds, in which case some kernel text is now overwritten.
> 
> The virtual address of the temporary patch mapping is provided via an
> LKDTM-specific accessor to the hijacker CPU. This test assumes a
> hypothetical situation where this address was leaked previously.
> 
> How to run the test:
> 
> 	mount -t debugfs none /sys/kernel/debug
> 	(echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT)
> 
> A passing test indicates that it is not possible to overwrite kernel
> text from another CPU by using the temporary mapping established by
> a CPU for patching.
> 
> Signed-off-by: Christopher M. Riedl <cmr at linux.ibm.com>
> 
> ---
> 
> v5:  * Use `u32*` instead of `struct ppc_inst*` based on new series in
>         upstream.
> 
> v4:  * Separate the powerpc and x86_64 bits into individual patches.
>       * Use __put_kernel_nofault() when attempting to hijack the mapping
>       * Use raw_smp_processor_id() to avoid triggering the BUG() when
>         calling smp_processor_id() in preemptible code - the only thing
>         that matters is that one of the threads is bound to a different
>         CPU - we are not using smp_processor_id() to access any per-cpu
>         data or similar where preemption should be disabled.
>       * Rework the patching_cpu() kthread stop condition to avoid:
>         https://lwn.net/Articles/628628/
> ---
>   drivers/misc/lkdtm/core.c  |   1 +
>   drivers/misc/lkdtm/lkdtm.h |   1 +
>   drivers/misc/lkdtm/perms.c | 134 +++++++++++++++++++++++++++++++++++++
>   3 files changed, 136 insertions(+)
> 
> diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
> index 8024b6a5cc7fc..fbcb95eda337b 100644
> --- a/drivers/misc/lkdtm/core.c
> +++ b/drivers/misc/lkdtm/core.c
> @@ -147,6 +147,7 @@ static const struct crashtype crashtypes[] = {
>   	CRASHTYPE(WRITE_RO),
>   	CRASHTYPE(WRITE_RO_AFTER_INIT),
>   	CRASHTYPE(WRITE_KERN),
> +	CRASHTYPE(HIJACK_PATCH),
>   	CRASHTYPE(REFCOUNT_INC_OVERFLOW),
>   	CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
>   	CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
> diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
> index 99f90d3e5e9cb..87e7e6136d962 100644
> --- a/drivers/misc/lkdtm/lkdtm.h
> +++ b/drivers/misc/lkdtm/lkdtm.h
> @@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void);
>   void lkdtm_EXEC_NULL(void);
>   void lkdtm_ACCESS_USERSPACE(void);
>   void lkdtm_ACCESS_NULL(void);
> +void lkdtm_HIJACK_PATCH(void);
>   
>   /* refcount.c */
>   void lkdtm_REFCOUNT_INC_OVERFLOW(void);
> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> index 2dede2ef658f3..39e7456852229 100644
> --- a/drivers/misc/lkdtm/perms.c
> +++ b/drivers/misc/lkdtm/perms.c
> @@ -9,6 +9,7 @@
>   #include <linux/vmalloc.h>
>   #include <linux/mman.h>
>   #include <linux/uaccess.h>
> +#include <linux/kthread.h>
>   #include <asm/cacheflush.h>
>   
>   /* Whether or not to fill the target memory area with do_nothing(). */
> @@ -222,6 +223,139 @@ void lkdtm_ACCESS_NULL(void)
>   	pr_err("FAIL: survived bad write\n");
>   }
>   
> +#if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
> +	defined(CONFIG_PPC))

I think this test shouldn't be limited to CONFIG_PPC and shouldn't be limited to 
CONFIG_STRICT_KERNEL_RWX. It should be there all the time.

Also why limiting it to IS_BUILTIN(CONFIG_LKDTM) ?

> +/*
> + * This is just a dummy location to patch-over.
> + */
> +static void patching_target(void)
> +{
> +	return;
> +}
> +
> +#include <asm/code-patching.h>
> +const u32 *patch_site = (const u32 *)&patching_target;
> +
> +static inline int lkdtm_do_patch(u32 data)
> +{
> +	return patch_instruction((u32 *)patch_site, ppc_inst(data));
> +}
> +
> +static inline u32 lkdtm_read_patch_site(void)
> +{
> +	return READ_ONCE(*patch_site);
> +}
> +
> +/* Returns True if the write succeeds */
> +static inline bool lkdtm_try_write(u32 data, u32 *addr)
> +{
> +	__put_kernel_nofault(addr, &data, u32, err);
> +	return true;
> +
> +err:
> +	return false;
> +}
> +
> +static int lkdtm_patching_cpu(void *data)
> +{
> +	int err = 0;
> +	u32 val = 0xdeadbeef;
> +
> +	pr_info("starting patching_cpu=%d\n", raw_smp_processor_id());
> +
> +	do {
> +		err = lkdtm_do_patch(val);
> +	} while (lkdtm_read_patch_site() == val && !err && !kthread_should_stop());
> +
> +	if (err)
> +		pr_warn("XFAIL: patch_instruction returned error: %d\n", err);
> +
> +	while (!kthread_should_stop()) {
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		schedule();
> +	}
> +
> +	return err;
> +}
> +
> +void lkdtm_HIJACK_PATCH(void)
> +{
> +	struct task_struct *patching_kthrd;
> +	int patching_cpu, hijacker_cpu, attempts;
> +	unsigned long addr;
> +	bool hijacked;
> +	const u32 bad_data = 0xbad00bad;
> +	const u32 original_insn = lkdtm_read_patch_site();
> +
> +	if (!IS_ENABLED(CONFIG_SMP)) {
> +		pr_err("XFAIL: this test requires CONFIG_SMP\n");
> +		return;
> +	}
> +
> +	if (num_online_cpus() < 2) {
> +		pr_warn("XFAIL: this test requires at least two cpus\n");
> +		return;
> +	}
> +
> +	hijacker_cpu = raw_smp_processor_id();
> +	patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu);
> +
> +	patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL,
> +						cpu_to_node(patching_cpu),
> +						"lkdtm_patching_cpu");
> +	kthread_bind(patching_kthrd, patching_cpu);
> +	wake_up_process(patching_kthrd);
> +
> +	addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
> +
> +	pr_info("starting hijacker_cpu=%d\n", hijacker_cpu);
> +	for (attempts = 0; attempts < 100000; ++attempts) {
> +		/* Try to write to the other CPU's temp patch mapping */
> +		hijacked = lkdtm_try_write(bad_data, (u32 *)addr);
> +
> +		if (hijacked) {
> +			if (kthread_stop(patching_kthrd)) {
> +				pr_info("hijack attempts: %d\n", attempts);
> +				pr_err("XFAIL: error stopping patching cpu\n");
> +				return;
> +			}
> +			break;
> +		}
> +	}
> +	pr_info("hijack attempts: %d\n", attempts);
> +
> +	if (hijacked) {
> +		if (lkdtm_read_patch_site() == bad_data)
> +			pr_err("overwrote kernel text\n");
> +		/*
> +		 * There are window conditions where the hijacker cpu manages to
> +		 * write to the patch site but the site gets overwritten again by
> +		 * the patching cpu. We still consider that a "successful" hijack
> +		 * since the hijacker cpu did not fault on the write.
> +		 */
> +		pr_err("FAIL: wrote to another cpu's patching area\n");
> +	} else {
> +		kthread_stop(patching_kthrd);
> +	}
> +
> +	/* Restore the original data to be able to run the test again */
> +	lkdtm_do_patch(original_insn);
> +}
> +
> +#else
> +
> +void lkdtm_HIJACK_PATCH(void)
> +{
> +	if (!IS_ENABLED(CONFIG_PPC))
> +		pr_err("XFAIL: this test only runs on powerpc\n");
> +	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
> +		pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
> +	if (!IS_BUILTIN(CONFIG_LKDTM))
> +		pr_err("XFAIL: this test requires CONFIG_LKDTM=y (not =m!)\n");
> +}
> +
> +#endif
> +
>   void __init lkdtm_perms_init(void)
>   {
>   	/* Make sure we can write to __ro_after_init values during __init */
>