[PATCH v10 6/6] arm64: introduce copy_mc_to_kernel() implementation
Tong Tiangen
tongtiangen at huawei.com
Wed Jan 31 00:56:46 AEDT 2024
在 2024/1/30 18:20, Mark Rutland 写道:
> On Mon, Jan 29, 2024 at 09:46:52PM +0800, Tong Tiangen wrote:
>> The copy_mc_to_kernel() helper is memory copy implementation that handles
>> source exceptions. It can be used in memory copy scenarios that tolerate
>> hardware memory errors (e.g. pmem_read/dax_copy_to_iter).
>>
>> Currently, only x86 and ppc support this helper; now that arm64
>> supports the machine check safe framework, we introduce a
>> copy_mc_to_kernel() implementation for it.
>>
>> Signed-off-by: Tong Tiangen <tongtiangen at huawei.com>
>> ---
>> arch/arm64/include/asm/string.h | 5 +
>> arch/arm64/include/asm/uaccess.h | 21 +++
>> arch/arm64/lib/Makefile | 2 +-
>> arch/arm64/lib/memcpy_mc.S | 257 +++++++++++++++++++++++++++++++
>> mm/kasan/shadow.c | 12 ++
>> 5 files changed, 296 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/lib/memcpy_mc.S
>
> Looking at the diffstat and code, this duplicates arch/arm64/lib/memcpy.S with
> a few annotations. Duplicating that code is not maintainable, and so we cannot
> take this as-is.
>
> If you want a version that can handle faults that *must* be written such that
> the code is shared with the regular memcpy. That could be done by using macros
> to instantiate two copies (one with fault handling, the other without).
>
> It would also be very helpful to see *any* indication that this has been
> tested, which is sorely lacking in the series as-is.
>
> Mark.
OK, so that's exactly what I want to solve right now — the large amount
of duplicate code — and I'm going to think about how to deal with that.
Thank you very much for your good advice :)
Tong.
>
>> diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
>> index 3a3264ff47b9..995b63c26e99 100644
>> --- a/arch/arm64/include/asm/string.h
>> +++ b/arch/arm64/include/asm/string.h
>> @@ -35,6 +35,10 @@ extern void *memchr(const void *, int, __kernel_size_t);
>> extern void *memcpy(void *, const void *, __kernel_size_t);
>> extern void *__memcpy(void *, const void *, __kernel_size_t);
>>
>> +#define __HAVE_ARCH_MEMCPY_MC
>> +extern int memcpy_mcs(void *, const void *, __kernel_size_t);
>> +extern int __memcpy_mcs(void *, const void *, __kernel_size_t);
>> +
>> #define __HAVE_ARCH_MEMMOVE
>> extern void *memmove(void *, const void *, __kernel_size_t);
>> extern void *__memmove(void *, const void *, __kernel_size_t);
>> @@ -57,6 +61,7 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt);
>> */
>>
>> #define memcpy(dst, src, len) __memcpy(dst, src, len)
>> +#define memcpy_mcs(dst, src, len) __memcpy_mcs(dst, src, len)
>> #define memmove(dst, src, len) __memmove(dst, src, len)
>> #define memset(s, c, n) __memset(s, c, n)
>>
>> diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
>> index 14be5000c5a0..61e28ef2112a 100644
>> --- a/arch/arm64/include/asm/uaccess.h
>> +++ b/arch/arm64/include/asm/uaccess.h
>> @@ -425,4 +425,25 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr,
>>
>> #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
>>
>> +#ifdef CONFIG_ARCH_HAS_COPY_MC
>> +/**
>> + * copy_mc_to_kernel - memory copy that handles source exceptions
>> + *
>> + * @dst: destination address
>> + * @src: source address
>> + * @len: number of bytes to copy
>> + *
>> + * Return 0 for success, or #size if there was an exception.
>> + */
>> +static inline unsigned long __must_check
>> +copy_mc_to_kernel(void *to, const void *from, unsigned long size)
>> +{
>> + int ret;
>> +
>> + ret = memcpy_mcs(to, from, size);
>> + return (ret == -EFAULT) ? size : 0;
>> +}
>> +#define copy_mc_to_kernel copy_mc_to_kernel
>> +#endif
>> +
>> #endif /* __ASM_UACCESS_H */
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index a2fd865b816d..899d6ae9698c 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -3,7 +3,7 @@ lib-y := clear_user.o delay.o copy_from_user.o \
>> copy_to_user.o copy_page.o \
>> clear_page.o csum.o insn.o memchr.o memcpy.o \
>> memset.o memcmp.o strcmp.o strncmp.o strlen.o \
>> - strnlen.o strchr.o strrchr.o tishift.o
>> + strnlen.o strchr.o strrchr.o tishift.o memcpy_mc.o
>>
>> ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>> obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
>> diff --git a/arch/arm64/lib/memcpy_mc.S b/arch/arm64/lib/memcpy_mc.S
>> new file mode 100644
>> index 000000000000..7076b500d154
>> --- /dev/null
>> +++ b/arch/arm64/lib/memcpy_mc.S
>> @@ -0,0 +1,257 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (c) 2012-2021, Arm Limited.
>> + *
>> + * Adapted from the original at:
>> + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> +/* Assumptions:
>> + *
>> + * ARMv8-a, AArch64, unaligned accesses.
>> + *
>> + */
>> +
>> +#define L(label) .L ## label
>> +
>> +#define dstin x0
>> +#define src x1
>> +#define count x2
>> +#define dst x3
>> +#define srcend x4
>> +#define dstend x5
>> +#define A_l x6
>> +#define A_lw w6
>> +#define A_h x7
>> +#define B_l x8
>> +#define B_lw w8
>> +#define B_h x9
>> +#define C_l x10
>> +#define C_lw w10
>> +#define C_h x11
>> +#define D_l x12
>> +#define D_h x13
>> +#define E_l x14
>> +#define E_h x15
>> +#define F_l x16
>> +#define F_h x17
>> +#define G_l count
>> +#define G_h dst
>> +#define H_l src
>> +#define H_h srcend
>> +#define tmp1 x14
>> +
>> +/* This implementation handles overlaps and supports both memcpy and memmove
>> + from a single entry point. It uses unaligned accesses and branchless
>> + sequences to keep the code small, simple and improve performance.
>> +
>> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
>> + copies of up to 128 bytes, and large copies. The overhead of the overlap
>> + check is negligible since it is only required for large copies.
>> +
>> + Large copies use a software pipelined loop processing 64 bytes per iteration.
>> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
>> + The loop tail is handled by always copying 64 bytes from the end.
>> +*/
>> +
>> +SYM_FUNC_START(__pi_memcpy_mcs)
>> + add srcend, src, count
>> + add dstend, dstin, count
>> + cmp count, 128
>> + b.hi L(copy_long)
>> + cmp count, 32
>> + b.hi L(copy32_128)
>> +
>> + /* Small copies: 0..32 bytes. */
>> + cmp count, 16
>> + b.lo L(copy16)
>> + CPY_MC(9998f, ldp A_l, A_h, [src])
>> + CPY_MC(9998f, ldp D_l, D_h, [srcend, -16])
>> + CPY_MC(9998f, stp A_l, A_h, [dstin])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -16])
>> + mov x0, #0
>> + ret
>> +
>> + /* Copy 8-15 bytes. */
>> +L(copy16):
>> + tbz count, 3, L(copy8)
>> + CPY_MC(9998f, ldr A_l, [src])
>> + CPY_MC(9998f, ldr A_h, [srcend, -8])
>> + CPY_MC(9998f, str A_l, [dstin])
>> + CPY_MC(9998f, str A_h, [dstend, -8])
>> + mov x0, #0
>> + ret
>> +
>> + .p2align 3
>> + /* Copy 4-7 bytes. */
>> +L(copy8):
>> + tbz count, 2, L(copy4)
>> + CPY_MC(9998f, ldr A_lw, [src])
>> + CPY_MC(9998f, ldr B_lw, [srcend, -4])
>> + CPY_MC(9998f, str A_lw, [dstin])
>> + CPY_MC(9998f, str B_lw, [dstend, -4])
>> + mov x0, #0
>> + ret
>> +
>> + /* Copy 0..3 bytes using a branchless sequence. */
>> +L(copy4):
>> + cbz count, L(copy0)
>> + lsr tmp1, count, 1
>> + CPY_MC(9998f, ldrb A_lw, [src])
>> + CPY_MC(9998f, ldrb C_lw, [srcend, -1])
>> + CPY_MC(9998f, ldrb B_lw, [src, tmp1])
>> + CPY_MC(9998f, strb A_lw, [dstin])
>> + CPY_MC(9998f, strb B_lw, [dstin, tmp1])
>> + CPY_MC(9998f, strb C_lw, [dstend, -1])
>> +L(copy0):
>> + mov x0, #0
>> + ret
>> +
>> + .p2align 4
>> + /* Medium copies: 33..128 bytes. */
>> +L(copy32_128):
>> + CPY_MC(9998f, ldp A_l, A_h, [src])
>> + CPY_MC(9998f, ldp B_l, B_h, [src, 16])
>> + CPY_MC(9998f, ldp C_l, C_h, [srcend, -32])
>> + CPY_MC(9998f, ldp D_l, D_h, [srcend, -16])
>> + cmp count, 64
>> + b.hi L(copy128)
>> + CPY_MC(9998f, stp A_l, A_h, [dstin])
>> + CPY_MC(9998f, stp B_l, B_h, [dstin, 16])
>> + CPY_MC(9998f, stp C_l, C_h, [dstend, -32])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -16])
>> + mov x0, #0
>> + ret
>> +
>> + .p2align 4
>> + /* Copy 65..128 bytes. */
>> +L(copy128):
>> + CPY_MC(9998f, ldp E_l, E_h, [src, 32])
>> + CPY_MC(9998f, ldp F_l, F_h, [src, 48])
>> + cmp count, 96
>> + b.ls L(copy96)
>> + CPY_MC(9998f, ldp G_l, G_h, [srcend, -64])
>> + CPY_MC(9998f, ldp H_l, H_h, [srcend, -48])
>> + CPY_MC(9998f, stp G_l, G_h, [dstend, -64])
>> + CPY_MC(9998f, stp H_l, H_h, [dstend, -48])
>> +L(copy96):
>> + CPY_MC(9998f, stp A_l, A_h, [dstin])
>> + CPY_MC(9998f, stp B_l, B_h, [dstin, 16])
>> + CPY_MC(9998f, stp E_l, E_h, [dstin, 32])
>> + CPY_MC(9998f, stp F_l, F_h, [dstin, 48])
>> + CPY_MC(9998f, stp C_l, C_h, [dstend, -32])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -16])
>> + mov x0, #0
>> + ret
>> +
>> + .p2align 4
>> + /* Copy more than 128 bytes. */
>> +L(copy_long):
>> + /* Use backwards copy if there is an overlap. */
>> + sub tmp1, dstin, src
>> + cbz tmp1, L(copy0)
>> + cmp tmp1, count
>> + b.lo L(copy_long_backwards)
>> +
>> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
>> +
>> + CPY_MC(9998f, ldp D_l, D_h, [src])
>> + and tmp1, dstin, 15
>> + bic dst, dstin, 15
>> + sub src, src, tmp1
>> + add count, count, tmp1 /* Count is now 16 too large. */
>> + CPY_MC(9998f, ldp A_l, A_h, [src, 16])
>> + CPY_MC(9998f, stp D_l, D_h, [dstin])
>> + CPY_MC(9998f, ldp B_l, B_h, [src, 32])
>> + CPY_MC(9998f, ldp C_l, C_h, [src, 48])
>> + CPY_MC(9998f, ldp D_l, D_h, [src, 64]!)
>> + subs count, count, 128 + 16 /* Test and readjust count. */
>> + b.ls L(copy64_from_end)
>> +
>> +L(loop64):
>> + CPY_MC(9998f, stp A_l, A_h, [dst, 16])
>> + CPY_MC(9998f, ldp A_l, A_h, [src, 16])
>> + CPY_MC(9998f, stp B_l, B_h, [dst, 32])
>> + CPY_MC(9998f, ldp B_l, B_h, [src, 32])
>> + CPY_MC(9998f, stp C_l, C_h, [dst, 48])
>> + CPY_MC(9998f, ldp C_l, C_h, [src, 48])
>> + CPY_MC(9998f, stp D_l, D_h, [dst, 64]!)
>> + CPY_MC(9998f, ldp D_l, D_h, [src, 64]!)
>> + subs count, count, 64
>> + b.hi L(loop64)
>> +
>> + /* Write the last iteration and copy 64 bytes from the end. */
>> +L(copy64_from_end):
>> + CPY_MC(9998f, ldp E_l, E_h, [srcend, -64])
>> + CPY_MC(9998f, stp A_l, A_h, [dst, 16])
>> + CPY_MC(9998f, ldp A_l, A_h, [srcend, -48])
>> + CPY_MC(9998f, stp B_l, B_h, [dst, 32])
>> + CPY_MC(9998f, ldp B_l, B_h, [srcend, -32])
>> + CPY_MC(9998f, stp C_l, C_h, [dst, 48])
>> + CPY_MC(9998f, ldp C_l, C_h, [srcend, -16])
>> + CPY_MC(9998f, stp D_l, D_h, [dst, 64])
>> + CPY_MC(9998f, stp E_l, E_h, [dstend, -64])
>> + CPY_MC(9998f, stp A_l, A_h, [dstend, -48])
>> + CPY_MC(9998f, stp B_l, B_h, [dstend, -32])
>> + CPY_MC(9998f, stp C_l, C_h, [dstend, -16])
>> + mov x0, #0
>> + ret
>> +
>> + .p2align 4
>> +
>> + /* Large backwards copy for overlapping copies.
>> + Copy 16 bytes and then align dst to 16-byte alignment. */
>> +L(copy_long_backwards):
>> + CPY_MC(9998f, ldp D_l, D_h, [srcend, -16])
>> + and tmp1, dstend, 15
>> + sub srcend, srcend, tmp1
>> + sub count, count, tmp1
>> + CPY_MC(9998f, ldp A_l, A_h, [srcend, -16])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -16])
>> + CPY_MC(9998f, ldp B_l, B_h, [srcend, -32])
>> + CPY_MC(9998f, ldp C_l, C_h, [srcend, -48])
>> + CPY_MC(9998f, ldp D_l, D_h, [srcend, -64]!)
>> + sub dstend, dstend, tmp1
>> + subs count, count, 128
>> + b.ls L(copy64_from_start)
>> +
>> +L(loop64_backwards):
>> + CPY_MC(9998f, stp A_l, A_h, [dstend, -16])
>> + CPY_MC(9998f, ldp A_l, A_h, [srcend, -16])
>> + CPY_MC(9998f, stp B_l, B_h, [dstend, -32])
>> + CPY_MC(9998f, ldp B_l, B_h, [srcend, -32])
>> + CPY_MC(9998f, stp C_l, C_h, [dstend, -48])
>> + CPY_MC(9998f, ldp C_l, C_h, [srcend, -48])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -64]!)
>> + CPY_MC(9998f, ldp D_l, D_h, [srcend, -64]!)
>> + subs count, count, 64
>> + b.hi L(loop64_backwards)
>> +
>> + /* Write the last iteration and copy 64 bytes from the start. */
>> +L(copy64_from_start):
>> + CPY_MC(9998f, ldp G_l, G_h, [src, 48])
>> + CPY_MC(9998f, stp A_l, A_h, [dstend, -16])
>> + CPY_MC(9998f, ldp A_l, A_h, [src, 32])
>> + CPY_MC(9998f, stp B_l, B_h, [dstend, -32])
>> + CPY_MC(9998f, ldp B_l, B_h, [src, 16])
>> + CPY_MC(9998f, stp C_l, C_h, [dstend, -48])
>> + CPY_MC(9998f, ldp C_l, C_h, [src])
>> + CPY_MC(9998f, stp D_l, D_h, [dstend, -64])
>> + CPY_MC(9998f, stp G_l, G_h, [dstin, 48])
>> + CPY_MC(9998f, stp A_l, A_h, [dstin, 32])
>> + CPY_MC(9998f, stp B_l, B_h, [dstin, 16])
>> + CPY_MC(9998f, stp C_l, C_h, [dstin])
>> + mov x0, #0
>> + ret
>> +
>> +9998: mov x0, #-EFAULT
>> + ret
>> +SYM_FUNC_END(__pi_memcpy_mcs)
>> +
>> +SYM_FUNC_ALIAS(__memcpy_mcs, __pi_memcpy_mcs)
>> +EXPORT_SYMBOL(__memcpy_mcs)
>> +SYM_FUNC_ALIAS_WEAK(memcpy_mcs, __memcpy_mcs)
>> +EXPORT_SYMBOL(memcpy_mcs)
>> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
>> index 9ef84f31833f..e6519fd329b2 100644
>> --- a/mm/kasan/shadow.c
>> +++ b/mm/kasan/shadow.c
>> @@ -79,6 +79,18 @@ void *memcpy(void *dest, const void *src, size_t len)
>> }
>> #endif
>>
>> +#ifdef __HAVE_ARCH_MEMCPY_MC
>> +#undef memcpy_mcs
>> +int memcpy_mcs(void *dest, const void *src, size_t len)
>> +{
>> + if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
>> + !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
>> + return (unsigned long)len;
>> +
>> + return __memcpy_mcs(dest, src, len);
>> +}
>> +#endif
>> +
>> void *__asan_memset(void *addr, int c, ssize_t len)
>> {
>> if (!kasan_check_range(addr, len, true, _RET_IP_))
>> --
>> 2.25.1
>>
> .
More information about the Linuxppc-dev
mailing list