[PATCH v1 1/3] powerpc: Align bytes before fall back to .Lshort in powerpc memcmp
Christophe LEROY
christophe.leroy at c-s.fr
Tue Sep 19 22:20:57 AEST 2017
Hi
Could you write powerpc/64 instead of powerpc in the email/patch subject,
as it doesn't apply to powerpc/32?
On 19/09/2017 at 12:03, wei.guo.simon at gmail.com wrote:
> From: Simon Guo <wei.guo.simon at gmail.com>
>
> Currently memcmp() in powerpc will fall back to .Lshort (compare per byte
Say powerpc/64 here too.
Christophe
> mode) if either the src or dst address is not 8-byte aligned. It can be
> optimized if both addresses share the same offset relative to an 8-byte
> boundary.
>
> memcmp() can first align the src/dst addresses to an 8-byte boundary and
> then compare in .Llong mode.
>
> This patch optimizes memcmp() behavior in this situation.
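To make the intent concrete, the new fast path behaves roughly like the
following C model (an illustrative sketch only: the name memcmp_model is
made up, and the real assembly peels 1/2/4-byte steps rather than a plain
byte loop):

-------
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Peel leading bytes until the pointers are 8-byte aligned, then
 * compare a doubleword at a time. This is only valid when src and
 * dst share the same offset within an 8-byte granule, which is
 * exactly what the xor/andi. test at the top of memcmp checks. */
static int memcmp_model(const unsigned char *s1, const unsigned char *s2,
			size_t n)
{
	while (n && ((uintptr_t)s1 & 7)) {	/* .Lalignbytes_start */
		if (*s1 != *s2)
			return *s1 < *s2 ? -1 : 1;
		s1++, s2++, n--;
	}
	while (n >= 8) {			/* .Llong / 831: loops */
		uint64_t a, b;

		memcpy(&a, s1, 8);
		memcpy(&b, s2, 8);
		if (a != b)
			break;	/* find the differing byte below */
		s1 += 8, s2 += 8, n -= 8;
	}
	while (n--) {				/* .Lshort tail */
		if (*s1 != *s2)
			return *s1 < *s2 ? -1 : 1;
		s1++, s2++;
	}
	return 0;
}
-------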
>
> Test result:
>
> (1) 256 bytes
> Tested with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
> - without patch
> 50.715169506 seconds time elapsed ( +- 0.04% )
> - with patch
> 28.906602373 seconds time elapsed ( +- 0.02% )
> -> There is a ~75% improvement (50.715 / 28.906 ≈ 1.75).
>
> (2) 32 bytes
> To observe the performance impact on < 32 bytes, modify
> tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
> -------
> #include <string.h>
> #include "utils.h"
>
> -#define SIZE 256
> +#define SIZE 32
> #define ITERATIONS 10000
>
> int test_memcmp(const void *s1, const void *s2, size_t n);
> --------
>
> - Without patch
> 0.390677136 seconds time elapsed ( +- 0.03% )
> - with patch
> 0.375685926 seconds time elapsed ( +- 0.05% )
> -> There is a ~4% improvement.
>
> (3) 0~8 bytes
> To observe the performance impact on < 8 bytes, modify
> tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
> -------
> #include <string.h>
> #include "utils.h"
>
> -#define SIZE 256
> -#define ITERATIONS 10000
> +#define SIZE 8
> +#define ITERATIONS 100000
>
> int test_memcmp(const void *s1, const void *s2, size_t n);
> -------
> - Without patch
> 3.169203981 seconds time elapsed ( +- 0.23% )
> - With patch
> 3.208257362 seconds time elapsed ( +- 0.13% )
> -> There is a ~1% decrease.
> (I don't know why yet, since the 0~8 bytes memcmp() code path contains
> the same number of instructions with and without this patch. Any
> comments would be appreciated.)
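For anyone reproducing the numbers above, the stringloops harness
essentially times a tight loop over the assembly routine, roughly like
the sketch below (the structure is assumed from the SIZE/ITERATIONS
knobs; the real selftest also walks the buffers across different
offsets, and the "time elapsed" figures presumably come from perf stat):

-------
#include <stddef.h>

#define SIZE 256
#define ITERATIONS 10000

/* Provided by the assembly under test. */
int test_memcmp(const void *s1, const void *s2, size_t n);

/* Assumed shape of the selftest inner loop: call the assembly
 * memcmp ITERATIONS times over two SIZE-byte buffers. */
static void bench(const unsigned char *s1, const unsigned char *s2)
{
	unsigned long i;

	for (i = 0; i < ITERATIONS; i++)
		test_memcmp(s1, s2, SIZE);
}
-------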
>
> Signed-off-by: Simon Guo <wei.guo.simon at gmail.com>
> ---
> arch/powerpc/lib/memcmp_64.S | 86 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 82 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index d75d18b..6dbafdb 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -24,25 +24,95 @@
> #define rH r31
>
> #ifdef __LITTLE_ENDIAN__
> +#define LH lhbrx
> +#define LW lwbrx
> #define LD ldbrx
> #else
> +#define LH lhzx
> +#define LW lwzx
> #define LD ldx
> #endif
>
> _GLOBAL(memcmp)
> cmpdi cr1,r5,0
>
> - /* Use the short loop if both strings are not 8B aligned */
> - or r6,r3,r4
> + /* Use the short loop if the src/dst addresses do not share
> + * the same offset relative to an 8-byte boundary.
> + */
> + xor r6,r3,r4
> andi. r6,r6,7
>
> - /* Use the short loop if length is less than 32B */
> - cmpdi cr6,r5,31
> + /* Fall back to the short loop if comparing no more than
> + * 8 bytes at aligned addresses.
> + */
> + cmpdi cr6,r5,8
>
> beq cr1,.Lzero
> bne .Lshort
> + ble cr6,.Lshort
> +
> +.Lalignbytes_start:
> + /* Bits 0/1/2 of the src and dst addresses are identical. */
> + neg r0,r3
> + andi. r0,r0,7
> + beq .Lalign8bytes
> +
> + PPC_MTOCRF(1,r0)
> + bf 31,.Lalign2bytes
> + lbz rA,0(r3)
> + lbz rB,0(r4)
> + cmplw cr0,rA,rB
> + bne cr0,.LcmpAB_lightweight
> + addi r3,r3,1
> + addi r4,r4,1
> + subi r5,r5,1
> +.Lalign2bytes:
> + bf 30,.Lalign4bytes
> + LH rA,0,r3
> + LH rB,0,r4
> + cmplw cr0,rA,rB
> + bne cr0,.LcmpAB_lightweight
> + addi r3,r3,2
> + addi r4,r4,2
> + subi r5,r5,2
> +.Lalign4bytes:
> + bf 29,.Lalign8bytes
> + LW rA,0,r3
> + LW rB,0,r4
> + cmpld cr0,rA,rB
> + bne cr0,.LcmpAB_lightweight
> + addi r3,r3,4
> + addi r4,r4,4
> + subi r5,r5,4
> +.Lalign8bytes:
> + /* The addresses are now 8-byte aligned. Use the short loop if
> + * fewer than 8 bytes remain.
> + */
> + cmpdi cr6,r5,7
> + ble cr6,.Lshort
> +
> + /* Use the .Llong loop if 32 or more bytes remain */
> + cmpdi cr6,r5,31
> bgt cr6,.Llong
>
> +.Lcmploop_8bytes_31bytes:
> + /* handle 8 ~ 31 bytes with 8-byte aligned addresses */
> + srdi. r0,r5,3
> + clrldi r5,r5,61
> + mtctr r0
> +831:
> + LD rA,0,r3
> + LD rB,0,r4
> + cmpld cr0,rA,rB
> + bne cr0,.LcmpAB_lightweight
> + addi r3,r3,8
> + addi r4,r4,8
> + bdnz 831b
> +
> + cmpwi r5,0
> + beq .Lzero
> +
> .Lshort:
> mtctr r5
>
> @@ -232,4 +302,12 @@ _GLOBAL(memcmp)
> ld r28,-32(r1)
> ld r27,-40(r1)
> blr
> +
> +.LcmpAB_lightweight: /* skip NV GPRS restore */
> + li r3,1
> + bgt cr0,8f
> + li r3,-1
> +8:
> + blr
> +
> EXPORT_SYMBOL(memcmp)
>
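As an aside for anyone reading along, the entry test and the alignment
count in the patch can be expressed in C as below (a sketch for
illustration only; the helper names are made up, not kernel code):

-------
#include <stdint.h>

/* xor r6,r3,r4 ; andi. r6,r6,7 -- the fast path only applies when
 * both pointers sit at the same offset within an 8-byte granule, so
 * that peeling one common count of leading bytes aligns both at once. */
static int same_8byte_offset(uintptr_t s1, uintptr_t s2)
{
	return ((s1 ^ s2) & 7) == 0;
}

/* neg r0,r3 ; andi. r0,r0,7 -- the number of bytes to peel before
 * reaching the next 8-byte boundary; bits 0/1/2 of this count select
 * the 1-, 2- and 4-byte steps taken by the bf 31/30/29 branches. */
static unsigned int bytes_to_align8(uintptr_t s1)
{
	return (unsigned int)(-s1 & 7);
}
-------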