[PATCH v4 3/4] powerpc/lib: implement strlen() in assembly

Gabriel Paubert paubert at iram.es
Fri Jun 8 21:45:13 AEST 2018


On Fri, Jun 08, 2018 at 10:20:41AM +0000, Christophe Leroy wrote:
> The generic implementation of strlen() reads strings byte per byte.
> 
> This patch implements strlen() in assembly based on a read of entire
> words, in the same spirit as what some other arches and glibc do.
> 
> On an 8xx the time spent in strlen is reduced by 2/3 for long strings.
> 
> strlen() selftest on an 8xx provides the following values:
> 
> Before the patch (ie with the generic strlen() in lib/string.c):
> 
> len 256 : time = 0.803648
> len 16  : time = 0.062989
> len 4   : time = 0.026269
> 
> After the patch:
> 
> len 256 : time = 0.267791  ==>  66% improvement
> len 16  : time = 0.037902  ==>  41% improvement
> len 4   : time = 0.026124  ==>  no degradation
> 
> Signed-off-by: Christophe Leroy <christophe.leroy at c-s.fr>
> ---
> Not tested on PPC64.
> 
> Changes in v4:
>  - Added alignment of the loop
>  - doing the andc only if still not 0, as it happens only for bytes above 0x7f, which is pretty rare in a string
> 
> Changes in v3:
>  - Made it common to PPC32 and PPC64
> 
> Changes in v2:
>  - Moved handling of unaligned strings outside of the main path as it is very unlikely.
>  - Removed the verification of the fourth byte in case none of the three first ones are NUL.
> 
> 
>  arch/powerpc/include/asm/asm-compat.h |  4 +++
>  arch/powerpc/include/asm/string.h     |  1 +
>  arch/powerpc/lib/string.S             | 57 +++++++++++++++++++++++++++++++++++
>  3 files changed, 62 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
> index 7f2a7702596c..0e99fe7570c0 100644
> --- a/arch/powerpc/include/asm/asm-compat.h
> +++ b/arch/powerpc/include/asm/asm-compat.h
> @@ -20,8 +20,10 @@
>  
>  /* operations for longs and pointers */
>  #define PPC_LL		stringify_in_c(ld)
> +#define PPC_LLU		stringify_in_c(ldu)
>  #define PPC_STL		stringify_in_c(std)
>  #define PPC_STLU	stringify_in_c(stdu)
> +#define PPC_ROTLI	stringify_in_c(rotldi)
>  #define PPC_LCMPI	stringify_in_c(cmpdi)
>  #define PPC_LCMPLI	stringify_in_c(cmpldi)
>  #define PPC_LCMP	stringify_in_c(cmpd)
> @@ -53,8 +55,10 @@
>  
>  /* operations for longs and pointers */
>  #define PPC_LL		stringify_in_c(lwz)
> +#define PPC_LLU		stringify_in_c(lwzu)
>  #define PPC_STL		stringify_in_c(stw)
>  #define PPC_STLU	stringify_in_c(stwu)
> +#define PPC_ROTLI	stringify_in_c(rotlwi)
>  #define PPC_LCMPI	stringify_in_c(cmpwi)
>  #define PPC_LCMPLI	stringify_in_c(cmplwi)
>  #define PPC_LCMP	stringify_in_c(cmpw)
> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
> index 9b8cedf618f4..8fdcb532de72 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -13,6 +13,7 @@
>  #define __HAVE_ARCH_MEMCHR
>  #define __HAVE_ARCH_MEMSET16
>  #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
> +#define __HAVE_ARCH_STRLEN
>  
>  extern char * strcpy(char *,const char *);
>  extern char * strncpy(char *,const char *, __kernel_size_t);
> diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
> index 4b41970e9ed8..238f61e2024f 100644
> --- a/arch/powerpc/lib/string.S
> +++ b/arch/powerpc/lib/string.S
> @@ -67,3 +67,60 @@ _GLOBAL(memchr)
>  2:	li	r3,0
>  	blr
>  EXPORT_SYMBOL(memchr)
> +
> +_GLOBAL(strlen)
> +	andi.   r9, r3, (SZL - 1)
> +	addi	r10, r3, -SZL
> +	bne-	1f
> +2:	lis	r6, 0x8080
> +	ori	r6, r6, 0x8080		/* r6 = 0x80808080 (himagic) */
> +#ifdef CONFIG_PPC64
> +	rldimi	r6, r6, 32, 0		/* r6 = 0x8080808080808080 (himagic) */
> +#endif
> +	PPC_ROTLI  r7, r6, 1 		/* r7 = 0x01010101(01010101) (lomagic)*/
> +	.balign IFETCH_ALIGN_BYTES
> +3:	PPC_LLU	r9, SZL(r10)
> +	/* ((x - lomagic) & ~x & himagic) == 0 means no byte in x is NUL */
> +	subf	r8, r7, r9
> +	and.	r8, r8, r6
> +	beq+	3b
> +	andc.	r8, r8, r9
> +	beq+	3b
> +#ifdef CONFIG_PPC64
> +	rldicl.	r8, r9, 8, 56
> +	beq	20f
> +	rldicl.	r8, r9, 16, 56
> +	beq	21f
> +	rldicl.	r8, r9, 24, 56
> +	beq	22f
> +	rldicl.	r8, r9, 32, 56
> +	beq	23f
> +	addi	r10, r10, 4
> +#endif
> +	rlwinm.	r8, r9, 0, 0xff000000
> +	beq	20f
> +	rlwinm.	r8, r9, 0, 0x00ff0000
> +	beq	21f
> +	rlwinm.	r8, r9, 0, 0x0000ff00
> +	beq	22f
> +23:	subf	r3, r3, r10

Actually these rlwinm. can likely be replaced by a single
cntlzw/cntlzd; for 32-bit something like:

	cntlzw  r8,r9
	subf    r3,r3,r10	
	srwi	r8,r8,3
	add	r3,r3,r8
	blr

and similarly for 64-bit, but with cntlzd.

	Gabriel


> +	addi	r3, r3, 3
> +	blr
> +22:	subf	r3, r3, r10
> +	addi	r3, r3, 2
> +	blr
> +21:	subf	r3, r3, r10
> +	addi	r3, r3, 1
> +	blr
> +19:	addi	r10, r10, (SZL - 1)
> +20:	subf	r3, r3, r10
> +	blr
> +
> +1:	lbz	r9, SZL(r10)
> +	addi	r10, r10, 1
> +	cmpwi	cr1, r9, 0
> +	andi.	r9, r10, (SZL - 1)
> +	beq	cr1, 19b
> +	bne	1b
> +	b	2b
> +EXPORT_SYMBOL(strlen)
> -- 
> 2.13.3
> 


More information about the Linuxppc-dev mailing list