[PATCH v4 2/5] lib/lz4: update LZ4 decompressor module
Jonathan Liu
net147 at gmail.com
Tue May 28 23:28:16 AEST 2024
Hi Jianan,
Here are the LZ4 decompression times I got running "time unlz4 ..." on
Rock 4 SE with RK3399 CPU.
v2024.04: 1.329 seconds
v2024.04 with 26c7fdadcb7 ("lib/lz4: update LZ4 decompressor module")
reverted: 0.665 seconds
v2024.04 with your patch: 1.216 seconds
I managed to get better performance by optimizing it myself.
v2024.04 with my patch: 0.785 seconds
With my patch, it makes no difference if I use __builtin_memcpy or
memcpy in lz4.c and lz4_wrapper.c so I just left it using memcpy.
It is still slower than reverting the LZ4 update though.
My patch:
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -41,6 +41,16 @@ static FORCE_INLINE u16 LZ4_readLE16(const void *src)
return get_unaligned_le16(src);
}
+static FORCE_INLINE void LZ4_copy2(void *dst, const void *src)
+{
+ put_unaligned(get_unaligned((const u16 *)src), (u16 *)dst);
+}
+
+static FORCE_INLINE void LZ4_copy4(void *dst, const void *src)
+{
+ put_unaligned(get_unaligned((const u32 *)src), (u32 *)dst);
+}
+
static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
{
put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst);
@@ -215,7 +225,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
&& likely((endOnInput ? ip < shortiend : 1) &
(op <= shortoend))) {
/* Copy the literals */
- memcpy(op, ip, endOnInput ? 16 : 8);
+ LZ4_copy8(op, ip);
+ if (endOnInput)
+ LZ4_copy8(op + 8, ip + 8);
+
op += length; ip += length;
/*
@@ -234,9 +247,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
(offset >= 8) &&
(dict == withPrefix64k || match >= lowPrefix)) {
/* Copy the match. */
- memcpy(op + 0, match + 0, 8);
- memcpy(op + 8, match + 8, 8);
- memcpy(op + 16, match + 16, 2);
+ LZ4_copy8(op, match);
+ LZ4_copy8(op + 8, match + 8);
+ LZ4_copy2(op + 16, match + 16);
op += length + MINMATCH;
/* Both stages worked, load the next token. */
continue;
@@ -466,7 +479,7 @@ _copy_match:
op[2] = match[2];
op[3] = match[3];
match += inc32table[offset];
- memcpy(op + 4, match, 4);
+ LZ4_copy4(op + 4, match);
match -= dec64table[offset];
} else {
LZ4_copy8(op, match);
Let me know if you have any further suggestions.
Thanks.
Regards,
Jonathan
On Sun, 26 May 2024 at 22:18, Jianan Huang <jnhuang95 at gmail.com> wrote:
>
> Hi Jonathan,
>
> Could you please try the following patch? It replaces all memcpy() calls in lz4 with __builtin_memcpy().
>
> diff --git a/lib/lz4.c b/lib/lz4.c
> index d365dc727c..2afe31c1c3 100644
> --- a/lib/lz4.c
> +++ b/lib/lz4.c
> @@ -34,6 +34,8 @@
> #include <asm/unaligned.h>
> #include <u-boot/lz4.h>
>
> +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
> +
> #define FORCE_INLINE inline __attribute__((always_inline))
>
> static FORCE_INLINE u16 LZ4_readLE16(const void *src)
> @@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
> && likely((endOnInput ? ip < shortiend : 1) &
> (op <= shortoend))) {
> /* Copy the literals */
> - memcpy(op, ip, endOnInput ? 16 : 8);
> + LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
> op += length; ip += length;
>
> /*
> @@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
> (offset >= 8) &&
> (dict == withPrefix64k || match >= lowPrefix)) {
> /* Copy the match. */
> - memcpy(op + 0, match + 0, 8);
> - memcpy(op + 8, match + 8, 8);
> - memcpy(op + 16, match + 16, 2);
> + LZ4_memcpy(op + 0, match + 0, 8);
> + LZ4_memcpy(op + 8, match + 8, 8);
> + LZ4_memcpy(op + 16, match + 16, 2);
> op += length + MINMATCH;
> /* Both stages worked, load the next token. */
> continue;
> @@ -416,7 +418,7 @@ _copy_match:
> size_t const copySize = (size_t)(lowPrefix - match);
> size_t const restSize = length - copySize;
>
> - memcpy(op, dictEnd - copySize, copySize);
> + LZ4_memcpy(op, dictEnd - copySize, copySize);
> op += copySize;
> if (restSize > (size_t)(op - lowPrefix)) {
> /* overlap copy */
> @@ -426,7 +428,7 @@ _copy_match:
> while (op < endOfMatch)
> *op++ = *copyFrom++;
> } else {
> - memcpy(op, lowPrefix, restSize);
> + LZ4_memcpy(op, lowPrefix, restSize);
> op += restSize;
> }
> }
> @@ -452,7 +454,7 @@ _copy_match:
> while (op < copyEnd)
> *op++ = *match++;
> } else {
> - memcpy(op, match, mlen);
> + LZ4_memcpy(op, match, mlen);
> }
> op = copyEnd;
> if (op == oend)
> @@ -466,7 +468,7 @@ _copy_match:
> op[2] = match[2];
> op[3] = match[3];
> match += inc32table[offset];
> - memcpy(op + 4, match, 4);
> + LZ4_memcpy(op + 4, match, 4);
> match -= dec64table[offset];
> } else {
> LZ4_copy8(op, match);
> diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c
> index 4d48e7b0e8..e09c8d7057 100644
> --- a/lib/lz4_wrapper.c
> +++ b/lib/lz4_wrapper.c
> @@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst, size_t *dstn)
>
> if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) {
> size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out));
> - memcpy(out, in, size);
> + LZ4_memcpy(out, in, size);
> out += size;
> if (size < block_size) {
> ret = -ENOBUFS; /* output overrun */
>
>
> Thanks,
>
> Jianan
>
> On 2024/5/26 16:06, Jonathan Liu wrote:
>
> Hi Gao,
>
> On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao at linux.alibaba.com> wrote:
>
> Hi,
>
> On 2024/5/24 22:26, Jonathan Liu wrote:
>
> Hi Jianan,
>
> On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95 at gmail.com> wrote:
>
> Update the LZ4 compression module based on LZ4 v1.8.3 in order to
> use the newest LZ4_decompress_safe_partial() which can now decode
> exactly the nb of bytes requested.
>
> Signed-off-by: Huang Jianan <jnhuang95 at gmail.com>
>
> I noticed after this commit LZ4 decompression is slower.
> ulz4fn function call takes 1.209670 seconds with this commit.
> After reverting this commit, the ulz4fn function call takes 0.587032 seconds.
>
> I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
> using -9 option for maximum compression) on RK3399.
>
> Any ideas why it is slower with this commit and how the performance
> regression can be fixed?
>
> Just the quick glance, I think the issue may be due to memcpy/memmove
> since it seems the main difference between these two codebases
> (I'm not sure which LZ4 version the old codebase was based on) and
> the new version mainly relies on memcpy/memmove instead of its own
> versions.
>
> Would you mind to check the assembly how memcpy/memset is generated
> on your platform?
>
> Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
> 000000000028220c <memset>:
> #if !CONFIG_IS_ENABLED(TINY_MEMSET)
> unsigned long cl = 0;
> int i;
>
> /* do it one word at a time (32 bits or 64 bits) while possible */
> if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
> 28220c: f2400803 ands x3, x0, #0x7
> 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any
> for (i = 0; i < sizeof(*sl); i++) {
> cl <<= 8;
> cl |= c & 0xff;
> 282214: 92401c26 and x6, x1, #0xff
> unsigned long cl = 0;
> 282218: d2800004 mov x4, #0x0 // #0
> 28221c: 52800105 mov w5, #0x8 // #8
> cl |= c & 0xff;
> 282220: aa0420c4 orr x4, x6, x4, lsl #8
> for (i = 0; i < sizeof(*sl); i++) {
> 282224: 710004a5 subs w5, w5, #0x1
> 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any
> }
> while (count >= sizeof(*sl)) {
> 28222c: cb030045 sub x5, x2, x3
> 282230: f1001cbf cmp x5, #0x7
> 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore
> 282238: d343fc43 lsr x3, x2, #3
> 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
> 282240: 9b047c63 mul x3, x3, x4
> 282244: 8b030042 add x2, x2, x3
> 282248: cb030003 sub x3, x0, x3
> unsigned long *sl = (unsigned long *) s;
> 28224c: d2800004 mov x4, #0x0 // #0
> count -= sizeof(*sl);
> }
> }
> #endif /* fill 8 bits at a time */
> s8 = (char *)sl;
> while (count--)
> 282250: eb04005f cmp x2, x4
> 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any
> *s8++ = c;
>
> return s;
> }
> 282258: d65f03c0 ret
> *sl++ = cl;
> 28225c: f8236804 str x4, [x0, x3]
> count -= sizeof(*sl);
> 282260: 91002063 add x3, x3, #0x8
> 282264: 17fffff2 b 28222c <memset+0x20>
> unsigned long *sl = (unsigned long *) s;
> 282268: aa0003e3 mov x3, x0
> 28226c: 17fffff8 b 28224c <memset+0x40>
> *s8++ = c;
> 282270: 38246861 strb w1, [x3, x4]
> 282274: 91000484 add x4, x4, #0x1
> 282278: 17fffff6 b 282250 <memset+0x44>
>
> 000000000028227c <memcpy>:
> __used void * memcpy(void *dest, const void *src, size_t count)
> {
> unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src;
> char *d8, *s8;
>
> if (src == dest)
> 28227c: eb01001f cmp x0, x1
> 282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none
> return dest;
>
> /* while all data is aligned (common case), copy a word at a time */
> if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) {
> 282284: aa010003 orr x3, x0, x1
> 282288: f2400863 ands x3, x3, #0x7
> 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none
> 282290: aa0003e4 mov x4, x0
> 282294: d2800003 mov x3, #0x0 // #0
> }
> }
> /* copy the rest one byte at a time */
> d8 = (char *)dl;
> s8 = (char *)sl;
> while (count--)
> 282298: eb03005f cmp x2, x3
> 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any
> *d8++ = *s8++;
>
> return dest;
> }
> 2822a0: d65f03c0 ret
> *dl++ = *sl++;
> 2822a4: f8636824 ldr x4, [x1, x3]
> 2822a8: f8236804 str x4, [x0, x3]
> count -= sizeof(*dl);
> 2822ac: 91002063 add x3, x3, #0x8
> while (count >= sizeof(*dl)) {
> 2822b0: cb030044 sub x4, x2, x3
> 2822b4: f1001c9f cmp x4, #0x7
> 2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore
> 2822bc: d343fc43 lsr x3, x2, #3
> 2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
> 2822c4: 9b047c63 mul x3, x3, x4
> 2822c8: 8b030042 add x2, x2, x3
> 2822cc: cb030004 sub x4, x0, x3
> 2822d0: cb030021 sub x1, x1, x3
> 2822d4: 17fffff0 b 282294 <memcpy+0x18>
> *d8++ = *s8++;
> 2822d8: 38636825 ldrb w5, [x1, x3]
> 2822dc: 38236885 strb w5, [x4, x3]
> 2822e0: 91000463 add x3, x3, #0x1
> 2822e4: 17ffffed b 282298 <memcpy+0x1c>
>
>
> I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in
> the .config (but leaving it disabled in SPL/TPL) and it results in
> Synchronous Abort:
> U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000)
> Trying to boot from MMC1
> NOTICE: BL31: v1.3(release):8f40012ab
> NOTICE: BL31: Built : 14:20:53, Feb 16 2023
> NOTICE: BL31: Rockchip release version: v1.1
> INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3
> INFO: Using opteed sec cpu_context!
> INFO: boot cpu mask: 0
> INFO: plat_rockchip_pmu_init(1203): pd status 3e
> INFO: BL31: Initializing runtime services
> WARNING: No OPTEE provided by BL2 boot loader, Booting device without
> OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK
> ERROR: Error initializing runtime service opteed_fast
> INFO: BL31: Preparing for EL3 exit to normal world
> INFO: Entry point address = 0x200000
> INFO: SPSR = 0x3c9
> "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1
> elr: 000000000020233c lr : 000000000026a388
> x0 : 00000000002fbf38 x1 : 00000000002957e1
> x2 : 0000000000000008 x3 : 0000000000000065
> x4 : 00000000002957e9 x5 : 00000000002fbf40
> x6 : 0000000000000065 x7 : 0000000000000003
> x8 : 00000000002c7960 x9 : 000000000000ffd0
> x10: 00000000002fbc5c x11: 00000000000132e8
> x12: 00000000002fbce8 x13: 00000000002c7960
> x14: 00000000002c7960 x15: 0000000000000000
> x16: 0000000000000000 x17: 0000000000000000
> x18: 00000000002fbe30 x19: 0000000000000007
> x20: 00000000002957d8 x21: 0000000000000009
> x22: 000000000029d189 x23: 0000000000000020
> x24: 00000000002fbf38 x25: 00000000002957e7
> x26: 00000000002957b2 x27: 0000000000007fff
> x28: 0000000000000000 x29: 00000000002fbcc0
>
> Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026)
> Resetting CPU ...
>
> resetting ...
>
> Thanks,
> Gao Xiang
>
> Regards,
> Jonathan
More information about the Linux-erofs
mailing list