gcc optimizes loops badly.
Joakim Tjernlund
joakim.tjernlund at lumentis.se
Fri Jan 3 02:57:22 EST 2003
Hi Daniel
Thanks for running the test for me.
The option "-mregnames" exists only on gcc for PPC.
Results are as before for x86. crc32do_while is the
winner followed by crc32do_while_dec.
Gcc should be able to generate the same code for
crc32org and crc32do_while, it's a simple optimization.
crc32do_while_dec is possibly only useful on PPC.
On PPC I expect crc32do_while_dec to be the winner.
Do you have a gcc 3.2 which will generate PPC assembly?
Jocke
PS.
You don't have to be on the list to post to it. I will
CC the list for now.
>
>
> Jocke,
>
> The option "-mregnames" no longer exists in version 3.2 of gcc. I
> couldn't find anything equivalent. I ran it without that option (gcc -S
> -O2 testcode.c) and produced the following on an i686 RedHat 7.3 box
> using gcc 3.2 (gcc 3.2.1 is the latest release I believe)
>
> I am not on the list, hence I cannot CC the list. This message was
> forwarded to me from someone else.
>
> Dan Eisenhut
> GE Medical Systems - Information Technologies
> daniel.eisenhut at med.ge.com
> 414-362-3151
>
>
> .file "testcode.c"
> .text
> .align 2
> .p2align 4,,15
> .globl crc32org
> .type crc32org,@function
> crc32org:
> pushl %ebp
> movl %esp, %ebp
> pushl %esi
> movl 16(%ebp), %edx
> pushl %ebx
> movl 8(%ebp), %ecx
> movl 12(%ebp), %ebx
> decl %edx
> cmpl $-1, %edx
> je .L7
> movl $crc32_table, %esi
> .p2align 4,,15
> .L5:
> movzbl (%ebx), %eax
> decl %edx
> incl %ebx
> xorb %cl, %al
> shrl $8, %ecx
> movzbl %al, %eax
> xorl (%esi,%eax,4), %ecx
> cmpl $-1, %edx
> jne .L5
> .L7:
> popl %ebx
> movl %ecx, %eax
> popl %esi
> popl %ebp
> ret
> .Lfe1:
> .size crc32org,.Lfe1-crc32org
> .align 2
> .p2align 4,,15
> .globl crc32do_while
> .type crc32do_while,@function
> crc32do_while:
> pushl %ebp
> movl %esp, %ebp
> pushl %esi
> movl 8(%ebp), %edx
> pushl %ebx
> movl 16(%ebp), %ebx
> movl 12(%ebp), %ecx
> testl %ebx, %ebx
> je .L9
> movl $crc32_table, %esi
> .p2align 4,,15
> .L10:
> movzbl (%ecx), %eax
> incl %ecx
> xorb %dl, %al
> shrl $8, %edx
> movzbl %al, %eax
> xorl (%esi,%eax,4), %edx
> decl %ebx
> jne .L10
> .L9:
> popl %ebx
> movl %edx, %eax
> popl %esi
> popl %ebp
> ret
> .Lfe2:
> .size crc32do_while,.Lfe2-crc32do_while
> .align 2
> .p2align 4,,15
> .globl crc32do_while_dec
> .type crc32do_while_dec,@function
> crc32do_while_dec:
> pushl %ebp
> movl %esp, %ebp
> pushl %esi
> movl 8(%ebp), %edx
> pushl %ebx
> movl 16(%ebp), %ebx
> movl 12(%ebp), %ecx
> testl %ebx, %ebx
> je .L15
> decl %ecx
> movl $crc32_table, %esi
> .p2align 4,,15
> .L16:
> incl %ecx
> movzbl (%ecx), %eax
> xorb %dl, %al
> shrl $8, %edx
> movzbl %al, %eax
> xorl (%esi,%eax,4), %edx
> decl %ebx
> jne .L16
> .L15:
> popl %ebx
> movl %edx, %eax
> popl %esi
> popl %ebp
> ret
> .Lfe3:
> .size crc32do_while_dec,.Lfe3-crc32do_while_dec
> .ident "GCC: (GNU) 3.2"
>
>
> -----Original Message-----
> From: Joakim Tjernlund [mailto:Joakim.Tjernlund at lumentis.se]
> Sent: Wednesday, January 01, 2003 8:45 AM
> To: linuxppc-embedded at lists.linuxppc.org
> Subject: gcc optimizes loops badly.
>
>
>
> I have spent some time to optimize the crc32 function since JFFS2 uses
> it heavily. I found that
> gcc 2.95.3 optimizes loops badly, even gcc 2.96 RH produces better code
> for x86 in some cases.
>
> So I optimized the C code a bit and got much better results.
> Now I wonder how recent (>= 3.2) gcc performs. Could somebody run gcc -S
> -O2 -mregnames on
> functions below and mail me the results?
>
> Jocke
>
> These are different versions of the same crc32 function:
> #include <linux/types.h>
>
> extern const __u32 crc32_table[256];
>
> /* Return a 32-bit CRC of the contents of the buffer. */
>
> __u32 crc32org(__u32 val, const void *ss, unsigned int len)
> {
> const unsigned char *s = ss;
>
> while (len--){
> val = crc32_table[(val ^ *s++) & 0xff] ^ (val >> 8);
> }
> return val;
> }
> __u32 crc32do_while(__u32 val, const void *ss, unsigned int len)
> {
> const unsigned char *s = ss;
>
> if(len){
> do {
> val = crc32_table[(val ^ *s++) & 0xff] ^ (val >> 8);
> } while (--len);
> }
> return val;
> }
> __u32 crc32do_while_dec(__u32 val, const void *ss, unsigned int len)
> {
> const unsigned char *s = ss;
>
> if(len){
> --s;
> do {
> val = crc32_table[(val ^ *(++s)) & 0xff] ^ (val >> 8);
> } while (--len);
> }
> return val;
> }
>
> and the resulting assembly:
> .file "crc32.c"
> gcc2_compiled.:
> .section ".text"
> .align 2
> .globl crc32org
> .type crc32org,@function
> crc32org:
> cmpwi %cr0,%r5,0
> addi %r5,%r5,-1
> bclr 12,2
> lis %r9,crc32_table@ha
> la %r10,crc32_table@l(%r9)
> .L18:
> lbz %r0,0(%r4)
> cmpwi %cr0,%r5,0
> xor %r0,%r3,%r0
> rlwinm %r0,%r0,2,22,29
> lwzx %r11,%r10,%r0
> srwi %r9,%r3,8
> xor %r3,%r11,%r9
> addi %r4,%r4,1
> addi %r5,%r5,-1
> bc 4,2,.L18
> blr
> .Lfe1:
> .size crc32org,.Lfe1-crc32org
> .align 2
> .globl crc32do_while
> .type crc32do_while,@function
> crc32do_while:
> mr. %r0,%r5
> mtctr %r0
> bclr 12,2
> lis %r9,crc32_table@ha
> la %r10,crc32_table@l(%r9)
> .L25:
> lbz %r0,0(%r4)
> srwi %r11,%r3,8
> xor %r0,%r3,%r0
> rlwinm %r0,%r0,2,22,29
> lwzx %r9,%r10,%r0
> addi %r4,%r4,1
> xor %r3,%r9,%r11
> bdnz .L25
> blr
> .Lfe2:
> .size crc32do_while,.Lfe2-crc32do_while
> .align 2
> .globl crc32do_while_dec
> .type crc32do_while_dec,@function
> crc32do_while_dec:
> mr. %r0,%r5
> mtctr %r0
> bclr 12,2
> lis %r9,crc32_table@ha
> la %r10,crc32_table@l(%r9)
> addi %r4,%r4,-1
> .L31:
> lbzu %r0,1(4)
> srwi %r11,%r3,8
> xor %r0,%r3,%r0
> rlwinm %r0,%r0,2,22,29
> lwzx %r9,%r10,%r0
> xor %r3,%r9,%r11
> bdnz .L31
> blr
>
>
** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/
More information about the Linuxppc-embedded
mailing list