Question about MPC7410/MPC7455 Altivec and performance.

Fri Feb 14 14:25:05 EST 2003

Greetings!

On a PowerMac G4 (7455), the test C code shown towards the
end of this file was built two ways: once with standard gcc,
and once with Altivec-enabled gcc (with the C code first run
through an Altivec preprocessor).

The Altivec/vectorized code performs better than the non-vectorized
code by about 40%.

The same binaries (statically linked) were tried on an
MPC7410-based board. There, the vectorized program was
observed to be 18% slower than the non-vectorized code.

The board is MPC7410 with 8260 in companion mode (core disabled),
and the Linux kernel has been Altivec enabled (the program anyway
will not work if Altivec is disabled in the Linux kernel).

There is one change I have made to the Linux kernel, which can
be described as follows. I don't see how it can affect Altivec,
but I am mentioning it here just in case I am missing something.

The memory controller is the MPC8260, and it does not recognize the
TLBIE transaction type (0x18) as a special case. The Linux kernel code
performing the TLBIEs currently provides the virtual/effective
address whose TLB entry needs invalidation. To work around this, I
modify the address passed to tlbie so that only bits 14 to 19 remain
the same as the original address, and the other bits are zeroed
(essentially, the address is guaranteed to fall in the physical memory
address range, and the memory controller responds).

Anyway, this seems to work quite well under different combinations
of non-Altivec/non-vectorized load conditions.

The Linux kernel version is 2.4.20, and GCC is 2.95.2 (with the patch
that adds support for the "-fvec" option, available at altivec.org).

Any ideas why 7410 performance would degrade as described above?
Or how this could be debugged?

Best regards,
-Arun.

-------------------------------------------------------------
#include <stdlib.h>     /* atoi */

/*
 * Scalar reference version of the benchmark.
 *
 * Runs a 99-element single-precision dot product of a[] and b[] av[1]
 * times, accumulating every product into x.
 *
 * Returns 0 on success, or 1 if no iteration count was given on the
 * command line.
 */
int
main(int ac, char *av[]) {

        float a[99], b[99], x = 0.0f;
        int i, j, n;

        /* av[1] is only valid when an argument was actually supplied. */
        if (ac < 2)
                return 1;
        n = atoi(av[1]);

        /*
         * Give the operands defined values: reading uninitialized floats
         * is undefined behavior and would make the timing depend on
         * whatever bit patterns happen to be on the stack.
         */
        for (j = 0; j < 99; j++) {
                a[j] = 1.0f;
                b[j] = 1.0f;
        }

        /*
         * NOTE(review): x is never read after the loops, so an optimizing
         * compiler may discard the arithmetic -- confirm in the generated
         * code before trusting the timing.
         */
        for ( i=0; i < n; i++ )
                for(j=0; j<99; j++)
                        x += a[j]*b[j];

        return 0;
}
--------------------------------------------------------------
 int  main( int ac, char *av[] )
 {
    float a[99], b[99], x;
    int i, j, n = atoi(av[1]);
    for ( i=0; i < n; i++  )
    {
       if ( (((int )&a[0] | (int )&b[0]) & 15) != 0 )
       {
          {
             {
                int j1, j2, j3, j4, j5, j6, j7;
                vector float a1v, b1v, x1v, r2v;
                vector float x2v = (vector float )(0);
                vector float r6v = (vector float )(0);
                vector float r1v = (vector float )(0.);
                vector float a9v, a10v, b9v, b10v;
                vector float r5v = (vector float )(0);
                vector float a7v, a8v, b7v, b8v;
                vector float r4v = (vector float )(0);
                vector float a5v, a6v, b5v, b6v;
                vector float r3v = (vector float )(0);
                vector float a2v, a3v;
                vector unsigned char a4v = vec_lvsl(0, &a[0]);
                vector float b2v, b3v;
                vector unsigned char b4v = vec_lvsl(0, &b[0]);
                static vector unsigned long j1v[3] =  { (
                vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
                } ;
                vector float r7v;
                vector signed short k1v = (vector signed short )(0, 0, 0,
                0, 0, 0, 1, 0);
                vec_mtvscr( k1v );
                *((float *)&x2v) = x;
                x1v = vec_splat(x2v, 0);
                a2v = vec_ld(0, &a[0]);
                b2v = vec_ld(0, &b[0]);
                for ( j1 = 0; j1 < (99 - 4 * 4) + 1; j1 += 4 * 4 )
                {
                   j3 = j1 * sizeof(int );
                   j2 = j3 + 4 * sizeof(int );
                   a3v = vec_ld(j2, &a[0]);
                   b3v = vec_ld(j2, &b[0]);
                   a5v = vec_ld(j2 + 16, &a[0]);
                   b5v = vec_ld(j2 + 16, &b[0]);
                   a7v = vec_ld(j2 + 32, &a[0]);
                   b7v = vec_ld(j2 + 32, &b[0]);
                   a1v = vec_perm(a2v, a3v, a4v);
                   a2v = vec_ld(j2 + 48, &a[0]);
                   b1v = vec_perm(b2v, b3v, b4v);
                   b2v = vec_ld(j2 + 48, &b[0]);
                   r1v = vec_madd(a1v, b1v, r1v);
                   a6v = vec_perm(a3v, a5v, a4v);
                   b6v = vec_perm(b3v, b5v, b4v);
                   r3v = vec_madd(a6v, b6v, r3v);
                   a8v = vec_perm(a5v, a7v, a4v);
                   b8v = vec_perm(b5v, b7v, b4v);
                   r4v = vec_madd(a8v, b8v, r4v);
                   a10v = vec_perm(a7v, a2v, a4v);
                   b10v = vec_perm(b7v, b2v, b4v);
                   r5v = vec_madd(a10v, b10v, r5v);
                }
                if ( j1 )
                {
                   r1v = vec_add(r1v, r3v);
                   r1v = vec_add(r1v, r4v);
                   r1v = vec_add(r1v, r5v);
                }
                j3 = j1 * sizeof(int );
                j2 = j3 + 4 * sizeof(int );
                a3v = vec_ld(j2, &a[0]);
                a1v = vec_perm(a2v, a3v, a4v);
                b3v = vec_ld(j2, &b[0]);
                b1v = vec_perm(b2v, b3v, b4v);
                r7v = vec_sel(a1v, r6v, j1v[3-1]);
                r1v = vec_madd(r7v, b1v, r1v);
                r2v = vec_sld(r1v, r1v, 8);
                r1v = vec_add(r1v, r2v);
                r2v = vec_sld(r1v, r1v, 4);
                r1v = vec_add(r1v, r2v);
                r1v = vec_add(r1v, x1v);
                vec_ste(r1v, 0, &x);
             }
          }
       }
       else
       {
          {
             {
                int j8, j9, j10, j11, j12, j13, j14;
                vector float a11v, b11v, x3v, r9v;
                vector float x4v = (vector float )(0);
                vector float r13v = (vector float )(0);
                vector float r8v = (vector float )(0.);
                vector float a14v, b14v;
                vector float r12v = (vector float )(0);
                vector float a13v, b13v;
                vector float r11v = (vector float )(0);
                vector float a12v, b12v;
                vector float r10v = (vector float )(0);
                static vector unsigned long j2v[3] =  { (
                vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
                0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
                } ;
                vector float r14v;
                vector signed short k2v = (vector signed short )(0, 0, 0,
                0, 0, 0, 1, 0);
                vec_mtvscr( k2v );
                *((float *)&x4v) = x;
                x3v = vec_splat(x4v, 0);
                for ( j8 = 0; j8 < (99 - 4 * 4) + 1; j8 += 4 * 4 )
                {
                   j10 = j8 * sizeof(int );
                   j9 = j10;
                   a11v = vec_ld(j10, &a[0]);
                   b11v = vec_ld(j10, &b[0]);
                   a12v = vec_ld(j10 + 16, &a[0]);
                   b12v = vec_ld(j10 + 16, &b[0]);
                   a13v = vec_ld(j10 + 32, &a[0]);
                   b13v = vec_ld(j10 + 32, &b[0]);
                   a14v = vec_ld(j10 + 48, &a[0]);
                   b14v = vec_ld(j10 + 48, &b[0]);
                   r8v = vec_madd(a11v, b11v, r8v);
                   r10v = vec_madd(a12v, b12v, r10v);
                   r11v = vec_madd(a13v, b13v, r11v);
                   r12v = vec_madd(a14v, b14v, r12v);
                }
                if ( j8 )
                {
                   r8v = vec_add(r8v, r10v);
                   r8v = vec_add(r8v, r11v);
                   r8v = vec_add(r8v, r12v);
                }
                j10 = j8 * sizeof(int );
                j9 = j10;
                a11v = vec_ld(j10, &a[0]);
                b11v = vec_ld(j10, &b[0]);
                r14v = vec_sel(a11v, r13v, j2v[3-1]);
                r8v = vec_madd(r14v, b11v, r8v);
                r9v = vec_sld(r8v, r8v, 8);
                r8v = vec_add(r8v, r9v);
                r9v = vec_sld(r8v, r8v, 4);
                r8v = vec_add(r8v, r9v);
                r8v = vec_add(r8v, x3v);
                vec_ste(r8v, 0, &x);
             }
          }
       }
    }
    return 0;
 }

--------------------------------------------------------------

** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/