Question about MPC7410/MPC7455 Altivec and performance.
Arun Dharankar
ADharankar at attbi.com
Mon Feb 17 04:57:34 EST 2003
Greetings... this turns out to be a vectorizer issue. If
the "float" arrays (in the sample/test C code) are moved out
into global/extern scope, the program performs almost 100%
better than the non-vectorized code (on the 745x and 7410).
Best regards,
-Arun.
On Thursday 13 February 2003 10:25 pm, I wrote:
> Greetings!
>
> On a PowerMAC G4 (7455) a test C code as shown towards the
> end of this file was tried: once with standard gcc, and once with
> gcc with Altivec enabled (and the C code preprocessed with an
> Altivec preprocessor).
>
> The Altivec/vectorized code performs better than the non-vectorized
> code by about 40%.
>
>
> The same binaries (statically linked) were tried on a MPC7410
> based board. The performance of the vectorized program was
> observed to be 18% slower than the non-vectorized code.
>
>
>
> The board is MPC7410 with 8260 in companion mode (core disabled),
> and the Linux kernel has been Altivec enabled (the program anyway
> will not work if Altivec is disabled in the Linux kernel).
>
>
>
> There is one change I have made to the Linux kernel, which can
> be described as follows. I don't see how it can affect Altivec,
> but I am mentioning it here - just in case I am missing something.
>
> The memory controller is MPC8260, and does not recognize the TLBIE
> transaction type (0x18) as a special case. The Linux kernel code
> performing the TLBIEs currently provides the virtual/effective
> address whose TLB entry needs invalidation. To work around this, I
> modify the address passed to tlbie so that only bits 14 to 19 remain the
> same as the original address, and the other bits are zeroed (essentially,
> the address is then guaranteed to fall within the physical memory address
> range, and the memory controller responds).
>
> Anyway, this seems to work quite well under different combinations
> of non-Altivec/non-vectorized load conditions.
>
>
> The Linux kernel version is 2.4.20, and GCC is 2.95.2 (with a patch
> to support the "-fvec" option, available at altivec.org).
>
>
> Any ideas why 7410 performance would degrade as described above?
> Or how this could be debugged?
>
>
> Best regards,
> -Arun.
>
> -------------------------------------------------------------
> int
> main(int ac, char *av[]) {
>
> float a[99], b[99], x;
> int i, j, n = atoi(av[1]);
>
> for ( i=0; i < n; i++ )
> for(j=0; j<99; j++)
> x += a[j]*b[j];
>
> return 0;
> }
> --------------------------------------------------------------
> int main( int ac, char *av[] )
> {
> float a[99], b[99], x;
> int i, j, n = atoi(av[1]);
> for ( i=0; i < n; i++ )
> {
> if ( (((int )&a[0] | (int )&b[0]) & 15) != 0 )
> {
> {
> {
> int j1, j2, j3, j4, j5, j6, j7;
> vector float a1v, b1v, x1v, r2v;
> vector float x2v = (vector float )(0);
> vector float r6v = (vector float )(0);
> vector float r1v = (vector float )(0.);
> vector float a9v, a10v, b9v, b10v;
> vector float r5v = (vector float )(0);
> vector float a7v, a8v, b7v, b8v;
> vector float r4v = (vector float )(0);
> vector float a5v, a6v, b5v, b6v;
> vector float r3v = (vector float )(0);
> vector float a2v, a3v;
> vector unsigned char a4v = vec_lvsl(0, &a[0]);
> vector float b2v, b3v;
> vector unsigned char b4v = vec_lvsl(0, &b[0]);
> static vector unsigned long j1v[3] = { (
> vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
> 0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
> 0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
> } ;
> vector float r7v;
> vector signed short k1v = (vector signed short )(0, 0, 0,
> 0, 0, 0, 1, 0);
> vec_mtvscr( k1v );
> *((float *)&x2v) = x;
> x1v = vec_splat(x2v, 0);
> a2v = vec_ld(0, &a[0]);
> b2v = vec_ld(0, &b[0]);
> for ( j1 = 0; j1 < (99 - 4 * 4) + 1; j1 += 4 * 4 )
> {
> j3 = j1 * sizeof(int );
> j2 = j3 + 4 * sizeof(int );
> a3v = vec_ld(j2, &a[0]);
> b3v = vec_ld(j2, &b[0]);
> a5v = vec_ld(j2 + 16, &a[0]);
> b5v = vec_ld(j2 + 16, &b[0]);
> a7v = vec_ld(j2 + 32, &a[0]);
> b7v = vec_ld(j2 + 32, &b[0]);
> a1v = vec_perm(a2v, a3v, a4v);
> a2v = vec_ld(j2 + 48, &a[0]);
> b1v = vec_perm(b2v, b3v, b4v);
> b2v = vec_ld(j2 + 48, &b[0]);
> r1v = vec_madd(a1v, b1v, r1v);
> a6v = vec_perm(a3v, a5v, a4v);
> b6v = vec_perm(b3v, b5v, b4v);
> r3v = vec_madd(a6v, b6v, r3v);
> a8v = vec_perm(a5v, a7v, a4v);
> b8v = vec_perm(b5v, b7v, b4v);
> r4v = vec_madd(a8v, b8v, r4v);
> a10v = vec_perm(a7v, a2v, a4v);
> b10v = vec_perm(b7v, b2v, b4v);
> r5v = vec_madd(a10v, b10v, r5v);
> }
> if ( j1 )
> {
> r1v = vec_add(r1v, r3v);
> r1v = vec_add(r1v, r4v);
> r1v = vec_add(r1v, r5v);
> }
> j3 = j1 * sizeof(int );
> j2 = j3 + 4 * sizeof(int );
> a3v = vec_ld(j2, &a[0]);
> a1v = vec_perm(a2v, a3v, a4v);
> b3v = vec_ld(j2, &b[0]);
> b1v = vec_perm(b2v, b3v, b4v);
> r7v = vec_sel(a1v, r6v, j1v[3-1]);
> r1v = vec_madd(r7v, b1v, r1v);
> r2v = vec_sld(r1v, r1v, 8);
> r1v = vec_add(r1v, r2v);
> r2v = vec_sld(r1v, r1v, 4);
> r1v = vec_add(r1v, r2v);
> r1v = vec_add(r1v, x1v);
> vec_ste(r1v, 0, &x);
> }
> }
> }
> else
> {
> {
> {
> int j8, j9, j10, j11, j12, j13, j14;
> vector float a11v, b11v, x3v, r9v;
> vector float x4v = (vector float )(0);
> vector float r13v = (vector float )(0);
> vector float r8v = (vector float )(0.);
> vector float a14v, b14v;
> vector float r12v = (vector float )(0);
> vector float a13v, b13v;
> vector float r11v = (vector float )(0);
> vector float a12v, b12v;
> vector float r10v = (vector float )(0);
> static vector unsigned long j2v[3] = { (
> vector unsigned long )(0, 0XFFFFFFFF, 0XFFFFFFFF,
> 0XFFFFFFFF), (vector unsigned long )(0, 0, 0XFFFFFFFF,
> 0XFFFFFFFF), (vector unsigned long )(0, 0, 0, 0XFFFFFFFF)
> } ;
> vector float r14v;
> vector signed short k2v = (vector signed short )(0, 0, 0,
> 0, 0, 0, 1, 0);
> vec_mtvscr( k2v );
> *((float *)&x4v) = x;
> x3v = vec_splat(x4v, 0);
> for ( j8 = 0; j8 < (99 - 4 * 4) + 1; j8 += 4 * 4 )
> {
> j10 = j8 * sizeof(int );
> j9 = j10;
> a11v = vec_ld(j10, &a[0]);
> b11v = vec_ld(j10, &b[0]);
> a12v = vec_ld(j10 + 16, &a[0]);
> b12v = vec_ld(j10 + 16, &b[0]);
> a13v = vec_ld(j10 + 32, &a[0]);
> b13v = vec_ld(j10 + 32, &b[0]);
> a14v = vec_ld(j10 + 48, &a[0]);
> b14v = vec_ld(j10 + 48, &b[0]);
> r8v = vec_madd(a11v, b11v, r8v);
> r10v = vec_madd(a12v, b12v, r10v);
> r11v = vec_madd(a13v, b13v, r11v);
> r12v = vec_madd(a14v, b14v, r12v);
> }
> if ( j8 )
> {
> r8v = vec_add(r8v, r10v);
> r8v = vec_add(r8v, r11v);
> r8v = vec_add(r8v, r12v);
> }
> j10 = j8 * sizeof(int );
> j9 = j10;
> a11v = vec_ld(j10, &a[0]);
> b11v = vec_ld(j10, &b[0]);
> r14v = vec_sel(a11v, r13v, j2v[3-1]);
> r8v = vec_madd(r14v, b11v, r8v);
> r9v = vec_sld(r8v, r8v, 8);
> r8v = vec_add(r8v, r9v);
> r9v = vec_sld(r8v, r8v, 4);
> r8v = vec_add(r8v, r9v);
> r8v = vec_add(r8v, x3v);
> vec_ste(r8v, 0, &x);
> }
> }
> }
> }
> return 0;
> }
>
> --------------------------------------------------------------
>
>
** Sent via the linuxppc-embedded mail list. See http://lists.linuxppc.org/
More information about the Linuxppc-embedded
mailing list