[PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes

Mon Aug 17 21:16:55 AEST 2015

On Fri, Aug 14, 2015 at 09:44:28PM -0500, Scott Wood wrote:
> I tried a couple different benchmarks and didn't find a significant 
> difference, relative to the variability of the results running on the same 
> kernel.  A patch that claims to "optimize a bit" as its main purpose ought to 
> show some results. :-)

I tried to compare the execution time of these two code sequences with the
following test module:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/printk.h>

/*
 * test1() - run the original lock-acquire sequence 100000 times.
 *
 * In this variant the halfword load from &cpu, the cr1 compare against 1
 * and the increment all sit between lbarx and stbcx., so they are
 * re-executed on every retry of the reservation loop.
 *
 * lock is reset to 0 and cpu to 1 before each iteration, so the lbarx is
 * expected to read 0 and fall through to the stbcx.
 */
static void test1(void)
{
	int i;
	unsigned char lock, c;
	unsigned short cpu, s;

	for (i = 0; i < 100000; i++) {
		lock = 0;
		cpu = 1;

		/*
		 * %3 is the base register of lhz and %1 is RA of addi.  In
		 * both slots the hardware treats r0 as the constant 0, so
		 * these operands use the "b" constraint (any GPR except r0)
		 * rather than "r".
		 */
		asm volatile (
"1:		lbarx	%0,0,%2\n"
"		lhz	%1,0(%3)\n"
"		cmpdi	%0,0\n"
"		cmpdi	cr1,%1,1\n"
"		addi	%1,%1,1\n"
"		bne	2f\n"
"		stbcx.	%1,0,%2\n"
"		bne	1b\n"
"2:"
		: "=&r" (c), "=&b" (s)
		: "r" (&lock), "b" (&cpu)
		: "cr0", "cr1", "memory");
	}
}

/*
 * test2() - run the optimized lock-acquire sequence 100000 times.
 *
 * Here the halfword load from &cpu, the increment and the cr1 setup
 * (crclr of cr1's EQ bit) are done once, ahead of label 1, so a retry of
 * the lbarx/stbcx. loop only repeats the load-reserve, the compare and
 * the conditional store.
 *
 * lock is reset to 0 and cpu to 1 before each iteration, so the lbarx is
 * expected to read 0 and fall through to the stbcx.
 */
static void test2(void)
{
	int i;
	unsigned char lock, c;
	unsigned short cpu, s;

	for (i = 0; i < 100000; i++) {
		lock = 0;
		cpu = 1;

		/*
		 * %3 is the base register of lhz and %1 is RA of addi.  In
		 * both slots the hardware treats r0 as the constant 0, so
		 * these operands use the "b" constraint (any GPR except r0)
		 * rather than "r".
		 */
		asm volatile (
"		lhz	%1,0(%3)\n"
"		addi	%1,%1,1\n"
"		crclr	cr1*4+eq\n"
"1:		lbarx	%0,0,%2\n"
"		cmpdi	%0,0\n"
"		bne	2f\n"
"		stbcx.	%1,0,%2\n"
"		bne	1b\n"
"2:"
		: "=&r" (c), "=&b" (s)
		: "r" (&lock), "b" (&cpu)
		: "cr0", "cr1", "memory");
	}
}

/*
 * test_init() - time test1() against test2() and log the result.
 *
 * Each variant is run once untimed and then once between two mftb()
 * samples, all inside a __hard_irq_disable()/__hard_irq_enable() pair.
 * The logged percentage is (tm1 - tm2) * 100 / tm1, i.e. how much less
 * time test2() took relative to test1(); it is negative when test2() is
 * the slower one.
 *
 * Always returns 0.
 */
static int test_init(void)
{
	unsigned long s, e, tm1, tm2;
	long gain = 0;

	__hard_irq_disable();

	/* Untimed warm-up pass (originally noted as "just for prefetch") */
	test1();
	s = mftb();
	test1();
	e = mftb();
	tm1 = e - s;

	/* Untimed warm-up pass (originally noted as "just for prefetch") */
	test2();
	s = mftb();
	test2();
	e = mftb();
	tm2 = e - s;

	__hard_irq_enable();

	/*
	 * Compute the gain in signed arithmetic so that a slower test2()
	 * shows up as a negative percentage instead of wrapping around, and
	 * skip the division if no ticks elapsed during test1().
	 */
	if (tm1)
		gain = ((long)tm1 - (long)tm2) * 100 / (long)tm1;

	/* tm1/tm2 are unsigned long, hence %lu; gain is signed, hence %ld */
	pr_err("test1: %lu, test2: %lu, %%%ld\n", tm1, tm2, gain);

	return 0;
}

/* Exit handler passed to module_exit(); there is nothing to undo. */
static void test_exit(void)
{
}

/* Hook test_init()/test_exit() up as the module's init and exit handlers. */
module_init(test_init);
module_exit(test_exit);
MODULE_LICENSE("GPL");

The results:
test1: 156568, test2: 151675, %3
test1: 156604, test2: 151670, %3
test1: 156567, test2: 151684, %3
test1: 156567, test2: 151678, %3
test1: 156567, test2: 151688, %3
test1: 156570, test2: 151683, %3
test1: 156565, test2: 151675, %3
test1: 156565, test2: 151673, %3

It seems that there is indeed a 3% gain in performance from moving the
3 instructions out of the lbarx/stbcx. loop.

Thanks,
Kevin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20150817/06750483/attachment-0001.sig>