[PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes
Kevin Hao
haokexin at gmail.com
Mon Aug 17 21:16:55 AEST 2015
On Fri, Aug 14, 2015 at 09:44:28PM -0500, Scott Wood wrote:
> I tried a couple different benchmarks and didn't find a significant
> difference, relative to the variability of the results running on the same
> kernel. A patch that claims to "optimize a bit" as its main purpose ought to
> show some results. :-)
I tried to compare the execution time of these two code sequences with the
following test module:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/printk.h>
/*
 * test1 - baseline sequence: the cpu load, the cr1 compare and the increment
 * all sit inside the lbarx/stbcx. retry loop (between label 1 and "bne 1b").
 *
 * Runs the asm sequence 100000 times on two stack variables so the caller can
 * time it. Nothing is returned; the asm outputs (c, s) are discarded.
 */
static void test1(void)
{
int i;
unsigned char lock, c;
unsigned short cpu, s;
for (i = 0; i < 100000; i++) {
/* Reset the inputs so every iteration runs the asm on the same values. */
lock = 0;
cpu = 1;
/*
 * Operands: %0 = c (byte read from lock), %1 = s (halfword read from cpu,
 * then incremented), %2 = &lock, %3 = &cpu.
 *
 * 1: lbarx reads the lock byte, lhz reads cpu; cr0 records whether the
 *    lock byte is zero and cr1 whether the cpu value equals 1.
 *    If the lock byte is non-zero, branch to 2 without storing.
 *    Otherwise stbcx. tries to store the incremented value into lock and
 *    "bne 1b" loops back when cr0 is not "equal" afterwards
 *    (per the Power ISA, that means the conditional store did not succeed).
 * 2: done.
 *
 * cr0, cr1 and memory are declared as clobbered.
 */
asm volatile (
"1: lbarx %0,0,%2\n\
lhz %1,0(%3)\n\
cmpdi %0,0\n\
cmpdi cr1,%1,1\n\
addi %1,%1,1\n\
bne 2f\n\
stbcx. %1,0,%2\n\
bne 1b\n\
2:"
: "=&r" (c), "=&r" (s) : "r" (&lock), "r" (&cpu) : "cr0", "cr1", "memory");
}
}
/*
 * test2 - optimized sequence: the cpu load, the increment and the cr1 setup
 * are done once, before label 1, so the lbarx/stbcx. retry loop contains only
 * the reserved load, the zero check and the conditional store.
 *
 * Runs the asm sequence 100000 times on two stack variables so the caller can
 * time it. Nothing is returned; the asm outputs (c, s) are discarded.
 */
static void test2(void)
{
int i;
unsigned char lock, c;
unsigned short cpu, s;
for (i = 0; i < 100000; i++) {
/* Reset the inputs so every iteration runs the asm on the same values. */
lock = 0;
cpu = 1;
/*
 * Operands: %0 = c (byte read from lock), %1 = s (halfword read from cpu,
 * then incremented), %2 = &lock, %3 = &cpu.
 *
 * Before the loop: lhz reads cpu, addi increments it, and crclr clears
 * the "eq" bit of cr1 (instead of comparing the cpu value against 1 as
 * the baseline sequence does).
 * 1: lbarx reads the lock byte; if it is non-zero, branch to 2 without
 *    storing. Otherwise stbcx. tries to store the incremented value into
 *    lock and "bne 1b" loops back when cr0 is not "equal" afterwards
 *    (per the Power ISA, that means the conditional store did not succeed).
 * 2: done.
 *
 * cr0, cr1 and memory are declared as clobbered.
 */
asm volatile (
" lhz %1,0(%3)\n\
addi %1,%1,1\n\
crclr cr1*4+eq\n\
1: lbarx %0,0,%2\n\
cmpdi %0,0\n\
bne 2f\n\
stbcx. %1,0,%2\n\
bne 1b\n\
2:"
: "=&r" (c), "=&r" (s) : "r" (&lock), "r" (&cpu) : "cr0", "cr1", "memory");
}
}
/*
 * test_init - module init hook: time test1() against test2() and log the result.
 *
 * Each variant is run once untimed as a warm-up and then once between two
 * mftb() reads; the difference of the two reads is taken as its cost. Both
 * measurements happen between __hard_irq_disable() and __hard_irq_enable().
 *
 * The log line reports both tick counts and the relative gain of test2 over
 * test1 in percent. The gain is computed in signed arithmetic so a slower
 * test2 shows up as a negative percentage instead of wrapping around, and it
 * is reported as 0 if test1 measured zero ticks (avoids dividing by zero).
 *
 * Return: always 0.
 */
static int test_init(void)
{
	unsigned long start, end, tm1, tm2;
	long gain = 0;

	__hard_irq_disable();

	/* Just for prefetch */
	test1();
	start = mftb();
	test1();
	end = mftb();
	tm1 = end - start;

	/* Just for prefetch */
	test2();
	start = mftb();
	test2();
	end = mftb();
	tm2 = end - start;

	__hard_irq_enable();

	if (tm1)
		gain = ((long)tm1 - (long)tm2) * 100 / (long)tm1;

	/* tm1/tm2 are unsigned long, so print them with %lu; gain is signed. */
	pr_err("test1: %lu, test2: %lu, %%%ld\n", tm1, tm2, gain);
	return 0;
}
/* Module exit hook: intentionally empty, there is nothing to tear down. */
static void test_exit(void)
{
}
/* Run the benchmark when the module is loaded. */
module_init(test_init);
/* Provide an exit hook (empty) for module unload. */
module_exit(test_exit);
/* Declare the module's license as GPL. */
MODULE_LICENSE("GPL");
The results:
test1: 156568, test2: 151675, %3
test1: 156604, test2: 151670, %3
test1: 156567, test2: 151684, %3
test1: 156567, test2: 151678, %3
test1: 156567, test2: 151688, %3
test1: 156570, test2: 151683, %3
test1: 156565, test2: 151675, %3
test1: 156565, test2: 151673, %3
It seems that there is indeed a 3% gain in performance from moving the
3 instructions out of the lbarx/stbcx loop.
Thanks,
Kevin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20150817/06750483/attachment-0001.sig>
More information about the Linuxppc-dev
mailing list