PowerPC fastpaths for mutex subsystem
    Ingo Molnar 
    mingo at elte.hu
       
    Sun Jan  8 20:48:39 EST 2006
    
    
  
* Joel Schopp <jschopp at austin.ibm.com> wrote:
> Tested on a 4 core (2 SMT threads/core) Power5 machine with gcc 3.3.2.
> Test results from synchro-test.ko:
> 
> All tests run for default 5 seconds
> Threads semaphores  mutexes     mutexes+attached
> 1       63,465,364  58,404,630  62,109,571
> 4       58,424,282  35,541,297  37,820,794
> 8       40,731,668  35,541,297  40,281,768
> 16      38,372,769  37,256,298  41,751,764
> 32      38,406,895  36,933,675  38,731,571
> 64      37,232,017  36,222,480  40,766,379
Interesting. Could you try a few things? Firstly, could you add some
minimal delays to the lock/unlock path, of at least 1 usec? E.g.
"synchro-test.ko load=1 interval=1". [But you could try longer delays
too; 10 usecs is still realistic.]
Secondly, could you try the VFS creat+unlink test via the test-mutex.c
code below, with something like:
	./test-mutex V 16 10
(This tests with 16 tasks, for 10 seconds.) You'll get a useful ops/sec
number out of this test, but the other stats will only be calculated if
you implement the rdtscll() macro to read cycles - right now it defaults
to 'always 0' on ppc; only i386 and ia64 have it implemented. Also, beware
that the default atomic_inc()/dec() is unsafe (only i386 and ia64 have
the real thing implemented), so you might want to add a safe PPC
implementation.
Thirdly, could you run 'vmstat 1' during the tests, and post those lines
too? Here I'm curious about two things: the average runqueue length
(whether we have overscheduling), and CPU utilization and idle time left
(how efficiently cycles are preserved in contention). [Btw., does ppc
have an idle=poll equivalent mode of idling?]
Also, there seems to be some fluctuation in the numbers - could you try
to run a few more times to see how stable the numbers are?
	Ingo
------------
/*
 * Copyright (C) 2005, Ingo Molnar <mingo at redhat.com>
 */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/wait.h>
#include <linux/unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/wait.h>
#include <linux/unistd.h>
#include <unistd.h>
#include <string.h>
#include <pwd.h>
#include <grp.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <regex.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#include <dlfcn.h>
#include <popt.h>
#include <sys/socket.h>
#include <ctype.h>
#include <assert.h>
#include <sched.h>
#ifdef __ia64__
#include <sys/ioctl.h>
#include "mmtimer.h"
/* Not referenced by any code visible in this file. */
int mmtimer_fd;
/* Filled in by the MMTIMER_GETFREQ ioctl in setup_mmtimer(). */
unsigned long __mm_timer_clock_res;
/*
 * Mapping of /dev/mmtimer, advanced by the register offset in
 * setup_mmtimer(); rdtscll() reads the counter through this pointer.
 */
unsigned long *__mm_clock_dev;
/* Counter value sampled once when setup_mmtimer() succeeds. */
unsigned long __mm_clock_offset;
#endif
/* Base of the shared mapping; assigned from setup_shared_var() in main(). */
unsigned long *shared;
/*
 * Lock-operation hooks.  Each one is a gettimeofday() call with a NULL
 * timeval and a magic, non-dereferenceable value in the timezone argument.
 * NOTE(review): presumably a test-patched kernel decodes the magic value
 * and performs the named lock operation - that kernel side is not visible
 * here, and main() currently rejects every test type that uses these.
 */
#define mutex_lock()    gettimeofday((void *)0, (void *)10)
#define mutex_unlock()  gettimeofday((void *)0, (void *)20)
#define down()          gettimeofday((void *)0, (void *)100)
#define up()            gettimeofday((void *)0, (void *)200)
#define down_write()    gettimeofday((void *)0, (void *)1000)
#define up_write()      gettimeofday((void *)0, (void *)2000)
#define down_read()     gettimeofday((void *)0, (void *)10000)
#define up_read()       gettimeofday((void *)0, (void *)20000)
/*
 * Shared locks and variables between the test tasks:
 */
/* 128 bytes expressed as a count of longs (the unit of the 'shared' pointer). */
#define CACHELINE_SIZE (128/sizeof(long))
/*
 * Index (in longs) of each shared variable inside the mapping.  Every
 * slot starts a multiple of CACHELINE_SIZE longs from the base, so
 * consecutive variables are 128 bytes apart.
 */
enum {
	SHARED_DELTA_SUM		= 0*CACHELINE_SIZE,
	SHARED_DELTA_MAX		= 1*CACHELINE_SIZE,
	SHARED_DELTA2_SUM		= 2*CACHELINE_SIZE,
	SHARED_DELTA2_MAX		= 3*CACHELINE_SIZE,
	SHARED_DELTA3_SUM		= 4*CACHELINE_SIZE,
	SHARED_DELTA3_MAX		= 5*CACHELINE_SIZE,
	SHARED_DELTA_DELTA_SUM		= 6*CACHELINE_SIZE,
	SHARED_COUNT			= 7*CACHELINE_SIZE,
	SHARED_SUM			= 8*CACHELINE_SIZE,
	SHARED_LOCK			= 9*CACHELINE_SIZE,
	SHARED_END			= 10*CACHELINE_SIZE,
};
/*
 * Lvalue for slot x as an unsigned long.  Uses whatever 'shared'
 * identifier is in scope at the point of use (the global, or a
 * parameter of the same name).
 */
#define SHARED(x)	(*(shared + SHARED_##x))
/* Same slot accessed as an unsigned long long. */
#define SHARED_LL(x)	(*(unsigned long long *)(shared + SHARED_##x))
/* Abort via assert() when condition c is true. */
#define BUG_ON(c)	assert(!(c))
/*
 * Create the zero-filled 4096-byte region that the parent and all forked
 * test tasks share.
 *
 * A temporary file named ".tmp_mmap-<pid>" is created in the current
 * directory, filled with one page of zeroes, mapped MAP_SHARED and then
 * unlinked, so nothing is left behind on disk once the mapping goes away.
 *
 * Returns the base address of the mapping.  Any failure trips BUG_ON().
 */
static unsigned long *setup_shared_var(void)
{
	static const char zerobuff[4096];	/* static storage: all zeroes */
	char tmpfile[100];
	unsigned long *buf;
	ssize_t ret;
	int fd;

	snprintf(tmpfile, sizeof(tmpfile), ".tmp_mmap-%d", getpid());

	/*
	 * O_CREAT requires the third (mode) argument; a single open() also
	 * replaces the previous creat()/close()/open() sequence.
	 */
	fd = open(tmpfile, O_RDWR|O_CREAT|O_TRUNC, 0700);
	BUG_ON(fd == -1);
	/* Only unlink once we know the file was really created. */
	unlink(tmpfile);

	ret = write(fd, zerobuff, sizeof(zerobuff));
	BUG_ON(ret != (ssize_t)sizeof(zerobuff));

	buf = mmap(NULL, sizeof(zerobuff), PROT_READ|PROT_WRITE, MAP_SHARED,
		   fd, 0);
	BUG_ON(buf == MAP_FAILED);
	/* The mapping stays valid after the descriptor is closed. */
	close(fd);

	return buf;
}
/* Not referenced anywhere in the code visible in this file. */
#define LOOPS 10000
#ifdef __ia64__
/*
 * Map the /dev/mmtimer counter so that rdtscll() can read it.
 *
 * On success __mm_clock_dev points at the counter register (mapping base
 * plus the offset reported by MMTIMER_GETOFFSET), __mm_clock_offset holds
 * the counter value sampled at setup time and __mm_timer_clock_res holds
 * the value reported by MMTIMER_GETFREQ.
 *
 * The descriptor is deliberately left open, as before.
 *
 * Returns 0 on success, -1 if any step failed (a message is printed with
 * perror() for each failure).
 */
static int setup_mmtimer(void)
{
	int fd, regoff, ret = 0;

	fd = open("/dev/mmtimer", O_RDONLY);
	if (fd == -1) {
		perror("missing /dev/mmtimer");
		return -1;
	}

	__mm_clock_dev = mmap(0, getpagesize(), PROT_READ,
			      MAP_SHARED, fd, 0);
	if (__mm_clock_dev == MAP_FAILED) {
		perror("mmap of /dev/mmtimer failed");
		return -1;
	}

	/*
	 * ioctl() returns int; keeping the result signed is what makes the
	 * error check below meaningful.
	 */
	regoff = ioctl(fd, MMTIMER_GETOFFSET, 0);
	if (regoff >= 0) {
		__mm_clock_dev += regoff;
		__mm_clock_offset = *__mm_clock_dev;
	} else {
		perror("reg offset ioctl failed");
		ret = -1;
	}

	if (ioctl(fd, MMTIMER_GETFREQ, &__mm_timer_clock_res)) {
		perror("get freq ioctl fail");
		ret = -1;
	}

	return ret;
}
/*
 * Issue the ia64 "fetchadd8.rel" instruction on the 8-byte word at p with
 * the immediate increment inc ("i" constraint, so inc must be a
 * compile-time constant).  The expression evaluates to the instruction's
 * result register.  The "memory" clobber keeps the compiler from caching
 * memory contents across the operation.
 */
#define ia64_fetchadd8_rel(p, inc)					\
({									\
	__u64 ia64_intri_res;						\
	asm volatile ("fetchadd8.rel %0=[%1],%2"			\
				: "=r"(ia64_intri_res) : "r"(p), "i" (inc) \
				: "memory");				\
									\
	ia64_intri_res;							\
})
/* ia64: add 1 to *flag with a single fetchadd8.rel instruction. */
static inline void atomic_inc(unsigned long *flag)
{
	/* The fetched value is not needed by any caller. */
	(void)ia64_fetchadd8_rel(flag, 1);
}
/* ia64: subtract 1 from *flag with a single fetchadd8.rel instruction. */
static inline void atomic_dec(unsigned long *flag)
{
	/* The fetched value is not needed by any caller. */
	(void)ia64_fetchadd8_rel(flag, -1);
}
#elif defined(__i386__)
/*
 * i386: atomically increment *flag with a locked incl.
 *
 * The operand is both read and written, and a LOCK prefix is only valid
 * with a memory destination, hence the "+m" constraint (the previous
 * "=g" declared it write-only and allowed a register).
 */
static inline void atomic_inc(unsigned long *flag)
{
	__asm__ __volatile__(
		"lock; incl %0\n"
			     : "+m"(*flag) : : "memory");
}
/*
 * i386: atomically decrement *flag with a locked decl.
 *
 * The operand is both read and written, and a LOCK prefix is only valid
 * with a memory destination, hence the "+m" constraint (the previous
 * "=g" declared it write-only and allowed a register).
 */
static inline void atomic_dec(unsigned long *flag)
{
	__asm__ __volatile__(
		"lock; decl %0\n"
			     : "+m"(*flag) : : "memory");
}
#else
/*
 * Generic fallback: atomically increment *flag.
 *
 * A plain ++ is a separate load, add and store, so concurrent tasks
 * updating the same shared word can lose increments.  The compiler
 * builtin performs the update atomically with a full memory barrier.
 */
static inline void atomic_inc(unsigned long *flag)
{
	__sync_fetch_and_add(flag, 1);
}
/*
 * Generic fallback: atomically decrement *flag.
 *
 * A plain -- is a separate load, subtract and store, so concurrent tasks
 * updating the same shared word can lose decrements.  The compiler
 * builtin performs the update atomically with a full memory barrier.
 */
static inline void atomic_dec(unsigned long *flag)
{
	__sync_fetch_and_sub(flag, 1);
}
#endif
/*
 * Take the user-space lock stored in the LOCK slot of the region that
 * 'shared' points to.
 *
 * The lock word is bumped optimistically; seeing exactly 1 afterwards
 * means we are the only taker.  Otherwise the bump is undone and the
 * attempt is repeated after a usleep(1).
 */
static void LOCK(unsigned long *shared)
{
	unsigned long *lock_word = &SHARED(LOCK);

	atomic_inc(lock_word);
	while (*lock_word != 1) {
		atomic_dec(lock_word);
		usleep(1);
		atomic_inc(lock_word);
	}
}
/*
 * Release the lock taken by LOCK() by dropping the LOCK slot of the
 * region that 'shared' points to by one.
 */
static void UNLOCK(unsigned long *shared)
{
	unsigned long *lock_word = &SHARED(LOCK);

	atomic_dec(lock_word);
}
/*
 * Signal handler (installed for SIGINT and SIGHUP in main()): bump the
 * END slot of the global shared region so that every task sees a
 * non-zero stop flag.
 */
static void sigint(int sig)
{
	unsigned long *end_flag = &SHARED(END);

	(void)sig;	/* the signal number itself is irrelevant */
	atomic_inc(end_flag);
}
/*
 * Report the number of operations completed since the previous call.
 *
 * The COUNT slot is fetched and reset to zero in one atomic step, so
 * increments made by the test tasks between the read and the reset are
 * no longer lost.  The fetched value is added to the running total in
 * the SUM slot and printed on a single, carriage-return-terminated
 * status line.
 */
static void print_status(unsigned long *shared)
{
	unsigned long count;

	/* x & 0 == 0: atomically returns the old value and clears the slot. */
	count = __sync_fetch_and_and(&SHARED(COUNT), 0);
	SHARED_LL(SUM) += count;
	/* count is unsigned, so %lu rather than %ld. */
	printf("\r| loops/sec: %lu    \r", count);
	fflush(stdout);
}
/* Test types selectable by the first letter of the first argument. */
enum {
	TYPE_MUTEX = 0,		/* 'M' */
	TYPE_SEM,		/* 'S' */
	TYPE_RSEM,		/* 'R' */
	TYPE_WSEM,		/* 'W' */
	TYPE_VFS,		/* 'V' */
	NR_TYPES		/* number of test types, not a type itself */
};

/* Human-readable name of each test type, indexed by the TYPE_* value. */
const char * type_names[NR_TYPES] = {
	[TYPE_MUTEX]	= "Mutex",
	[TYPE_SEM]	= "Semaphore",
	[TYPE_RSEM]	= "RW-semaphore Read",
	[TYPE_WSEM]	= "RW-semaphore Write",
	[TYPE_VFS]	= "VFS",
};
/* Cycle-counter readings and differences between them. */
typedef unsigned long long cycles_t;
/* Microsecond quantity; not used by any code visible in this file. */
typedef unsigned long long usecs_t;
/*
 * rdtscll(val): store the current cycle-counter reading in val.
 *   ia64:  reads the mapped mmtimer counter through __mm_clock_dev.
 *   i386:  executes rdtsc; "=A" delivers the edx:eax pair as one value.
 *   other: always stores 0, so every cycle-based statistic printed by
 *          main() is zero on those architectures.
 */
#ifdef __ia64__
# define rdtscll(val)					\
do {							\
	val = *__mm_clock_dev;				\
} while (0)
#elif defined(__i386__)
# define rdtscll(val)					\
do {							\
	__asm__ __volatile__("rdtsc" : "=A" (val));	\
} while (0)
#else
# define rdtscll(val) \
	do { (val) = 0LL; } while (0)
#endif
/*
 * rdtod(val): store the current wall-clock time, as microseconds since
 * the epoch obtained from gettimeofday(), in val.
 */
#define rdtod(val)						\
do {								\
	struct timeval _now;					\
								\
	gettimeofday(&_now, NULL);				\
	(val) = 1000000ULL * _now.tv_sec + _now.tv_usec;	\
} while (0)
/*
 * max(x, y): the larger of x and y, evaluating each argument exactly
 * once.  The pointer comparison is a compile-time type check only: it
 * draws a diagnostic when x and y have different types.
 *
 * __typeof__ is spelled with underscores so the macro also compiles in
 * strict ISO modes (-std=c99/-std=c11), where bare 'typeof' is not a
 * keyword.
 */
#define max(x,y) ({ \
	__typeof__(x) _x = (x);	\
	__typeof__(y) _y = (y);	\
	(void) (&_x == &_y);		\
	_x > _y ? _x : _y; })
/* Branch-prediction hint: the condition x is expected to be false. */
#define unlikely(x) __builtin_expect(!!(x), 0)
/*
 * Print an optional reason followed by the usage line on stderr and
 * terminate with exit status -1.
 */
static void __attribute__((noreturn)) usage_exit(const char *reason)
{
	if (reason)
		fputs(reason, stderr);
	fprintf(stderr,
		"usage: test-mutex [Mutex|Sem|Rsem|Wsem|Vfs creat+unlink] <threads> <seconds>\n");
	exit(-1);
}

/*
 * test-mutex <type> [tasks] [seconds]
 *
 * Forks <tasks> children (default: the number of "processor" lines in
 * /proc/cpuinfo) that run the selected operation in a tight loop while
 * timing it with rdtscll().  The parent prints a loops/sec line once per
 * second and, after <seconds> seconds (default: until SIGINT/SIGHUP),
 * raises the shared END flag, reaps the children and prints the summary.
 *
 * Only the 'V' (VFS) test is currently accepted; the other types print a
 * notice and the usage text.
 *
 * NOTE(review): the VFS loop body does creat()+close() only, although the
 * usage text says "creat+unlink"; the measured workload is left untouched.
 */
int main(int argc, char **argv)
{
	int parent, me, first = 1;
	int status;
	unsigned long n, cpus, tasks, seconds;
	cycles_t t0, t01 = 0, t1, delta, delta2, delta3, delta_sum = 0,
		delta2_sum = 0, delta3_sum = 0, delta_delta,
		delta_delta_sum = 0, prev_delta = 0,
		delta_max = 0, delta2_max = 0, delta3_max = 0;
	char str[100];
	double freq;
	int type;

	if (argc <= 1 || argc > 4)
		usage_exit(NULL);

	switch (argv[1][0]) {
	case 'M':
	case 'S':
	case 'R':
	case 'W':
		usage_exit("the Mutex/Sem/Rsem/Wsem tests are not available.\n");
	case 'V':
		type = TYPE_VFS;
		break;
	default:
		usage_exit(NULL);
	}

	/*
	 * Remove leftovers of earlier runs.  Only files matching this
	 * program's own /tmp/tmp-<pid> naming are touched; wiping all of
	 * /tmp would destroy unrelated data.
	 */
	system("rm -f /tmp/tmp-[0-9]* 2>/dev/null >/dev/null");

	/* The shell hands the CPU count back through its exit status. */
	status = system("exit `grep processor /proc/cpuinfo  | wc -l`");
	cpus = (status != -1 && WIFEXITED(status)) ? WEXITSTATUS(status) : 0;

	/* Fall back to a single task if the CPU count could not be read. */
	tasks = cpus ? cpus : 1;
	if (argc >= 3) {
		long requested = atol(argv[2]);

		/* Reject zero, garbage and negative counts alike. */
		if (requested <= 0)
			usage_exit(NULL);
		tasks = requested;
	}
	/* Without a duration the run continues until a signal arrives. */
	if (argc >= 4)
		seconds = (unsigned long)atol(argv[3]);
	else
		seconds = (unsigned long)-1;

#ifdef __ia64__
	setup_mmtimer();
#endif

	printf("%lu CPUs, running %lu parallel test-tasks.\n", cpus, tasks);
	printf("checking %s performance.\n", type_names[type]);

	shared = setup_shared_var();

	signal(SIGINT, sigint);
	signal(SIGHUP, sigint);

	parent = getpid();

	for (n = 0; n < tasks; n++) {
		pid_t pid = fork();

		if (pid == 0)
			break;		/* child: go run the test loop */
		if (pid < 0) {
			perror("fork");
			tasks = n;	/* only reap children that exist */
			break;
		}
	}

	sleep(1);
	me = getpid();
	snprintf(str, sizeof(str), "/tmp/tmp-%d", me);

	if (me == parent) {
		unsigned long long total_count;
		unsigned long elapsed = 0;

		for (;;) {
			sleep(1);
			if (elapsed == seconds || SHARED(END))
				break;
			elapsed++;
			print_status(shared);
		}
		atomic_inc(&SHARED(END));
		/*
		 * SUM is accumulated through SHARED_LL() in print_status(),
		 * so it has to be read back at the same width.
		 */
		total_count = SHARED_LL(SUM);
		for (n = 0; n < tasks; n++)
			wait(NULL);
		if (elapsed)
			printf("\navg ops/sec:               %llu\n",
				total_count / elapsed);
		LOCK(shared);
#if 0
		printf("delta_sum: %llu\n", SHARED_LL(DELTA_SUM));
		printf("delta_delta_sum: %llu\n", SHARED_LL(DELTA_DELTA_SUM));
#endif
		/* Divisor used to turn counter ticks into usecs below. */
#ifdef __ia64__
		freq = 25.0;
#else
		freq = 700.0;
#endif
		printf("average cost per op:       %.2f usecs\n",
			(double)SHARED_LL(DELTA_SUM)/total_count/freq);
		printf("average cost per lock:     %.2f usecs\n",
			(double)SHARED_LL(DELTA2_SUM)/total_count/freq);
		printf("average cost per unlock:   %.2f usecs\n",
			(double)SHARED_LL(DELTA3_SUM)/total_count/freq);
		printf("max cost per op:           %.2f usecs\n",
			(double)SHARED_LL(DELTA_MAX)/freq);
		printf("max cost per lock:         %.2f usecs\n",
			(double)SHARED_LL(DELTA2_MAX)/freq);
		printf("max cost per unlock:       %.2f usecs\n",
			(double)SHARED_LL(DELTA3_MAX)/freq);
		printf("average deviance per op:   %.2f usecs\n",
			(double)SHARED_LL(DELTA_DELTA_SUM)/total_count/freq/2.0);
		UNLOCK(shared);
		exit(0);
	}

	/* Child: run the operation until the END flag is raised. */
	for (;;) {
		rdtscll(t0);

		switch (type) {
			case TYPE_MUTEX:
				mutex_lock();
				rdtscll(t01);
				mutex_unlock();
				break;
			case TYPE_SEM:
				down();
				rdtscll(t01);
				up();
				break;
			case TYPE_RSEM:
				down_read();
				rdtscll(t01);
				up_read();
				break;
			case TYPE_WSEM:
				down_write();
				rdtscll(t01);
				up_write();
				break;
			case TYPE_VFS:
			{
				int fd;

				fd = creat(str, S_IRWXU);
				rdtscll(t01);
				close(fd);
				break;
			}
		}
		rdtscll(t1);

		/* Whole operation. */
		delta = t1-t0;
		if (unlikely(delta > delta_max))
			delta_max = delta;
		delta_sum += delta;

		/* First half ("lock"). */
		delta2 = t01-t0;
		if (unlikely(delta2 > delta2_max))
			delta2_max = delta2;
		delta2_sum += delta2;

		/* Second half ("unlock"). */
		delta3 = t1-t01;
		if (unlikely(delta3 > delta3_max))
			delta3_max = delta3;
		delta3_sum += delta3;

		/* Absolute change in cost against the previous iteration. */
		if (!first) {
			if (prev_delta < delta)
				delta_delta = delta - prev_delta;
			else
				delta_delta = prev_delta - delta;
			delta_delta_sum += delta_delta;
#if 0
			printf("%llu-%llu {%llu} prev: {%llu} / [%llu]\n",
				t0, t1, delta, prev_delta, delta_delta);
			printf("  {%llu} - {%llu}\n",
				delta_sum, delta_delta_sum);
#endif
		} else
			first = 0;
		prev_delta = delta;

		atomic_inc(&SHARED(COUNT));

		if (unlikely(SHARED(END))) {
			/* Fold this task's private totals into the shared ones. */
			LOCK(shared);
			SHARED_LL(DELTA_SUM) += delta_sum;
			SHARED_LL(DELTA_MAX) = max(SHARED_LL(DELTA_MAX),
							delta_max);
			SHARED_LL(DELTA2_SUM) += delta2_sum;
			SHARED_LL(DELTA2_MAX) = max(SHARED_LL(DELTA2_MAX),
							delta2_max);
			SHARED_LL(DELTA3_SUM) += delta3_sum;
			SHARED_LL(DELTA3_MAX) = max(SHARED_LL(DELTA3_MAX),
							delta3_max);
			SHARED_LL(DELTA_DELTA_SUM) += delta_delta_sum;
#if 0
			printf("delta_sum: %llu\n", delta_sum);
			printf("delta_delta_sum: %llu\n", delta_delta_sum);
			printf("DELTA_SUM: %llu\n", SHARED_LL(DELTA_SUM));
			printf("DELTA_DELTA_SUM: %llu\n", SHARED_LL(DELTA_DELTA_SUM));
#endif
			UNLOCK(shared);
			/* Do not leave this task's scratch file behind. */
			unlink(str);
			exit(0);
		}
	}

	return 0;
}
    
    
More information about the Linuxppc64-dev
mailing list