[Cbe-oss-dev] [PATCH 6/7] spufs: implement deferred queuing for spu class 1 faults

Luke Browning lukebr at linux.vnet.ibm.com
Sat May 17 01:05:59 EST 2008


Implement deferred queuing for spu class 1 faults
 
Yield the spu when processing class 1 faults, as they take a
long time to resolve.  Activate the context after the fault has
been resolved so that it may be scheduled again.

Time slice contexts in this state but don't put them on the
run queue.  Let the fault handling code re-queue the context
when it is ready to run.  

In general, we don't want contexts in the faulting state to be
added to the run queue, as it gives a false indication that
there are jobs waiting to run, which may cause a bunch of
secondary context switches, particularly in the time slicing
code.  Nor do we want to resume a context in this state if
there are other runnable contexts to be resumed.  Therefore,
we need to keep contexts that are processing class 1 faults
off the runqueue.  We don't really care about the other
types of faults because they are resolved quickly.

Signed-off-by: Luke Browning <lukebrowning at us.ibm.com>

---
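
Reviewer's note: below is a minimal, stand-alone sketch of the deferred
queuing policy described above.  It is not spufs code; all names
(handle_class1_fault, yield_spu, requeue, struct ctx, etc.) are
illustrative stubs, not actual spufs symbols.

	/*
	 * Stand-alone model of the deferred queuing policy.  A context
	 * that takes a class 1 fault yields its spu and is kept off the
	 * runqueue; the fault handler re-activates it once the fault is
	 * resolved.
	 */
	#include <stdio.h>

	enum ctx_state { STATE_SAVED, STATE_RUNNABLE };

	struct ctx {
		enum ctx_state state;
		int on_runqueue;
		int faulting;		/* models csa.class_1_dsisr != 0 */
	};

	static void yield_spu(struct ctx *c)
	{
		/* unbind from the spu but do NOT add to the runqueue */
		c->state = STATE_SAVED;
	}

	static void requeue(struct ctx *c)
	{
		/* fault resolved: the context is eligible to run again */
		c->on_runqueue = 1;
	}

	static void handle_class1_fault(struct ctx *c)
	{
		c->faulting = 1;
		if (c->state == STATE_RUNNABLE)
			yield_spu(c);		/* free the spu for other work */
		else
			c->on_runqueue = 0;	/* stay off the runqueue while faulting */

		/* ... resolve the fault; this may sleep for a long time ... */

		c->faulting = 0;
		if (c->state != STATE_RUNNABLE)
			requeue(c);
	}

	int main(void)
	{
		struct ctx c = { STATE_RUNNABLE, 0, 0 };

		handle_class1_fault(&c);
		printf("state=%d on_runqueue=%d faulting=%d\n",
		       c.state, c.on_runqueue, c.faulting);
		return 0;
	}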

Index: spufs/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/sched.c
+++ spufs/arch/powerpc/platforms/cell/spufs/sched.c
@@ -444,25 +444,26 @@ static void spu_unbind_context(struct sp
 	ctx->spu = NULL;
 }
 
+static int spu_on_rq(struct spu_context *ctx)
+{
+	int ret;
+
+	spin_lock(&spu_prio->runq_lock);
+	ret = !list_empty(&ctx->rq);
+	spin_unlock(&spu_prio->runq_lock);
+
+	return ret;
+}
+
 /**
  * spu_add_to_rq - add a context to the runqueue
  * @ctx:       context to add
  */
 static void __spu_add_to_rq(struct spu_context *ctx)
 {
-	/*
-	 * Unfortunately this code path can be called from multiple threads
-	 * on behalf of a single context due to the way the problem state
-	 * mmap support works.
-	 *
-	 * Fortunately we need to wake up all these threads at the same time
-	 * and can simply skip the runqueue addition for every but the first
-	 * thread getting into this codepath.
-	 *
-	 * It's still quite hacky, and long-term we should proxy all other
-	 * threads through the owner thread so that spu_run is in control
-	 * of all the scheduling activity for a given context.
-	 */
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(!test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags));
+
 	if (list_empty(&ctx->rq)) {
 		list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
 		set_bit(ctx->prio, spu_prio->bitmap);
@@ -473,9 +474,11 @@ static void __spu_add_to_rq(struct spu_c
 
 static void spu_add_to_rq(struct spu_context *ctx)
 {
-	spin_lock(&spu_prio->runq_lock);
-	__spu_add_to_rq(ctx);
-	spin_unlock(&spu_prio->runq_lock);
+	if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags)) {
+		spin_lock(&spu_prio->runq_lock);
+		__spu_add_to_rq(ctx);
+		spin_unlock(&spu_prio->runq_lock);
+	}
 }
 
 static void __spu_del_from_rq(struct spu_context *ctx)
@@ -590,6 +593,8 @@ static struct spu *spu_get_idle(struct s
 static struct spu *find_victim(struct spu_context *ctx)
 {
 	struct spu_context *victim = NULL;
+	struct spu_context *victim_pf = NULL;
+	struct spu_context *victim_um = NULL;
 	struct spu *spu;
 	int node, n;
 
@@ -613,13 +618,42 @@ static struct spu *find_victim(struct sp
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			struct spu_context *tmp = spu->ctx;
 
-			if (tmp && tmp->prio > ctx->prio &&
-			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
-			    (!victim || tmp->prio > victim->prio))
-				victim = spu->ctx;
+			if (tmp && !(tmp->flags & SPU_CREATE_NOSCHED)) {
+				/*
+				 * This test identifies faulting contexts
+				 * that are either lazily loaded or explicitly
+				 * loaded to resolve problem state accesses.
+				 * In either case, the context is not running
+				 * and is a good candidate for preemption.
+				 */
+				if ((tmp->csa.class_1_dsisr) &&
+				    (!victim_pf || tmp->prio > victim_pf->prio))
+					victim_pf = tmp;
+
+				/*
+				 * The test (tmp->prio > ctx->prio) slows
+				 * down preemption and improves concurrency a
+				 * little bit, since contexts within the same
+				 * process generally have the same priorities
+				 * and are usually created and queued in a
+				 * tight sequence.  But the test can be
+				 * removed if necessary.
+				 */
+				if (!test_bit(SPU_SCHED_SPU_RUN,
+					&ctx->sched_flags) &&
+				    (tmp->prio > ctx->prio) &&
+				    (!victim_um || tmp->prio > victim_um->prio))
+					victim_um = tmp;
+
+				if ((tmp->prio > ctx->prio) &&
+				    (!victim || tmp->prio > victim->prio))
+					victim = tmp;
+			}
 		}
 		mutex_unlock(&cbe_spu_info[node].list_mutex);
 
+		victim = victim_pf ? victim_pf : victim_um ? victim_um : victim;
+
 		if (victim) {
 			/*
 			 * This nests ctx->state_mutex, but we always lock
@@ -657,8 +691,14 @@ static struct spu *find_victim(struct sp
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
-			if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
-				spu_add_to_rq(victim);
+
+			/*
+			 * If the context was loaded, then it needs to be
+			 * put back on the runqueue.  We ignore the faulting
+			 * state of the context as it may have been loaded to
+			 * resolve an asynchronous problem state reference.
+			 */
+			spu_add_to_rq(victim);
 
 			mutex_unlock(&victim->state_mutex);
 
@@ -674,29 +714,48 @@ static void __spu_schedule(struct spu *s
 	int node = spu->node;
 	int success = 0;
 
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(spu_on_rq(ctx));
+	BUG_ON(test_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags));
+
 	spu_set_timeslice(ctx);
 
 	mutex_lock(&cbe_spu_info[node].list_mutex);
 	if (spu->ctx == NULL) {
 		spu_bind_context(spu, ctx);
 		cbe_spu_info[node].nr_active++;
+		ctx->sched_count++;
 		spu->alloc_state = SPU_USED;
 		success = 1;
 	}
 	mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-	if (success)
+	if (success) {
 		wake_up_all(&ctx->run_wq);
+		if (ctx->flags & SPU_CREATE_NOSCHED)
+			wake_up(&ctx->stop_wq);
+	}
 	else
 		spu_add_to_rq(ctx);
 }
 
 static void spu_schedule(struct spu *spu, struct spu_context *ctx)
 {
-	/* not a candidate for interruptible because it's called either
-	   from the scheduler thread or from spu_deactivate */
+	/*
+	 * This routine is invoked by the scheduler thread, yield, and
+	 * spu_deactivate.  Basically, anywhere grab_runnable_context is
+	 * used.  Need to recheck state under the context lock to ensure
+	 * that the context has not been scheduled or added to the
+	 * runqueue by spu_run.
+	 *
+	 * Note this lock is not a candidate for interruptible because we
+	 * can't afford not to take advantage of idle spus.
+	 */
 	mutex_lock(&ctx->state_mutex);
-	__spu_schedule(spu, ctx);
+	if (ctx->state == SPU_STATE_SAVED) {
+		spu_del_from_rq(ctx);
+		__spu_schedule(spu, ctx);
+	}
 	spu_release(ctx);
 }
 
@@ -727,25 +786,22 @@ int spu_activate(struct spu_context *ctx
 	struct spu *spu;
 
 	/*
-	 * If there are multiple threads waiting for a single context
-	 * only one actually binds the context while the others will
-	 * only be able to acquire the state_mutex once the context
-	 * already is in runnable state.
+	 * If ctx is currently loaded or it is on the runqueue, it has
+	 * already been activated.  This might happen if a context was
+	 * timesliced at the same time an exception occurred, as the
+	 * interrupt code can't take the state mutex.
 	 */
-	if (ctx->spu)
+	if (ctx->spu || spu_on_rq(ctx))
 		return 0;
 
-spu_activate_top:
 	if (signal_pending(current))
 		return -ERESTARTSYS;
 
 	spu = spu_get_idle(ctx);
-	/*
-	 * If this is a realtime thread we try to get it running by
-	 * preempting a lower priority thread.
-	 */
-	if (!spu && rt_prio(ctx->prio))
+
+	if (!spu)
 		spu = find_victim(ctx);
+
 	if (spu) {
 		unsigned long runcntl;
 
@@ -757,12 +813,10 @@ spu_activate_top:
 		return 0;
 	}
 
-	if (ctx->flags & SPU_CREATE_NOSCHED) {
+	if (ctx->flags & SPU_CREATE_NOSCHED)
 		spu_prio_wait(ctx);
-		goto spu_activate_top;
-	}
-
-	spu_add_to_rq(ctx);
+	else
+		spu_add_to_rq(ctx);
 
 	return 0;
 }
@@ -787,6 +841,9 @@ static struct spu_context *grab_runnable
 			/* XXX(hch): check for affinity here aswell */
 			if (__node_allowed(ctx, node)) {
 				__spu_del_from_rq(ctx);
+				if (test_bit(SPU_SCHED_DEACTIVATE,
+					&ctx->sched_flags))
+					continue;
 				goto found;
 			}
 		}
@@ -805,18 +862,16 @@ static int __spu_deactivate(struct spu_c
 
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
+
 		if (new || force) {
 			spu_unschedule(spu, ctx);
+
 			if (new) {
-				if (new->flags & SPU_CREATE_NOSCHED)
-					wake_up(&new->stop_wq);
-				else {
-					spu_release(ctx);
-					spu_schedule(spu, new);
-					/* this one can't easily be made
-					   interruptible */
-					mutex_lock(&ctx->state_mutex);
-				}
+				spu_release(ctx);
+				spu_schedule(spu, new);
+				/* this one can't easily be made
+				   interruptible */
+				mutex_lock(&ctx->state_mutex);
 			}
 		}
 	}
@@ -834,7 +889,18 @@ static int __spu_deactivate(struct spu_c
 void spu_deactivate(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_deactivate__enter, ctx);
+
+	/*
+	 * Set this bit under the runq_lock so that grab_runnable_context
+	 * can check for deactivated contexts as it is pulling contexts
+	 * off the runqueue.
+	 */
+	spin_lock(&spu_prio->runq_lock);
+	set_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
+	spin_unlock(&spu_prio->runq_lock);
+
 	__spu_deactivate(ctx, 1, MAX_PRIO);
+	spu_del_from_rq(ctx);
 }
 
 /**
@@ -845,6 +911,13 @@ void spu_deactivate(struct spu_context *
  * unbind @ctx from the physical spu and schedule the highest
  * priority context to run on the freed physical spu instead.
  */
+void __spu_yield(struct spu_context *ctx)
+{
+	spu_context_nospu_trace(spu_yield__enter, ctx);
+	if (!(ctx->flags & SPU_CREATE_NOSCHED))
+		__spu_deactivate(ctx, 0, MAX_PRIO);
+}
+
 void spu_yield(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_yield__enter, ctx);
@@ -859,6 +932,7 @@ static noinline void spusched_tick(struc
 {
 	struct spu_context *new = NULL;
 	struct spu *spu = NULL;
+	int active;
 
 	if (spu_acquire(ctx))
 		BUG();	/* a kernel thread never has signals pending */
@@ -870,7 +944,10 @@ static noinline void spusched_tick(struc
 	if (ctx->policy == SCHED_FIFO)
 		goto out;
 
-	if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+	active = test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags) &&
+		 !ctx->csa.class_1_dsisr;
+
+	if (--ctx->time_slice && active)
 		goto out;
 
 	spu = ctx->spu;
@@ -878,9 +955,10 @@ static noinline void spusched_tick(struc
 	spu_context_trace(spusched_tick__preempt, ctx, spu);
 
 	new = grab_runnable_context(ctx->prio + 1, spu->node);
+
 	if (new) {
 		spu_unschedule(spu, ctx);
-		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+		if (active)
 			spu_add_to_rq(ctx);
 	} else {
 		spu_context_nospu_trace(spusched_tick__newslice, ctx);
@@ -944,8 +1022,10 @@ static void spuloadavg_wake(unsigned lon
 
 static int spusched_thread(void *unused)
 {
+	struct spu_context *ctx;
 	struct spu *spu;
 	int node;
+	int idle;
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -953,18 +1033,45 @@ static int spusched_thread(void *unused)
 		for (node = 0; node < MAX_NUMNODES; node++) {
 			struct mutex *mtx = &cbe_spu_info[node].list_mutex;
 
+			idle = 0;
+
 			mutex_lock(mtx);
 			list_for_each_entry(spu, &cbe_spu_info[node].spus,
 					cbe_list) {
-				struct spu_context *ctx = spu->ctx;
+
+				ctx = spu->ctx;
 
 				if (ctx) {
 					mutex_unlock(mtx);
 					spusched_tick(ctx);
 					mutex_lock(mtx);
 				}
+				else {
+					idle++;
+				}
 			}
 			mutex_unlock(mtx);
+			for ( ; idle; idle--) {
+				ctx = grab_runnable_context(MAX_PRIO, node);
+				if (!ctx)
+					break;
+
+				/*
+				 * Recheck state under the lock to ensure
+				 * that the context has not been scheduled
+				 * or added to the run queue by spu_run.
+				 */
+				mutex_lock(&ctx->state_mutex);
+				if (ctx->state == SPU_STATE_SAVED) {
+					spu_del_from_rq(ctx);
+					spu = spu_get_idle(ctx);
+					if (spu)
+						__spu_schedule(spu, ctx);
+					else
+						spu_add_to_rq(ctx);
+				}
+				mutex_unlock(&ctx->state_mutex);
+			}
 		}
 	}
 
Index: spufs/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/spufs.h
+++ spufs/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -43,8 +43,8 @@ struct spu_gang;
 /* ctx->sched_flags */
 enum {
 	SPU_SCHED_NOTIFY_ACTIVE,
-	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
 	SPU_SCHED_SPU_RUN,	/* context is within spu_run */
+	SPU_SCHED_DEACTIVATE,	/* deactivation in progress, don't schedule */
 };
 
 struct spu_context {
@@ -98,6 +98,7 @@ struct spu_context {
 	int policy;
 	int prio;
 	int last_ran;
+	int sched_count;
 
 	/* statistics */
 	struct {
@@ -255,6 +256,7 @@ int spu_stopped(struct spu_context *ctx,
 void spu_del_from_rq(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
+void __spu_yield(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
 void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
 void spu_set_timeslice(struct spu_context *ctx);
Index: spufs/arch/powerpc/platforms/cell/spufs/fault.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/fault.c
+++ spufs/arch/powerpc/platforms/cell/spufs/fault.c
@@ -136,8 +136,13 @@ int spufs_handle_class1(struct spu_conte
 		dsisr, ctx->state);
 
 	ctx->stats.hash_flt++;
-	if (ctx->state == SPU_STATE_RUNNABLE)
+	if (ctx->state == SPU_STATE_RUNNABLE) {
 		ctx->spu->stats.hash_flt++;
+		__spu_yield(ctx);
+	}
+	else {
+		spu_del_from_rq(ctx);
+	}
 
 	/* we must not hold the lock when entering spu_handle_mm_fault */
 	spu_release(ctx);
@@ -166,6 +171,9 @@ int spufs_handle_class1(struct spu_conte
 	ctx->csa.class_1_dar = ctx->csa.class_1_dsisr = 0;
 	smp_mb();
 
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		spu_activate(ctx, 0);
+
 	/*
 	 * If we handled the fault successfully and are in runnable
 	 * state, restart the DMA.
Index: spufs/arch/powerpc/platforms/cell/spufs/file.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/file.c
+++ spufs/arch/powerpc/platforms/cell/spufs/file.c
@@ -387,7 +387,32 @@ static unsigned long spufs_ps_nopfn(stru
 	if (ctx->state == SPU_STATE_SAVED) {
 		up_read(&current->mm->mmap_sem);
 		spu_context_nospu_trace(spufs_ps_nopfn__sleep, ctx);
-		ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+
+		/*
+		 * Activate the context if it is inside spu_run.  It may
+		 * not be schedulable under its own control if it has
+		 * faulted.  This should provide concurrency between the
+		 * faulting thread and the target spu context, enabling
+		 * the fault to be resolved more quickly.  find_victim
+		 * may even find a lazily loaded context to preempt,
+		 * resulting in no downside at all.  On the other hand,
+		 * there is nothing to keep the context loaded if it
+		 * has faulted, as it may become the target of another
+		 * activation.  For this reason, we don't have to worry
+		 * about the context being lazily loaded for an entire
+		 * quantum.  It might even be worth removing the test
+		 * for SPU_SCHED_SPU_RUN.
+		 */
+		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+			ret = spu_activate(ctx, 0);
+		else
+			ret = 0;
+		if (!ret) {
+			ret = spufs_wait(ctx->run_wq,
+					ctx->state == SPU_STATE_RUNNABLE);
+			if (ret)
+				goto refault;
+		}
 		spu_context_trace(spufs_ps_nopfn__wake, ctx, ctx->spu);
 		down_read(&current->mm->mmap_sem);
 	} else {
@@ -396,8 +421,7 @@ static unsigned long spufs_ps_nopfn(stru
 		spu_context_trace(spufs_ps_nopfn__insert, ctx, ctx->spu);
 	}
 
-	if (!ret)
-		spu_release(ctx);
+	spu_release(ctx);
 
 refault:
 	put_spu_context(ctx);
Index: spufs/arch/powerpc/platforms/cell/spufs/run.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/run.c
+++ spufs/arch/powerpc/platforms/cell/spufs/run.c
@@ -186,6 +186,8 @@ static int spu_run_init(struct spu_conte
 	int ret;
 
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+	set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
+	clear_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
 
 	/*
 	 * NOSCHED is synchronous scheduling with respect to the caller.
@@ -243,7 +245,6 @@ static int spu_run_init(struct spu_conte
 		}
 	}
 
-	set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
 	return 0;
 }
 
Index: spufs/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/context.c
+++ spufs/arch/powerpc/platforms/cell/spufs/context.c
@@ -154,10 +154,7 @@ int spu_acquire_saved(struct spu_context
 	if (ret)
 		return ret;
 
-	if (ctx->state != SPU_STATE_SAVED) {
-		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
-		spu_deactivate(ctx);
-	}
+	spu_deactivate(ctx);
 
 	return 0;
 }
@@ -170,8 +167,7 @@ void spu_release_saved(struct spu_contex
 {
 	BUG_ON(ctx->state != SPU_STATE_SAVED);
 
-	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags) &&
-			test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+	if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
 		spu_activate(ctx, 0);
 
 	spu_release(ctx);
