[Cbe-oss-dev] [PATCH 6:6] spufs: implement deferred queuing for spu class 1 faults

Tue May 13 00:37:22 EST 2008

Implement deferred queuing for spu class 1 faults
 
Yield the spu while processing class 1 faults, as they take a
long time to be resolved.  Re-activate the context after the fault
has been resolved so that it may be scheduled again.

Time slice contexts in this state but don't put them on the
run queue.  Let the fault handling code re-queue the context
when it is ready to run.  

In general, we don't want spus in the faulting state to be
added to the run queue, as it gives a false indication that
there are jobs waiting to run, which may cause a bunch of
secondary context switches, particularly in the time-slicing
code.  Nor do we want to resume a context in this state if
there are other running contexts to be resumed.  Therefore,
we need to keep contexts that are processing class 1 faults
off the runqueue.  We don't really care about the other
types of faults because they are resolved quickly.

Signed-off-by: Luke Browning <lukebrowning at us.ibm.com>

---

Index: spufs/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================

--- spufs.orig/arch/powerpc/platforms/cell/spufs/sched.c
+++ spufs/arch/powerpc/platforms/cell/spufs/sched.c
@@ -444,12 +444,26 @@ static void spu_unbind_context(struct sp
 	ctx->spu = NULL;
 }
 
+static int spu_on_rq(struct spu_context *ctx)
+{
+	int ret;
+
+	spin_lock(&spu_prio->runq_lock);
+	ret = !list_empty(&ctx->rq);
+	spin_unlock(&spu_prio->runq_lock);
+
+	return ret;
+}
+
 /**
  * spu_add_to_rq - add a context to the runqueue
  * @ctx:       context to add
  */
 static void __spu_add_to_rq(struct spu_context *ctx)
 {
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(!test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags));
+
 	/*
 	 * Unfortunately this code path can be called from multiple threads
 	 * on behalf of a single context due to the way the problem state
@@ -590,6 +604,8 @@ static struct spu *spu_get_idle(struct s
 static struct spu *find_victim(struct spu_context *ctx)
 {
 	struct spu_context *victim = NULL;
+	struct spu_context *victim_pf = NULL;
+	struct spu_context *victim_um = NULL;
 	struct spu *spu;
 	int node, n;
 
@@ -613,13 +629,38 @@ static struct spu *find_victim(struct sp
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			struct spu_context *tmp = spu->ctx;
 
-			if (tmp && tmp->prio > ctx->prio &&
-			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
-			    (!victim || tmp->prio > victim->prio))
-				victim = spu->ctx;
+			if (tmp && !(tmp->flags & SPU_CREATE_NOSCHED)) {
+				/*
+				 * This test is used to preempt contexts that
+				 * were loaded to resolve asynchronous problem
+				 * state accesses.
+				 */
+				if ((tmp->csa.class_1_dsisr) &&
+				    (!victim_pf || tmp->prio > victim_pf->prio))
+					victim_pf = tmp;
+
+				/*
+				 * The test (tmp->prio > ctx->prio) slows
+				 * down preemption and improves concurrency a
+				 * bit, since contexts within the same
+				 * process are generally created and queued
+				 * at the same time.
+				 */
+				if (!test_bit(SPU_SCHED_SPU_RUN,
+					&ctx->sched_flags) &&
+				    (tmp->prio > ctx->prio) &&
+				    (!victim_um || tmp->prio > victim_um->prio))
+					victim_um = tmp;
+
+				if ((tmp->prio > ctx->prio) &&
+				    (!victim || tmp->prio > victim->prio))
+					victim = tmp;
+			}
 		}
 		mutex_unlock(&cbe_spu_info[node].list_mutex);
 
+		victim = victim_pf ? victim_pf : victim_um ? victim_um : victim;
+
 		if (victim) {
 			/*
 			 * This nests ctx->state_mutex, but we always lock
@@ -657,6 +698,13 @@ static struct spu *find_victim(struct sp
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
+
+			/*
+			 * if the context was loaded, then it needs to be
+			 * put back on the runqueue, regardless of its
+			 * faulting state as it may need to be loaded to
+			 * resolve an asynchronous problem state access.
+			 */
 			if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
 				spu_add_to_rq(victim);
 
@@ -674,6 +722,10 @@ static void __spu_schedule(struct spu *s
 	int node = spu->node;
 	int success = 0;
 
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(!test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags));
+	BUG_ON(test_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags));
+
 	spu_set_timeslice(ctx);
 
 	mutex_lock(&cbe_spu_info[node].list_mutex);
@@ -685,18 +737,52 @@ static void __spu_schedule(struct spu *s
 	}
 	mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-	if (success)
+	if (success) {
 		wake_up_all(&ctx->run_wq);
+		if (ctx->flags & SPU_CREATE_NOSCHED)
+			wake_up(&ctx->stop_wq);
+	}
 	else
 		spu_add_to_rq(ctx);
 }
 
 static void spu_schedule(struct spu *spu, struct spu_context *ctx)
 {
-	/* not a candidate for interruptible because it's called either
-	   from the scheduler thread or from spu_deactivate */
+	/*
+	 * This routine is invoked by the scheduler thread, yield, and
+	 * spu_deactivate.  Basically, anywhere grab_runnable_context is
+	 * used.  This latter routine works from the bottom up from a
+	 * locking perspective.  It grabs a context off the runqueue
+	 * without holding the context lock.  We need to recheck state
+	 * under the context lock to ensure that it is still schedulable.
+	 * The context is always removed from the runqueue, when the
+	 * controlling thread returns to user mode.
+	 *
+	 * There is a race between grab_runnable_context and the controlling
+	 * thread returning from spu_run, where the context would have
+	 * been removed from the runqueue.  There is a second race between
+	 * grab_runnable_context and the re-invocation of spu_run where the
+	 * context is placed directly on an spu.
+	 *
+	 * In neither case should we schedule the context.  This could
+	 * potentially lead to a starvation case if there are no more contexts
+	 * invoking spu_run to invoke the scheduler.  After all, we pulled a
+	 * context off the queue that we had to ignore.  There might be a
+	 * context stranded on the runqueue.
+	 *
+	 * We solve this problem by adding a safety net in the scheduler
+	 * thread.  After timeslicing, it attempts to pull contexts off the
+	 * runqueue and schedule them.  We always get a second chance.
+	 *
+	 * Note this lock is not a candidate for interruptible because we
+	 * can't afford not to take advantage of idle spus.
+	 */
 	mutex_lock(&ctx->state_mutex);
-	__spu_schedule(spu, ctx);
+	if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags) &&
+	    !test_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags) &&
+	    ctx->state == SPU_STATE_SAVED) {
+		__spu_schedule(spu, ctx);
+	}
 	spu_release(ctx);
 }
 
@@ -727,25 +813,22 @@ int spu_activate(struct spu_context *ctx
 	struct spu *spu;
 
 	/*
-	 * If there are multiple threads waiting for a single context
-	 * only one actually binds the context while the others will
-	 * only be able to acquire the state_mutex once the context
-	 * already is in runnable state.
+	 * If ctx is currently loaded or it is on the runqueue, it has
+	 * already been activated.  This might happen if a context was
+	 * timesliced at the same time an exception occurred as the
+	 * interrupt code can't take the state mutex.
 	 */
-	if (ctx->spu)
+	if (ctx->spu || spu_on_rq(ctx))
 		return 0;
 
-spu_activate_top:
 	if (signal_pending(current))
 		return -ERESTARTSYS;
 
 	spu = spu_get_idle(ctx);
-	/*
-	 * If this is a realtime thread we try to get it running by
-	 * preempting a lower priority thread.
-	 */
-	if (!spu && rt_prio(ctx->prio))
+
+	if (!spu)
 		spu = find_victim(ctx);
+
 	if (spu) {
 		unsigned long runcntl;
 
@@ -757,12 +840,10 @@ spu_activate_top:
 		return 0;
 	}
 
-	if (ctx->flags & SPU_CREATE_NOSCHED) {
+	if (ctx->flags & SPU_CREATE_NOSCHED)
 		spu_prio_wait(ctx);
-		goto spu_activate_top;
-	}
-
-	spu_add_to_rq(ctx);
+	else
+		spu_add_to_rq(ctx);
 
 	return 0;
 }
@@ -805,18 +886,16 @@ static int __spu_deactivate(struct spu_c
 
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
+
 		if (new || force) {
 			spu_unschedule(spu, ctx);
+
 			if (new) {
-				if (new->flags & SPU_CREATE_NOSCHED)
-					wake_up(&new->stop_wq);
-				else {
-					spu_release(ctx);
-					spu_schedule(spu, new);
-					/* this one can't easily be made
-					   interruptible */
-					mutex_lock(&ctx->state_mutex);
-				}
+				spu_release(ctx);
+				spu_schedule(spu, new);
+				/* this one can't easily be made
+				   interruptible */
+				mutex_lock(&ctx->state_mutex);
 			}
 		}
 	}
@@ -834,7 +913,9 @@ static int __spu_deactivate(struct spu_c
 void spu_deactivate(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_deactivate__enter, ctx);
+	set_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
 	__spu_deactivate(ctx, 1, MAX_PRIO);
+	spu_del_from_rq(ctx);
 }
 
 /**
@@ -845,6 +926,13 @@ void spu_deactivate(struct spu_context *
  * unbind @ctx from the physical spu and schedule the highest
  * priority context to run on the freed physical spu instead.
  */
+void __spu_yield(struct spu_context *ctx)
+{
+       spu_context_nospu_trace(spu_yield__enter, ctx);
+       if (!(ctx->flags & SPU_CREATE_NOSCHED))
+		__spu_deactivate(ctx, 0, MAX_PRIO);
+}
+
 void spu_yield(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_yield__enter, ctx);
@@ -859,6 +947,7 @@ static noinline void spusched_tick(struc
 {
 	struct spu_context *new = NULL;
 	struct spu *spu = NULL;
+	int active;
 
 	if (spu_acquire(ctx))
 		BUG();	/* a kernel thread never has signals pending */
@@ -870,7 +959,10 @@ static noinline void spusched_tick(struc
 	if (ctx->policy == SCHED_FIFO)
 		goto out;
 
-	if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+	active = test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags) &&
+		 !ctx->csa.class_1_dsisr;
+
+	if (--ctx->time_slice && active)
 		goto out;
 
 	spu = ctx->spu;
@@ -878,9 +970,10 @@ static noinline void spusched_tick(struc
 	spu_context_trace(spusched_tick__preempt, ctx, spu);
 
 	new = grab_runnable_context(ctx->prio + 1, spu->node);
+
 	if (new) {
 		spu_unschedule(spu, ctx);
-		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+		if (active)
 			spu_add_to_rq(ctx);
 	} else {
 		spu_context_nospu_trace(spusched_tick__newslice, ctx);
@@ -944,8 +1037,10 @@ static void spuloadavg_wake(unsigned lon
 
 static int spusched_thread(void *unused)
 {
+	struct spu_context *ctx;
 	struct spu *spu;
 	int node;
+	int idle;
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -953,18 +1048,50 @@ static int spusched_thread(void *unused)
 		for (node = 0; node < MAX_NUMNODES; node++) {
 			struct mutex *mtx = &cbe_spu_info[node].list_mutex;
 
+			idle = 0;
+
 			mutex_lock(mtx);
 			list_for_each_entry(spu, &cbe_spu_info[node].spus,
 					cbe_list) {
-				struct spu_context *ctx = spu->ctx;
+
+				ctx = spu->ctx;
 
 				if (ctx) {
 					mutex_unlock(mtx);
 					spusched_tick(ctx);
 					mutex_lock(mtx);
 				}
+				else {
+					idle++;
+				}
 			}
 			mutex_unlock(mtx);
+			for ( ; idle; idle--) {
+				ctx = grab_runnable_context(MAX_PRIO, node);
+				if (!ctx)
+					break;
+
+				/*
+				 * There is a race between
+				 * grab_runnable_context and the controlling
+				 * thread returning from spu_run, where the
+				 * context would have been removed from the
+				 * runqueue.  There is a second race
+				 * with the re-invocation of spu_run, where the
+				 * context is placed directly on an spu.
+				 */
+				mutex_lock(&ctx->state_mutex);
+				if (test_bit(SPU_SCHED_SPU_RUN,
+						&ctx->sched_flags) &&
+				    !test_bit(SPU_SCHED_DEACTIVATE,
+						&ctx->sched_flags) &&
+				    (ctx->state == SPU_STATE_SAVED)) {
+					spu = spu_get_idle(ctx);
+					if (spu)
+						__spu_schedule(spu, ctx);
+				}
+				mutex_unlock(&ctx->state_mutex);
+			}
 		}
 	}
 
Index: spufs/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/spufs.h
+++ spufs/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -45,6 +45,7 @@ enum {
 	SPU_SCHED_NOTIFY_ACTIVE,
 	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
 	SPU_SCHED_SPU_RUN,	/* context is within spu_run */
+	SPU_SCHED_DEACTIVATE,
 };
 
 struct spu_context {
@@ -255,6 +256,7 @@ int spu_stopped(struct spu_context *ctx,
 void spu_del_from_rq(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
+void __spu_yield(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
 void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
 void spu_set_timeslice(struct spu_context *ctx);
Index: spufs/arch/powerpc/platforms/cell/spufs/fault.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/fault.c
+++ spufs/arch/powerpc/platforms/cell/spufs/fault.c
@@ -136,8 +136,13 @@ int spufs_handle_class1(struct spu_conte
 		dsisr, ctx->state);
 
 	ctx->stats.hash_flt++;
-	if (ctx->state == SPU_STATE_RUNNABLE)
+	if (ctx->state == SPU_STATE_RUNNABLE) {
 		ctx->spu->stats.hash_flt++;
+		__spu_yield(ctx);
+	}
+	else {
+		spu_del_from_rq(ctx);
+	}
 
 	/* we must not hold the lock when entering spu_handle_mm_fault */
 	spu_release(ctx);
@@ -166,6 +171,9 @@ int spufs_handle_class1(struct spu_conte
 	ctx->csa.class_1_dar = ctx->csa.class_1_dsisr = 0;
 	smp_wmb();
 
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		spu_activate(ctx, 0);
+
 	/*
 	 * If we handled the fault successfully and are in runnable
 	 * state, restart the DMA.
Index: spufs/arch/powerpc/platforms/cell/spufs/file.c
===================================================================
--- spufs.orig/arch/powerpc/platforms/cell/spufs/file.c
+++ spufs/arch/powerpc/platforms/cell/spufs/file.c
@@ -387,7 +387,32 @@ static unsigned long spufs_ps_nopfn(stru
 	if (ctx->state == SPU_STATE_SAVED) {
 		up_read(&current->mm->mmap_sem);
 		spu_context_nospu_trace(spufs_ps_nopfn__sleep, ctx);
-		ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+
+		/*
+		 * Activate context if it is inside spu_run.  It may
+		 * not be schedulable under its own control if it has
+		 * faulted.  This should provide concurrency between the
+		 * faulting thread and the target spu context, enabling
+		 * the fault to be more quickly resolved.  find_victim
+		 * may even find a lazily loaded context to preempt
+		 * resulting in no downside at all.  On the other hand,
+		 * there is nothing to keep the context loaded if it
+		 * has faulted as it may become the target of another
+		 * activation.  For this reason, we don't have to worry
+		 * about the context being lazily loaded for an entire
+		 * quantum.  It might even be worth removing the test
+		 * for SPU_SCHED_SPU_RUN.
+		 */
+		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+			ret = spu_activate(ctx, 0);
+		else
+			ret = 0;
+		if (!ret) {
+			ret = spufs_wait(ctx->run_wq,
+					ctx->state == SPU_STATE_RUNNABLE);
+			if (ret)
+				goto refault;
+		}
 		spu_context_trace(spufs_ps_nopfn__wake, ctx, ctx->spu);
 		down_read(&current->mm->mmap_sem);
 	} else {
@@ -396,8 +421,7 @@ static unsigned long spufs_ps_nopfn(stru
 		spu_context_trace(spufs_ps_nopfn__insert, ctx, ctx->spu);
 	}
 
-	if (!ret)
-		spu_release(ctx);
+	spu_release(ctx);
 
 refault:
 	put_spu_context(ctx);