[Cbe-oss-dev] [PATCH 7:11] spufs: Implement deferred queuing for spu class 1 faults

Fri May 23 06:10:26 EST 2008

Implement deferred queuing for spu class 1 faults
 
Yield the spu while a class 1 fault is being processed, as these
faults take a long time to resolve.  Re-activate the context once
the fault has been resolved so that it may be scheduled again.

Time slice contexts in this state but don't put them on the
run queue.  Let the fault handling code re-queue the context
when it is ready to run.  

In general, we don't want contexts in the faulting state to be
added to the run queue, as that gives a false indication that
there are jobs waiting to run, which may cause a bunch of
secondary context switches, particularly in the time-slicing
code.  Nor do we want to resume a context in this state if
there are other running contexts to be resumed.  Therefore,
we need to keep contexts that are processing class 1 faults
off the run queue.  We don't really care about the other
types of faults because they are resolved quickly.

Signed-off-by: Luke Browning <lukebrowning at us.ibm.com>

---

Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================

--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/sched.c
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/sched.c
@@ -446,25 +446,26 @@ static void spu_unbind_context(struct sp
 	ctx->spu = NULL;
 }
 
+static int spu_on_rq(struct spu_context *ctx)
+{
+	int ret;
+
+	spin_lock(&spu_prio->runq_lock);
+	ret = !list_empty(&ctx->rq);
+	spin_unlock(&spu_prio->runq_lock);
+
+	return ret;
+}
+
 /**
  * spu_add_to_rq - add a context to the runqueue
  * @ctx:       context to add
  */
 static void __spu_add_to_rq(struct spu_context *ctx)
 {
-	/*
-	 * Unfortunately this code path can be called from multiple threads
-	 * on behalf of a single context due to the way the problem state
-	 * mmap support works.
-	 *
-	 * Fortunately we need to wake up all these threads at the same time
-	 * and can simply skip the runqueue addition for every but the first
-	 * thread getting into this codepath.
-	 *
-	 * It's still quite hacky, and long-term we should proxy all other
-	 * threads through the owner thread so that spu_run is in control
-	 * of all the scheduling activity for a given context.
-	 */
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(!test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags));
+
 	if (list_empty(&ctx->rq)) {
 		list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
 		set_bit(ctx->prio, spu_prio->bitmap);
@@ -475,9 +476,11 @@ static void __spu_add_to_rq(struct spu_c
 
 static void spu_add_to_rq(struct spu_context *ctx)
 {
-	spin_lock(&spu_prio->runq_lock);
-	__spu_add_to_rq(ctx);
-	spin_unlock(&spu_prio->runq_lock);
+	if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags)) {
+		spin_lock(&spu_prio->runq_lock);
+		__spu_add_to_rq(ctx);
+		spin_unlock(&spu_prio->runq_lock);
+	}
 }
 
 static void __spu_del_from_rq(struct spu_context *ctx)
@@ -592,8 +595,10 @@ static struct spu *spu_get_idle(struct s
 static struct spu *find_victim(struct spu_context *ctx)
 {
 	struct spu_context *victim = NULL;
+	struct spu_context *victim_pf = NULL;
+	struct spu_context *victim_um = NULL;
 	struct spu *spu;
-	int node, n;
+	int node, n, retry;
 
 	spu_context_nospu_trace(spu_find_victim__enter, ctx);
 
@@ -604,49 +609,70 @@ static struct spu *find_victim(struct sp
 	 * a strong node affinity.  We might want to fine-tune this in
 	 * the future.
 	 */
- restart:
 	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
 		if (!node_allowed(ctx, node))
 			continue;
 
+		retry = 2;
+restart:
+		if (!retry)
+			continue;
+
 		mutex_lock(&cbe_spu_info[node].list_mutex);
 		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 			struct spu_context *tmp = spu->ctx;
 
-			if (tmp && tmp->prio > ctx->prio &&
-			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
-			    (!victim || tmp->prio > victim->prio))
-				victim = spu->ctx;
+			if (tmp && !(tmp->flags & SPU_CREATE_NOSCHED)) {
+
+				if ((tmp->csa.class_1_dsisr) &&
+				    (!victim_pf || tmp->prio > victim_pf->prio))
+					victim_pf = tmp;
+
+				if (!test_bit(SPU_SCHED_SPU_RUN,
+					&tmp->sched_flags) &&
+				    (!victim_um || tmp->prio > victim_um->prio))
+					victim_um = tmp;
+
+				if ((tmp->prio > ctx->prio) &&
+				    (!victim || tmp->prio > victim->prio))
+					victim = tmp;
+			}
 		}
 		mutex_unlock(&cbe_spu_info[node].list_mutex);
 
+		/*
+		 * Preemption order. First, faulting ctxts, then contexts
+		 * in user mode, and finally less favored work based on
+		 * priority.  In the first two cases the spu is stopped.
+		 */
+		victim = victim_pf ? victim_pf : victim_um ? victim_um : victim;
+
 		if (victim) {
-			/*
-			 * This nests ctx->state_mutex, but we always lock
-			 * higher priority contexts before lower priority
-			 * ones, so this is safe until we introduce
-			 * priority inheritance schemes.
-			 *
-			 * XXX if the highest priority context is locked,
-			 * this can loop a long time.  Might be better to
-			 * look at another context or give up after X retries.
-			 */
+
+			/* Lock the context and re-check state. */
 			if (!mutex_trylock(&victim->state_mutex)) {
 				victim = NULL;
+				retry--;
 				goto restart;
 			}
 
+			/*
+			 * Try again, if the victim appears to be in a good
+			 * runnable state inside spu_run.  It has a good
+			 * relative priority and is not waiting for a class 1
+			 * fault to be resolved.
+			 */
 			spu = victim->spu;
-			if (!spu || victim->prio <= ctx->prio) {
-				/*
-				 * This race can happen because we've dropped
-				 * the active list mutex.  Not a problem, just
-				 * restart the search.
-				 */
+			if (!spu ||
+			   (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags) &&
+			    (!victim->csa.class_1_dsisr) &&
+			    (victim->prio <= ctx->prio))) {
+
 				mutex_unlock(&victim->state_mutex);
 				victim = NULL;
+				retry--;
 				goto restart;
 			}
 
@@ -659,8 +685,14 @@ static struct spu *find_victim(struct sp
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
-			if (test_bit(SPU_SCHED_SPU_RUN, &victim->sched_flags))
-				spu_add_to_rq(victim);
+
+			/*
+			 * If the context was loaded, then it needs to be
+			 * put back on the runqueue.  We ignore the faulting
+			 * state of the context as it may have been loaded to
+			 * resolve an asynchronous problem state reference.
+			 */
+			spu_add_to_rq(victim);
 
 			mutex_unlock(&victim->state_mutex);
 
@@ -676,30 +708,55 @@ static void __spu_schedule(struct spu *s
 	int node = spu->node;
 	int success = 0;
 
+	BUG_ON(ctx->state == SPU_STATE_RUNNABLE);
+	BUG_ON(spu_on_rq(ctx));
+	BUG_ON(test_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags));
+
 	spu_set_timeslice(ctx);
 
 	mutex_lock(&cbe_spu_info[node].list_mutex);
 	if (spu->ctx == NULL) {
 		spu_bind_context(spu, ctx);
 		cbe_spu_info[node].nr_active++;
+		ctx->sched_count++;
 		spu->alloc_state = SPU_USED;
 		success = 1;
 	}
 	mutex_unlock(&cbe_spu_info[node].list_mutex);
 
-	if (success)
+	if (success) {
 		wake_up_all(&ctx->run_wq);
+		if (ctx->flags & SPU_CREATE_NOSCHED)
+			wake_up(&ctx->stop_wq);
+	}
 	else
 		spu_add_to_rq(ctx);
 }
 
-static void spu_schedule(struct spu *spu, struct spu_context *ctx)
+static int spu_schedule(struct spu *spu, struct spu_context *ctx)
 {
-	/* not a candidate for interruptible because it's called either
-	   from the scheduler thread or from spu_deactivate */
+	int ret = -1;
+
+	/*
+	 * This routine is invoked by the scheduler thread, yield, and
+	 * spu_deactivate.  Basically, anywhere grab_runnable_context is
+	 * used.  Need to recheck state under the context lock to ensure
+	 * that the context has not been scheduled or added to the
+	 * runqueue by spu_run.
+	 *
+	 * Note this lock is not a candidate for interruptible because we
+	 * can't afford not to take advantage of idle spus.
+	 */
 	mutex_lock(&ctx->state_mutex);
-	__spu_schedule(spu, ctx);
+	if ((ctx->state == SPU_STATE_SAVED) &&
+	    !test_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags)) {
+		spu_del_from_rq(ctx);
+		__spu_schedule(spu, ctx);
+		ret = 0;
+	}
 	spu_release(ctx);
+
+	return ret;
 }
 
 static void spu_unschedule(struct spu *spu, struct spu_context *ctx)
@@ -729,25 +786,22 @@ int spu_activate(struct spu_context *ctx
 	struct spu *spu;
 
 	/*
-	 * If there are multiple threads waiting for a single context
-	 * only one actually binds the context while the others will
-	 * only be able to acquire the state_mutex once the context
-	 * already is in runnable state.
+	 * If ctx is currently loaded or it is on the runqueue, it has
+	 * already been activated.  This might happen if a context was
+	 * timesliced at the same time an exception occurred as the
+	 * interrupt code can't take the state mutex.
 	 */
-	if (ctx->spu)
+	if (ctx->spu || spu_on_rq(ctx))
 		return 0;
 
-spu_activate_top:
 	if (signal_pending(current))
 		return -ERESTARTSYS;
 
 	spu = spu_get_idle(ctx);
-	/*
-	 * If this is a realtime thread we try to get it running by
-	 * preempting a lower priority thread.
-	 */
-	if (!spu && rt_prio(ctx->prio))
+
+	if (!spu)
 		spu = find_victim(ctx);
+
 	if (spu) {
 		unsigned long runcntl;
 
@@ -759,12 +813,10 @@ spu_activate_top:
 		return 0;
 	}
 
-	if (ctx->flags & SPU_CREATE_NOSCHED) {
+	if (ctx->flags & SPU_CREATE_NOSCHED)
 		spu_prio_wait(ctx);
-		goto spu_activate_top;
-	}
-
-	spu_add_to_rq(ctx);
+	else
+		spu_add_to_rq(ctx);
 
 	return 0;
 }
@@ -807,18 +859,22 @@ static int __spu_deactivate(struct spu_c
 
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
+
 		if (new || force) {
 			spu_unschedule(spu, ctx);
+
 			if (new) {
-				if (new->flags & SPU_CREATE_NOSCHED)
-					wake_up(&new->stop_wq);
-				else {
-					spu_release(ctx);
-					spu_schedule(spu, new);
-					/* this one can't easily be made
-					   interruptible */
-					mutex_lock(&ctx->state_mutex);
-				}
+				spu_release(ctx);
+				/*
+				 * This might fail to schedule 'new' context
+				 * in rare circumstances, but it doesn't
+				 * matter as the dedicated scheduler thread
+				 * handles idle spus.
+				 */
+				spu_schedule(spu, new);
+				/* this one can't easily be made
+				   interruptible */
+				mutex_lock(&ctx->state_mutex);
 			}
 		}
 	}
@@ -836,7 +892,16 @@ static int __spu_deactivate(struct spu_c
 void spu_deactivate(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_deactivate__enter, ctx);
+
+	/*
+	 * The scheduler pulls contexts off the runqueue without taking
+	 * the context lock.  It must recheck state under the mutex lock
+	 * to determine whether the context is still runnable.  See
+	 * the callers of grab_runnable_context.
+	 */
+	set_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
 	__spu_deactivate(ctx, 1, MAX_PRIO);
+	spu_del_from_rq(ctx);
 }
 
 /**
@@ -847,6 +912,13 @@ void spu_deactivate(struct spu_context *
  * unbind @ctx from the physical spu and schedule the highest
  * priority context to run on the freed physical spu instead.
  */
+void __spu_yield(struct spu_context *ctx)
+{
+	spu_context_nospu_trace(spu_yield__enter, ctx);
+	if (!(ctx->flags & SPU_CREATE_NOSCHED))
+		__spu_deactivate(ctx, 0, MAX_PRIO);
+}
+
 void spu_yield(struct spu_context *ctx)
 {
 	spu_context_nospu_trace(spu_yield__enter, ctx);
@@ -857,10 +929,12 @@ void spu_yield(struct spu_context *ctx)
 	}
 }
 
-static noinline void spusched_tick(struct spu_context *ctx)
+static noinline int spusched_tick(struct spu_context *ctx)
 {
 	struct spu_context *new = NULL;
 	struct spu *spu = NULL;
+	int active;
+	int ret = 0;
 
 	if (spu_acquire(ctx))
 		BUG();	/* a kernel thread never has signals pending */
@@ -872,7 +946,10 @@ static noinline void spusched_tick(struc
 	if (ctx->policy == SCHED_FIFO)
 		goto out;
 
-	if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+	active = test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags) &&
+		 !ctx->csa.class_1_dsisr;
+
+	if (--ctx->time_slice && active)
 		goto out;
 
 	spu = ctx->spu;
@@ -880,9 +957,10 @@ static noinline void spusched_tick(struc
 	spu_context_trace(spusched_tick__preempt, ctx, spu);
 
 	new = grab_runnable_context(ctx->prio + 1, spu->node);
+
 	if (new) {
 		spu_unschedule(spu, ctx);
-		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+		if (active)
 			spu_add_to_rq(ctx);
 	} else {
 		spu_context_nospu_trace(spusched_tick__newslice, ctx);
@@ -893,7 +971,9 @@ out:
 	spu_release(ctx);
 
 	if (new)
-		spu_schedule(spu, new);
+		ret = spu_schedule(spu, new);
+
+	return ret;
 }
 
 /**
@@ -946,8 +1026,12 @@ static void spuloadavg_wake(unsigned lon
 
 static int spusched_thread(void *unused)
 {
+	struct spu_context *ctx;
 	struct spu *spu;
 	int node;
+	int idle;
+	int ret;
+	int done;
 
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -955,18 +1039,54 @@ static int spusched_thread(void *unused)
 		for (node = 0; node < MAX_NUMNODES; node++) {
 			struct mutex *mtx = &cbe_spu_info[node].list_mutex;
 
+			idle = 0;
+
 			mutex_lock(mtx);
 			list_for_each_entry(spu, &cbe_spu_info[node].spus,
 					cbe_list) {
-				struct spu_context *ctx = spu->ctx;
+
+				ctx = spu->ctx;
 
 				if (ctx) {
 					mutex_unlock(mtx);
-					spusched_tick(ctx);
+					ret = spusched_tick(ctx);
 					mutex_lock(mtx);
+					if (ret)
+						idle++;
+				}
+				else {
+					idle++;
 				}
 			}
 			mutex_unlock(mtx);
+
+			done = 0;
+			while (!done && idle) {
+
+				ctx = grab_runnable_context(MAX_PRIO, node);
+				if (!ctx)
+					break;
+
+				/*
+				 * Recheck state under the lock to ensure
+				 * that the context has not been re-scheduled
+				 * by spu_run or deactivated.
+				 */
+				mutex_lock(&ctx->state_mutex);
+				if ((ctx->state == SPU_STATE_SAVED) &&
+				    !test_bit(SPU_SCHED_DEACTIVATE,
+						&ctx->sched_flags)) {
+					spu = spu_get_idle(ctx);
+					if (spu) {
+						spu_del_from_rq(ctx);
+						__spu_schedule(spu, ctx);
+					} else {
+						spu_add_to_rq(ctx);
+						done = 1;
+					}
+				}
+				mutex_unlock(&ctx->state_mutex);
+			}
 		}
 	}
 
Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/spufs.h
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -43,8 +43,8 @@ struct spu_gang;
 /* ctx->sched_flags */
 enum {
 	SPU_SCHED_NOTIFY_ACTIVE,
-	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
 	SPU_SCHED_SPU_RUN,	/* context is within spu_run */
+	SPU_SCHED_DEACTIVATE,
 };
 
 enum {
@@ -122,6 +122,7 @@ struct spu_context {
 	int policy;
 	int prio;
 	int last_ran;
+	int sched_count;
 
 	/* statistics */
 	struct {
@@ -282,6 +283,7 @@ int spu_stopped(struct spu_context *ctx,
 void spu_del_from_rq(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
+void __spu_yield(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
 void spu_switch_notify(struct spu *spu, struct spu_context *ctx);
 void spu_switch_log_notify(struct spu *spu, struct spu_context *ctx,
Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/fault.c
===================================================================
--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/fault.c
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/fault.c
@@ -136,8 +136,13 @@ int spufs_handle_class1(struct spu_conte
 		dsisr, ctx->state);
 
 	ctx->stats.hash_flt++;
-	if (ctx->state == SPU_STATE_RUNNABLE)
+	if (ctx->state == SPU_STATE_RUNNABLE) {
 		ctx->spu->stats.hash_flt++;
+		__spu_yield(ctx);
+	}
+	else {
+		spu_del_from_rq(ctx);
+	}
 
 	/* we must not hold the lock when entering spu_handle_mm_fault */
 	spu_release(ctx);
@@ -166,6 +171,9 @@ int spufs_handle_class1(struct spu_conte
 	ctx->csa.class_1_dar = ctx->csa.class_1_dsisr = 0;
 	smp_mb();
 
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		spu_activate(ctx, 0);
+
 	/*
 	 * If we handled the fault successfully and are in runnable
 	 * state, restart the DMA.
Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/file.c
===================================================================
--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/file.c
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/file.c
@@ -387,7 +387,32 @@ static unsigned long spufs_ps_nopfn(stru
 	if (ctx->state == SPU_STATE_SAVED) {
 		up_read(&current->mm->mmap_sem);
 		spu_context_nospu_trace(spufs_ps_nopfn__sleep, ctx);
-		ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+
+		/*
+		 * Activate context if it is inside spu_run.  It may
+		 * not be schedulable under its own control if it has
+		 * faulted.  This should provide concurrency between the
+		 * faulting thread and the target spu context enabling
+		 * the fault to be more quickly resolved.  find_victim
+		 * may even find a lazily loaded context to preempt
+		 * resulting in no downside at all.  On the other hand,
+		 * there is nothing to keep the context loaded if it
+		 * has faulted as it may become the target of another
+		 * activation.  For this reason, we don't have to worry
+		 * about the context being lazily loaded for an entire
+		 * quantum.  It might even be worth removing the test
+		 * for SPU_SCHED_SPU_RUN.
+		 */
+		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+			ret = spu_activate(ctx, 0);
+		else
+			ret = 0;
+		if (!ret) {
+			ret = spufs_wait(ctx->run_wq,
+					ctx->state == SPU_STATE_RUNNABLE);
+			if (ret)
+				goto refault;
+		}
 		spu_context_trace(spufs_ps_nopfn__wake, ctx, ctx->spu);
 		down_read(&current->mm->mmap_sem);
 	} else {
@@ -396,8 +421,7 @@ static unsigned long spufs_ps_nopfn(stru
 		spu_context_trace(spufs_ps_nopfn__insert, ctx, ctx->spu);
 	}
 
-	if (!ret)
-		spu_release(ctx);
+	spu_release(ctx);
 
 refault:
 	put_spu_context(ctx);
Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/run.c
===================================================================
--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/run.c
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/run.c
@@ -186,6 +186,8 @@ static int spu_run_init(struct spu_conte
 	int ret;
 
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
+	set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
+	clear_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
 
 	/*
 	 * NOSCHED is synchronous scheduling with respect to the caller.
@@ -243,7 +245,6 @@ static int spu_run_init(struct spu_conte
 		}
 	}
 
-	set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
 	return 0;
 }
 
Index: linux-2.6.25/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.25.orig/arch/powerpc/platforms/cell/spufs/context.c
+++ linux-2.6.25/arch/powerpc/platforms/cell/spufs/context.c
@@ -159,10 +159,7 @@ int spu_acquire_saved(struct spu_context
 	if (ret)
 		return ret;
 
-	if (ctx->state != SPU_STATE_SAVED) {
-		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
-		spu_deactivate(ctx);
-	}
+	spu_deactivate(ctx);
 
 	return 0;
 }
@@ -175,9 +172,10 @@ void spu_release_saved(struct spu_contex
 {
 	BUG_ON(ctx->state != SPU_STATE_SAVED);
 
-	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags) &&
-			test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+	if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
 		spu_activate(ctx, 0);
+	else
+		clear_bit(SPU_SCHED_DEACTIVATE, &ctx->sched_flags);
 
 	spu_release(ctx);
 }