[Cbe-oss-dev] [PATCH] spusched: extend list_mutex coverage

Christoph Hellwig hch at lst.de
Tue Jul 22 19:16:56 EST 2008


On Tue, Jul 15, 2008 at 06:08:12AM +0200, Christoph Hellwig wrote:
> spu->ctx is only stable while the list_mutex is held.  Extend
> hold times of the lock in find_victim and spusched_tick to avoid potentially
> dangling dereferences.  To do so we have to switch to a trylock
> and opencode spu_schedule in spusched_tick which is rather ugly, but I
> can't find a good way around it.

ping?  This is quite an important fix for a bug found by the Boeblingen
Test team.

> Signed-off-by: Christoph Hellwig <hch at lst.de>
> 
> Index: linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c
> ===================================================================
> --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/sched.c	2008-07-15 06:02:19.000000000 +0200
> +++ linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c	2008-07-15 06:03:40.000000000 +0200
> @@ -633,7 +633,6 @@ static struct spu *find_victim(struct sp
>  			    (!victim || tmp->prio > victim->prio))
>  				victim = spu->ctx;
>  		}
> -		mutex_unlock(&cbe_spu_info[node].list_mutex);
>  
>  		if (victim) {
>  			/*
> @@ -647,6 +646,7 @@ static struct spu *find_victim(struct sp
>  			 * look at another context or give up after X retries.
>  			 */
>  			if (!mutex_trylock(&victim->state_mutex)) {
> +				mutex_unlock(&cbe_spu_info[node].list_mutex);
>  				victim = NULL;
>  				goto restart;
>  			}
> @@ -659,16 +659,15 @@ static struct spu *find_victim(struct sp
>  				 * restart the search.
>  				 */
>  				mutex_unlock(&victim->state_mutex);
> +				mutex_unlock(&cbe_spu_info[node].list_mutex);
>  				victim = NULL;
>  				goto restart;
>  			}
>  
>  			spu_context_trace(__spu_deactivate__unload, ctx, spu);
>  
> -			mutex_lock(&cbe_spu_info[node].list_mutex);
>  			cbe_spu_info[node].nr_active--;
>  			spu_unbind_context(spu, victim);
> -			mutex_unlock(&cbe_spu_info[node].list_mutex);
>  
>  			victim->stats.invol_ctx_switch++;
>  			spu->stats.invol_ctx_switch++;
> @@ -676,9 +675,10 @@ static struct spu *find_victim(struct sp
>  				spu_add_to_rq(victim);
>  
>  			mutex_unlock(&victim->state_mutex);
> -
> +			mutex_unlock(&cbe_spu_info[node].list_mutex);
>  			return spu;
>  		}
> +		mutex_unlock(&cbe_spu_info[node].list_mutex);
>  	}
>  
>  	return NULL;
> @@ -715,16 +715,21 @@ static void spu_schedule(struct spu *spu
>  	spu_release(ctx);
>  }
>  
> -static void spu_unschedule(struct spu *spu, struct spu_context *ctx)
> +static void __spu_unschedule(struct spu *spu, struct spu_context *ctx)
>  {
> -	int node = spu->node;
> -
> -	mutex_lock(&cbe_spu_info[node].list_mutex);
> -	cbe_spu_info[node].nr_active--;
> +	cbe_spu_info[spu->node].nr_active--;
>  	spu->alloc_state = SPU_FREE;
>  	spu_unbind_context(spu, ctx);
>  	ctx->stats.invol_ctx_switch++;
>  	spu->stats.invol_ctx_switch++;
> +}
> +
> +static void spu_unschedule(struct spu *spu, struct spu_context *ctx)
> +{
> +	int node = spu->node;
> +
> +	mutex_lock(&cbe_spu_info[node].list_mutex);
> +	__spu_unschedule(spu, ctx);
>  	mutex_unlock(&cbe_spu_info[node].list_mutex);
>  }
>  
> @@ -875,8 +880,13 @@ static noinline void spusched_tick(struc
>  	struct spu_context *new = NULL;
>  	struct spu *spu = NULL;
>  
> -	if (spu_acquire(ctx))
> -		BUG();	/* a kernel thread never has signals pending */
> +	/*
> +	 * If we can't lock the context give it another timeslice.
> +	 */
> +	if (!mutex_trylock(&ctx->state_mutex)) {
> +		ctx->time_slice++;
> +		return;
> +	}
>  
>  	if (ctx->state != SPU_STATE_RUNNABLE)
>  		goto out;
> @@ -892,9 +902,16 @@ static noinline void spusched_tick(struc
>  
>  	spu_context_trace(spusched_tick__preempt, ctx, spu);
>  
> -	new = grab_runnable_context(ctx->prio + 1, spu->node);
> +	/*
> +	 * Try to find a new context on the runqueue that we can lock
> +	 * without sleeping.
> +	 */
> +	do {
> +		new = grab_runnable_context(ctx->prio + 1, spu->node);
> +	} while (new && !mutex_trylock(&new->state_mutex));
> +
>  	if (new) {
> -		spu_unschedule(spu, ctx);
> +		__spu_unschedule(spu, ctx);
>  		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
>  			spu_add_to_rq(ctx);
>  	} else {
> @@ -904,8 +921,24 @@ static noinline void spusched_tick(struc
>  out:
>  	spu_release(ctx);
>  
> -	if (new)
> -		spu_schedule(spu, new);
> +	/*
> +	 * This is an opencoded spu_schedule accounting for the fact
> +	 * that we have the list_lock already taken.
> +	 *
> +	 * Not pretty.
> +	 */
> +	if (new) {
> +		int node = spu->node;
> +
> +		BUG_ON(spu->ctx);
> +
> +		spu_set_timeslice(new);
> +		spu_bind_context(spu, new);
> +		cbe_spu_info[node].nr_active++;
> +		spu->alloc_state = SPU_USED;
> +		wake_up_all(&new->run_wq);
> +		spu_release(new);
> +	}
>  }
>  
>  /**
> @@ -972,11 +1005,8 @@ static int spusched_thread(void *unused)
>  					cbe_list) {
>  				struct spu_context *ctx = spu->ctx;
>  
> -				if (ctx) {
> -					mutex_unlock(mtx);
> +				if (ctx)
>  					spusched_tick(ctx);
> -					mutex_lock(mtx);
> -				}
>  			}
>  			mutex_unlock(mtx);
>  		}
> _______________________________________________
> cbe-oss-dev mailing list
> cbe-oss-dev at ozlabs.org
> https://ozlabs.org/mailman/listinfo/cbe-oss-dev
---end quoted text---



More information about the cbe-oss-dev mailing list