[Cbe-oss-dev] [PATCH 8/10 v2] MARS: workload queue mutex protection

Yuji Mano Yuji.Mano at am.sony.com
Sat Aug 30 11:26:26 EST 2008


This patch adds mutex protection around accesses to the shared workload queue.

Prior to this patch the kernel scheduler updated the whole queue block with
atomic get/put operations, retrying whenever the reservation was lost. Now the
kernel scheduler locks the queue block's mutex for the duration of each access.
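
For reference, the MPU-side access pattern after this patch is roughly the
following sketch (mirroring the reserve_block()/release_block() changes below;
error handling omitted):

    uint64_t block_ea = queue_header.queue_ea +
            offsetof(struct mars_workload_queue, block) +
            sizeof(struct mars_workload_queue_block) * block;

    /* lock the block's mutex and DMA the block into local store */
    mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);

    /* ... examine or update queue_block.bits[index] ... */

    /* DMA the block back to main memory and release the mutex */
    mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);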

Host-side access to the queue blocks now also uses these mutex locks, keeping
the workload queue functions thread-safe.
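
On the host side, each queue function now brackets its access to a block with
the mutex embedded at the start of that block, roughly:

    struct mars_workload_queue_block *p =
            &queue->block[id / MARS_WORKLOAD_PER_BLOCK];

    mars_mutex_lock((struct mars_mutex *)p);
    /* ... check and update p->bits[id % MARS_WORKLOAD_PER_BLOCK] ... */
    mars_mutex_unlock((struct mars_mutex *)p);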

This also removes the previously used MARS context mutex. The workload queue
now manages its own per-block mutexes, so the workload model APIs no longer
need to lock the MARS context mutex.

Signed-off-by: Yuji Mano <yuji.mano at am.sony.com>
Acked-by: Kazunori Asayama <asayama at sm.sony.co.jp>

---
v2:
 - rebased patch

 include/common/mars/mars_workload_types.h |   10 -
 include/host/mars/mars.h                  |    2 
 include/mpu/mars/mars_kernel.h            |    3 
 src/host/lib/mars.c                       |   11 -
 src/host/lib/mars_workload_queue.c        |  156 ++++++++++++++++-----
 src/mpu/kernel/Makefile.am                |    3 
 src/mpu/kernel/mars_kernel_scheduler.c    |   73 ++--------
 src/mpu/kernel/mars_kernel_workload.c     |  218 ++++++++++++------------------
 8 files changed, 244 insertions(+), 232 deletions(-)

--- a/include/common/mars/mars_workload_types.h
+++ b/include/common/mars/mars_workload_types.h
@@ -67,9 +67,9 @@ extern "C" {
 #define MARS_WORKLOAD_SIGNAL_ON			0x1	/* signal set on */
 #define MARS_WORKLOAD_SIGNAL_OFF		0x0	/* signal set off */
 
-#define MARS_WORKLOAD_MAX			1024	/* wl max */
-#define MARS_WORKLOAD_PER_BLOCK			16	/* wl per block */
-#define MARS_WORKLOAD_NUM_BLOCKS		64	/* wl max / per block */
+#define MARS_WORKLOAD_PER_BLOCK			15	/* wl/block */
+#define MARS_WORKLOAD_NUM_BLOCKS		100	/* total blocks */
+#define MARS_WORKLOAD_MAX			1500	/* blocks * wl/block */
 
 #define MARS_WORKLOAD_CONTEXT_SIZE		128	/* size of 128 bytes */
 #define MARS_WORKLOAD_CONTEXT_ALIGN		128	/* align to 128 bytes */
@@ -85,6 +85,8 @@ struct mars_workload_context {
 
 /* 128 byte workload queue header structure */
 struct mars_workload_queue_header {
+	uint32_t lock;
+	uint32_t pad;
 	uint64_t queue_ea;
 	uint64_t context_ea;
 	uint8_t flag;
@@ -102,6 +104,8 @@ struct mars_workload_queue_block_bits {
 
 /* 128 byte workload queue block structure */
 struct mars_workload_queue_block {
+	uint32_t lock;
+	uint32_t pad;
 	struct mars_workload_queue_block_bits bits[MARS_WORKLOAD_PER_BLOCK];
 } __attribute__((aligned(MARS_WORKLOAD_QUEUE_BLOCK_ALIGN)));
 
--- a/include/host/mars/mars.h
+++ b/include/host/mars/mars.h
@@ -79,8 +79,6 @@ struct mars_context {
 	struct mars_params *params;
 	/* parameters for the MARS kernel */
 	struct mars_kernel_params *kernel_params;
-	/* mutex for the MARS context */
-	struct mars_mutex *mutex;
 	/* process queue where process requests are added */
 	struct mars_workload_queue *workload_queue;
 	/* array of mpu context threads */
--- a/include/mpu/mars/mars_kernel.h
+++ b/include/mpu/mars/mars_kernel.h
@@ -43,9 +43,10 @@ extern "C" {
 #endif
 
 #include <stdint.h>
+
 #include "mars/mars_dma.h"
+#include "mars/mars_mutex.h"
 #include "mars/mars_timer.h"
-#include "mars/mars_atomic.h"
 #include "mars/mars_error.h"
 #include "mars/mars_kernel_types.h"
 #include "mars/mars_workload_types.h"
--- a/src/host/lib/mars.c
+++ b/src/host/lib/mars.c
@@ -156,16 +156,6 @@ int mars_initialize(struct mars_context 
 	MARS_CHECK_CLEANUP_RET(mars->kernel_params, mars_finalize(mars),
 				MARS_ERROR_MEMORY);
 
-	/* allocate mutex */
-	mars->mutex = (struct mars_mutex *)
-		memalign(MARS_MUTEX_ALIGN, sizeof(struct mars_mutex));
-	MARS_CHECK_CLEANUP_RET(mars->mutex, mars_finalize(mars),
-				MARS_ERROR_MEMORY);
-
-	/* initialize mutex */
-	ret = mars_mutex_initialize(mars->mutex);
-	MARS_CHECK_CLEANUP_RET(ret == MARS_SUCCESS, mars_finalize(mars), ret);
-
 	/* allocate workload queue */
 	mars->workload_queue = (struct mars_workload_queue *)
 		memalign(MARS_WORKLOAD_QUEUE_ALIGN,
@@ -211,7 +201,6 @@ int mars_finalize(struct mars_context *m
 	/* free allocated memory */
 	free(mars->params);
 	free(mars->kernel_params);
-	free(mars->mutex);
 	free(mars->workload_queue);
 	free(mars->mpu_context_threads);
 
--- a/src/host/lib/mars_workload_queue.c
+++ b/src/host/lib/mars_workload_queue.c
@@ -48,12 +48,32 @@ int workload_queue_initialize(struct mar
 {
 	MARS_CHECK_RET(queue, MARS_ERROR_NULL);
 
-	memset(queue, 0, sizeof(struct mars_workload_queue));
+	int block;
+	int index;
 
 	queue->header.queue_ea = (uint64_t)(uintptr_t)queue;
 	queue->header.context_ea = (uint64_t)(uintptr_t)&queue->context;
 	queue->header.flag = MARS_FLAG_NONE;
 
+	/* initialize workload queue blocks */
+	for (block = 0; block < MARS_WORKLOAD_NUM_BLOCKS; block++) {
+		struct mars_workload_queue_block *p = &queue->block[block];
+
+		mars_mutex_initialize((struct mars_mutex *)p);
+		mars_mutex_lock((struct mars_mutex *)p);
+
+		for (index = 0; index < MARS_WORKLOAD_PER_BLOCK; index++) {
+			p->bits[index].type = MARS_WORKLOAD_TYPE_NONE;
+			p->bits[index].state = MARS_WORKLOAD_STATE_NONE;
+			p->bits[index].priority = MARS_WORKLOAD_PRIORITY_MIN;
+			p->bits[index].counter = MARS_WORKLOAD_COUNTER_MIN;
+			p->bits[index].signal = MARS_WORKLOAD_SIGNAL_OFF;
+			p->bits[index].wait = MARS_WORKLOAD_ID_NONE;
+		}
+
+		mars_mutex_unlock((struct mars_mutex *)p);
+	}
+
 	return MARS_SUCCESS;
 }
 
@@ -99,25 +119,37 @@ int workload_queue_add_begin(struct mars
 
 	*id = 0;
 
-	/* find first available empty slot */
-	while (*id < MARS_WORKLOAD_MAX) {
-		block = *id / MARS_WORKLOAD_PER_BLOCK;
-		index = *id % MARS_WORKLOAD_PER_BLOCK;
-
-		if (queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_NONE)
-			break;
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
 
+	while (queue->block[block].bits[index].state !=
+		MARS_WORKLOAD_STATE_NONE) {
 		(*id)++;
+		index++;
+		if (index == MARS_WORKLOAD_PER_BLOCK) {
+			index = 0;
+
+			mars_mutex_unlock(
+				(struct mars_mutex *)&queue->block[block]);
+
+			if (++block == MARS_WORKLOAD_NUM_BLOCKS)
+				return MARS_ERROR_LIMIT;
+
+			mars_mutex_lock(
+				(struct mars_mutex *)&queue->block[block]);
+		}
 	}
 
-	/* no empty slot found - workload queue is full */
-	MARS_CHECK_RET(*id < MARS_WORKLOAD_MAX, MARS_ERROR_LIMIT);
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_NONE,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set type and set state to adding */
 	queue->block[block].bits[index].type = type;
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_ADDING;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	/* if requested set workload context pointer to return */
 	if (workload)
 		*workload = &queue->context[*id];
@@ -133,12 +165,18 @@ int workload_queue_add_end(struct mars_w
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_ADDING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_ADDING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* reset workload queue bits and set state to finished state */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_FINISHED;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -150,12 +188,18 @@ int workload_queue_add_cancel(struct mar
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_ADDING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_ADDING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state back to none state */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_NONE;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -168,12 +212,18 @@ int workload_queue_remove_begin(struct m
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_FINISHED, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_FINISHED,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state to removing */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_REMOVING;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	/* if requested set workload context pointer to return */
 	if (workload)
 		*workload = &queue->context[id];
@@ -189,12 +239,18 @@ int workload_queue_remove_end(struct mar
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_REMOVING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_REMOVING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state to none */
 	queue->block[block].bits[index].type = MARS_WORKLOAD_TYPE_NONE;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -206,12 +262,18 @@ int workload_queue_remove_cancel(struct 
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_REMOVING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_REMOVING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state back to finished */
 	queue->block[block].bits[index].type = MARS_WORKLOAD_STATE_FINISHED;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -225,8 +287,12 @@ int workload_queue_schedule_begin(struct
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_FINISHED, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_FINISHED,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* reset workload queue bits and set state to scheduling */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_SCHEDULING;
@@ -235,6 +301,8 @@ int workload_queue_schedule_begin(struct
 	queue->block[block].bits[index].signal = MARS_WORKLOAD_SIGNAL_OFF;
 	queue->block[block].bits[index].wait = MARS_WORKLOAD_ID_NONE;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	/* if requested set workload context pointer to return */
 	if (workload)
 		*workload = &queue->context[id];
@@ -250,12 +318,18 @@ int workload_queue_schedule_end(struct m
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_SCHEDULING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_SCHEDULING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state to ready */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_READY;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -268,12 +342,18 @@ int workload_queue_schedule_cancel(struc
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state ==
-			MARS_WORKLOAD_STATE_SCHEDULING, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state ==
+		MARS_WORKLOAD_STATE_SCHEDULING,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	/* set state back to finished */
 	queue->block[block].bits[index].state = MARS_WORKLOAD_STATE_FINISHED;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
 
@@ -286,12 +366,13 @@ int workload_queue_wait(struct mars_work
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state !=
-			MARS_WORKLOAD_STATE_NONE, MARS_ERROR_STATE);
-
 	while (queue->block[block].bits[index].state !=
-		MARS_WORKLOAD_STATE_FINISHED)
+		MARS_WORKLOAD_STATE_FINISHED) {
+		MARS_CHECK_RET(queue->block[block].bits[index].state !=
+			MARS_WORKLOAD_STATE_NONE,
+			MARS_ERROR_STATE);
 		sched_yield();
+	}
 
 	/* if requested set workload context pointer to return */
 	if (workload)
@@ -310,7 +391,8 @@ int workload_queue_try_wait(struct mars_
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
 	MARS_CHECK_RET(queue->block[block].bits[index].state !=
-			MARS_WORKLOAD_STATE_NONE, MARS_ERROR_STATE);
+		MARS_WORKLOAD_STATE_NONE,
+		MARS_ERROR_STATE);
 
 	if (queue->block[block].bits[index].state !=
 		MARS_WORKLOAD_STATE_FINISHED)
@@ -331,10 +413,16 @@ int workload_queue_signal_send(struct ma
 	int block = id / MARS_WORKLOAD_PER_BLOCK;
 	int index = id % MARS_WORKLOAD_PER_BLOCK;
 
-	MARS_CHECK_RET(queue->block[block].bits[index].state !=
-			MARS_WORKLOAD_STATE_NONE, MARS_ERROR_STATE);
+	mars_mutex_lock((struct mars_mutex *)&queue->block[block]);
+
+	MARS_CHECK_CLEANUP_RET(queue->block[block].bits[index].state !=
+		MARS_WORKLOAD_STATE_NONE,
+		mars_mutex_unlock((struct mars_mutex *)&queue->block[block]),
+		MARS_ERROR_STATE);
 
 	queue->block[block].bits[index].signal = MARS_WORKLOAD_SIGNAL_ON;
 
+	mars_mutex_unlock((struct mars_mutex *)&queue->block[block]);
+
 	return MARS_SUCCESS;
 }
--- a/src/mpu/kernel/Makefile.am
+++ b/src/mpu/kernel/Makefile.am
@@ -54,11 +54,14 @@ mars_kernel_CFLAGS = \
 	-Wmissing-declarations
 
 mars_kernel_LDFLAGS = \
+	-L../lib \
 	-Wl,--defsym=__stack=0x0fff0 \
 	-Wl,-gc-sections \
 	-Wl,-N \
 	-Wl,-s
 
+mars_kernel_LDADD = -lmars
+
 mars_kernel_SOURCES = \
 	mars_kernel.c \
 	mars_kernel_registers.c \
--- a/src/mpu/kernel/mars_kernel_scheduler.c
+++ b/src/mpu/kernel/mars_kernel_scheduler.c
@@ -137,75 +137,40 @@ static int search_block(int block)
 
 static int reserve_block(int block)
 {
-	int status;
 	int index;
-
-	atomic_event_setup();
-
-	/* attempt to reserve workload block until successful */
-	do {
-		/* reserve cache line of workload queue block */
-		atomic_get(&queue_block,
-			queue_header.queue_ea +
+	uint64_t block_ea = queue_header.queue_ea +
 			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+			sizeof(struct mars_workload_queue_block) * block;
 
-		/* set the workload index */
-		index = search_block(block);
-		if (index >= 0) {
-			/* update the current state of the workload */
-			queue_block.bits[index].state =
-				MARS_WORKLOAD_STATE_RUNNING;
-
-			/* reset the counter for reserved workload */
-			queue_block.bits[index].counter =
-				MARS_WORKLOAD_COUNTER_MIN;
-		}
-
-		/* attempt to write back workload queue block to cache line */
-		status = atomic_put(&queue_block,
-			queue_header.queue_ea +
-			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
+
+	/* set the workload index */
+	index = search_block(block);
+	if (index >= 0) {
+		/* update the current state of the workload */
+		queue_block.bits[index].state = MARS_WORKLOAD_STATE_RUNNING;
 
-		/* write back failed so wait until reservation is lost */
-		if (status)
-			atomic_event_wait();
-	} while (status);
+		/* reset the counter for reserved workload */
+		queue_block.bits[index].counter = MARS_WORKLOAD_COUNTER_MIN;
+	}
 
-	atomic_event_restore();
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	return index;
 }
 
 static void release_block(int block, int index)
 {
-	int status;
-
-	atomic_event_setup();
-
-	do {
-		/* reserve cache line of workload queue block */
-		atomic_get(&queue_block,
-			queue_header.queue_ea +
+	uint64_t block_ea = queue_header.queue_ea +
 			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+			sizeof(struct mars_workload_queue_block) * block;
 
-		/* update current workload state in workload queue block */
-		queue_block.bits[index].state = workload_state;
-
-		/* attempt to write back workload queue block to cache line */
-		status = atomic_put(&queue_block,
-			queue_header.queue_ea +
-			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
 
-		/* write back failed so wait until reservation is lost */
-		if (status)
-			atomic_event_wait();
-	} while (status);
+	/* update current workload state in workload queue block */
+	queue_block.bits[index].state = workload_state;
 
-	atomic_event_restore();
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 }
 
 int reserve_workload(void)
--- a/src/mpu/kernel/mars_kernel_workload.c
+++ b/src/mpu/kernel/mars_kernel_workload.c
@@ -131,69 +131,52 @@ int workload_schedule(uint16_t workload_
 
 	int block = workload_id / MARS_WORKLOAD_PER_BLOCK;
 	int index = workload_id % MARS_WORKLOAD_PER_BLOCK;
-	int status;
 
-	atomic_event_setup();
-
-	/* attempt to schedule workload until successful */
-	do {
-		/* reserve cache line of workload queue block */
-		atomic_get(&queue_block,
-			queue_header.queue_ea +
+	uint64_t block_ea = queue_header.queue_ea +
 			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+			sizeof(struct mars_workload_queue_block) * block;
+
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
+
+	/* make sure workload is in the correct state */
+	if (queue_block.bits[index].state != MARS_WORKLOAD_STATE_FINISHED) {
+		mars_mutex_unlock_put(block_ea,
+			(struct mars_mutex *)&queue_block);
+		return MARS_ERROR_STATE;
+	}
 
-		/* make sure workload is in the correct state */
-		if (queue_block.bits[index].state !=
-			MARS_WORKLOAD_STATE_FINISHED) {
-			atomic_event_restore();
-			return MARS_ERROR_STATE;
-		}
-
-		/* get information of workload to schedule */
-		schedule_workload_type = queue_block.bits[index].type;
-		schedule_workload_ea = queue_header.context_ea +
+	/* get information of workload to schedule */
+	schedule_workload_type = queue_block.bits[index].type;
+	schedule_workload_ea = queue_header.context_ea +
 			workload_id * sizeof(struct mars_workload_context);
 
-		/* dma the workload context code into LS from main memory */
-		mars_dma_get_and_wait((void *)&schedule_workload,
-					schedule_workload_ea,
-					sizeof(struct mars_workload_context),
-					MARS_DMA_TAG);
-
-		/* workload type specific handling */
-		switch (schedule_workload_type) {
-		case MARS_WORKLOAD_TYPE_TASK:
-			task_schedule((struct mars_task_context *)
-					&schedule_workload,
-					(struct mars_task_args *)args);
-			break;
-		}
-
-		/* dma the workload context code into main memory from LS */
-		mars_dma_put_and_wait((void *)&schedule_workload,
-					schedule_workload_ea,
-					sizeof(struct mars_workload_context),
-					MARS_DMA_TAG);
-
-		queue_block.bits[index].state = MARS_WORKLOAD_STATE_READY;
-		queue_block.bits[index].priority = priority;
-		queue_block.bits[index].counter = MARS_WORKLOAD_COUNTER_MIN;
-		queue_block.bits[index].signal = MARS_WORKLOAD_SIGNAL_OFF;
-		queue_block.bits[index].wait = MARS_WORKLOAD_ID_NONE;
-
-		/* attempt to write back workload queue block to cache line */
-		status = atomic_put(&queue_block,
-			queue_header.queue_ea +
-			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+	/* dma the workload context code into LS from main memory */
+	mars_dma_get_and_wait((void *)&schedule_workload,
+				schedule_workload_ea,
+				sizeof(struct mars_workload_context),
+				MARS_DMA_TAG);
 
-		/* write back failed so wait until reservation is lost */
-		if (status)
-			atomic_event_wait();
-	} while (status);
+	/* workload type specific handling */
+	switch (schedule_workload_type) {
+	case MARS_WORKLOAD_TYPE_TASK:
+		task_schedule((struct mars_task_context *)&schedule_workload,
+				(struct mars_task_args *)args);
+		break;
+	}
+
+	/* dma the workload context code into main memory from LS */
+	mars_dma_put_and_wait((void *)&schedule_workload,
+				schedule_workload_ea,
+				sizeof(struct mars_workload_context),
+				MARS_DMA_TAG);
+
+	queue_block.bits[index].state = MARS_WORKLOAD_STATE_READY;
+	queue_block.bits[index].priority = priority;
+	queue_block.bits[index].counter = MARS_WORKLOAD_COUNTER_MIN;
+	queue_block.bits[index].signal = MARS_WORKLOAD_SIGNAL_OFF;
+	queue_block.bits[index].wait = MARS_WORKLOAD_ID_NONE;
 
-	atomic_event_restore();
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	return MARS_SUCCESS;
 }
@@ -205,38 +188,24 @@ int workload_wait(uint16_t workload_id)
 
 	int block = workload_index / MARS_WORKLOAD_PER_BLOCK;
 	int index = workload_index % MARS_WORKLOAD_PER_BLOCK;
-	int status;
 
-	atomic_event_setup();
-
-	/* attempt to update workload queue block until successful */
-	do {
-		atomic_get(&queue_block,
-			queue_header.queue_ea +
+	uint64_t block_ea = queue_header.queue_ea +
 			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+			sizeof(struct mars_workload_queue_block) * block;
 
-		/* make sure workload is initialized */
-		if (queue_block.bits[index].state == MARS_WORKLOAD_STATE_NONE) {
-			atomic_event_restore();
-			return MARS_ERROR_STATE;
-		}
-
-		/* set the workload id to wait for */
-		queue_block.bits[index].wait = workload_id;
-
-		/* attempt to write back workload queue block to cache line */
-		status = atomic_put(&queue_block,
-			queue_header.queue_ea +
-			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
+
+	/* make sure workload is initialized */
+	if (queue_block.bits[index].state == MARS_WORKLOAD_STATE_NONE) {
+		mars_mutex_unlock_put(block_ea,
+			(struct mars_mutex *)&queue_block);
+		return MARS_ERROR_STATE;
+	}
 
-		/* write back failed so wait until reservation is lost */
-		if (status)
-			atomic_event_wait();
-	} while (status);
+	/* set the workload id to wait for */
+	queue_block.bits[index].wait = workload_id;
 
-	atomic_event_restore();
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	return MARS_SUCCESS;
 }
@@ -249,14 +218,19 @@ int workload_try_wait(uint16_t workload_
 	int block = workload_id / MARS_WORKLOAD_PER_BLOCK;
 	int index = workload_id % MARS_WORKLOAD_PER_BLOCK;
 
-	atomic_get(&queue_block,
-		queue_header.queue_ea +
-		offsetof(struct mars_workload_queue, block) +
-		sizeof(struct mars_workload_queue_block) * block);
-
-	MARS_CHECK_RET(queue_block.bits[index].state !=
-			MARS_WORKLOAD_STATE_NONE,
-			MARS_ERROR_STATE);
+	uint64_t block_ea = queue_header.queue_ea +
+			offsetof(struct mars_workload_queue, block) +
+			sizeof(struct mars_workload_queue_block) * block;
+
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
+
+	MARS_CHECK_CLEANUP_RET(queue_block.bits[index].state !=
+		MARS_WORKLOAD_STATE_NONE,
+		mars_mutex_unlock_put(block_ea,
+			(struct mars_mutex *)&queue_block),
+		MARS_ERROR_STATE);
+
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	if (queue_block.bits[index].state != MARS_WORKLOAD_STATE_FINISHED)
 		return MARS_ERROR_BUSY;
@@ -271,39 +245,24 @@ int workload_signal_send(uint16_t worklo
 
 	int block = workload_id / MARS_WORKLOAD_PER_BLOCK;
 	int index = workload_id % MARS_WORKLOAD_PER_BLOCK;
-	int status;
-
-	atomic_event_setup();
 
-	/* attempt to update workload queue block until successful */
-	do {
-		/* reserve cache line of workload queue block */
-		atomic_get(&queue_block,
-			queue_header.queue_ea +
+	uint64_t block_ea = queue_header.queue_ea +
 			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+			sizeof(struct mars_workload_queue_block) * block;
 
-		/* make sure workload is initialized */
-		if (queue_block.bits[index].state == MARS_WORKLOAD_STATE_NONE) {
-			atomic_event_restore();
-			return MARS_ERROR_STATE;
-		}
-
-		/* set the workload signal */
-		queue_block.bits[index].signal = MARS_WORKLOAD_SIGNAL_ON;
-
-		/* attempt to write back workload queue block to cache line */
-		status = atomic_put(&queue_block,
-			queue_header.queue_ea +
-			offsetof(struct mars_workload_queue, block) +
-			sizeof(struct mars_workload_queue_block) * block);
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
 
-		/* write back failed so wait until reservation is lost */
-		if (status)
-			atomic_event_wait();
-	} while (status);
+	/* make sure workload is initialized */
+	if (queue_block.bits[index].state == MARS_WORKLOAD_STATE_NONE) {
+		mars_mutex_unlock_put(block_ea,
+			(struct mars_mutex *)&queue_block);
+		return MARS_ERROR_STATE;
+	}
+
+	/* set the workload signal */
+	queue_block.bits[index].signal = MARS_WORKLOAD_SIGNAL_ON;
 
-	atomic_event_restore();
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	return MARS_SUCCESS;
 }
@@ -330,14 +289,19 @@ int workload_signal_try_wait(void)
 	int block = workload_index / MARS_WORKLOAD_PER_BLOCK;
 	int index = workload_index % MARS_WORKLOAD_PER_BLOCK;
 
-	atomic_get(&queue_block,
-		queue_header.queue_ea +
-		offsetof(struct mars_workload_queue, block) +
-		sizeof(struct mars_workload_queue_block) * block);
-
-	MARS_CHECK_RET(queue_block.bits[index].state !=
-			MARS_WORKLOAD_STATE_NONE,
-			MARS_ERROR_STATE);
+	uint64_t block_ea = queue_header.queue_ea +
+			offsetof(struct mars_workload_queue, block) +
+			sizeof(struct mars_workload_queue_block) * block;
+
+	mars_mutex_lock_get(block_ea, (struct mars_mutex *)&queue_block);
+
+	MARS_CHECK_CLEANUP_RET(queue_block.bits[index].state !=
+		MARS_WORKLOAD_STATE_NONE,
+		mars_mutex_unlock_put(block_ea,
+			(struct mars_mutex *)&queue_block),
+		MARS_ERROR_STATE);
+
+	mars_mutex_unlock_put(block_ea, (struct mars_mutex *)&queue_block);
 
 	/* return busy if task has not received signal */
 	if (queue_block.bits[index].signal != MARS_WORKLOAD_SIGNAL_ON)





