[Cbe-oss-dev] [RFC, PATCH] CELL Oprofile SPU profiling updated patch

Tue Feb 27 10:50:59 EST 2007

On Thursday 22 February 2007, Carl Love wrote:
> This patch updates the existing arch/powerpc/oprofile/op_model_cell.c
> to add in the SPU profiling capabilities.  In addition, a 'cell' subdirectory
> was added to arch/powerpc/oprofile to hold Cell-specific SPU profiling
> code.

There was a significant amount of whitespace breakage in this patch,
which I cleaned up. The patch below consists of the other things
I changed as a further cleanup. Note that I changed the format
of the context switch record, which I found too complicated, as
I described on IRC last week.

	Arnd <><

--
Subject: cleanup spu oprofile code

From: Arnd Bergmann <arnd.bergmann at de.ibm.com>
This cleans up some of the new oprofile code. It's mostly
cosmetic changes, like way multi-line comments are formatted.
The most significant change is a simplification of the
context-switch record format.

It does mean the oprofile report tool needs to be adapted,
but I'm sure that it pays off in the end.

Signed-off-by: Arnd Bergmann <arnd.bergmann at de.ibm.com>
Index: linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
===================================================================

--- linux-2.6.orig/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ linux-2.6/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -61,11 +61,12 @@ static void destroy_cached_info(struct k
 static struct cached_info * get_cached_info(struct spu * the_spu, int spu_num)
 {
 	struct kref * ref;
-	struct cached_info * ret_info = NULL;
+	struct cached_info * ret_info;
 	if (spu_num >= num_spu_nodes) {
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: Invalid index %d into spu info cache\n",
 		       __FUNCTION__, __LINE__, spu_num);
+		ret_info = NULL;
 		goto out;
 	}
 	if (!spu_info[spu_num] && the_spu) {
@@ -89,9 +90,9 @@ static struct cached_info * get_cached_i
 static int
 prepare_cached_spu_info(struct spu * spu, unsigned int objectId)
 {
-	unsigned long flags = 0;
+	unsigned long flags;
 	struct vma_to_fileoffset_map * new_map;
-	int retval = 0;
+	int retval;
 	struct cached_info * info;
 
 	/* We won't bother getting cache_lock here since
@@ -112,6 +113,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
 		goto err_alloc;
 	}
 	new_map = create_vma_map(spu, objectId);
@@ -119,6 +121,7 @@ prepare_cached_spu_info(struct spu * spu
 		printk(KERN_ERR "SPU_PROF: "
 		       "%s, line %d: create vma_map failed\n",
 		       __FUNCTION__, __LINE__);
+		retval = -ENOMEM;
 		goto err_alloc;
 	}
 
@@ -144,7 +147,7 @@ prepare_cached_spu_info(struct spu * spu
 	goto out;
 
 err_alloc:
-	retval = -1;
+	kfree(info);
 out:
 	return retval;
 }
@@ -215,11 +218,9 @@ static inline unsigned long fast_get_dco
 static unsigned long
 get_exec_dcookie_and_offset(struct spu * spu, unsigned int * offsetp,
 			    unsigned long * spu_bin_dcookie,
-			    unsigned long * shlib_dcookie,
 			    unsigned int spu_ref)
 {
 	unsigned long app_cookie = 0;
-	unsigned long * image_cookie = NULL;
 	unsigned int my_offset = 0;
 	struct file * app = NULL;
 	struct vm_area_struct * vma;
@@ -252,24 +253,17 @@ get_exec_dcookie_and_offset(struct spu *
 			 my_offset, spu_ref,
 			 vma->vm_file->f_dentry->d_name.name);
 		*offsetp = my_offset;
-		if (my_offset == 0)
-			image_cookie = spu_bin_dcookie;
-		else if (vma->vm_file != app)
-			image_cookie = shlib_dcookie;
 		break;
 	}
 
-	if (image_cookie) {
-		*image_cookie = fast_get_dcookie(vma->vm_file->f_dentry,
+	*spu_bin_dcookie = fast_get_dcookie(vma->vm_file->f_dentry,
 						 vma->vm_file->f_vfsmnt);
-		pr_debug("got dcookie for %s\n",
-			 vma->vm_file->f_dentry->d_name.name);
-	}
+	pr_debug("got dcookie for %s\n", vma->vm_file->f_dentry->d_name.name);
 
- out:
+out:
 	return app_cookie;
 
- fail_no_image_cookie:
+fail_no_image_cookie:
 	printk(KERN_ERR "SPU_PROF: "
 		"%s, line %d: Cannot find dcookie for SPU binary\n",
 		__FUNCTION__, __LINE__);
@@ -285,18 +279,18 @@ get_exec_dcookie_and_offset(struct spu *
 static int process_context_switch(struct spu * spu, unsigned int objectId)
 {
 	unsigned long flags;
-	int retval = 0;
-	unsigned int offset = 0;
-	unsigned long spu_cookie = 0, app_dcookie = 0, shlib_cookie = 0;
+	int retval;
+	unsigned int offset;
+	unsigned long spu_cookie, app_dcookie;
+
 	retval = prepare_cached_spu_info(spu, objectId);
-	if (retval == -1) {
+	if (retval)
 		goto out;
-	}
+
 	/* Get dcookie first because a mutex_lock is taken in that
 	 * code path, so interrupts must not be disabled.
 	 */
-	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie,
-						  &shlib_cookie, objectId);
+	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie, objectId);
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
@@ -306,27 +300,8 @@ static int process_context_switch(struct
 	add_event_entry(spu->pid);
 	add_event_entry(spu->tgid);
 	add_event_entry(app_dcookie);
-
-	if (offset) {
-		/* When offset is non-zero, the SPU ELF was embedded;
-		 * otherwise, it was loaded from a separate binary file. For
-		 * embedded case, we record the offset into the embedding file
-		 * where the SPU ELF was placed.  The embedding file may be
-		 * either the executable application binary or shared library.
-		 * For the non-embedded case, we record a dcookie that
-		 * points to the location of the separate SPU binary that was
-		 * loaded.
-		 */
-		if (shlib_cookie) {
-			add_event_entry(SPU_SHLIB_COOKIE_CODE);
-			add_event_entry(shlib_cookie);
-		}
-		add_event_entry(SPU_OFFSET_CODE);
-		add_event_entry(offset);
-	} else {
-		add_event_entry(SPU_COOKIE_CODE);
-		add_event_entry(spu_cookie);
-	}
+	add_event_entry(spu_cookie);
+	add_event_entry(offset);
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();
 out:
@@ -343,8 +318,8 @@ static int spu_active_notify(struct noti
 				void * data)
 {
 	int retval;
-	unsigned long flags = 0;
-	struct spu * the_spu = data;
+	unsigned long flags;
+	struct spu *the_spu = data;
 	pr_debug("SPU event notification arrived\n");
 	if (!val){
 		spin_lock_irqsave(&cache_lock, flags);
@@ -403,8 +378,7 @@ void spu_sync_buffer(int spu_num, unsign
 		     int num_samples)
 {
 	unsigned long long file_offset;
-	unsigned long cache_lock_flags = 0;
-	unsigned long buffer_lock_flags = 0;
+	unsigned long flags;
 	int i;
 	struct vma_to_fileoffset_map * map;
 	struct spu * the_spu;
@@ -417,29 +391,27 @@ void spu_sync_buffer(int spu_num, unsign
 	 * corresponding to this cached_info may end, thus resulting
 	 * in the destruction of the cached_info.
 	 */
-	spin_lock_irqsave(&cache_lock, cache_lock_flags);
+	spin_lock_irqsave(&cache_lock, flags);
 	c_info = get_cached_info(NULL, spu_num);
-	if (c_info == NULL) {
+	if (!c_info) {
 	/* This legitimately happens when the SPU task ends before all
 	 * samples are recorded.  No big deal -- so we just drop a few samples.
 	 */
 		pr_debug("SPU_PROF: No cached SPU contex "
 			  "for SPU #%d. Dropping samples.\n", spu_num);
-		spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
-		return ;
+		goto out;
 	}
 
 	map = c_info->map;
 	the_spu = c_info->the_spu;
-	spin_lock_irqsave(&buffer_lock, buffer_lock_flags);
+	spin_lock(&buffer_lock);
 	for (i = 0; i < num_samples; i++) {
 		unsigned int sample = *(samples+i);
 		int grd_val = 0;
 		file_offset = 0;
 		if (sample == 0)
 			continue;
-		file_offset = vma_map_lookup(
-			map, sample, the_spu, &grd_val);
+		file_offset = vma_map_lookup( map, sample, the_spu, &grd_val);
 
 		/* If overlays are used by this SPU application, the guard
 		 * value is non-zero, indicating which overlay section is in
@@ -460,8 +432,9 @@ void spu_sync_buffer(int spu_num, unsign
 			continue;
 		add_event_entry(file_offset | spu_num_shifted);
 	}
-	spin_unlock_irqrestore(&buffer_lock, buffer_lock_flags);
-	spin_unlock_irqrestore(&cache_lock, cache_lock_flags);
+	spin_unlock(&buffer_lock);
+out:
+	spin_unlock_irqrestore(&cache_lock, flags);
 }
 
 
Index: linux-2.6/arch/powerpc/oprofile/op_model_cell.c
===================================================================
--- linux-2.6.orig/arch/powerpc/oprofile/op_model_cell.c
+++ linux-2.6/arch/powerpc/oprofile/op_model_cell.c
@@ -40,7 +40,8 @@
 #include "../platforms/cell/cbe_regs.h"
 #include "cell/pr_util.h"
 
-/* spu_cycle_reset is the number of cycles between samples.
+/*
+ * spu_cycle_reset is the number of cycles between samples.
  * This variable is used for SPU profiling and should ONLY be set
  * at the beginning of cell_reg_setup; otherwise, it's read-only.
  */
@@ -73,7 +74,6 @@ struct pmc_cntrl_data {
 /*
  * ibm,cbe-perftools rtas parameters
  */
-
 struct pm_signal {
 	u16 cpu;		/* Processor to modify */
 	u16 sub_unit;		/* hw subunit this applies to (if applicable)*/
@@ -123,7 +123,8 @@ static DEFINE_PER_CPU(unsigned long[NR_P
 
 static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS];
 
-/* The CELL profiling code makes rtas calls to setup the debug bus to
+/*
+ * The CELL profiling code makes rtas calls to setup the debug bus to
  * route the performance signals.  Additionally, SPU profiling requires
  * a second rtas call to setup the hardware to capture the SPU PCs.
  * The EIO error value is returned if the token lookups or the rtas
@@ -137,16 +138,21 @@ static struct pmc_cntrl_data pmc_cntrl[N
  * either.
  */
 
-/* Interpetation of hdw_thread:
+/*
+ * Interpetation of hdw_thread:
  * 0 - even virtual cpus 0, 2, 4,...
  * 1 - odd virtual cpus 1, 3, 5, ...
+ *
+ * FIXME: this is strictly wrong, we need to clean this up in a number
+ * of places. It works for now. -arnd
  */
 static u32 hdw_thread;
 
 static u32 virt_cntr_inter_mask;
 static struct timer_list timer_virt_cntr;
 
-/* pm_signal needs to be global since it is initialized in
+/*
+ * pm_signal needs to be global since it is initialized in
  * cell_reg_setup at the time when the necessary information
  * is available.
  */
@@ -167,7 +173,6 @@ static unsigned char input_bus[NUM_INPUT
 /*
  * Firmware interface functions
  */
-
 static int
 rtas_ibm_cbe_perftools(int subfunc, int passthru,
 		       void *address, unsigned long length)
@@ -183,12 +188,13 @@ static void pm_rtas_reset_signals(u32 no
 	int ret;
 	struct pm_signal pm_signal_local;
 
-	/*  The debug bus is being set to the passthru disable state.
-	 *  However, the FW still expects atleast one legal signal routing
-	 *  entry or it will return an error on the arguments.	If we don't
-	 *  supply a valid entry, we must ignore all return values.  Ignoring
-	 *  all return values means we might miss an error we should be
-	 *  concerned about.
+	/*
+	 * The debug bus is being set to the passthru disable state.
+	 * However, the FW still expects atleast one legal signal routing
+	 * entry or it will return an error on the arguments.	If we don't
+	 * supply a valid entry, we must ignore all return values.  Ignoring
+	 * all return values means we might miss an error we should be
+	 * concerned about.
 	 */
 
 	/*  fw expects physical cpu #. */
@@ -203,7 +209,8 @@ static void pm_rtas_reset_signals(u32 no
 				     sizeof(struct pm_signal));
 
 	if (unlikely(ret))
-		/* Not a fatal error. For Oprofile stop, the oprofile
+		/*
+		 * Not a fatal error. For Oprofile stop, the oprofile
 		 * functions do not support returning an error for
 		 * failure to stop OProfile.
 		 */
@@ -217,7 +224,8 @@ static int pm_rtas_activate_signals(u32 
 	int i, j;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* There is no debug setup required for the cycles event.
+	/*
+	 * There is no debug setup required for the cycles event.
 	 * Note that only events in the same group can be used.
 	 * Otherwise, there will be conflicts in correctly routing
 	 * the signals on the debug bus.  It is the responsiblity
@@ -295,7 +303,8 @@ static void set_pm_event(u32 ctr, int ev
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_POLARITY(polarity);
 	pm_regs.pm07_cntrl[ctr] |= PM07_CTR_INPUT_CONTROL(input_control);
 
-	/* Some of the islands signal selection is based on 64 bit words.
+	/*
+	 * Some of the islands signal selection is based on 64 bit words.
 	 * The debug bus words are 32 bits, the input words to the performance
 	 * counters are defined as 32 bits.  Need to convert the 64 bit island
 	 * specification to the appropriate 32 input bit and bus word for the
@@ -345,7 +354,8 @@ out:
 
 static void write_pm_cntrl(int cpu)
 {
-	/* Oprofile will use 32 bit counters, set bits 7:10 to 0
+	/*
+	 * Oprofile will use 32 bit counters, set bits 7:10 to 0
 	 * pmregs.pm_cntrl is a global
 	 */
 
@@ -362,7 +372,8 @@ static void write_pm_cntrl(int cpu)
 	if (pm_regs.pm_cntrl.freeze == 1)
 		val |= CBE_PM_FREEZE_ALL_CTRS;
 
-	/* Routine set_count_mode must be called previously to set
+	/*
+	 * Routine set_count_mode must be called previously to set
 	 * the count mode based on the user selection of user and kernel.
 	 */
 	val |= CBE_PM_COUNT_MODE_SET(pm_regs.pm_cntrl.count_mode);
@@ -372,7 +383,8 @@ static void write_pm_cntrl(int cpu)
 static inline void
 set_count_mode(u32 kernel, u32 user)
 {
-	/* The user must specify user and kernel if they want them. If
+	/*
+	 * The user must specify user and kernel if they want them. If
 	 *  neither is specified, OProfile will count in hypervisor mode.
 	 *  pm_regs.pm_cntrl is a global
 	 */
@@ -413,17 +425,18 @@ static inline void enable_ctr(u32 cpu, u
  * pair of per-cpu arrays is used for storing the previous and next
  * pmc values for a given node.
  * NOTE: We use the per-cpu variable to improve cache performance.
+ *
+ * This routine will alternate loading the virtual counters for
+ * virtual CPUs
  */
 static void cell_virtual_cntr(unsigned long data)
 {
-	/* This routine will alternate loading the virtual counters for
-	 * virtual CPUs
-	 */
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
 	unsigned long flags;
 
-	/* Make sure that the interrupt_hander and the virt counter are
+	/*
+	 * Make sure that the interrupt_hander and the virt counter are
 	 * not both playing with the counters on the same node.
 	 */
 
@@ -435,22 +448,25 @@ static void cell_virtual_cntr(unsigned l
 	hdw_thread = 1 ^ hdw_thread;
 	next_hdw_thread = hdw_thread;
 
-	for (i = 0; i < num_counters; i++)
-	/* There are some per thread events.  Must do the
+	/*
+	 * There are some per thread events.  Must do the
 	 * set event, for the thread that is being started
 	 */
+	for (i = 0; i < num_counters; i++)
 		set_pm_event(i,
 			pmc_cntrl[next_hdw_thread][i].evnts,
 			pmc_cntrl[next_hdw_thread][i].masks);
 
-	/* The following is done only once per each node, but
+	/*
+	 * The following is done only once per each node, but
 	 * we need cpu #, not node #, to pass to the cbe_xxx functions.
 	 */
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		/* stop counters, save counter values, restore counts
+		/*
+		 * stop counters, save counter values, restore counts
 		 * for previous thread
 		 */
 		cbe_disable_pm(cpu);
@@ -479,13 +495,15 @@ static void cell_virtual_cntr(unsigned l
 						      next_hdw_thread)[i]);
 		}
 
-		/* Switch to the other thread. Change the interrupt
+		/*
+		 * Switch to the other thread. Change the interrupt
 		 * and control regs to be scheduled on the CPU
 		 * corresponding to the thread to execute.
 		 */
 		for (i = 0; i < num_counters; i++) {
 			if (pmc_cntrl[next_hdw_thread][i].enabled) {
-				/* There are some per thread events.
+				/*
+				 * There are some per thread events.
 				 * Must do the set event, enable_cntr
 				 * for each cpu.
 				 */
@@ -517,9 +535,8 @@ static void start_virt_cntrs(void)
 }
 
 /* This function is called once for all cpus combined */
-static int
-cell_reg_setup(struct op_counter_config *ctr,
-	       struct op_system_config *sys, int num_ctrs)
+static int cell_reg_setup(struct op_counter_config *ctr,
+			struct op_system_config *sys, int num_ctrs)
 {
 	int i, j, cpu;
 	spu_cycle_reset = 0;
@@ -527,7 +544,8 @@ cell_reg_setup(struct op_counter_config 
 	if (ctr[0].event == SPU_CYCLES_EVENT_NUM) {
 		spu_cycle_reset = ctr[0].count;
 
-		/* Each node will need to make the rtas call to start
+		/*
+		 * Each node will need to make the rtas call to start
 		 * and stop SPU profiling.  Get the token once and store it.
 		 */
 		spu_rtas_token = rtas_token("ibm,cbe-spu-perftools");
@@ -542,7 +560,8 @@ cell_reg_setup(struct op_counter_config 
 
 	pm_rtas_token = rtas_token("ibm,cbe-perftools");
 
-	/* For all events excetp PPU CYCLEs, each node will need to make
+	/*
+	 * For all events excetp PPU CYCLEs, each node will need to make
 	 * the rtas cbe-perftools call to setup and reset the debug bus.
 	 * Make the token lookup call once and store it in the global
 	 * variable pm_rtas_token.
@@ -579,7 +598,8 @@ cell_reg_setup(struct op_counter_config 
 			per_cpu(pmc_values, j)[i] = 0;
 	}
 
-	/* Setup the thread 1 events, map the thread 0 event to the
+	/*
+	 * Setup the thread 1 events, map the thread 0 event to the
 	 * equivalent thread 1 event.
 	 */
 	for (i = 0; i < num_ctrs; ++i) {
@@ -603,7 +623,8 @@ cell_reg_setup(struct op_counter_config 
 	for (i = 0; i < NUM_INPUT_BUS_WORDS; i++)
 		input_bus[i] = 0xff;
 
-	/* Our counters count up, and "count" refers to
+	/*
+	 * Our counters count up, and "count" refers to
 	 * how much before the next interrupt, and we interrupt
 	 * on overflow.	 So we calculate the starting value
 	 * which will give us "count" until overflow.
@@ -667,19 +688,19 @@ static int cell_cpu_setup(struct op_coun
 		}
 	}
 
-	/* the pm_rtas_activate_signals will return -EIO if the FW
+	/*
+	 * The pm_rtas_activate_signals will return -EIO if the FW
 	 * call failed.
 	 */
-	return (pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled));
-
+	return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled);
 }
 
 #define ENTRIES	 303
 #define MAXLFSR	 0xFFFFFF
 
 /* precomputed table of 24 bit LFSR values */
-int initial_lfsr[] =
-{8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
+static int initial_lfsr[] = {
+ 8221349, 12579195, 5379618, 10097839, 7512963, 7519310, 3955098, 10753424,
  15507573, 7458917, 285419, 2641121, 9780088, 3915503, 6668768, 1548716,
  4885000, 8774424, 9650099, 2044357, 2304411, 9326253, 10332526, 4421547,
  3440748, 10179459, 13332843, 10375561, 1313462, 8375100, 5198480, 6071392,
@@ -716,7 +737,8 @@ int initial_lfsr[] =
  3258216, 12505185, 6007317, 9218111, 14661019, 10537428, 11731949, 9027003,
  6641507, 9490160, 200241, 9720425, 16277895, 10816638, 1554761, 10431375,
  7467528, 6790302, 3429078, 14633753, 14428997, 11463204, 3576212, 2003426,
- 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607};
+ 6123687, 820520, 9992513, 15784513, 5778891, 6428165, 8388607
+};
 
 /*
  * The hardware uses an LFSR counting sequence to determine when to capture
@@ -777,28 +799,25 @@ int initial_lfsr[] =
 
 static int calculate_lfsr(int n)
 {
-	/* The ranges and steps are in powers of 2 so the calculations
+	/*
+	 * The ranges and steps are in powers of 2 so the calculations
 	 * can be done using shifts rather then divide.
 	 */
 	int index;
 
-	if ((n >> 16) == 0) {
+	if ((n >> 16) == 0)
 		index = 0;
-
-	} else if (((n - V2_16) >> 19) == 0) {
+	else if (((n - V2_16) >> 19) == 0)
 		index = ((n - V2_16) >> 12) + 1;
-
-	} else if (((n - V2_16 - V2_19) >> 22) == 0) {
+	else if (((n - V2_16 - V2_19) >> 22) == 0)
 		index = ((n - V2_16 - V2_19) >> 15 ) + 1 + 128;
+	else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0)
+		index = ((n - V2_16 - V2_19 - V2_22) >> 18 ) + 1 + 256;
+	else
+		index = ENTRIES-1;
 
-	} else if (((n - V2_16 - V2_19 - V2_22) >> 24) == 0) {
-		index = ((n - V2_16 - V2_19 - V2_22) >> 18 )
-			+ 1 + 256;
-	}
-
-	if ((index > ENTRIES) || (index < 0))	/* make sure index is
-						 * valid
-						 */
+	/* make sure index is valid */
+	if ((index > ENTRIES) || (index < 0))
 		index = ENTRIES-1;
 
 	return initial_lfsr[index];
@@ -809,15 +828,17 @@ static int pm_rtas_activate_spu_profilin
 	int ret, i;
 	struct pm_signal pm_signal_local[NR_PHYS_CTRS];
 
-	/* Set up the rtas call to configure the debug bus to
-	 * route the SPU PCs.  Setup the pm_signal for each SPU */
+	/*
+	 * Set up the rtas call to configure the debug bus to
+	 * route the SPU PCs.  Setup the pm_signal for each SPU
+	 */
 	for (i = 0; i < NUM_SPUS_PER_NODE; i++) {
 		pm_signal_local[i].cpu = node;
 		pm_signal_local[i].signal_group = 41;
-		pm_signal_local[i].bus_word = 1 << i / 2; /* spu i on
-							   * word (i/2)
-							   */
-		pm_signal_local[i].sub_unit = i;	/* spu i */
+		/* spu i on word (i/2) */
+		pm_signal_local[i].bus_word = 1 << i / 2;
+		/* spu i */
+		pm_signal_local[i].sub_unit = i;
 		pm_signal_local[i].bit = 63;
 	}
 
@@ -858,8 +879,8 @@ static int cell_global_start_spu(struct 
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
 	int cpu;
-	int ret = 0;
-	int rtas_error = 0;
+	int ret;
+	int rtas_error;
 	unsigned int cpu_khzfreq = 0;
 
 	/* The SPU profiling uses time-based profiling based on
@@ -884,24 +905,23 @@ static int cell_global_start_spu(struct 
 	for_each_online_cpu(cpu) {
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
-		/* Setup SPU cycle-based profiling.
+
+		/*
+		 * Setup SPU cycle-based profiling.
 		 * Set perf_mon_control bit 0 to a zero before
 		 * enabling spu collection hardware.
 		 */
 		cbe_write_pm(cpu, pm_control, 0);
 
 		if (spu_cycle_reset > MAX_SPU_COUNT)
-			/* use largest possible value
-			 */
+			/* use largest possible value */
 			lfsr_value = calculate_lfsr(MAX_SPU_COUNT-1);
 		else
-		    lfsr_value = calculate_lfsr(spu_cycle_reset);
+			lfsr_value = calculate_lfsr(spu_cycle_reset);
 
-		if (lfsr_value == 0) {	/* must use a non zero value.  Zero
-					 * disables data collection.
-					 */
-				lfsr_value = calculate_lfsr(1);
-		}
+		/* must use a non zero value. Zero disables data collection. */
+		if (lfsr_value == 0)
+			lfsr_value = calculate_lfsr(1);
 
 		lfsr_value = lfsr_value << 8; /* shift lfsr to correct
 						* register location
@@ -916,7 +936,7 @@ static int cell_global_start_spu(struct 
 		}
 
 
-		subfunc = 2;	// 2 - activate SPU tracing, 3 - deactivate
+		subfunc = 2;	/* 2 - activate SPU tracing, 3 - deactivate */
 
 		/* start profiling */
 		rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc,
@@ -976,7 +996,8 @@ static int cell_global_start_ppu(struct 
 	oprofile_running = 1;
 	smp_wmb();
 
-	/* NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
+	/*
+	 * NOTE: start_virt_cntrs will result in cell_virtual_cntr() being
 	 * executed which manipulates the PMU.	We start the "virtual counter"
 	 * here so that we do not need to synchronize access to the PMU in
 	 * the above for-loop.
@@ -986,7 +1007,6 @@ static int cell_global_start_ppu(struct 
 	return 0;
 }
 
-
 static int cell_global_start(struct op_counter_config *ctr)
 {
 	if (spu_cycle_reset) {
@@ -996,14 +1016,15 @@ static int cell_global_start(struct op_c
 	}
 }
 
-static void cell_global_stop_spu(void)
-/* Note the generic OProfile stop calls do not support returning
+/*
+ * Note the generic OProfile stop calls do not support returning
  * an error on stop.  Hence, will not return an error if the FW
  * calls fail on stop.	Failure to reset the debug bus is not an issue.
  * Failure to disable the SPU profiling is not an issue.  The FW calls
  * to enable the performance counters and debug bus will work even if
  * the hardware was not cleanly reset.
  */
+static void cell_global_stop_spu(void)
 {
 	int subfunc, rtn_value;
 	unsigned int lfsr_value;
@@ -1020,7 +1041,8 @@ static void cell_global_stop_spu(void)
 		if (cbe_get_hw_thread_id(cpu))
 			continue;
 
-		subfunc = 3;	/* 2 - activate SPU tracing,
+		subfunc = 3;	/*
+				 * 2 - activate SPU tracing,
 				 * 3 - deactivate
 				 */
 		lfsr_value = 0x8f100000;
@@ -1046,7 +1068,8 @@ static void cell_global_stop_ppu(void)
 {
 	int cpu;
 
-	/* This routine will be called once for the system.
+	/*
+	 * This routine will be called once for the system.
 	 * There is one performance monitor per node, so we
 	 * only need to perform this function once per node.
 	 */
@@ -1079,8 +1102,8 @@ static void cell_global_stop(void)
 	}
 }
 
-static void
-cell_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr)
+static void cell_handle_interrupt(struct pt_regs *regs,
+				struct op_counter_config *ctr)
 {
 	u32 cpu;
 	u64 pc;
@@ -1091,13 +1114,15 @@ cell_handle_interrupt(struct pt_regs *re
 
 	cpu = smp_processor_id();
 
-	/* Need to make sure the interrupt handler and the virt counter
+	/*
+	 * Need to make sure the interrupt handler and the virt counter
 	 * routine are not running at the same time. See the
 	 * cell_virtual_cntr() routine for additional comments.
 	 */
 	spin_lock_irqsave(&virt_cntr_lock, flags);
 
-	/* Need to disable and reenable the performance counters
+	/*
+	 * Need to disable and reenable the performance counters
 	 * to get the desired behavior from the hardware.  This
 	 * is hardware specific.
 	 */
@@ -1106,7 +1131,8 @@ cell_handle_interrupt(struct pt_regs *re
 
 	interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu);
 
-	/* If the interrupt mask has been cleared, then the virt cntr
+	/*
+	 * If the interrupt mask has been cleared, then the virt cntr
 	 * has cleared the interrupt.  When the thread that generated
 	 * the interrupt is restored, the data count will be restored to
 	 * 0xffffff0 to cause the interrupt to be regenerated.
@@ -1124,7 +1150,8 @@ cell_handle_interrupt(struct pt_regs *re
 			}
 		}
 
-		/* The counters were frozen by the interrupt.
+		/*
+		 * The counters were frozen by the interrupt.
 		 * Reenable the interrupt and restart the counters.
 		 * If there was a race between the interrupt handler and
 		 * the virtual counter routine.	 The virutal counter
@@ -1134,7 +1161,8 @@ cell_handle_interrupt(struct pt_regs *re
 		cbe_enable_pm_interrupts(cpu, hdw_thread,
 					 virt_cntr_inter_mask);
 
-		/* The writes to the various performance counters only writes
+		/*
+		 * The writes to the various performance counters only writes
 		 * to a latch.	The new values (interrupt setting bits, reset
 		 * counter value etc.) are not copied to the actual registers
 		 * until the performance monitor is enabled.  In order to get
@@ -1147,7 +1175,8 @@ cell_handle_interrupt(struct pt_regs *re
 	spin_unlock_irqrestore(&virt_cntr_lock, flags);
 }
 
-/* This function is called from the generic OProfile
+/*
+ * This function is called from the generic OProfile
  * driver.  When profiling PPUs, we need to do the
  * generic sync start; otherwise, do spu_sync_start.
  */
@@ -1167,7 +1196,6 @@ static int cell_sync_stop(void)
 		return 1;
 }
 
-
 struct op_powerpc_model op_model_cell = {
 	.reg_setup = cell_reg_setup,
 	.cpu_setup = cell_cpu_setup,