[Skiboot] [PATCH v2] occ: Poll OCC throttle status and queue OCC events to host

Tue May 12 15:31:32 AEST 2015

Shilpasri G Bhat <shilpa.bhat at linux.vnet.ibm.com> writes:
> Add a new class of message definition OPAL_MSG_OCC to
> opal_message_type to notify the following OCC events to host:
> 1) OCC Reset
> 2) OCC Load
> 3) OCC Throttle Status Change
>
> Add an opal poller to periodically read throttle status updated by OCC
> for each chip and notify any change in throttle status to host. The
> throttle status indicates the reason why OCC may have limited the max
> Pstate of the chip.

My major comment is that you need to add documentation for this to
doc/opal-api/opal-messages.txt

Also include a description of how we may deal with future expansion
(e.g. chip_id and throttle_status are only valid for params[0] = 0, 1, 2, and
if params[0] is > 2 then the rest of params is something defined in a future
OPAL version and the host should just ignore the message)

Do we *really* have to poll? That kind of sucks if so (and how do we get
that fixed?)

> diff --git a/hw/occ.c b/hw/occ.c
> index 34d6de5..d346394 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -24,6 +24,8 @@
>  #include <timebase.h>
>  #include <hostservices.h>
>  #include <errorlog.h>
> +#include <opal-api.h>
> +#include <opal-msg.h>
>  
>  /* OCC Communication Area for PStates */
>  
> @@ -31,6 +33,16 @@
>  
>  #define MAX_PSTATES 256
>  
> +#define OCC_RESET	0
> +#define OCC_LOAD	1
> +#define OCC_THROTTLE	2

Are these part of the ABI? If so, please add them to opal-api.h and doc/

> +
> +#define chip_occ_data(chip) \
> +		((struct occ_pstate_table *)(chip->homer_base + \
> +				P8_HOMER_SAPPHIRE_DATA_OFFSET))
> +
> +static bool occ_reset;
> +
>  struct occ_pstate_entry {
>  	s8 id;
>  	u8 flags;
> @@ -302,6 +314,61 @@ static bool cpu_pstates_prepare_core(struct proc_chip *chip, struct cpu_thread *
>  	return true;
>  }
>  
> +/* occ_throttle_poll: This function will queue a meassage of type
> + * OPAL_MSG_OCC to notify any change in the throttle status of the
> + * chip. Throttle status indicates the reason why OCC may have limited
> + * the max Pstate of the chip.
> + * 0x00 = No throttle
> + * 0x01 = Power Cap
> + * 0x02 = Processor Over Temperature
> + * 0x03 = Power Supply Failure (currently not used)
> + * 0x04 = Over current (currently not used)
> + * 0x05 = OCC Reset (not reliable as some failures will not allow for
> + * OCC to update throttle status, so use 'occ_reset')

It looks like these are part of the ABI - please add them to opal-api.h and document them.

> + */
> +static void occ_throttle_poll(void *data __unused)
> +{
> +	struct proc_chip *chip;
> +	struct occ_pstate_table *occ_data;
> +	int rc;
> +
> +	if (occ_reset) {
> +		int inactive = 0;
> +
> +		for_each_chip(chip) {
> +			occ_data = chip_occ_data(chip);
> +			if (occ_data->valid != 1) {
> +				inactive = 1;
> +				break;
> +			}
> +		}
> +		if (!inactive) {
> +			/*
> +			 * Queue OCC_THROTTLE with throttle status as 0 to
> +			 * indicate all OCCs are active after a reset.
> +			 */
> +			rc = opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
> +						OCC_THROTTLE, 0, 0);
> +			if (!rc)
> +				occ_reset = false;
> +		}
> +	} else {
> +		for_each_chip(chip) {
> +			occ_data = chip_occ_data(chip);
> +			if ((occ_data->valid == 1) &&
> +			    (chip->prev_throttle != occ_data->throttle) &&
> +			    (occ_data->throttle <= 5)) {
> +				rc = opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
> +						OCC_THROTTLE, chip->id,
> +						occ_data->throttle);
> +				if (!rc)
> +					chip->prev_throttle =
> +						occ_data->throttle;
> +			}
> +		}
> +	}
> +}
> +
>  /* CPU-OCC PState init */
>  /* Called after OCC init on P8 */
>  void occ_pstates_init(void)
> @@ -345,6 +412,11 @@ void occ_pstates_init(void)
>  			cpu_pstates_prepare_core(chip, c, pstate_nom);
>  		}
>  	}
> +
> +	/* Add opal_poller to poll OCC throttle status of each chip */
> +	for_each_chip(chip)
> +		chip->prev_throttle = 0;
> +	opal_add_poller(occ_throttle_poll, NULL);
>  }
>  
>  struct occ_load_req {
> @@ -386,6 +458,11 @@ static void __occ_do_load(u8 scope, u32 dbob_id __unused, u32 seq_id)
>  		prlog(PR_INFO, "OCC: Load: Fallback to preloaded image\n");
>  		rc = 0;
>  	} else if (!rc) {
> +		rc = opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, OCC_LOAD);
> +		if (rc)
> +			prlog(PR_INFO, "OCC: Failed to queue message %d\n",
> +						OCC_LOAD);
> +
>  		/* Success, start OCC */
>  		rc = host_services_occ_start();
>  	}
> @@ -509,6 +586,24 @@ static void occ_do_reset(u8 scope, u32 dbob_id, u32 seq_id)
>  		rc = 0;
>  	}
>  	if (!rc) {
> +		rc = opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, OCC_RESET);
> +		if (rc)
> +			prlog(PR_INFO, "OCC: Failed to queue message %d\n",
> +						OCC_RESET);
> +		/*
> +		 * Set 'valid' byte of chip_occ_data to 0 since OCC
> +		 * may not clear this byte on a reset.
> +		 * OCC will set the 'valid' byte to 1 when it becomes
> +		 * active again.
> +		 */
> +		for_each_chip(chip) {
> +			struct occ_pstate_table *occ_data;
> +
> +			occ_data = chip_occ_data(chip);
> +			occ_data->valid = 0;
> +			chip->prev_throttle = 0;
> +		}
> +		occ_reset = true;
>  		/* Send a single success response for all chips */
>  		stat = fsp_mkmsg(FSP_CMD_RESET_OCC_STAT, 2, 0, seq_id);
>  		if (stat)
> diff --git a/include/chip.h b/include/chip.h
> index 0547902..340fdfc 100644
> --- a/include/chip.h
> +++ b/include/chip.h
> @@ -147,6 +147,7 @@ struct proc_chip {
>  	uint64_t		homer_size;
>  	uint64_t		occ_common_base;
>  	uint64_t		occ_common_size;
> +	u8			prev_throttle;
>  
>  	/* Must hold capi_lock to change */
>  	u8			capp_phb3_attached_mask;
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 1698311..abe798e 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -417,6 +417,13 @@ enum opal_msg_type {
>  	OPAL_MSG_HMI_EVT,
>  	OPAL_MSG_DPO,
>  	OPAL_MSG_PRD,
> +	OPAL_MSG_OCC,			/*
> +					 * params[0] =	0 reset,
> +					 *		1 load,
> +					 *		2 throttle
> +					 * params[1] = chip_id,
> +					 * params[2] = throttle_status
> +					 */

Perhaps define this in a struct/union somewhere too?