[Skiboot] [PATCH v2 3/3] occ: Add support for GPU presence detection

Shilpasri G Bhat shilpa.bhat at linux.vnet.ibm.com
Wed Jun 20 15:40:28 AEST 2018



On 06/20/2018 09:19 AM, Andrew Donnellan wrote:
> On the Witherspoon platform, we need to distinguish between NVLink GPUs and
> OpenCAPI accelerators. In order to do this, we first need to find out
> whether the SXM2 socket is populated.
> 
> On Witherspoon, the SXM2 socket's presence detection pin is only visible
> via I2C from the APSS, and thus can only be exposed to the host via the
> OCC. The OCC, per OCC Firmware Interface Specification for POWER9 version
> 0.22, now exposes this to skiboot through a field in the dynamic data
> shared memory.
> 
> Add the necessary dynamic data changes required to read the version and
> GPU presence fields. Add a function, occ_get_gpu_presence(), that can be
> used to check GPU presence.
> 
> If the OCC isn't reporting presence (old OCC firmware, or some other
> reason), we default to assuming there is a device present and wait for
> link training to fail.
> 
> This will be used in later patches to fix up the NPU2 probe path for
> OpenCAPI support on Witherspoon.
> 
> Signed-off-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>

Reviewed-by: Shilpasri G Bhat <shilpa.bhat at linux.vnet.ibm.com>

> ---
>  hw/occ.c      | 23 ++++++++++++++++++++---
>  include/occ.h |  4 ++++
>  2 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/occ.c b/hw/occ.c
> index fc95d3926bb0..10b2de07dd7a 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -229,10 +229,10 @@ struct occ_response_buffer {
>   */
>  struct occ_dynamic_data {
>  	u8 occ_state;
> +	u8 major_version;
> +	u8 minor_version;
> +	u8 gpus_present;
>  	u8 spare1;
> -	u8 spare2;
> -	u8 spare3;
> -	u8 spare4;
>  	u8 cpu_throttle;
>  	u8 mem_throttle;
>  	u8 quick_pwr_drop;
> @@ -1230,6 +1230,23 @@ exit:
>  	unlock(&chip->queue_lock);
>  }
>  
> +bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
> +{
> +	struct occ_dynamic_data *ddata;
> +
> +	assert(gpu_num <= 2);
> +
> +	ddata = get_occ_dynamic_data(chip);
> +
> +	if (ddata->major_version != 0 || ddata->minor_version < 1) {
> +		prlog(PR_INFO, "OCC: OCC not reporting GPU slot presence, "
> +		      "assuming device is present\n");
> +		return true;
> +	}
> +
> +	return (bool)(ddata->gpus_present & 1 << gpu_num);
> +}
> +
>  static void occ_add_powercap_sensors(struct dt_node *power_mgt);
>  static void occ_add_psr_sensors(struct dt_node *power_mgt);
>  
> diff --git a/include/occ.h b/include/occ.h
> index c9faef9fdfb8..a46b9219fc70 100644
> --- a/include/occ.h
> +++ b/include/occ.h
> @@ -14,6 +14,8 @@
>   * limitations under the License.
>   */
>  
> +#include <chip.h>
> +
>  /* OCC Functions */
>  
>  extern void occ_pstates_init(void);
> @@ -36,6 +38,8 @@ enum pnor_owner {
>  };
>  extern void occ_pnor_set_owner(enum pnor_owner owner);
>  
> +/* GPU presence detection */
> +bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num);
>  
>  /* OCC Inband Sensors */
>  extern bool occ_sensors_init(void);
> 



More information about the Skiboot mailing list