[Skiboot] [PATCH v3] Add purging CPU L2 and L3 caches into NPU hreset.

Tue Dec 4 12:01:50 AEDT 2018

On 03/12/2018 17:49, Rashmica Gupta wrote:
> If a GPU is passed through to a guest and the guest unexpectedly terminates,
> there can be cache lines in CPUs that belong to the GPU. So purge the caches
> as part of the reset sequence. L1 is write through, so doesn't need to be purged.
> 
> This also needs to be called if the guest reboots so call it in
> npu2_dev_cfg_exp_devcap().
> 
> The sequence to purge the L2 and L3 caches from the hw team:
> 
> "L2 purge:
>  (1) initiate purge
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
> 
>  (2) check this is off in all caches to know purge completed
>  getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all
> 
>  (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all
> 
> L3 purge:
>  1) Start the purge:
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
> 
>  2) Ensure that the purge has completed by checking the status bit:
>  getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all
> 
>  You should see it say OFF if it's done:
>  p9n.ex k0:n0:s0:p00:c0
>  EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
>  OFF"
> 
> Suggested-by: Alistair Popple <alistair at popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g at gmail.com>

Reviewed-by: Alexey Kardashevskiy <aik at ozlabs.ru>

It is correct as it is but there is also some room for bikeshedding, see
below :)

> ---
> 
> This is done synchronously for now as it doesn't seem to take *too* long
> (purging the L2 and L3 caches after building the 4.16 linux kernel on a p9
> with 16 cores took 1.57 ms, 1.49ms and 1.46ms).
> 
> 
>  hw/npu2.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h |  11 ++++
>  2 files changed, 145 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 30049f5b..9c0e6114 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -326,6 +326,136 @@ static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
>  	return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
>  }
>  
> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
> +	if (!rc)
> +		rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +			      L2_PRD_PURGE_CMD_TRIGGER);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);
> +	return rc;
> +}
> +
> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {

This check is pointless as you never break out of the loop because of it
(you do it explicitly below); it could be as simple as while (1) or for (;;).

> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}
> +		if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
> +			break;
> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +
> +	/* We have to clear the trigger bit ourselves */
> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +	rc = xscom_write(chip_id, addr, val);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
> +		      core_id, rc);
> +	return rc;
> +}
> +
> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> +			      L3_PRD_PURGE_TTYPE_MASK);
> +	if (!rc)
> +		rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +			      L3_PRD_PURGE_REQ);
> +	if (rc)
> +		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +		      "failed %i\n", core_id, rc);
> +	return rc;
> +}
> +
> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +	int rc;
> +	unsigned long now = mftb();
> +	unsigned long end = now + msecs_to_tb(2);
> +	uint64_t val = L3_PRD_PURGE_REQ;
> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +	/* Trigger bit is automatically set to zero when flushing is done */
> +	while (val & L3_PRD_PURGE_REQ) {

Same here.

> +		rc = xscom_read(chip_id, addr, &val);
> +		if (rc) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
> +			      "failed %i\n", core_id, rc);
> +			break;
> +		}
> +		if (!(val & L3_PRD_PURGE_REQ))
> +			break;
> +		now = mftb();
> +		if (tb_compare(now, end) == TB_AAFTERB) {
> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
> +			      core_id, rc);
> +			return OPAL_BUSY;
> +		}
> +	}
> +	return rc;
> +}
> +
> +static int64_t purge_l2_l3_caches(void)
> +{
> +	struct cpu_thread *t;
> +	uint64_t core_id, prev_core_id = (uint64_t)-1;
> +
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +		if (start_l2_purge(t->chip_id, core_id))
> +			goto out;

start_l2_purge() can return OPAL_BUSY, but it can also return other errors
from xscom_read()/xscom_write(); you convert them all to "busy" anyway.

A better approach would be:
ret = start_l2_purge(t->chip_id, core_id);
if (ret)
	goto out;

and...

> +		if (start_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +
> +	prev_core_id = (uint64_t)-1;
> +	for_each_ungarded_cpu(t) {
> +		/* Only need to do it once per core chiplet */
> +		core_id = pir_to_core_id(t->pir);
> +		if (prev_core_id == core_id)
> +			continue;
> +		prev_core_id = core_id;
> +
> +		if (wait_l2_purge(t->chip_id, core_id))
> +			goto out;
> +		if (wait_l3_purge(t->chip_id, core_id))
> +			goto out;
> +	}
> +	return OPAL_SUCCESS;
> +out:
> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
> +	return OPAL_BUSY_EVENT;

... return ret here.

> +}
> +
>  static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>  		struct pci_cfg_reg_filter *pcrf __unused,
>  		uint32_t offset, uint32_t size,
> @@ -346,6 +476,9 @@ static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>  	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
>  		npu2_dev_procedure_reset(ndev);
>  
> +	if (purge_l2_l3_caches())
> +		return OPAL_BUSY_EVENT;

We do not want to purge caches every time we touch this capability. It
is not going to happen often in practice, but I'd think it will happen more
often than just the cases when a driver wants to reset a device.

Also, you convert every possible error from purge_l2_l3_caches() to just
"busy", while there are more specific error codes available.

ret = purge_l2_l3_caches();
if (ret)
	return ret;

We have more than 30 error codes, let them be used :)

> +
>  	return OPAL_PARTIAL;
>  }
>  
> @@ -1125,7 +1258,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>  			reset_ntl(ndev);
>  		}
>  	}
> -	return OPAL_SUCCESS;
> +	return purge_l2_l3_caches();
>  }
>  
>  static int64_t npu2_freset(struct pci_slot *slot __unused)
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 10a28166..8273b2be 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -756,4 +756,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>  #define OB3_ODL0_ENDPOINT_INFO			0xC010832
>  #define OB3_ODL1_ENDPOINT_INFO			0xC010833
>  
> +/* Registers and bits used to clear the L2 and L3 cache */
> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
> +#define L2CAC_FLUSH				0x0
> +#define L3_PRD_PURGE_REG			0x1180E
> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
> +#define L3_FULL_PURGE				0x0
> +
>  #endif /* __NPU2_REGS_H */
> 

-- 
Alexey