[Skiboot] [PATCH v2] Add purging CPU L2 and L3 caches into NPU hreset.

Mon Dec 3 14:24:09 AEDT 2018

On 03/12/2018 09:50, Rashmica Gupta wrote:
> On Fri, 2018-11-30 at 12:05 +1100, Alexey Kardashevskiy wrote:
>>>
>> On 21/11/2018 10:44, Rashmica Gupta wrote:
> 
> ...
> 
>>> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>> L2_PRD_PURGE_CMD_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
>>> +			      L2_PRD_PURGE_CMD_TYPE_MASK);
>>> +	if (rc)
>>> +		goto out;
>>> +	rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>>> +			      L2_PRD_PURGE_CMD_TRIGGER);
>>> +	if (rc)
>>> +out:		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
>>> write_mask "
>>> +		      "failed %i\n", core_id, rc);
>>
>>
>> Nit: I find such use of goto with a label inside another scope rather
>> confusing. Goto is usually used to avoid code duplication on a
>> cleanup
>> path and this is not the case.
>>
>> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
>> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
>> Stewart really :)
>>
>>
>>> +	return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>> +}
>>> +
>>> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	unsigned long now = mftb();
>>> +	unsigned long end = now + msecs_to_tb(2);
>>> +	uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id,
>>> L2_PRD_PURGE_CMD_REG);
>>> +
>>> +	while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
>>
>> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)
>>
>> and....
>>
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM
>>> read "
>>> +			      "failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>
>>
>> ... break out here:
>> 	if (val & L2_PRD_PURGE_CMD_REG_BUSY)
>> 		break;
>>
>> As if you read val with the bit set, then checking timeout is rather
>> pointless, no?
>>
> 
> I think you mean:
> if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
>  		break;
> ??

Yes, my bad :)

> Checking timeout is pointless if the purge is completed, but if it
> hasn't completed then we do want to check the timeout. 
> 
>>
>>> +		now = mftb();
>>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>>> +			prlog(PR_ERR, "PURGE L2 on core 0x%x timed out
>>> %i\n",
>>> +			      core_id, rc);
>>> +			return OPAL_BUSY;
>>> +		}
>>> +	}
>>> +
>>> +	/* We have to clear the trigger bit ourselves */
>>> +	val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>>> +	rc = xscom_write(chip_id, addr, val);
>>> +	if (rc)
>>> +		prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write
>>> failed %i\n",
>>> +		      core_id, rc);
>>> +	return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>
>>> +}
>>> +
>>> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc = 0;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +	rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
>>> +			      L3_PRD_PURGE_TTYPE_MASK);
>>> +	if (rc)
>>> +		goto out;
>>> +	rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>>> +			      L3_PRD_PURGE_REQ);
>>> +	if (rc)
>>> +out:		prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
>>> write_mask "
>>> +		      "failed %i\n", core_id, rc);
>>> +	return rc;
>>> +
>>> +}
>>> +
>>> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +	int rc;
>>> +	unsigned long now = mftb();
>>> +	unsigned long end = now + msecs_to_tb(2);
>>> +	uint64_t val = L3_PRD_PURGE_REQ;
>>> +	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +	/* Trigger bit is automatically set to zero when flushing is
>>> done*/
>>> +	while (val & L3_PRD_PURGE_REQ) {
>>> +		rc = xscom_read(chip_id, addr, &val);
>>> +		if (rc) {
>>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM
>>> read "
>>> +			      "failed %i\n", core_id, rc);
>>> +			break;
>>> +		}
>>> +		now = mftb();
>>> +		if (tb_compare(now, end) == TB_AAFTERB) {
>>> +			prlog(PR_ERR, "PURGE L3 on core 0x%x timed out
>>> %i\n",
>>> +			      core_id, rc);
>>> +			return OPAL_BUSY;
>>> +		}
>>> +	}
>>> +	return rc;
>>> +}
>>> +
>>> +static int64_t purge_l2_l3_caches(void)
>>> +{
>>> +	int rc = 0;
>>> +	struct cpu_thread *t;
>>> +	uint64_t core_id, prev_core_id = 0xdeadbeef;
>>
>> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
>>
> 
> Ok!
> 
>>
>>> +
>>> +	if (proc_gen != proc_gen_p9)
>>> +		return OPAL_UNSUPPORTED;
>>
>>
>> I assume this is a leftover from when it was an OPAL call. There is
>> no
>> even small chance for this to be called for anything but p9, right?
>>
> 
> Good point :)
> 
>>
>>> +
>>> +	for_each_ungarded_cpu(t) {
>>> +		/* Only need to do it once per core chiplet */
>>> +		core_id = pir_to_core_id(t->pir);
>>> +		if (prev_core_id == core_id)
>>> +			continue;
>>> +		prev_core_id = core_id;
>>> +		if (start_l2_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +		if (start_l3_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +	}
>>> +
>>> +	for_each_ungarded_cpu(t) {
>>> +		/* Only need to do it once per core chiplet */
>>> +		core_id = pir_to_core_id(t->pir);
>>> +		if (prev_core_id == core_id)
>>> +			continue;
>>
>> Looks like if there is a single core at all (quite unlikely but still
>> a
>> possible situation), we won't wait below at all.
> 
> Sorry I don't understand why you think that. Can you please elaborate?

If there is a single core, then after the first loop prev_core_id will
be the id of that single core so in the second loop you won't get to the
waits because prev_core_id == core_id. You need (prev_core_id = -1)
before the second loop.

> 
>>
>>> +		prev_core_id = core_id;
>>> +
>>> +		if (wait_l2_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +		if (wait_l3_purge(t->chip_id, core_id))
>>> +			goto out;
>>> +	}
>>> +	return rc;
>>
>>
>> return OPAL_SUCCESS;
>>
>>
>>> +out:
>>> +	prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
>>> +	return rc;
>>
>> return OPAL_BUSY_EVENT. Now it always returns 0.
>>
> 
> woops, good spot :)
> 
>>
>>> +
>>> +}
>>> +
>>>  static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>>  {
>>>  	struct npu2 *p;
>>> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot
>>> *slot __unused)
>>>  			reset_ntl(ndev);
>>>  		}
>>>  	}
>>> +	purge_l2_l3_caches();
>>
>>
>> This takes care of the guest termination case (which is good).
>>
>> Now in order to cover the period between the guest started rebooting
>> and
>> the guest started onlining the memory, we also need
>> purge_l2_l3_caches()
>> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this
>> is
>> the reset method used for NPU when the guest reboots.
>>
>>
> 
> OK. Does it matter if purge_l2_l3_caches() is called before or after
> npu2_dev_procedure_reset()?

npu2_dev_procedure_reset() and then purge_l2_l3_caches() should do it.

> 
>>
>>>  	return OPAL_SUCCESS;
>>
>>
>> return purge_l2_l3_caches(), otherwise the timeout error is lost.
>>
>>
> 
> ack.
> 
>>
>>
>>>  }
>>>  
>>> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>> index 165e0b79..20a61e28 100644
>>> --- a/include/npu2-regs.h
>>> +++ b/include/npu2-regs.h
>>> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>> scom_base,
>>>  #define OB3_ODL1_TRAINING_STATUS		0xC01082F
>>>  #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8,
>>> 15)
>>>  
>>> +/* Registers and bits used to clear the L2 and L3 cache */
>>> +#define L2_PRD_PURGE_CMD_REG 			0x1080E
>>> +#define L2_PRD_PURGE_CMD_REG_BUSY 		0x0040000000000000
>>> +#define L2_PRD_PURGE_CMD_TYPE_MASK		PPC_BIT(1) | PPC_BIT(2)
>>> | PPC_BIT(3) | PPC_BIT(4)
>>> +#define L2_PRD_PURGE_CMD_TRIGGER		PPC_BIT(0)
>>> +#define L2CAC_FLUSH				0x0
>>> +#define L3_PRD_PURGE_REG			0x1180E
>>> +#define L3_PRD_PURGE_REQ			PPC_BIT(0)
>>> +#define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2)
>>> | PPC_BIT(3) | PPC_BIT(4)
>>> +#define L3_FULL_PURGE				0x0
>>> +
>>>  #endif /* __NPU2_REGS_H */
>>>
>>
>>
> 

-- 
Alexey