[Skiboot] [PATCH v2] Add purging CPU L2 and L3 caches into NPU hreset.

Mon Dec 3 14:01:52 AEDT 2018

On 30/11/2018 17:16, Andrew Donnellan wrote:
> On 30/11/18 12:05 pm, Alexey Kardashevskiy wrote:>> +static int
> start_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>>> +
>>> +    rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
>>> +                  L2_PRD_PURGE_CMD_TYPE_MASK);
>>> +    if (rc)
>>> +        goto out;
>>> +    rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
>>> +                  L2_PRD_PURGE_CMD_TRIGGER);
>>> +    if (rc)
>>> +out:        prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
>>> +              "failed %i\n", core_id, rc);
>>
>>
>> Nit: I find such use of goto with a label inside another scope rather
>> confusing. Goto is usually used to avoid code duplication on a cleanup
>> path and this is not the case.
>>
>> I'd rather ditch "goto" and do "if (!rc) xscom_write_mask" for the
>> second xscom_write_mask()   or   move "out:" 1 line higher. Up to
>> Stewart really :)
> 
> Just handle the second one exactly like the first one.
> 
> +    if (rc)
> +        goto out;
> +    rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +                  L3_PRD_PURGE_REQ);
> +    if (rc)
> +        goto out;
> +
> +    return rc;
> +
> +out:
> +    prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +          "failed %i\n", core_id, rc);
> +    return rc;

I still like goto-less option better.

> 
>>
>>
>>> +    return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>> +}
>>> +
>>> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    unsigned long now = mftb();
>>> +    unsigned long end = now + msecs_to_tb(2);
>>> +    uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
>>> +
>>> +    while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
>>
>> for (val = L2_PRD_PURGE_CMD_REG_BUSY;;)
> 
> If the concern is an unnecessary initialiser then use do/while? Better
> than using a for.

imho it is not. We exit the loop because of that flag so making
initialization a part of the loop is a good thing. It is a very minor
thing though, I am fine either way.

> 
>>
>> and....
>>
>>> +        rc = xscom_read(chip_id, addr, &val);
>>> +        if (rc) {
>>> +            prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
>>> +                  "failed %i\n", core_id, rc);
>>> +            break;
>>> +        }
>>
>>
>> ... break out here:
>>     if (val & L2_PRD_PURGE_CMD_REG_BUSY)
>>         break;
>>
>> As if you read val with the bit set, then checking timeout is rather
>> pointless, no? >
>>
>>> +        now = mftb();
>>> +        if (tb_compare(now, end) == TB_AAFTERB) {
>>> +            prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
>>> +                  core_id, rc);
>>> +            return OPAL_BUSY;
>>> +        }
>>> +    }
>>> +
>>> +    /* We have to clear the trigger bit ourselves */
>>> +    val &= ~L2_PRD_PURGE_CMD_TRIGGER;
>>> +    rc = xscom_write(chip_id, addr, val);
>>> +    if (rc)
>>> +        prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
>>> +              core_id, rc);
>>> +    return rc;
>>> +
>>
>> Unnecessary empty line.
>>
>>
>>> +}
>>> +
>>> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc = 0;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +    rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
>>> +                  L3_PRD_PURGE_TTYPE_MASK);
>>> +    if (rc)
>>> +        goto out;
>>> +    rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
>>> +                  L3_PRD_PURGE_REQ);
>>> +    if (rc)
>>> +out:        prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
>>> +              "failed %i\n", core_id, rc);
>>> +    return rc;
>>> +
>>> +}
>>> +
>>> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
>>> +{
>>> +    int rc;
>>> +    unsigned long now = mftb();
>>> +    unsigned long end = now + msecs_to_tb(2);
>>> +    uint64_t val = L3_PRD_PURGE_REQ;
>>> +    uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
>>> +
>>> +    /* Trigger bit is automatically set to zero when flushing is done*/
>>> +    while (val & L3_PRD_PURGE_REQ) {
>>> +        rc = xscom_read(chip_id, addr, &val);
>>> +        if (rc) {
>>> +            prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
>>> +                  "failed %i\n", core_id, rc);
>>> +            break;
>>> +        }
>>> +        now = mftb();
>>> +        if (tb_compare(now, end) == TB_AAFTERB) {
>>> +            prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
>>> +                  core_id, rc);
>>> +            return OPAL_BUSY;
>>> +        }
>>> +    }
>>> +    return rc;
>>> +}
>>> +
>>> +static int64_t purge_l2_l3_caches(void)
>>> +{
>>> +    int rc = 0;
>>> +    struct cpu_thread *t;
>>> +    uint64_t core_id, prev_core_id = 0xdeadbeef;
>>
>> I'd make it (uint64_t)-1 :) deafbeaf is more for poisoning memory.
>>
>>
>>> +
>>> +    if (proc_gen != proc_gen_p9)
>>> +        return OPAL_UNSUPPORTED;
>>
>>
>> I assume this is a leftover from when it was an OPAL call. There is no
>> even small chance for this to be called for anything but p9, right?
>>
>>
>>> +
>>> +    for_each_ungarded_cpu(t) {
>>> +        /* Only need to do it once per core chiplet */
>>> +        core_id = pir_to_core_id(t->pir);
>>> +        if (prev_core_id == core_id)
>>> +            continue;
>>> +        prev_core_id = core_id;
>>> +        if (start_l2_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +        if (start_l3_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +    }
>>> +
>>> +    for_each_ungarded_cpu(t) {
>>> +        /* Only need to do it once per core chiplet */
>>> +        core_id = pir_to_core_id(t->pir);
>>> +        if (prev_core_id == core_id)
>>> +            continue;
>>
>> Looks like if there is a single core at all (quite unlikely but still a
>> possible situation), we won't wait below at all.
>>
>>> +        prev_core_id = core_id;
>>> +
>>> +        if (wait_l2_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +        if (wait_l3_purge(t->chip_id, core_id))
>>> +            goto out;
>>> +    }
>>> +    return rc;
>>
>>
>> return OPAL_SUCCESS;
>>
>>
>>> +out:
>>> +    prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);
>>> +    return rc;
>>
>> return OPAL_BUSY_EVENT. Now it always returns 0.
>>
>>
>>> +
>>> +}
>>> +
>>>   static int64_t npu2_hreset(struct pci_slot *slot __unused)
>>>   {
>>>       struct npu2 *p;
>>> @@ -1125,6 +1260,7 @@ static int64_t npu2_hreset(struct pci_slot
>>> *slot __unused)
>>>               reset_ntl(ndev);
>>>           }
>>>       }
>>> +    purge_l2_l3_caches();
>>
>>
>> This takes care of the guest termination case (which is good).
>>
>> Now in order to cover the period between the guest started rebooting and
>> the guest started onlining the memory, we also need purge_l2_l3_caches()
>> near npu2_dev_procedure_reset() in npu2_dev_cfg_exp_devcap() as this is
>> the reset method used for NPU when the guest reboots.
>>
>>
>>
>>>       return OPAL_SUCCESS;
>>
>>
>> return purge_l2_l3_caches(), otherwise the timeout error is lost.
>>
>>
>>
>>
>>>   }
>>>   diff --git a/include/npu2-regs.h b/include/npu2-regs.h
>>> index 165e0b79..20a61e28 100644
>>> --- a/include/npu2-regs.h
>>> +++ b/include/npu2-regs.h
>>> @@ -749,4 +749,15 @@ void npu2_scom_write(uint64_t gcid, uint64_t
>>> scom_base,
>>>   #define OB3_ODL1_TRAINING_STATUS        0xC01082F
>>>   #define   OB_ODL_TRAINING_STATUS_STS_RX_PATTERN_B PPC_BITMASK(8, 15)
>>>   +/* Registers and bits used to clear the L2 and L3 cache */
>>> +#define L2_PRD_PURGE_CMD_REG             0x1080E
>>> +#define L2_PRD_PURGE_CMD_REG_BUSY         0x0040000000000000
>>> +#define L2_PRD_PURGE_CMD_TYPE_MASK        PPC_BIT(1) | PPC_BIT(2) |
>>> PPC_BIT(3) | PPC_BIT(4)
>>> +#define L2_PRD_PURGE_CMD_TRIGGER        PPC_BIT(0)
>>> +#define L2CAC_FLUSH                0x0
>>> +#define L3_PRD_PURGE_REG            0x1180E
>>> +#define L3_PRD_PURGE_REQ            PPC_BIT(0)
>>> +#define L3_PRD_PURGE_TTYPE_MASK         PPC_BIT(1) | PPC_BIT(2) |
>>> PPC_BIT(3) | PPC_BIT(4)
>>> +#define L3_FULL_PURGE                0x0
>>> +
>>>   #endif /* __NPU2_REGS_H */
>>>
>>
> 

-- 
Alexey