[Skiboot] [PATCH v3] Add purging CPU L2 and L3 caches into NPU hreset.

Tue Dec 4 11:56:11 AEDT 2018

On Mon, Dec 3, 2018 at 5:49 PM Rashmica Gupta <rashmica.g at gmail.com> wrote:
>
> If a GPU is passed through to a guest and the guest unexpectedly terminates,
> there can be cache lines in CPUs that belong to the GPU. So purge the caches
> as part of the reset sequence. L1 is write-through, so it doesn't need to be purged.
>
> This also needs to be called if the guest reboots, so call it in
> npu2_dev_cfg_exp_devcap().
>
> The sequence for purging the L2 and L3 caches, as provided by the hw team:
>
> "L2 purge:
>  (1) initiate purge
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TYPE L2CAC_FLUSH -all
>  putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER ON -all
>
>  (2) check this is off in all caches to know purge completed
>  getspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_REG_BUSY -all
>
>  (3) putspy pu.ex EXP.L2.L2MISC.L2CERRS.PRD_PURGE_CMD_TRIGGER OFF -all
>
> L3 purge:
>  1) Start the purge:
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_TTYPE FULL_PURGE -all
>  putspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ ON -all
>
>  2) Ensure that the purge has completed by checking the status bit:
>  getspy pu.ex EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ -all
>
>  You should see it say OFF if it's done:
>  p9n.ex k0:n0:s0:p00:c0
>  EXP.L3.L3_MISC.L3CERRS.L3_PRD_PURGE_REQ
>  OFF"
>
> Suggested-by: Alistair Popple <alistair at popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g at gmail.com>
> ---
>
> This is done synchronously for now as it doesn't seem to take *too* long
> (purging the L2 and L3 caches after building the 4.16 Linux kernel on a p9
> with 16 cores took 1.57 ms, 1.49 ms and 1.46 ms).
>
>
>  hw/npu2.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
>  include/npu2-regs.h |  11 ++++
>  2 files changed, 145 insertions(+), 1 deletion(-)
>
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 30049f5b..9c0e6114 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -326,6 +326,136 @@ static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
>         return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
>  }
>
> +static int start_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L2CAC_FLUSH,
> +                             L2_PRD_PURGE_CMD_TYPE_MASK);
> +       if (!rc)
> +               rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> +                             L2_PRD_PURGE_CMD_TRIGGER);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write_mask "
> +                     "failed %i\n", core_id, rc);
> +       return rc;
> +}
> +
> +static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       unsigned long now = mftb();
> +       unsigned long end = now + msecs_to_tb(2);
> +       uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> +       while (val & L2_PRD_PURGE_CMD_REG_BUSY) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM read "
> +                             "failed %i\n", core_id, rc);
> +                       break;
> +               }
> +               if (!(val & L2_PRD_PURGE_CMD_REG_BUSY))
> +                       break;
> +               now = mftb();
> +               if (tb_compare(now, end) == TB_AAFTERB) {
> +                       prlog(PR_ERR, "PURGE L2 on core 0x%x timed out %i\n",
> +                             core_id, rc);
> +                       return OPAL_BUSY;
> +               }
> +       }
> +
> +       /* We have to clear the trigger bit ourselves */
> +       val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> +       rc = xscom_write(chip_id, addr, val);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L2 on core 0x%x: XSCOM write failed %i\n",
> +                     core_id, rc);
> +       return rc;
> +}
> +
> +static int start_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +       rc = xscom_write_mask(chip_id, addr, L3_FULL_PURGE,
> +                             L3_PRD_PURGE_TTYPE_MASK);
> +       if (!rc)
> +               rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ,
> +                             L3_PRD_PURGE_REQ);
> +       if (rc)
> +               prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM write_mask "
> +                     "failed %i\n", core_id, rc);
> +       return rc;
> +}
> +
> +static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
> +{
> +       int rc;
> +       unsigned long now = mftb();
> +       unsigned long end = now + msecs_to_tb(2);
> +       uint64_t val = L3_PRD_PURGE_REQ;
> +       uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> +       /* Trigger bit is automatically set to zero when flushing is done */
> +       while (val & L3_PRD_PURGE_REQ) {
> +               rc = xscom_read(chip_id, addr, &val);
> +               if (rc) {
> +                       prlog(PR_ERR, "PURGE L3 on core 0x%x: XSCOM read "
> +                             "failed %i\n", core_id, rc);
> +                       break;
> +               }
> +               if (!(val & L3_PRD_PURGE_REQ))
> +                       break;
> +               now = mftb();
> +               if (tb_compare(now, end) == TB_AAFTERB) {
> +                       prlog(PR_ERR, "PURGE L3 on core 0x%x timed out %i\n",
> +                             core_id, rc);
> +                       return OPAL_BUSY;
> +               }
> +       }
> +       return rc;
> +}
> +
> +static int64_t purge_l2_l3_caches(void)
> +{
> +       struct cpu_thread *t;
> +       uint64_t core_id, prev_core_id = (uint64_t)-1;
> +
> +       for_each_ungarded_cpu(t) {
> +               /* Only need to do it once per core chiplet */
> +               core_id = pir_to_core_id(t->pir);
> +               if (prev_core_id == core_id)
> +                       continue;
> +               prev_core_id = core_id;
> +               if (start_l2_purge(t->chip_id, core_id))
> +                       goto out;
> +               if (start_l3_purge(t->chip_id, core_id))
> +                       goto out;
> +       }
> +
> +       prev_core_id = (uint64_t)-1;
> +       for_each_ungarded_cpu(t) {
> +               /* Only need to do it once per core chiplet */
> +               core_id = pir_to_core_id(t->pir);
> +               if (prev_core_id == core_id)
> +                       continue;
> +               prev_core_id = core_id;
> +
> +               if (wait_l2_purge(t->chip_id, core_id))
> +                       goto out;
> +               if (wait_l3_purge(t->chip_id, core_id))
> +                       goto out;
> +       }
> +       return OPAL_SUCCESS;
> +out:

> +       prlog(PR_ERR, "Failed on core: 0x%llx\n", core_id);

You can probably delete this. It provides no useful information and
the called functions already print an error message.

> +       return OPAL_BUSY_EVENT;
> +}
> +
>  static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>                 struct pci_cfg_reg_filter *pcrf __unused,
>                 uint32_t offset, uint32_t size,
> @@ -346,6 +476,9 @@ static int64_t npu2_dev_cfg_exp_devcap(void *dev,
>         if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
>                 npu2_dev_procedure_reset(ndev);

> +       if (purge_l2_l3_caches())
> +               return OPAL_BUSY_EVENT;
> +
>         return OPAL_PARTIAL;
>  }
>
> @@ -1125,7 +1258,7 @@ static int64_t npu2_hreset(struct pci_slot *slot __unused)
>                         reset_ntl(ndev);
>                 }
>         }
> -       return OPAL_SUCCESS;
> +       return purge_l2_l3_caches();

This is more of a question for Alexy, but why are we returning
OPAL_BUSY_EVENT here? It seems like a weird hack to determine when the
cache flush failed.