[Skiboot] [PATCH] Add in new OPAL call to flush the L2 and L3 caches.
Oliver
oohall at gmail.com
Mon Nov 5 12:57:22 AEDT 2018
On Mon, Oct 29, 2018 at 5:19 PM Rashmica Gupta <rashmica.g at gmail.com> wrote:
>
> Using the dcbf instruction is really slow... This is much faster.
>
> Suggested-by: Alistair Popple <alistair at popple.id.au>
> Signed-off-by: Rashmica Gupta <rashmica.g at gmail.com>
> ---
> Stewart: I realise that cpu.c is probably not where this should live...
> Thoughts on where it should go?
>
> Context: When resetting a GPU we want to make sure all dirty cache lines
> in the CPU cache are cleared. Hit up Alexey and Alistair for the nitty
> gritty details.
>
> core/cpu.c | 108 +++++++++++++++++++++++++++++++++++++++++++++
> include/cpu.h | 2 +
> include/opal-api.h | 3 +-
> 3 files changed, 112 insertions(+), 1 deletion(-)
>
> diff --git a/core/cpu.c b/core/cpu.c
> index 4f518a4c..bc4fcaf8 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -1630,3 +1630,111 @@ static int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr)
> return rc;
> }
> opal_call(OPAL_NMMU_SET_PTCR, opal_nmmu_set_ptcr, 2);
> +
> +
> +#define L2_PRD_PURGE_CMD_REG 0x1080E
> +#define L2_PRD_PURGE_CMD_REG_BUSY 0x0040000000000000
> +#define L2_PRD_PURGE_CMD_TRIGGER 0x8000000000000000
> +#define L3_PRD_PURGE_REG 0x1180E
> +#define L3_PRD_PURGE_REQ 0x8000000000000000
> +#define TIMEOUT_MS 2
> +
> +static inline bool time_expired(unsigned long start)
> +{
> + unsigned long time = tb_to_msecs(mftb());
> +
> + if (time - start > TIMEOUT_MS) {
> + return true;
> + }
> + return false;
> +}
> +
> +static int flush_l2_caches(uint32_t chip_id, uint32_t core_id)
> +{
> + int rc, timeout = 0;
> + unsigned long start_time;
> + uint64_t val = L2_PRD_PURGE_CMD_REG_BUSY;
> + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
> +
> + rc = xscom_write_mask(chip_id, addr, L2_PRD_PURGE_CMD_TRIGGER,
> + L2_PRD_PURGE_CMD_TRIGGER);
> + if (rc) {
> + prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
> + }
> + start_time = tb_to_msecs(mftb());
> + while ((val & L2_PRD_PURGE_CMD_REG_BUSY) && !(timeout = time_expired(start_time))) {
> + rc = xscom_read(chip_id, addr, &val);
> + if (rc) {
> + prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> + break;
> + }
> + }
> + if (timeout) {
> + prlog(PR_ERR, "FLUSH L2 on core 0x%x timed out %i\n", core_id, rc);
> + return OPAL_BUSY;
> + }
> +
> + /* We have to clear the trigger bit ourselves */
> + val &= ~L2_PRD_PURGE_CMD_TRIGGER;
> + rc = xscom_write(chip_id, addr, val);
> + if (rc)
> + prlog(PR_ERR, "FLUSH L2 on core 0x%x: XSCOM write failed %i\n", core_id, rc);
> + return 0;
> +
> +}
> +
> +static int flush_l3_caches(uint32_t chip_id, uint32_t core_id)
> +{
> + int rc, timeout = 0;
> + unsigned long start_time;
> + uint64_t val = L3_PRD_PURGE_REQ;
> + uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
> +
> + rc = xscom_write_mask(chip_id, addr, L3_PRD_PURGE_REQ, L3_PRD_PURGE_REQ);
> + if (rc) {
> + prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM write_mask failed %i\n", core_id, rc);
> + }
> +
> + /* Trigger bit is automatically set to zero when flushing is done */
> + start_time = tb_to_msecs(mftb());
> + while ((val & L3_PRD_PURGE_REQ) && !(timeout = time_expired(start_time))) {
> + rc = xscom_read(chip_id, addr, &val);
> + if (rc) {
> + prlog(PR_ERR, "FLUSH L3 on core 0x%x: XSCOM read failed %i\n", core_id, rc);
> + break;
> + }
> + }
This seems like one of the few occasions where a do {} while() loop
makes sense. Not a big deal, though.
> + if (timeout) {
> + prlog(PR_ERR, "FLUSH L3 on core 0x%x timed out %i\n", core_id, rc);
> + return OPAL_BUSY;
> + }
> +
> + return 0;
> +}
> +
> +int flush_caches(void)
> +{
> + int rc = 0;
> + struct cpu_thread *t;
> + uint64_t chip_id, core_id, prev_core_id = 0xdeadbeef;
> +
> + if ((mfspr(SPR_PVR) & PVR_TYPE_P9) != PVR_TYPE_P9)
> + return OPAL_UNSUPPORTED;
This looks broken. The processor type portion of the PVR is in the
upper 16 bits:
#define SPR_PVR_TYPE 0xffff0000
And the PVR_TYPE_* macros are shifted down:
#define PVR_TYPE_P9 0x004e
If you want to check the PVR this way then you need:
PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P9 instead. That said, we
generally don't look at the PVR directly and use the "proc_gen" global
instead: e.g.
if (proc_gen != proc_gen_p9)
> + for_each_cpu(t) {
Does this need to be for_each_ungarded_cpu()?
> + /* Only need to do it once per core chiplet */
> + core_id = pir_to_core_id(t->pir);
> + if (prev_core_id == core_id)
> + continue;
The L2 and L3 are shared between pairs of cores (i.e. one per EX
chiplet) rather than being per-core, so I think this is doubling up on
flushes. Also, do we need to do a special wakeup here? I'm not sure
what happens to the caches if both cores in the pair are in a deep
sleep state.
> + prev_core_id = core_id;
> + chip_id = t->chip_id;
> +
> + rc |= flush_l2_caches(chip_id, core_id);
> + rc |= flush_l3_caches(chip_id, core_id);
If each call fails in a different manner you'll get gibberish in rc
since they're ORed together. Maybe we don't care but eh... it seems a
bit bad.
> + }
> +
> + return rc;
> +}
> +
> +
> +opal_call(OPAL_CLEAR_CACHE, flush_caches, 0);
I'd rather you called this CLEAR_L2_L3 or something. It doesn't touch
the L1 at all so I think calling it "CLEAR_CACHE" is a bit misleading.
As an alternative, you could also adjust the API to something like
opal_clear_cache(pir, mask_of_levels) and make opal_clear_cache(-1,
PURGE_L2 | PURGE_L3) match the current behaviour.
> diff --git a/include/cpu.h b/include/cpu.h
> index 2fe47982..04c862c5 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -329,4 +329,6 @@ int dctl_set_special_wakeup(struct cpu_thread *t);
> int dctl_clear_special_wakeup(struct cpu_thread *t);
> int dctl_core_is_gated(struct cpu_thread *t);
>
> +extern int flush_caches(void);
> +
> #endif /* __CPU_H */
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 5f397c8e..c24838d2 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -226,7 +226,8 @@
> #define OPAL_NX_COPROC_INIT 167
> #define OPAL_NPU_SET_RELAXED_ORDER 168
> #define OPAL_NPU_GET_RELAXED_ORDER 169
> -#define OPAL_LAST 169
> +#define OPAL_CLEAR_CACHE 170
> +#define OPAL_LAST 170
>
> #define QUIESCE_HOLD 1 /* Spin all calls at entry */
> #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
> --
> 2.17.2
As a final comment, it seems like we might be better off making this a
multi-pass thing. Currently we walk the list of CPUs once and wait for
the purge to finish on each before continuing. Since we don't really
have a good error handling story (I guess you fall back to doing a
dcbf loop in linux?) it might make sense to do something like:
for_each_cpu()
        start_l2_purge()
for_each_cpu()
        wait_for_l2_purge()
for_each_cpu()
        start_l3_purge()
for_each_cpu()
        wait_for_l3_purge()
and bail out if we hit an error at any point. As a general rule we
want OPAL calls to be handled in microseconds, since OPAL runs in
real mode with interrupts disabled. I'm not sure how fast a purge is,
but if it's a significant fraction of the 2 millisecond timeout window
then we might end up in OPAL for dozens of milliseconds in the worst
case. This might just be micro-optimising though, since I expect this
is only going to be called once in a blue moon, but it's something to
consider.
>
> _______________________________________________
> Skiboot mailing list
> Skiboot at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot