[Skiboot] [PATCH] core/flash: Make opal_flash_op() actually asynchronous

Mon Jul 3 11:16:16 AEST 2017

On Fri, 2017-06-30 at 12:21 -0700, Rob Lippert wrote:
> On Thu, Jun 29, 2017 at 10:37 PM, Cyril Bur <cyril.bur at au1.ibm.com> wrote:
> > 
> > This patch provides a simple (although not particularly efficient)
> > asynchronous capability to the opal_flash interface. The advantage of
> > this approach is that it doesn't require any changing of blocklevel or
> > its backends to provide an asynchronous implementation. This is also the
> > disadvantage of this implementation as all it actually does is break the
> > work up in chunks that it can performed quickly, but still
> > synchronously. Only a backend could provide a more asynchronous
> > implementation.
> > 
> > This solves a problem we have right now with the opal_flash_erase call
> > where it can block in Skiboot for around three minutes. This causes a
> > variety of problems in Linux due to a processor being gone for a long
> > time.
> > For example:
> > 
> > [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> > [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> > [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> > [   98.610060] Task dump for CPU 113:
> > [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> > [   98.610066] Call Trace:
> > [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> > [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> > [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> > [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
> > 
> > It is for this reason that breaking the work up in smaller chunks solves
> > this problem as Skiboot can return the CPU to Linux between chunks to
> > avoid Linux getting upset.
> 
> Any timing measurements on how this affects flash read/write speed
> from the host?
> 

Didn't take any measurements, purely interested in removing the RCU
stalls. I suppose in theory it is slower, unfortunately there isn't
much we can do.

The only thing I can think of is actually batching up more than one
erase block per flash_poll(). Unfortunately I don't really have the
cycles for a full performance analysis of how many would be best.

Cyril

> > 
> > 
> > Reported-By: Samuel Mendoza-Jonas <sam at mendozajonas.com>
> > Signed-off-by: Cyril Bur <cyril.bur at au1.ibm.com>
> > ---
> > Like this Alistair? Something something you're correct. Lets go with
> > this one.
> > 
> >  core/flash.c                            | 116 +++++++++++++++++++++++++++-----
> >  doc/opal-api/opal-flash-110-111-112.rst |   4 ++
> >  2 files changed, 103 insertions(+), 17 deletions(-)
> > 
> > diff --git a/core/flash.c b/core/flash.c
> > index 177f7ae1..71cc3c87 100644
> > --- a/core/flash.c
> > +++ b/core/flash.c
> > @@ -28,6 +28,23 @@
> >  #include <libstb/stb.h>
> >  #include <libstb/container.h>
> >  #include <elf.h>
> > +#include <timer.h>
> > +#include <timebase.h>
> > +
> > +enum flash_op {
> > +       FLASH_OP_READ,
> > +       FLASH_OP_WRITE,
> > +       FLASH_OP_ERASE,
> > +};
> > +
> > +struct flash_async_info {
> > +       enum flash_op op;
> > +       struct timer poller;
> > +       uint64_t token;
> > +       uint64_t pos;
> > +       uint64_t len;
> > +       uint64_t buf;
> > +};
> > 
> >  struct flash {
> >         struct list_node        list;
> > @@ -36,6 +53,7 @@ struct flash {
> >         uint64_t                size;
> >         uint32_t                block_size;
> >         int                     id;
> > +       struct flash_async_info async;
> >  };
> > 
> >  static LIST_HEAD(flashes);
> > @@ -200,6 +218,53 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
> > 
> >  /* core flash support */
> > 
> > +/*
> > + * Called with flash lock held, drop it on async completion
> > + */
> > +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> > +{
> > +       struct flash *flash = data;
> > +       uint64_t offset, buf, len;
> > +       int rc;
> > +
> > +       offset = flash->async.pos;
> > +       buf = flash->async.buf;
> > +       len = MIN(flash->async.len, flash->block_size);
> > +
> > +       switch (flash->async.op) {
> > +       case FLASH_OP_READ:
> > +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> > +               break;
> > +       case FLASH_OP_WRITE:
> > +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> > +               break;
> > +       case FLASH_OP_ERASE:
> > +               rc = blocklevel_erase(flash->bl, offset, len);
> > +               break;
> > +       default:
> > +               assert(0);
> > +       }
> > +
> > +       if (rc)
> > +               rc = OPAL_HARDWARE;
> > +
> > +       flash->async.pos += len;
> > +       flash->async.buf += len;
> > +       flash->async.len -= len;
> > +       if (!rc && flash->async.len) {
> > +               /*
> > +                * We want to get called pretty much straight away, just have
> > +                * to be sure that we jump back out to Linux so that if this
> > +                * very long we don't cause RCU or the scheduler to freak
> > +                */
> > +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> > +               return;
> > +       }
> > +
> > +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> > +       flash_release(flash);
> > +}
> > +
> >  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> >  {
> >         struct dt_node *flash_node;
> > @@ -295,6 +360,7 @@ int flash_register(struct blocklevel_device *bl)
> >         flash->size = size;
> >         flash->block_size = block_size;
> >         flash->id = num_flashes();
> > +       init_timer(&flash->async.poller, flash_poll, flash);
> > 
> >         list_add(&flashes, &flash->list);
> > 
> > @@ -323,16 +389,11 @@ int flash_register(struct blocklevel_device *bl)
> >         return OPAL_SUCCESS;
> >  }
> > 
> > -enum flash_op {
> > -       FLASH_OP_READ,
> > -       FLASH_OP_WRITE,
> > -       FLASH_OP_ERASE,
> > -};
> > -
> >  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >                 uint64_t buf, uint64_t size, uint64_t token)
> >  {
> >         struct flash *flash = NULL;
> > +       uint64_t len;
> >         int rc;
> > 
> >         list_for_each(&flashes, flash, list)
> > @@ -351,9 +412,16 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >                 prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
> >                                 op, flash->size);
> >                 rc = OPAL_PARAMETER;
> > -               goto err;
> > +               goto out;
> >         }
> > 
> > +       len = MIN(size, flash->block_size);
> > +       flash->async.op = op;
> > +       flash->async.token = token;
> > +       flash->async.buf = buf + len;
> > +       flash->async.len = size - len;
> > +       flash->async.pos = offset + len;
> > +
> >         /*
> >          * These ops intentionally have no smarts (ecc correction or erase
> >          * before write) to them.
> > @@ -363,29 +431,43 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >          */
> >         switch (op) {
> >         case FLASH_OP_READ:
> > -               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
> > +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> >                 break;
> >         case FLASH_OP_WRITE:
> > -               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> > +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> >                 break;
> >         case FLASH_OP_ERASE:
> > -               rc = blocklevel_erase(flash->bl, offset, size);
> > +               rc = blocklevel_erase(flash->bl, offset, len);
> >                 break;
> >         default:
> >                 assert(0);
> >         }
> > 
> >         if (rc) {
> > +               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> >                 rc = OPAL_HARDWARE;
> > -               goto err;
> > +               goto out;
> >         }
> 
> Some code duplication here now with the new flash_poll() function.
> Might be worth a refactor.

Could potentially pull out the switch statement. Not that much though I
suppose.

> 
> > 
> > -       flash_release(flash);
> > -
> > -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> > -       return OPAL_ASYNC_COMPLETION;
> > -
> > -err:
> > +       if (size - len) {
> > +               /* Work remains */
> > +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> > +               /* Don't release the flash */
> > +               return OPAL_ASYNC_COMPLETION;
> > +       } else {
> > +               /*
> > +                * As tempting as it might be here to return OPAL_SUCCESS
> > +                * here, don't! As of 1/07/2017 the powernv_flash driver in
> > +                * Linux will handle OPAL_SUCCESS as an error, the only thing
> > +                * that makes it handle things as though they're working is
> > +                * receiving OPAL_ASYNC_COMPLETION.
> > +                *
> > +                * XXX TODO: Revisit this in a few years *sigh*
> > +                */
> > +               opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> > +       }
> > +       rc = OPAL_ASYNC_COMPLETION;
> > +out:
> >         flash_release(flash);
> >         return rc;
> >  }
> > diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
> > index 71ba866d..086c4095 100644
> > --- a/doc/opal-api/opal-flash-110-111-112.rst
> > +++ b/doc/opal-api/opal-flash-110-111-112.rst
> > @@ -20,6 +20,10 @@ success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
> >  opal_async_completion message will be sent (with the appropriate token
> >  argument) when the operation completes.
> > 
> > +Due to an error in the powernv_flash driver in Linux these three OPAL
> > +calls should never return ``OPAL_SUCCESS`` as the driver is likely to
> > +treat this return value as an error.
> > +
> >  All calls share the same return values:
> > 
> >  ``OPAL_ASYNC_COMPLETION``
> > --
> > 2.13.2
> > 
> 
>