[Skiboot] [PATCH] core/flash: Make opal_flash_op() actually asynchronous

Sat Jul 1 05:21:47 AEST 2017

On Thu, Jun 29, 2017 at 10:37 PM, Cyril Bur <cyril.bur at au1.ibm.com> wrote:
>
> This patch provides a simple (although not particularly efficient)
> asynchronous capability to the opal_flash interface. The advantage of
> this approach is that it doesn't require any changing of blocklevel or
> its backends to provide an asynchronous implementation. This is also the
> disadvantage of this implementation as all it actually does is break the
> work up in chunks that it can performed quickly, but still
> synchronously. Only a backend could provide a more asynchronous
> implementation.
>
> This solves a problem we have right now with the opal_flash_erase call
> where it can block in Skiboot for around three minutes. This causes a
> variety of problems in Linux due to a processor being gone for a long
> time.
> For example:
>
> [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> [   98.610060] Task dump for CPU 113:
> [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> [   98.610066] Call Trace:
> [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
>
> It is for this reason that breaking the work up in smaller chunks solves
> this problem as Skiboot can return the CPU to Linux between chunks to
> avoid Linux getting upset.

Any timing measurements on how this affects flash read/write speed
from the host?

>
>
> Reported-By: Samuel Mendoza-Jonas <sam at mendozajonas.com>
> Signed-off-by: Cyril Bur <cyril.bur at au1.ibm.com>
> ---
> Like this Alistair? Something something you're correct. Lets go with
> this one.
>
>  core/flash.c                            | 116 +++++++++++++++++++++++++++-----
>  doc/opal-api/opal-flash-110-111-112.rst |   4 ++
>  2 files changed, 103 insertions(+), 17 deletions(-)
>
> diff --git a/core/flash.c b/core/flash.c
> index 177f7ae1..71cc3c87 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,23 @@
>  #include <libstb/stb.h>
>  #include <libstb/container.h>
>  #include <elf.h>
> +#include <timer.h>
> +#include <timebase.h>
> +
> +enum flash_op {
> +       FLASH_OP_READ,
> +       FLASH_OP_WRITE,
> +       FLASH_OP_ERASE,
> +};
> +
> +struct flash_async_info {
> +       enum flash_op op;
> +       struct timer poller;
> +       uint64_t token;
> +       uint64_t pos;
> +       uint64_t len;
> +       uint64_t buf;
> +};
>
>  struct flash {
>         struct list_node        list;
> @@ -36,6 +53,7 @@ struct flash {
>         uint64_t                size;
>         uint32_t                block_size;
>         int                     id;
> +       struct flash_async_info async;
>  };
>
>  static LIST_HEAD(flashes);
> @@ -200,6 +218,53 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
>
>  /* core flash support */
>
> +/*
> + * Called with flash lock held, drop it on async completion
> + */
> +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> +{
> +       struct flash *flash = data;
> +       uint64_t offset, buf, len;
> +       int rc;
> +
> +       offset = flash->async.pos;
> +       buf = flash->async.buf;
> +       len = MIN(flash->async.len, flash->block_size);
> +
> +       switch (flash->async.op) {
> +       case FLASH_OP_READ:
> +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> +               break;
> +       case FLASH_OP_WRITE:
> +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> +               break;
> +       case FLASH_OP_ERASE:
> +               rc = blocklevel_erase(flash->bl, offset, len);
> +               break;
> +       default:
> +               assert(0);
> +       }
> +
> +       if (rc)
> +               rc = OPAL_HARDWARE;
> +
> +       flash->async.pos += len;
> +       flash->async.buf += len;
> +       flash->async.len -= len;
> +       if (!rc && flash->async.len) {
> +               /*
> +                * We want to get called pretty much straight away, just have
> +                * to be sure that we jump back out to Linux so that if this
> +                * very long we don't cause RCU or the scheduler to freak
> +                */
> +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> +               return;
> +       }
> +
> +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       flash_release(flash);
> +}
> +
>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
>  {
>         struct dt_node *flash_node;
> @@ -295,6 +360,7 @@ int flash_register(struct blocklevel_device *bl)
>         flash->size = size;
>         flash->block_size = block_size;
>         flash->id = num_flashes();
> +       init_timer(&flash->async.poller, flash_poll, flash);
>
>         list_add(&flashes, &flash->list);
>
> @@ -323,16 +389,11 @@ int flash_register(struct blocklevel_device *bl)
>         return OPAL_SUCCESS;
>  }
>
> -enum flash_op {
> -       FLASH_OP_READ,
> -       FLASH_OP_WRITE,
> -       FLASH_OP_ERASE,
> -};
> -
>  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 uint64_t buf, uint64_t size, uint64_t token)
>  {
>         struct flash *flash = NULL;
> +       uint64_t len;
>         int rc;
>
>         list_for_each(&flashes, flash, list)
> @@ -351,9 +412,16 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
>                                 op, flash->size);
>                 rc = OPAL_PARAMETER;
> -               goto err;
> +               goto out;
>         }
>
> +       len = MIN(size, flash->block_size);
> +       flash->async.op = op;
> +       flash->async.token = token;
> +       flash->async.buf = buf + len;
> +       flash->async.len = size - len;
> +       flash->async.pos = offset + len;
> +
>         /*
>          * These ops intentionally have no smarts (ecc correction or erase
>          * before write) to them.
> @@ -363,29 +431,43 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>          */
>         switch (op) {
>         case FLASH_OP_READ:
> -               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
> +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
>                 break;
>         case FLASH_OP_WRITE:
> -               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
>                 break;
>         case FLASH_OP_ERASE:
> -               rc = blocklevel_erase(flash->bl, offset, size);
> +               rc = blocklevel_erase(flash->bl, offset, len);
>                 break;
>         default:
>                 assert(0);
>         }
>
>         if (rc) {
> +               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
>                 rc = OPAL_HARDWARE;
> -               goto err;
> +               goto out;
>         }

Some code duplication here now with the new flash_poll() function.
Might be worth a refactor.

>
> -       flash_release(flash);
> -
> -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> -       return OPAL_ASYNC_COMPLETION;
> -
> -err:
> +       if (size - len) {
> +               /* Work remains */
> +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> +               /* Don't release the flash */
> +               return OPAL_ASYNC_COMPLETION;
> +       } else {
> +               /*
> +                * As tempting as it might be here to return OPAL_SUCCESS
> +                * here, don't! As of 1/07/2017 the powernv_flash driver in
> +                * Linux will handle OPAL_SUCCESS as an error, the only thing
> +                * that makes it handle things as though they're working is
> +                * receiving OPAL_ASYNC_COMPLETION.
> +                *
> +                * XXX TODO: Revisit this in a few years *sigh*
> +                */
> +               opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       }
> +       rc = OPAL_ASYNC_COMPLETION;
> +out:
>         flash_release(flash);
>         return rc;
>  }
> diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
> index 71ba866d..086c4095 100644
> --- a/doc/opal-api/opal-flash-110-111-112.rst
> +++ b/doc/opal-api/opal-flash-110-111-112.rst
> @@ -20,6 +20,10 @@ success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
>  opal_async_completion message will be sent (with the appropriate token
>  argument) when the operation completes.
>
> +Due to an error in the powernv_flash driver in Linux these three OPAL
> +calls should never return ``OPAL_SUCCESS`` as the driver is likely to
> +treat this return value as an error.
> +
>  All calls share the same return values:
>
>  ``OPAL_ASYNC_COMPLETION``
> --
> 2.13.2
>