[Skiboot] [PATCH] core/flash: Make opal_flash_op() actually asynchronous
Rob Lippert
rlippert at google.com
Sat Jul 1 05:21:47 AEST 2017
On Thu, Jun 29, 2017 at 10:37 PM, Cyril Bur <cyril.bur at au1.ibm.com> wrote:
>
> This patch provides a simple (although not particularly efficient)
> asynchronous capability to the opal_flash interface. The advantage of
> this approach is that it doesn't require any changing of blocklevel or
> its backends to provide an asynchronous implementation. This is also the
> disadvantage of this implementation as all it actually does is break the
> work up in chunks that it can performed quickly, but still
> synchronously. Only a backend could provide a more asynchronous
> implementation.
>
> This solves a problem we have right now with the opal_flash_erase call
> where it can block in Skiboot for around three minutes. This causes a
> variety of problems in Linux due to a processor being gone for a long
> time.
> For example:
>
> [ 98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> [ 98.610050] 113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> [ 98.610051] (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> [ 98.610060] Task dump for CPU 113:
> [ 98.610062] pflash R running task 0 3335 3333 0x00040004
> [ 98.610066] Call Trace:
> [ 98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> [ 98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> [ 98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> [ 98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
>
> It is for this reason that breaking the work up in smaller chunks solves
> this problem as Skiboot can return the CPU to Linux between chunks to
> avoid Linux getting upset.
Any timing measurements on how this affects flash read/write speed
from the host?
>
>
> Reported-By: Samuel Mendoza-Jonas <sam at mendozajonas.com>
> Signed-off-by: Cyril Bur <cyril.bur at au1.ibm.com>
> ---
> Like this Alistair? Something something you're correct. Lets go with
> this one.
>
> core/flash.c | 116 +++++++++++++++++++++++++++-----
> doc/opal-api/opal-flash-110-111-112.rst | 4 ++
> 2 files changed, 103 insertions(+), 17 deletions(-)
>
> diff --git a/core/flash.c b/core/flash.c
> index 177f7ae1..71cc3c87 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,23 @@
> #include <libstb/stb.h>
> #include <libstb/container.h>
> #include <elf.h>
> +#include <timer.h>
> +#include <timebase.h>
> +
> +enum flash_op {
> + FLASH_OP_READ,
> + FLASH_OP_WRITE,
> + FLASH_OP_ERASE,
> +};
> +
> +struct flash_async_info {
> + enum flash_op op;
> + struct timer poller;
> + uint64_t token;
> + uint64_t pos;
> + uint64_t len;
> + uint64_t buf;
> +};
>
> struct flash {
> struct list_node list;
> @@ -36,6 +53,7 @@ struct flash {
> uint64_t size;
> uint32_t block_size;
> int id;
> + struct flash_async_info async;
> };
>
> static LIST_HEAD(flashes);
> @@ -200,6 +218,53 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
>
> /* core flash support */
>
> +/*
> + * Called with flash lock held, drop it on async completion
> + */
> +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> +{
> + struct flash *flash = data;
> + uint64_t offset, buf, len;
> + int rc;
> +
> + offset = flash->async.pos;
> + buf = flash->async.buf;
> + len = MIN(flash->async.len, flash->block_size);
> +
> + switch (flash->async.op) {
> + case FLASH_OP_READ:
> + rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> + break;
> + case FLASH_OP_WRITE:
> + rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> + break;
> + case FLASH_OP_ERASE:
> + rc = blocklevel_erase(flash->bl, offset, len);
> + break;
> + default:
> + assert(0);
> + }
> +
> + if (rc)
> + rc = OPAL_HARDWARE;
> +
> + flash->async.pos += len;
> + flash->async.buf += len;
> + flash->async.len -= len;
> + if (!rc && flash->async.len) {
> + /*
> + * We want to get called pretty much straight away, just have
> + * to be sure that we jump back out to Linux so that if this
> + * very long we don't cause RCU or the scheduler to freak
> + */
> + schedule_timer(&flash->async.poller, msecs_to_tb(1));
> + return;
> + }
> +
> + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> + flash_release(flash);
> +}
> +
> static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> {
> struct dt_node *flash_node;
> @@ -295,6 +360,7 @@ int flash_register(struct blocklevel_device *bl)
> flash->size = size;
> flash->block_size = block_size;
> flash->id = num_flashes();
> + init_timer(&flash->async.poller, flash_poll, flash);
>
> list_add(&flashes, &flash->list);
>
> @@ -323,16 +389,11 @@ int flash_register(struct blocklevel_device *bl)
> return OPAL_SUCCESS;
> }
>
> -enum flash_op {
> - FLASH_OP_READ,
> - FLASH_OP_WRITE,
> - FLASH_OP_ERASE,
> -};
> -
> static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> uint64_t buf, uint64_t size, uint64_t token)
> {
> struct flash *flash = NULL;
> + uint64_t len;
> int rc;
>
> list_for_each(&flashes, flash, list)
> @@ -351,9 +412,16 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
> op, flash->size);
> rc = OPAL_PARAMETER;
> - goto err;
> + goto out;
> }
>
> + len = MIN(size, flash->block_size);
> + flash->async.op = op;
> + flash->async.token = token;
> + flash->async.buf = buf + len;
> + flash->async.len = size - len;
> + flash->async.pos = offset + len;
> +
> /*
> * These ops intentionally have no smarts (ecc correction or erase
> * before write) to them.
> @@ -363,29 +431,43 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> */
> switch (op) {
> case FLASH_OP_READ:
> - rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
> + rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> break;
> case FLASH_OP_WRITE:
> - rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> + rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> break;
> case FLASH_OP_ERASE:
> - rc = blocklevel_erase(flash->bl, offset, size);
> + rc = blocklevel_erase(flash->bl, offset, len);
> break;
> default:
> assert(0);
> }
>
> if (rc) {
> + prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> rc = OPAL_HARDWARE;
> - goto err;
> + goto out;
> }
Some code duplication here now with the new flash_poll() function.
Might be worth a refactor.
>
> - flash_release(flash);
> -
> - opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> - return OPAL_ASYNC_COMPLETION;
> -
> -err:
> + if (size - len) {
> + /* Work remains */
> + schedule_timer(&flash->async.poller, msecs_to_tb(1));
> + /* Don't release the flash */
> + return OPAL_ASYNC_COMPLETION;
> + } else {
> + /*
> + * As tempting as it might be here to return OPAL_SUCCESS
> + * here, don't! As of 1/07/2017 the powernv_flash driver in
> + * Linux will handle OPAL_SUCCESS as an error, the only thing
> + * that makes it handle things as though they're working is
> + * receiving OPAL_ASYNC_COMPLETION.
> + *
> + * XXX TODO: Revisit this in a few years *sigh*
> + */
> + opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> + }
> + rc = OPAL_ASYNC_COMPLETION;
> +out:
> flash_release(flash);
> return rc;
> }
> diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
> index 71ba866d..086c4095 100644
> --- a/doc/opal-api/opal-flash-110-111-112.rst
> +++ b/doc/opal-api/opal-flash-110-111-112.rst
> @@ -20,6 +20,10 @@ success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
> opal_async_completion message will be sent (with the appropriate token
> argument) when the operation completes.
>
> +Due to an error in the powernv_flash driver in Linux these three OPAL
> +calls should never return ``OPAL_SUCCESS`` as the driver is likely to
> +treat this return value as an error.
> +
> All calls share the same return values:
>
> ``OPAL_ASYNC_COMPLETION``
> --
> 2.13.2
>
More information about the Skiboot
mailing list