[Skiboot] [PATCH v3 06/13] hiomap: Enable Async IPMI messaging

Oliver O'Halloran oohall at gmail.com
Mon Nov 25 14:12:19 AEDT 2019


On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc at linux.ibm.com> wrote:
>
> To provide higher layer async operations to access
> the target flash, enable hiomap to perform async
> ipmi messaging for call paths thru opal_flash_op.
>
> Special considerations and constraints are to prevent
> recursive locking and/or polling inside OPAL calls.
>
> Objective is to remove the hiomap_queue_msg_sync for
> moving windows (which already uses pollers) to allow
> async requests to perform their functions.
>
> Call paths thru opal_flash_op will determine if the
> requested operation needs to be re-queued, to allow
> skiboot to jump back out to Linux to prevent RCU or
> scheduler issues.
>
> For example, if a flash window move operation
> is needed to erase a very large flash segment,
> the time spent in opal firmware would be long
> enough that Linux would stall.  To avoid the
> long running duration of various aspects of this
> inherent long running operation we divide up
> the tasks in smaller chunks to avoid cumulating
> time spent in opal firmware.  The return codes
> are used to distinguish differences between cases
> where a re-queue of the transaction would be
> needed versus a true failure return code.
>
> PR_TRACE used since PR_DEBUG seems to always trigger,
> unable to figure out why.
>
> Due to the nature of redefining the sync versus
> async capabilities, this patch is larger than
> desired due to interrelationships of changes
> to functionality.
>
> Signed-off-by: Deb McLemore <debmc at linux.ibm.com>
> ---
>  core/flash.c           | 154 +++++++-
>  libflash/blocklevel.h  |   5 +
>  libflash/errors.h      |   1 +
>  libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
>  libflash/ipmi-hiomap.h |  41 ++-
>  5 files changed, 936 insertions(+), 220 deletions(-)
>
> diff --git a/core/flash.c b/core/flash.c
> index e98c8e0..98614f7 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,14 @@
>  #include <timer.h>
>  #include <timebase.h>
>
> +/*
> + * need to keep this under the BT queue limit
> + * causes are when ipmi to bmc gets bogged down
> + * delay allows for congestion to clear
> + */
> +#define FLASH_RETRY_LIMIT 10
> +#define FLASH_SCHEDULE_DELAY 200
> +
>  enum flash_op {
>         FLASH_OP_READ,
>         FLASH_OP_WRITE,
> @@ -41,6 +49,9 @@ struct flash_async_info {
>         uint64_t pos;
>         uint64_t len;
>         uint64_t buf;
> +       int retry_counter;
> +       int transaction_id;
> +       int in_progress_schedule_delay;
>  };
>
>  struct flash {
> @@ -80,13 +91,63 @@ static u32 nvram_offset, nvram_size;
>  static bool flash_reserve(struct flash *flash)
>  {
>         bool rc = false;
> +       int lock_try_counter = 10;
> +       uint64_t now;
> +       uint64_t start_time;
> +       uint64_t wait_time;
> +       uint64_t flash_reserve_ticks = 10;
> +       uint64_t timeout_counter;
> +
> +       start_time = mftb();
> +       now = mftb();
> +       wait_time = tb_to_msecs(now - start_time);

wait_time is always going to be zero since the two mftb() calls are
going to return pretty similar values and tb_to_msecs() divides the
difference by 512000 (tb_hz / 1000).

> +       timeout_counter = 0;
> +
> +
> +       while (wait_time < flash_reserve_ticks) {
> +               ++timeout_counter;
> +               if (timeout_counter % 4 == 0) {
> +                       now = mftb();
> +                       wait_time = tb_to_msecs(now - start_time);
> +               }
> +               if (flash->busy == false) {
> +                       break;
> +               }
> +       }

This doesn't make any sense to me. What are you doing here and why?

> -       if (!try_lock(&flash_lock))
> +       while (!try_lock(&flash_lock)) {
> +               --lock_try_counter;
> +               if (lock_try_counter == 0) {
> +                       break;
> +               }
> +       }

If you're going to do this sort of thing then use a for loop, so that
the iteration count is set in the same place it's being used.

> +       if (lock_try_counter == 0) {
>                 return false;
> +       }
You should return from inside the loop instead of breaking out and
re-checking the break condition.

> +
> +       /* we have the lock if we got here */
>
>         if (!flash->busy) {
>                 flash->busy = true;
>                 rc = true;
> +       } else {
> +               /* probably beat a flash_release and grabbed the lock */
> +               unlock(&flash_lock);
> +               while (!try_lock(&flash_lock)) {
> +                       --lock_try_counter;
> +                       if (lock_try_counter == 0) {
> +                               break;
> +                       }
> +               }
> +               if (lock_try_counter == 0) {
> +                       return false;
> +               }
> +               /* we have the lock if we are here */
> +               if (!flash->busy) {
> +                       flash->busy = true;
> +                       rc = true;
> +               }

This doesn't make much sense to me either. Why is replicating all the
previous logic necessary?

>         }
>         unlock(&flash_lock);

I'm not sure that we even need to use try_lock() with patch 03/13
applied. With that patch the flash's lock should only be held briefly
so we should be able to use the normal (blocking) lock() since we
don't need to deal with waiting for a 64MB flash read to complete.

> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>                 assert(0);
>         }
>
> -       if (rc)
> -               rc = OPAL_HARDWARE;
> +       if (rc == 0) {
> +               /*
> +                * if we are good to proceed forward
> +                * otherwise we may have to try again
> +                */
> +               flash->async.pos += len;
> +               flash->async.buf += len;
> +               flash->async.len -= len;
> +               /* if we good clear */
> +               flash->async.retry_counter = 0;
> +               /*
> +                * clear the IN_PROGRESS flags
> +                * we only need IN_PROGRESS active on missed_cc
> +                */
> +               flash->bl->flags &= IN_PROGRESS_MASK;
> +               /* reset time for scheduling gap */
> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;

The default delay should come from the blocklevel rather than being
assumed by the user of libflash. The bl has a better idea about what
sort of interval is required between ops and what a reasonable timeout
is.

> +       }
> +
> +       /*
> +        * corner cases if the move window misses and
> +        * the requested window is split (needing adjustment down) problem
> +        * if timing good on move_cb the world is good
> +        */
I can't parse this, but it seems like it's all blocklevel specific
stuff that should be documented there. From the perspective of a
libflash user we don't want to know or care about *why* we need to do
a async delay, just that we have to.

> -       flash->async.pos += len;
> -       flash->async.buf += len;
> -       flash->async.len -= len;
>         if (!rc && flash->async.len) {
>                 /*
>                  * We want to get called pretty much straight away, just have
> @@ -293,10 +373,38 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>                  */
>                 schedule_timer(&flash->async.poller, 0);
>                 return;
> +       } else {
> +               if (rc == FLASH_ERR_MISSED_CC) {

Try to avoid stacking indentation levels. It causes us to run into the
80col limit even faster and tracing through nested if chains gets
tedious pretty quickly.

You can avoid it using linear if() ... else if() ... else chains, e.g.

if (rc == FLASH_ERR_MISSED_CC) {
   /* new stuff goes here */
} else if (rc) {
   /* old stuff goes here */
}

> +                       ++flash->async.retry_counter;
> +                       flash->async.in_progress_schedule_delay += FLASH_SCHEDULE_DELAY;
> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
> +                               rc = OPAL_HARDWARE;
> +                               prlog(PR_TRACE, "flash_poll PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
> +                                       FLASH_RETRY_LIMIT,
> +                                       flash->async.transaction_id);
> +                       } else {
> +                               /*
> +                                * give the BT time to work and receive response
> +                                * throttle back to allow for congestion to clear
> +                                * cases observed were complete lack of ipmi response until very late
> +                                * or cases immediately following an unaligned window read/move (so slow)
> +                                */
> +                               flash->bl->flags |= IN_PROGRESS;
> +                               schedule_timer(&flash->async.poller, msecs_to_tb(flash->async.in_progress_schedule_delay));
> +                               return;
> +                       }
> +               } else {
> +                       if (rc != 0) {
> +                               rc = OPAL_HARDWARE;
> +                       }
> +               }
>         }
>
> -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       flash->bl->flags &= IN_PROGRESS_MASK;
> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;

Masks are for defining bit fields not the inverse of a specific bit.
Use the normal pattern for clearing a flag:
     flash->bl->flags &= ~IN_PROGRESS;
     flash->bl->flags &= ~ASYNC_REQUIRED;

> +       /* release the flash before we allow next opal entry */
>         flash_release(flash);
> +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
>  }
>
>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> @@ -454,6 +562,7 @@ int flash_register(struct blocklevel_device *bl)
>         flash->size = size;
>         flash->block_size = block_size;
>         flash->id = num_flashes();
> +       flash->async.transaction_id = 0;
>         init_timer(&flash->async.poller, flash_poll, flash);
>
>         rc = ffs_init(0, flash->size, bl, &ffs, 1);
> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 uint64_t buf, uint64_t size, uint64_t token)
>  {
>         struct flash *flash = NULL;
> -       uint64_t len;
> +       uint64_t len = 0;
Is that needed?

>         int rc;
>
>         list_for_each(&flashes, flash, list)
> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         flash->async.buf = buf + len;
>         flash->async.len = size - len;
>         flash->async.pos = offset + len;
> +       flash->async.retry_counter = 0;
> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
> +       flash->bl->flags |= ASYNC_REQUIRED;
> +       ++flash->async.transaction_id;
Use post-increment unless you absolutely have to use pre-increment.
There's only a few instances where pre-increment is used in skiboot
and I'd like to keep it that way.

>
>         /*
>          * These ops intentionally have no smarts (ecc correction or erase
> @@ -539,8 +652,27 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         }
>
>         if (rc) {
> -               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> -               rc = OPAL_HARDWARE;
> +               if (rc == FLASH_ERR_MISSED_CC) {

same comment about stacking indents

> +                       ++flash->async.retry_counter;
> +                       flash->async.buf = buf;
> +                       flash->async.len = size;
> +                       flash->async.pos = offset;
> +                       /* for completeness, opal_flash_op is first time pass so unless the retry limit set to 1 */
> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
> +                               rc = OPAL_HARDWARE;
> +                               prlog(PR_TRACE, "opal_flash_op PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
> +                                       FLASH_RETRY_LIMIT,
> +                                       flash->async.transaction_id);
> +                               goto out;
> +                       }
> +                       flash->bl->flags |= IN_PROGRESS;
> +                       schedule_timer(&flash->async.poller, msecs_to_tb(FLASH_SCHEDULE_DELAY));
> +                       /* Don't release the flash */
explain why

> +                       return OPAL_ASYNC_COMPLETION;
> +               } else {
> +                       prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> +                       rc = OPAL_HARDWARE;
> +               }
>                 goto out;
>         }
>
> @@ -564,6 +696,8 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         rc = OPAL_ASYNC_COMPLETION;
>  out:
>         flash_release(flash);
> +       flash->bl->flags &= IN_PROGRESS_MASK;
> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;

>         return rc;
>  }
>
> diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
> index 492918e..63d8690 100644
> --- a/libflash/blocklevel.h
> +++ b/libflash/blocklevel.h
> @@ -18,8 +18,13 @@ struct blocklevel_range {
>         int total_prot;
>  };
>
> +#define ASYNC_REQUIRED_MASK 0xFFFB
> +#define IN_PROGRESS_MASK 0xFFF7
these can go.

> +
>  enum blocklevel_flags {
>         WRITE_NEED_ERASE = 1,
> +       ASYNC_REQUIRED = 4,
> +       IN_PROGRESS = 8,
>  };
>
>  /*
> diff --git a/libflash/errors.h b/libflash/errors.h
> index c800ada..c24166d 100644
> --- a/libflash/errors.h
> +++ b/libflash/errors.h
> @@ -21,6 +21,7 @@
>  #define FLASH_ERR_BAD_READ             15
>  #define FLASH_ERR_DEVICE_GONE  16
>  #define FLASH_ERR_AGAIN        17
> +#define FLASH_ERR_MISSED_CC    18

The libflash return codes are supposed to be independent of the
backing blocklevel. As far as I can tell this is being used as a
generic "async continuation required" return code, similar to how we
use OPAL_ASYNC_COMPLETION, so rename it something that reflects that.

>  #ifdef __SKIBOOT__
>  #include <skiboot.h>

*snip*

I'll send comments about the HIOMAP bits in a separate mail since this
is getting unwieldy.


More information about the Skiboot mailing list