[Skiboot] [PATCH 4/4] libflash/blocklevel: Backend agnostic blocklevel_erase_async()

Thu Jun 29 13:49:28 AEST 2017

On Thu, 2017-06-29 at 13:19 +1000, Alistair Popple wrote:
> On Thu, 29 Jun 2017 09:38:04 AM Cyril Bur wrote:
> > This patch provides a simple (although not particularly efficient)
> > asynchronous erase function. The advantage of this approach is that it
> > doesn't require any of the blocklevel backends to provide an
> > asynchronous implementation. This is also the disadvantage of this
> > implementation as all it actually does is break the work up in chunks
> > that it can performed quickly, but still synchronously. Only a backend
> > could provide a more asynchronous implementation.
> > 
> > The motivation behind this patch is two fold:
> > Firstly this starts a blocklevel async interface - which could easily be
> > extended to pass the async calls down to backends which provide async
> > implementation.
> 
> To be honest having seen the code this seems to add a reasonable amount of
> complexity to the blocklevel layer for no real gain as we don't actually
> implement async and only skiboot uses it.
> 
> Given this I am wondering if it would be simpler to confine the hack to skiboot
> and wait until we have other users of libflash that want async functions, or for
> when we actually implement async at the HW level?
> 

*sigh* maybe, thoughts anyone?

> > Secondly this solves a problem we have right now with the
> > opal_flash_erase call where it can block in Skiboot for around three
> > minutes. This causes a variety of problems in Linux due to a processor
> > being gone for a long time. For example:
> > 
> > [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> > [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> > [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> > [   98.610060] Task dump for CPU 113:
> > [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> > [   98.610066] Call Trace:
> > [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> > [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> > [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> > [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
> > 
> > It is for this reason that breaking the work up in smaller chunks solves
> > this problem as Skiboot can return the CPU to Linux between chunks to
> > avoid Linux getting upset.
> > 
> > Reported-By: Samuel Mendoza-Jonas <sam at mendozajonas.com>
> > Signed-off-by: Cyril Bur <cyril.bur at au1.ibm.com>
> > ---
> >  core/flash.c          | 46 +++++++++++++++++++++++++--
> >  libflash/blocklevel.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++----
> >  libflash/blocklevel.h | 19 +++++++++++
> >  libflash/errors.h     | 14 ++++++++
> >  libflash/libflash.h   | 13 --------
> >  5 files changed, 159 insertions(+), 21 deletions(-)
> > 
> > diff --git a/core/flash.c b/core/flash.c
> > index 177f7ae1..e8b9fc3c 100644
> > --- a/core/flash.c
> > +++ b/core/flash.c
> > @@ -28,6 +28,13 @@
> >  #include <libstb/stb.h>
> >  #include <libstb/container.h>
> >  #include <elf.h>
> > +#include <timer.h>
> > +#include <timebase.h>
> > +
> > +struct flash_async_info {
> > +	struct timer poller;
> > +	uint64_t token;
> > +};
> >  
> >  struct flash {
> >  	struct list_node	list;
> > @@ -36,6 +43,7 @@ struct flash {
> >  	uint64_t		size;
> >  	uint32_t		block_size;
> >  	int			id;
> > +	struct flash_async_info async_info;
> >  };
> >  
> >  static LIST_HEAD(flashes);
> > @@ -200,6 +208,29 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
> >  
> >  /* core flash support */
> >  
> > +/*
> > + * Called with flash lock held, drop it on async completion
> > + */
> > +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> > +{
> > +	struct flash *flash = data;
> > +	int rc;
> > +
> > +	rc = blocklevel_async_poll(flash->bl);
> > +	if (rc == FLASH_ASYNC_POLL) {
> > +		/*
> > +		 * We want to get called pretty much straight away, just have
> > +		 * to be sure that we jump back out to Linux so that if this
> > +		 * very long we don't cause RCU or the scheduler to freak
> > +		 */
> > +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> > +		return;
> > +	}
> > +
> > +	flash_release(flash);
> > +	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async_info.token, rc);
> > +}
> > +
> >  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> >  {
> >  	struct dt_node *flash_node;
> > @@ -295,6 +326,7 @@ int flash_register(struct blocklevel_device *bl)
> >  	flash->size = size;
> >  	flash->block_size = block_size;
> >  	flash->id = num_flashes();
> > +	init_timer(&flash->async_info.poller, flash_poll, flash);
> >  
> >  	list_add(&flashes, &flash->list);
> >  
> > @@ -369,8 +401,18 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >  		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> >  		break;
> >  	case FLASH_OP_ERASE:
> > -		rc = blocklevel_erase(flash->bl, offset, size);
> > -		break;
> > +		rc = blocklevel_erase_async(flash->bl, offset, size);
> > +		if (rc != FLASH_ASYNC_POLL)
> > +			break;
> > +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> > +		/*
> > +		 * FLASH_OP_ERASE can now happen asynchronously, don't go out
> > +		 * the regular path.
> > +		 *
> > +		 * Ensure we hold the lock to flash for the entirety of the
> > +		 * async process
> > +		 */
> > +		return OPAL_ASYNC_COMPLETION;
> >  	default:
> >  		assert(0);
> >  	}
> > diff --git a/libflash/blocklevel.c b/libflash/blocklevel.c
> > index 9802ad0f..acda2066 100644
> > --- a/libflash/blocklevel.c
> > +++ b/libflash/blocklevel.c
> > @@ -29,6 +29,10 @@
> >  
> >  #define PROT_REALLOC_NUM 25
> >  
> > +#ifndef MIN
> > +#define MIN(a, b)	((a) < (b) ? (a) : (b))
> > +#endif
> > +
> >  /* This function returns tristate values.
> >   * 1  - The region is ECC protected
> >   * 0  - The region is not ECC protected
> > @@ -78,6 +82,46 @@ static int release(struct blocklevel_device *bl)
> >  	return rc;
> >  }
> >  
> > +int blocklevel_async_poll(struct blocklevel_device *bl)
> > +{
> > +	uint64_t len;
> > +	int rc;
> > +
> > +	if (!bl->async.active)
> > +		return FLASH_ERR_PARM_ERROR;
> > +
> > +	/* Just chunk things up by erase granule */
> > +	len = MIN(bl->erase_mask + 1, bl->async.len);
> > +
> > +	switch (bl->async.op) {
> > +		case READ:
> > +		case WRITE:
> > +			/*
> > +			 * Very large caveat - do not remove this without auditing
> > +			 * this entire file. smart_write() and smart_erase() will
> > +			 * not 'just work' async. They will need work.
> > +			 */
> > +			bl->async.buf += len;
> 
> How would async.op == READ/WRITE? I'd be inclinded to complain loudly and return
> here as it should never happen.

Perhaps, I wasn't sure so I did nothing... Yes yes inaction is a form
of action... but its less action?

> 
> > +			return FLASH_ERR_PARM_ERROR;
> > +		case ERASE:
> > +			rc = bl->erase(bl, bl->async.pos, len);
> > +			break;
> > +		default:
> > +			return FLASH_ERR_PARM_ERROR;
> > +	}
> > +
> > +	if (rc)
> > +		return rc;
> > +
> > +	bl->async.pos += len;
> > +	bl->async.len -= len;
> 
> Wouldn't storing the end position be a little clearer (ie. bl->async.end)?

Would it? Everything is always specified as a pos and len, doesn't it
make sense to keep it in the same layout as the functions accept their
args? Otherwise there will be a lot of bl->async.end - bl->async.pos.
And I'm not sure how nice if (bl->async.pos == bl->async.end) is over
if (bl->async.len == 0)

> 
> > +	if (bl->async.len == 0)
> > +		bl->async.active = false;
> > +
> > +	return bl->async.active ? FLASH_ASYNC_POLL : 0;
> > +}
> > +
> >  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len)
> >  {
> >  	int rc;
> > @@ -189,9 +233,9 @@ out:
> >  	return rc;
> >  }
> >  
> > -int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +static int check_erase_params(struct blocklevel_device *bl, const char *caller,
> > +		uint64_t pos, uint64_t len)
> >  {
> > -	int rc;
> >  	if (!bl || !bl->erase) {
> >  		errno = EINVAL;
> >  		return FLASH_ERR_PARM_ERROR;
> > @@ -199,16 +243,48 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> >  
> >  	/* Programmer may be making a horrible mistake without knowing it */
> >  	if (pos & bl->erase_mask) {
> > -		fprintf(stderr, "blocklevel_erase: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > -				pos, bl->erase_mask + 1);
> > +		FL_ERR("%s: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > +				caller, pos, bl->erase_mask + 1);
> > +		return FLASH_ERR_ERASE_BOUNDARY;
> >  	}
> >  
> >  	if (len & bl->erase_mask) {
> > -		fprintf(stderr, "blocklevel_erase: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > -				len, bl->erase_mask + 1);
> > +		FL_ERR("%s: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > +				caller, len, bl->erase_mask + 1);
> >  		return FLASH_ERR_ERASE_BOUNDARY;
> >  	}
> >  
> > +	return 0;
> > +}
> > +
> > +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +{
> > +	int rc;
> > +
> > +	rc = check_erase_params(bl, __func__, pos, len);
> > +	if (rc)
> > +		return rc;
> > +
> > +	if (bl->async.active)
> > +		return FLASH_ERR_PARM_ERROR;
> > +
> > +	bl->async.op = ERASE;
> > +	bl->async.pos = pos;
> > +	bl->async.len = len;
> > +	bl->async.active = true;
> > +
> > +	/* Call the poll once to get everything going */
> > +	return blocklevel_async_poll(bl);
> > +}
> > +
> > +int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +{
> > +	int rc;
> > +
> > +	rc = check_erase_params(bl, __func__, pos, len);
> > +	if (rc)
> > +		return rc;
> > +
> >  	rc = reacquire(bl);
> >  	if (rc)
> >  		return rc;
> > diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
> > index ba42c83d..7079e2fc 100644
> > --- a/libflash/blocklevel.h
> > +++ b/libflash/blocklevel.h
> > @@ -34,6 +34,20 @@ enum blocklevel_flags {
> >  	WRITE_NEED_ERASE = 1,
> >  };
> >  
> > +enum blocklevel_async_op {
> > +	READ = 1,
> > +	WRITE,
> > +	ERASE
> > +};
> > +
> > +struct blocklevel_async {
> > +	bool active;
> > +	enum blocklevel_async_op op;
> > +	uint64_t pos;
> > +	uint64_t len;
> > +	void *buf;
> > +};
> > +
> >  /*
> >   * libffs may be used with different backends, all should provide these for
> >   * libflash to get the information it needs
> > @@ -56,6 +70,8 @@ struct blocklevel_device {
> >  	enum blocklevel_flags flags;
> >  
> >  	struct blocklevel_range ecc_prot;
> > +
> > +	struct blocklevel_async async;
> >  };
> >  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
> >  int blocklevel_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
> > @@ -65,6 +81,9 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
> >  int blocklevel_get_info(struct blocklevel_device *bl, const char **name, uint64_t *total_size,
> >  		uint32_t *erase_granule);
> >  
> > +int blocklevel_async_poll(struct blocklevel_device *bl);
> > +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
> > +
> >  /*
> >   * blocklevel_smart_write() performs reads on the data to see if it
> >   * can skip erase or write calls. This is likely more convenient for
> > diff --git a/libflash/errors.h b/libflash/errors.h
> > index 2f567211..deda7b87 100644
> > --- a/libflash/errors.h
> > +++ b/libflash/errors.h
> 
> Why are we moving this in this patch? Just as clean-up? If so it probably should
> be a seperate patch.

Damn, busted trying to sneak in a cleanup.

Yeah subtly hidden in there is that blocklevel does from using
fprintf(stderr, ...) to FL_ERR() which, for userspace use boils down to
the same but makes it look nicer in the skiboot log.

I'll split it up on resend.

> 
> > @@ -33,5 +33,19 @@
> >  #define FLASH_ERR_BAD_READ		15
> >  #define FLASH_ERR_DEVICE_GONE	16
> >  #define FLASH_ERR_AGAIN	17
> > +#define FLASH_ASYNC_POLL 18
> > +
> > +#ifdef __SKIBOOT__
> > +#include <skiboot.h>
> > +#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> > +#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> > +#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> > +#else
> > +#include <stdio.h>
> > +extern bool libflash_debug;
> > +#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> > +#define FL_INF(fmt...) do { printf(fmt); } while(0)
> > +#define FL_ERR(fmt...) do { fprintf(stderr, fmt); } while(0)
> > +#endif
> >  
> >  #endif /* __LIBFLASH_ERRORS_H */
> > diff --git a/libflash/libflash.h b/libflash/libflash.h
> > index 4fecfe75..ff3a9823 100644
> > --- a/libflash/libflash.h
> > +++ b/libflash/libflash.h
> > @@ -28,19 +28,6 @@
> >   */
> >  #include <libflash/errors.h>
> >  
> > -#ifdef __SKIBOOT__
> > -#include <skiboot.h>
> > -#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> > -#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> > -#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> > -#else
> > -#include <stdio.h>
> > -extern bool libflash_debug;
> > -#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> > -#define FL_INF(fmt...) do { printf(fmt); } while(0)
> > -#define FL_ERR(fmt...) do { printf(fmt); } while(0)
> > -#endif
> > -
> >  /* Flash chip, opaque */
> >  struct flash_chip;
> >  struct spi_flash_ctrl;
> > 
> 
>