[Skiboot] [PATCH] FSP: Do not process incoming FSP messages in fsp_sync_msg path
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Mar 10 20:38:32 AEDT 2015
On Tue, 2015-03-10 at 11:52 +0530, Vasant Hegde wrote:
> Presently we process incoming messages from the FSP all the time.
> Our fsp_fetch_data function holds a lock and calls fsp_sync_msg.
> Due to our callback mechanism it's possible that in some cases
> we hit a recursive lock issue in fsp_fetch_data.
>
> Sample code path:
> fsp_fetch_data ->
> fsp_sync_msg ->
> Incoming message (ex: OCC LID load) ->
> notify registered client (ex: fsp_occ_msg) ->
> fsp_fetch_data -> Lock issue!
>
> This patch adds support to queue incoming messages in the fsp_fetch_data
> path, so that we are sure we will not hit the above scenario. I have also
> changed the locking code in fsp_fetch_data: we now hold the lock only to
> update the flag.
This is not right. You are papering over the problem: if something else
from the pollers decides to load a LID, kaboom. Plus, the above shouldn't
happen once you implement the queueing & preloading.
Why don't you add a per-thread flag indicating that we are inside the
pollers, so we can catch calls to things that shouldn't happen in pollers
(and make opal_poll_event not recurse while we're at it), and finally "fix"
fsp_fetch_data as I suggested in a different email?
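Roughly something like this (a completely untested sketch; the "in_poller"
field doesn't exist today and would need to be added to struct cpu_thread,
and the actual poller walk is elided):

/*
 * Untested sketch: per-thread guard against poller recursion.
 * Assumes a new "bool in_poller" field in struct cpu_thread.
 */
void opal_run_pollers(void)
{
	struct cpu_thread *cpu = this_cpu();

	/*
	 * Don't recurse if a poller itself ends up calling us
	 * again, e.g. via fsp_sync_msg
	 */
	if (cpu->in_poller)
		return;
	cpu->in_poller = true;

	/* ... call the registered pollers as we do today ... */

	cpu->in_poller = false;
}

Then fsp_sync_msg (and anything else that must not run from a poller)
can check this_cpu()->in_poller up-front and complain, instead of every
client having to be careful.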
Ben.
> Note:
> We still have one lock issue in poller path. (We held capi_lock in
> capp_lid_download and call opal_run_pollers via load_resources).
>
> [4108781364,3] Running pollers with lock held !
> CPU 0830 Backtrace:
> S: 0000000033ac3800 R: 0000000030013520 .backtrace+0x2c
> S: 0000000033ac3890 R: 0000000030017dd8 .opal_run_pollers+0x44
> S: 0000000033ac3910 R: 0000000030046190 .fsp_sync_msg+0x40
> S: 0000000033ac39a0 R: 0000000030047c10 .fsp_fetch_data+0x188
> S: 0000000033ac3a90 R: 0000000030047ed8 .fsp_load_resource+0x12c
> S: 0000000033ac3b50 R: 0000000030021bb0 .load_resource+0x38
> S: 0000000033ac3bc0 R: 000000003003b610 .capp_load_ucode+0x168
> S: 0000000033ac3c90 R: 0000000030041140 .probe_phb3+0x1224
> S: 0000000033ac3e30 R: 000000003001420c .main_cpu_entry+0x444
> S: 0000000033ac3f00 R: 0000000030002524 boot_entry+0x18c
>
> Signed-off-by: Vasant Hegde <hegdevasant at linux.vnet.ibm.com>
> ---
> hw/fsp/fsp.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/hw/fsp/fsp.c b/hw/fsp/fsp.c
> index fdf175a..c73a193 100644
> --- a/hw/fsp/fsp.c
> +++ b/hw/fsp/fsp.c
> @@ -117,6 +117,13 @@ static u32 disr_last_print;
> static u32 drcr_last_print;
> static u32 hstate_last_print;
>
> +/*
> + * Keep track of incoming messages during a synchronous FSP call from
> + * fsp_fetch_data and process them later.
> + */
> +static LIST_HEAD(fsp_msg_list);
> +static bool fsp_handle_cmd_ready = true;
> +
> void fsp_handle_resp(struct fsp_msg *msg);
>
> struct fsp_cmdclass {
> @@ -1254,7 +1261,7 @@ static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg)
>
>
> /* This is called without the FSP lock */
> -static void fsp_handle_command(struct fsp_msg *msg)
> +static void __fsp_handle_command(struct fsp_msg *msg)
> {
> struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
> struct fsp_client *client, *next;
> @@ -1300,6 +1307,49 @@ static void fsp_handle_command(struct fsp_msg *msg)
> fsp_freemsg(msg);
> }
>
> +/*
> + * Queue incoming messages if we are in the middle of
> + * a fsp_sync_msg call
> + */
> +static void fsp_handle_command(struct fsp_msg *msg)
> +{
> + u32 cmd_sub_mod;
> +
> + /*
> + * Check if we are ready to handle the command. If not, queue
> + * the incoming message.
> + *
> + * XXX: Note that we use the same link (msg->link) in fsp_queue_msg()
> + * as well. But these are two different paths, hence we are
> + * fine here.
> + */
> + if (!fsp_handle_cmd_ready) {
> + cmd_sub_mod = (msg->word0 & 0xff) << 16;
> + cmd_sub_mod |= (msg->word1 & 0xff) << 8;
> + cmd_sub_mod |= (msg->word1 >> 8) & 0xff;
> + prlog(PR_TRACE, "FSP: Queued incoming message [%x]\n",
> + cmd_sub_mod);
> +
> + list_add_tail(&fsp_msg_list, &msg->link);
> + return;
> + }
> +
> + __fsp_handle_command(msg);
> +}
> +
> +static void fsp_poke_cmd_queue(void)
> +{
> + struct fsp_msg *msg, *next;
> +
> + if (list_empty(&fsp_msg_list))
> + return;
> +
> + list_for_each_safe(&fsp_msg_list, msg, next, link) {
> + list_del(&msg->link);
> + __fsp_handle_command(msg);
> + }
> +}
> +
> static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg,
> int dlen, u32 w0, u32 w1)
> {
> @@ -2166,6 +2216,19 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
> */
> lock(&fsp_fetch_lock);
>
> + /*
> + * XXX: This means we are re-entering this code. Ideally this
> + * should never happen.
> + */
> + if (!fsp_handle_cmd_ready) {
> + prlog(PR_ERR, "FSP: Re-entering fsp_fetch_data!\n");
> + unlock(&fsp_fetch_lock);
> + return -EBUSY;
> + }
> +
> + fsp_handle_cmd_ready = false;
> + unlock(&fsp_fetch_lock);
> +
> while(remaining) {
> uint32_t chunk, taddr, woffset, wlen;
> uint8_t rc;
> @@ -2201,7 +2264,10 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
>
> /* XXX Is flash busy (0x3f) a reason for retry ? */
> if (rc != 0 && rc != 2) {
> + lock(&fsp_fetch_lock);
> + fsp_handle_cmd_ready = true;
> unlock(&fsp_fetch_lock);
> + fsp_poke_cmd_queue();
> return -EIO;
> }
>
> @@ -2218,10 +2284,13 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
> if (wlen < chunk)
> break;
> }
> + lock(&fsp_fetch_lock);
> + fsp_handle_cmd_ready = true;
> unlock(&fsp_fetch_lock);
>
> *length = total;
>
> + fsp_poke_cmd_queue();
> return 0;
> }
>