[Skiboot] [PATCH] FSP: Do not process incoming FSP messages in fsp_sync_msg path
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Mar 10 20:38:32 AEDT 2015
On Tue, 2015-03-10 at 11:52 +0530, Vasant Hegde wrote:
> Presently we process incoming messages from the FSP all the time.
> Our fsp_fetch_data function holds a lock and calls fsp_sync_msg.
> Due to our callback mechanism it's possible that in some cases
> we hit a recursive lock issue in fsp_fetch_data.
>
> Sample code path:
> fsp_fetch_data ->
> fsp_sync_msg ->
> Incoming message (ex: OCC LID load) ->
> notify registered client (ex: fsp_occ_msg) ->
> fsp_fetch_data -> Lock issue!
>
> This patch adds support to queue incoming messages in the fsp_fetch_data
> path, so that we are sure we will not hit the above scenario. I have also
> changed the locking code in fsp_fetch_data: we now hold the lock only to
> update the flag.
This is not right. You are papering over the problem: if something else
from the pollers decides to load a LID, kaboom. Plus, the above shouldn't
happen once you implement the queueing & preloading.
Why don't you add a per-thread flag indicating that we are inside the
pollers, so we can catch calls to things that shouldn't happen in pollers
(and make opal_poll_event not recurse while we're at it), and finally "fix"
fsp_fetch_data as I suggested in a different email?
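Roughly something like this (a completely untested sketch; the "in_poller"
field doesn't exist today and would need to be added to struct cpu_thread,
and the actual poller walk is elided):

/*
 * Untested sketch: per-thread guard against poller recursion.
 * Assumes a new "bool in_poller" field in struct cpu_thread.
 */
void opal_run_pollers(void)
{
	struct cpu_thread *cpu = this_cpu();

	/*
	 * Don't recurse if a poller itself ends up calling us
	 * again, e.g. via fsp_sync_msg
	 */
	if (cpu->in_poller)
		return;
	cpu->in_poller = true;

	/* ... call the registered pollers as we do today ... */

	cpu->in_poller = false;
}

Then fsp_sync_msg (and anything else that must not run from a poller)
can check this_cpu()->in_poller up-front and complain, instead of every
client having to be careful.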
Ben.
> Note:
> We still have one lock issue in poller path. (We held capi_lock in
> capp_lid_download and call opal_run_pollers via load_resources).
>
> [4108781364,3] Running pollers with lock held !
> CPU 0830 Backtrace:
> S: 0000000033ac3800 R: 0000000030013520 .backtrace+0x2c
> S: 0000000033ac3890 R: 0000000030017dd8 .opal_run_pollers+0x44
> S: 0000000033ac3910 R: 0000000030046190 .fsp_sync_msg+0x40
> S: 0000000033ac39a0 R: 0000000030047c10 .fsp_fetch_data+0x188
> S: 0000000033ac3a90 R: 0000000030047ed8 .fsp_load_resource+0x12c
> S: 0000000033ac3b50 R: 0000000030021bb0 .load_resource+0x38
> S: 0000000033ac3bc0 R: 000000003003b610 .capp_load_ucode+0x168
> S: 0000000033ac3c90 R: 0000000030041140 .probe_phb3+0x1224
> S: 0000000033ac3e30 R: 000000003001420c .main_cpu_entry+0x444
> S: 0000000033ac3f00 R: 0000000030002524 boot_entry+0x18c
>
> Signed-off-by: Vasant Hegde <hegdevasant at linux.vnet.ibm.com>
> ---
> hw/fsp/fsp.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/hw/fsp/fsp.c b/hw/fsp/fsp.c
> index fdf175a..c73a193 100644
> --- a/hw/fsp/fsp.c
> +++ b/hw/fsp/fsp.c
> @@ -117,6 +117,13 @@ static u32 disr_last_print;
> static u32 drcr_last_print;
> static u32 hstate_last_print;
>
> +/*
> + * Keep track of incoming messages during a synchronous FSP call from
> + * fsp_fetch_data and process them later.
> + */
> +static LIST_HEAD(fsp_msg_list);
> +static bool fsp_handle_cmd_ready = true;
> +
> void fsp_handle_resp(struct fsp_msg *msg);
>
> struct fsp_cmdclass {
> @@ -1254,7 +1261,7 @@ static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg)
>
>
> /* This is called without the FSP lock */
> -static void fsp_handle_command(struct fsp_msg *msg)
> +static void __fsp_handle_command(struct fsp_msg *msg)
> {
> struct fsp_cmdclass *cmdclass = fsp_get_cmdclass(msg);
> struct fsp_client *client, *next;
> @@ -1300,6 +1307,49 @@ static void fsp_handle_command(struct fsp_msg *msg)
> fsp_freemsg(msg);
> }
>
> +/*
> + * Queue incoming messages if we are in the middle of
> + * a fsp_sync_msg call
> + */
> +static void fsp_handle_command(struct fsp_msg *msg)
> +{
> + u32 cmd_sub_mod;
> +
> + /*
> + * Check if we are ready to handle the command. If not, queue
> + * the incoming message.
> + *
> + * XXX: Note that we use the same link (msg->link) in fsp_queue_msg()
> + * as well. But these are two different paths, hence we are
> + * fine here.
> + */
> + if (!fsp_handle_cmd_ready) {
> + cmd_sub_mod = (msg->word0 & 0xff) << 16;
> + cmd_sub_mod |= (msg->word1 & 0xff) << 8;
> + cmd_sub_mod |= (msg->word1 >> 8) & 0xff;
> + prlog(PR_TRACE, "FSP: Queued incoming message [%x]\n",
> + cmd_sub_mod);
> +
> + list_add_tail(&fsp_msg_list, &msg->link);
> + return;
> + }
> +
> + __fsp_handle_command(msg);
> +}
> +
> +static void fsp_poke_cmd_queue(void)
> +{
> + struct fsp_msg *msg, *next;
> +
> + if (list_empty(&fsp_msg_list))
> + return;
> +
> + list_for_each_safe(&fsp_msg_list, msg, next, link) {
> + list_del(&msg->link);
> + __fsp_handle_command(msg);
> + }
> +}
> +
> static void __fsp_fill_incoming(struct fsp *fsp, struct fsp_msg *msg,
> int dlen, u32 w0, u32 w1)
> {
> @@ -2166,6 +2216,19 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
> */
> lock(&fsp_fetch_lock);
>
> + /*
> + * XXX: This means we are re-entering this code. Ideally this
> + * should never happen.
> + */
> + if (!fsp_handle_cmd_ready) {
> + prlog(PR_ERR, "FSP: Re-entering fsp_fetch_data!\n");
> + unlock(&fsp_fetch_lock);
> + return -EBUSY;
> + }
> +
> + fsp_handle_cmd_ready = false;
> + unlock(&fsp_fetch_lock);
> +
> while(remaining) {
> uint32_t chunk, taddr, woffset, wlen;
> uint8_t rc;
> @@ -2201,7 +2264,10 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
>
> /* XXX Is flash busy (0x3f) a reason for retry ? */
> if (rc != 0 && rc != 2) {
> + lock(&fsp_fetch_lock);
> + fsp_handle_cmd_ready = true;
> unlock(&fsp_fetch_lock);
> + fsp_poke_cmd_queue();
> return -EIO;
> }
>
> @@ -2218,10 +2284,13 @@ int fsp_fetch_data(uint8_t flags, uint16_t id, uint32_t sub_id,
> if (wlen < chunk)
> break;
> }
> + lock(&fsp_fetch_lock);
> + fsp_handle_cmd_ready = true;
> unlock(&fsp_fetch_lock);
>
> *length = total;
>
> + fsp_poke_cmd_queue();
> return 0;
> }
>