[RFC 01/10] powerpc/rtas: new APIs for busy and extended delay statuses

Alexey Kardashevskiy aik at ozlabs.ru
Thu May 13 19:59:23 AEST 2021



On 04/05/2021 13:03, Nathan Lynch wrote:
> Add new APIs for handling busy (-2) and extended delay
> hint (9900...9905) statuses from RTAS. These are intended to be
> drop-in replacements for existing uses of rtas_busy_delay().
> 
> A problem with rtas_busy_delay() and rtas_busy_delay_time() is that
> they consider -2/busy to be equivalent to 9900 (wait 1ms). In fact,
> the OS should call again as soon as it wants on -2, which at least on
> PowerVM means RTAS is returning only to uphold the general requirement
> that RTAS must return control to the OS in a "timely fashion" (250us).
> 
> Combine this with the fact that msleep(1) actually sleeps for more
> like 20ms in practice: on busy VMs we schedule away for much longer
> than necessary on -2 and 9900.
> 
> This is fixed in rtas_sched_if_busy(), which uses usleep_range() for
> small delay hints, and only schedules away on -2 if there is other
> work available. It also refuses to sleep longer than one second
> regardless of the hinted value, on the assumption that even longer
> running operations can tolerate polling at 1HZ.
> 
> rtas_spin_if_busy() and rtas_force_spin_if_busy() are provided for
> atomic contexts which need to handle busy status and extended delay
> hints.
> 
> Signed-off-by: Nathan Lynch <nathanl at linux.ibm.com>
> ---
>   arch/powerpc/include/asm/rtas.h |   4 +
>   arch/powerpc/kernel/rtas.c      | 168 ++++++++++++++++++++++++++++++++
>   2 files changed, 172 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 9dc97d2f9d27..555ff3290f92 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -266,6 +266,10 @@ extern int rtas_set_rtc_time(struct rtc_time *rtc_time);
>   extern unsigned int rtas_busy_delay_time(int status);
>   extern unsigned int rtas_busy_delay(int status);
>   
> +bool rtas_sched_if_busy(int status);
> +bool rtas_spin_if_busy(int status);
> +bool rtas_force_spin_if_busy(int status);
> +
>   extern int early_init_dt_scan_rtas(unsigned long node,
>   		const char *uname, int depth, void *data);
>   
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 6bada744402b..4a1dfbfa51ba 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -519,6 +519,174 @@ unsigned int rtas_busy_delay(int status)
>   }
>   EXPORT_SYMBOL(rtas_busy_delay);
>   
> +/**
> + * rtas_force_spin_if_busy() - Consume a busy or extended delay status
> + *                             in atomic context.
> + * @status: Return value from rtas_call() or similar function.
> + *
> + * Use this function when you cannot avoid using an RTAS function
> + * which may return an extended delay hint in atomic context. If
> + * possible, use rtas_spin_if_busy() or rtas_sched_if_busy() instead
> + * of this function.
> + *
> + * Return: True if @status is -2 or 990x, in which case
> + *         rtas_spin_if_busy() will have delayed an appropriate amount
> + *         of time, and the caller should call the RTAS function
> + *         again. False otherwise.
> + */
> +bool rtas_force_spin_if_busy(int status)

rtas_force_delay_if_busy()? neither this one nor rtas_spin_if_busy() 
actually spins.


> +{
> +	bool was_busy = true;
> +
> +	switch (status) {
> +	case RTAS_BUSY:
> +		/* OK to call again immediately; do nothing. */
> +		break;
> +	case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
> +		mdelay(1);
> +		break;
> +	default:
> +		was_busy = false;
> +		break;
> +	}
> +
> +	return was_busy;
> +}
> +
> +/**
> + * rtas_spin_if_busy() - Consume a busy status in atomic context.
> + * @status: Return value from rtas_call() or similar function.
> + *
> + * Prefer rtas_sched_if_busy() over this function. Prefer this
> + * function over rtas_force_spin_if_busy(). Use this function in
> + * atomic contexts with RTAS calls that are specified to return -2 but
> + * not 990x. This function will complain and execute a minimal delay
> + * if passed a 990x status.
> + *
> + * Return: True if @status is -2 or 990x, in which case
> + *         rtas_spin_if_busy() will have delayed an appropriate amount
> + *         of time, and the caller should call the RTAS function
> + *         again. False otherwise.
> + */
> +bool rtas_spin_if_busy(int status)

rtas_delay_if_busy()?


> +{
> +	bool was_busy = true;
> +
> +	switch (status) {
> +	case RTAS_BUSY:
> +		/* OK to call again immediately; do nothing. */
> +		break;
> +	case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
> +		/*
> +		 * Generally, RTAS functions which can return this
> +		 * status should be considered too expensive to use in
> +		 * atomic context. Change the calling code to use
> +		 * rtas_sched_if_busy(), or if that's not possible,
> +		 * use rtas_force_spin_if_busy().
> +		 */
> +		pr_warn_once("%pS may use RTAS call in atomic context which returns extended delay.\n",
> +			     __builtin_return_address(0));
> +		mdelay(1);
> +		break;
> +	default:
> +		was_busy = false;
> +		break;
> +	}
> +
> +	return was_busy;
> +}
> +
> +static unsigned long extended_delay_ms(unsigned int status)

extended_delay_to_ms()?

> +{
> +	unsigned int extdelay;
> +	unsigned int order;
> +	unsigned int ms;
> +
> +	extdelay = clamp((int)status, RTAS_EXTENDED_DELAY_MIN, RTAS_EXTENDED_DELAY_MAX);
> +	WARN_ONCE(extdelay != status, "%s passed invalid status %u\n", __func__, status);
> +
> +	order = status - RTAS_EXTENDED_DELAY_MIN;


Using extdelay instead of status seems safer.


> +	for (ms = 1; order > 0; order--)
> +		ms *= 10;
> +
> +	return ms;
> +}
> +
> +static void handle_extended_delay(unsigned int status)

rtas_sleep()? rtas_extended_delay()?


> +{
> +	unsigned long usecs;
> +
> +	usecs = 1000 * extended_delay_ms(status);


usecs could be msecs, you would need fewer zeroes and would not need 
DIV_ROUND_UP below...


> +
> +	/*
> +	 * If we have no other work pending, there's no reason to
> +	 * sleep.
> +	 */
> +	if (!need_resched())
> +		return;
> +
> +	/*
> +	 * The extended delay hint can be as high as 100
> +	 * seconds. Surely any function returning such a status is
> +	 * either buggy or isn't going to be significantly slowed by
> +	 * us polling at 1HZ. Clamp the sleep time to one second.
> +	 */
> +	usecs = clamp(usecs, 1000UL, 1000000UL);
> +
> +	/*
> +	 * The delay hint is an order-of-magnitude suggestion, not a
> +	 * minimum. It is fine, possibly even advantageous, for us to
> +	 * pause for less time than suggested. For small values, use
> +	 * usleep_range() to ensure we don't sleep much longer than
> +	 * actually suggested.
> +	 *
> +	 * See Documentation/timers/timers-howto.rst for explanation
> +	 * of the threshold used here.
> +	 */
> +	if (usecs <= 20000)
> +		usleep_range(usecs / 2, 2 * usecs);

... and this would be usleep_range(msecs*500, msecs*2000).


> +	else
> +		msleep(DIV_ROUND_UP(usecs, 1000));
> +}
> +
> +/**
> + * rtas_sched_if_busy() - Consume a busy or extended delay status.
> + * @status: Return value from rtas_call() or similar function.
> + *
> + * Prefer this function over rtas_spin_if_busy().
> + *
> + * Context: This function may sleep.
> + *
> + * Return: True if @status is -2 or 990x, in which case
> + *         rtas_sched_if_busy() will have slept an appropriate amount
> + *         of time, and the caller should call the RTAS function
> + *         again. False otherwise.
> + */
> +bool rtas_sched_if_busy(int status)
> +{
> +	bool was_busy = true;
> +
> +	might_sleep();
> +
> +	switch (status) {
> +	case RTAS_BUSY:
> +		/*
> +		 * OK to call again immediately. Schedule if there's
> +		 * other work to do, but no sleep is necessary.
> +		 */
> +		cond_resched();
> +		break;
> +	case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
> +		handle_extended_delay(status);
> +		break;
> +	default:
> +		was_busy = false;
> +		break;
> +	}
> +
> +	return was_busy;
> +}

Throughout the series this one is called instead of simple spinning, 
03/10..05/10, and some of those have in_interrupt() checks for a reason 
(which I do not know but nevertheless) so they may be called with 
interrupts disabled which in turn means we should not be calling 
cond_resched() unconditionally. I am told this is should be done:


if (!preemptible())
	/* print warning, don't sleep or cond_resched */



> +
>   static int rtas_error_rc(int rtas_rc)
>   {
>   	int rc;
> 

-- 
Alexey


More information about the Linuxppc-dev mailing list