[Skiboot] [RFC PATCH 2/2] hmi: Remove races in clearing HMER

Wed Jan 17 16:27:26 AEDT 2018

On 01/16/2018 10:15 AM, Benjamin Herrenschmidt wrote:
> Writing to HMER acts as an "AND". The current code writes back the
> value we originally read with the bits we handled cleared. This is
> racy, if a new bit gets set in HW after the original read, we'll end
> up clearing it without handling it.
> 
> Instead, use an all 1's mask with only the bit handled cleared.

Agree.

> 
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>

patch looks fine to me.

Acked-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

> ---
>  core/hmi.c | 22 ++++++++++++----------
>  1 file changed, 12 insertions(+), 10 deletions(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index 5642bd0b..9fc4927d 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -946,7 +946,7 @@ static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
>  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  {
>  	int recover = 1;
> -	uint64_t tfmr;
> +	uint64_t tfmr, handled = 0;
> 
>  	/*
>  	 * In case of split core, some of the Timer facility errors need
> @@ -965,7 +965,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	if (hmi_evt)
>  		hmi_evt->hmer = hmer;
>  	if (hmer & SPR_HMER_PROC_RECV_DONE) {
> -		hmer &= ~SPR_HMER_PROC_RECV_DONE;
> +		handled |= SPR_HMER_PROC_RECV_DONE;
>  		if (hmi_evt) {
>  			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
>  			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
> @@ -974,7 +974,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  		hmi_print_debug("Processor recovery Done.", hmer);
>  	}
>  	if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
> -		hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
> +		handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
>  		if (hmi_evt) {
>  			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
>  			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
> @@ -983,7 +983,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  		hmi_print_debug("Processor recovery Done (masked).", hmer);
>  	}
>  	if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
> -		hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
> +		handled |= SPR_HMER_PROC_RECV_AGAIN;
>  		if (hmi_evt) {
>  			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
>  			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
> @@ -994,7 +994,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	}
>  	/* Assert if we see malfunction alert, we can not continue. */
>  	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
> -		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
> +		handled |= SPR_HMER_MALFUNCTION_ALERT;
> 
>  		hmi_print_debug("Malfunction Alert", hmer);
>  		if (hmi_evt)
> @@ -1003,7 +1003,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
> 
>  	/* Assert if we see Hypervisor resource error, we can not continue. */
>  	if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
> -		hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
> +		handled |= SPR_HMER_HYP_RESOURCE_ERR;
> 
>  		hmi_print_debug("Hypervisor resource error", hmer);
>  		recover = 0;
> @@ -1020,10 +1020,10 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	 */
>  	if (hmer & SPR_HMER_TFAC_ERROR) {
>  		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
> +		handled |= SPR_HMER_TFAC_ERROR;
> 
>  		hmi_print_debug("Timer Facility Error", hmer);
> 
> -		hmer &= ~SPR_HMER_TFAC_ERROR;
>  		recover = chiptod_recover_tb_errors();
>  		if (hmi_evt) {
>  			hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
> @@ -1034,7 +1034,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	}
>  	if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
>  		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
> -		hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
> +		handled |= SPR_HMER_TFMR_PARITY_ERROR;
> 
>  		hmi_print_debug("TFMR parity Error", hmer);
>  		recover = chiptod_recover_tb_errors();
> @@ -1051,9 +1051,11 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	/*
>  	 * HMER bits are sticky, once set to 1 they remain set to 1 until
>  	 * they are set to 0. Reset the error source bit to 0, otherwise
> -	 * we keep getting HMI interrupt again and again.
> +	 * we keep getting HMI interrupt again and again. Writing to HMER
> +	 * acts as an AND, so we write mask of all 1's except for the bits
> +	 * we want to clear.
>  	 */
> -	mtspr(SPR_HMER, hmer);
> +	mtspr(SPR_HMER, ~handled);
>  	hmi_exit();
>  	/* Set the TB state looking at TFMR register before we head out. */
>  	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
>