[Skiboot] [PATCH V2 01/15] core/pldm: Handle Watchdog timer.

Wed Apr 26 22:23:34 AEST 2023

On 29/04/2022 11:47, Christophe Lombard wrote:
> Encode a PLDM platform event message to send the heartbeat to the BMC.
> Watchdog is "armed" when a
> PLDM_EVENT_MESSAGE_GLOBAL_ENABLE_ASYNC_KEEP_ALIVE is received.
> 
> Signed-off-by: Christophe Lombard <clombard at linux.vnet.ibm.com>
> ---
>   core/pldm/Makefile.inc    |   2 +
>   core/pldm/pldm-watchdog.c | 127 ++++++++++++++++++++++++++++++++++++++
>   core/pldm/pldm.h          |   1 +
>   include/pldm.h            |   5 ++
>   4 files changed, 135 insertions(+)
>   create mode 100644 core/pldm/pldm-watchdog.c
> 
> diff --git a/core/pldm/Makefile.inc b/core/pldm/Makefile.inc
> index 506bf5d7..c6f78822 100644
> --- a/core/pldm/Makefile.inc
> +++ b/core/pldm/Makefile.inc
> @@ -9,11 +9,13 @@ CPPFLAGS += -I$(SRC)/pldm/ibm/libpldm/
>   
>   CFLAGS_$(PLDM_DIR)/pldm-platform-requests.o = -Wno-strict-prototypes
>   CFLAGS_$(PLDM_DIR)/pldm-bios-requests.o = -Wno-strict-prototypes
> +CFLAGS_$(PLDM_DIR)/pldm-watchdog.o = -Wno-strict-prototypes

I know you're going to update to latest libpldm, but it seems that 
-Wno-strict-prototypes is no longer going to be needed (they fixed the 
prototypes with no arguments). Might also be true for the other 
file-specific CFLAGS, this is just a reminder that we should check it.

>   
>   PLDM_OBJS = pldm-common.o pldm-responder.o pldm-requester.o
>   PLDM_OBJS += pldm-base-requests.o pldm-platform-requests.o
>   PLDM_OBJS += pldm-bios-requests.o pldm-fru-requests.o
>   PLDM_OBJS += pldm-file-io-requests.o pldm-lid-files.o
> +PLDM_OBJS += pldm-watchdog.o
>   
>   PLDM = $(PLDM_DIR)/built-in.a
>   $(PLDM): $(PLDM_OBJS:%=$(PLDM_DIR)/%)
> diff --git a/core/pldm/pldm-watchdog.c b/core/pldm/pldm-watchdog.c
> new file mode 100644
> index 00000000..9d2aabe8
> --- /dev/null
> +++ b/core/pldm/pldm-watchdog.c
> @@ -0,0 +1,127 @@
> +// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
> +// Copyright 2022 IBM Corp.
> +
> +#define pr_fmt(fmt) "PLDM: " fmt
> +
> +#include <lock.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <opal.h>
> +#include <timebase.h>
> +#include <timer.h>
> +#include <pldm/libpldm/platform.h>
> +#include "pldm.h"
> +
> +#define DEFAULT_WATCHDOG_TIMEOUT_SEC (10 * 60) /* 10 min */
> +
> +/* Whether the watchdog timer is armed and Skiboot should be sending
> + * regular heartbeats.
> + */
> +bool watchdog_armed;
> +
> +/* The period (in seconds) of the PLDM watchdog, as dictated by BMC */
> +int watchdog_period_sec = DEFAULT_WATCHDOG_TIMEOUT_SEC;
> +
> +static struct lock sequence_lock;
> +static uint8_t sequence_number;
> +
> +int pldm_watchdog_reset_timer(void)
> +{
> +	uint8_t heartbeat_elapsed_data[2];
> +	size_t response_len, payload_len;
> +	uint32_t request_length;
> +	void *response_msg;
> +	char *request_msg;
> +	int rc;
> +
> +	struct pldm_platform_event_message_req event_message_req = {
> +		.format_version = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION,
> +		.tid = HOST_TID,
> +		.event_class = PLDM_HEARTBEAT_TIMER_ELAPSED_EVENT,
> +	};
> +
> +	struct pldm_platform_event_message_resp response;
> +
> +	/* watchdog is not armed, so no need to send the heartbeat */
> +	if (!watchdog_armed) {
> +		prlog(PR_ERR, "%s - PLDM watchdog is not armed, not sending the heartbeat\n",
> +			      __func__);
> +		return OPAL_PARAMETER;
> +	}
> +
> +	prlog(PR_INFO, "%s - send the heartbeat to the BMC, sequence: %d, period: %d\n",
> +		       __func__, sequence_number, watchdog_period_sec);
> +
> +	/* Send the event request */
> +	heartbeat_elapsed_data[0] = PLDM_PLATFORM_EVENT_MESSAGE_FORMAT_VERSION;
> +
> +	/* We need to make sure that we send the BMC the correct
> +	 * sequence number. To prevent possible race conditions for the
> +	 * sequence number, lock it while we're incrementing and
> +	 * sending it down.
> +	 */
> +	lock(&sequence_lock);
> +	heartbeat_elapsed_data[1] = sequence_number++;
> +
> +	payload_len = PLDM_PLATFORM_EVENT_MESSAGE_MIN_REQ_BYTES + sizeof(heartbeat_elapsed_data);
> +
> +	request_length = sizeof(struct pldm_msg_hdr) +
> +			 sizeof(struct pldm_platform_event_message_req) +
> +			 sizeof(heartbeat_elapsed_data);
> +	request_msg = malloc(request_length);
> +
> +	/* Encode the platform event message request */
> +	rc = encode_platform_event_message_req(
> +			DEFAULT_INSTANCE_ID,
> +			event_message_req.format_version,
> +			event_message_req.tid,
> +			event_message_req.event_class,
> +			heartbeat_elapsed_data,
> +			sizeof(heartbeat_elapsed_data),
> +			(struct pldm_msg *)request_msg,
> +			payload_len);
> +	if (rc != PLDM_SUCCESS) {
> +		prlog(PR_ERR, "Encode PlatformEventMessage Error, rc: %d\n", rc);
> +		free(request_msg);
> +		sequence_number--;

We don't release the sequence_lock in this error path.

Do we really care about decrementing sequence_number in case of error? 
The spec says it allows the BMC to know if it misses one hearbeat. Which 
would be the case in case of error, so decrementing is actually 
misleading. And it would simplify the code.

> +		return OPAL_PARAMETER;
> +	}
> +	unlock(&sequence_lock);
> +
> +	/* Send and get the response message bytes */
> +	rc = pldm_do_request(BMC_EID, request_msg, request_length - 1,
> +			     &response_msg, &response_len);
> +	if (rc) {
> +		prlog(PR_ERR, "Communication Error, req: PlatformEventMessage, rc: %d\n", rc);
> +		free(request_msg);
> +		sequence_number--;

sequence_number is decremented with no lock held. But see previous 
comment, we may not need to decrement.

> +		return rc;
> +	}
> +	free(request_msg);
> +
> +	/* Decode the message */
> +	payload_len = response_len - sizeof(struct pldm_msg_hdr);
> +	rc = decode_platform_event_message_resp(
> +				response_msg,
> +				payload_len,
> +				&response.completion_code,
> +				&response.platform_event_status);
> +	if (rc != PLDM_SUCCESS || response.completion_code != PLDM_SUCCESS) {
> +		prlog(PR_ERR, "Decode PlatformEventMessage Error, rc: %d, cc: %d, pes: %d\n",
> +			      rc, response.completion_code,
> +			      response.platform_event_status);
> +		free(response_msg);
> +		return OPAL_PARAMETER;
> +	}
> +
> +	free(response_msg);
> +

So we only send one heartbeat message? Not emitted periodically?
 From the spec:
"heartbeatTimer
Amount of time in seconds after *each* elapsing of which the terminus 
shall emit a heartbeat event (the heartbeatTimerElapsedEvent) to the 
event receiver. If the terminus cannot produce heartbeat events at
the requested rate, it shall return completion code 
HEARTBEAT_FREQUENCY_TOO_HIGH."

So it seems to me that once a SetEventReceiver command is received 
requesting a heartbeat, we should send one periodically.

> +	return OPAL_SUCCESS;
> +}
> +
> +int pldm_watchdog_init(void)
> +{
> +	init_lock(&sequence_lock);

pldm_watchdog_reset_timer() is an exported API and a misbehaving caller 
might call it and use the lock before it's init. We should use a static 
initializer when declaring the global variable:

	static struct lock sequence_lock = LOCK_UNLOCKED

> +
> +	return pldm_watchdog_reset_timer();

Does it make sense to call pldm_watchdog_reset_timer() during init? Can 
watchdog_timer already be set?

   Fred