[PATCH v2 1/2] powerpc/powernv: Add poweroff (EPOW, DPO) events support for PowerNV platform

trigg mr.triggtrigg at gmail.com
Fri May 8 17:37:24 AEST 2015


Hi Vipin,

These comments are in addition to what Joel has said in his review.

On Thu, May 7, 2015 at 3:00 PM, Vipin K Parashar
<vipin at linux.vnet.ibm.com> wrote:
> This patch adds support for FSP EPOW (Early Power Off Warning) and
> DPO (Delayed Power Off) events support for PowerNV platform.  EPOW events
> are generated by SPCN/FSP due to various critical system conditions that
> need system shutdown.  Few examples of these conditions are high ambient
> temperature or system running on UPS power with low UPS battery. DPO event
> is generated in response to admin initiated system shutdown request.
>         This patch enables host kernel on PowerNV platform to handle OPAL
> notifications for these events and initiate system poweroff. Since EPOW
> notifications are sent in advance of impending shutdown event and thus
> this patch also adds functionality to wait for EPOW condition to return to
> normal. Host allows MAX_POWEROFF_SYS_TIME (600 seconds) as system
> poweroff time (time for host + guests shutdown) and waits for remaining
> time for EPOW condition to return to normal. If EPOW condition doesn't
> return to normal in calculated time it proceeds with graceful system
> shutdown. For EPOW events with smaller timeouts values than
> MAX_POWEROFF_SYS_TIME it proceeds with system shutdown without any wait
> for EPOW condition to return to normal.
>         System admin can also add systemd service shutdown scripts to
> perform any specific actions like graceful guest shutdown upon system
> poweroff. libvirt-guests is systemd service available on recent distros
> for management of guests at system stat/shutdown time.
>
> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/opal-api.h            |  30 ++
>  arch/powerpc/include/asm/opal.h                |   3 +-
>  arch/powerpc/platforms/powernv/opal-power.c    | 379 +++++++++++++++++++++++--
>  arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
>  4 files changed, 391 insertions(+), 22 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
> index 0321a90..03b3cef 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -730,6 +730,36 @@ struct opal_i2c_request {
>         __be64 buffer_ra;               /* Buffer real address */
>  };
>
> +/*
> + * EPOW status sharing (OPAL and the host)
> + *
> + * The host will pass on OPAL, a buffer of length OPAL_EPOW_MAX_CLASSES
> + * to fetch system wide EPOW status. Each element in the returned buffer
> + * will contain bitwise EPOW status for each EPOW sub class.
> + */
> +
> +/* EPOW types */
> +enum OpalEpow {
> +       OPAL_EPOW_POWER         = 0,    /* Power EPOW */
> +       OPAL_EPOW_TEMP          = 1,    /* Temperature EPOW */
> +       OPAL_EPOW_COOLING       = 2,    /* Cooling EPOW */
> +       OPAL_MAX_EPOW_CLASSES   = 3,    /* Max EPOW categories */
> +};
Dont explicitly assign sequential numbers in an enum. Its taken care
of by the compiler.

> +
> +/* Power EPOW events */
> +enum OpalEpowPower {
> +       OPAL_EPOW_POWER_UPS     = 0x1, /* System on UPS power */
> +       OPAL_EPOW_POWER_UPS_LOW = 0x2, /* System on UPS power with low battery*/
> +};
> +
> +/* Temperature EPOW events */
> +enum OpalEpowTemp {
> +       OPAL_EPOW_TEMP_HIGH_AMB = 0x1, /* High ambient temperature */
> +       OPAL_EPOW_TEMP_CRIT_AMB = 0x2, /* Critical ambient temperature */
> +       OPAL_EPOW_TEMP_HIGH_INT = 0x4, /* High internal temperature */
> +       OPAL_EPOW_TEMP_CRIT_INT = 0x8, /* Critical internal temperature */
> +};
> +
>  #endif /* __ASSEMBLY__ */
>
>  #endif /* __OPAL_API_H */
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 042af1a..0777864 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -141,7 +141,6 @@ int64_t opal_pci_fence_phb(uint64_t phb_id);
>  int64_t opal_pci_reinit(uint64_t phb_id, uint64_t reinit_scope, uint64_t data);
>  int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action);
>  int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action);
> -int64_t opal_get_epow_status(__be64 *status);
>  int64_t opal_set_system_attention_led(uint8_t led_action);
>  int64_t opal_pci_next_error(uint64_t phb_id, __be64 *first_frozen_pe,
>                             __be16 *pci_error_type, __be16 *severity);
> @@ -200,6 +199,8 @@ int64_t opal_flash_write(uint64_t id, uint64_t offset, uint64_t buf,
>                 uint64_t size, uint64_t token);
>  int64_t opal_flash_erase(uint64_t id, uint64_t offset, uint64_t size,
>                 uint64_t token);
> +int32_t opal_get_epow_status(__be32 *status, __be32 *num_classes);
> +int32_t opal_get_dpo_status(__be32 *timeout);
>
>  /* Internal functions */
>  extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
> diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
> index ac46c2c..7c1b2f8 100644
> --- a/arch/powerpc/platforms/powernv/opal-power.c
> +++ b/arch/powerpc/platforms/powernv/opal-power.c
> @@ -1,5 +1,5 @@
>  /*
> - * PowerNV OPAL power control for graceful shutdown handling
> + * PowerNV poweroff events support
>   *
>   * Copyright 2015 IBM Corp.
>   *
> @@ -9,58 +9,395 @@
>   * 2 of the License, or (at your option) any later version.
>   */
>
> +#define pr_fmt(fmt)    "POWEROFF_EVENT: "    fmt
> +
>  #include <linux/kernel.h>
> +#include <linux/spinlock.h>
> +#include <linux/timer.h>
>  #include <linux/reboot.h>
> -#include <linux/notifier.h>
> -
> +#include <linux/of.h>
>  #include <asm/opal.h>
>  #include <asm/machdep.h>
>
> -#define SOFT_OFF 0x00
> -#define SOFT_REBOOT 0x01
> +/* Power control event types */
> +#define SOFT_OFF       0x00
> +#define SOFT_REBOOT    0x01
> +
> +/* Max time for graceful system shutdown including guests. */
> +#define MAX_POWEROFF_SYS_TIME  600
> +
> +/* System EPOW, DPO support variables */
> +bool epow_supported = false;
> +bool dpo_supported = false;
Make them static and no need to set them to false explicitly.

> +
> +/* System EPOW status */
> +static u32 epow_status[OPAL_MAX_EPOW_CLASSES];
> +static int num_epow_classes;
> +
> +/* EPOW event timer and corresponding locks */
> +static struct timer_list epow_timer;
> +static DEFINE_SPINLOCK(epow_timer_spinlock);
> +
> +/* EPOW events types */
> +#define EPOW_POWER_UPS         0
> +#define EPOW_POWER_UPS_LOW     1
> +#define EPOW_TEMP_HIGH_AMB     2
> +#define EPOW_TEMP_CRIT_AMB     3
> +#define EPOW_TEMP_HIGH_INT     4
> +#define EPOW_TEMP_CRIT_INT     5
> +#define EPOW_UNKNOWN           6
> +#define MAX_EPOW_EVENTS                7
Can make these constants enum.

> +
> +/* Poweroff EPOW events description */
> +static const char * const epow_events_map[] = {
> +       [EPOW_POWER_UPS]        = "ups",
> +       [EPOW_POWER_UPS_LOW]    = "ups-low",
> +       [EPOW_TEMP_HIGH_AMB]    = "high-ambient-temp",
> +       [EPOW_TEMP_CRIT_AMB]    = "crit-ambient-temp",
> +       [EPOW_TEMP_HIGH_INT]    = "high-internal-temp",
> +       [EPOW_TEMP_CRIT_INT]    = "crit-internal-temp",
> +       [EPOW_UNKNOWN]          = "Unknown",
> +};
> +
> +/* Poweroff EPOW events timeout values in seconds */
> +static const int epow_timeout[] = {
> +       [EPOW_POWER_UPS]        = 900,
> +       [EPOW_POWER_UPS_LOW]    = 20,
> +       [EPOW_TEMP_HIGH_AMB]    = 900,
> +       [EPOW_TEMP_CRIT_AMB]    = 20,
> +       [EPOW_TEMP_HIGH_INT]    = 900,
> +       [EPOW_TEMP_CRIT_INT]    = 20,
> +       [EPOW_UNKNOWN]          = 0,
> +};
> +
> +/* System poweroff function. */
> +static void epow_poweroff(unsigned long event)
> +{
> +       pr_info("Powering off system due to %s EPOW event\n",
> +                               epow_events_map[event]);
> +       orderly_poweroff(true);
> +}
> +
> +/* Start EPOW poweroff timer */
> +static void start_epow_timer(unsigned long event, int32_t timeout)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&epow_timer_spinlock, flags);
> +       /* Check for already running epow poweroff timer */
> +       if (timer_pending(&epow_timer)) {
> +               /* Timer for same event */
> +               if (epow_timer.data == event) {
> +                       spin_unlock_irqrestore(&epow_timer_spinlock, flags);
> +                       return;
> +               }
I dont  think this condition is need as the timeout for an event is
fixed and event if this
function is called twice with same event then the condition below will
take care of it.

> +
> +               /* Timer with early poweroff timeout */
> +               if (epow_timer.expires <= (jiffies + timeout * HZ)) {
> +                       event = epow_timer.data;
Load/Store not really needed. You can simply use the epow_timer.data.
The race even if it happens wont be catastrophic.

> +                       spin_unlock_irqrestore(&epow_timer_spinlock, flags);
> +                       pr_info("Poweroff already scheduled for %s EPOW event "
> +                                       "with earlier timeout.\n",
> +                                       epow_events_map[event]);
> +                       return;
> +               }
> +       }
> +
> +       /* Start a new timer/modify existing timer with new timeout value */
> +       epow_timer.data = event;
> +       mod_timer(&epow_timer, jiffies + timeout  * HZ);
> +       spin_unlock_irqrestore(&epow_timer_spinlock, flags);
> +       pr_info("Scheduled system poweroff due to %s EPOW event "
> +                       "after %d seconds\n", epow_events_map[event], timeout);
> +}
> +
> +/* Stop EPOW poweroff timer */
> +static void stop_epow_timer(void)
> +{
> +       int rc;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&epow_timer_spinlock, flags);
> +       rc = del_timer(&epow_timer);
> +       spin_unlock_irqrestore(&epow_timer_spinlock, flags);
> +
> +       if (rc)
> +               pr_info("Poweroff timer deactivated\n");
> +}
>
> +/* Get DPO status */
> +static bool get_dpo_status(int32_t *dpo_timeout)
> +{
> +       int rc;
> +       __be32 opal_dpo_timeout;
> +
> +       rc = opal_get_dpo_status(&opal_dpo_timeout);
> +       if (rc == OPAL_WRONG_STATE) {
> +               *dpo_timeout = 0;
> +               return false;
> +       }
> +
> +       *dpo_timeout = be32_to_cpu(opal_dpo_timeout);
> +       return true;
> +}
> +
> +/* Get EPOW status */
> +static bool get_epow_status(void)
> +{
> +       int i;
> +       bool epow_detected = false;
> +
> +       __be32 opal_epow_status[OPAL_MAX_EPOW_CLASSES] = {0};
> +       __be32 epow_classes;
> +
> +       /* Send kernel EPOW classes supported info to OPAL */
> +       epow_classes = cpu_to_be32(OPAL_MAX_EPOW_CLASSES);
> +
> +       /* Get EPOW events information from OPAL */
> +       opal_get_epow_status(opal_epow_status, &epow_classes);
> +
> +       /* Copy EPOW status obtained from OPAL */
> +       memset(epow_status, 0, sizeof(epow_status));
> +       num_epow_classes = be32_to_cpu(epow_classes);
> +       for (i = 0; i < num_epow_classes; i++) {
> +               epow_status[i] = be32_to_cpu(opal_epow_status[i]);
> +               if (epow_status[i])
> +                       epow_detected = true;
> +       }
> +
> +       pr_info("EPOW classes supported OPAL = %d, Host = %d "
> +                       "EPOW Status = 0x%x, 0x%x, 0x%x\n",
> +                       num_epow_classes, OPAL_MAX_EPOW_CLASSES,
> +                       epow_status[0], epow_status[1], epow_status[2]);
> +
> +       return epow_detected;
> +}
> +
> +/* Process EPOW information */
> +static void process_epow(void)
> +{
> +       int timeout;
> +       unsigned int event = EPOW_UNKNOWN;
> +
> +       /* EPOW events in order of criticality */
> +       if (epow_status[OPAL_EPOW_POWER] & OPAL_EPOW_POWER_UPS) {
> +               pr_info("EPOW due to system running on UPS power\n");
> +               event = EPOW_POWER_UPS;
> +       }
> +
> +       if (epow_status[OPAL_EPOW_TEMP] & OPAL_EPOW_TEMP_HIGH_AMB) {
> +               pr_info("EPOW due to high ambient temperature\n");
> +               event = EPOW_TEMP_HIGH_AMB;
> +       }
> +
> +       if (epow_status[OPAL_EPOW_TEMP] & OPAL_EPOW_TEMP_HIGH_INT) {
> +               pr_info("EPOW due to high internal temperature\n");
> +               event = EPOW_TEMP_HIGH_INT;
> +       }
> +
> +       if (epow_status[OPAL_EPOW_POWER] & OPAL_EPOW_POWER_UPS_LOW) {
> +               pr_info("EPOW due to system running on UPS power "
> +                               "with low battery\n");
> +               event = EPOW_POWER_UPS_LOW;
> +       }
> +
> +       if (epow_status[OPAL_EPOW_TEMP] & OPAL_EPOW_TEMP_CRIT_AMB) {
> +               pr_info("EPOW due to critical ambient temperature\n");
> +               event = EPOW_TEMP_CRIT_AMB;
> +       }
> +
> +       if (epow_status[OPAL_EPOW_TEMP] & OPAL_EPOW_TEMP_CRIT_INT) {
> +               pr_info("EPOW due to critical internal temperature\n");
> +               event = EPOW_TEMP_CRIT_INT;
> +       }
> +
> +       if (event == EPOW_UNKNOWN) {
> +               pr_err("Unknown EPOW event.\n");
> +               return;
> +       }
Can convert above series of ifs to an if-else ladder with most
critical events checked for first.
In that way you wont be checking against a non critical event when a
critical event has already arrived.

> +
> +       /*
> +       * Check if we have enough time to perform graceful shutdown.
> +       * Wait for EPOW condition to return to normal if we have more
> +       * time than needed for graceful shutdown time. Otherwise proceed
> +       * with immediate system shutdown
> +       */
> +       timeout = epow_timeout[event];
> +       if (timeout <= MAX_POWEROFF_SYS_TIME)
> +               timeout = 0;
> +       else
> +               timeout = timeout - MAX_POWEROFF_SYS_TIME;
> +
> +       /* Start EPOW poweroff timer */
> +       start_epow_timer(event, timeout);
> +}
> +
> +/* Process existing EPOW, DPO events */
> +static void process_existing_poweroff_events(void)
> +{
> +       bool status;
> +       int32_t dpo_timeout;
> +
> +       /* Check for any existing DPO event */
> +       if (dpo_supported) {
> +               status = get_dpo_status(&dpo_timeout);
> +               if (status) {
> +                       pr_info("Existing DPO event detected. "
> +                                       "Powering off system\n");
> +                       orderly_poweroff(true);
> +                       return;
> +               }
> +               pr_info("No existing DPO event detected\n");
> +       }
> +
> +       /* Check for any existing EPOW event */
> +       if (epow_supported) {
> +               status = get_epow_status();
> +               if (status) {
> +                       pr_info("Existing EPOW event detected.\n");
> +                       process_epow();
> +                       return;
> +               }
> +               pr_info("No existing EPOW event detected\n");
> +       }
> +}
> +
> +/* EPOW event notifier */
> +static int opal_epow_event(struct notifier_block *nb,
> +                       unsigned long msg_type, void *msg)
> +{
> +       bool epow_present;
> +
> +       pr_info("EPOW event received\n");
> +
> +       /* Get EPOW event details */
> +       epow_present = get_epow_status();
> +
> +       /* Stop shutdown timer if EPOW condition has returned to normal */
> +       if (!epow_present) {
Can be replaced with  if (!get_epow_status()) eliminating the variable
epow_present.
> +               stop_epow_timer();
> +               return 0;
> +       }
> +
> +       /* Process EPOW event information */
> +       process_epow();
> +
> +       return 0;
> +}
> +
Not much being done in this function above. May be you can think about merging
it with process_epow

> +/* DPO event notifier */
> +static int opal_dpo_event(struct notifier_block *nb,
> +                               unsigned long msg_type, void *msg)
> +{
> +       pr_info("DPO event received. Powering off system as requested\n");
> +       orderly_poweroff(true);
> +
> +       return 0;
> +}
> +
> +/* IPMI power-control events notifier */
>  static int opal_power_control_event(struct notifier_block *nb,
> -                                   unsigned long msg_type, void *msg)
> +                               unsigned long msg_type, void *msg)
>  {
> -       struct opal_msg *power_msg = msg;
>         uint64_t type;
> +       struct opal_msg *power_msg = msg;
>
>         type = be64_to_cpu(power_msg->params[0]);
>
>         switch (type) {
>         case SOFT_REBOOT:
> -               pr_info("OPAL: reboot requested\n");
> +               pr_info("Reboot requested\n");
>                 orderly_reboot();
>                 break;
>         case SOFT_OFF:
> -               pr_info("OPAL: poweroff requested\n");
> +               pr_info("Poweroff requested\n");
>                 orderly_poweroff(true);
>                 break;
>         default:
> -               pr_err("OPAL: power control type unexpected %016llx\n", type);
> +               pr_err("Unknown event %llu\n", type);
>         }
>
>         return 0;
>  }
>
> +/* OPAL EPOW event notifier block */
> +static struct notifier_block opal_epow_nb = {
> +       .notifier_call  = opal_epow_event,
> +       .next           = NULL,
> +       .priority       = 0,
> +};
> +
> +/* OPAL DPO event notifier block */
> +static struct notifier_block opal_dpo_nb = {
> +       .notifier_call  = opal_dpo_event,
> +       .next           = NULL,
> +       .priority       = 0,
> +};
> +
> +/* OPAL Power control events */
>  static struct notifier_block opal_power_control_nb = {
> -       .notifier_call  = opal_power_control_event,
> -       .next           = NULL,
> -       .priority       = 0,
> +       .notifier_call  = opal_power_control_event,
> +       .next           = NULL,
> +       .priority       = 0,
>  };
>
> -static int __init opal_power_control_init(void)
> +/* Poweroff events init */
> +static int __init opal_poweroff_events_init(void)
>  {
>         int ret;
> +       struct device_node *node_epow;
>
> -       ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
> -                                            &opal_power_control_nb);
> -       if (ret) {
> -               pr_err("%s: Can't register OPAL event notifier (%d)\n",
> -                               __func__, ret);
> -               return ret;
> +       /*
> +       * Determine EPOW, DPO support in hardware.
> +       */
> +       node_epow = of_find_node_by_path("/ibm,opal/epow");
> +       if (node_epow) {
> +               if (of_device_is_compatible(node_epow, "ibm,opal-epow")) {
> +                       epow_supported = true;
> +                       dpo_supported = true;
> +                       pr_info("OPAL EPOW, DPO support detected.\n");
> +               }
> +               of_node_put(node_epow);
> +       }
> +
> +       /* Prepare to handle EPOW events */
> +       if (epow_supported) {
> +               /* Initialize poweroff timer */
> +               init_timer(&epow_timer);
> +               epow_timer.function = epow_poweroff;
Variable initialization should be done while defining the variable
itself not in the code.
Consider using DEFINE_TIMER macro to define this.

> +
> +               /* Register EPOW event notifier */
> +               ret = opal_message_notifier_register(OPAL_MSG_EPOW,
> +                               &opal_epow_nb);
> +               if (ret) {
> +                       pr_err("EPOW event notifier registration failed, "
> +                                       "ret = %d\n", ret);
> +               }
> +       }
> +
> +       if (dpo_supported) {
> +               /* Register DPO event notifier */
> +               ret = opal_message_notifier_register(OPAL_MSG_DPO,
> +                               &opal_dpo_nb);
> +               if (ret)
> +                       pr_err("DPO event notifier registration failed, "
> +                                       "ret = %d\n", ret);
>         }
>
> +       /* Check for any existing EPOW or DPO events. */
> +       process_existing_poweroff_events();
Process existing events only after you have registered all the notifiers or you
may miss some OPAL_MSG_SHUTDOWN notifications.

> +
> +       /* Register IPMI poweroff events notifier */
> +       ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
> +                               &opal_power_control_nb);
> +       if (ret)
> +               pr_err("IPMI power-control events notifier registration "
> +                               "failed, ret = %d\n", ret);
> +
> +
> +       pr_info("Poweroff events support initialized\n");
> +
>         return 0;
>  }
> -machine_subsys_initcall(powernv, opal_power_control_init);
> +
> +machine_subsys_initcall(powernv, opal_poweroff_events_init);
> diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
> index a7ade94..5d3c8e3 100644
> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
> @@ -249,6 +249,7 @@ OPAL_CALL(opal_pci_reinit,                  OPAL_PCI_REINIT);
>  OPAL_CALL(opal_pci_mask_pe_error,              OPAL_PCI_MASK_PE_ERROR);
>  OPAL_CALL(opal_set_slot_led_status,            OPAL_SET_SLOT_LED_STATUS);
>  OPAL_CALL(opal_get_epow_status,                        OPAL_GET_EPOW_STATUS);
> +OPAL_CALL(opal_get_dpo_status,                 OPAL_GET_DPO_STATUS);
>  OPAL_CALL(opal_set_system_attention_led,       OPAL_SET_SYSTEM_ATTENTION_LED);
>  OPAL_CALL(opal_pci_next_error,                 OPAL_PCI_NEXT_ERROR);
>  OPAL_CALL(opal_pci_poll,                       OPAL_PCI_POLL);
> --
> 1.9.3
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev


More information about the Linuxppc-dev mailing list