[Skiboot] [PATCH 5/5] hmi: Recover both CAPP units on Naples after malfunction alert

Fri Feb 12 21:11:31 AEDT 2016

Michael Neuling wrote:
> On Mon, 2016-02-08 at 16:30 +0100, Philippe Bergheaud wrote:
> 
>>In decode_one_malfunction, check the chip type and if Naples, then
>>loop
>>on both capp units.
> 
> 
> Does this mean an error on one card takes down both?  That seem broken.
> 
That makes no sense indeed. I should rephrase it as:

"Naples has two capp units. Probe both units to identify the card in error state."

I should also fix the logic of the 'for' loop (see below).
> 
> 
>>Signed-off-by: Philippe Bergheaud <felix at linux.vnet.ibm.com>
>>---
>> core/hmi.c | 29 +++++++++++++++++++++--------
>> 1 file changed, 21 insertions(+), 8 deletions(-)
>>
>>diff --git a/core/hmi.c b/core/hmi.c
>>index d2cca90..5204bb3 100644
>>--- a/core/hmi.c
>>+++ b/core/hmi.c
>>@@ -242,14 +242,19 @@ static int queue_hmi_event(struct OpalHMIEvent
>>*hmi_evt, int recover)
>> 				hmi_data[3]);
>> }
>> 
>>-static int is_capp_recoverable(int chip_id)
>>+static int is_capp_recoverable(int chip_id, int capp)
>> {
>> 	uint64_t reg;
>>-	xscom_read(chip_id, CAPP_ERR_STATUS_CTRL, &reg);
>>+	uint32_t reg_offset = capp ? CAPP1_REG_OFFSET : 0x0;
>>+
>>+	xscom_read(chip_id, CAPP_ERR_STATUS_CTRL + reg_offset,
>>&reg);
>> 	return (reg & PPC_BIT(0)) != 0;
>> }
>> 
>>-static int handle_capp_recoverable(int chip_id)
>>+#define CAPP_PHB3_ATTACHED(chip, phb_index) \
>>+	(chip->capp_phb3_attached_mask & (1 << phb_index))
>>+
>>+static int handle_capp_recoverable(int chip_id, int capp)
>> {
>> 	struct dt_node *np;
>> 	u64 phb_id;
>>@@ -257,14 +262,16 @@ static int handle_capp_recoverable(int chip_id)
>> 	struct phb *phb;
>> 	u32 phb_index;
>> 	struct proc_chip *chip = get_chip(chip_id);
>>-	u8 mask = chip->capp_phb3_attached_mask;
>>+	int dual_capp = (chip->type == PROC_CHIP_P8_NAPLES);
>> 
>> 	dt_for_each_compatible(dt_root, np, "ibm,power8-pciex") {
>> 		dt_chip_id = dt_prop_get_u32(np, "ibm,chip-id");
>> 		phb_index = dt_prop_get_u32(np, "ibm,phb-index");
>> 		phb_id = dt_prop_get_u64(np, "ibm,opal-phbid");
>> 
>>-		if ((mask & (1 << phb_index)) && (chip_id ==
>>dt_chip_id)) {
>>+		if ((chip_id == dt_chip_id) &&
>>+		    CAPP_PHB3_ATTACHED(chip, phb_index) &&
>>+		    (!dual_capp || phb_index == capp)) {
>> 			phb = pci_get_phb(phb_id);
>> 			phb->ops->lock(phb);
>> 			phb->ops->set_capp_recovery(phb);
>>@@ -277,13 +284,19 @@ static int handle_capp_recoverable(int chip_id)
>> 
>> static int decode_one_malfunction(int flat_chip_id, struct
>>OpalHMIEvent *hmi_evt)
>> {
>>+	int capp;
>>+	int recover = 0;
>>+	struct proc_chip *chip = get_chip(flat_chip_id);
>>+	int dual_capp = (chip->type == PROC_CHIP_P8_NAPLES);
>>+
>> 	hmi_evt->severity = OpalHMI_SEV_FATAL;
>> 	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
>> 
>>-	if (is_capp_recoverable(flat_chip_id)) {
>>-		if (handle_capp_recoverable(flat_chip_id) == 0)
>>-			return 0;
>>+	for (capp = 0; capp < (dual_capp ? 2 : 1); capp++)
>>+		if (is_capp_recoverable(flat_chip_id, capp))
>>+			recover |=
>>handle_capp_recoverable(flat_chip_id, capp);
>> 
>>+	if (recover) {
>> 		hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
>> 		hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
>> 		return 1;
 >>
The loop should exit as soon as a broken card has been found and recovered, i.e.:

for (capp = 0; capp < (dual_capp ? 2 : 1); capp++)
	if (is_capp_recoverable(flat_chip_id, capp))
		if (handle_capp_recoverable(flat_chip_id, capp)) {
			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
			hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
			return 1;
		}

Should both cards be broken, capp1 would then be handled on the next malfunction alert.

Philippe

PS: I have not yet been able to test this new logic on Naples (g13 seems to be down).