[PATCH] EEH detection in acenic watchdog

Olof Johansson olof at austin.ibm.com
Fri Aug 29 12:36:34 EST 2003


Paul Mackerras wrote:

> This is OK for our local ppc64 trees, but it's a bit ugly.  It's an
> extra ifdef and it is putting something very pSeries-specific into a
> driver that otherwise is platform-agnostic.

Ok, so no shortcut this time then. :-)

I wasn't sure what the prevailing attitude was toward adding such
specific hooks. The acenic driver is fairly clean as it is, so I suppose I
should make an effort to keep it that way.

> Maybe what we should propose is to add a "platform_error_check()"
> function which can be called in these kinds of circumstances, with
> null definitions on most architectures.

I've added a pci_check_error(), since EEH is currently limited to PCI on
our machines (and the driver in question is PCI-only). This also required a
minor shuffle in arch/ppc64/kernel/eeh.c to split out a variant of
eeh_check_failure() that takes a struct pci_dev *.

I also chose to #ifndef a dummy definition to be less intrusive on other
architectures. On 2.6 it could probably make sense to modify all
asm-*/pci.h instead, I'm not sure.

See attachment for the patch. Is this more like what you had in mind?

> BTW, I think Jes Sorensen did the acenic driver in the context of his
> work for a previous employer.  I don't know if he has any interest
> in the acenic now (or even any acenic hardware to work on).

Ack, I guess that makes LKML the best venue for patches then? He's still
the official maintainer.


Thanks,

Olof

--
Olof Johansson                                        Office: 4E002/905
pSeries Linux Development                             IBM Systems Group
Email: olof at austin.ibm.com                          Phone: 512-838-9858
All opinions are my own and not those of IBM
-------------- next part --------------
===== arch/ppc64/kernel/eeh.c 1.7 vs edited =====
--- 1.7/arch/ppc64/kernel/eeh.c	Mon Aug 25 23:47:43 2003
+++ edited/arch/ppc64/kernel/eeh.c	Thu Aug 28 21:11:51 2003
@@ -76,8 +76,6 @@
 {
 	unsigned long addr;
 	struct pci_dev *dev;
-	struct device_node *dn;
-	unsigned long ret, rets[2];
 
 	/* IO BAR access could get us here...or if we manually force EEH
 	 * operation on even if the hardware won't support it.
@@ -94,9 +92,21 @@
 		printk("EEH: no pci dev found for addr=0x%lx\n", addr);
 		return val;
 	}
+	return eeh_check_failure_dev(dev, val);
+}
+
+/* Same as eeh_check_failure(), but takes a pci_dev instead of a
+ * token address.
+ */
+
+unsigned long eeh_check_failure_dev(struct pci_dev *dev, unsigned long val)
+{
+	struct device_node *dn;
+	unsigned long ret, rets[2];
+
 	dn = pci_device_to_OF_node(dev);
 	if (!dn) {
-		printk("EEH: no pci dn found for addr=0x%lx\n", addr);
+		printk("EEH: no pci dn found for device %s\n", dev->name);
 		return val;
 	}
 
@@ -133,7 +143,6 @@
 	}
 	eeh_false_positives++;
 	return val;	/* good case */
-
 }
 
 struct eeh_early_enable_info {
===== drivers/net/acenic.c 1.29 vs edited =====
--- 1.29/drivers/net/acenic.c	Fri Jun 20 01:00:08 2003
+++ edited/drivers/net/acenic.c	Thu Aug 28 21:27:34 2003
@@ -1863,6 +1863,10 @@
 		       dev->name, (unsigned int)readl(&regs->HostCtrl));
 		/* This can happen due to ieee flow control. */
 	} else {
+		if (pci_check_error(ap->pdev)) {
+			printk(KERN_WARNING "%s: PCI error detected\n", dev->name);
+		}
+
 		printk(KERN_DEBUG "%s: BUG... transmitter died. Kicking it.\n",
 		       dev->name);
 #if 0
===== include/asm-ppc64/eeh.h 1.6 vs edited =====
--- 1.6/include/asm-ppc64/eeh.h	Mon Aug 25 23:47:51 2003
+++ edited/include/asm-ppc64/eeh.h	Thu Aug 28 20:45:56 2003
@@ -46,6 +46,7 @@
 void eeh_init(void);
 int eeh_get_state(unsigned long ea);
 unsigned long eeh_check_failure(void *token, unsigned long val);
+unsigned long eeh_check_failure_dev(struct pci_dev *dev, unsigned long val);
 void *eeh_ioremap(unsigned long addr, void *vaddr);
 
 #define EEH_DISABLE		0
===== include/asm-ppc64/pci.h 1.3 vs edited =====
--- 1.3/include/asm-ppc64/pci.h	Fri May 10 19:46:04 2002
+++ edited/include/asm-ppc64/pci.h	Thu Aug 28 21:02:10 2003
@@ -145,6 +145,14 @@
  * this boolean for bounce buffer decisions.
  */
 #define PCI_DMA_BUS_IS_PHYS	(0)
+
+#define HAVE_PCI_CHECK_ERROR
+static inline int pci_check_error(struct pci_dev *dev)
+{
+	/* eeh_check_failure returns the second argument on non-failures */
+	return eeh_check_failure_dev(dev, 1);
+}
+
 	
 #endif	/* __KERNEL__ */
 
===== include/linux/pci.h 1.32 vs edited =====
--- 1.32/include/linux/pci.h	Mon Aug 25 23:47:53 2003
+++ edited/include/linux/pci.h	Thu Aug 28 20:47:37 2003
@@ -806,5 +806,16 @@
 #define PCIPCI_VSFX		16
 #define PCIPCI_ALIMAGIK		32
 
+/* Some architectures have additional hardware support to detect problems
+ * with a PCI device and put the slot in a frozen state. This is the
+ * generic way to access that functionality.
+ *
+ * Return value is 0 for "no error detected"
+ */
+
+#ifndef HAVE_PCI_CHECK_ERROR
+static inline int pci_check_error(struct pci_dev *dev) { return 0; }
+#endif /* HAVE_PCI_CHECK_ERROR */
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */


More information about the Linuxppc64-dev mailing list