[PATCH] FYI/ Re: PCI Error Recovery API Proposal (updated)

Linas Vepstas linas at austin.ibm.com
Sat May 7 09:05:06 EST 2005


Hi,

This is an "FYI" patch partially implementing the PCI error recovery API 
previously detailed by BenH in an earlier email.  

It's an "FYI patch" because this patch has numerous flaws and limitations
which I'm hoping to address any day now.  I've been busy with other
things, but have recently been able to carve out a chunk of time to work
on this.

This patch is almost identical to a previous patch I'd mailed out
before, with only minor changes made to bring it into line with
BenH's proposed API.  Basically, I'm just dusting off the old patch,
prior to making more serious changes.  I hope to send a more serious
patch in a few days/week.  Meanwhile, criticism invited.

This patch does actually recover from PCI errors on ethernet cards 
plugged into ppc64 hotplug slots, and from PCI errors on the IPR scsi 
controller. 

--linas

-------------- next part --------------
--- include/linux/pci.h.linas-orig	2005-04-29 20:27:22.000000000 -0500
+++ include/linux/pci.h	2005-05-06 16:34:02.000000000 -0500
@@ -659,6 +659,80 @@ struct pci_dynids {
 	unsigned int use_driver_data:1; /* pci_driver->driver_data is used */
 };
 
+/* ---------------------------------------------------------------- */
+/** PCI error recovery state.  Whenever the PCI bus state changes,
+ *  the error_detected() callback will be called to notify the
+ *  device driver of state changes.
+ */
+
+enum pci_channel_state {
+	pci_channel_io_normal = 0, /* I/O channel is in normal state */
+	pci_channel_io_frozen = 1, /* I/O to channel is blocked */
+	pci_channel_io_perm_failure, /* pci card is dead */
+};
+
+enum pcierr_result {
+	PCIERR_RESULT_CAN_RECOVER=1,
+	PCIERR_RESULT_NEED_RESET,
+	PCIERR_RESULT_DISCONNECT,
+	PCIERR_RESULT_RECOVERED,
+};
+
+/* PCI bus error event callbacks */
+struct pci_error_handlers
+{
+	int (*error_detected)(struct pci_dev *dev, enum pci_channel_state error);
+	int (*error_recover)(struct pci_dev *dev);
+	int (*error_restart)(struct pci_dev *dev);
+	int (*link_reset)(struct pci_dev *dev);
+	int (*slot_reset)(struct pci_dev *dev);
+};
+
+/**
+ * PCI Error notifier event flags.
+ */
+#define PEH_NOTIFY_ERROR 1
+
+/** PEH event -- structure holding pci controller data that describes
+ *  a change in the isolation status of a PCI slot.  A pointer
+ *  to this struct is passed as the data pointer in a notify callback.
+ */
+struct peh_event {
+	struct list_head     list;
+	struct pci_dev       *dev;  /* affected device */
+	enum pci_channel_state state; /* PCI bus state for the affected device */
+	int time_unavail;    /* milliseconds until device might be available */
+};
+
+/**
+ * peh_send_failure_event - generate a PCI error event
+ * @dev: pci device
+ *
+ * This routine builds a PCI error event which will be delivered 
+ * to all listeners on the peh_notifier_chain.
+ * 
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).  
+ */
+int peh_send_failure_event (struct pci_dev *dev, 
+                            enum pci_channel_state state, 
+                            int time_unavail);
+
+/**
+ * peh_register_notifier - Register to find out about PEH events.
+ * @nb: notifier block to callback on events
+ */
+int peh_register_notifier(struct notifier_block *nb);
+
+/**
+ * peh_unregister_notifier - Unregister a PEH event notifier.
+ * @nb: notifier block to callback on events
+ */
+int peh_unregister_notifier(struct notifier_block *nb);
+
+/* ---------------------------------------------------------------- */
+
 struct module;
 struct pci_driver {
 	struct list_head node;
@@ -671,6 +745,7 @@ struct pci_driver {
 	int  (*resume) (struct pci_dev *dev);	                /* Device woken up */
 	int  (*enable_wake) (struct pci_dev *dev, u32 state, int enable);   /* Enable wake event */
 
+	struct pci_error_handlers err_handler;
 	struct device_driver	driver;
 	struct pci_dynids dynids;
 };
--- Documentation/pci-error-recovery.txt.linas-orig	2005-05-06 17:44:41.000000000 -0500
+++ Documentation/pci-error-recovery.txt	2005-05-06 17:39:19.000000000 -0500
@@ -0,0 +1,192 @@
+
+                   PCI Error Recovery
+                   ------------------
+
+
+Preliminary sketch of API, cut n pasted from email from BenH.
+circa 5 april 2005
+
+The error recovery API support is exposed by the driver in the form of
+a structure of function pointers pointed to by a new field in struct
+pci_driver. The absence of this pointer in pci_driver denotes a
+"non-aware" driver; behaviour for these is platform dependent. Platforms
+like ppc64 can try to simulate hotplug remove/add.
+
+The definition of "pci_error_token" is not covered here. It is based on
+Seto's work on the synchronous error detection. We still need to define
+functions for extracting infos out of an opaque error token. This is
+separate from this API.
+
+This structure has the form:
+
+struct pci_error_handlers
+{
+        int (*error_detected)(struct pci_dev *dev, pci_error_token error);
+        int (*error_recover)(struct pci_dev *dev);
+        int (*error_restart)(struct pci_dev *dev);
+        int (*link_reset)(struct pci_dev *dev);
+        int (*slot_reset)(struct pci_dev *dev);
+};
+
+A driver doesn't have to implement all of these callbacks. The only mandatory
+one is error_detected. If a callback is not implemented, the corresponding
+feature is considered unsupported. For example, if error_recover and
+error_restart (they really go together, see description to understand why)
+aren't there, then the driver is assumed as not doing any direct recovery and
+requires a reset. If link_reset is not implemented, the card is assumed as
+not caring about link resets, in which case, if recover is supported, the core
+can try recover (but not slot_reset unless it really did reset the slot). If slot
+reset is not supported, link reset can be called instead on a slot reset.
+
+At first, the call will always be :
+
+       1) error_detected()
+
+        Error detected. This is sent once after an error has been detected. At
+this point, the device might not be accessible anymore depending on the
+platform (the slot will be isolated on ppc64). The driver may already
+have "noticed" the error because of a failing IO, but this is the proper
+"synchronisation point", that is, it gives a chance to the driver to
+cleanup, waiting for pending stuffs (timers, whatever, etc...) to
+complete, it can take semaphores, schedule, etc... everything but touch
+the device. Within this function and after it returns, the driver
+shouldn't do any new IOs. Called in task context. This is sort of a
+"quiesce" point. See note about interrupts at the end of this doc.
+
+        Result codes:
+                - PCIERR_RESULT_CAN_RECOVER:
+                  Return this if you think you might be able to recover
+                  the HW by just banging IOs or if you want to be given
+                  a chance to extract some diagnostic informations (see
+                  below).
+                - PCIERR_RESULT_NEED_RESET:
+                  Return this if you think you can't recover unless the
+                  slot is reset.
+                - PCIERR_RESULT_DISCONNECT:
+                  Return this if you think you won't recover at all,
+                  (this will detach the driver ? or just leave it
+                  dangling ? to be decided)
+
+
+So at this point, we have called error_detected() for all drivers
+on the segment that had the error. On ppc64, the slot is isolated. What
+happens now typically depends on the result from the drivers. If all
+drivers on the segment/slot return PCIERR_RESULT_CAN_RECOVER, we would
+re-enable IOs on the slot (or do nothing special if the platform doesn't
+isolate slots) and call 2). If not and we can reset slots, we go to 4),
+if neither, we have a dead slot. If it's an hotplug slot, we might
+"simulate" reset by triggering HW unplug/replug tho.
+
+        2) error_recover()
+
+        This is the "early recovery" call. IOs are allowed again, but DMA is
+not (hrm... to be discussed, I prefer not), with some restrictions. This
+is NOT a callback for the driver to start operations again, only to
+peek/poke at the device, extract diagnostic informations  if any, and
+eventually do things like trigger a device local reset or such things,
+but not restart operations. This is sent if all drivers on a segment
+agree that they can try to recover and no automatic link reset was performed
+by the HW. If the platform can't just re-enable IOs without a slot reset or a
+link reset, it doesn't call this callback and goes directly to 3) or 4). All IOs
+should be done _synchronously_ from within this callback, errors triggered by
+them will be returned via the normal pci_check_whatever() api, no new
+error_detected() callback will be issued due to an error happening here. However,
+such an error might cause IOs to be re-blocked for the whole segment, and thus
+invalidate the recovery that other devices on the same segment might have done,
+forcing the whole segment into one of the next states, that is link reset or
+slot reset.
+
+        Result codes:
+                - PCIERR_RESULT_RECOVERED
+                  Return this if you think your device is fully
+                  functional and think you are ready to start
+                  to do your normal driver job again. There is no
+                  guarantee that because you returned that, you'll be
+                  allowed to actually proceed as another driver on the
+                  same segment might have failed and thus triggered a
+                  slot reset on platforms that support it.
+
+                - PCIERR_RESULT_NEED_RESET
+                  Return this if you think your device is not
+                  recoverable in its current state and you need a slot
+                  reset to proceed.
+
+                - PCIERR_RESULT_DISCONNECT
+                  Same as above. Total failure, no recovery even after
+                  reset driver dead. (To be defined more precisely)
+                                                                   
+        3) link_reset()
+
+        This is called after the link has been reset. This is typically a
+PCI Express specific state at this point and is done whenever a non-fatal error
+has been detected that can be "solved" by resetting the link. The driver is
+informed here of that reset and should check if the device appears to be in
+working condition. This function acts a bit like 2) error_recover(), that is
+it is not supposed to restart normal driver IO operations right away, just
+"probe" the device to check its recoverability status. If all is right, then
+the core will call error_restart() once all drivers have ack'd link_reset().
+
+        Result codes:
+                (identical to error_recover)
+
+        4) slot_reset()
+
+        This is called after the slot has been hard reset (and PCI BARs
+re-configured by the platform). If the platform supports PCI hotplug,
+it can implement this by toggling power on the slot off/on. Drivers here
+have a chance to re-initialize the hardware (re-download firmware etc...),
+but drivers shouldn't restart normal IO processing operations at this point.
+(see note about interrupts, they aren't guaranteed to be delivered until the
+restart callback has been called). Upon success from this callback, the
+platform will call error_restart() to complete the error handling and let
+the driver restart normal IO request processing.
+
+However, a driver can still return a critical failure from here in case
+it just can't get its device back from reset. There is just nothing we
+can do about it tho. The driver will just be considered "dead" in this case.
+
+        Result codes:
+                - PCIERR_RESULT_DISCONNECT
+                Same as above.
+
+        5) error_restart()
+
+        This is called if all drivers on the segment have returned
+PCIERR_RESULT_RECOVERED from one of the 3 previous callbacks. That basically
+tells the driver to restart activity, everything is back & running. No result
+code is taken into account here. If a new error happens, it will restart
+a new error handling process.
+
+That's it. I think this covers all the possibilities. The way those
+callbacks are called is platform policy. A platform with no slot reset
+capability for example may want to just "ignore" drivers that can't
+recover (disconnect them) and try to let other cards on the same segment
+recover. Keep in mind that in most real life cases, though, there will
+be only one driver per segment.
+
+Now, there is a note about interrupts. If you get an interrupt and your
+device is dead or has been isolated, there is a problem :)
+
+After much thinking, I decided to leave that to the platform. That is,
+the recovery API only specifies that:
+
+ - There is no guarantee that interrupt delivery can proceed from any
+device on the segment starting from the error detection and until the
+restart callback is sent, at which point interrupts are expected to be
+fully operational.
+
+ - There is no guarantee that interrupt delivery is stopped, that is, a
+driver that gets an interrupt after detecting an error, or that detects
+an error within the interrupt handler such that it prevents proper
+ack'ing of the interrupt (and thus removal of the source) should just
+return IRQ_NOTHANDLED. It's up to the platform to deal with that
+condition, typically by masking the irq source during the duration of
+the error handling. It is expected that the platform "knows" which
+interrupts are routed to error-management capable slots and can deal
+with temporarily disabling that irq number during error processing (this
+isn't terribly complex). That means some IRQ latency for other devices
+sharing the interrupt, but there is simply no other way. High end
+platforms aren't supposed to share interrupts between many devices
+anyway :)
+
+
--- drivers/pci/Makefile.linas-orig	2005-04-29 20:31:33.000000000 -0500
+++ drivers/pci/Makefile	2005-05-06 12:28:43.000000000 -0500
@@ -3,7 +3,7 @@
 #
 
 obj-y		+= access.o bus.o probe.o remove.o pci.o quirks.o \
-			names.o pci-driver.o search.o pci-sysfs.o \
+			names.o pci-driver.o pci-error.o search.o pci-sysfs.o \
 			rom.o
 obj-$(CONFIG_PROC_FS) += proc.o
 
--- drivers/pci/pci-error.c.linas-orig	2005-05-06 17:44:47.000000000 -0500
+++ drivers/pci/pci-error.c	2005-05-06 16:56:02.000000000 -0500
@@ -0,0 +1,152 @@
+/*
+ * pci-error.c
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+
+#undef DEBUG
+
+/** Overview:
+ *  PEH, or "PCI Error Handling" is a PCI bridge technology for
+ *  dealing with PCI bus errors that can't be dealt with within the
+ *  usual PCI framework, except by check-stopping the CPU.  Systems
+ *  that are designed for high-availability/reliability cannot afford
+ *  to crash due to a "mere" PCI error, thus the need for PEH.
+ *  A PEH-capable bridge operates by converting a detected error
+ *  into a "slot freeze", taking the PCI adapter off-line, making
+ *  the slot behave, from the OS'es point of view, as if the slot
+ *  were "empty": all reads return 0xff's and all writes are silently
+ *  ignored.  PEH slot isolation events can be triggered by parity
+ *  errors on the address or data busses (e.g. during posted writes),
+ *  which in turn might be caused by low voltage on the bus, dust, 
+ *  vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ *  Note, however, that one of the leading causes of PEH slot
+ *  freeze events are buggy device drivers, buggy device microcode,
+ *  or buggy device hardware.  This is because any attempt by the
+ *  device to bus-master data to a memory address that is not
+ *  assigned to the device will trigger a slot freeze.   (The idea
+ *  is to prevent devices-gone-wild from corrupting system memory).
+ *  Buggy hardware/drivers will have a miserable time co-existing
+ *  with PEH.
+ */
+
+/* PEH event workqueue setup. */
+static spinlock_t peh_eventlist_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(peh_eventlist);
+static void peh_event_handler(void *);
+DECLARE_WORK(peh_event_wq, peh_event_handler, NULL);
+
+static struct notifier_block *peh_notifier_chain;
+
+/**
+ * peh_event_handler - dispatch queued PEH events to the notifier chain
+ * @dummy: unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt, where
+ * it can be hard to do anything about it.  This routine drains
+ * peh_eventlist and calls peh_notifier_chain for each event, so that
+ * listeners run at a later time in a normal (workqueue) context.
+ */
+static void peh_event_handler(void *dummy)
+{
+	unsigned long flags;
+	struct peh_event	*event;
+
+	while (1) {
+		spin_lock_irqsave(&peh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&peh_eventlist)) {	/* pop the entry at the list head */
+			event = list_entry(peh_eventlist.next, struct peh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&peh_eventlist_lock, flags);
+		if (event == NULL)	/* list drained: nothing left to deliver */
+			break;
+
+		printk(KERN_INFO "PEH: Detected PCI bus error on device "
+		       "%s %s\n", 
+		       pci_name(event->dev), pci_pretty_name(event->dev));
+
+		notifier_call_chain (&peh_notifier_chain,	/* lock is not held here */
+		           PEH_NOTIFY_ERROR, event);
+
+		pci_dev_put(event->dev);	/* drops one reference on the device */
+		kfree(event);	/* event was kmalloc'ed by peh_send_failure_event() */
+	}
+}
+
+
+/**
+ * peh_send_failure_event - generate a PCI error event
+ * @dev: affected pci device; a reference is held until delivery
+ * @state: PCI channel state to report for @dev
+ * @time_unavail: milliseconds until the device might be available
+ *
+ * Queues an event for delivery to all listeners on the
+ * peh_notifier_chain.  May be called in interrupt context; the
+ * event is delivered in a normal context (from a workqueue).
+ * Returns 0 on success, 1 if the event could not be allocated.
+ */
+int peh_send_failure_event (struct pci_dev *dev,
+                            enum pci_channel_state state,
+                            int time_unavail)
+{
+	unsigned long flags;
+	struct peh_event *event;
+
+	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	if (event == NULL) {
+		printk (KERN_ERR "PEH: out of memory, event not handled\n");
+		return 1;
+	}
+	/* Pin the device: peh_event_handler() does the matching pci_dev_put() */
+	event->dev = pci_dev_get(dev);
+	event->state = state;
+	event->time_unavail = time_unavail;
+
+	/* Append for FIFO delivery; we may or may not be in interrupt context */
+	spin_lock_irqsave(&peh_eventlist_lock, flags);
+	list_add_tail(&event->list, &peh_eventlist);
+	spin_unlock_irqrestore(&peh_eventlist_lock, flags);
+
+	schedule_work(&peh_event_wq);
+
+	return 0;
+}
+
+/**
+ * peh_register_notifier - Register to find out about PEH events.
+ * @nb: notifier block to add to peh_notifier_chain
+ */
+int peh_register_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_register(&peh_notifier_chain, nb);
+}
+
+/**
+ * peh_unregister_notifier - Unregister a PEH event notifier.
+ * @nb: notifier block to remove from peh_notifier_chain
+ */
+int peh_unregister_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&peh_notifier_chain, nb);
+}
+
+
--- drivers/scsi/ipr.c.linas-orig	2005-04-29 20:33:36.000000000 -0500
+++ drivers/scsi/ipr.c	2005-05-06 17:28:15.000000000 -0500
@@ -80,6 +80,11 @@
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_request.h>
+
+#ifdef CONFIG_PPC64
+#define CONFIG_SCSI_IPR_EEH
+#endif /* CONFIG_PPC64 */
+
 #include "ipr.h"
 
 /*
@@ -4993,6 +4998,7 @@ static int ipr_reset_start_bist(struct i
 	return rc;
 }
 
+
 /**
  * ipr_reset_allowed - Query whether or not IOA can be reset
  * @ioa_cfg:	ioa config struct
@@ -5306,6 +5312,69 @@ static void ipr_initiate_ioa_reset(struc
 				shutdown_type);
 }
 
+#ifdef CONFIG_SCSI_IPR_EEH
+
+/** If the PCI slot is frozen, hold off all i/o
+ *  activity; then, as soon as the slot is available again,
+ *  initiate an adapter reset.
+ */
+static int ipr_reset_freeze(struct ipr_cmnd *ipr_cmd)
+{
+	list_add_tail(&ipr_cmd->queue, &ipr_cmd->ioa_cfg->pending_q);
+	ipr_cmd->done = ipr_reset_ioa_job;
+	return IPR_RC_JOB_RETURN;
+}
+
+static void ipr_eeh_frozen (struct pci_dev *pdev)
+{
+	unsigned long flags = 0;
+	struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);
+
+	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+	_ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_freeze, IPR_SHUTDOWN_NONE);
+	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+}
+
+static int ipr_eeh_thawed (struct pci_dev *pdev)
+{
+	unsigned long flags = 0;
+	struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);
+
+	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+	_ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_restore_cfg_space, 
+	                                 IPR_SHUTDOWN_NONE);
+	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+
+	return PCIERR_RESULT_RECOVERED;
+}
+
+static void ipr_eeh_perm_failure (struct pci_dev *pdev)
+{
+#if 0  // XXXXXXXXXXXXXXXXXXXXXXX
+	ipr_cmd->job_step = ipr_reset_shutdown_ioa;
+	rc = IPR_RC_JOB_CONTINUE;
+#endif
+}
+
+static int ipr_eeh_error_detected (struct pci_dev *pdev, 
+                                enum pci_channel_state state)
+{
+	switch (state) {
+		case pci_channel_io_frozen:
+			ipr_eeh_frozen (pdev);
+			return PCIERR_RESULT_NEED_RESET;
+			
+		case pci_channel_io_perm_failure:
+			ipr_eeh_perm_failure (pdev);
+			return PCIERR_RESULT_DISCONNECT;
+			break;
+		default:
+			break;
+	}
+	return PCIERR_RESULT_NEED_RESET;
+}
+#endif
+
 /**
  * ipr_probe_ioa_part2 - Initializes IOAs found in ipr_probe_ioa(..)
  * @ioa_cfg:	ioa cfg struct
@@ -6015,6 +6084,10 @@ static struct pci_driver ipr_driver = {
 	.id_table = ipr_pci_table,
 	.probe = ipr_probe,
 	.remove = ipr_remove,
+	.err_handler = {
+		.error_detected = ipr_eeh_error_detected,
+		.slot_reset = ipr_eeh_thawed,
+	},
 	.driver = {
 		.shutdown = ipr_shutdown,
 	},
--- drivers/scsi/sym53c8xx_2/sym_glue.c.linas-orig	2005-04-29 20:33:12.000000000 -0500
+++ drivers/scsi/sym53c8xx_2/sym_glue.c	2005-05-06 16:55:02.000000000 -0500
@@ -49,6 +49,10 @@
 #include <scsi/scsi_transport.h>
 #include <scsi/scsi_transport_spi.h>
 
+#ifdef CONFIG_PPC64
+#define CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+#endif
+
 #include "sym_glue.h"
 #include "sym_nvram.h"
 
@@ -770,6 +774,10 @@ static irqreturn_t sym53c8xx_intr(int ir
 	struct sym_hcb *np = (struct sym_hcb *)dev_id;
 
 	if (DEBUG_FLAGS & DEBUG_TINY) printf_debug ("[");
+#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+	if (np->s.io_state != pci_channel_io_normal)
+		return IRQ_HANDLED;
+#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */
 
 	spin_lock_irqsave(np->s.host->host_lock, flags);
 	sym_interrupt(np);
@@ -844,6 +852,27 @@ static void sym_eh_done(struct scsi_cmnd
  */
 static void sym_eh_timeout(u_long p) { __sym_eh_done((struct scsi_cmnd *)p, 1); }
 
+#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+static void sym_eeh_timeout(u_long p) 
+{
+	struct sym_eh_wait *ep = (struct sym_eh_wait *) p;
+	if (!ep)
+		return;
+	complete(&ep->done);
+}
+
+static void sym_eeh_done(struct sym_eh_wait *ep)
+{
+	if (!ep)
+		return;
+	ep->timed_out = 0;
+	if (!del_timer(&ep->timer)) 
+		return;
+				
+	complete(&ep->done);
+}
+#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */
+
 /*
  *  Generic method for our eh processing.
  *  The 'op' argument tells what we have to do.
@@ -905,6 +934,35 @@ prepare:
 		sts = 0;
 		break;
 	case SYM_EH_HOST_RESET:
+#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+printk("duuuuuude attempting symbios recovery\n");
+dump_stack();
+		int rc = eeh_slot_is_isolated (np->s.device);
+
+printk ("duude symbios is isolated ??=%d\n", rc);
+printk ("duuude the current io state is %d\n", np->s.io_state);
+		if (rc) {
+			struct sym_eh_wait eeh, *eep = &eeh;
+			np->s.io_reset_wait = eep;
+			init_completion(&eep->done);
+			init_timer(&eep->timer);
+			eep->to_do = SYM_EH_DO_WAIT;
+			eep->timer.expires = jiffies + (10*HZ);
+			eep->timer.function = sym_eeh_timeout;
+			eep->timer.data = (u_long)eep;
+			eep->timed_out = 1;	/* Be pessimistic for once :) */
+			add_timer(&eep->timer);
+			spin_unlock_irq(np->s.host->host_lock);
+			wait_for_completion(&eep->done);
+			spin_lock_irq(np->s.host->host_lock);
+			if (eep->timed_out) {
+printk ("duude symbios timed out\n");
+			} else {
+printk ("duude symbios waited for completion\n");
+			}
+			np->s.io_reset_wait = NULL;
+		}
+#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */
 		sym_reset_scsi_bus(np, 0);
 		sym_start_up (np, 1);
 		sts = 0;
@@ -1577,6 +1635,30 @@ static int sym_setup_bus_dma_mask(struct
 	return -1;
 }
 
+#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+int sym2_io_error_detected (struct pci_dev *pdev, enum pci_channel_state state)
+{
+	struct sym_hcb *np = pci_get_drvdata(pdev);
+printk ("duude symbios got this state change %d jiffies=%ld\n", state, jiffies);
+
+	np->s.io_state = state;
+	// XXX if perm frozen, then ...?
+
+	return 0;
+}
+
+int sym2_io_slot_reset (struct pci_dev *pdev)
+{
+	struct sym_hcb *np = pci_get_drvdata(pdev);
+printk ("duude symbios got slot reset done jiffies=%ld\n", jiffies);
+
+	np->s.io_state = pci_channel_io_normal;
+	sym_eeh_done (np->s.io_reset_wait);
+
+	return 0;
+}
+#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */
+	
 /*
  *  Host attach and initialisations.
  *
@@ -1625,6 +1707,8 @@ static struct Scsi_Host * __devinit sym_
 	if (!np)
 		goto attach_failed;
 	np->s.device = dev->pdev;
+	np->s.io_state = pci_channel_io_normal;
+	np->s.io_reset_wait = NULL;
 	np->bus_dmat = dev->pdev; /* Result in 1 DMA pool per HBA */
 	host_data->ncb = np;
 	np->s.host = instance;
@@ -2359,6 +2443,10 @@ static struct pci_driver sym2_driver = {
 	.id_table	= sym2_id_table,
 	.probe		= sym2_probe,
 	.remove		= __devexit_p(sym2_remove),
+	.err_handler = {
+		.error_detected = sym2_io_error_detected,
+		.slot_reset = sym2_io_slot_reset,
+	},
 };
 
 static int __init sym2_init(void)
--- drivers/scsi/sym53c8xx_2/sym_glue.h.linas-orig	2005-04-29 20:32:45.000000000 -0500
+++ drivers/scsi/sym53c8xx_2/sym_glue.h	2005-05-06 16:29:39.000000000 -0500
@@ -358,6 +358,10 @@ struct sym_shcb {
 	char		chip_name[8];
 	struct pci_dev	*device;
 
+	/* pci bus i/o state; waiter for clearing of i/o state */
+	enum pci_channel_state io_state;
+	struct sym_eh_wait *io_reset_wait;
+
 	struct Scsi_Host *host;
 
 	void __iomem *	mmio_va;	/* MMIO kernel virtual address	*/
--- drivers/scsi/sym53c8xx_2/sym_hipd.c.linas-orig	2005-04-29 20:22:45.000000000 -0500
+++ drivers/scsi/sym53c8xx_2/sym_hipd.c	2005-05-06 12:28:43.000000000 -0500
@@ -2836,6 +2836,7 @@ void sym_interrupt (struct sym_hcb *np)
 	u_char	istat, istatc;
 	u_char	dstat;
 	u_short	sist;
+	u_int    icnt;
 
 	/*
 	 *  interrupt on the fly ?
@@ -2877,6 +2878,7 @@ void sym_interrupt (struct sym_hcb *np)
 	sist	= 0;
 	dstat	= 0;
 	istatc	= istat;
+	icnt = 0;
 	do {
 		if (istatc & SIP)
 			sist  |= INW (nc_sist);
@@ -2884,6 +2886,14 @@ void sym_interrupt (struct sym_hcb *np)
 			dstat |= INB (nc_dstat);
 		istatc = INB (nc_istat);
 		istat |= istatc;
+		icnt ++;
+		if (100 < icnt) {
+#define CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+#ifdef CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY
+			if(eeh_slot_is_isolated (np->s.device))
+				return;
+#endif /* CONFIG_SCSI_SYM53C8XX_EEH_RECOVERY */
+		}
 	} while (istatc & (SIP|DIP));
 
 	if (DEBUG_FLAGS & DEBUG_TINY)
--- include/asm-ppc64/eeh.h.linas-orig	2005-04-29 20:34:03.000000000 -0500
+++ include/asm-ppc64/eeh.h	2005-05-06 12:28:43.000000000 -0500
@@ -23,6 +23,7 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/notifier.h>
 #include <linux/string.h>
 
 struct pci_dev;
@@ -36,6 +37,11 @@ struct notifier_block;
 #define EEH_MODE_SUPPORTED	(1<<0)
 #define EEH_MODE_NOCHECK	(1<<1)
 #define EEH_MODE_ISOLATED	(1<<2)
+#define EEH_MODE_RECOVERING	(1<<3)
+
+/* Max number of EEH freezes allowed before we consider the device
+ * to be permanently disabled. */
+#define EEH_MAX_ALLOWED_FREEZES 5
 
 void __init eeh_init(void);
 unsigned long eeh_check_failure(const volatile void __iomem *token,
@@ -59,35 +65,82 @@ void eeh_add_device_late(struct pci_dev 
  * eeh_remove_device - undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
- * This routine should be when a device is removed from a running
- * system (e.g. by hotplug or dlpar).
+ * This routine should be called when a device is removed from 
+ * a running system (e.g. by hotplug or dlpar).  It unregisters 
+ * the PCI device from the EEH subsystem.  I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
  */
 void eeh_remove_device(struct pci_dev *);
 
-#define EEH_DISABLE		0
-#define EEH_ENABLE		1
-#define EEH_RELEASE_LOADSTORE	2
-#define EEH_RELEASE_DMA		3
+/**
+ * eeh_slot_is_isolated -- return non-zero value if slot is frozen
+ */
+int eeh_slot_is_isolated (struct pci_dev *dev);
 
 /**
- * Notifier event flags.
+ * eeh_ioaddr_is_isolated -- return non-zero value if device at 
+ * io address is frozen.
  */
-#define EEH_NOTIFY_FREEZE  1
+int eeh_ioaddr_is_isolated(const volatile void __iomem *token);
 
-/** EEH event -- structure holding pci slot data that describes
- *  a change in the isolation status of a PCI slot.  A pointer
- *  to this struct is passed as the data pointer in a notify callback.
- */
-struct eeh_event {
-	struct list_head     list;
-	struct pci_dev       *dev;
-	struct device_node   *dn;
-	int                  reset_state;
-};
-
-/** Register to find out about EEH events. */
-int eeh_register_notifier(struct notifier_block *nb);
-int eeh_unregister_notifier(struct notifier_block *nb);
+/**
+ * eeh_slot_error_detail -- record an EEH error condition to the log
+ * @severity: 1 if temporary, 2 if permanent failure.
+ *
+ * Obtains the EEH error details from the RTAS subsystem,
+ * and then logs these details with the RTAS error log system.
+ */
+void eeh_slot_error_detail (struct device_node *dn, int severity);
+
+/** 
+ * rtas_set_slot_reset -- unfreeze a frozen slot
+ *
+ * Clear the EEH-frozen condition on a slot.  This routine
+ * does this by asserting the PCI #RST line for 1/8th of 
+ * a second; this routine will sleep while the adapter is 
+ * being reset.
+ */
+void rtas_set_slot_reset (struct device_node *dn);
+
+/** rtas_pci_slot_reset raises/lowers the pci #RST line
+ *  state: 1/0 to raise/lower the #RST
+ *
+ * Clear the EEH-frozen condition on a slot.  This routine
+ * asserts the PCI #RST line if the 'state' argument is '1',
+ * and drops the #RST line if 'state' is '0'.  This routine is
+ * safe to call in an interrupt context.
+ *
+ */
+void rtas_pci_slot_reset(struct device_node *dn, int state);
+void eeh_pci_slot_reset(struct pci_dev *dev, int state);
+
+/** eeh_pci_slot_availability -- Indicates whether a PCI
+ *  slot is ready to be used. After a PCI reset, it may take a while 
+ *  for the PCI fabric to fully reset the communications path to the
+ *  given PCI card.  This routine can be used to determine how long
+ *  to wait before a PCI slot might become usable.  
+ *
+ *  This routine returns how long to wait (in milliseconds) before
+ *  the slot is expected to be usable.  A value of zero means the
+ *  slot is immediately usable. A negative value means that the
+ *  slot is permanently disabled.
+ */
+int eeh_pci_slot_availability(struct pci_dev *dev);
+
+/** Restore device configuration info across device resets.
+ */
+void eeh_restore_bars(struct device_node *);
+void eeh_pci_restore_bars(struct pci_dev *dev);
+
+/**
+ * rtas_configure_bridge -- firmware initialization of pci bridge
+ * 
+ * Ask the firmware to configure any PCI bridge devices 
+ * located behind the indicated node. Required after a 
+ * pci device reset.
+ */
+void rtas_configure_bridge(struct device_node *dn);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
--- include/asm-ppc64/prom.h.linas-orig	2005-04-29 20:32:46.000000000 -0500
+++ include/asm-ppc64/prom.h	2005-05-06 12:28:43.000000000 -0500
@@ -119,6 +119,7 @@ struct property {
  */
 struct pci_controller;
 struct iommu_table;
+struct eeh_recovery_ops;
 
 struct device_node {
 	char	*name;
@@ -137,8 +138,12 @@ struct device_node {
 	int	devfn;			/* for pci devices */
 	int	eeh_mode;		/* See eeh.h for possible EEH_MODEs */
 	int	eeh_config_addr;
+	int   eeh_check_count;    /* number of times device driver ignored error */
+	int	eeh_freeze_count;   /* number of times this device froze up. */
+	int   eeh_is_bridge;      /* device is pci-to-pci bridge */
 	struct  pci_controller *phb;	/* for pci devices */
 	struct	iommu_table *iommu_table;	/* for phb's or bridges */
+	u32      config_space[16]; /* saved PCI config space */
 
 	struct	property *properties;
 	struct	device_node *parent;
--- include/asm-ppc64/rtas.h.linas-orig	2005-04-29 20:32:32.000000000 -0500
+++ include/asm-ppc64/rtas.h	2005-05-06 12:28:43.000000000 -0500
@@ -243,4 +243,6 @@ extern unsigned long rtas_rmo_buf;
 
 #define GLOBAL_INTERRUPT_QUEUE 9005
 
+extern int rtas_write_config(struct device_node *dn, int where, int size, u32 val);
+
 #endif /* _PPC64_RTAS_H */
--- arch/ppc64/kernel/eeh.c.linas-orig	2005-04-29 20:29:19.000000000 -0500
+++ arch/ppc64/kernel/eeh.c	2005-05-06 16:52:39.000000000 -0500
@@ -17,16 +17,17 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#include <linux/bootmem.h>
+#include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 #include <linux/list.h>
-#include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/pci.h>
 #include <linux/proc_fs.h>
 #include <linux/rbtree.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
+#include <asm/atomic.h>
 #include <asm/eeh.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
@@ -49,8 +50,8 @@
  *  were "empty": all reads return 0xff's and all writes are silently
  *  ignored.  EEH slot isolation events can be triggered by parity
  *  errors on the address or data busses (e.g. during posted writes),
- *  which in turn might be caused by dust, vibration, humidity,
- *  radioactivity or plain-old failed hardware.
+ *  which in turn might be caused by low voltage on the bus, dust, 
+ *  vibration, humidity, radioactivity or plain-old failed hardware.
  *
  *  Note, however, that one of the leading causes of EEH slot
  *  freeze events are buggy device drivers, buggy device microcode,
@@ -75,22 +76,13 @@
 #define BUID_HI(buid) ((buid) >> 32)
 #define BUID_LO(buid) ((buid) & 0xffffffff)
 
-/* EEH event workqueue setup. */
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
-LIST_HEAD(eeh_eventlist);
-static void eeh_event_handler(void *);
-DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
-
-static struct notifier_block *eeh_notifier_chain;
-
 /*
  * If a device driver keeps reading an MMIO register in an interrupt
  * handler after a slot isolation event has occurred, we assume it
  * is broken and panic.  This sets the threshold for how many read
  * attempts we allow before panicking.
  */
-#define EEH_MAX_FAILS	1000
-static atomic_t eeh_fail_count;
+#define EEH_MAX_FAILS	100000
 
 /* RTAS tokens */
 static int ibm_set_eeh_option;
@@ -107,6 +99,10 @@ static DEFINE_SPINLOCK(slot_errbuf_lock)
 static int eeh_error_buf_size;
 
 /* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, no_device);
+static DEFINE_PER_CPU(unsigned long, no_dn);
+static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
+static DEFINE_PER_CPU(unsigned long, ignored_check);
 static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
 static DEFINE_PER_CPU(unsigned long, false_positives);
 static DEFINE_PER_CPU(unsigned long, ignored_failures);
@@ -225,9 +221,9 @@ pci_addr_cache_insert(struct pci_dev *de
 	while (*p) {
 		parent = *p;
 		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
-		if (alo < piar->addr_lo) {
+		if (ahi < piar->addr_lo) {
 			p = &parent->rb_left;
-		} else if (ahi > piar->addr_hi) {
+		} else if (alo > piar->addr_hi) {
 			p = &parent->rb_right;
 		} else {
 			if (dev != piar->pcidev ||
@@ -245,6 +241,11 @@ pci_addr_cache_insert(struct pci_dev *de
 	piar->addr_hi = ahi;
 	piar->pcidev = dev;
 	piar->flags = flags;
+	
+#ifdef DEBUG 
+	printk (KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", 
+	               alo, ahi, pci_name (dev));
+#endif
 
 	rb_link_node(&piar->rb_node, parent, p);
 	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -369,8 +370,12 @@ void pci_addr_cache_remove_device(struct
  */
 void __init pci_addr_cache_build(void)
 {
+	struct device_node *dn;
 	struct pci_dev *dev = NULL;
 
+	if (!eeh_subsystem_enabled)
+		return;
+
 	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -379,6 +384,17 @@ void __init pci_addr_cache_build(void)
 			continue;
 		}
 		pci_addr_cache_insert_device(dev);
+		
+		/* Save the BAR's; firmware doesn't restore these after EEH reset */
+		dn = pci_device_to_OF_node(dev);
+		if (dn) {
+			int i;
+			for (i = 0; i < 16; i++)
+				pci_read_config_dword(dev, i * 4, &dn->config_space[i]);
+
+			if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+				dn->eeh_is_bridge = 1;
+		}
 	}
 
 #ifdef DEBUG
@@ -390,24 +406,32 @@ void __init pci_addr_cache_build(void)
 /* --------------------------------------------------------------- */
 /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
 
-/**
- * eeh_register_notifier - Register to find out about EEH events.
- * @nb: notifier block to callback on events
- */
-int eeh_register_notifier(struct notifier_block *nb)
+void eeh_slot_error_detail (struct device_node *dn, int severity)
 {
-	return notifier_chain_register(&eeh_notifier_chain, nb);
-}
+	unsigned long flags;
+	int rc;
 
-/**
- * eeh_unregister_notifier - Unregister to an EEH event notifier.
- * @nb: notifier block to callback on events
- */
-int eeh_unregister_notifier(struct notifier_block *nb)
-{
-	return notifier_chain_unregister(&eeh_notifier_chain, nb);
+	if (!dn) return;
+
+	/* Log the error with the rtas logger */
+	spin_lock_irqsave(&slot_errbuf_lock, flags);
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	rc = rtas_call(ibm_slot_error_detail,
+	               8, 1, NULL, dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid), NULL, 0,
+	               virt_to_phys(slot_errbuf),
+	               eeh_error_buf_size,
+	               severity);
+
+	if (rc == 0)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
 }
 
+EXPORT_SYMBOL(eeh_slot_error_detail);
+
 /**
  * read_slot_reset_state - Read the reset state of a device node's slot
  * @dn: device node to read
@@ -422,6 +446,7 @@ static int read_slot_reset_state(struct 
 		outputs = 4;
 	} else {
 		token = ibm_read_slot_reset_state;
+		rets[2] = 0; /* fake PE Unavailable info */
 		outputs = 3;
 	}
 
@@ -430,75 +455,8 @@ static int read_slot_reset_state(struct 
 }
 
 /**
- * eeh_panic - call panic() for an eeh event that cannot be handled.
- * The philosophy of this routine is that it is better to panic and
- * halt the OS than it is to risk possible data corruption by
- * oblivious device drivers that don't know better.
- *
- * @dev pci device that had an eeh event
- * @reset_state current reset state of the device slot
- */
-static void eeh_panic(struct pci_dev *dev, int reset_state)
-{
-	/*
-	 * XXX We should create a separate sysctl for this.
-	 *
-	 * Since the panic_on_oops sysctl is used to halt the system
-	 * in light of potential corruption, we can use it here.
-	 */
-	if (panic_on_oops)
-		panic("EEH: MMIO failure (%d) on device:%s %s\n", reset_state,
-		      pci_name(dev), pci_pretty_name(dev));
-	else {
-		__get_cpu_var(ignored_failures)++;
-		printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s %s\n",
-		       reset_state, pci_name(dev), pci_pretty_name(dev));
-	}
-}
-
-/**
- * eeh_event_handler - dispatch EEH events.  The detection of a frozen
- * slot can occur inside an interrupt, where it can be hard to do
- * anything about it.  The goal of this routine is to pull these
- * detection events out of the context of the interrupt handler, and
- * re-dispatch them for processing at a later time in a normal context.
- *
- * @dummy - unused
- */
-static void eeh_event_handler(void *dummy)
-{
-	unsigned long flags;
-	struct eeh_event	*event;
-
-	while (1) {
-		spin_lock_irqsave(&eeh_eventlist_lock, flags);
-		event = NULL;
-		if (!list_empty(&eeh_eventlist)) {
-			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
-			list_del(&event->list);
-		}
-		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
-		if (event == NULL)
-			break;
-
-		printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device "
-		       "%s %s\n", event->reset_state,
-		       pci_name(event->dev), pci_pretty_name(event->dev));
-
-		atomic_set(&eeh_fail_count, 0);
-		notifier_call_chain (&eeh_notifier_chain,
-				     EEH_NOTIFY_FREEZE, event);
-
-		__get_cpu_var(slot_resets)++;
-
-		pci_dev_put(event->dev);
-		kfree(event);
-	}
-}
-
-/**
- * eeh_token_to_phys - convert EEH address token to phys address
- * @token i/o token, should be address in the form 0xE....
+ * eeh_token_to_phys - convert I/O address to phys address
+ * @token i/o address, should be address in the form 0xA....
  */
 static inline unsigned long eeh_token_to_phys(unsigned long token)
 {
@@ -513,6 +471,18 @@ static inline unsigned long eeh_token_to
 	return pa | (token & (PAGE_SIZE-1));
 }
 
+
+static inline struct pci_dev * eeh_find_pci_dev(struct device_node *dn)
+{
+	struct pci_dev *dev = NULL;
+	for_each_pci_dev(dev) {
+		if (pci_device_to_OF_node(dev) == dn)
+			return dev;
+	}
+	return NULL;
+}
+
+
 /**
  * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
  * @dn device node
@@ -528,29 +498,33 @@ static inline unsigned long eeh_token_to
  *
  * It is safe to call this routine in an interrupt context.
  */
+extern void disable_irq_nosync(unsigned int);
+
 int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 {
 	int ret;
 	int rets[3];
-	unsigned long flags;
-	int rc, reset_state;
-	struct eeh_event  *event;
+	enum pci_channel_state state;
 
 	__get_cpu_var(total_mmio_ffs)++;
 
 	if (!eeh_subsystem_enabled)
 		return 0;
 
-	if (!dn)
+	if (!dn) {
+		__get_cpu_var(no_dn)++;
 		return 0;
+	}
 
 	/* Access to IO BARs might get this far and still not want checking. */
 	if (!(dn->eeh_mode & EEH_MODE_SUPPORTED) ||
 	    dn->eeh_mode & EEH_MODE_NOCHECK) {
+		__get_cpu_var(ignored_check)++;
 		return 0;
 	}
 
 	if (!dn->eeh_config_addr) {
+		__get_cpu_var(no_cfg_addr)++;
 		return 0;
 	}
 
@@ -559,12 +533,18 @@ int eeh_dn_check_failure(struct device_n
 	 * slot, we know it's bad already, we don't need to check...
 	 */
 	if (dn->eeh_mode & EEH_MODE_ISOLATED) {
-		atomic_inc(&eeh_fail_count);
-		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+		dn->eeh_check_count ++;
+		if (dn->eeh_check_count >= EEH_MAX_FAILS) {
+			printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
+			        dn->eeh_check_count);
+			dump_stack();
 			/* re-read the slot reset state */
 			if (read_slot_reset_state(dn, rets) != 0)
 				rets[0] = -1;	/* reset state unknown */
-			eeh_panic(dev, rets[0]);
+
+			/* If we are here, then we hit an infinite loop. Stop. */
+			panic("EEH: MMIO halt (%d) on device:%s %s\n", rets[0],
+		      pci_name(dev), pci_pretty_name(dev));
 		}
 		return 0;
 	}
@@ -577,53 +557,41 @@ int eeh_dn_check_failure(struct device_n
 	 * In any case they must share a common PHB.
 	 */
 	ret = read_slot_reset_state(dn, rets);
-	if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) {
+	if (!(ret == 0 && ((rets[1] == 1 && (rets[0] == 2 || rets[0] >= 4))
+	                   || (rets[0] == 5)))) {
 		__get_cpu_var(false_positives)++;
 		return 0;
 	}
 
-	/* prevent repeated reports of this failure */
-	dn->eeh_mode |= EEH_MODE_ISOLATED;
-
-	reset_state = rets[0];
+	/* Note that empty slots will fail; empty slots don't have children... */
+	if ((rets[0] == 5) && (dn->child == NULL)) {
+		__get_cpu_var(false_positives)++;
+		return 0;
+	}
 
-	spin_lock_irqsave(&slot_errbuf_lock, flags);
-	memset(slot_errbuf, 0, eeh_error_buf_size);
+	/* Prevent repeated reports of this failure */
+	dn->eeh_mode |= EEH_MODE_ISOLATED;
+	__get_cpu_var(slot_resets)++;
 
-	rc = rtas_call(ibm_slot_error_detail,
-	               8, 1, NULL, dn->eeh_config_addr,
-	               BUID_HI(dn->phb->buid),
-	               BUID_LO(dn->phb->buid), NULL, 0,
-	               virt_to_phys(slot_errbuf),
-	               eeh_error_buf_size,
-	               1 /* Temporary Error */);
+	if (!dev)
+		dev = eeh_find_pci_dev (dn);
 
-	if (rc == 0)
-		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-	spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+	/* Some devices go crazy if irq's are not ack'ed; disable irq now */
+	if (dev)
+		disable_irq_nosync (dev->irq);
+	
+	state = pci_channel_io_normal;
+	if ((rets[0] == 2) || (rets[0] == 4)) 
+		state = pci_channel_io_frozen;
+	if (rets[0] == 5) 
+		state = pci_channel_io_perm_failure;
 
-	printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
-	       rets[0], dn->name, dn->full_name);
-	event = kmalloc(sizeof(*event), GFP_ATOMIC);
-	if (event == NULL) {
-		eeh_panic(dev, reset_state);
-		return 1;
- 	}
-
-	event->dev = dev;
-	event->dn = dn;
-	event->reset_state = reset_state;
-
-	/* We may or may not be called in an interrupt context */
-	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-	list_add(&event->list, &eeh_eventlist);
-	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+	peh_send_failure_event (dev, state, rets[2]);
 
 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
 	 * out what happened.  So print that out. */
-	dump_stack();
-	schedule_work(&eeh_event_wq);
+	if (rets[0] != 5) dump_stack();
 
 	return 0;
 }
@@ -635,7 +603,6 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  * @token i/o token, should be address in the form 0xA....
  * @val value, should be all 1's (XXX why do we need this arg??)
  *
- * Check for an eeh failure at the given token address.
  * Check for an EEH failure at the given token address.  Call this
  * routine if the result of a read was all 0xff's and you want to
  * find out if this is due to an EEH slot freeze event.  This routine
@@ -643,6 +610,7 @@ EXPORT_SYMBOL(eeh_dn_check_failure);
  *
  * Note this routine is safe to call in an interrupt context.
  */
+
 unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
 {
 	unsigned long addr;
@@ -652,8 +620,10 @@ unsigned long eeh_check_failure(const vo
 	/* Finding the phys addr + pci device; this is pretty quick. */
 	addr = eeh_token_to_phys((unsigned long __force) token);
 	dev = pci_get_device_by_addr(addr);
-	if (!dev)
+	if (!dev) {
+		__get_cpu_var(no_device)++;
 		return val;
+	}
 
 	dn = pci_device_to_OF_node(dev);
 	eeh_dn_check_failure (dn, dev);
@@ -664,6 +634,249 @@ unsigned long eeh_check_failure(const vo
 
 EXPORT_SYMBOL(eeh_check_failure);
 
+/* ------------------------------------------------------------- */
+/* The code below deals with error recovery */
+
+/* Nonzero if dev's slot is marked EEH-isolated; 0 if not, or if dev has no OF node. */
+int eeh_slot_is_isolated(struct pci_dev *dev)
+{
+	struct device_node *dn = pci_device_to_OF_node(dev);
+
+	return dn ? (dn->eeh_mode & EEH_MODE_ISOLATED) : 0;
+}
+
+int
+eeh_ioaddr_is_isolated(const volatile void __iomem *token)
+{ 
+	unsigned long addr;
+	struct pci_dev *dev;
+	int rc;
+
+	addr = eeh_token_to_phys((unsigned long __force) token);
+	dev = pci_get_device_by_addr(addr);
+	if (!dev)
+		return 0;
+	rc = eeh_slot_is_isolated(dev);
+	pci_dev_put(dev);
+	return rc;
+}
+
+/** eeh_pci_slot_reset -- raises/lowers the pci #RST line
+ *  state: 1/0 to raise/lower the #RST 
+ */
+void
+eeh_pci_slot_reset(struct pci_dev *dev, int state)
+{
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	rtas_pci_slot_reset (dn, state);
+}
+
+/** Return negative value if a permanent error, else return 
+ * a number of milliseconds to wait until the PCI slot is 
+ * ready to be used.
+ */
+static int
+eeh_slot_availability(struct device_node *dn)
+{
+	int rc;
+	int rets[3];
+
+	rc = read_slot_reset_state(dn, rets);
+	/* On failure rets[] may never have been written, so bail out before reading it. */
+	if (rc)
+		return rc;
+
+	if (rets[1] == 0) return -1;  /* EEH is not supported */
+	if (rets[0] == 0)  return 0;  /* Oll Korrect */
+	if (rets[0] == 5) {
+		if (rets[2] == 0) return -1; /* permanently unavailable */
+		return rets[2]; /* number of millisecs to wait */
+	}
+	return -1;
+}
+
+/* Milliseconds until dev's slot is usable; 0 if usable now; negative if unusable. */
+int eeh_pci_slot_availability(struct pci_dev *dev)
+{
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	if (!dn) return -1;
+
+	/* Without a PHB the slot cannot be queried: log it and report failure. */
+	if (dn->phb==NULL) {
+		printk (KERN_ERR "EEH, checking on slot with no phb dn=%s dev=%s:%s\n",
+		       dn->full_name, pci_name(dev), pci_pretty_name (dev));
+		return -1;
+	}
+	return eeh_slot_availability (dn);
+}
+
+void
+rtas_pci_slot_reset(struct device_node *dn, int state)
+{
+	int rc;
+
+	if (!dn)
+		return;
+	if (!dn->phb) {
+		printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",                    dn->full_name);
+		return;
+	}
+
+	dn->eeh_mode |= EEH_MODE_RECOVERING;
+	rc = rtas_call(ibm_set_slot_reset,4,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid),
+	               state);
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to reset the failed slot, (%d) #RST=%d\n", rc, state);
+		return;
+	}
+
+	if (state == 0)
+		dn->eeh_mode &= ~(EEH_MODE_RECOVERING|EEH_MODE_ISOLATED);
+}
+
+/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second 
+ *  dn -- device node to be reset.
+ */
+
+void
+rtas_set_slot_reset(struct device_node *dn)
+{
+	int i, rc;
+
+printk ("duude going to reset device %s\n", dn->full_name);
+eeh_slot_availability(dn);
+	rtas_pci_slot_reset (dn, 1);
+
+	/* The PCI bus requires that the reset be held high for at least
+	 * 100 milliseconds. We wait a bit longer 'just in case'.  */
+
+#define PCI_BUS_RST_HOLD_TIME_MSEC 250
+	msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
+	rtas_pci_slot_reset (dn, 0); 
+	
+	/* After a PCI slot has been reset, the PCI Express spec requires
+	 * a 1.5 second idle time for the bus to stabilize, before starting 
+	 * up traffic. */
+#define PCI_BUS_SETTLE_TIME_MSEC 1800
+	msleep (PCI_BUS_SETTLE_TIME_MSEC);
+
+	/* Now double check with the firmware to make sure the device is
+	 * ready to be used; if not, wait for recovery. */
+	for (i=0; i<10; i++) {
+		rc = eeh_slot_availability (dn);
+		if (rc <= 0) return;
+
+		msleep (rc+100);
+	}
+eeh_slot_availability (dn);
+printk ("duuude WTFFFFFFFFFFFFFFFFFFFFFFF  done reseting %s\n", dn->full_name);
+extern int rtas_read_config(struct device_node *dn, int where, int size, u32 *val);
+u32 val;
+for(i=0;i<16;i++) {
+rc =  rtas_read_config (dn, i*4,4,&val);
+printk ("duude read config %d rc=%d val=%x expect=%x\n", i, rc, val,dn->config_space[i]);
+}
+		  
+}
+
+EXPORT_SYMBOL(rtas_set_slot_reset);
+
+void
+rtas_configure_bridge(struct device_node *dn)
+{
+	int token = rtas_token ("ibm,configure-bridge");
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+	rc = rtas_call(token,3,1, NULL,
+	               dn->eeh_config_addr,
+	               BUID_HI(dn->phb->buid),
+	               BUID_LO(dn->phb->buid));
+	if (rc) {
+		printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", 
+		        rc, dn->full_name);
+	}
+}
+
+EXPORT_SYMBOL(rtas_configure_bridge);
+
+/* ------------------------------------------------------- */
+/** Save and restore of PCI BARs
+ *
+ * Although firmware will set up BARs during boot, it doesn't
+ * set up device BAR's after a device reset, although it will,
+ * if requested, set up bridge configuration. Thus, we need to
+ * configure the PCI devices ourselves.  Config-space setup is
+ * stored in the PCI structures which are normally deleted during
+ * device removal.  Thus, the "save" routine references the
+ * structures so that they aren't deleted.
+ */
+
+/**
+ * __restore_bars - Restore the Base Address Registers
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, etc.
+ * from the saved values in the device node.
+ */
+static inline void __restore_bars (struct device_node *dn)
+{
+	int i;
+
+	if (NULL==dn->phb) return;
+	for (i=4; i<10; i++) {
+		rtas_write_config(dn, i*4, 4, dn->config_space[i]);
+	}
+
+	/* 12 == Expansion ROM Address */
+	rtas_write_config(dn, 12*4, 4, dn->config_space[12]);
+	
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(dn->config_space))[BYTE_SWAP(OFF)])
+	
+	rtas_write_config (dn, PCI_CACHE_LINE_SIZE, 1,
+	            SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+	
+	rtas_write_config (dn, PCI_LATENCY_TIMER, 1,
+	            SAVED_BYTE(PCI_LATENCY_TIMER));
+	
+	/* max latency, min grant, interrupt pin and line */
+	rtas_write_config(dn, 15*4, 4, dn->config_space[15]);
+}
+
+/**
+ * eeh_restore_bars - restore the PCI config space info
+ */
+void eeh_restore_bars(struct device_node *dn)
+{
+	if (! dn->eeh_is_bridge)
+		__restore_bars (dn);
+	
+	if (dn->child)
+		eeh_restore_bars (dn->child);
+#if DO_SIBLINGS
+	if (dn->sibling)
+		eeh_restore_bars (dn->sibling);
+#endif
+}
+
+void eeh_pci_restore_bars(struct pci_dev *dev)
+{
+	struct device_node *dn = pci_device_to_OF_node(dev);
+	if (dn) eeh_restore_bars (dn);	/* no OF node => no saved config space */
+}
+
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during  the
+ * early boot sequence.  EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
+#define EEH_ENABLE 1
+
 struct eeh_early_enable_info {
 	unsigned int buid_hi;
 	unsigned int buid_lo;
@@ -682,6 +895,8 @@ static void *early_enable_eeh(struct dev
 	int enable;
 
 	dn->eeh_mode = 0;
+	dn->eeh_check_count = 0;
+	dn->eeh_freeze_count = 0;
 
 	if (status && strcmp(status, "ok") != 0)
 		return NULL;	/* ignore devices with bad status */
@@ -743,7 +958,7 @@ static void *early_enable_eeh(struct dev
 		       dn->full_name);
 	}
 
-	return NULL; 
+	return NULL;
 }
 
 /*
@@ -824,11 +1039,13 @@ void eeh_add_device_early(struct device_
 	struct pci_controller *phb;
 	struct eeh_early_enable_info info;
 
-	if (!dn || !eeh_subsystem_enabled)
+	if (!dn)
 		return;
 	phb = dn->phb;
 	if (NULL == phb || 0 == phb->buid) {
-		printk(KERN_WARNING "EEH: Expected buid but found none\n");
+		printk(KERN_WARNING "EEH: Expected buid but found none for %s\n",
+		                dn->full_name);
+		dump_stack();
 		return;
 	}
 
@@ -847,6 +1064,9 @@ EXPORT_SYMBOL(eeh_add_device_early);
  */
 void eeh_add_device_late(struct pci_dev *dev)
 {
+	int i;
+	struct device_node *dn;
+
 	if (!dev || !eeh_subsystem_enabled)
 		return;
 
@@ -856,6 +1076,14 @@ void eeh_add_device_late(struct pci_dev 
 #endif
 
 	pci_addr_cache_insert_device (dev);
+
+	/* Save the BAR's; firmware doesn't restore these after EEH reset */
+	dn = pci_device_to_OF_node(dev);
+	for (i = 0; i < 16; i++)
+		pci_read_config_dword(dev, i * 4, &dn->config_space[i]);
+
+	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+		dn->eeh_is_bridge = 1;
 }
 EXPORT_SYMBOL(eeh_add_device_late);
 
@@ -885,12 +1113,17 @@ static int proc_eeh_show(struct seq_file
 	unsigned int cpu;
 	unsigned long ffs = 0, positives = 0, failures = 0;
 	unsigned long resets = 0;
+	unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0;
 
 	for_each_cpu(cpu) {
 		ffs += per_cpu(total_mmio_ffs, cpu);
 		positives += per_cpu(false_positives, cpu);
 		failures += per_cpu(ignored_failures, cpu);
 		resets += per_cpu(slot_resets, cpu);
+		no_dev += per_cpu(no_device, cpu);
+		no_dn += per_cpu(no_dn, cpu);
+		no_cfg += per_cpu(no_cfg_addr, cpu);
+		no_check += per_cpu(ignored_check, cpu);
 	}
 
 	if (0 == eeh_subsystem_enabled) {
@@ -898,13 +1131,17 @@ static int proc_eeh_show(struct seq_file
 		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);
 	} else {
 		seq_printf(m, "EEH Subsystem is enabled\n");
-		seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
+		seq_printf(m, 
+				"no device=%ld\n"
+				"no device node=%ld\n"
+				"no config address=%ld\n"
+				"check not wanted=%ld\n"
+				"eeh_total_mmio_ffs=%ld\n"
 			   "eeh_false_positives=%ld\n"
 			   "eeh_ignored_failures=%ld\n"
-			   "eeh_slot_resets=%ld\n"
-				"eeh_fail_count=%d\n",
-			   ffs, positives, failures, resets,
-				eeh_fail_count.counter);
+			   "eeh_slot_resets=%ld\n",
+				no_dev, no_dn, no_cfg, no_check,
+			   ffs, positives, failures, resets);
 	}
 
 	return 0;
--- arch/ppc64/kernel/pSeries_pci.c.linas-orig	2005-04-29 20:33:03.000000000 -0500
+++ arch/ppc64/kernel/pSeries_pci.c	2005-05-06 12:28:43.000000000 -0500
@@ -52,7 +52,7 @@ static int s7a_workaround;
 
 extern struct mpic *pSeries_mpic;
 
-static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val)
+int rtas_read_config(struct device_node *dn, int where, int size, u32 *val)
 {
 	int returnval = -1;
 	unsigned long buid, addr;
@@ -101,7 +101,7 @@ static int rtas_pci_read_config(struct p
 	return PCIBIOS_DEVICE_NOT_FOUND;
 }
 
-static int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
+int rtas_write_config(struct device_node *dn, int where, int size, u32 val)
 {
 	unsigned long buid, addr;
 	int ret;
--- drivers/pci/hotplug/rpaphp.h.linas-orig	2005-04-29 20:26:21.000000000 -0500
+++ drivers/pci/hotplug/rpaphp.h	2005-05-06 12:28:43.000000000 -0500
@@ -118,7 +118,8 @@ extern int rpaphp_enable_pci_slot(struct
 extern int register_pci_slot(struct slot *slot);
 extern int rpaphp_unconfig_pci_adapter(struct slot *slot);
 extern int rpaphp_get_pci_adapter_status(struct slot *slot, int is_init, u8 * value);
-extern struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev);
+extern void init_eeh_handler (void);
+extern void exit_eeh_handler (void);
 
 /* rpaphp_core.c */
 extern int rpaphp_add_slot(struct device_node *dn);
--- drivers/pci/hotplug/rpaphp_core.c.linas-orig	2005-04-29 20:32:16.000000000 -0500
+++ drivers/pci/hotplug/rpaphp_core.c	2005-05-06 12:28:43.000000000 -0500
@@ -460,12 +460,18 @@ static int __init rpaphp_init(void)
 {
 	info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
+	/* Get set to handle EEH events. */
+	init_eeh_handler();
+
 	/* read all the PRA info from the system */
 	return init_rpa();
 }
 
 static void __exit rpaphp_exit(void)
 {
+	/* Let EEH know we are going away. */
+	exit_eeh_handler();
+
 	cleanup_slots();
 }
 
--- drivers/pci/hotplug/rpaphp_pci.c.linas-orig	2005-04-29 20:22:38.000000000 -0500
+++ drivers/pci/hotplug/rpaphp_pci.c	2005-05-06 17:19:33.000000000 -0500
@@ -22,8 +22,13 @@
  * Send feedback to <lxie at us.ibm.com>
  *
  */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
 #include <linux/pci.h>
+#include <asm/eeh.h>
 #include <asm/pci-bridge.h>
+#include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/machdep.h>
 #include "../pci.h"		/* for pci_add_new_bus */
@@ -63,6 +68,7 @@ int rpaphp_claim_resource(struct pci_dev
 		    root ? "Address space collision on" :
 		    "No parent found for",
 		    resource, dtype, pci_name(dev), res->start, res->end);
+		dump_stack();
 	}
 	return err;
 }
@@ -188,6 +194,19 @@ rpaphp_fixup_new_pci_devices(struct pci_
 
 static int rpaphp_pci_config_bridge(struct pci_dev *dev);
 
+/* Call eeh_add_device_late() on each device on @bus and, recursively, behind bridges. */
+static void rpaphp_eeh_add_bus_device(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		eeh_add_device_late(dev);
+		/* Only recurse when the bridge actually has a subordinate bus. */
+		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE && dev->subordinate)
+			rpaphp_eeh_add_bus_device (dev->subordinate);
+	}
+}
+
 /*****************************************************************************
  rpaphp_pci_config_slot() will  configure all devices under the 
  given slot->dn and return the the first pci_dev.
@@ -215,6 +234,8 @@ rpaphp_pci_config_slot(struct device_nod
 		}
 		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) 
 			rpaphp_pci_config_bridge(dev);
+
+		rpaphp_eeh_add_bus_device(bus);
 	}
 	return dev;
 }
@@ -223,7 +244,6 @@ static int rpaphp_pci_config_bridge(stru
 {
 	u8 sec_busno;
 	struct pci_bus *child_bus;
-	struct pci_dev *child_dev;
 
 	dbg("Enter %s:  BRIDGE dev=%s\n", __FUNCTION__, pci_name(dev));
 
@@ -240,11 +260,7 @@ static int rpaphp_pci_config_bridge(stru
 	/* do pci_scan_child_bus */
 	pci_scan_child_bus(child_bus);
 
-	list_for_each_entry(child_dev, &child_bus->devices, bus_list) {
-		eeh_add_device_late(child_dev);
-	}
-
-	 /* fixup new pci devices without touching bus struct */
+	/* Fixup new pci devices without touching bus struct */
 	rpaphp_fixup_new_pci_devices(child_bus, 0);
 
 	/* Make the discovered devices available */
@@ -282,7 +298,7 @@ static void print_slot_pci_funcs(struct 
 	return;
 }
 #else
-static void print_slot_pci_funcs(struct slot *slot)
+static inline void print_slot_pci_funcs(struct slot *slot)
 {
 	return;
 }
@@ -364,7 +380,6 @@ static void rpaphp_eeh_remove_bus_device
 			if (pdev)
 				rpaphp_eeh_remove_bus_device(pdev);
 		}
-
 	}
 	return;
 }
@@ -566,36 +581,280 @@ exit:
 	return retval;
 }
 
-struct hotplug_slot *rpaphp_find_hotplug_slot(struct pci_dev *dev)
+/**
+ * rpaphp_search_bus_for_dev - look for a device on a bus or beneath it
+ * @bus: bus at which the search starts; may be NULL.
+ * @dev: the pci device being looked for.
+ * Every device on @bus is compared with @dev, and the secondary bus of
+ * each device that has one is searched recursively.
+ *
+ * Returns 1 if @dev was found, 0 otherwise (including when @bus is NULL).
+ */
+static int rpaphp_search_bus_for_dev (struct pci_bus *bus, struct pci_dev *dev)
+{
+	struct pci_dev *cur;
+
+	if (bus == NULL)
+		return 0;
+
+	list_for_each_entry(cur, &bus->devices, bus_list) {
+		if (cur == dev)
+			return 1;
+		if (rpaphp_search_bus_for_dev (cur->subordinate, dev))
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * rpaphp_find_slot - find and return the slot holding the device
+ * @dev: pci device for which we want the slot structure.
+ */
+static struct slot *rpaphp_find_slot(struct pci_dev *dev)
 {
-	struct list_head	*tmp, *n;
-	struct slot		*slot;
+	struct list_head *tmp, *n;
+	struct slot	*slot;
 
 	list_for_each_safe(tmp, n, &rpaphp_slot_head) {
 		struct pci_bus *bus;
-		struct list_head *ln;
 
 		slot = list_entry(tmp, struct slot, rpaphp_slot_list);
-		if (slot->bridge == NULL) {
-			if (slot->dev_type == PCI_DEV) {
-				printk(KERN_WARNING "PCI slot missing bridge %s %s \n", 
-				                    slot->name, slot->location);
-			}
+		
+		/* PHB's don't have bridges. */
+		if (slot->bridge == NULL)
 			continue;
-		}
+
+		/* The PCI device could be the slot itself. */
+		if (slot->bridge == dev)
+			return slot;
 
 		bus = slot->bridge->subordinate;
 		if (!bus) {
+			printk (KERN_WARNING "PCI bridge is missing bus: %s %s\n",
+			    pci_name (slot->bridge), pci_pretty_name (slot->bridge));
 			continue;  /* should never happen? */
 		}
-		for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
-                                struct pci_dev *pdev = pci_dev_b(ln);
-				if (pdev == dev)
-					return slot->hotplug_slot;
-		}
+
+		if (rpaphp_search_bus_for_dev (bus, dev))
+			return slot;
 	}
+	return NULL;
+}
+
+/** get_phb_of_device -- find the pci controller for the device
+ *  @dev: the pci device
+ *  This routine returns a pointer to the device node that
+ *  describes the pci controller for the indicated device.
+ */
 
+static struct device_node *
+get_phb_of_device (struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct pci_bus *bus;
+
+	/* Climb from the device towards the root, one bridge at a time. */
+	while (dev) {
+		bus = dev->bus;
+		if (!bus)
+			break;
+
+		/* Stop at the first bus whose device node has ->phb set. */
+		dn = pci_bus_to_OF_node(bus);
+		if (dn && dn->phb)
+			return dn;
+
+		/* No bridge above this bus ends the loop; the caller gets NULL. */
+		dev = bus->self;
+	}
 	return NULL;
 }
 
-EXPORT_SYMBOL_GPL(rpaphp_find_hotplug_slot);
+/* ------------------------------------------------------- */
+/**
+ * handle_eeh_events -- reset a PCI device after hard lockup.
+ *
+ * pSeries systems will isolate a PCI slot if the PCI-Host
+ * bridge detects address or data parity errors, DMAs
+ * occurring to wild addresses (which usually happen due to
+ * bugs in device drivers or in PCI adapter firmware).
+ * Slot isolations also occur if #SERR, #PERR or other misc
+ * PCI-related errors are detected.
+ * 
+ * Recovery process consists of unplugging the device driver
+ * (which generated hotplug events to userspace), then issuing
+ * a PCI #RST to the device, then reconfiguring the PCI config 
+ * space for all bridges & devices under this slot, and then 
+ * finally restarting the device drivers (which cause a second
+ * set of hotplug events to go out to userspace).
+ */
+
+/**
+ * eeh_reset_device -- reset the slot below a pci controller, then reconfigure
+ * @dev: the pci device being recovered; nothing is done if this is NULL.
+ * @dn: device node of the pci controller; dn->child is the node handed to
+ *	rtas_set_slot_reset() and eeh_restore_bars().
+ * @reconfig: if non-zero, the hotplug slot holding @dev (if one is found)
+ *	is unconfigured before the reset and enabled again after it.
+ *
+ * Returns 1 if @dev is NULL, 0 otherwise.
+ */
+int eeh_reset_device (struct pci_dev *dev, struct device_node *dn, int reconfig)
+{
+	struct slot *frozen_slot= NULL;
+
+	if (!dev)
+		return 1;
+
+	if (reconfig)
+		frozen_slot = rpaphp_find_slot(dev);
+	if (frozen_slot)
+		rpaphp_unconfig_pci_adapter (frozen_slot);
+
+	/* Reset the pci controller. (Asserts RST#; resets config space).
+	 * Reconfigure bridges and devices */
+	rtas_set_slot_reset (dn->child);
+	rtas_configure_bridge(dn);
+	eeh_restore_bars(dn->child);
+
+	enable_irq (dev->irq);
+
+	/* Give the system 5 seconds to finish running the user-space
+	 * hotplug scripts, e.g. ifdown for ethernet.  Yes, this is a hack,
+	 * but if we don't do this, weird things happen.
+	 */
+	if (frozen_slot) {
+		ssleep (5);
+		rpaphp_enable_pci_slot (frozen_slot);
+	}
+	return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 15 
+
+/**
+ * handle_eeh_events -- notifier callback for a frozen pci device
+ * @self: the notifier block; not used.
+ * @reason: the notifier event code; not used.
+ * @ev: the struct peh_event describing the failure.
+ *
+ * Returns 1 if the event cannot be handled or the device is disabled
+ * for good, otherwise the return value of eeh_reset_device().
+ */
+int handle_eeh_events (struct notifier_block *self,
+                       unsigned long reason, void *ev)
+{
+	int freeze_count;
+	struct device_node *frozen_device;
+	struct peh_event *event = ev;
+	struct pci_dev *dev = event->dev;
+	struct slot *frozen_slot;
+	int perm_failure = 0;
+	int rc;
+
+	if (!dev) {
+		printk ("EEH: EEH error caught, but no PCI device specified!\n");
+		return 1;
+	}
+
+	frozen_device = get_phb_of_device (dev);
+	if (!frozen_device) {
+		printk (KERN_ERR "EEH: Cannot find PCI controller for %s %s\n",
+				pci_name(dev), pci_pretty_name (dev));
+		return 1;
+	}
+
+	freeze_count = frozen_device->eeh_freeze_count + 1;
+	if (freeze_count > EEH_MAX_ALLOWED_FREEZES)
+		perm_failure = 1;
+
+	/* If the reset state is a '5' and the time to reset is 0 (infinity)
+	 * or is more than 15 seconds, then mark this as a permanent failure.
+	 */
+	if ((event->state == pci_channel_io_perm_failure) &&
+	    ((event->time_unavail <= 0) ||
+	     (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000)))
+		perm_failure = 1;
+
+	/* Log the error with the rtas logger. */
+	if (perm_failure) {
+		/*
+		 * About 90% of all real-life EEH failures in the field
+		 * are due to poorly seated PCI cards. Only 10% or so are
+		 * due to actual, failed cards.
+		 */
+		printk (KERN_ERR
+		   "EEH: device %s:%s has failed %d times \n"
+			"and has been permanently disabled.  Please try reseating\n"
+		   "this device or replacing it.\n",
+			pci_name (dev),
+			pci_pretty_name (dev),
+			freeze_count);
+
+		eeh_slot_error_detail (frozen_device, 2 /* Permanent Error */);
+
+		/* Notify the driver, if one is bound, that the device is going down. */
+		/* XXX this should be a recursive walk to children for
+		 * multi-function devices */
+		if (dev->driver && dev->driver->err_handler.error_detected) {
+			dev->driver->err_handler.error_detected (dev, pci_channel_io_perm_failure);
+		}
+
+		/* If there's a hotplug slot, unconfigure it */
+		frozen_slot = rpaphp_find_slot(dev);
+		if (frozen_slot)
+			rpaphp_unconfig_pci_adapter (frozen_slot);
+		return 1;
+	}
+
+	eeh_slot_error_detail (frozen_device, 1 /* Temporary Error */);
+
+	printk (KERN_WARNING
+	   "EEH: This device has failed %d times since last reboot: %s:%s\n",
+		freeze_count,
+		pci_name (dev),
+		pci_pretty_name (dev));
+
+	/* Walk the various device drivers attached to this slot through
+	 * a reset sequence, giving each an opportunity to do what it needs
+	 * to accomplish the reset */
+	/* XXX this should be a recursive walk to children for
+	 * multi-function devices; each child should get to report
+	 * status too, if needed ... if any child can't handle the reset,
+	 * then need to hotplug it.
+	 * XXX This does not follow flow of BenH's last email at all.
+	 * XXX will be fixed later XXX
+	 */
+	if (dev->driver && dev->driver->err_handler.error_detected) {
+		dev->driver->err_handler.error_detected (dev, pci_channel_io_frozen);
+		rc = eeh_reset_device (dev, frozen_device, 0);
+		if (dev->driver->err_handler.slot_reset)
+			dev->driver->err_handler.slot_reset (dev);
+	} else {
+		rc = eeh_reset_device (dev, frozen_device, 1);
+	}
+
+	/* Store the freeze count with the pci adapter, and not the slot.
+	 * This way, if the device is replaced, the count is cleared.
+	 */
+	frozen_device->eeh_freeze_count = freeze_count;
+
+	return rc;
+}
+
+/* Notifier through which EEH events are delivered to handle_eeh_events(). */
+static struct notifier_block eeh_block = { .notifier_call = handle_eeh_events };
+
+void __init init_eeh_handler (void)
+{
+	peh_register_notifier (&eeh_block);
+}
+
+void __exit exit_eeh_handler (void)
+{
+	peh_unregister_notifier (&eeh_block);
+}
+
--- kernel/printk.c.linas-orig	2005-04-29 20:32:46.000000000 -0500
+++ kernel/printk.c	2005-05-06 12:28:43.000000000 -0500
@@ -383,6 +383,23 @@ asmlinkage long sys_syslog(int type, cha
 	return do_syslog(type, buf, len);
 }
 
+#ifdef   CONFIG_DEBUG_KERNEL
+/**
+ * debugger_syslog_data - expose the printk ring buffer to a debugger.
+ * @syslog_data: filled with [0],[1] physical start and end+1 of the
+ *	buffer, and [2],[3] logical start and end+1 of the logged text.
+ * do_syslog() uses locks and would deadlock inside a debugging session,
+ * so none is taken here.  [2] and [3] are not wrapped into the buffer.
+ */
+void debugger_syslog_data(char *syslog_data[4])
+{
+	syslog_data[0] = log_buf;
+	syslog_data[1] = log_buf + log_buf_len;	/* live size, not __LOG_BUF_LEN */
+	syslog_data[2] = log_buf + log_end - (logged_chars < log_buf_len ? logged_chars : log_buf_len);
+	syslog_data[3] = log_buf + log_end;
+}
+#endif   /* CONFIG_DEBUG_KERNEL */
+
 /*
  * Call the console drivers on a range of log_buf
  */
--- arch/ppc64/xmon/xmon.c.linas-orig	2005-04-29 20:31:03.000000000 -0500
+++ arch/ppc64/xmon/xmon.c	2005-05-06 12:28:43.000000000 -0500
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/ptrace.h>
 #include <linux/reboot.h>
 #include <linux/delay.h>
 #include <linux/kallsyms.h>
@@ -100,6 +101,7 @@ static void prdump(unsigned long, long);
 static int ppc_inst_dump(unsigned long, long, int);
 void print_address(unsigned long);
 static void backtrace(struct pt_regs *);
+static void xmon_show_stack(unsigned long sp, unsigned long lr, unsigned long pc);
 static void excprint(struct pt_regs *);
 static void prregs(struct pt_regs *);
 static void memops(int);
@@ -131,6 +133,7 @@ static void csum(void);
 static void bootcmds(void);
 void dump_segments(void);
 static void symbol_lookup(void);
+static void xmon_show_dmesg(void);
 static void xmon_print_symbol(unsigned long address, const char *mid,
 			      const char *after);
 static const char *getvecname(unsigned long vec);
@@ -170,6 +173,7 @@ Commands:\n\
 #endif
   "\
   C	checksum\n\
+  D	show dmesg (printk) buffer\n\
   d	dump bytes\n\
   di	dump instructions\n\
   df	dump float values\n\
@@ -186,6 +190,7 @@ Commands:\n\
   mz	zero a block of memory\n\
   mi	show information about memory allocation\n\
   p 	show the task list\n\
+  P 	show the task list and stacks\n\
   r	print registers\n\
   s	single step\n\
   S	print special registers\n\
@@ -310,6 +315,7 @@ int xmon_core(struct pt_regs *regs, int 
 #endif
 
 	msr = get_msr();
+	msr |= MSR_SF|MSR_IR|MSR_DR;
 	set_msrd(msr & ~MSR_EE);	/* disable interrupts */
 
 	bp = in_breakpoint_table(regs->nip, &offset);
@@ -323,15 +329,39 @@ int xmon_core(struct pt_regs *regs, int 
 #ifdef CONFIG_SMP
 	cpu = smp_processor_id();
 	if (cpu_isset(cpu, cpus_in_xmon)) {
+		int recursive = 1;
 		get_output_lock();
 		excprint(regs);
 		printf("cpu 0x%x: Exception %lx %s in xmon, "
 		       "returning to main loop\n",
 		       cpu, regs->trap, getvecname(TRAP(regs)));
-		longjmp(xmon_fault_jmp[cpu], 1);
+
+		/* If crash occurred in firmware, then saved stack pointer
+		 * is bad, and we get recursive fault. Switch to using
+		 * emergency stack in this case. 
+		 */
+		unsigned long *sp = ((unsigned long *) xmon_fault_jmp[cpu]) + 1;
+ 		if (*sp < 0xc000000000000000)
+		{
+			printf("Bad stack pointer %lx in xmon, using emergency stack\n", *sp);
+			*sp = (unsigned long ) (get_paca()->emergency_sp);
+			sp = (unsigned long *) *sp;
+			*sp = (unsigned long ) (get_paca()->emergency_sp);
+			recursive = -1;
+		}
+		sp = (unsigned long *) *sp;
+ 		if (*sp < 0xc000000000000000)
+		{
+			printf("Bad stack frame %lx in xmon, using emergency stack\n", *sp);
+			*sp = (unsigned long ) (get_paca()->emergency_sp);
+			recursive = -1;
+		}
+printf ("duude planing on returning with setjmp=%p\n", xmon_fault_jmp[cpu]);
+printf ("duude planing on returning to %p w/stack=%p or %p\n", xmon_fault_jmp[cpu][0], sp, xmon_fault_jmp[cpu][1]);
+		longjmp(xmon_fault_jmp[cpu], recursive);
 	}
 
-	if (setjmp(recurse_jmp) != 0) {
+	if (setjmp(recurse_jmp) > 0) {
 		if (!in_xmon || !xmon_gate) {
 			printf("xmon: WARNING: bad recursive fault "
 			       "on cpu 0x%x\n", cpu);
@@ -353,6 +383,11 @@ int xmon_core(struct pt_regs *regs, int 
 	if (!fromipi) {
 		get_output_lock();
 		excprint(regs);
+printf ("duude this was a normal entry\n");
+printf ("duude saved return addr=%p, saves stackp=%p stack=%p\n", recurse_jmp[0], recurse_jmp[1], *((long **)(recurse_jmp[1])));
+printf ("duude my stack really really is %p\n", &msr);
+printf ("duude my my setjmp is %p\n", recurse_jmp);
+
 		if (bp) {
 			printf("cpu 0x%x stopped at breakpoint 0x%x (",
 			       cpu, BP_NUM(bp));
@@ -386,7 +421,7 @@ int xmon_core(struct pt_regs *regs, int 
 			smp_send_debugger_break(MSG_ALL_BUT_SELF);
 			/* wait for other cpus to come in */
 			for (timeout = 100000000; timeout != 0; --timeout) {
-				if (cpus_weight(cpus_in_xmon) >= ncpus)
+				if (cpus_weight(*((cpumask_t *) &cpus_in_xmon)) >= ncpus)
 					break;
 				barrier();
 			}
@@ -757,6 +792,64 @@ static void remove_cpu_bpts(void)
 		set_iabr(0);
 }
 
+static inline int 
+xmon_process_cpu(const task_t *p)
+{
+	return p->thread_info->cpu;
+}
+
+#define xmon_task_has_cpu(p) (task_curr(p))
+
+static void
+xmon_show_task(task_t *p)
+{
+	printf("0x%p %8d %8d  %d %4d   %c  0x%p %c%s\n",
+		   (void *)p, p->pid, p->parent->pid,
+		   xmon_task_has_cpu(p), xmon_process_cpu(p),
+		   (p->state == 0) ? 'R' :
+		     (p->state < 0) ? 'U' :
+		     (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+		     (p->state & TASK_STOPPED || p->ptrace & PT_PTRACED) ? 'T' :
+		     (p->state & EXIT_ZOMBIE) ? 'Z' :
+		     (p->state & EXIT_DEAD) ? 'X' :
+		     (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?',
+		   (void *)(&p->thread),
+		   (p == current) ? '*': ' ',
+		   p->comm);
+}
+
+static task_t *xmon_next_thread(const task_t *p) 
+{
+	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
+}
+
+static void
+xmon_show_state(int prt_stacks)
+{
+	task_t *g, *p;
+
+	printf("%-*s      Pid   Parent [*] cpu State %-*s Command\n",
+		(int)(2*sizeof(void *))+2, "Task Addr",
+		(int)(2*sizeof(void *))+2, "Thread");
+
+#ifdef PER_CPU_RUNQUEUES_NO_LONGER_DECLARED_STATIC_IN_SCHED_C
+	/* Run the active tasks first */
+	for (cpu = 0; cpu < NR_CPUS; ++cpu) 
+		if (cpu_online(cpu)) {
+			p = cpu_curr(cpu);
+			xmon_show_task(p);
+		}
+#endif
+
+	/* Now the real tasks */
+	do_each_thread(g, p) {
+		xmon_show_task(p);
+		if (prt_stacks) 
+			xmon_show_stack(p->thread.ksp, 0, 0);
+	} while ((p = xmon_next_thread(p)) != g);
+}
+
+
 /* Command interpreting routine */
 static char *last_cmd;
 
@@ -809,6 +902,9 @@ cmds(struct pt_regs *excp)
 		case 'd':
 			dump();
 			break;
+		case 'D':
+			xmon_show_dmesg();
+			break;
 		case 'l':
 			symbol_lookup();
 			break;
@@ -839,7 +935,10 @@ cmds(struct pt_regs *excp)
 			printf(help_string);
 			break;
 		case 'p':
-			show_state();
+			xmon_show_state(0);
+			break;
+		case 'P':
+			xmon_show_state(1);
 			break;
 		case 'b':
 			bpt_cmds();
@@ -2400,6 +2499,58 @@ static void xmon_print_symbol(unsigned l
 	printf("%s", after);
 }
 
+extern void debugger_syslog_data(char *syslog_data[4]);
+#define SYSLOG_WRAP(p) if (p < syslog_data[0]) p = syslog_data[1]-1; \
+	else if (p >= syslog_data[1]) p = syslog_data[0];
+
+/* Print the printk buffer on the xmon console, in pieces of at most 200
+ * characters that never straddle the physical end of the buffer. */
+static void xmon_show_dmesg(void)
+{
+	char *syslog_data[4], *start, *end, c;
+	int logsize;
+
+	/* [0],[1]: physical start, end+1.  [2],[3]: logical start, end+1. */
+	debugger_syslog_data(syslog_data);
+	if (syslog_data[2] == syslog_data[3])
+		return;
+	logsize = syslog_data[1] - syslog_data[0];
+	start = syslog_data[0] + (syslog_data[2] - syslog_data[0]) % logsize;
+	end = syslog_data[0] + (syslog_data[3] - syslog_data[0]) % logsize;
+
+	c = '\0';
+	while(1) {
+		char *p;
+		int chars = 0;
+		if (!*start) {
+			while (!*start) {
+				++start;
+				SYSLOG_WRAP(start);
+				if (start == end)
+					break;
+			}
+			if (start == end)
+				break;
+		}
+		p = start;
+		while (*start && chars < 200) {
+			c = *start;
+			++chars;
+			++start;
+			SYSLOG_WRAP(start);
+			/* Stop at a wrap too: printf reads linearly from p. */
+			if (start == end || c == '\n' || start == syslog_data[0])
+				break;
+		}
+		if (chars)
+			printf("%.*s", chars, p);
+		if (start == end)
+			break;
+	}
+	if (c != '\n')
+		printf("\n");
+}
+
 static void debug_trace(void)
 {
         unsigned long val, cmd, on;


More information about the Linuxppc64-dev mailing list