[Skiboot] [PATCH 07/12] opal: Refactor TOD topology failover recovery.

Mahesh J Salgaonkar mahesh at linux.vnet.ibm.com
Sat Mar 28 20:36:08 AEDT 2015


From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

The current code does not correctly identify the need for a topology switch
and forces a TOD topology switch even when it is not required to do so.
This patch introduces a check to find out if the sync/step network is running
and there is no step check error reported on the active master. If this check
fails, then we need to trigger a topology switch to recover from the TOD error.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 hw/chiptod.c |   76 +++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 14 deletions(-)

diff --git a/hw/chiptod.c b/hw/chiptod.c
index 018e39d..9564276 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -76,6 +76,8 @@
 #define   TOD_ERR_OSC0_PARITY		PPC_BIT(1)
 #define   TOD_ERR_OSC1_PARITY		PPC_BIT(2)
 #define   TOD_ERR_CRITC_PARITY		PPC_BIT(13)
+#define   TOD_ERR_MP0_STEP_CHECK	PPC_BIT(14)
+#define   TOD_ERR_MP1_STEP_CHECK	PPC_BIT(15)
 #define   TOD_ERR_PSS_HAMMING_DISTANCE	PPC_BIT(18)
 #define	  TOD_ERR_DELAY_COMPL_PARITY	PPC_BIT(22)
 /* CNTR errors */
@@ -943,33 +945,79 @@ static bool chiptod_set_ttype4_mode(struct proc_chip *chip, bool enable)
 	return true;
 }
 
+static bool is_topology_switch_required(void)
+{
+	int32_t active_master_chip;
+	uint64_t tod_error;
+
+	active_master_chip = chiptod_get_active_master();
+
+	/* No switch is needed while TOD is running on the active master. */
+	if (chiptod_master_running())
+		return false;
+
+	/*
+	 * Check if the sync/step network is running.
+	 *
+	 * If it is not running on the current active topology, then we
+	 * need to switch topology to recover from the TOD error.
+	 */
+	if (!chiptod_sync_step_check_running(current_topology))
+		return true;
+
+	/*
+	 * Check if a step check error is reported on the active master.
+	 * NOTE(review): only MP0 is tested; confirm MP1 can be ignored.
+	 */
+	if (xscom_read(active_master_chip, TOD_ERROR, &tod_error) != 0) {
+		prerror("CHIPTOD: XSCOM error reading TOD_ERROR reg\n");
+		/*
+		 * Nothing more can be done here. The sync/step network
+		 * was already found to be running, so return false.
+		 */
+		return false;
+	}
+
+	if (tod_error & TOD_ERR_MP0_STEP_CHECK)
+		return true;
+
+	return false;
+}
+
 /*
  * Sync up TOD with other chips and get TOD in running state.
- * For non-master, we request TOD value from another chip.
- * For master chip, Switch the topology to recover.
+ * Check if current topology is active and running. If not, then
+ * trigger a topology switch.
  */
 static int chiptod_start_tod(void)
 {
 	struct proc_chip *chip = NULL;
 	int rc = 1;
 
-	/*  Handle TOD recovery on master chip. */
-	if (this_cpu()->chip_id == chiptod_primary) {
+	/*  Do a topology switch if required. */
+	if (is_topology_switch_required()) {
+		int32_t mchip = chiptod_get_active_master();
+
+		prlog(PR_DEBUG, "CHIPTOD: Need topology switch to recover\n");
 		/*
-		 * TOD is not running on master chip. We need to sync with
-		 * secondary chip TOD. But before we do that we need to
-		 * switch topology to make backup master as the new
-		 * active master. Once we switch the topology we can
-		 * then request TOD value from new master chip TOD.
-		 * But make sure we move local chiptod to Not Set before
-		 * request TOD value.
+		 * There is a failure in StepSync network in current
+		 * active topology. TOD is not running on active master chip.
+		 * We need to sync with backup master chip TOD.
+		 * But before we do that we need to switch topology to make
+		 * backup master as the new active master. Once we switch the
+		 * topology we can then request TOD value from new active
+		 * master. But make sure we move local chiptod to Not Set
+		 * before requesting TOD value.
 		 */
-		if (xscom_writeme(TOD_TTYPE_1, (1UL << 63)) != 0) {
+		if (xscom_write(mchip, TOD_TTYPE_1, (1UL << 63)) != 0) {
 			prerror("CHIPTOD: XSCOM error switching primary/secondary\n");
 			return 0;
 		}
-		chiptod_primary = chiptod_secondary;
-		chiptod_secondary = this_cpu()->chip_id;
+
+		/* Update topology info. */
+		current_topology = query_current_topology();
+		chiptod_update_topology(chiptod_topo_primary);
+		chiptod_update_topology(chiptod_topo_secondary);
 	}
 
 	if (!chiptod_master_running()) {



More information about the Skiboot mailing list