[PATCH] log machine check errors

Jake Moilanen moilanen at austin.ibm.com
Sat Aug 14 04:06:40 EST 2004


Somewhere along the line it looks like logging machine check errors
never got put in 2.6.  Machine check error logs were one of the main
reasons for storing logs to nvram.

Here's a forward port of the 2.4 code that Dave Altobelli originally
wrote.

Thanks,
Jake

Signed-off-by: Dave Altobelli <dalto at austin.ibm.com>
Signed-off-by: Jake Moilanen <moilanen at austin.ibm.com>

---


diff -puN arch/ppc64/kernel/traps.c~machine-check-logging arch/ppc64/kernel/traps.c
--- linux-2.6-ames/arch/ppc64/kernel/traps.c~machine-check-logging	Fri Aug 13 08:01:00 2004
+++ linux-2.6-ames-moilanen/arch/ppc64/kernel/traps.c	Fri Aug 13 09:01:37 2004
@@ -37,10 +37,14 @@
 #include <asm/processor.h>
 #include <asm/ppcdebug.h>
 #include <asm/rtas.h>
+#include <asm/machdep.h>

 #ifdef CONFIG_PPC_PSERIES
 /* This is true if we are using the firmware NMI handler (typically LPAR) */
 extern int fwnmi_active;
+
+char mce_data_buf[RTAS_ERROR_LOG_MAX]__page_aligned;
+
 #endif

 #ifdef CONFIG_DEBUGGER
@@ -149,6 +153,13 @@ _exception(int signr, siginfo_t *info, s
  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
  * the actual r3 if possible, and a ptr to the error log entry
  * will be returned if found.
+ *
+ * The mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log.  This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
  */
 static struct rtas_error_log *FWNMI_get_errinfo(struct pt_regs *regs)
 {
@@ -160,7 +171,9 @@ static struct rtas_error_log *FWNMI_get_
 	    (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
 		savep = __va(errdata);
 		regs->gpr[3] = savep[0];	/* restore original r3 */
-		errhdr = (struct rtas_error_log *)(savep + 1);
+		memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+		memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
+		errhdr = (struct rtas_error_log *)mce_data_buf;
 	} else {
 		printk("FWNMI: corrupt r3\n");
 	}
@@ -211,19 +224,20 @@ SystemResetException(struct pt_regs *reg
  * Return 1 if corrected (or delivered a signal).
  * Return 0 if there is nothing we can do.
  */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log err)
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
 {
 	siginfo_t info;
+	int nonfatal = 0;

-	if (err.disposition == DISP_FULLY_RECOVERED) {
+	if (err->disposition == DISP_FULLY_RECOVERED) {
 		/* Platform corrected itself */
-		return 1;
+		nonfatal = 1;
 	} else if ((regs->msr & MSR_RI) &&
 		   user_mode(regs) &&
-		   err.severity == SEVERITY_ERROR_SYNC &&
-		   err.disposition == DISP_NOT_RECOVERED &&
-		   err.target == TARGET_MEMORY &&
-		   err.type == TYPE_ECC_UNCORR &&
+		   err->severity == SEVERITY_ERROR_SYNC &&
+		   err->disposition == DISP_NOT_RECOVERED &&
+		   err->target == TARGET_MEMORY &&
+		   err->type == TYPE_ECC_UNCORR &&
 		   !(current->pid == 0 || current->pid == 1)) {
 		/* Kill off a user process with an ECC error */
 		info.si_signo = SIGBUS;
@@ -234,9 +248,12 @@ static int recover_mce(struct pt_regs *r
 		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
 		       current->pid);
 		_exception(SIGBUS, &info, regs);
-		return 1;
+		nonfatal = 1;
 	}
-	return 0;
+
+ 	log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+
+	return nonfatal;
 }
 #endif

@@ -254,14 +271,12 @@ void
 MachineCheckException(struct pt_regs *regs)
 {
 #ifdef CONFIG_PPC_PSERIES
-	struct rtas_error_log err, *errp;
+	struct rtas_error_log *errp;

 	if (fwnmi_active) {
 		errp = FWNMI_get_errinfo(regs);
-		if (errp)
-			err = *errp;
-		FWNMI_release_errinfo();	/* frees errp */
-		if (errp && recover_mce(regs, err))
+		FWNMI_release_errinfo();
+		if (errp && recover_mce(regs, errp))
 			return;
 	}
 #endif

_

** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/





More information about the Linuxppc64-dev mailing list