[Skiboot] [PATCH v2] lpc: Log LPC SYNC errors as unrecoverable ones for manufacturing
Vasant Hegde
hegdevasant at linux.vnet.ibm.com
Fri Aug 5 15:18:44 AEST 2016
On 08/05/2016 04:26 AM, Vipin K Parashar wrote:
> High volume of SYNC errors onto LPC bus cause degraded system
> performance and are likely due to bad hardware present onto system.
> Thus once LPC SYNC errors cross a certain threshold, OPAL should log
> them onto BMC as unrecoverable errors in manufacturing mode. This
> will help manufacturing screen bad parts, causing such errors.
>
> Cc: stable
> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
> ---
> Changes in v2:
> - Removed duplicate MFG mode detection under lpc_init as well
>
> core/platform.c | 8 +++++++-
> hw/lpc.c | 26 +++++++++++++++++++++-----
> include/errorlog.h | 1 +
> include/platform.h | 2 ++
> 4 files changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/core/platform.c b/core/platform.c
> index de6e406..9730f8d 100644
> --- a/core/platform.c
> +++ b/core/platform.c
> @@ -24,6 +24,7 @@
> #include <xscom.h>
> #include <errorlog.h>
>
> +bool mfg_mode;
> struct platform platform;
>
> DEFINE_LOG_ENTRY(OPAL_RC_ABNORMAL_REBOOT, OPAL_PLATFORM_ERR_EVT, OPAL_CEC,
> @@ -124,8 +125,13 @@ void probe_platform(void)
> struct platform *platforms = &__platforms_start;
> unsigned int i;
>
> - platform = generic_platform;
> + /* Detect Manufacturing mode */
> + if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
> + printf("PLAT: Manufacturing mode ON\n");
Please use prlog.
> + mfg_mode = true;
> + }
You are using mfg_mode inside lpc code.. So better move this to lpc.c itself?
>
> + platform = generic_platform;
> for (i = 0; &platforms[i] < &__platforms_end; i++) {
> if (platforms[i].probe && platforms[i].probe()) {
> platform = platforms[i];
> diff --git a/hw/lpc.c b/hw/lpc.c
> index 32cb7b1..bb59ac0 100644
> --- a/hw/lpc.c
> +++ b/hw/lpc.c
> @@ -25,6 +25,7 @@
> #include <timebase.h>
> #include <errorlog.h>
> #include <opal-api.h>
> +#include <platform.h>
>
> //#define DBG_IRQ(fmt...) prerror(fmt)
> #define DBG_IRQ(fmt...) do { } while(0)
> @@ -41,6 +42,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
> OPAL_MISC_SUBSYSTEM, OPAL_PREDICTIVE_ERR_GENERAL,
> OPAL_NA);
>
> +DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC_PERF, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
> + OPAL_MISC_SUBSYSTEM, OPAL_UNRECOVERABLE_ERR_DEGRADE_PERF,
> + OPAL_NA);
> +
> #define ECCB_CTL 0 /* b0020 -> b00200 */
> #define ECCB_STAT 2 /* b0022 -> b00210 */
> #define ECCB_DATA 3 /* b0023 -> b00218 */
> @@ -110,6 +115,9 @@ DEFINE_LOG_ENTRY(OPAL_RC_LPC_SYNC, OPAL_PLATFORM_ERR_EVT, OPAL_LPC,
> LPC_HC_IRQ_BM_TAR_ERR)
> #define LPC_HC_ERROR_ADDRESS 0x40
>
> +
> +#define LPC_BUS_DEGRADED_PERF_THRESHOLD 5
How did you arrive magic number 5? Can you put some comment above ?
> +
> struct lpc_client_entry {
> struct list_node node;
> const struct lpc_client *clt;
> @@ -662,8 +670,10 @@ static void lpc_dispatch_reset(struct proc_chip *chip)
> static void lpc_dispatch_err_irqs(struct proc_chip *chip, uint32_t irqs)
> {
> int rc;
> + struct opal_err_info *info;
> const char *sync_err = "Unknown LPC error";
> uint32_t err_addr;
> + static int lpc_bus_err_count;
>
> /* Write back to clear error interrupts, we clear SerIRQ later
> * as they are handled as level interrupts
> @@ -690,13 +700,19 @@ static void lpc_dispatch_err_irqs(struct proc_chip *chip, uint32_t irqs)
>
> rc = opb_read(chip, lpc_reg_opb_base + LPC_HC_ERROR_ADDRESS,
> &err_addr, 4);
> +
> + lpc_bus_err_count++;
> + if (mfg_mode && (lpc_bus_err_count > LPC_BUS_DEGRADED_PERF_THRESHOLD))
> + info = &e_info(OPAL_RC_LPC_SYNC_PERF);
> + else
> + info = &e_info(OPAL_RC_LPC_SYNC);
> +
I don't know much about LPC.. But look at this code, we may flood BMC with
number of error logs.. Is that correct?
-Vasant
More information about the Skiboot
mailing list