These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / arch / powerpc / platforms / powernv / opal-hmi.c
index b322bfb..d000f4e 100644 (file)
@@ -35,9 +35,134 @@ struct OpalHmiEvtNode {
        struct list_head list;
        struct OpalHMIEvent hmi_evt;
 };
+
+struct xstop_reason {  /* one row of a checkstop reason decode table */
+       uint32_t xstop_reason;  /* bit mask ANDed with the event's reason word */
+       const char *unit_failed;        /* unit name printed in "[Unit: ...]" */
+       const char *description;        /* text printed after the unit name */
+};
+
 static LIST_HEAD(opal_hmi_evt_list);
 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
 
+/*
+ * Print the decoded reason(s) for a core checkstop HMI event.
+ *
+ * @level:   printk level prefix placed in front of every line.
+ * @hmi_evt: HMI event whose u.xstop_error payload is decoded.
+ *
+ * A zero reason word is reported as an unknown core checkstop. Otherwise
+ * the CPU PIR is printed, followed by one line for every table entry
+ * whose mask bit is set in the (big-endian) reason word.
+ */
+static void print_core_checkstop_reason(const char *level,
+                                       struct OpalHMIEvent *hmi_evt)
+{
+       static const struct xstop_reason core_reasons[] = {
+               { CORE_CHECKSTOP_IFU_REGFILE,
+                 "IFU", "RegFile core check stop" },
+               { CORE_CHECKSTOP_IFU_LOGIC,
+                 "IFU", "Logic core check stop" },
+               { CORE_CHECKSTOP_PC_DURING_RECOV,
+                 "PC", "Core checkstop during recovery" },
+               { CORE_CHECKSTOP_ISU_REGFILE,
+                 "ISU", "RegFile core check stop (mapper error)" },
+               { CORE_CHECKSTOP_ISU_LOGIC,
+                 "ISU", "Logic core check stop" },
+               { CORE_CHECKSTOP_FXU_LOGIC,
+                 "FXU", "Logic core check stop" },
+               { CORE_CHECKSTOP_VSU_LOGIC,
+                 "VSU", "Logic core check stop" },
+               { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE,
+                 "PC", "Recovery in maintenance mode" },
+               { CORE_CHECKSTOP_LSU_REGFILE,
+                 "LSU", "RegFile core check stop" },
+               { CORE_CHECKSTOP_PC_FWD_PROGRESS,
+                 "PC", "Forward Progress Error" },
+               { CORE_CHECKSTOP_LSU_LOGIC,
+                 "LSU", "Logic core check stop" },
+               { CORE_CHECKSTOP_PC_LOGIC,
+                 "PC", "Logic core check stop" },
+               { CORE_CHECKSTOP_PC_HYP_RESOURCE,
+                 "PC", "Hypervisor Resource error - core check stop" },
+               { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED,
+                 "PC", "Hang Recovery Failed (core check stop)" },
+               { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED,
+                 "PC", "Ambiguous Hang Detected (unknown source)" },
+               { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ,
+                 "PC", "Debug Trigger Error inject" },
+               { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ,
+                 "PC", "Hypervisor check stop via SPRC/SPRD" },
+       };
+       uint32_t reason;
+       int idx;
+
+       /* A zero reason word has no bits to decode. */
+       if (!hmi_evt->u.xstop_error.xstop_reason) {
+               printk("%s      Unknown Core check stop.\n", level);
+               return;
+       }
+
+       /* Convert once; the same word is tested against every mask. */
+       reason = be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason);
+
+       printk("%s      CPU PIR: %08x\n", level,
+                       be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
+
+       for (idx = 0; idx < ARRAY_SIZE(core_reasons); idx++) {
+               if (!(reason & core_reasons[idx].xstop_reason))
+                       continue;
+
+               printk("%s      [Unit: %-3s] %s\n", level,
+                               core_reasons[idx].unit_failed,
+                               core_reasons[idx].description);
+       }
+}
+
+/*
+ * Print the decoded reason(s) for an NX checkstop HMI event.
+ *
+ * @level:   printk level prefix placed in front of every line.
+ * @hmi_evt: HMI event whose u.xstop_error payload is decoded.
+ *
+ * A zero reason word is reported as an unknown NX checkstop. Otherwise
+ * the chip id is printed, followed by one line for every table entry
+ * whose mask bit is set in the (big-endian) reason word.
+ */
+static void print_nx_checkstop_reason(const char *level,
+                                       struct OpalHMIEvent *hmi_evt)
+{
+       static const struct xstop_reason nx_reasons[] = {
+               { NX_CHECKSTOP_SHM_INVAL_STATE_ERR,
+                 "DMA & Engine", "SHM invalid state error" },
+               { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1,
+                 "DMA & Engine", "DMA invalid state error bit 15" },
+               { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2,
+                 "DMA & Engine", "DMA invalid state error bit 16" },
+               { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 0 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 1 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 2 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 3 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 4 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 5 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 6 invalid state error" },
+               { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR,
+                 "DMA & Engine", "Channel 7 invalid state error" },
+               { NX_CHECKSTOP_DMA_CRB_UE,
+                 "DMA & Engine", "UE error on CRB(CSB address, CCB)" },
+               { NX_CHECKSTOP_DMA_CRB_SUE,
+                 "DMA & Engine", "SUE error on CRB(CSB address, CCB)" },
+               { NX_CHECKSTOP_PBI_ISN_UE,
+                 "PowerBus Interface",
+                 "CRB Kill ISN received while holding ISN with UE error" },
+       };
+       uint32_t reason;
+       int idx;
+
+       /* A zero reason word has no bits to decode. */
+       if (!hmi_evt->u.xstop_error.xstop_reason) {
+               printk("%s      Unknown NX check stop.\n", level);
+               return;
+       }
+
+       /* Convert once; the same word is tested against every mask. */
+       reason = be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason);
+
+       printk("%s      NX checkstop on CHIP ID: %x\n", level,
+                       be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+
+       for (idx = 0; idx < ARRAY_SIZE(nx_reasons); idx++) {
+               if (!(reason & nx_reasons[idx].xstop_reason))
+                       continue;
+
+               /* %-3s is only a minimum width; NX unit names exceed it. */
+               printk("%s      [Unit: %-3s] %s\n", level,
+                               nx_reasons[idx].unit_failed,
+                               nx_reasons[idx].description);
+       }
+}
+
+/*
+ * Print the reason for a malfunction alert according to its checkstop type.
+ *
+ * @level:   printk level prefix placed in front of every line.
+ * @hmi_evt: HMI event whose u.xstop_error.xstop_type selects the decoder.
+ *
+ * Core and NX checkstops are handed to their dedicated printers. A type
+ * that matches none of the known values is reported together with its
+ * numeric value instead of being dropped silently, so an alert of a type
+ * this code does not know still leaves a trace in the log.
+ */
+static void print_checkstop_reason(const char *level,
+                                       struct OpalHMIEvent *hmi_evt)
+{
+       switch (hmi_evt->u.xstop_error.xstop_type) {
+       case CHECKSTOP_TYPE_CORE:
+               print_core_checkstop_reason(level, hmi_evt);
+               break;
+       case CHECKSTOP_TYPE_NX:
+               print_nx_checkstop_reason(level, hmi_evt);
+               break;
+       case CHECKSTOP_TYPE_UNKNOWN:
+               printk("%s      Unknown Malfunction Alert.\n", level);
+               break;
+       default:
+               /* Type value not handled above: log it rather than stay quiet. */
+               printk("%s      Unknown Malfunction Alert of type %d\n",
+                               level,
+                               (int)hmi_evt->u.xstop_error.xstop_type);
+               break;
+       }
+}
+
 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
 {
        const char *level, *sevstr, *error_info;
@@ -95,6 +220,13 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
                (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
                printk("%s      TFMR: %016llx\n", level,
                                                be64_to_cpu(hmi_evt->tfmr));
+
+       if (hmi_evt->version < OpalHMIEvt_V2)
+               return;
+
+       /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
+       if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
+               print_checkstop_reason(level, hmi_evt);
 }
 
 static void hmi_event_handler(struct work_struct *work)
@@ -103,6 +235,8 @@ static void hmi_event_handler(struct work_struct *work)
        struct OpalHMIEvent *hmi_evt;
        struct OpalHmiEvtNode *msg_node;
        uint8_t disposition;
+       struct opal_msg msg;
+       int unrecoverable = 0;
 
        spin_lock_irqsave(&opal_hmi_evt_lock, flags);
        while (!list_empty(&opal_hmi_evt_list)) {
@@ -118,14 +252,53 @@ static void hmi_event_handler(struct work_struct *work)
 
                /*
                 * Check if HMI event has been recovered or not. If not
-                * then we can't continue, invoke panic.
+                * then kernel can't continue, we need to panic.
+                * But before we do that, display all the HMI event
+                * available on the list and set unrecoverable flag to 1.
                 */
                if (disposition != OpalHMI_DISPOSITION_RECOVERED)
-                       panic("Unrecoverable HMI exception");
+                       unrecoverable = 1;
 
                spin_lock_irqsave(&opal_hmi_evt_lock, flags);
        }
        spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+       if (unrecoverable) {
+               int ret;
+
+               /* Pull all HMI events from OPAL before we panic. */
+               while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
+                       u32 type;
+
+                       type = be32_to_cpu(msg.msg_type);
+
+                       /* skip if not HMI event */
+                       if (type != OPAL_MSG_HMI_EVT)
+                               continue;
+
+                       /* HMI event info starts from param[0] */
+                       hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
+                       print_hmi_event_info(hmi_evt);
+               }
+
+               /*
+                * Unrecoverable HMI exception. We need to inform BMC/OCC
+                * about this error so that it can collect relevant data
+                * for error analysis before rebooting.
+                */
+               ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
+                       "Unrecoverable HMI exception");
+               if (ret == OPAL_UNSUPPORTED) {
+                       pr_emerg("Reboot type %d not supported\n",
+                                               OPAL_REBOOT_PLATFORM_ERROR);
+               }
+
+               /*
+                * Fall through and panic if opal_cec_reboot2() returns
+                * OPAL_UNSUPPORTED.
+                */
+               panic("Unrecoverable HMI exception");
+       }
 }
 
 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
@@ -170,7 +343,7 @@ static struct notifier_block opal_hmi_handler_nb = {
        .priority       = 0,
 };
 
-static int __init opal_hmi_handler_init(void)
+int __init opal_hmi_handler_init(void)
 {
        int ret;
 
@@ -186,4 +359,3 @@ static int __init opal_hmi_handler_init(void)
        }
        return 0;
 }
-machine_subsys_initcall(powernv, opal_hmi_handler_init);