These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / drivers / net / ethernet / mellanox / mlx5 / core / health.c
index 292d76f..f5deb64 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/vmalloc.h>
+#include <linux/hardirq.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -46,39 +47,113 @@ enum {
 enum {
        MLX5_HEALTH_SYNDR_FW_ERR                = 0x1,
        MLX5_HEALTH_SYNDR_IRISC_ERR             = 0x7,
+       MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR  = 0x8,
        MLX5_HEALTH_SYNDR_CRC_ERR               = 0x9,
        MLX5_HEALTH_SYNDR_FETCH_PCI_ERR         = 0xa,
        MLX5_HEALTH_SYNDR_HW_FTL_ERR            = 0xb,
        MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR  = 0xc,
        MLX5_HEALTH_SYNDR_EQ_ERR                = 0xd,
+       MLX5_HEALTH_SYNDR_EQ_INV                = 0xe,
        MLX5_HEALTH_SYNDR_FFSER_ERR             = 0xf,
+       MLX5_HEALTH_SYNDR_HIGH_TEMP             = 0x10
 };
 
-static DEFINE_SPINLOCK(health_lock);
-static LIST_HEAD(health_list);
-static struct work_struct health_work;
+enum {
+       MLX5_NIC_IFC_FULL               = 0,
+       MLX5_NIC_IFC_DISABLED           = 1,
+       MLX5_NIC_IFC_NO_DRAM_NIC        = 2
+};
 
-static void health_care(struct work_struct *work)
+static u8 get_nic_interface(struct mlx5_core_dev *dev)
 {
-       struct mlx5_core_health *health, *n;
-       struct mlx5_core_dev *dev;
-       struct mlx5_priv *priv;
-       LIST_HEAD(tlist);
+       return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+}
+
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+       unsigned long flags;
+       u64 vector;
 
-       spin_lock_irq(&health_lock);
-       list_splice_init(&health_list, &tlist);
+       /* wait for pending handlers to complete */
+       synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+       spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+       vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
+       if (!vector)
+               goto no_trig;
+
+       vector |= MLX5_TRIGGERED_CMD_COMP;
+       spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+
+       mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+       mlx5_cmd_comp_handler(dev, vector);
+       return;
+
+no_trig:
+       spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+       struct mlx5_core_health *health = &dev->priv.health;
+       struct health_buffer __iomem *h = health->health;
 
-       spin_unlock_irq(&health_lock);
+       if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+               return 1;
 
-       list_for_each_entry_safe(health, n, &tlist, list) {
-               priv = container_of(health, struct mlx5_priv, health);
-               dev = container_of(priv, struct mlx5_core_dev, priv);
-               mlx5_core_warn(dev, "handling bad device here\n");
-               /* nothing yet */
-               spin_lock_irq(&health_lock);
-               list_del_init(&health->list);
-               spin_unlock_irq(&health_lock);
+       if (ioread32be(&h->fw_ver) == 0xffffffff)
+               return 1;
+
+       return 0;
+}
+
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+               return;
+
+       mlx5_core_err(dev, "start\n");
+       if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+               dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+
+       mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
+       mlx5_core_err(dev, "end\n");
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+       u8 nic_interface = get_nic_interface(dev);
+
+       switch (nic_interface) {
+       case MLX5_NIC_IFC_FULL:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+               break;
+
+       case MLX5_NIC_IFC_DISABLED:
+               mlx5_core_warn(dev, "starting teardown\n");
+               break;
+
+       case MLX5_NIC_IFC_NO_DRAM_NIC:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+               break;
+       default:
+               mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+                              nic_interface);
        }
+
+       mlx5_disable_device(dev);
+}
+
+static void health_care(struct work_struct *work)
+{
+       struct mlx5_core_health *health;
+       struct mlx5_core_dev *dev;
+       struct mlx5_priv *priv;
+
+       health = container_of(work, struct mlx5_core_health, work);
+       priv = container_of(health, struct mlx5_priv, health);
+       dev = container_of(priv, struct mlx5_core_dev, priv);
+       mlx5_core_warn(dev, "handling bad device here\n");
+       mlx5_handle_bad_state(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -88,6 +163,8 @@ static const char *hsynd_str(u8 synd)
                return "firmware internal error";
        case MLX5_HEALTH_SYNDR_IRISC_ERR:
                return "irisc not responding";
+       case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+               return "unrecoverable hardware error";
        case MLX5_HEALTH_SYNDR_CRC_ERR:
                return "firmware CRC error";
        case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
@@ -98,48 +175,81 @@ static const char *hsynd_str(u8 synd)
                return "async EQ buffer overrun";
        case MLX5_HEALTH_SYNDR_EQ_ERR:
                return "EQ error";
+       case MLX5_HEALTH_SYNDR_EQ_INV:
+               return "Invalid EQ refrenced";
        case MLX5_HEALTH_SYNDR_FFSER_ERR:
                return "FFSER error";
+       case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+               return "High temprature";
        default:
                return "unrecognized error";
        }
 }
 
-static u16 read_be16(__be16 __iomem *p)
+static u16 get_maj(u32 fw)
 {
-       return swab16(readl((__force u16 __iomem *) p));
+       return fw >> 28;
 }
 
-static u32 read_be32(__be32 __iomem *p)
+static u16 get_min(u32 fw)
 {
-       return swab32(readl((__force u32 __iomem *) p));
+       return fw >> 16 & 0xfff;
+}
+
+static u16 get_sub(u32 fw)
+{
+       return fw & 0xffff;
 }
 
 static void print_health_info(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
        struct health_buffer __iomem *h = health->health;
+       char fw_str[18];
+       u32 fw;
        int i;
 
+       /* If the syndrome is 0, the device is OK and no need to print buffer */
+       if (!ioread8(&h->synd))
+               return;
+
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
-               pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i));
+               dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
+
+       dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
+       dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+       fw = ioread32be(&h->fw_ver);
+       sprintf(fw_str, "%d.%d.%d", get_maj(fw), get_min(fw), get_sub(fw));
+       dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
+       dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+       dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
+       dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
+       dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+}
+
+static unsigned long get_next_poll_jiffies(void)
+{
+       unsigned long next;
 
-       pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr));
-       pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra));
-       pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver));
-       pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id));
-       pr_info("irisc_index %d\n", readb(&h->irisc_index));
-       pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd)));
-       pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync));
+       get_random_bytes(&next, sizeof(next));
+       next %= HZ;
+       next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+
+       return next;
 }
 
 static void poll_health(unsigned long data)
 {
        struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
        struct mlx5_core_health *health = &dev->priv.health;
-       unsigned long next;
        u32 count;
 
+       if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+               trigger_cmd_completions(dev);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+               return;
+       }
+
        count = ioread32be(health->health_counter);
        if (count == health->prev)
                ++health->miss_counter;
@@ -148,18 +258,16 @@ static void poll_health(unsigned long data)
 
        health->prev = count;
        if (health->miss_counter == MAX_MISSES) {
-               mlx5_core_err(dev, "device's health compromised\n");
+               dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
-               spin_lock_irq(&health_lock);
-               list_add_tail(&health->list, &health_list);
-               spin_unlock_irq(&health_lock);
-
-               queue_work(mlx5_core_wq, &health_work);
        } else {
-               get_random_bytes(&next, sizeof(next));
-               next %= HZ;
-               next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
-               mod_timer(&health->timer, next);
+               mod_timer(&health->timer, get_next_poll_jiffies());
+       }
+
+       if (in_fatal(dev) && !health->sick) {
+               health->sick = true;
+               print_health_info(dev);
+               queue_work(health->wq, &health->work);
        }
 }
 
@@ -167,7 +275,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 {
        struct mlx5_core_health *health = &dev->priv.health;
 
-       INIT_LIST_HEAD(&health->list);
        init_timer(&health->timer);
        health->health = &dev->iseg->health;
        health->health_counter = &dev->iseg->health_counter;
@@ -183,18 +290,33 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
        struct mlx5_core_health *health = &dev->priv.health;
 
        del_timer_sync(&health->timer);
-
-       spin_lock_irq(&health_lock);
-       if (!list_empty(&health->list))
-               list_del_init(&health->list);
-       spin_unlock_irq(&health_lock);
 }
 
-void mlx5_health_cleanup(void)
+void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 {
+       struct mlx5_core_health *health = &dev->priv.health;
+
+       destroy_workqueue(health->wq);
 }
 
-void  __init mlx5_health_init(void)
+int mlx5_health_init(struct mlx5_core_dev *dev)
 {
-       INIT_WORK(&health_work, health_care);
+       struct mlx5_core_health *health;
+       char *name;
+
+       health = &dev->priv.health;
+       name = kmalloc(64, GFP_KERNEL);
+       if (!name)
+               return -ENOMEM;
+
+       strcpy(name, "mlx5_health");
+       strcat(name, dev_name(&dev->pdev->dev));
+       health->wq = create_singlethread_workqueue(name);
+       kfree(name);
+       if (!health->wq)
+               return -ENOMEM;
+
+       INIT_WORK(&health->work, health_care);
+
+       return 0;
 }